Compare commits

feat/auto-... → feat/minio

123 Commits
- 822d2b0609
- dfd36dacf8
- 4ea7139ed5
- 63023a4061
- c98c409f59
- 92f88fdcd6
- 832ef1cf83
- 9a24b4896c
- dd8fce6e35
- f82eed4dc3
- 6490df9faf
- a077f81c65
- 6bcadd9e71
- a77bf8611a
- 33feca3138
- 7d85a97b63
- ce081effd4
- 2ed088b4d8
- d3c49fa246
- 52cb5014fd
- 50654be910
- cdab71a1ee
- a35976b9e9
- c68210c485
- f2864bd2ad
- eca9e85242
- 3f958fbff3
- c84ef0396b
- e1c67dcee5
- 34c8a8cc67
- 6cd1f55119
- e918234928
- 888a608485
- b5c3b05246
- fdce5e0302
- 4679b245de
- a837070f54
- 5a929e9803
- 52b0fad410
- 9944031eea
- 2babaa7136
- 90567511dd
- beb16ad0cb
- fc7fc5ea85
- ab8956b14b
- 1d9c90641f
- 6126b907f2
- cc93d2d483
- 7642c17ec0
- cb60dcf352
- 5ffe05d519
- 8e2f07c941
- 0b6e615075
- be251c6fb3
- efb1e89e33
- 529c447413
- 1eaf95c06b
- 138ed17d8b
- a880c41d89
- 2a9ae61dce
- 1f21911fa1
- 6f0a58f5d2
- 8206dce821
- ced1afaa8a
- d6c602c567
- a252a7fefd
- 83b06c21cc
- f5214da54c
- e3d4dd0127
- d0ee0d72f5
- 521f0550cd
- 8a09691e91
- 459ad7d9c9
- d102d27731
- 01810c40a1
- b7d33e1cbf
- 5b34b5a78c
- c091d2316b
- e8862b8a8b
- 1b46ab699d
- ac1995f63f
- de93669652
- dffc124920
- 932ceb0287
- 824d48fd85
- 47fdab0382
- ed7ddc6375
- cf06f4a8c0
- a2fa21f65c
- 61e915968f
- 4949b22457
- 1fb0eb94c2
- 9aefb554bc
- a4338669a9
- 1fa9ea496c
- 31756a2233
- 166583621b
- ca952c4674
- 4054778b6c
- 56a5f00015
- a96d50c481
- 4806212f46
- 2486f3c6b2
- f25bebf6ee
- 22dad6d0fc
- 03eab66d35
- 97b1ab23d8
- 9fff0ba430
- 7d3e91b2e6
- 74957a9ec5
- 2d035c46cf
- 53445fe72a
- 37cc8956c5
- 197c82f921
- 2c52493a9c
- 2ee2ba6b8c
- bafcf1694a
- 95792aab15
- 38ae2c3a3e
- 249d3c1b7f
- 9647f94f89
- afc288d2cf
- df01ce6aad
@@ -1,6 +1,3 @@
when:
  - event: [push, pull_request]

steps:
  # ===========================================
  # PR VALIDATION: Parallel type checks (PRs only)

@@ -45,8 +42,34 @@ steps:
    when:
      event: pull_request

  # ===========================================
  # AUTO-MERGE: Merge PR after all checks pass
  # ===========================================
  auto-merge:
    image: alpine:latest
    environment:
      GITEA_TOKEN:
        from_secret: gitea_token
    commands:
      - apk add --no-cache curl
      - |
        echo "Merging PR #${CI_COMMIT_PULL_REQUEST}..."
        curl -s -X POST \
          -H "Authorization: token $GITEA_TOKEN" \
          -H "Content-Type: application/json" \
          -d '{"Do":"merge"}' \
          "https://code.cannabrands.app/api/v1/repos/Creationshop/dispensary-scraper/pulls/${CI_COMMIT_PULL_REQUEST}/merge"
    depends_on:
      - typecheck-backend
      - typecheck-cannaiq
      - typecheck-findadispo
      - typecheck-findagram
    when:
      event: pull_request

  # ===========================================
  # MASTER DEPLOY: Parallel Docker builds
  # NOTE: cache_from/cache_to removed due to plugin bug splitting on commas
  # ===========================================
  docker-backend:
    image: woodpeckerci/plugin-docker-buildx

@@ -65,10 +88,10 @@ steps:
      platforms: linux/amd64
      provenance: false
      build_args:
        - APP_BUILD_VERSION=${CI_COMMIT_SHA}
        - APP_GIT_SHA=${CI_COMMIT_SHA}
        - APP_BUILD_TIME=${CI_PIPELINE_CREATED}
        - CONTAINER_IMAGE_TAG=${CI_COMMIT_SHA:0:8}
        APP_BUILD_VERSION: ${CI_COMMIT_SHA:0:8}
        APP_GIT_SHA: ${CI_COMMIT_SHA}
        APP_BUILD_TIME: ${CI_PIPELINE_CREATED}
        CONTAINER_IMAGE_TAG: ${CI_COMMIT_SHA:0:8}
    depends_on: []
    when:
      branch: master

@@ -138,7 +161,7 @@ steps:
      event: push

  # ===========================================
  # STAGE 3: Deploy (after Docker builds)
  # STAGE 3: Deploy and Run Migrations
  # ===========================================
  deploy:
    image: bitnami/kubectl:latest

@@ -149,12 +172,17 @@ steps:
      - mkdir -p ~/.kube
      - echo "$KUBECONFIG_CONTENT" | tr -d '[:space:]' | base64 -d > ~/.kube/config
      - chmod 600 ~/.kube/config
      # Deploy backend first
      - kubectl set image deployment/scraper scraper=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl rollout status deployment/scraper -n dispensary-scraper --timeout=300s
      # Note: Migrations run automatically at startup via auto-migrate
      # Deploy remaining services
      # Resilience: ensure workers are scaled up if at 0
      - REPLICAS=$(kubectl get deployment scraper-worker -n dispensary-scraper -o jsonpath='{.spec.replicas}'); if [ "$REPLICAS" = "0" ]; then echo "Scaling workers from 0 to 5"; kubectl scale deployment/scraper-worker --replicas=5 -n dispensary-scraper; fi
      - kubectl set image deployment/scraper-worker worker=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/cannaiq-frontend cannaiq-frontend=code.cannabrands.app/creationshop/cannaiq-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/findadispo-frontend findadispo-frontend=code.cannabrands.app/creationshop/findadispo-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/findagram-frontend findagram-frontend=code.cannabrands.app/creationshop/findagram-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl rollout status deployment/scraper -n dispensary-scraper --timeout=300s
      - kubectl rollout status deployment/cannaiq-frontend -n dispensary-scraper --timeout=120s
    depends_on:
      - docker-backend
.woodpecker/ci.yml (new file, 191 lines)

@@ -0,0 +1,191 @@
steps:
  # ===========================================
  # PR VALIDATION: Only typecheck changed projects
  # ===========================================
  typecheck-backend:
    image: code.cannabrands.app/creationshop/node:20
    commands:
      - npm config set cache /npm-cache/backend --global
      - cd backend
      - npm ci --prefer-offline
      - npx tsc --noEmit
    volumes:
      - npm-cache:/npm-cache
    depends_on: []
    when:
      event: pull_request
      path:
        include: ['backend/**']

  typecheck-cannaiq:
    image: code.cannabrands.app/creationshop/node:20
    commands:
      - npm config set cache /npm-cache/cannaiq --global
      - cd cannaiq
      - npm ci --prefer-offline
      - npx tsc --noEmit
    volumes:
      - npm-cache:/npm-cache
    depends_on: []
    when:
      event: pull_request
      path:
        include: ['cannaiq/**']

  # findadispo/findagram typechecks skipped - they have || true anyway

  # ===========================================
  # AUTO-MERGE: Merge PR after all checks pass
  # ===========================================
  auto-merge:
    image: alpine:latest
    environment:
      GITEA_TOKEN:
        from_secret: gitea_token
    commands:
      - apk add --no-cache curl
      - |
        echo "Merging PR #${CI_COMMIT_PULL_REQUEST}..."
        curl -s -X POST \
          -H "Authorization: token $GITEA_TOKEN" \
          -H "Content-Type: application/json" \
          -d '{"Do":"merge"}' \
          "https://code.cannabrands.app/api/v1/repos/Creationshop/dispensary-scraper/pulls/${CI_COMMIT_PULL_REQUEST}/merge"
    depends_on:
      - typecheck-backend
      - typecheck-cannaiq
    when:
      event: pull_request

  # ===========================================
  # MASTER DEPLOY: Parallel Docker builds
  # ===========================================
  docker-backend:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/dispensary-scraper
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: backend/Dockerfile
      context: backend
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
      cache_from: type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache
      cache_to: type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache,mode=max
      build_args:
        APP_BUILD_VERSION: ${CI_COMMIT_SHA:0:8}
        APP_GIT_SHA: ${CI_COMMIT_SHA}
        APP_BUILD_TIME: ${CI_PIPELINE_CREATED}
        CONTAINER_IMAGE_TAG: ${CI_COMMIT_SHA:0:8}
    depends_on: []
    when:
      branch: master
      event: push

  docker-cannaiq:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/cannaiq-frontend
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: cannaiq/Dockerfile
      context: cannaiq
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
      cache_from: type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache
      cache_to: type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache,mode=max
    depends_on: []
    when:
      branch: master
      event: push

  docker-findadispo:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/findadispo-frontend
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: findadispo/frontend/Dockerfile
      context: findadispo/frontend
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
      cache_from: type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache
      cache_to: type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache,mode=max
    depends_on: []
    when:
      branch: master
      event: push

  docker-findagram:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/findagram-frontend
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: findagram/frontend/Dockerfile
      context: findagram/frontend
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
      cache_from: type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache
      cache_to: type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache,mode=max
    depends_on: []
    when:
      branch: master
      event: push

  # ===========================================
  # STAGE 3: Deploy and Run Migrations
  # ===========================================
  deploy:
    image: bitnami/kubectl:latest
    environment:
      KUBECONFIG_CONTENT:
        from_secret: kubeconfig_data
    commands:
      - mkdir -p ~/.kube
      - echo "$KUBECONFIG_CONTENT" | tr -d '[:space:]' | base64 -d > ~/.kube/config
      - chmod 600 ~/.kube/config
      # Deploy backend first
      - kubectl set image deployment/scraper scraper=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl rollout status deployment/scraper -n dispensary-scraper --timeout=300s
      # Note: Migrations run automatically at startup via auto-migrate
      # Deploy remaining services
      # Resilience: ensure workers are scaled up if at 0
      - REPLICAS=$(kubectl get deployment scraper-worker -n dispensary-scraper -o jsonpath='{.spec.replicas}'); if [ "$REPLICAS" = "0" ]; then echo "Scaling workers from 0 to 5"; kubectl scale deployment/scraper-worker --replicas=5 -n dispensary-scraper; fi
      - kubectl set image deployment/scraper-worker worker=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/cannaiq-frontend cannaiq-frontend=code.cannabrands.app/creationshop/cannaiq-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/findadispo-frontend findadispo-frontend=code.cannabrands.app/creationshop/findadispo-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/findagram-frontend findagram-frontend=code.cannabrands.app/creationshop/findagram-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl rollout status deployment/cannaiq-frontend -n dispensary-scraper --timeout=120s
    depends_on:
      - docker-backend
      - docker-cannaiq
      - docker-findadispo
      - docker-findagram
    when:
      branch: master
      event: push
@@ -5,7 +5,7 @@ FROM code.cannabrands.app/creationshop/node:20-slim AS builder
WORKDIR /app

COPY package*.json ./
RUN npm ci
RUN npm install

COPY . .
RUN npm run build

@@ -25,8 +25,9 @@ ENV APP_GIT_SHA=${APP_GIT_SHA}
ENV APP_BUILD_TIME=${APP_BUILD_TIME}
ENV CONTAINER_IMAGE_TAG=${CONTAINER_IMAGE_TAG}

# Install Chromium dependencies
# Install Chromium dependencies and curl for HTTP requests
RUN apt-get update && apt-get install -y \
    curl \
    chromium \
    fonts-liberation \
    libnss3 \

@@ -43,10 +44,13 @@ ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
WORKDIR /app

COPY package*.json ./
RUN npm ci --omit=dev
RUN npm install --omit=dev

COPY --from=builder /app/dist ./dist

# Copy migrations for auto-migrate on startup
COPY migrations ./migrations

# Create local images directory for when MinIO is not configured
RUN mkdir -p /app/public/images/products
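The Dockerfile change ships the `migrations` directory into the image so the server can apply pending migrations at boot, per the "auto-migrate" note above. A minimal sketch of what an auto-migrate step like `src/db/auto-migrate.ts` might do; the `schema_migrations` bookkeeping table and function shape are assumptions, not confirmed by this diff:

```typescript
// Hypothetical sketch of auto-migrate-on-startup; table name is an assumption.
import fs from 'fs/promises';
import path from 'path';
import { Pool } from 'pg';

export async function runAutoMigrations(pool: Pool): Promise<void> {
  await pool.query(
    `CREATE TABLE IF NOT EXISTS schema_migrations (
       name TEXT PRIMARY KEY,
       applied_at TIMESTAMPTZ DEFAULT NOW()
     )`
  );
  const dir = path.join(process.cwd(), 'migrations');
  const files = (await fs.readdir(dir)).filter(f => f.endsWith('.sql')).sort();
  for (const file of files) {
    const { rowCount } = await pool.query(
      'SELECT 1 FROM schema_migrations WHERE name = $1', [file]
    );
    if (rowCount) continue; // already applied
    const sql = await fs.readFile(path.join(dir, file), 'utf8');
    const client = await pool.connect();
    try {
      // Apply each migration and its bookkeeping row atomically.
      await client.query('BEGIN');
      await client.query(sql);
      await client.query('INSERT INTO schema_migrations (name) VALUES ($1)', [file]);
      await client.query('COMMIT');
    } catch (err) {
      await client.query('ROLLBACK');
      throw err;
    } finally {
      client.release();
    }
  }
}
```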
backend/docs/CODEBASE_MAP.md (new file, 218 lines)

@@ -0,0 +1,218 @@
# CannaiQ Backend Codebase Map

**Last Updated:** 2025-12-12
**Purpose:** Help Claude and developers understand which code is current vs deprecated

---

## Quick Reference: What to Use

### For Crawling/Scraping

| Task | Use This | NOT This |
|------|----------|----------|
| Fetch products | `src/tasks/handlers/payload-fetch.ts` | `src/hydration/*` |
| Process products | `src/tasks/handlers/product-refresh.ts` | `src/scraper-v2/*` |
| GraphQL client | `src/platforms/dutchie/client.ts` | `src/dutchie-az/services/graphql-client.ts` |
| Worker system | `src/tasks/task-worker.ts` | `src/dutchie-az/services/worker.ts` |

### For Database

| Task | Use This | NOT This |
|------|----------|----------|
| Get DB pool | `src/db/pool.ts` | `src/dutchie-az/db/connection.ts` |
| Run migrations | `src/db/migrate.ts` (CLI only) | Never import at runtime |
| Query products | `store_products` table | `products`, `dutchie_products` |
| Query stores | `dispensaries` table | `stores` table |

### For Discovery

| Task | Use This |
|------|----------|
| Discover stores | `src/discovery/*.ts` |
| Run discovery | `npx tsx src/scripts/run-discovery.ts` |

---

## Directory Status

### ACTIVE DIRECTORIES (Use These)

```
src/
├── auth/          # JWT/session auth, middleware
├── db/            # Database pool, migrations
├── discovery/     # Dutchie store discovery pipeline
├── middleware/    # Express middleware
├── multi-state/   # Multi-state query support
├── platforms/     # Platform-specific clients (Dutchie, Jane, etc)
│   └── dutchie/   # THE Dutchie client - use this one
├── routes/        # Express API routes
├── services/      # Core services (logger, scheduler, etc)
├── tasks/         # Task system (workers, handlers, scheduler)
│   └── handlers/  # Task handlers (payload_fetch, product_refresh, etc)
├── types/         # TypeScript types
└── utils/         # Utilities (storage, image processing)
```

### DEPRECATED DIRECTORIES (DO NOT USE)

```
src/
├── hydration/           # DEPRECATED - Old pipeline approach
├── scraper-v2/          # DEPRECATED - Old scraper engine
├── canonical-hydration/ # DEPRECATED - Merged into tasks/handlers
├── dutchie-az/          # PARTIAL - Some parts deprecated, some active
│   ├── db/              # DEPRECATED - Use src/db/pool.ts
│   └── services/        # PARTIAL - worker.ts still runs, graphql-client.ts deprecated
├── portals/             # FUTURE - Not yet implemented
├── seo/                 # PARTIAL - Settings work, templates WIP
└── system/              # DEPRECATED - Old orchestration system
```

### DEPRECATED FILES (DO NOT USE)

```
src/dutchie-az/db/connection.ts            # Use src/db/pool.ts instead
src/dutchie-az/services/graphql-client.ts  # Use src/platforms/dutchie/client.ts
src/hydration/*.ts                         # Entire directory deprecated
src/scraper-v2/*.ts                        # Entire directory deprecated
```

---

## Key Files Reference

### Entry Points

| File | Purpose | Status |
|------|---------|--------|
| `src/index.ts` | Main Express server | ACTIVE |
| `src/dutchie-az/services/worker.ts` | Worker process entry | ACTIVE |
| `src/tasks/task-worker.ts` | Task worker (new system) | ACTIVE |

### Dutchie Integration

| File | Purpose | Status |
|------|---------|--------|
| `src/platforms/dutchie/client.ts` | GraphQL client, hashes, curl | **PRIMARY** |
| `src/platforms/dutchie/queries.ts` | High-level query functions | ACTIVE |
| `src/platforms/dutchie/index.ts` | Re-exports | ACTIVE |

### Task Handlers

| File | Purpose | Status |
|------|---------|--------|
| `src/tasks/handlers/payload-fetch.ts` | Fetch products from Dutchie | **PRIMARY** |
| `src/tasks/handlers/product-refresh.ts` | Process payload into DB | **PRIMARY** |
| `src/tasks/handlers/menu-detection.ts` | Detect menu type | ACTIVE |
| `src/tasks/handlers/id-resolution.ts` | Resolve platform IDs | ACTIVE |
| `src/tasks/handlers/image-download.ts` | Download product images | ACTIVE |

### Database

| File | Purpose | Status |
|------|---------|--------|
| `src/db/pool.ts` | Canonical DB pool | **PRIMARY** |
| `src/db/migrate.ts` | Migration runner (CLI only) | CLI ONLY |
| `src/db/auto-migrate.ts` | Auto-run migrations on startup | ACTIVE |

### Configuration

| File | Purpose | Status |
|------|---------|--------|
| `.env` | Environment variables | ACTIVE |
| `package.json` | Dependencies | ACTIVE |
| `tsconfig.json` | TypeScript config | ACTIVE |

---

## GraphQL Hashes (CRITICAL)

The correct hashes are in `src/platforms/dutchie/client.ts`:

```typescript
export const GRAPHQL_HASHES = {
  FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
  GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
  ConsumerDispensaries: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
  GetAllCitiesByState: 'ae547a0466ace5a48f91e55bf6699eacd87e3a42841560f0c0eabed5a0a920e6',
};
```

**ALWAYS** use `Status: 'Active'` for FilteredProducts (not `null` or `'All'`).

---

## Scripts Reference

### Useful Scripts (in `src/scripts/`)

| Script | Purpose |
|--------|---------|
| `run-discovery.ts` | Run Dutchie discovery |
| `crawl-single-store.ts` | Test-crawl a single store |
| `test-dutchie-graphql.ts` | Test GraphQL queries |

### One-Off Scripts (probably not needed again)

| Script | Purpose |
|--------|---------|
| `harmonize-az-dispensaries.ts` | One-time data cleanup |
| `bootstrap-stores-for-dispensaries.ts` | One-time migration |
| `backfill-*.ts` | Historical backfill scripts |

---

## API Routes

### Active Routes (in `src/routes/`)

| Route File | Mount Point | Purpose |
|------------|-------------|---------|
| `auth.ts` | `/api/auth` | Login/logout/session |
| `stores.ts` | `/api/stores` | Store CRUD |
| `dashboard.ts` | `/api/dashboard` | Dashboard stats |
| `workers.ts` | `/api/workers` | Worker monitoring |
| `pipeline.ts` | `/api/pipeline` | Crawl triggers |
| `discovery.ts` | `/api/discovery` | Discovery management |
| `analytics.ts` | `/api/analytics` | Analytics queries |
| `wordpress.ts` | `/api/v1/wordpress` | WordPress plugin API |

---

## Documentation Files

### Current Docs (in `backend/docs/`)

| Doc | Purpose | Currency |
|-----|---------|----------|
| `TASK_WORKFLOW_2024-12-10.md` | Task system architecture | CURRENT |
| `WORKER_TASK_ARCHITECTURE.md` | Worker/task design | CURRENT |
| `CRAWL_PIPELINE.md` | Crawl pipeline overview | CURRENT |
| `ORGANIC_SCRAPING_GUIDE.md` | Browser-based scraping | CURRENT |
| `CODEBASE_MAP.md` | This file | CURRENT |
| `ANALYTICS_V2_EXAMPLES.md` | Analytics API examples | CURRENT |
| `BRAND_INTELLIGENCE_API.md` | Brand API docs | CURRENT |

### Root Docs

| Doc | Purpose | Currency |
|-----|---------|----------|
| `CLAUDE.md` | Claude instructions | **PRIMARY** |
| `README.md` | Project overview | NEEDS UPDATE |

---

## Common Mistakes to Avoid

1. **Don't use `src/hydration/`** - It's an old approach that was superseded by the task system

2. **Don't use `src/dutchie-az/db/connection.ts`** - Use `src/db/pool.ts` instead (see the sketch after this list)

3. **Don't import `src/db/migrate.ts` at runtime** - It will crash. Only use it for CLI migrations.

4. **Don't query the `stores` table** - It's empty. Use `dispensaries`.

5. **Don't query the `products` table** - It's empty. Use `store_products`.

6. **Don't use the wrong GraphQL hash** - Always take the hash from `GRAPHQL_HASHES` in client.ts

7. **Don't use `Status: null`** - It returns 0 products. Use `Status: 'Active'`.
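Mistakes 2, 4, and 5 boil down to one pattern. A minimal usage sketch, assuming `src/db/pool.ts` exports a `pool` object and that `store_products` carries a `dispensary_id` foreign key (both the export name and the column are assumptions):

```typescript
// Hypothetical sketch; `pool` export and join column are assumptions.
import { pool } from '../db/pool'; // NOT src/dutchie-az/db/connection.ts

// Query the live tables: dispensaries + store_products (not stores/products).
export async function productsForDispensary(dispensaryId: number) {
  const { rows } = await pool.query(
    `SELECT sp.*
       FROM store_products sp
       JOIN dispensaries d ON d.id = sp.dispensary_id
      WHERE d.id = $1`,
    [dispensaryId]
  );
  return rows;
}
```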
---

## When in Doubt

1. Check if the file is imported in `src/index.ts` - if not, it may be deprecated
2. Check the last-modified date - older files may be stale
3. Look for `DEPRECATED` comments in the code
4. Ask: "Is there a newer version of this in `src/tasks/` or `src/platforms/`?"
5. Read the relevant doc in `docs/` before modifying code
backend/docs/_archive/BRAND_INTELLIGENCE_API.md (new file, 394 lines)

@@ -0,0 +1,394 @@
# Brand Intelligence API

## Endpoint

```
GET /api/analytics/v2/brand/:name/intelligence
```

## Query Parameters

| Param | Type | Default | Description |
|-------|------|---------|-------------|
| `window` | `7d\|30d\|90d` | `30d` | Time window for trend calculations |
| `state` | string | - | Filter by state code (e.g., `AZ`) |
| `category` | string | - | Filter by category (e.g., `Flower`) |

## Response Payload Schema

```typescript
interface BrandIntelligenceResult {
  brand_name: string;
  window: '7d' | '30d' | '90d';
  generated_at: string; // ISO timestamp when data was computed

  performance_snapshot: PerformanceSnapshot;
  alerts: Alerts;
  sku_performance: SkuPerformance[];
  retail_footprint: RetailFootprint;
  competitive_landscape: CompetitiveLandscape;
  inventory_health: InventoryHealth;
  promo_performance: PromoPerformance;
}
```

---

## Section 1: Performance Snapshot

Summary cards with key brand metrics.

```typescript
interface PerformanceSnapshot {
  active_skus: number;              // Total products in catalog
  total_revenue_30d: number | null; // Estimated from qty × price
  total_stores: number;             // Active retail partners
  new_stores_30d: number;           // New distribution in window
  market_share: number | null;      // % of category SKUs
  avg_wholesale_price: number | null;
  price_position: 'premium' | 'value' | 'competitive';
}
```

**UI Label Mapping:**

| Field | User-Facing Label | Helper Text |
|-------|-------------------|-------------|
| `active_skus` | Active Products | X total in catalog |
| `total_revenue_30d` | Monthly Revenue | Estimated from sales |
| `total_stores` | Retail Distribution | Active retail partners |
| `new_stores_30d` | New Opportunities | X new in last 30 days |
| `market_share` | Category Position | % of category |
| `avg_wholesale_price` | Avg Wholesale | Per unit |
| `price_position` | Pricing Tier | Premium/Value/Market Rate |

---

## Section 2: Alerts

Issues requiring attention.

```typescript
interface Alerts {
  lost_stores_30d_count: number;
  lost_skus_30d_count: number;
  competitor_takeover_count: number;
  avg_oos_duration_days: number | null;
  avg_reorder_lag_days: number | null;
  items: AlertItem[];
}

interface AlertItem {
  type: 'lost_store' | 'delisted_sku' | 'shelf_loss' | 'extended_oos';
  severity: 'critical' | 'warning';
  store_name?: string;
  product_name?: string;
  competitor_brand?: string;
  days_since?: number;
  state_code?: string;
}
```

**UI Label Mapping:**

| Field | User-Facing Label |
|-------|-------------------|
| `lost_stores_30d_count` | Accounts at Risk |
| `lost_skus_30d_count` | Delisted SKUs |
| `competitor_takeover_count` | Shelf Losses |
| `avg_oos_duration_days` | Avg Stockout Length |
| `avg_reorder_lag_days` | Avg Restock Time |
| `severity: critical` | Urgent |
| `severity: warning` | Watch |

---

## Section 3: SKU Performance (Product Velocity)

How fast each SKU sells.

```typescript
interface SkuPerformance {
  store_product_id: number;
  product_name: string;
  category: string | null;
  daily_velocity: number; // Units/day estimate
  velocity_status: 'hot' | 'steady' | 'slow' | 'stale';
  retail_price: number | null;
  on_sale: boolean;
  stores_carrying: number;
  stock_status: 'in_stock' | 'low_stock' | 'out_of_stock';
}
```

**UI Label Mapping:**

| Field | User-Facing Label |
|-------|-------------------|
| `daily_velocity` | Daily Rate |
| `velocity_status` | Momentum |
| `velocity_status: hot` | Hot |
| `velocity_status: steady` | Steady |
| `velocity_status: slow` | Slow |
| `velocity_status: stale` | Stale |
| `retail_price` | Retail Price |
| `on_sale` | Promo (badge) |

**Velocity Thresholds:**
- `hot`: >= 5 units/day
- `steady`: >= 1 unit/day
- `slow`: >= 0.1 units/day
- `stale`: < 0.1 units/day
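The thresholds tier from fastest to slowest; a minimal illustrative sketch of the bucketing (not the server's actual code):

```typescript
type VelocityStatus = 'hot' | 'steady' | 'slow' | 'stale';

// Buckets follow the thresholds above, checked from fastest to slowest.
export function classifyVelocity(unitsPerDay: number): VelocityStatus {
  if (unitsPerDay >= 5) return 'hot';
  if (unitsPerDay >= 1) return 'steady';
  if (unitsPerDay >= 0.1) return 'slow';
  return 'stale';
}
```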
---

## Section 4: Retail Footprint

Store placement and coverage.

```typescript
interface RetailFootprint {
  total_stores: number;
  in_stock_count: number;
  out_of_stock_count: number;
  penetration_by_region: RegionPenetration[];
  whitespace_stores: WhitespaceStore[];
}

interface RegionPenetration {
  state_code: string;
  store_count: number;
  percent_reached: number; // % of the state's dispensaries
  in_stock: number;
  out_of_stock: number;
}

interface WhitespaceStore {
  store_id: number;
  store_name: string;
  state_code: string;
  city: string | null;
  category_fit: number; // How many competing brands they carry
  competitor_brands: string[];
}
```

**UI Label Mapping:**

| Field | User-Facing Label |
|-------|-------------------|
| `penetration_by_region` | Market Coverage by Region |
| `percent_reached` | X% reached |
| `in_stock` | X stocked |
| `out_of_stock` | X out |
| `whitespace_stores` | Expansion Opportunities |
| `category_fit` | X fit |

---

## Section 5: Competitive Landscape

Market positioning vs competitors.

```typescript
interface CompetitiveLandscape {
  brand_price_position: 'premium' | 'value' | 'competitive';
  market_share_trend: MarketSharePoint[];
  competitors: Competitor[];
  head_to_head_skus: HeadToHead[];
}

interface MarketSharePoint {
  date: string;
  share_percent: number;
}

interface Competitor {
  brand_name: string;
  store_overlap_percent: number;
  price_position: 'premium' | 'value' | 'competitive';
  avg_price: number | null;
  sku_count: number;
}

interface HeadToHead {
  product_name: string;
  brand_price: number;
  competitor_brand: string;
  competitor_price: number;
  price_diff_percent: number;
}
```

**UI Label Mapping:**

| Field | User-Facing Label |
|-------|-------------------|
| `price_position: premium` | Premium Tier |
| `price_position: value` | Value Leader |
| `price_position: competitive` | Market Rate |
| `market_share_trend` | Share of Shelf Trend |
| `head_to_head_skus` | Price Comparison |
| `store_overlap_percent` | X% store overlap |

---

## Section 6: Inventory Health

Stock projections and risk levels.

```typescript
interface InventoryHealth {
  critical_count: number;    // <7 days stock
  warning_count: number;     // 7-14 days stock
  healthy_count: number;     // 14-90 days stock
  overstocked_count: number; // >90 days stock
  skus: InventorySku[];
  overstock_alert: OverstockItem[];
}

interface InventorySku {
  store_product_id: number;
  product_name: string;
  store_name: string;
  days_of_stock: number | null;
  risk_level: 'critical' | 'elevated' | 'moderate' | 'healthy';
  current_quantity: number | null;
  daily_sell_rate: number | null;
}

interface OverstockItem {
  product_name: string;
  store_name: string;
  excess_units: number;
  days_of_stock: number;
}
```

**UI Label Mapping:**

| Field | User-Facing Label |
|-------|-------------------|
| `risk_level: critical` | Reorder Now |
| `risk_level: elevated` | Low Stock |
| `risk_level: moderate` | Monitor |
| `risk_level: healthy` | Healthy |
| `critical_count` | Urgent (<7 days) |
| `warning_count` | Low (7-14 days) |
| `overstocked_count` | Excess (>90 days) |
| `days_of_stock` | X days remaining |
| `overstock_alert` | Overstock Alert |
| `excess_units` | X excess units |
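`days_of_stock` is presumably `current_quantity / daily_sell_rate`, and the count fields imply day-count buckets of <7, 7-14, 14-90, and >90. A minimal illustrative sketch of both (not the server's actual code):

```typescript
type StockBucket = 'critical' | 'warning' | 'healthy' | 'overstocked';

// days_of_stock = on-hand units / daily sell-through; null when unknowable.
export function daysOfStock(
  quantity: number | null,
  dailyRate: number | null
): number | null {
  if (quantity == null || dailyRate == null || dailyRate <= 0) return null;
  return quantity / dailyRate;
}

// Buckets mirror the count fields above: <7, 7-14, 14-90, >90 days.
export function bucketDays(days: number): StockBucket {
  if (days < 7) return 'critical';
  if (days < 14) return 'warning';
  if (days <= 90) return 'healthy';
  return 'overstocked';
}
```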
---

## Section 7: Promotion Effectiveness

How promotions impact sales.

```typescript
interface PromoPerformance {
  avg_baseline_velocity: number | null;
  avg_promo_velocity: number | null;
  avg_velocity_lift: number | null;    // % increase during promo
  avg_efficiency_score: number | null; // ROI proxy
  promotions: Promotion[];
}

interface Promotion {
  product_name: string;
  store_name: string;
  status: 'active' | 'scheduled' | 'ended';
  start_date: string;
  end_date: string | null;
  regular_price: number;
  promo_price: number;
  discount_percent: number;
  baseline_velocity: number | null;
  promo_velocity: number | null;
  velocity_lift: number | null;
  efficiency_score: number | null;
}
```

**UI Label Mapping:**

| Field | User-Facing Label |
|-------|-------------------|
| `avg_baseline_velocity` | Normal Rate |
| `avg_promo_velocity` | During Promos |
| `avg_velocity_lift` | Avg Sales Lift |
| `avg_efficiency_score` | ROI Score |
| `velocity_lift` | Sales Lift |
| `efficiency_score` | ROI Score |
| `status: active` | Live |
| `status: scheduled` | Scheduled |
| `status: ended` | Ended |

---

## Example Queries

### Get the full payload
```javascript
const response = await fetch('/api/analytics/v2/brand/Wyld/intelligence?window=30d');
const data = await response.json();
```

### Extract summary cards (flattened)
```javascript
const { performance_snapshot: ps, alerts } = data;

const summaryCards = {
  activeProducts: ps.active_skus,
  monthlyRevenue: ps.total_revenue_30d,
  retailDistribution: ps.total_stores,
  newOpportunities: ps.new_stores_30d,
  categoryPosition: ps.market_share,
  avgWholesale: ps.avg_wholesale_price,
  pricingTier: ps.price_position,
  accountsAtRisk: alerts.lost_stores_30d_count,
  delistedSkus: alerts.lost_skus_30d_count,
  shelfLosses: alerts.competitor_takeover_count,
};
```

### Get the top 10 fastest-selling SKUs
```javascript
const topSkus = data.sku_performance
  .filter(sku => sku.velocity_status === 'hot' || sku.velocity_status === 'steady')
  .sort((a, b) => b.daily_velocity - a.daily_velocity)
  .slice(0, 10);
```

### Get critical inventory alerts only
```javascript
const criticalInventory = data.inventory_health.skus
  .filter(sku => sku.risk_level === 'critical');
```

### Get states with <50% penetration
```javascript
const underPenetrated = data.retail_footprint.penetration_by_region
  .filter(region => region.percent_reached < 50)
  .sort((a, b) => a.percent_reached - b.percent_reached);
```

### Get active promotions with positive lift
```javascript
const effectivePromos = data.promo_performance.promotions
  .filter(p => p.status === 'active' && p.velocity_lift > 0)
  .sort((a, b) => b.velocity_lift - a.velocity_lift);
```

### Build chart data for the market share trend
```javascript
const chartData = data.competitive_landscape.market_share_trend.map(point => ({
  x: new Date(point.date),
  y: point.share_percent,
}));
```

---

## Notes for Frontend Implementation

1. **All fields are snake_case** - transform to camelCase if needed (see the sketch after this list)
2. **Null values are possible** - handle them gracefully in the UI
3. **Arrays may be empty** - show appropriate empty states
4. **Timestamps are ISO format** - parse with `new Date()`
5. **Percentages are already computed** - no need to multiply by 100
6. **The `window` parameter affects trend calculations** - 7d/30d/90d
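A minimal sketch of the snake_case → camelCase transform from note 1, assuming plain JSON-safe objects straight from `response.json()` (no Dates, Maps, or class instances):

```typescript
// Recursively convert snake_case keys to camelCase; assumes plain JSON data.
type Json = string | number | boolean | null | Json[] | { [key: string]: Json };

const toCamel = (s: string) =>
  s.replace(/_([a-z])/g, (_, c: string) => c.toUpperCase());

export function camelize(value: Json): Json {
  if (Array.isArray(value)) return value.map(camelize);
  if (value && typeof value === 'object') {
    return Object.fromEntries(
      Object.entries(value).map(([k, v]) => [toCamel(k), camelize(v)])
    );
  }
  return value;
}

// Usage: const snapshot = camelize(data.performance_snapshot);
```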
@@ -500,17 +500,18 @@ CREATE TABLE proxies (

Proxies are mandatory. There is no environment variable to disable them. Workers will refuse to start without active proxies in the database.

### Fingerprints Available
### User-Agent Generation

The client includes 6 browser fingerprints:
- Chrome 131 on Windows
- Chrome 131 on macOS
- Chrome 120 on Windows
- Firefox 133 on Windows
- Safari 17.2 on macOS
- Edge 131 on Windows
See `workflow-12102025.md` for full specification.

Each includes proper `sec-ch-ua`, `sec-ch-ua-platform`, and `sec-ch-ua-mobile` headers.
**Summary:**
- Uses the `intoli/user-agents` library (daily-updated market-share data)
- Device distribution: Mobile 62%, Desktop 36%, Tablet 2%
- Browser whitelist: Chrome, Safari, Edge, Firefox only
- UA sticks until the IP rotates (403 or manual rotation)
- Failure = alert admin + stop crawl (no fallback)

Each fingerprint includes proper `sec-ch-ua`, `sec-ch-ua-platform`, and `sec-ch-ua-mobile` headers.
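A minimal sketch of the selection policy described in the summary, using the `user-agents` npm package; the 62/36/2 weights and the no-fallback behavior come from the summary above, while the function names and the exact filter shapes are illustrative assumptions:

```typescript
import UserAgent from 'user-agents';

// Roll a device category per the documented Mobile 62% / Desktop 36% / Tablet 2% split.
function rollDeviceCategory(): 'mobile' | 'desktop' | 'tablet' {
  const r = Math.random();
  if (r < 0.62) return 'mobile';
  if (r < 0.98) return 'desktop';
  return 'tablet';
}

const BROWSER_WHITELIST = /(Chrome|Safari|Edg|Firefox)/;

// Pick one UA; per the policy it should be cached until the IP rotates.
// The constructor throws when nothing matches, which fits the
// "alert admin + stop crawl, no fallback" rule.
export function pickUserAgent(): string {
  const ua = new UserAgent([
    { deviceCategory: rollDeviceCategory() },
    (data: { userAgent: string }) => BROWSER_WHITELIST.test(data.userAgent),
  ]);
  return ua.toString();
}
```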
---
backend/docs/_archive/ORGANIC_SCRAPING_GUIDE.md (new file, 297 lines)

@@ -0,0 +1,297 @@
# Organic Browser-Based Scraping Guide

**Last Updated:** 2025-12-12
**Status:** Production-ready proof of concept

---

## Overview

This document describes the "organic" browser-based approach to scraping Dutchie dispensary menus. Unlike direct curl/axios requests, this method uses a real browser session to make API calls, making requests appear natural and reducing detection risk.

---

## Why Organic Scraping?

| Approach | Detection Risk | Speed | Complexity |
|----------|---------------|-------|------------|
| Direct curl | Higher | Fast | Low |
| curl-impersonate | Medium | Fast | Medium |
| **Browser-based (organic)** | **Lowest** | Slower | Higher |

Direct curl requests can be fingerprinted via:
- TLS fingerprint (cipher suites, extensions)
- Header order and values
- Missing cookies/session data
- Request patterns

Browser-based requests inherit:
- A real Chrome TLS fingerprint
- Session cookies from the page visit
- Natural header order
- A JavaScript execution environment

---

## Implementation

### Dependencies

```bash
npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
```

### Core Script: `test-intercept.js`

Located at: `backend/test-intercept.js`

```javascript
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const fs = require('fs');

puppeteer.use(StealthPlugin());

async function capturePayload(config) {
  const { dispensaryId, platformId, cName, outputPath } = config;

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  const page = await browser.newPage();

  // STEP 1: Establish a session by visiting the menu
  const embedUrl = `https://dutchie.com/embedded-menu/${cName}?menuType=rec`;
  await page.goto(embedUrl, { waitUntil: 'networkidle2', timeout: 60000 });

  // STEP 2: Fetch ALL products using GraphQL from the browser context
  const result = await page.evaluate(async (platformId) => {
    const allProducts = [];
    let pageNum = 0;
    const perPage = 100;
    let totalCount = 0;
    const sessionId = 'browser-session-' + Date.now();

    while (pageNum < 30) {
      const variables = {
        includeEnterpriseSpecials: false,
        productsFilter: {
          dispensaryId: platformId,
          pricingType: 'rec',
          Status: 'Active', // CRITICAL: Must be 'Active', not null
          types: [],
          useCache: true,
          isDefaultSort: true,
          sortBy: 'popularSortIdx',
          sortDirection: 1,
          bypassOnlineThresholds: true,
          isKioskMenu: false,
          removeProductsBelowOptionThresholds: false,
        },
        page: pageNum,
        perPage: perPage,
      };

      const extensions = {
        persistedQuery: {
          version: 1,
          sha256Hash: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0'
        }
      };

      const qs = new URLSearchParams({
        operationName: 'FilteredProducts',
        variables: JSON.stringify(variables),
        extensions: JSON.stringify(extensions)
      });

      const response = await fetch(`https://dutchie.com/api-3/graphql?${qs}`, {
        method: 'GET',
        headers: {
          'Accept': 'application/json',
          'content-type': 'application/json',
          'x-dutchie-session': sessionId,
          'apollographql-client-name': 'Marketplace (production)',
        },
        credentials: 'include'
      });

      const json = await response.json();
      const data = json?.data?.filteredProducts;
      if (!data?.products) break;

      allProducts.push(...data.products);
      if (pageNum === 0) totalCount = data.queryInfo?.totalCount || 0;
      if (allProducts.length >= totalCount) break;

      pageNum++;
      await new Promise(r => setTimeout(r, 200)); // Polite delay
    }

    return { products: allProducts, totalCount };
  }, platformId);

  await browser.close();

  // STEP 3: Save the payload
  const payload = {
    dispensaryId,
    platformId,
    cName,
    fetchedAt: new Date().toISOString(),
    productCount: result.products.length,
    products: result.products,
  };

  fs.writeFileSync(outputPath, JSON.stringify(payload, null, 2));
  return payload;
}
```

---

## Critical Parameters

### GraphQL Hash (FilteredProducts)

```
ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0
```

**WARNING:** Using the wrong hash returns HTTP 400.

### Status Parameter

| Value | Result |
|-------|--------|
| `'Active'` | Returns in-stock products (1019 in test) |
| `null` | Returns 0 products |
| `'All'` | Returns HTTP 400 |

**ALWAYS use `Status: 'Active'`.**

### Required Headers

```javascript
{
  'Accept': 'application/json',
  'content-type': 'application/json',
  'x-dutchie-session': 'unique-session-id',
  'apollographql-client-name': 'Marketplace (production)',
}
```

### Endpoint

```
https://dutchie.com/api-3/graphql
```

---

## Performance Benchmarks

Test store: AZ-Deeply-Rooted (1019 products)

| Metric | Value |
|--------|-------|
| Total products | 1019 |
| Time | 18.5 seconds |
| Payload size | 11.8 MB |
| Pages fetched | 11 (100 per page) |
| Success rate | 100% |

---

## Payload Format

The output matches the existing `payload-fetch.ts` handler format:

```json
{
  "dispensaryId": 123,
  "platformId": "6405ef617056e8014d79101b",
  "cName": "AZ-Deeply-Rooted",
  "fetchedAt": "2025-12-12T05:05:19.837Z",
  "productCount": 1019,
  "products": [
    {
      "id": "6927508db4851262f629a869",
      "Name": "Product Name",
      "brand": { "name": "Brand Name", ... },
      "type": "Flower",
      "THC": "25%",
      "Prices": [...],
      "Options": [...],
      ...
    }
  ]
}
```

---

## Integration Points

### As a Task Handler

The organic approach can be integrated as an alternative to curl-based fetching:

```typescript
// In src/tasks/handlers/organic-payload-fetch.ts
export async function handleOrganicPayloadFetch(ctx: TaskContext): Promise<TaskResult> {
  // Use the puppeteer-based capture
  // Save to the same payload storage
  // Queue a product_refresh task
}
```

### Worker Configuration

Add to job_schedules:

```sql
INSERT INTO job_schedules (name, role, cron_expression)
VALUES ('organic_product_crawl', 'organic_payload_fetch', '0 */6 * * *');
```

---

## Troubleshooting

### HTTP 400 Bad Request
- Check that the hash is correct: `ee29c060...`
- Verify Status is `'Active'` (a string, not null)

### 0 Products Returned
- Status was likely `null` or `'All'` - use `'Active'`
- Check that the platformId is a valid MongoDB ObjectId

### Session Not Established
- Increase the timeout on the initial page.goto()
- Check that the cName is valid (matches the embedded-menu URL)

### Detection/Blocking
- StealthPlugin should handle most cases
- Add random delays between pages
- Use `headless: 'new'` (not true/false)

---

## Files Reference

| File | Purpose |
|------|---------|
| `backend/test-intercept.js` | Proof-of-concept script |
| `backend/src/platforms/dutchie/client.ts` | GraphQL hashes, curl implementation |
| `backend/src/tasks/handlers/payload-fetch.ts` | Current curl-based handler |
| `backend/src/utils/payload-storage.ts` | Payload save/load utilities |

---

## See Also

- `DUTCHIE_CRAWL_WORKFLOW.md` - Full crawl pipeline documentation
- `TASK_WORKFLOW_2024-12-10.md` - Task system architecture
- `CLAUDE.md` - Project rules and constraints
backend/docs/_archive/README.md (new file, 25 lines)

@@ -0,0 +1,25 @@
# ARCHIVED DOCUMENTATION

**WARNING: These docs may be outdated or inaccurate.**

The code has evolved significantly. These docs are kept for historical reference only.

## What to Use Instead

**The single source of truth is:**
- `CLAUDE.md` (root) - Essential rules and quick reference
- `docs/CODEBASE_MAP.md` - Current file/directory reference

## Why Archive?

These docs were written during development iterations and may reference:
- Old file paths that no longer exist
- Deprecated approaches (hydration, scraper-v2)
- APIs that have changed
- Database schemas that evolved

## If You Need Details

1. First check CODEBASE_MAP.md for current file locations
2. Then read the actual source code
3. Only use archive docs as a last resort for historical context
backend/docs/_archive/TASK_WORKFLOW_2024-12-10.md (new file, 584 lines)

@@ -0,0 +1,584 @@
# Task Workflow Documentation

**Date:** 2024-12-10

This document describes the complete task/job processing architecture after the 2024-12-10 rewrite.

---
## Complete Architecture

```
┌──────────────────────────────────────────────────────────────────────────────┐
│                             KUBERNETES CLUSTER                               │
├──────────────────────────────────────────────────────────────────────────────┤
│                                                                              │
│  ┌────────────────────────────────────────────────────────────────────────┐ │
│  │ API SERVER POD (scraper)                                               │ │
│  │                                                                        │ │
│  │  ┌──────────────────┐      ┌────────────────────────────────────────┐ │ │
│  │  │ Express API      │      │ TaskScheduler                          │ │ │
│  │  │                  │      │ (src/services/task-scheduler.ts)       │ │ │
│  │  │ /api/job-queue   │      │                                        │ │ │
│  │  │ /api/tasks       │      │ • Polls every 60s                      │ │ │
│  │  │ /api/schedules   │      │ • Checks task_schedules table          │ │ │
│  │  └────────┬─────────┘      │ • SELECT FOR UPDATE SKIP LOCKED        │ │ │
│  │           │                │ • Generates tasks when due             │ │ │
│  │           │                └──────────────────┬─────────────────────┘ │ │
│  │           │                                   │                       │ │
│  └───────────┼───────────────────────────────────┼───────────────────────┘ │
│              │                                   │                         │
│              │          ┌────────────────────────┘                         │
│              │          │                                                  │
│              ▼          ▼                                                  │
│  ┌────────────────────────────────────────────────────────────────────────┐ │
│  │ POSTGRESQL DATABASE                                                    │ │
│  │                                                                        │ │
│  │  ┌─────────────────────┐          ┌─────────────────────┐             │ │
│  │  │ task_schedules      │          │ worker_tasks        │             │ │
│  │  │                     │          │                     │             │ │
│  │  │ • product_refresh   │─────────►│ • pending tasks     │             │ │
│  │  │ • store_discovery   │  create  │ • claimed tasks     │             │ │
│  │  │ • analytics_refresh │  tasks   │ • running tasks     │             │ │
│  │  │                     │          │ • completed tasks   │             │ │
│  │  │ next_run_at         │          │                     │             │ │
│  │  │ last_run_at         │          │ role, dispensary_id │             │ │
│  │  │ interval_hours      │          │ priority, status    │             │ │
│  │  └─────────────────────┘          └──────────┬──────────┘             │ │
│  │                                              │                        │ │
│  └──────────────────────────────────────────────┼────────────────────────┘ │
│                                                 │                          │
│                        ┌────────────────────────┘                          │
│                        │ Workers poll for tasks                            │
│                        │ (SELECT FOR UPDATE SKIP LOCKED)                   │
│                        ▼                                                   │
│  ┌────────────────────────────────────────────────────────────────────────┐ │
│  │ WORKER PODS (StatefulSet: scraper-worker)                              │ │
│  │                                                                        │ │
│  │  ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐      │ │
│  │  │  Worker 0   │ │  Worker 1   │ │  Worker 2   │ │  Worker N   │      │ │
│  │  │             │ │             │ │             │ │             │      │ │
│  │  │ task-worker │ │ task-worker │ │ task-worker │ │ task-worker │      │ │
│  │  │ .ts         │ │ .ts         │ │ .ts         │ │ .ts         │      │ │
│  │  └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘      │ │
│  │                                                                        │ │
│  └────────────────────────────────────────────────────────────────────────┘ │
│                                                                              │
└──────────────────────────────────────────────────────────────────────────────┘
```

---
## Startup Sequence

```
┌─────────────────────────────────────────────────────────────────────────────┐
│                            API SERVER STARTUP                               │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  1. Express app initializes                                                 │
│         │                                                                   │
│         ▼                                                                   │
│  2. runAutoMigrations()                                                     │
│     • Runs pending migrations (including 079_task_schedules.sql)            │
│         │                                                                   │
│         ▼                                                                   │
│  3. initializeMinio() / initializeImageStorage()                            │
│         │                                                                   │
│         ▼                                                                   │
│  4. cleanupOrphanedJobs()                                                   │
│         │                                                                   │
│         ▼                                                                   │
│  5. taskScheduler.start()  ◄─── NEW (per TASK_WORKFLOW_2024-12-10.md)       │
│         │                                                                   │
│         ├── Recover stale tasks (workers that died)                         │
│         ├── Ensure default schedules exist in task_schedules                │
│         ├── Check and run any due schedules immediately                     │
│         └── Start 60-second poll interval                                   │
│         │                                                                   │
│         ▼                                                                   │
│  6. app.listen(PORT)                                                        │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘

┌─────────────────────────────────────────────────────────────────────────────┐
│                            WORKER POD STARTUP                               │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  1. K8s starts pod from StatefulSet                                         │
│         │                                                                   │
│         ▼                                                                   │
│  2. TaskWorker.constructor()                                                │
│     • Create DB pool                                                        │
│     • Create CrawlRotator                                                   │
│         │                                                                   │
│         ▼                                                                   │
│  3. initializeStealth()                                                     │
│     • Load proxies from DB (REQUIRED - fails if none)                       │
│     • Wire rotator to Dutchie client                                        │
│         │                                                                   │
│         ▼                                                                   │
│  4. register() with API                                                     │
│     • Optional - continues if fails                                         │
│         │                                                                   │
│         ▼                                                                   │
│  5. startRegistryHeartbeat() every 30s                                      │
│         │                                                                   │
│         ▼                                                                   │
│  6. processNextTask() loop                                                  │
│         │                                                                   │
│         ├── Poll for pending task (FOR UPDATE SKIP LOCKED)                  │
│         ├── Claim task atomically                                           │
│         ├── Execute handler (product_refresh, store_discovery, etc.)        │
│         ├── Mark complete/failed                                            │
│         ├── Chain next task if applicable                                   │
│         └── Loop                                                            │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```
---

## Schedule Flow

```
┌─────────────────────────────────────────────────────────────────────────────┐
│                     SCHEDULER POLL (every 60 seconds)                       │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  BEGIN TRANSACTION                                                          │
│         │                                                                   │
│         ▼                                                                   │
│  SELECT * FROM task_schedules                                               │
│  WHERE enabled = true AND next_run_at <= NOW()                              │
│  FOR UPDATE SKIP LOCKED  ◄─── Prevents duplicate execution across replicas  │
│         │                                                                   │
│         ▼                                                                   │
│  For each due schedule:                                                     │
│         │                                                                   │
│         ├── product_refresh_all                                             │
│         │     └─► Query dispensaries needing crawl                          │
│         │           └─► Create product_refresh tasks in worker_tasks        │
│         │                                                                   │
│         ├── store_discovery_dutchie                                         │
│         │     └─► Create single store_discovery task                        │
│         │                                                                   │
│         └── analytics_refresh                                               │
│               └─► Create single analytics_refresh task                      │
│         │                                                                   │
│         ▼                                                                   │
│  UPDATE task_schedules SET                                                  │
│    last_run_at = NOW(),                                                     │
│    next_run_at = NOW() + interval_hours                                     │
│         │                                                                   │
│         ▼                                                                   │
│  COMMIT                                                                     │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```
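The reschedule step at the bottom of the diagram maps onto a single UPDATE. A minimal sketch using `pg`, assuming the `task_schedules` columns shown later in this document (the transaction plumbing around it is elided):

```typescript
import type { PoolClient } from 'pg';

// Advance a due schedule inside the scheduler's transaction.
// interval_hours is an INTEGER column, so cast it to a Postgres interval.
async function advanceSchedule(client: PoolClient, scheduleId: number): Promise<void> {
  await client.query(
    `UPDATE task_schedules
        SET last_run_at = NOW(),
            next_run_at = NOW() + make_interval(hours => interval_hours)
      WHERE id = $1`,
    [scheduleId]
  );
}
```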
---

## Task Lifecycle

```
┌──────────┐
│ SCHEDULE │
│   DUE    │
└────┬─────┘
     │
     ▼
┌──────────────┐    claim    ┌──────────────┐    start    ┌──────────────┐
│   PENDING    │────────────►│   CLAIMED    │────────────►│   RUNNING    │
└──────────────┘             └──────────────┘             └──────┬───────┘
      ▲                                                          │
      │                                  ┌──────────────┬───────┴──────┐
      │ retry                            │              │              │
      │ (if retries < max)               ▼              ▼              ▼
      │                            ┌──────────┐   ┌──────────┐   ┌──────────┐
      └────────────────────────────│  FAILED  │   │ COMPLETED│   │  STALE   │
                                   └──────────┘   └──────────┘   └────┬─────┘
                                                                      │
                                                          recover_stale_tasks()
                                                                      │
                                                                      ▼
                                                                ┌──────────┐
                                                                │ PENDING  │
                                                                └──────────┘
```
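The STALE → PENDING edge is heartbeat-driven. A minimal sketch of what `recover_stale_tasks()` might run against the `worker_tasks` schema below; the 10-minute cutoff is an assumption, as this document does not specify the threshold:

```typescript
import { Pool } from 'pg';

// Requeue tasks whose worker stopped heartbeating; the threshold is an assumption.
export async function recoverStaleTasks(pool: Pool): Promise<number> {
  const { rowCount } = await pool.query(
    `UPDATE worker_tasks
        SET status = 'pending', worker_id = NULL, claimed_at = NULL
      WHERE status IN ('claimed', 'running')
        AND last_heartbeat_at < NOW() - INTERVAL '10 minutes'`
  );
  return rowCount ?? 0; // number of tasks returned to the queue
}
```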
---
|
||||
|
||||
## Database Tables
|
||||
|
||||
### task_schedules (NEW - migration 079)
|
||||
|
||||
Stores schedule definitions. Survives restarts.
|
||||
|
||||
```sql
|
||||
CREATE TABLE task_schedules (
|
||||
id SERIAL PRIMARY KEY,
|
||||
name VARCHAR(100) NOT NULL UNIQUE,
|
||||
role VARCHAR(50) NOT NULL, -- product_refresh, store_discovery, etc.
|
||||
enabled BOOLEAN DEFAULT TRUE,
|
||||
interval_hours INTEGER NOT NULL, -- How often to run
|
||||
priority INTEGER DEFAULT 0, -- Task priority when created
|
||||
state_code VARCHAR(2), -- Optional filter
|
||||
last_run_at TIMESTAMPTZ, -- When it last ran
|
||||
next_run_at TIMESTAMPTZ, -- When it's due next
|
||||
last_task_count INTEGER, -- Tasks created last run
|
||||
last_error TEXT -- Error message if failed
|
||||
);
|
||||
```
|
||||
|
||||
### worker_tasks (migration 074)
|
||||
|
||||
The task queue. Workers pull from here.
|
||||
|
||||
```sql
|
||||
CREATE TABLE worker_tasks (
|
||||
id SERIAL PRIMARY KEY,
|
||||
role task_role NOT NULL, -- What type of work
|
||||
dispensary_id INTEGER, -- Which store (if applicable)
|
||||
platform VARCHAR(50), -- Which platform
|
||||
status task_status DEFAULT 'pending',
|
||||
priority INTEGER DEFAULT 0, -- Higher = process first
|
||||
scheduled_for TIMESTAMP, -- Don't process before this time
|
||||
worker_id VARCHAR(100), -- Which worker claimed it
|
||||
claimed_at TIMESTAMP,
|
||||
started_at TIMESTAMP,
|
||||
completed_at TIMESTAMP,
|
||||
last_heartbeat_at TIMESTAMP, -- For stale detection
|
||||
result JSONB,
|
||||
error_message TEXT,
|
||||
retry_count INTEGER DEFAULT 0,
|
||||
max_retries INTEGER DEFAULT 3
|
||||
);
|
||||
```

---

## Default Schedules

| Name | Role | Interval | Priority | Description |
|------|------|----------|----------|-------------|
| `payload_fetch_all` | payload_fetch | 4 hours | 0 | Fetch payloads from Dutchie API (chains to product_refresh) |
| `store_discovery_dutchie` | store_discovery | 24 hours | 5 | Find new Dutchie stores |
| `analytics_refresh` | analytics_refresh | 6 hours | 0 | Refresh analytics materialized views |

---

## Task Roles

| Role | Description | Creates Tasks For |
|------|-------------|-------------------|
| `payload_fetch` | **NEW** - Fetch from Dutchie API, save to disk | Each dispensary needing crawl |
| `product_refresh` | **CHANGED** - Read local payload, normalize, upsert to DB | Chained from payload_fetch |
| `store_discovery` | Find new dispensaries, returns newStoreIds[] | Single task per platform |
| `entry_point_discovery` | **DEPRECATED** - Resolve platform IDs | No longer used |
| `product_discovery` | Initial product fetch for new stores | Chained from store_discovery |
| `analytics_refresh` | Refresh analytics materialized views | Single global task |

### Payload/Refresh Separation (2024-12-10)

The crawl workflow is now split into two phases:

```
payload_fetch (scheduled every 4h)
  └─► Hit Dutchie GraphQL API
      └─► Save raw JSON to /storage/payloads/{year}/{month}/{day}/store_{id}_{ts}.json.gz
          └─► Record metadata in raw_crawl_payloads table
              └─► Queue product_refresh task with payload_id

product_refresh (chained from payload_fetch)
  └─► Load payload from filesystem (NOT from API)
      └─► Normalize via DutchieNormalizer
          └─► Upsert to store_products
              └─► Create snapshots
                  └─► Track missing products
                      └─► Download images
```

**Benefits:**
- **Retry-friendly**: If normalize fails, re-run product_refresh without re-crawling
- **Replay-able**: Run product_refresh against any historical payload
- **Faster refreshes**: Local file read vs network call
- **Historical diffs**: Compare payloads to see what changed between crawls
- **Less API pressure**: Only payload_fetch hits Dutchie

---

## Task Chaining

Tasks automatically queue follow-up tasks upon successful completion. This creates two main flows:

### Discovery Flow (New Stores)

When `store_discovery` finds new dispensaries, they automatically get their initial product data:

```
store_discovery
  └─► Discovers new locations via Dutchie GraphQL
      └─► Auto-promotes valid locations to dispensaries table
          └─► Collects newDispensaryIds[] from promotions
              └─► Returns { newStoreIds: [...] } in result

chainNextTask() detects newStoreIds
  └─► Creates product_discovery task for each new store

product_discovery
  └─► Calls handlePayloadFetch() internally
      └─► payload_fetch hits Dutchie API
          └─► Saves raw JSON to /storage/payloads/
              └─► Queues product_refresh task with payload_id

product_refresh
  └─► Loads payload from filesystem
      └─► Normalizes and upserts to store_products
          └─► Creates snapshots, downloads images
```

**Complete Discovery Chain:**
```
store_discovery → product_discovery → payload_fetch → product_refresh
                                    (internal call)   (queues next)
```

### Scheduled Flow (Existing Stores)

For existing stores, the `payload_fetch_all` schedule runs every 4 hours:

```
TaskScheduler (every 60s)
  └─► Checks task_schedules for due schedules
      └─► payload_fetch_all is due
          └─► Generates payload_fetch task for each dispensary

payload_fetch
  └─► Hits Dutchie GraphQL API
      └─► Saves raw JSON to /storage/payloads/
          └─► Queues product_refresh task with payload_id

product_refresh
  └─► Loads payload from filesystem (NOT API)
      └─► Normalizes via DutchieNormalizer
          └─► Upserts to store_products
              └─► Creates snapshots
```

**Complete Scheduled Chain:**
```
payload_fetch → product_refresh
  (queues)       (reads local)
```

### Chaining Implementation

Task chaining is handled in three places:

1. **Internal chaining (handler calls handler):**
   - `product_discovery` calls `handlePayloadFetch()` directly

2. **External chaining (chainNextTask() in task-service.ts):**
   - Called after task completion
   - `store_discovery` → queues `product_discovery` for each newStoreId

3. **Queue-based chaining (taskService.createTask):**
   - `payload_fetch` queues `product_refresh` with `payload: { payload_id }`
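
The external chaining step (case 2) reduces to a small dispatch on the completed task's role and result. A simplified sketch, assuming the `taskService.createTask()` call named above; the real logic is in `src/tasks/task-service.ts`.

```typescript
// Minimal shape of a completed task for chaining purposes
interface CompletedTask {
  role: string;
  platform?: string;
  result?: { newStoreIds?: number[] };
}

declare const taskService: {
  createTask(input: Record<string, unknown>): Promise<void>;
};

async function chainNextTask(task: CompletedTask): Promise<void> {
  // store_discovery -> one product_discovery per newly promoted store
  if (task.role === 'store_discovery' && task.result?.newStoreIds?.length) {
    for (const dispensaryId of task.result.newStoreIds) {
      await taskService.createTask({
        role: 'product_discovery',
        dispensary_id: dispensaryId,
        platform: task.platform,
      });
    }
  }
}
```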

---

## Payload API Endpoints

Raw crawl payloads can be accessed via the Payloads API:

| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/payloads` | GET | List payload metadata (paginated) |
| `GET /api/payloads/:id` | GET | Get payload metadata by ID |
| `GET /api/payloads/:id/data` | GET | Get full payload JSON (decompressed) |
| `GET /api/payloads/store/:dispensaryId` | GET | List payloads for a store |
| `GET /api/payloads/store/:dispensaryId/latest` | GET | Get latest payload for a store |
| `GET /api/payloads/store/:dispensaryId/diff` | GET | Diff two payloads for changes |

### Payload Diff Response

The diff endpoint returns:
```json
{
  "success": true,
  "from": { "id": 123, "fetchedAt": "...", "productCount": 100 },
  "to": { "id": 456, "fetchedAt": "...", "productCount": 105 },
  "diff": {
    "added": 10,
    "removed": 5,
    "priceChanges": 8,
    "stockChanges": 12
  },
  "details": {
    "added": [...],
    "removed": [...],
    "priceChanges": [...],
    "stockChanges": [...]
  }
}
```
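
A minimal consumer of the two unambiguous endpoints above, as a sketch. `API_BASE_URL` and the exact shape of the latest-payload response (assumed here to include an `id` field) are illustrative assumptions.

```typescript
const API_BASE_URL = 'http://localhost:3000'; // assumption for illustration

async function getLatestPayload(dispensaryId: number): Promise<unknown> {
  // Latest payload metadata for a store
  const metaRes = await fetch(
    `${API_BASE_URL}/api/payloads/store/${dispensaryId}/latest`
  );
  const meta = await metaRes.json(); // assumed to carry the payload id

  // Full decompressed payload JSON by payload ID
  const dataRes = await fetch(`${API_BASE_URL}/api/payloads/${meta.id}/data`);
  return dataRes.json();
}
```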

---

## API Endpoints

### Schedules (NEW)

| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/schedules` | GET | List all schedules |
| `PUT /api/schedules/:id` | PUT | Update schedule |
| `POST /api/schedules/:id/trigger` | POST | Run schedule immediately |
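
A hedged sketch of driving these two endpoints from code. The schedule id, base URL, and the assumption that the PUT body mirrors the `task_schedules` columns are all illustrative.

```typescript
const API = 'http://localhost:3000'; // assumption for illustration

// Run schedule 3 right now
await fetch(`${API}/api/schedules/3/trigger`, { method: 'POST' });

// Disable schedule 3 (body fields assumed to mirror the table columns)
await fetch(`${API}/api/schedules/3`, {
  method: 'PUT',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ enabled: false }),
});
```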

### Task Creation (rewired 2024-12-10)

| Endpoint | Method | Description |
|----------|--------|-------------|
| `POST /api/job-queue/enqueue` | POST | Create single task |
| `POST /api/job-queue/enqueue-batch` | POST | Create batch tasks |
| `POST /api/job-queue/enqueue-state` | POST | Create tasks for state |
| `POST /api/tasks` | POST | Direct task creation |

### Task Management

| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/tasks` | GET | List tasks |
| `GET /api/tasks/:id` | GET | Get single task |
| `GET /api/tasks/counts` | GET | Task counts by status |
| `POST /api/tasks/recover-stale` | POST | Recover stale tasks |

---

## Key Files

| File | Purpose |
|------|---------|
| `src/services/task-scheduler.ts` | **NEW** - DB-driven scheduler |
| `src/tasks/task-worker.ts` | Worker that processes tasks |
| `src/tasks/task-service.ts` | Task CRUD operations |
| `src/tasks/handlers/payload-fetch.ts` | **NEW** - Fetches from API, saves to disk |
| `src/tasks/handlers/product-refresh.ts` | **CHANGED** - Reads from disk, processes to DB |
| `src/utils/payload-storage.ts` | **NEW** - Payload save/load utilities |
| `src/routes/tasks.ts` | Task API endpoints |
| `src/routes/job-queue.ts` | Job Queue UI endpoints (rewired) |
| `migrations/079_task_schedules.sql` | Schedule table |
| `migrations/080_raw_crawl_payloads.sql` | Payload metadata table |
| `migrations/081_payload_fetch_columns.sql` | payload, last_fetch_at columns |
| `migrations/074_worker_task_queue.sql` | Task queue table |

---

## Legacy Code (DEPRECATED)

| File | Status | Replacement |
|------|--------|-------------|
| `src/services/scheduler.ts` | DEPRECATED | `task-scheduler.ts` |
| `dispensary_crawl_jobs` table | ORPHANED | `worker_tasks` |
| `job_schedules` table | LEGACY | `task_schedules` |

---

## Dashboard Integration

Both pages remain wired to the dashboard:

| Page | Data Source | Actions |
|------|-------------|---------|
| **Job Queue** | `worker_tasks`, `task_schedules` | Create tasks, view schedules |
| **Task Queue** | `worker_tasks` | View tasks, recover stale |

---

## Multi-Replica Safety

The scheduler claims due schedules with `SELECT ... FOR UPDATE SKIP LOCKED` and keeps all of its state in the database, which ensures:

1. **Only one replica** executes a schedule at a time
2. **No duplicate tasks** created
3. **Survives pod restarts** - state in DB, not memory
4. **Self-healing** - recovers stale tasks on startup

```sql
-- This query is atomic across all API server replicas
SELECT * FROM task_schedules
WHERE enabled = true AND next_run_at <= NOW()
FOR UPDATE SKIP LOCKED
```

---

## Worker Scaling (K8s)

Workers run as a StatefulSet in Kubernetes. You can scale from the admin UI or CLI.

### From Admin UI

The Workers page (`/admin/workers`) provides:
- Current replica count display
- Scale up/down buttons
- Target replica input

### API Endpoints

| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/workers/k8s/replicas` | GET | Get current/desired replica counts |
| `POST /api/workers/k8s/scale` | POST | Scale to N replicas (body: `{ replicas: N }`) |
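
Scaling from code instead of kubectl reduces to one POST against the endpoint above. A sketch, with `API_BASE_URL` as an illustrative assumption.

```typescript
const API_BASE_URL = 'http://localhost:3000'; // assumption for illustration

async function scaleWorkers(replicas: number): Promise<unknown> {
  const res = await fetch(`${API_BASE_URL}/api/workers/k8s/scale`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ replicas }),
  });
  if (!res.ok) throw new Error(`Scale failed: ${res.status}`);
  return res.json();
}

// Check current vs desired counts, then scale to 10
const counts = await fetch(`${API_BASE_URL}/api/workers/k8s/replicas`)
  .then((r) => r.json());
console.log(counts);
await scaleWorkers(10);
```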

### From CLI

```bash
# View current replicas
kubectl get statefulset scraper-worker -n dispensary-scraper

# Scale to 10 workers
kubectl scale statefulset scraper-worker -n dispensary-scraper --replicas=10

# Scale down to 3 workers
kubectl scale statefulset scraper-worker -n dispensary-scraper --replicas=3
```

### Configuration

Environment variables for the API server:

| Variable | Default | Description |
|----------|---------|-------------|
| `K8S_NAMESPACE` | `dispensary-scraper` | Kubernetes namespace |
| `K8S_WORKER_STATEFULSET` | `scraper-worker` | StatefulSet name |

### RBAC Requirements

The API server pod needs these K8s permissions:

```yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: worker-scaler
  namespace: dispensary-scraper
rules:
  - apiGroups: ["apps"]
    resources: ["statefulsets"]
    verbs: ["get", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: scraper-worker-scaler
  namespace: dispensary-scraper
subjects:
  - kind: ServiceAccount
    name: default
    namespace: dispensary-scraper
roleRef:
  kind: Role
  name: worker-scaler
  apiGroup: rbac.authorization.k8s.io
```

@@ -362,6 +362,245 @@
SET status = 'pending', retry_count = retry_count + 1
WHERE status = 'failed' AND retry_count < max_retries;
```

## Concurrent Task Processing (Added 2024-12)

Workers can now process multiple tasks concurrently within a single worker instance. This improves throughput by using async I/O efficiently.

### Architecture

```
┌─────────────────────────────────────────────────────────────┐
│                         Pod (K8s)                           │
│                                                             │
│  ┌─────────────────────────────────────────────────────┐    │
│  │                     TaskWorker                      │    │
│  │                                                     │    │
│  │   ┌─────────┐  ┌─────────┐  ┌─────────┐             │    │
│  │   │ Task 1  │  │ Task 2  │  │ Task 3  │ (concurrent)│    │
│  │   └─────────┘  └─────────┘  └─────────┘             │    │
│  │                                                     │    │
│  │   Resource Monitor                                  │    │
│  │   ├── Memory: 65% (threshold: 85%)                  │    │
│  │   ├── CPU: 45% (threshold: 90%)                     │    │
│  │   └── Status: Normal                                │    │
│  └─────────────────────────────────────────────────────┘    │
└─────────────────────────────────────────────────────────────┘
```

### Environment Variables

| Variable | Default | Description |
|----------|---------|-------------|
| `MAX_CONCURRENT_TASKS` | 3 | Maximum tasks a worker will run concurrently |
| `MEMORY_BACKOFF_THRESHOLD` | 0.85 | Back off when heap memory exceeds 85% |
| `CPU_BACKOFF_THRESHOLD` | 0.90 | Back off when CPU exceeds 90% |
| `BACKOFF_DURATION_MS` | 10000 | How long to wait when backing off (10s) |

### How It Works

1. **Main Loop**: Worker continuously tries to fill up to `MAX_CONCURRENT_TASKS`
2. **Resource Monitoring**: Before claiming a new task, worker checks memory and CPU
3. **Backoff**: If resources exceed thresholds, worker pauses and stops claiming new tasks
4. **Concurrent Execution**: Tasks run concurrently as independent Promises, so they don't block each other
5. **Graceful Shutdown**: On SIGTERM/decommission, worker stops claiming but waits for active tasks
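
A condensed sketch of that loop. The real version lives at `src/tasks/task-worker.ts:462-516`; `claimTask()`, `runTask()`, and `shouldBackOff()` here stand in for the actual methods.

```typescript
declare function claimTask(): Promise<{ id: number } | null>;
declare function runTask(task: { id: number }): Promise<void>;
declare function shouldBackOff(): boolean;

const MAX_CONCURRENT_TASKS = Number(process.env.MAX_CONCURRENT_TASKS ?? 3);
const BACKOFF_DURATION_MS = Number(process.env.BACKOFF_DURATION_MS ?? 10_000);
const active = new Set<Promise<void>>();
const sleep = (ms: number) => new Promise<void>((r) => setTimeout(r, ms));

async function mainLoop(): Promise<void> {
  for (;;) {
    if (shouldBackOff()) {
      // Stop claiming; in-flight tasks keep running to completion
      await sleep(BACKOFF_DURATION_MS);
      continue;
    }
    if (active.size >= MAX_CONCURRENT_TASKS) {
      await Promise.race(active); // wait until any slot frees up
      continue;
    }
    const task = await claimTask();
    if (!task) {
      await sleep(5_000); // queue empty - poll again shortly
      continue;
    }
    // Run without awaiting so tasks execute concurrently
    const p = runTask(task)
      .catch(console.error)
      .finally(() => active.delete(p));
    active.add(p);
  }
}
```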

### Resource Monitoring

```typescript
// ResourceStats interface
interface ResourceStats {
  memoryPercent: number;   // Current heap usage as decimal (0.0-1.0)
  memoryMb: number;        // Current heap used in MB
  memoryTotalMb: number;   // Total heap available in MB
  cpuPercent: number;      // CPU usage as percentage (0-100)
  isBackingOff: boolean;   // True if worker is in backoff state
  backoffReason: string;   // Why the worker is backing off
}
```
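
A plausible way to populate that interface with Node's `process.memoryUsage()` and `process.cpuUsage()`; the real method is at `src/tasks/task-worker.ts:149-179`, so treat this as a sketch under those assumptions.

```typescript
const MEMORY_BACKOFF_THRESHOLD = Number(process.env.MEMORY_BACKOFF_THRESHOLD ?? 0.85);

let lastCpu = process.cpuUsage();
let lastCpuAt = Date.now();

function getResourceStats(): ResourceStats {
  const mem = process.memoryUsage();
  const memoryPercent = mem.heapUsed / mem.heapTotal;

  // CPU percent = CPU time consumed since last sample / wall-clock elapsed.
  // cpuUsage() reports microseconds, so divide by 1000 for milliseconds.
  const cpu = process.cpuUsage(lastCpu);
  const elapsedMs = Date.now() - lastCpuAt;
  const cpuPercent = ((cpu.user + cpu.system) / 1000 / Math.max(elapsedMs, 1)) * 100;
  lastCpu = process.cpuUsage();
  lastCpuAt = Date.now();

  const isBackingOff = memoryPercent > MEMORY_BACKOFF_THRESHOLD;
  return {
    memoryPercent,
    memoryMb: Math.round(mem.heapUsed / 1024 / 1024),
    memoryTotalMb: Math.round(mem.heapTotal / 1024 / 1024),
    cpuPercent,
    isBackingOff,
    backoffReason: isBackingOff
      ? `Memory at ${(memoryPercent * 100).toFixed(1)}%`
      : '',
  };
}
```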

### Heartbeat Data

Workers report the following in their heartbeat:

```json
{
  "worker_id": "worker-abc123",
  "current_task_id": 456,
  "current_task_ids": [456, 457, 458],
  "active_task_count": 3,
  "max_concurrent_tasks": 3,
  "status": "active",
  "resources": {
    "memory_mb": 256,
    "memory_total_mb": 512,
    "memory_rss_mb": 320,
    "memory_percent": 50,
    "cpu_user_ms": 12500,
    "cpu_system_ms": 3200,
    "cpu_percent": 45,
    "is_backing_off": false,
    "backoff_reason": null
  }
}
```
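
Sending that payload is a timed POST. The route is handled in `src/routes/worker-registry.ts:148-195`; the path used in this sketch is a hypothetical placeholder, not the confirmed route.

```typescript
async function sendHeartbeat(body: Record<string, unknown>): Promise<void> {
  const base = process.env.API_BASE_URL ?? 'http://localhost:3000';
  // NOTE: path below is an assumption for illustration
  await fetch(`${base}/api/worker-registry/heartbeat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body),
  });
}

setInterval(() => {
  sendHeartbeat({
    worker_id: 'worker-abc123',
    active_task_count: 3,
    status: 'active',
  }).catch(console.error);
}, 30_000); // every 30s, matching startRegistryHeartbeat()
```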

### Backoff Behavior

When resources exceed thresholds:

1. Worker logs the backoff reason:
   ```
   [TaskWorker] MyWorker backing off: Memory at 87.3% (threshold: 85%)
   ```

2. Worker stops claiming new tasks but continues existing tasks

3. After `BACKOFF_DURATION_MS`, worker rechecks resources

4. When resources return to normal:
   ```
   [TaskWorker] MyWorker resuming normal operation
   ```

### UI Display

The Workers Dashboard shows:

- **Tasks Column**: `2/3 tasks` (active/max concurrent)
- **Resources Column**: Memory % and CPU % with color coding
  - Green: < 50%
  - Yellow: 50-74%
  - Amber: 75-89%
  - Red: 90%+
- **Backing Off**: Orange warning badge when worker is in backoff state

### Task Count Badge Details

```
┌─────────────────────────────────────────────┐
│ Worker: "MyWorker"                          │
│ Tasks: 2/3 tasks  #456, #457                │
│ Resources: 🧠 65%  💻 45%                   │
│ Status: ● Active                            │
└─────────────────────────────────────────────┘
```

### Best Practices

1. **Start Conservative**: Use `MAX_CONCURRENT_TASKS=3` initially
2. **Monitor Resources**: Watch for frequent backoffs in logs
3. **Tune Per Workload**: I/O-bound tasks benefit from higher concurrency
4. **Scale Horizontally**: Add more pods rather than cranking concurrency too high

### Code References

| File | Purpose |
|------|---------|
| `src/tasks/task-worker.ts:68-71` | Concurrency environment variables |
| `src/tasks/task-worker.ts:104-111` | ResourceStats interface |
| `src/tasks/task-worker.ts:149-179` | getResourceStats() method |
| `src/tasks/task-worker.ts:184-196` | shouldBackOff() method |
| `src/tasks/task-worker.ts:462-516` | mainLoop() with concurrent claiming |
| `src/routes/worker-registry.ts:148-195` | Heartbeat endpoint handling |
| `cannaiq/src/pages/WorkersDashboard.tsx:233-305` | UI components for resources |

## Browser Task Memory Limits (Updated 2025-12)

Browser-based tasks (Puppeteer/Chrome) have strict memory constraints that limit concurrency.

### Why Browser Tasks Are Different

Each browser task launches a Chrome process. Unlike I/O-bound API calls, browsers consume significant RAM:

| Component | RAM Usage |
|-----------|-----------|
| Node.js runtime | ~150 MB |
| Chrome browser (base) | ~200-250 MB |
| Dutchie menu page (loaded) | ~100-150 MB |
| **Per browser total** | **~350-450 MB** |

### Memory Math for Pod Limits

```
Pod memory limit:        2 GB (2000 MB)
Node.js runtime:         -150 MB
Safety buffer:           -100 MB
────────────────────────────────
Available for browsers:  1750 MB

Per browser + page:      ~400 MB

Max browsers:            1750 ÷ 400 = ~4 browsers

Recommended:             3 browsers (leaves headroom for spikes)
```
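
The same budget as a small helper, a sketch with every constant taken from the table and math above.

```typescript
// Returns the hard ceiling of browsers a pod can hold; run one below it.
function maxBrowsersForPod(podLimitMb: number): number {
  const NODE_RUNTIME_MB = 150;  // Node.js runtime (table above)
  const SAFETY_BUFFER_MB = 100; // headroom for spikes
  const PER_BROWSER_MB = 400;   // Chrome base + loaded menu page
  const available = podLimitMb - NODE_RUNTIME_MB - SAFETY_BUFFER_MB;
  return Math.floor(available / PER_BROWSER_MB);
}

maxBrowsersForPod(2000); // => 4; the recommended setting is 3
```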

### MAX_CONCURRENT_TASKS for Browser Tasks

| Browsers per Pod | RAM Used | Risk Level |
|------------------|----------|------------|
| 1 | ~500 MB | Very safe |
| 2 | ~900 MB | Safe |
| **3** | **~1.3 GB** | **Recommended** |
| 4 | ~1.7 GB | Tight (may OOM) |
| 5+ | >2 GB | Will OOM crash |

**CRITICAL**: `MAX_CONCURRENT_TASKS=3` is the maximum safe value for browser tasks with current pod limits.

### Scaling Strategy

Scale **horizontally** (more pods) rather than vertically (more concurrency per pod):

```
┌─────────────────────────────────────────────────────────────────────────┐
│  Cluster: 8 pods × 3 browsers = 24 concurrent tasks                     │
│                                                                         │
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐     │
│  │    Pod 0    │  │    Pod 1    │  │    Pod 2    │  │    Pod 3    │     │
│  │  3 browsers │  │  3 browsers │  │  3 browsers │  │  3 browsers │     │
│  └─────────────┘  └─────────────┘  └─────────────┘  └─────────────┘     │
│                                                                         │
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐     │
│  │    Pod 4    │  │    Pod 5    │  │    Pod 6    │  │    Pod 7    │     │
│  │  3 browsers │  │  3 browsers │  │  3 browsers │  │  3 browsers │     │
│  └─────────────┘  └─────────────┘  └─────────────┘  └─────────────┘     │
└─────────────────────────────────────────────────────────────────────────┘
```

### Browser Lifecycle Per Task

Each task gets a fresh browser with fresh IP/identity:

```
1. Claim task from queue
2. Get fresh proxy from pool
3. Launch browser with proxy
4. Run preflight (verify IP)
5. Execute scrape
6. Close browser
7. Repeat
```

This ensures:
- Fresh IP per task (proxy rotation)
- Fresh fingerprint per task (UA rotation)
- No cookie/session bleed between tasks
- Predictable memory usage
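
That lifecycle maps onto a short Puppeteer sketch. `getFreshProxy()` and `verifyIp()` are hypothetical helpers standing in for the proxy pool and the preflight step; the try/finally guarantees the browser closes even when a scrape fails, which is what keeps memory usage predictable.

```typescript
import puppeteer from 'puppeteer';

declare function getFreshProxy(): Promise<{
  host: string; port: number; username: string; password: string;
}>;
declare function verifyIp(page: import('puppeteer').Page): Promise<void>;

async function runBrowserTask(menuUrl: string): Promise<string> {
  const proxy = await getFreshProxy();                        // 2. fresh proxy
  const browser = await puppeteer.launch({                    // 3. launch with proxy
    args: [`--proxy-server=${proxy.host}:${proxy.port}`],
  });
  try {
    const page = await browser.newPage();
    await page.authenticate({ username: proxy.username, password: proxy.password });
    await verifyIp(page);                                     // 4. preflight
    await page.goto(menuUrl, { waitUntil: 'networkidle2' }); // 5. scrape
    return await page.content();
  } finally {
    await browser.close();                                    // 6. always close
  }
}
```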

### Increasing Capacity

To handle more concurrent tasks:

1. **Add more pods** (up to 8 per CLAUDE.md limit)
2. **Increase pod memory** (allows 4 browsers per pod):
   ```yaml
   resources:
     limits:
       memory: "2.5Gi"  # from 2Gi
   ```

**DO NOT** simply increase `MAX_CONCURRENT_TASKS` without also increasing pod memory limits.

## Monitoring

### Logs

backend/k8s/scraper-worker-statefulset.yaml (new file, 77 lines)
@@ -0,0 +1,77 @@
apiVersion: v1
kind: Service
metadata:
  name: scraper-worker
  namespace: dispensary-scraper
  labels:
    app: scraper-worker
spec:
  clusterIP: None  # Headless service required for StatefulSet
  selector:
    app: scraper-worker
  ports:
    - port: 3010
      name: http
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: scraper-worker
  namespace: dispensary-scraper
spec:
  serviceName: scraper-worker
  replicas: 8
  podManagementPolicy: Parallel  # Start all pods at once
  updateStrategy:
    type: OnDelete  # Pods only update when manually deleted - no automatic restarts
  selector:
    matchLabels:
      app: scraper-worker
  template:
    metadata:
      labels:
        app: scraper-worker
    spec:
      terminationGracePeriodSeconds: 60
      imagePullSecrets:
        - name: regcred
      containers:
        - name: worker
          image: code.cannabrands.app/creationshop/dispensary-scraper:latest
          imagePullPolicy: Always
          command: ["node"]
          args: ["dist/tasks/task-worker.js"]
          env:
            - name: WORKER_MODE
              value: "true"
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: MAX_CONCURRENT_TASKS
              value: "50"
            - name: API_BASE_URL
              value: http://scraper
            - name: NODE_OPTIONS
              value: --max-old-space-size=1500
          envFrom:
            - configMapRef:
                name: scraper-config
            - secretRef:
                name: scraper-secrets
          resources:
            requests:
              cpu: 100m
              memory: 1Gi
            limits:
              cpu: 500m
              memory: 2Gi
          livenessProbe:
            exec:
              command:
                - /bin/sh
                - -c
                - pgrep -f 'task-worker' > /dev/null
            initialDelaySeconds: 10
            periodSeconds: 30
            failureThreshold: 3

backend/migrations/074_worker_commands.sql (new file, 27 lines)
@@ -0,0 +1,27 @@
-- Migration: Worker Commands Table
-- Purpose: Store commands for workers (decommission, etc.)
-- Workers poll this table after each task to check for commands

CREATE TABLE IF NOT EXISTS worker_commands (
  id SERIAL PRIMARY KEY,
  worker_id TEXT NOT NULL,
  command TEXT NOT NULL,         -- 'decommission', 'pause', 'resume'
  reason TEXT,
  issued_by TEXT,
  issued_at TIMESTAMPTZ DEFAULT NOW(),
  acknowledged_at TIMESTAMPTZ,
  executed_at TIMESTAMPTZ,
  status TEXT DEFAULT 'pending'  -- 'pending', 'acknowledged', 'executed', 'cancelled'
);

-- Index for worker lookups
CREATE INDEX IF NOT EXISTS idx_worker_commands_worker_id ON worker_commands(worker_id);
CREATE INDEX IF NOT EXISTS idx_worker_commands_pending ON worker_commands(worker_id, status) WHERE status = 'pending';

-- Add decommission_requested column to worker_registry for quick checks
ALTER TABLE worker_registry ADD COLUMN IF NOT EXISTS decommission_requested BOOLEAN DEFAULT FALSE;
ALTER TABLE worker_registry ADD COLUMN IF NOT EXISTS decommission_reason TEXT;
ALTER TABLE worker_registry ADD COLUMN IF NOT EXISTS decommission_requested_at TIMESTAMPTZ;

-- Comment
COMMENT ON TABLE worker_commands IS 'Commands issued to workers (decommission after task, pause, etc.)';

backend/migrations/078_proxy_consecutive_403.sql (new file, 8 lines)
@@ -0,0 +1,8 @@
-- Migration 078: Add consecutive_403_count to proxies table
-- Per workflow-12102025.md: Track consecutive 403s per proxy
-- After 3 consecutive 403s with different fingerprints → disable proxy

ALTER TABLE proxies ADD COLUMN IF NOT EXISTS consecutive_403_count INTEGER DEFAULT 0;

-- Add comment explaining the column
COMMENT ON COLUMN proxies.consecutive_403_count IS 'Tracks consecutive 403 blocks. Reset to 0 on success. Proxy disabled at 3.';

backend/migrations/079_task_schedules.sql (new file, 49 lines)
@@ -0,0 +1,49 @@
-- Migration 079: Task Schedules for Database-Driven Scheduler
-- Per TASK_WORKFLOW_2024-12-10.md: Replaces node-cron with DB-driven scheduling
--
-- 2024-12-10: Created for reliable, multi-replica-safe task scheduling

-- task_schedules: Stores schedule definitions and state
CREATE TABLE IF NOT EXISTS task_schedules (
  id SERIAL PRIMARY KEY,
  name VARCHAR(100) NOT NULL UNIQUE,
  role VARCHAR(50) NOT NULL,  -- TaskRole: product_refresh, store_discovery, etc.
  description TEXT,

  -- Schedule configuration
  enabled BOOLEAN DEFAULT TRUE,
  interval_hours INTEGER NOT NULL DEFAULT 4,
  priority INTEGER DEFAULT 0,

  -- Optional scope filters
  state_code VARCHAR(2),  -- NULL = all states
  platform VARCHAR(50),   -- NULL = all platforms

  -- Execution state (updated by scheduler)
  last_run_at TIMESTAMPTZ,
  next_run_at TIMESTAMPTZ,
  last_task_count INTEGER DEFAULT 0,
  last_error TEXT,

  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Indexes for scheduler queries
CREATE INDEX IF NOT EXISTS idx_task_schedules_enabled ON task_schedules(enabled) WHERE enabled = TRUE;
CREATE INDEX IF NOT EXISTS idx_task_schedules_next_run ON task_schedules(next_run_at) WHERE enabled = TRUE;

-- Insert default schedules
INSERT INTO task_schedules (name, role, interval_hours, priority, description, next_run_at)
VALUES
  ('product_refresh_all', 'product_refresh', 4, 0, 'Generate product refresh tasks for all crawl-enabled stores every 4 hours', NOW()),
  ('store_discovery_dutchie', 'store_discovery', 24, 5, 'Discover new Dutchie stores daily', NOW()),
  ('analytics_refresh', 'analytics_refresh', 6, 0, 'Refresh analytics materialized views every 6 hours', NOW())
ON CONFLICT (name) DO NOTHING;

-- Comment for documentation
COMMENT ON TABLE task_schedules IS 'Database-driven task scheduler configuration. Per TASK_WORKFLOW_2024-12-10.md:
- Schedules persist in DB (survive restarts)
- Uses SELECT FOR UPDATE SKIP LOCKED for multi-replica safety
- Scheduler polls every 60s and executes due schedules
- Creates tasks in worker_tasks for task-worker.ts to process';

backend/migrations/080_raw_crawl_payloads.sql (new file, 58 lines)
@@ -0,0 +1,58 @@
-- Migration 080: Raw Crawl Payloads Metadata Table
-- Per TASK_WORKFLOW_2024-12-10.md: Store full GraphQL payloads for historical analysis
--
-- Design Pattern: Metadata/Payload Separation
-- - Metadata (this table): Small, indexed, queryable
-- - Payload (filesystem): Gzipped JSON at storage_path
--
-- Benefits:
-- - Compare any two crawls to see what changed
-- - Replay/re-normalize historical data if logic changes
-- - Debug issues by seeing exactly what the API returned
-- - DB stays small, backups stay fast
--
-- Storage location: /storage/payloads/{year}/{month}/{day}/store_{id}_{timestamp}.json.gz
-- Compression: ~90% reduction (1.5MB -> 150KB per crawl)

CREATE TABLE IF NOT EXISTS raw_crawl_payloads (
  id SERIAL PRIMARY KEY,

  -- Links to crawl tracking
  crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,

  -- File location (gzipped JSON)
  storage_path TEXT NOT NULL,

  -- Metadata for quick queries without loading file
  product_count INTEGER NOT NULL DEFAULT 0,
  size_bytes INTEGER,      -- Compressed size
  size_bytes_raw INTEGER,  -- Uncompressed size

  -- Timestamps
  fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  -- Optional: checksum for integrity verification
  checksum_sha256 VARCHAR(64)
);

-- Indexes for common queries
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_dispensary
  ON raw_crawl_payloads(dispensary_id);

CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_dispensary_fetched
  ON raw_crawl_payloads(dispensary_id, fetched_at DESC);

CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_fetched
  ON raw_crawl_payloads(fetched_at DESC);

CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_crawl_run
  ON raw_crawl_payloads(crawl_run_id)
  WHERE crawl_run_id IS NOT NULL;

-- Comments
COMMENT ON TABLE raw_crawl_payloads IS 'Metadata for raw GraphQL payloads stored on filesystem. Per TASK_WORKFLOW_2024-12-10.md: Full payloads enable historical diffs and replay.';
COMMENT ON COLUMN raw_crawl_payloads.storage_path IS 'Path to gzipped JSON file, e.g. /storage/payloads/2024/12/10/store_123_1702234567.json.gz';
COMMENT ON COLUMN raw_crawl_payloads.size_bytes IS 'Compressed file size in bytes';
COMMENT ON COLUMN raw_crawl_payloads.size_bytes_raw IS 'Uncompressed payload size in bytes';

backend/migrations/081_payload_fetch_columns.sql (new file, 37 lines)
@@ -0,0 +1,37 @@
-- Migration 081: Payload Fetch Columns
-- Per TASK_WORKFLOW_2024-12-10.md: Separates API fetch from data processing
--
-- New architecture:
-- - payload_fetch: Hits Dutchie API, saves raw payload to disk
-- - product_refresh: Reads local payload, normalizes, upserts to DB
--
-- This migration adds:
-- 1. payload column to worker_tasks (for task chaining data)
-- 2. processed_at column to raw_crawl_payloads (track when payload was processed)
-- 3. last_fetch_at column to dispensaries (track when last payload was fetched)

-- Add payload column to worker_tasks for task chaining
-- Used by payload_fetch to pass payload_id to product_refresh
ALTER TABLE worker_tasks
  ADD COLUMN IF NOT EXISTS payload JSONB DEFAULT NULL;

COMMENT ON COLUMN worker_tasks.payload IS 'Per TASK_WORKFLOW_2024-12-10.md: Task chaining data (e.g., payload_id from payload_fetch to product_refresh)';

-- Add processed_at to raw_crawl_payloads
-- Tracks when the payload was processed by product_refresh
ALTER TABLE raw_crawl_payloads
  ADD COLUMN IF NOT EXISTS processed_at TIMESTAMPTZ DEFAULT NULL;

COMMENT ON COLUMN raw_crawl_payloads.processed_at IS 'When this payload was processed by product_refresh handler';

-- Index for finding unprocessed payloads
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_unprocessed
  ON raw_crawl_payloads(dispensary_id, fetched_at DESC)
  WHERE processed_at IS NULL;

-- Add last_fetch_at to dispensaries
-- Tracks when the last payload was fetched (separate from last_crawl_at which is when processing completed)
ALTER TABLE dispensaries
  ADD COLUMN IF NOT EXISTS last_fetch_at TIMESTAMPTZ DEFAULT NULL;

COMMENT ON COLUMN dispensaries.last_fetch_at IS 'Per TASK_WORKFLOW_2024-12-10.md: When last payload was fetched from API (separate from last_crawl_at which is when processing completed)';

backend/migrations/082_proxy_notification_trigger.sql (new file, 27 lines)
@@ -0,0 +1,27 @@
-- Migration: 082_proxy_notification_trigger
-- Date: 2024-12-11
-- Description: Add PostgreSQL NOTIFY trigger to alert workers when proxies are added

-- Create function to notify workers when active proxy is added/activated
CREATE OR REPLACE FUNCTION notify_proxy_added()
RETURNS TRIGGER AS $$
BEGIN
  -- Only notify if proxy is active
  IF NEW.active = true THEN
    PERFORM pg_notify('proxy_added', NEW.id::text);
  END IF;
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Drop existing trigger if any
DROP TRIGGER IF EXISTS proxy_added_trigger ON proxies;

-- Create trigger on insert and update of active column
CREATE TRIGGER proxy_added_trigger
  AFTER INSERT OR UPDATE OF active ON proxies
  FOR EACH ROW
  EXECUTE FUNCTION notify_proxy_added();

COMMENT ON FUNCTION notify_proxy_added() IS
  'Sends PostgreSQL NOTIFY to proxy_added channel when an active proxy is added or activated. Workers LISTEN on this channel to wake up immediately.';

backend/migrations/083_discovery_runs.sql (new file, 88 lines)
@@ -0,0 +1,88 @@
-- Migration 083: Discovery Run Tracking
-- Tracks progress of store discovery runs step-by-step

-- Main discovery runs table
CREATE TABLE IF NOT EXISTS discovery_runs (
  id SERIAL PRIMARY KEY,
  platform VARCHAR(50) NOT NULL DEFAULT 'dutchie',
  status VARCHAR(20) NOT NULL DEFAULT 'running',  -- running, completed, failed
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,
  task_id INTEGER REFERENCES worker_task_queue(id),

  -- Totals
  states_total INTEGER DEFAULT 0,
  states_completed INTEGER DEFAULT 0,
  locations_discovered INTEGER DEFAULT 0,
  locations_promoted INTEGER DEFAULT 0,
  new_store_ids INTEGER[] DEFAULT '{}',

  -- Error info
  error_message TEXT,

  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Per-state progress within a run
CREATE TABLE IF NOT EXISTS discovery_run_states (
  id SERIAL PRIMARY KEY,
  run_id INTEGER NOT NULL REFERENCES discovery_runs(id) ON DELETE CASCADE,
  state_code VARCHAR(2) NOT NULL,
  status VARCHAR(20) NOT NULL DEFAULT 'pending',  -- pending, running, completed, failed
  started_at TIMESTAMPTZ,
  finished_at TIMESTAMPTZ,

  -- Results
  cities_found INTEGER DEFAULT 0,
  locations_found INTEGER DEFAULT 0,
  locations_upserted INTEGER DEFAULT 0,
  new_dispensary_ids INTEGER[] DEFAULT '{}',

  -- Error info
  error_message TEXT,

  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

  UNIQUE(run_id, state_code)
);

-- Step-by-step log for detailed progress tracking
CREATE TABLE IF NOT EXISTS discovery_run_steps (
  id SERIAL PRIMARY KEY,
  run_id INTEGER NOT NULL REFERENCES discovery_runs(id) ON DELETE CASCADE,
  state_code VARCHAR(2),
  step_name VARCHAR(100) NOT NULL,
  status VARCHAR(20) NOT NULL DEFAULT 'started',  -- started, completed, failed
  started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  finished_at TIMESTAMPTZ,

  -- Details (JSON for flexibility)
  details JSONB DEFAULT '{}',

  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Indexes for querying
CREATE INDEX IF NOT EXISTS idx_discovery_runs_status ON discovery_runs(status);
CREATE INDEX IF NOT EXISTS idx_discovery_runs_platform ON discovery_runs(platform);
CREATE INDEX IF NOT EXISTS idx_discovery_runs_started_at ON discovery_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_discovery_run_states_run_id ON discovery_run_states(run_id);
CREATE INDEX IF NOT EXISTS idx_discovery_run_steps_run_id ON discovery_run_steps(run_id);

-- View for latest run status per platform
CREATE OR REPLACE VIEW v_latest_discovery_runs AS
SELECT DISTINCT ON (platform)
  id,
  platform,
  status,
  started_at,
  finished_at,
  states_total,
  states_completed,
  locations_discovered,
  locations_promoted,
  array_length(new_store_ids, 1) as new_stores_count,
  error_message,
  EXTRACT(EPOCH FROM (COALESCE(finished_at, NOW()) - started_at)) as duration_seconds
FROM discovery_runs
ORDER BY platform, started_at DESC;

backend/migrations/084_dual_transport_preflight.sql (new file, 253 lines)
@@ -0,0 +1,253 @@
-- Migration 084: Dual Transport Preflight System
-- Workers run both curl and http (Puppeteer) preflights on startup
-- Tasks can require a specific transport method

-- ===================================================================
-- PART 1: Add preflight columns to worker_registry
-- ===================================================================

-- Preflight status for curl/axios transport (proxy-based)
ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS preflight_curl_status VARCHAR(20) DEFAULT 'pending';

-- Preflight status for http/Puppeteer transport (browser-based)
ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS preflight_http_status VARCHAR(20) DEFAULT 'pending';

-- Timestamps for when each preflight completed
ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS preflight_curl_at TIMESTAMPTZ;

ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS preflight_http_at TIMESTAMPTZ;

-- Error messages for failed preflights
ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS preflight_curl_error TEXT;

ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS preflight_http_error TEXT;

-- Response time for successful preflights (ms)
ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS preflight_curl_ms INTEGER;

ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS preflight_http_ms INTEGER;

-- Constraints for preflight status values
ALTER TABLE worker_registry
  DROP CONSTRAINT IF EXISTS valid_preflight_curl_status;

ALTER TABLE worker_registry
  ADD CONSTRAINT valid_preflight_curl_status
  CHECK (preflight_curl_status IN ('pending', 'passed', 'failed', 'skipped'));

ALTER TABLE worker_registry
  DROP CONSTRAINT IF EXISTS valid_preflight_http_status;

ALTER TABLE worker_registry
  ADD CONSTRAINT valid_preflight_http_status
  CHECK (preflight_http_status IN ('pending', 'passed', 'failed', 'skipped'));

-- ===================================================================
-- PART 2: Add method column to worker_tasks
-- ===================================================================

-- Transport method requirement for the task
-- NULL = no preference (any worker can claim)
-- 'curl' = requires curl/axios transport (proxy-based, fast)
-- 'http' = requires http/Puppeteer transport (browser-based, anti-detect)
ALTER TABLE worker_tasks
  ADD COLUMN IF NOT EXISTS method VARCHAR(10);

-- Constraint for valid method values
ALTER TABLE worker_tasks
  DROP CONSTRAINT IF EXISTS valid_task_method;

ALTER TABLE worker_tasks
  ADD CONSTRAINT valid_task_method
  CHECK (method IS NULL OR method IN ('curl', 'http'));

-- Index for method-based task claiming
CREATE INDEX IF NOT EXISTS idx_worker_tasks_method
  ON worker_tasks(method)
  WHERE status = 'pending';

-- Set default method for all existing pending tasks to 'http'
-- ALL current tasks require Puppeteer/browser-based transport
UPDATE worker_tasks
SET method = 'http'
WHERE method IS NULL;

-- ===================================================================
-- PART 3: Update claim_task function for method compatibility
-- ===================================================================

CREATE OR REPLACE FUNCTION claim_task(
  p_role VARCHAR(50),
  p_worker_id VARCHAR(100),
  p_curl_passed BOOLEAN DEFAULT TRUE,
  p_http_passed BOOLEAN DEFAULT FALSE
) RETURNS worker_tasks AS $$
DECLARE
  claimed_task worker_tasks;
BEGIN
  UPDATE worker_tasks
  SET
    status = 'claimed',
    worker_id = p_worker_id,
    claimed_at = NOW(),
    updated_at = NOW()
  WHERE id = (
    SELECT id FROM worker_tasks
    WHERE role = p_role
      AND status = 'pending'
      AND (scheduled_for IS NULL OR scheduled_for <= NOW())
      -- Method compatibility: worker must have passed the required preflight
      AND (
        method IS NULL  -- No preference, any worker can claim
        OR (method = 'curl' AND p_curl_passed = TRUE)
        OR (method = 'http' AND p_http_passed = TRUE)
      )
      -- Exclude stores that already have an active task
      AND (dispensary_id IS NULL OR dispensary_id NOT IN (
        SELECT dispensary_id FROM worker_tasks
        WHERE status IN ('claimed', 'running')
          AND dispensary_id IS NOT NULL
      ))
    ORDER BY priority DESC, created_at ASC
    LIMIT 1
    FOR UPDATE SKIP LOCKED
  )
  RETURNING * INTO claimed_task;

  RETURN claimed_task;
END;
$$ LANGUAGE plpgsql;

-- ===================================================================
-- PART 4: Update v_active_workers view
-- ===================================================================

DROP VIEW IF EXISTS v_active_workers;

CREATE VIEW v_active_workers AS
SELECT
  wr.id,
  wr.worker_id,
  wr.friendly_name,
  wr.role,
  wr.status,
  wr.pod_name,
  wr.hostname,
  wr.started_at,
  wr.last_heartbeat_at,
  wr.last_task_at,
  wr.tasks_completed,
  wr.tasks_failed,
  wr.current_task_id,
  -- Preflight status
  wr.preflight_curl_status,
  wr.preflight_http_status,
  wr.preflight_curl_at,
  wr.preflight_http_at,
  wr.preflight_curl_error,
  wr.preflight_http_error,
  wr.preflight_curl_ms,
  wr.preflight_http_ms,
  -- Computed fields
  EXTRACT(EPOCH FROM (NOW() - wr.last_heartbeat_at)) as seconds_since_heartbeat,
  CASE
    WHEN wr.status = 'offline' THEN 'offline'
    WHEN wr.last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
    WHEN wr.current_task_id IS NOT NULL THEN 'busy'
    ELSE 'ready'
  END as health_status,
  -- Capability flags (can this worker handle curl/http tasks?)
  (wr.preflight_curl_status = 'passed') as can_curl,
  (wr.preflight_http_status = 'passed') as can_http
FROM worker_registry wr
WHERE wr.status != 'terminated'
ORDER BY wr.status = 'active' DESC, wr.last_heartbeat_at DESC;

-- ===================================================================
-- PART 5: View for task queue with method info
-- ===================================================================

DROP VIEW IF EXISTS v_task_history;

CREATE VIEW v_task_history AS
SELECT
  t.id,
  t.role,
  t.dispensary_id,
  d.name as dispensary_name,
  t.platform,
  t.status,
  t.priority,
  t.method,
  t.worker_id,
  t.scheduled_for,
  t.claimed_at,
  t.started_at,
  t.completed_at,
  t.error_message,
  t.retry_count,
  t.created_at,
  EXTRACT(EPOCH FROM (t.completed_at - t.started_at)) as duration_sec
FROM worker_tasks t
LEFT JOIN dispensaries d ON d.id = t.dispensary_id
ORDER BY t.created_at DESC;

-- ===================================================================
-- PART 6: Helper function to update worker preflight status
-- ===================================================================

CREATE OR REPLACE FUNCTION update_worker_preflight(
  p_worker_id VARCHAR(100),
  p_transport VARCHAR(10),    -- 'curl' or 'http'
  p_status VARCHAR(20),       -- 'passed', 'failed', 'skipped'
  p_response_ms INTEGER DEFAULT NULL,
  p_error TEXT DEFAULT NULL
) RETURNS VOID AS $$
BEGIN
  IF p_transport = 'curl' THEN
    UPDATE worker_registry
    SET
      preflight_curl_status = p_status,
      preflight_curl_at = NOW(),
      preflight_curl_ms = p_response_ms,
      preflight_curl_error = p_error,
      updated_at = NOW()
    WHERE worker_id = p_worker_id;
  ELSIF p_transport = 'http' THEN
    UPDATE worker_registry
    SET
      preflight_http_status = p_status,
      preflight_http_at = NOW(),
      preflight_http_ms = p_response_ms,
      preflight_http_error = p_error,
      updated_at = NOW()
    WHERE worker_id = p_worker_id;
  END IF;
END;
$$ LANGUAGE plpgsql;

-- ===================================================================
-- Comments
-- ===================================================================

COMMENT ON COLUMN worker_registry.preflight_curl_status IS 'Status of curl/axios preflight: pending, passed, failed, skipped';
COMMENT ON COLUMN worker_registry.preflight_http_status IS 'Status of http/Puppeteer preflight: pending, passed, failed, skipped';
COMMENT ON COLUMN worker_registry.preflight_curl_at IS 'When curl preflight completed';
COMMENT ON COLUMN worker_registry.preflight_http_at IS 'When http preflight completed';
COMMENT ON COLUMN worker_registry.preflight_curl_error IS 'Error message if curl preflight failed';
COMMENT ON COLUMN worker_registry.preflight_http_error IS 'Error message if http preflight failed';
COMMENT ON COLUMN worker_registry.preflight_curl_ms IS 'Response time of successful curl preflight (ms)';
COMMENT ON COLUMN worker_registry.preflight_http_ms IS 'Response time of successful http preflight (ms)';

COMMENT ON COLUMN worker_tasks.method IS 'Transport method required: NULL=any, curl=proxy-based, http=browser-based';

COMMENT ON FUNCTION claim_task IS 'Atomically claim a task, respecting method requirements and per-store locking';
COMMENT ON FUNCTION update_worker_preflight IS 'Update a worker''s preflight status for a given transport';

backend/migrations/085_preflight_ip_fingerprint.sql (new file, 168 lines)
@@ -0,0 +1,168 @@
-- Migration 085: Add IP and fingerprint columns for preflight reporting
-- These columns were missing from migration 084

-- ===================================================================
-- PART 1: Add IP address columns to worker_registry
-- ===================================================================

-- IP address detected during curl/axios preflight
ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS curl_ip VARCHAR(45);

-- IP address detected during http/Puppeteer preflight
ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS http_ip VARCHAR(45);

-- ===================================================================
-- PART 2: Add fingerprint data column
-- ===================================================================

-- Browser fingerprint data captured during Puppeteer preflight
ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS fingerprint_data JSONB;

-- ===================================================================
-- PART 3: Add combined preflight status/timestamp for convenience
-- ===================================================================

-- Overall preflight status (computed from both transports)
-- Values: 'pending', 'passed', 'partial', 'failed'
-- - 'pending': neither transport tested
-- - 'passed': both transports passed (or http passed for browser-only)
-- - 'partial': at least one passed
-- - 'failed': no transport passed
ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS preflight_status VARCHAR(20) DEFAULT 'pending';

-- Most recent preflight completion timestamp
ALTER TABLE worker_registry
  ADD COLUMN IF NOT EXISTS preflight_at TIMESTAMPTZ;

-- ===================================================================
-- PART 4: Update function to set preflight status
-- ===================================================================

CREATE OR REPLACE FUNCTION update_worker_preflight(
  p_worker_id VARCHAR(100),
  p_transport VARCHAR(10),    -- 'curl' or 'http'
  p_status VARCHAR(20),       -- 'passed', 'failed', 'skipped'
  p_ip VARCHAR(45) DEFAULT NULL,
  p_response_ms INTEGER DEFAULT NULL,
  p_error TEXT DEFAULT NULL,
  p_fingerprint JSONB DEFAULT NULL
) RETURNS VOID AS $$
DECLARE
  v_curl_status VARCHAR(20);
  v_http_status VARCHAR(20);
  v_overall_status VARCHAR(20);
BEGIN
  IF p_transport = 'curl' THEN
    UPDATE worker_registry
    SET
      preflight_curl_status = p_status,
      preflight_curl_at = NOW(),
      preflight_curl_ms = p_response_ms,
      preflight_curl_error = p_error,
      curl_ip = p_ip,
      updated_at = NOW()
    WHERE worker_id = p_worker_id;
  ELSIF p_transport = 'http' THEN
    UPDATE worker_registry
    SET
      preflight_http_status = p_status,
      preflight_http_at = NOW(),
      preflight_http_ms = p_response_ms,
      preflight_http_error = p_error,
      http_ip = p_ip,
      fingerprint_data = COALESCE(p_fingerprint, fingerprint_data),
      updated_at = NOW()
    WHERE worker_id = p_worker_id;
  END IF;

  -- Update overall preflight status
  SELECT preflight_curl_status, preflight_http_status
  INTO v_curl_status, v_http_status
  FROM worker_registry
  WHERE worker_id = p_worker_id;

  -- Compute overall status
  IF v_curl_status = 'passed' AND v_http_status = 'passed' THEN
    v_overall_status := 'passed';
  ELSIF v_curl_status = 'passed' OR v_http_status = 'passed' THEN
    v_overall_status := 'partial';
  ELSIF v_curl_status = 'failed' OR v_http_status = 'failed' THEN
    v_overall_status := 'failed';
  ELSE
    v_overall_status := 'pending';
  END IF;

  UPDATE worker_registry
  SET
    preflight_status = v_overall_status,
    preflight_at = NOW()
  WHERE worker_id = p_worker_id;
END;
$$ LANGUAGE plpgsql;

-- ===================================================================
-- PART 5: Update v_active_workers view
-- ===================================================================

DROP VIEW IF EXISTS v_active_workers;

CREATE VIEW v_active_workers AS
SELECT
  wr.id,
  wr.worker_id,
  wr.friendly_name,
  wr.role,
  wr.status,
  wr.pod_name,
  wr.hostname,
  wr.started_at,
  wr.last_heartbeat_at,
  wr.last_task_at,
  wr.tasks_completed,
  wr.tasks_failed,
  wr.current_task_id,
  -- IP addresses from preflights
  wr.curl_ip,
  wr.http_ip,
  -- Combined preflight status
  wr.preflight_status,
  wr.preflight_at,
  -- Detailed preflight status per transport
  wr.preflight_curl_status,
  wr.preflight_http_status,
  wr.preflight_curl_at,
  wr.preflight_http_at,
  wr.preflight_curl_error,
  wr.preflight_http_error,
  wr.preflight_curl_ms,
  wr.preflight_http_ms,
  -- Fingerprint data
  wr.fingerprint_data,
  -- Computed fields
  EXTRACT(EPOCH FROM (NOW() - wr.last_heartbeat_at)) as seconds_since_heartbeat,
  CASE
    WHEN wr.status = 'offline' THEN 'offline'
    WHEN wr.last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
    WHEN wr.current_task_id IS NOT NULL THEN 'busy'
    ELSE 'ready'
  END as health_status,
  -- Capability flags (can this worker handle curl/http tasks?)
  (wr.preflight_curl_status = 'passed') as can_curl,
  (wr.preflight_http_status = 'passed') as can_http
FROM worker_registry wr
WHERE wr.status != 'terminated'
ORDER BY wr.status = 'active' DESC, wr.last_heartbeat_at DESC;

-- ===================================================================
-- Comments
-- ===================================================================

COMMENT ON COLUMN worker_registry.curl_ip IS 'IP address detected during curl/axios preflight';
COMMENT ON COLUMN worker_registry.http_ip IS 'IP address detected during Puppeteer preflight';
COMMENT ON COLUMN worker_registry.fingerprint_data IS 'Browser fingerprint captured during Puppeteer preflight';
COMMENT ON COLUMN worker_registry.preflight_status IS 'Overall preflight status: pending, passed, partial, failed';
COMMENT ON COLUMN worker_registry.preflight_at IS 'Most recent preflight completion timestamp';

backend/migrations/086_proxy_url_column.sql (new file, 10 lines)
@@ -0,0 +1,10 @@
-- Migration 086: Add proxy_url column for alternative URL formats
-- Some proxy providers use non-standard URL formats (e.g., host:port:user:pass)
-- This column allows storing the raw URL directly

-- Add proxy_url column - if set, used directly instead of constructing from parts
ALTER TABLE proxies
  ADD COLUMN IF NOT EXISTS proxy_url TEXT;

-- Add comment
COMMENT ON COLUMN proxies.proxy_url IS 'Raw proxy URL (if provider uses non-standard format). Takes precedence over constructed URL from host/port/user/pass.';

backend/migrations/088_discovery_payloads.sql (new file, 30 lines)
@@ -0,0 +1,30 @@
-- Migration 088: Extend raw_crawl_payloads for discovery payloads
--
-- Enables saving raw store data from Dutchie discovery crawls.
-- Store discovery returns raw dispensary objects - save them for historical analysis.

-- Add payload_type to distinguish product crawls from discovery crawls
ALTER TABLE raw_crawl_payloads
ADD COLUMN IF NOT EXISTS payload_type VARCHAR(32) NOT NULL DEFAULT 'product';

-- Add state_code for discovery payloads (null for product payloads)
ALTER TABLE raw_crawl_payloads
ADD COLUMN IF NOT EXISTS state_code VARCHAR(10);

-- Add store_count for discovery payloads (alternative to product_count)
ALTER TABLE raw_crawl_payloads
ADD COLUMN IF NOT EXISTS store_count INTEGER;

-- Make dispensary_id nullable for discovery payloads
ALTER TABLE raw_crawl_payloads
ALTER COLUMN dispensary_id DROP NOT NULL;

-- Add index for discovery payload queries
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_type_state
ON raw_crawl_payloads(payload_type, state_code)
WHERE payload_type = 'store_discovery';

-- Comments
COMMENT ON COLUMN raw_crawl_payloads.payload_type IS 'Type: product (default), store_discovery';
COMMENT ON COLUMN raw_crawl_payloads.state_code IS 'State code for discovery payloads (e.g., AZ, MI)';
COMMENT ON COLUMN raw_crawl_payloads.store_count IS 'Number of stores in discovery payload';
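With payload_type and state_code in place, recent discovery payloads for one state can be pulled with a query shaped like this sketch (created_at is an assumed column name; the base table definition is not part of this migration):

```sql
-- Most recent Arizona discovery payloads; hits the partial index above.
-- created_at is an assumed column; the base table is defined elsewhere.
SELECT id, store_count, created_at
FROM raw_crawl_payloads
WHERE payload_type = 'store_discovery'
  AND state_code = 'AZ'
ORDER BY created_at DESC
LIMIT 10;
```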
105
backend/migrations/089_immutable_schedules.sql
Normal file
@@ -0,0 +1,105 @@
-- Migration 089: Immutable Schedules with Per-State Product Discovery
--
-- Key changes:
-- 1. Add is_immutable column - schedules can be edited but not deleted
-- 2. Add method column - all tasks use 'http' (Puppeteer transport)
-- 3. Store discovery weekly (168h)
-- 4. Per-state product_discovery schedules (4h default)
-- 5. Remove old payload_fetch schedules

-- =====================================================
-- 1) Add new columns to task_schedules
-- =====================================================
ALTER TABLE task_schedules
ADD COLUMN IF NOT EXISTS is_immutable BOOLEAN DEFAULT FALSE;

ALTER TABLE task_schedules
ADD COLUMN IF NOT EXISTS method VARCHAR(10) DEFAULT 'http';

-- =====================================================
-- 2) Update store_discovery to weekly and immutable
-- =====================================================
UPDATE task_schedules
SET interval_hours = 168, -- 7 days
    is_immutable = TRUE,
    method = 'http',
    description = 'Discover new Dutchie stores weekly (HTTP transport)'
WHERE name = 'store_discovery_dutchie';

-- Insert if doesn't exist
INSERT INTO task_schedules (name, role, interval_hours, priority, description, is_immutable, method, platform, next_run_at)
VALUES ('store_discovery_dutchie', 'store_discovery', 168, 5, 'Discover new Dutchie stores weekly (HTTP transport)', TRUE, 'http', 'dutchie', NOW())
ON CONFLICT (name) DO UPDATE SET
    interval_hours = 168,
    is_immutable = TRUE,
    method = 'http',
    description = 'Discover new Dutchie stores weekly (HTTP transport)';

-- =====================================================
-- 3) Remove old payload_fetch and product_refresh_all schedules
-- =====================================================
DELETE FROM task_schedules WHERE name IN ('payload_fetch_all', 'product_refresh_all');

-- =====================================================
-- 4) Create per-state product_discovery schedules
-- =====================================================
-- One schedule per state that has dispensaries with active cannabis programs
INSERT INTO task_schedules (name, role, state_code, interval_hours, priority, description, is_immutable, method, enabled, next_run_at)
SELECT
    'product_discovery_' || lower(s.code) AS name,
    'product_discovery' AS role,
    s.code AS state_code,
    4 AS interval_hours, -- 4 hours default, editable
    10 AS priority,
    'Product discovery for ' || s.name || ' dispensaries (HTTP transport)' AS description,
    TRUE AS is_immutable, -- Can edit but not delete
    'http' AS method,
    CASE WHEN s.is_active THEN TRUE ELSE FALSE END AS enabled,
    -- Stagger start times: each state starts 5 minutes after the previous
    NOW() + (ROW_NUMBER() OVER (ORDER BY s.code) * INTERVAL '5 minutes') AS next_run_at
FROM states s
WHERE EXISTS (
    SELECT 1 FROM dispensaries d
    WHERE d.state_id = s.id AND d.crawl_enabled = true
)
ON CONFLICT (name) DO UPDATE SET
    is_immutable = TRUE,
    method = 'http',
    description = EXCLUDED.description;

-- Also create schedules for states that might have stores discovered later
INSERT INTO task_schedules (name, role, state_code, interval_hours, priority, description, is_immutable, method, enabled, next_run_at)
SELECT
    'product_discovery_' || lower(s.code) AS name,
    'product_discovery' AS role,
    s.code AS state_code,
    4 AS interval_hours,
    10 AS priority,
    'Product discovery for ' || s.name || ' dispensaries (HTTP transport)' AS description,
    TRUE AS is_immutable,
    'http' AS method,
    FALSE AS enabled, -- Disabled until stores exist
    NOW() + INTERVAL '1 hour'
FROM states s
WHERE NOT EXISTS (
    SELECT 1 FROM task_schedules ts WHERE ts.name = 'product_discovery_' || lower(s.code)
)
ON CONFLICT (name) DO NOTHING;

-- =====================================================
-- 5) Make analytics_refresh immutable
-- =====================================================
UPDATE task_schedules
SET is_immutable = TRUE, method = 'http'
WHERE name = 'analytics_refresh';

-- =====================================================
-- 6) Add index for schedule lookups
-- =====================================================
CREATE INDEX IF NOT EXISTS idx_task_schedules_state_code
ON task_schedules(state_code)
WHERE state_code IS NOT NULL;

-- Comments
COMMENT ON COLUMN task_schedules.is_immutable IS 'If TRUE, schedule cannot be deleted (only edited)';
COMMENT ON COLUMN task_schedules.method IS 'Transport method: http (Puppeteer/browser) or curl (axios)';
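A quick check that the staggered rollout behaves as intended; every column referenced below is created or populated by this migration:

```sql
-- Per-state schedules should show next_run_at spaced 5 minutes apart.
SELECT name, state_code, interval_hours, enabled, next_run_at
FROM task_schedules
WHERE role = 'product_discovery'
ORDER BY next_run_at;
```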
286
backend/node_modules/.package-lock.json
generated
vendored
@@ -1,6 +1,6 @@
{
  "name": "dutchie-menus-backend",
  "version": "1.5.1",
  "version": "1.6.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
@@ -46,6 +46,97 @@
      "resolved": "https://registry.npmjs.org/@ioredis/commands/-/commands-1.4.0.tgz",
      "integrity": "sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ=="
    },
    "node_modules/@jsep-plugin/assignment": {
      "version": "1.3.0",
      "resolved": "https://registry.npmjs.org/@jsep-plugin/assignment/-/assignment-1.3.0.tgz",
      "integrity": "sha512-VVgV+CXrhbMI3aSusQyclHkenWSAm95WaiKrMxRFam3JSUiIaQjoMIw2sEs/OX4XifnqeQUN4DYbJjlA8EfktQ==",
      "engines": {
        "node": ">= 10.16.0"
      },
      "peerDependencies": {
        "jsep": "^0.4.0||^1.0.0"
      }
    },
    "node_modules/@jsep-plugin/regex": {
      "version": "1.0.4",
      "resolved": "https://registry.npmjs.org/@jsep-plugin/regex/-/regex-1.0.4.tgz",
      "integrity": "sha512-q7qL4Mgjs1vByCaTnDFcBnV9HS7GVPJX5vyVoCgZHNSC9rjwIlmbXG5sUuorR5ndfHAIlJ8pVStxvjXHbNvtUg==",
      "engines": {
        "node": ">= 10.16.0"
      },
      "peerDependencies": {
        "jsep": "^0.4.0||^1.0.0"
      }
    },
    "node_modules/@kubernetes/client-node": {
      "version": "1.4.0",
      "resolved": "https://registry.npmjs.org/@kubernetes/client-node/-/client-node-1.4.0.tgz",
      "integrity": "sha512-Zge3YvF7DJi264dU1b3wb/GmzR99JhUpqTvp+VGHfwZT+g7EOOYNScDJNZwXy9cszyIGPIs0VHr+kk8e95qqrA==",
      "dependencies": {
        "@types/js-yaml": "^4.0.1",
        "@types/node": "^24.0.0",
        "@types/node-fetch": "^2.6.13",
        "@types/stream-buffers": "^3.0.3",
        "form-data": "^4.0.0",
        "hpagent": "^1.2.0",
        "isomorphic-ws": "^5.0.0",
        "js-yaml": "^4.1.0",
        "jsonpath-plus": "^10.3.0",
        "node-fetch": "^2.7.0",
        "openid-client": "^6.1.3",
        "rfc4648": "^1.3.0",
        "socks-proxy-agent": "^8.0.4",
        "stream-buffers": "^3.0.2",
        "tar-fs": "^3.0.9",
        "ws": "^8.18.2"
      }
    },
    "node_modules/@kubernetes/client-node/node_modules/@types/node": {
      "version": "24.10.3",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.3.tgz",
      "integrity": "sha512-gqkrWUsS8hcm0r44yn7/xZeV1ERva/nLgrLxFRUGb7aoNMIJfZJ3AC261zDQuOAKC7MiXai1WCpYc48jAHoShQ==",
      "dependencies": {
        "undici-types": "~7.16.0"
      }
    },
    "node_modules/@kubernetes/client-node/node_modules/tar-fs": {
      "version": "3.1.1",
      "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz",
      "integrity": "sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==",
      "dependencies": {
        "pump": "^3.0.0",
        "tar-stream": "^3.1.5"
      },
      "optionalDependencies": {
        "bare-fs": "^4.0.1",
        "bare-path": "^3.0.0"
      }
    },
    "node_modules/@kubernetes/client-node/node_modules/undici-types": {
      "version": "7.16.0",
      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
      "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="
    },
    "node_modules/@kubernetes/client-node/node_modules/ws": {
      "version": "8.18.3",
      "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
      "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
      "engines": {
        "node": ">=10.0.0"
      },
      "peerDependencies": {
        "bufferutil": "^4.0.1",
        "utf-8-validate": ">=5.0.2"
      },
      "peerDependenciesMeta": {
        "bufferutil": {
          "optional": true
        },
        "utf-8-validate": {
          "optional": true
        }
      }
    },
    "node_modules/@mapbox/node-pre-gyp": {
      "version": "1.0.11",
      "resolved": "https://registry.npmjs.org/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz",
@@ -251,6 +342,11 @@
      "integrity": "sha512-r8Tayk8HJnX0FztbZN7oVqGccWgw98T/0neJphO91KkmOzug1KkofZURD4UaD5uH8AqcFLfdPErnBod0u71/qg==",
      "dev": true
    },
    "node_modules/@types/js-yaml": {
      "version": "4.0.9",
      "resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz",
      "integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg=="
    },
    "node_modules/@types/jsonwebtoken": {
      "version": "9.0.10",
      "resolved": "https://registry.npmjs.org/@types/jsonwebtoken/-/jsonwebtoken-9.0.10.tgz",
@@ -276,7 +372,6 @@
      "version": "20.19.25",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.25.tgz",
      "integrity": "sha512-ZsJzA5thDQMSQO788d7IocwwQbI8B5OPzmqNvpf3NY/+MHDAS759Wo0gd2WQeXYt5AAAQjzcrTVC6SKCuYgoCQ==",
      "devOptional": true,
      "dependencies": {
        "undici-types": "~6.21.0"
      }
@@ -287,6 +382,15 @@
      "integrity": "sha512-0ikrnug3/IyneSHqCBeslAhlK2aBfYek1fGo4bP4QnZPmiqSGRK+Oy7ZMisLWkesffJvQ1cqAcBnJC+8+nxIAg==",
      "dev": true
    },
    "node_modules/@types/node-fetch": {
      "version": "2.6.13",
      "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.13.tgz",
      "integrity": "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==",
      "dependencies": {
        "@types/node": "*",
        "form-data": "^4.0.4"
      }
    },
    "node_modules/@types/pg": {
      "version": "8.15.6",
      "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.15.6.tgz",
@@ -340,6 +444,14 @@
        "@types/node": "*"
      }
    },
    "node_modules/@types/stream-buffers": {
      "version": "3.0.8",
      "resolved": "https://registry.npmjs.org/@types/stream-buffers/-/stream-buffers-3.0.8.tgz",
      "integrity": "sha512-J+7VaHKNvlNPJPEJXX/fKa9DZtR/xPMwuIbe+yNOwp1YB+ApUOBv2aUpEoBJEi8nJgbgs1x8e73ttg0r1rSUdw==",
      "dependencies": {
        "@types/node": "*"
      }
    },
    "node_modules/@types/uuid": {
      "version": "9.0.8",
      "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz",
@@ -520,6 +632,78 @@
      }
    }
  },
    "node_modules/bare-fs": {
      "version": "4.5.2",
      "resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz",
      "integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==",
      "optional": true,
      "dependencies": {
        "bare-events": "^2.5.4",
        "bare-path": "^3.0.0",
        "bare-stream": "^2.6.4",
        "bare-url": "^2.2.2",
        "fast-fifo": "^1.3.2"
      },
      "engines": {
        "bare": ">=1.16.0"
      },
      "peerDependencies": {
        "bare-buffer": "*"
      },
      "peerDependenciesMeta": {
        "bare-buffer": {
          "optional": true
        }
      }
    },
    "node_modules/bare-os": {
      "version": "3.6.2",
      "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz",
      "integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==",
      "optional": true,
      "engines": {
        "bare": ">=1.14.0"
      }
    },
    "node_modules/bare-path": {
      "version": "3.0.0",
      "resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
      "integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
      "optional": true,
      "dependencies": {
        "bare-os": "^3.0.1"
      }
    },
    "node_modules/bare-stream": {
      "version": "2.7.0",
      "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz",
      "integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==",
      "optional": true,
      "dependencies": {
        "streamx": "^2.21.0"
      },
      "peerDependencies": {
        "bare-buffer": "*",
        "bare-events": "*"
      },
      "peerDependenciesMeta": {
        "bare-buffer": {
          "optional": true
        },
        "bare-events": {
          "optional": true
        }
      }
    },
    "node_modules/bare-url": {
      "version": "2.3.2",
      "resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz",
      "integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==",
      "optional": true,
      "dependencies": {
        "bare-path": "^3.0.0"
      }
    },
    "node_modules/base64-js": {
      "version": "1.5.1",
      "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
@@ -2019,6 +2203,14 @@
        "node": ">=16.0.0"
      }
    },
    "node_modules/hpagent": {
      "version": "1.2.0",
      "resolved": "https://registry.npmjs.org/hpagent/-/hpagent-1.2.0.tgz",
      "integrity": "sha512-A91dYTeIB6NoXG+PxTQpCCDDnfHsW9kc06Lvpu1TEe9gnd6ZFeiBoRO9JvzEv6xK7EX97/dUE8g/vBMTqTS3CA==",
      "engines": {
        "node": ">=14"
      }
    },
    "node_modules/htmlparser2": {
      "version": "10.0.0",
      "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz",
@@ -2382,6 +2574,22 @@
        "node": ">=0.10.0"
      }
    },
    "node_modules/isomorphic-ws": {
      "version": "5.0.0",
      "resolved": "https://registry.npmjs.org/isomorphic-ws/-/isomorphic-ws-5.0.0.tgz",
      "integrity": "sha512-muId7Zzn9ywDsyXgTIafTry2sV3nySZeUDe6YedVd1Hvuuep5AsIlqK+XefWpYTyJG5e503F2xIuT2lcU6rCSw==",
      "peerDependencies": {
        "ws": "*"
      }
    },
    "node_modules/jose": {
      "version": "6.1.3",
      "resolved": "https://registry.npmjs.org/jose/-/jose-6.1.3.tgz",
      "integrity": "sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==",
      "funding": {
        "url": "https://github.com/sponsors/panva"
      }
    },
    "node_modules/js-tokens": {
      "version": "4.0.0",
      "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
@@ -2398,6 +2606,14 @@
        "js-yaml": "bin/js-yaml.js"
      }
    },
    "node_modules/jsep": {
      "version": "1.4.0",
      "resolved": "https://registry.npmjs.org/jsep/-/jsep-1.4.0.tgz",
      "integrity": "sha512-B7qPcEVE3NVkmSJbaYxvv4cHkVW7DQsZz13pUMrfS8z8Q/BuShN+gcTXrUlPiGqM2/t/EEaI030bpxMqY8gMlw==",
      "engines": {
        "node": ">= 10.16.0"
      }
    },
    "node_modules/json-parse-even-better-errors": {
      "version": "2.3.1",
      "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz",
@@ -2419,6 +2635,23 @@
        "graceful-fs": "^4.1.6"
      }
    },
    "node_modules/jsonpath-plus": {
      "version": "10.3.0",
      "resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz",
      "integrity": "sha512-8TNmfeTCk2Le33A3vRRwtuworG/L5RrgMvdjhKZxvyShO+mBu2fP50OWUjRLNtvw344DdDarFh9buFAZs5ujeA==",
      "dependencies": {
        "@jsep-plugin/assignment": "^1.3.0",
        "@jsep-plugin/regex": "^1.0.4",
        "jsep": "^1.4.0"
      },
      "bin": {
        "jsonpath": "bin/jsonpath-cli.js",
        "jsonpath-plus": "bin/jsonpath-cli.js"
      },
      "engines": {
        "node": ">=18.0.0"
      }
    },
    "node_modules/jsonwebtoken": {
      "version": "9.0.2",
      "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.2.tgz",
@@ -2493,6 +2726,11 @@
      "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
      "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg=="
    },
    "node_modules/lodash.clonedeep": {
      "version": "4.5.0",
      "resolved": "https://registry.npmjs.org/lodash.clonedeep/-/lodash.clonedeep-4.5.0.tgz",
      "integrity": "sha512-H5ZhCF25riFd9uB5UCkVKo61m3S/xZk1x4wA6yp/L3RFP6Z/eHH1ymQcGLo7J3GMPfm0V/7m1tryHuGVxpqEBQ=="
    },
    "node_modules/lodash.defaults": {
      "version": "4.2.0",
      "resolved": "https://registry.npmjs.org/lodash.defaults/-/lodash.defaults-4.2.0.tgz",
@@ -2942,6 +3180,14 @@
        "url": "https://github.com/fb55/nth-check?sponsor=1"
      }
    },
    "node_modules/oauth4webapi": {
      "version": "3.8.3",
      "resolved": "https://registry.npmjs.org/oauth4webapi/-/oauth4webapi-3.8.3.tgz",
      "integrity": "sha512-pQ5BsX3QRTgnt5HxgHwgunIRaDXBdkT23tf8dfzmtTIL2LTpdmxgbpbBm0VgFWAIDlezQvQCTgnVIUmHupXHxw==",
      "funding": {
        "url": "https://github.com/sponsors/panva"
      }
    },
    "node_modules/object-assign": {
      "version": "4.1.1",
      "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
@@ -2980,6 +3226,18 @@
        "wrappy": "1"
      }
    },
    "node_modules/openid-client": {
      "version": "6.8.1",
      "resolved": "https://registry.npmjs.org/openid-client/-/openid-client-6.8.1.tgz",
      "integrity": "sha512-VoYT6enBo6Vj2j3Q5Ec0AezS+9YGzQo1f5Xc42lreMGlfP4ljiXPKVDvCADh+XHCV/bqPu/wWSiCVXbJKvrODw==",
      "dependencies": {
        "jose": "^6.1.0",
        "oauth4webapi": "^3.8.2"
      },
      "funding": {
        "url": "https://github.com/sponsors/panva"
      }
    },
    "node_modules/pac-proxy-agent": {
      "version": "7.2.0",
      "resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
@@ -3883,6 +4141,11 @@
        "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
      }
    },
    "node_modules/rfc4648": {
      "version": "1.5.4",
      "resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.4.tgz",
      "integrity": "sha512-rRg/6Lb+IGfJqO05HZkN50UtY7K/JhxJag1kP23+zyMfrvoB0B7RWv06MbOzoc79RgCdNTiUaNsTT1AJZ7Z+cg=="
    },
    "node_modules/rimraf": {
      "version": "3.0.2",
      "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz",
@@ -4313,6 +4576,14 @@
        "node": ">= 0.8"
      }
    },
    "node_modules/stream-buffers": {
      "version": "3.0.3",
      "resolved": "https://registry.npmjs.org/stream-buffers/-/stream-buffers-3.0.3.tgz",
      "integrity": "sha512-pqMqwQCso0PBJt2PQmDO0cFj0lyqmiwOMiMSkVtRokl7e+ZTRYgDHKnuZNbqjiJXgsg4nuqtD/zxuo9KqTp0Yw==",
      "engines": {
        "node": ">= 0.10.0"
      }
    },
    "node_modules/streamx": {
      "version": "2.23.0",
      "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
@@ -4532,8 +4803,7 @@
    "node_modules/undici-types": {
      "version": "6.21.0",
      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
      "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
      "devOptional": true
      "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="
    },
    "node_modules/universalify": {
      "version": "2.0.1",
@@ -4556,6 +4826,14 @@
      "resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz",
      "integrity": "sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg=="
    },
    "node_modules/user-agents": {
      "version": "1.1.669",
      "resolved": "https://registry.npmjs.org/user-agents/-/user-agents-1.1.669.tgz",
      "integrity": "sha512-pbIzG+AOqCaIpySKJ4IAm1l0VyE4jMnK4y1thV8lm8PYxI+7X5uWcppOK7zY79TCKKTAnJH3/4gaVIZHsjrmJA==",
      "dependencies": {
        "lodash.clonedeep": "^4.5.0"
      }
    },
    "node_modules/util": {
      "version": "0.12.5",
      "resolved": "https://registry.npmjs.org/util/-/util-0.12.5.tgz",
290
backend/package-lock.json
generated
@@ -1,13 +1,14 @@
{
  "name": "dutchie-menus-backend",
  "version": "1.5.1",
  "version": "1.6.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "dutchie-menus-backend",
      "version": "1.5.1",
      "version": "1.6.0",
      "dependencies": {
        "@kubernetes/client-node": "^1.4.0",
        "@types/bcryptjs": "^3.0.0",
        "axios": "^1.6.2",
        "bcrypt": "^5.1.1",
@@ -34,6 +35,7 @@
        "puppeteer-extra-plugin-stealth": "^2.11.2",
        "sharp": "^0.32.0",
        "socks-proxy-agent": "^8.0.2",
        "user-agents": "^1.1.669",
        "uuid": "^9.0.1",
        "zod": "^3.22.4"
      },
@@ -492,6 +494,97 @@
      "resolved": "https://registry.npmjs.org/@ioredis/commands/-/commands-1.4.0.tgz",
      "integrity": "sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ=="
    },
    "node_modules/@jsep-plugin/assignment": {
      "version": "1.3.0",
      "resolved": "https://registry.npmjs.org/@jsep-plugin/assignment/-/assignment-1.3.0.tgz",
      "integrity": "sha512-VVgV+CXrhbMI3aSusQyclHkenWSAm95WaiKrMxRFam3JSUiIaQjoMIw2sEs/OX4XifnqeQUN4DYbJjlA8EfktQ==",
      "engines": {
        "node": ">= 10.16.0"
      },
      "peerDependencies": {
        "jsep": "^0.4.0||^1.0.0"
      }
    },
    "node_modules/@jsep-plugin/regex": {
      "version": "1.0.4",
      "resolved": "https://registry.npmjs.org/@jsep-plugin/regex/-/regex-1.0.4.tgz",
      "integrity": "sha512-q7qL4Mgjs1vByCaTnDFcBnV9HS7GVPJX5vyVoCgZHNSC9rjwIlmbXG5sUuorR5ndfHAIlJ8pVStxvjXHbNvtUg==",
      "engines": {
        "node": ">= 10.16.0"
      },
      "peerDependencies": {
        "jsep": "^0.4.0||^1.0.0"
      }
    },
    "node_modules/@kubernetes/client-node": {
      "version": "1.4.0",
      "resolved": "https://registry.npmjs.org/@kubernetes/client-node/-/client-node-1.4.0.tgz",
      "integrity": "sha512-Zge3YvF7DJi264dU1b3wb/GmzR99JhUpqTvp+VGHfwZT+g7EOOYNScDJNZwXy9cszyIGPIs0VHr+kk8e95qqrA==",
      "dependencies": {
        "@types/js-yaml": "^4.0.1",
        "@types/node": "^24.0.0",
        "@types/node-fetch": "^2.6.13",
        "@types/stream-buffers": "^3.0.3",
        "form-data": "^4.0.0",
        "hpagent": "^1.2.0",
        "isomorphic-ws": "^5.0.0",
        "js-yaml": "^4.1.0",
        "jsonpath-plus": "^10.3.0",
        "node-fetch": "^2.7.0",
        "openid-client": "^6.1.3",
        "rfc4648": "^1.3.0",
        "socks-proxy-agent": "^8.0.4",
        "stream-buffers": "^3.0.2",
        "tar-fs": "^3.0.9",
        "ws": "^8.18.2"
      }
    },
    "node_modules/@kubernetes/client-node/node_modules/@types/node": {
      "version": "24.10.3",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.3.tgz",
      "integrity": "sha512-gqkrWUsS8hcm0r44yn7/xZeV1ERva/nLgrLxFRUGb7aoNMIJfZJ3AC261zDQuOAKC7MiXai1WCpYc48jAHoShQ==",
      "dependencies": {
        "undici-types": "~7.16.0"
      }
    },
    "node_modules/@kubernetes/client-node/node_modules/tar-fs": {
      "version": "3.1.1",
      "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz",
      "integrity": "sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==",
      "dependencies": {
        "pump": "^3.0.0",
        "tar-stream": "^3.1.5"
      },
      "optionalDependencies": {
        "bare-fs": "^4.0.1",
        "bare-path": "^3.0.0"
      }
    },
    "node_modules/@kubernetes/client-node/node_modules/undici-types": {
      "version": "7.16.0",
      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
      "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="
    },
    "node_modules/@kubernetes/client-node/node_modules/ws": {
      "version": "8.18.3",
      "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
      "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
      "engines": {
        "node": ">=10.0.0"
      },
      "peerDependencies": {
        "bufferutil": "^4.0.1",
        "utf-8-validate": ">=5.0.2"
      },
      "peerDependenciesMeta": {
        "bufferutil": {
          "optional": true
        },
        "utf-8-validate": {
          "optional": true
        }
      }
    },
    "node_modules/@mapbox/node-pre-gyp": {
      "version": "1.0.11",
      "resolved": "https://registry.npmjs.org/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz",
@@ -757,6 +850,11 @@
      "integrity": "sha512-r8Tayk8HJnX0FztbZN7oVqGccWgw98T/0neJphO91KkmOzug1KkofZURD4UaD5uH8AqcFLfdPErnBod0u71/qg==",
      "dev": true
    },
    "node_modules/@types/js-yaml": {
      "version": "4.0.9",
      "resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz",
      "integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg=="
    },
    "node_modules/@types/jsonwebtoken": {
      "version": "9.0.10",
      "resolved": "https://registry.npmjs.org/@types/jsonwebtoken/-/jsonwebtoken-9.0.10.tgz",
@@ -782,7 +880,6 @@
      "version": "20.19.25",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.25.tgz",
      "integrity": "sha512-ZsJzA5thDQMSQO788d7IocwwQbI8B5OPzmqNvpf3NY/+MHDAS759Wo0gd2WQeXYt5AAAQjzcrTVC6SKCuYgoCQ==",
      "devOptional": true,
      "dependencies": {
        "undici-types": "~6.21.0"
      }
@@ -793,6 +890,15 @@
      "integrity": "sha512-0ikrnug3/IyneSHqCBeslAhlK2aBfYek1fGo4bP4QnZPmiqSGRK+Oy7ZMisLWkesffJvQ1cqAcBnJC+8+nxIAg==",
      "dev": true
    },
    "node_modules/@types/node-fetch": {
      "version": "2.6.13",
      "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.13.tgz",
      "integrity": "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==",
      "dependencies": {
        "@types/node": "*",
        "form-data": "^4.0.4"
      }
    },
    "node_modules/@types/pg": {
      "version": "8.15.6",
      "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.15.6.tgz",
@@ -846,6 +952,14 @@
        "@types/node": "*"
      }
    },
    "node_modules/@types/stream-buffers": {
      "version": "3.0.8",
      "resolved": "https://registry.npmjs.org/@types/stream-buffers/-/stream-buffers-3.0.8.tgz",
      "integrity": "sha512-J+7VaHKNvlNPJPEJXX/fKa9DZtR/xPMwuIbe+yNOwp1YB+ApUOBv2aUpEoBJEi8nJgbgs1x8e73ttg0r1rSUdw==",
      "dependencies": {
        "@types/node": "*"
      }
    },
    "node_modules/@types/uuid": {
      "version": "9.0.8",
      "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz",
@@ -1026,6 +1140,78 @@
      }
    }
  },
    "node_modules/bare-fs": {
      "version": "4.5.2",
      "resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz",
      "integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==",
      "optional": true,
      "dependencies": {
        "bare-events": "^2.5.4",
        "bare-path": "^3.0.0",
        "bare-stream": "^2.6.4",
        "bare-url": "^2.2.2",
        "fast-fifo": "^1.3.2"
      },
      "engines": {
        "bare": ">=1.16.0"
      },
      "peerDependencies": {
        "bare-buffer": "*"
      },
      "peerDependenciesMeta": {
        "bare-buffer": {
          "optional": true
        }
      }
    },
    "node_modules/bare-os": {
      "version": "3.6.2",
      "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz",
      "integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==",
      "optional": true,
      "engines": {
        "bare": ">=1.14.0"
      }
    },
    "node_modules/bare-path": {
      "version": "3.0.0",
      "resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
      "integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
      "optional": true,
      "dependencies": {
        "bare-os": "^3.0.1"
      }
    },
    "node_modules/bare-stream": {
      "version": "2.7.0",
      "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz",
      "integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==",
      "optional": true,
      "dependencies": {
        "streamx": "^2.21.0"
      },
      "peerDependencies": {
        "bare-buffer": "*",
        "bare-events": "*"
      },
      "peerDependenciesMeta": {
        "bare-buffer": {
          "optional": true
        },
        "bare-events": {
          "optional": true
        }
      }
    },
    "node_modules/bare-url": {
      "version": "2.3.2",
      "resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz",
      "integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==",
      "optional": true,
      "dependencies": {
        "bare-path": "^3.0.0"
      }
    },
    "node_modules/base64-js": {
      "version": "1.5.1",
      "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
@@ -2539,6 +2725,14 @@
        "node": ">=16.0.0"
      }
    },
    "node_modules/hpagent": {
      "version": "1.2.0",
      "resolved": "https://registry.npmjs.org/hpagent/-/hpagent-1.2.0.tgz",
      "integrity": "sha512-A91dYTeIB6NoXG+PxTQpCCDDnfHsW9kc06Lvpu1TEe9gnd6ZFeiBoRO9JvzEv6xK7EX97/dUE8g/vBMTqTS3CA==",
      "engines": {
        "node": ">=14"
      }
    },
    "node_modules/htmlparser2": {
      "version": "10.0.0",
      "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz",
@@ -2902,6 +3096,22 @@
        "node": ">=0.10.0"
      }
    },
    "node_modules/isomorphic-ws": {
      "version": "5.0.0",
      "resolved": "https://registry.npmjs.org/isomorphic-ws/-/isomorphic-ws-5.0.0.tgz",
      "integrity": "sha512-muId7Zzn9ywDsyXgTIafTry2sV3nySZeUDe6YedVd1Hvuuep5AsIlqK+XefWpYTyJG5e503F2xIuT2lcU6rCSw==",
      "peerDependencies": {
        "ws": "*"
      }
    },
    "node_modules/jose": {
      "version": "6.1.3",
      "resolved": "https://registry.npmjs.org/jose/-/jose-6.1.3.tgz",
      "integrity": "sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==",
      "funding": {
        "url": "https://github.com/sponsors/panva"
      }
    },
    "node_modules/js-tokens": {
      "version": "4.0.0",
      "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
@@ -2918,6 +3128,14 @@
        "js-yaml": "bin/js-yaml.js"
      }
    },
    "node_modules/jsep": {
      "version": "1.4.0",
      "resolved": "https://registry.npmjs.org/jsep/-/jsep-1.4.0.tgz",
      "integrity": "sha512-B7qPcEVE3NVkmSJbaYxvv4cHkVW7DQsZz13pUMrfS8z8Q/BuShN+gcTXrUlPiGqM2/t/EEaI030bpxMqY8gMlw==",
      "engines": {
        "node": ">= 10.16.0"
      }
    },
    "node_modules/json-parse-even-better-errors": {
      "version": "2.3.1",
      "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz",
@@ -2939,6 +3157,23 @@
        "graceful-fs": "^4.1.6"
      }
    },
    "node_modules/jsonpath-plus": {
      "version": "10.3.0",
      "resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz",
      "integrity": "sha512-8TNmfeTCk2Le33A3vRRwtuworG/L5RrgMvdjhKZxvyShO+mBu2fP50OWUjRLNtvw344DdDarFh9buFAZs5ujeA==",
      "dependencies": {
        "@jsep-plugin/assignment": "^1.3.0",
        "@jsep-plugin/regex": "^1.0.4",
        "jsep": "^1.4.0"
      },
      "bin": {
        "jsonpath": "bin/jsonpath-cli.js",
        "jsonpath-plus": "bin/jsonpath-cli.js"
      },
      "engines": {
        "node": ">=18.0.0"
      }
    },
    "node_modules/jsonwebtoken": {
      "version": "9.0.2",
      "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.2.tgz",
@@ -3013,6 +3248,11 @@
      "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
      "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg=="
    },
    "node_modules/lodash.clonedeep": {
      "version": "4.5.0",
      "resolved": "https://registry.npmjs.org/lodash.clonedeep/-/lodash.clonedeep-4.5.0.tgz",
      "integrity": "sha512-H5ZhCF25riFd9uB5UCkVKo61m3S/xZk1x4wA6yp/L3RFP6Z/eHH1ymQcGLo7J3GMPfm0V/7m1tryHuGVxpqEBQ=="
    },
    "node_modules/lodash.defaults": {
      "version": "4.2.0",
      "resolved": "https://registry.npmjs.org/lodash.defaults/-/lodash.defaults-4.2.0.tgz",
@@ -3462,6 +3702,14 @@
        "url": "https://github.com/fb55/nth-check?sponsor=1"
      }
    },
    "node_modules/oauth4webapi": {
      "version": "3.8.3",
      "resolved": "https://registry.npmjs.org/oauth4webapi/-/oauth4webapi-3.8.3.tgz",
      "integrity": "sha512-pQ5BsX3QRTgnt5HxgHwgunIRaDXBdkT23tf8dfzmtTIL2LTpdmxgbpbBm0VgFWAIDlezQvQCTgnVIUmHupXHxw==",
      "funding": {
        "url": "https://github.com/sponsors/panva"
      }
    },
    "node_modules/object-assign": {
      "version": "4.1.1",
      "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
@@ -3500,6 +3748,18 @@
        "wrappy": "1"
      }
    },
    "node_modules/openid-client": {
      "version": "6.8.1",
      "resolved": "https://registry.npmjs.org/openid-client/-/openid-client-6.8.1.tgz",
      "integrity": "sha512-VoYT6enBo6Vj2j3Q5Ec0AezS+9YGzQo1f5Xc42lreMGlfP4ljiXPKVDvCADh+XHCV/bqPu/wWSiCVXbJKvrODw==",
      "dependencies": {
        "jose": "^6.1.0",
        "oauth4webapi": "^3.8.2"
      },
      "funding": {
        "url": "https://github.com/sponsors/panva"
      }
    },
    "node_modules/pac-proxy-agent": {
      "version": "7.2.0",
      "resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
@@ -4416,6 +4676,11 @@
        "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
      }
    },
    "node_modules/rfc4648": {
      "version": "1.5.4",
      "resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.4.tgz",
      "integrity": "sha512-rRg/6Lb+IGfJqO05HZkN50UtY7K/JhxJag1kP23+zyMfrvoB0B7RWv06MbOzoc79RgCdNTiUaNsTT1AJZ7Z+cg=="
    },
    "node_modules/rimraf": {
      "version": "3.0.2",
      "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz",
@@ -4846,6 +5111,14 @@
        "node": ">= 0.8"
      }
    },
    "node_modules/stream-buffers": {
      "version": "3.0.3",
      "resolved": "https://registry.npmjs.org/stream-buffers/-/stream-buffers-3.0.3.tgz",
      "integrity": "sha512-pqMqwQCso0PBJt2PQmDO0cFj0lyqmiwOMiMSkVtRokl7e+ZTRYgDHKnuZNbqjiJXgsg4nuqtD/zxuo9KqTp0Yw==",
      "engines": {
        "node": ">= 0.10.0"
      }
    },
    "node_modules/streamx": {
      "version": "2.23.0",
      "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
@@ -5065,8 +5338,7 @@
    "node_modules/undici-types": {
      "version": "6.21.0",
      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
      "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
      "devOptional": true
      "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="
    },
    "node_modules/universalify": {
      "version": "2.0.1",
@@ -5089,6 +5361,14 @@
      "resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz",
      "integrity": "sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg=="
    },
    "node_modules/user-agents": {
      "version": "1.1.669",
      "resolved": "https://registry.npmjs.org/user-agents/-/user-agents-1.1.669.tgz",
      "integrity": "sha512-pbIzG+AOqCaIpySKJ4IAm1l0VyE4jMnK4y1thV8lm8PYxI+7X5uWcppOK7zY79TCKKTAnJH3/4gaVIZHsjrmJA==",
      "dependencies": {
        "lodash.clonedeep": "^4.5.0"
      }
    },
    "node_modules/util": {
      "version": "0.12.5",
      "resolved": "https://registry.npmjs.org/util/-/util-0.12.5.tgz",
backend/package.json
@@ -1,6 +1,6 @@
{
  "name": "dutchie-menus-backend",
  "version": "1.5.1",
  "version": "1.6.0",
  "description": "Backend API for Dutchie Menus scraper and management",
  "main": "dist/index.js",
  "scripts": {
@@ -22,6 +22,7 @@
    "seed:dt:cities:bulk": "tsx src/scripts/seed-dt-cities-bulk.ts"
  },
  "dependencies": {
    "@kubernetes/client-node": "^1.4.0",
    "@types/bcryptjs": "^3.0.0",
    "axios": "^1.6.2",
    "bcrypt": "^5.1.1",
@@ -48,6 +49,7 @@
    "puppeteer-extra-plugin-stealth": "^2.11.2",
    "sharp": "^0.32.0",
    "socks-proxy-agent": "^8.0.2",
    "user-agents": "^1.1.669",
    "uuid": "^9.0.1",
    "zod": "^3.22.4"
  },
BIN
backend/public/downloads/cannaiq-menus-1.6.0.zip
Normal file
Binary file not shown.
1
backend/public/downloads/cannaiq-menus-latest.zip
Symbolic link
@@ -0,0 +1 @@
cannaiq-menus-1.6.0.zip
46
backend/src/_deprecated/DONT_USE.md
Normal file
@@ -0,0 +1,46 @@
# DEPRECATED CODE - DO NOT USE

**These directories contain OLD, ABANDONED code.**

## What's Here

| Directory | What It Was | Why Deprecated |
|-----------|-------------|----------------|
| `hydration/` | Old pipeline for processing crawl data | Replaced by `src/tasks/handlers/` |
| `scraper-v2/` | Old Puppeteer-based scraper engine | Replaced by curl-based `src/platforms/dutchie/client.ts` |
| `canonical-hydration/` | Intermediate step toward canonical schema | Merged into task handlers |

## What to Use Instead

| Old (DONT USE) | New (USE THIS) |
|----------------|----------------|
| `hydration/normalizers/dutchie.ts` | `src/tasks/handlers/product-refresh.ts` |
| `hydration/producer.ts` | `src/tasks/handlers/payload-fetch.ts` |
| `scraper-v2/engine.ts` | `src/platforms/dutchie/client.ts` |
| `scraper-v2/scheduler.ts` | `src/services/task-scheduler.ts` |

## Why Keep This Code?

- Historical reference only
- Some patterns may be useful for debugging
- Will be deleted once confirmed not needed

## Claude Instructions

**IF YOU ARE CLAUDE:**

1. NEVER import from `src/_deprecated/`
2. NEVER reference these files as examples
3. NEVER try to "fix" or "update" code in here
4. If you see imports from these directories, suggest replacing them

**Correct imports:**

```typescript
// GOOD
import { executeGraphQL } from '../platforms/dutchie/client';
import { pool } from '../db/pool';

// BAD - DO NOT USE
import { something } from '../_deprecated/hydration/...';
import { something } from '../_deprecated/scraper-v2/...';
```
584
backend/src/_deprecated/system/routes/index.ts
Normal file
@@ -0,0 +1,584 @@
|
||||
/**
|
||||
* System API Routes
|
||||
*
|
||||
* Provides REST API endpoints for system monitoring and control:
|
||||
* - /api/system/sync/* - Sync orchestrator
|
||||
* - /api/system/dlq/* - Dead-letter queue
|
||||
* - /api/system/integrity/* - Integrity checks
|
||||
* - /api/system/fix/* - Auto-fix routines
|
||||
* - /api/system/alerts/* - System alerts
|
||||
* - /metrics - Prometheus metrics
|
||||
*
|
||||
* Phase 5: Full Production Sync + Monitoring
|
||||
*/
|
||||
|
||||
import { Router, Request, Response } from 'express';
|
||||
import { Pool } from 'pg';
|
||||
import {
|
||||
SyncOrchestrator,
|
||||
MetricsService,
|
||||
DLQService,
|
||||
AlertService,
|
||||
IntegrityService,
|
||||
AutoFixService,
|
||||
} from '../services';
|
||||
|
||||
export function createSystemRouter(pool: Pool): Router {
|
||||
const router = Router();
|
||||
|
||||
// Initialize services
|
||||
const metrics = new MetricsService(pool);
|
||||
const dlq = new DLQService(pool);
|
||||
const alerts = new AlertService(pool);
|
||||
const integrity = new IntegrityService(pool, alerts);
|
||||
const autoFix = new AutoFixService(pool, alerts);
|
||||
const orchestrator = new SyncOrchestrator(pool, metrics, dlq, alerts);
|
||||
|
||||
// ============================================================
|
||||
// SYNC ORCHESTRATOR ENDPOINTS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* GET /api/system/sync/status
|
||||
* Get current sync status
|
||||
*/
|
||||
router.get('/sync/status', async (_req: Request, res: Response) => {
|
||||
try {
|
||||
const status = await orchestrator.getStatus();
|
||||
res.json(status);
|
||||
} catch (error) {
|
||||
console.error('[System] Sync status error:', error);
|
||||
res.status(500).json({ error: 'Failed to get sync status' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST /api/system/sync/run
|
||||
* Trigger a sync run
|
||||
*/
|
||||
router.post('/sync/run', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const triggeredBy = req.body.triggeredBy || 'api';
|
||||
const result = await orchestrator.runSync();
|
||||
res.json({
|
||||
success: true,
|
||||
triggeredBy,
|
||||
metrics: result,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('[System] Sync run error:', error);
|
||||
res.status(500).json({
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : 'Sync run failed',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/system/sync/queue-depth
|
||||
* Get queue depth information
|
||||
*/
|
||||
router.get('/sync/queue-depth', async (_req: Request, res: Response) => {
|
||||
try {
|
||||
const depth = await orchestrator.getQueueDepth();
|
||||
res.json(depth);
|
||||
} catch (error) {
|
||||
console.error('[System] Queue depth error:', error);
|
||||
res.status(500).json({ error: 'Failed to get queue depth' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/system/sync/health
|
||||
* Get sync health status
|
||||
*/
|
||||
router.get('/sync/health', async (_req: Request, res: Response) => {
|
||||
try {
|
||||
const health = await orchestrator.getHealth();
|
||||
res.status(health.healthy ? 200 : 503).json(health);
|
||||
} catch (error) {
|
||||
console.error('[System] Health check error:', error);
|
||||
res.status(500).json({ healthy: false, error: 'Health check failed' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST /api/system/sync/pause
|
||||
* Pause the orchestrator
|
||||
*/
|
||||
router.post('/sync/pause', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const reason = req.body.reason || 'Manual pause';
|
||||
await orchestrator.pause(reason);
|
||||
res.json({ success: true, message: 'Orchestrator paused' });
|
||||
} catch (error) {
|
||||
console.error('[System] Pause error:', error);
|
||||
res.status(500).json({ error: 'Failed to pause orchestrator' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST /api/system/sync/resume
|
||||
* Resume the orchestrator
|
||||
*/
|
||||
router.post('/sync/resume', async (_req: Request, res: Response) => {
|
||||
try {
|
||||
await orchestrator.resume();
|
||||
res.json({ success: true, message: 'Orchestrator resumed' });
|
||||
} catch (error) {
|
||||
console.error('[System] Resume error:', error);
|
||||
res.status(500).json({ error: 'Failed to resume orchestrator' });
|
||||
}
|
||||
});
|
||||
|
||||
// ============================================================
|
||||
// DLQ ENDPOINTS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* GET /api/system/dlq
|
||||
* List DLQ payloads
|
||||
*/
|
||||
router.get('/dlq', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const options = {
|
||||
status: req.query.status as string,
|
||||
errorType: req.query.errorType as string,
|
||||
dispensaryId: req.query.dispensaryId ? parseInt(req.query.dispensaryId as string) : undefined,
|
||||
limit: req.query.limit ? parseInt(req.query.limit as string) : 50,
|
||||
offset: req.query.offset ? parseInt(req.query.offset as string) : 0,
|
||||
};
|
||||
|
||||
const result = await dlq.listPayloads(options);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[System] DLQ list error:', error);
|
||||
res.status(500).json({ error: 'Failed to list DLQ payloads' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/system/dlq/stats
|
||||
* Get DLQ statistics
|
||||
*/
|
||||
router.get('/dlq/stats', async (_req: Request, res: Response) => {
|
||||
try {
|
||||
const stats = await dlq.getStats();
|
||||
res.json(stats);
|
||||
} catch (error) {
|
||||
console.error('[System] DLQ stats error:', error);
|
||||
res.status(500).json({ error: 'Failed to get DLQ stats' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/system/dlq/summary
|
||||
* Get DLQ summary by error type
|
||||
*/
|
||||
router.get('/dlq/summary', async (_req: Request, res: Response) => {
|
||||
try {
|
||||
const summary = await dlq.getSummary();
|
||||
res.json(summary);
|
||||
} catch (error) {
|
||||
console.error('[System] DLQ summary error:', error);
|
||||
res.status(500).json({ error: 'Failed to get DLQ summary' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/system/dlq/:id
|
||||
* Get a specific DLQ payload
|
||||
*/
|
||||
router.get('/dlq/:id', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const payload = await dlq.getPayload(req.params.id);
|
||||
if (!payload) {
|
||||
return res.status(404).json({ error: 'Payload not found' });
|
||||
}
|
||||
res.json(payload);
|
||||
} catch (error) {
|
||||
console.error('[System] DLQ get error:', error);
|
||||
res.status(500).json({ error: 'Failed to get DLQ payload' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST /api/system/dlq/:id/retry
|
||||
* Retry a DLQ payload
|
||||
*/
|
||||
router.post('/dlq/:id/retry', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const result = await dlq.retryPayload(req.params.id);
|
||||
if (result.success) {
|
||||
res.json(result);
|
||||
} else {
|
||||
res.status(400).json(result);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('[System] DLQ retry error:', error);
|
||||
res.status(500).json({ error: 'Failed to retry payload' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST /api/system/dlq/:id/abandon
|
||||
* Abandon a DLQ payload
|
||||
*/
|
||||
router.post('/dlq/:id/abandon', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const reason = req.body.reason || 'Manually abandoned';
|
||||
const abandonedBy = req.body.abandonedBy || 'api';
|
||||
const success = await dlq.abandonPayload(req.params.id, reason, abandonedBy);
|
||||
res.json({ success });
|
||||
} catch (error) {
|
||||
console.error('[System] DLQ abandon error:', error);
|
||||
res.status(500).json({ error: 'Failed to abandon payload' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST /api/system/dlq/bulk-retry
|
||||
* Bulk retry payloads by error type
|
||||
*/
|
||||
router.post('/dlq/bulk-retry', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { errorType } = req.body;
|
||||
if (!errorType) {
|
||||
return res.status(400).json({ error: 'errorType is required' });
|
||||
}
|
||||
const result = await dlq.bulkRetryByErrorType(errorType);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[System] DLQ bulk retry error:', error);
|
||||
res.status(500).json({ error: 'Failed to bulk retry' });
|
||||
}
|
||||
});
|
||||
|
||||
// ============================================================
|
||||
// INTEGRITY CHECK ENDPOINTS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* POST /api/system/integrity/run
|
||||
* Run all integrity checks
|
||||
*/
|
||||
router.post('/integrity/run', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const triggeredBy = req.body.triggeredBy || 'api';
|
||||
const result = await integrity.runAllChecks(triggeredBy);
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[System] Integrity run error:', error);
|
||||
res.status(500).json({ error: 'Failed to run integrity checks' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/system/integrity/runs
|
||||
* Get recent integrity check runs
|
||||
*/
|
||||
router.get('/integrity/runs', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const limit = req.query.limit ? parseInt(req.query.limit as string) : 10;
|
||||
const runs = await integrity.getRecentRuns(limit);
|
||||
res.json(runs);
|
||||
} catch (error) {
|
||||
console.error('[System] Integrity runs error:', error);
|
||||
res.status(500).json({ error: 'Failed to get integrity runs' });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/system/integrity/runs/:runId
|
||||
* Get results for a specific integrity run
|
||||
*/
|
||||
router.get('/integrity/runs/:runId', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const results = await integrity.getRunResults(req.params.runId);
|
||||
res.json(results);
|
||||
} catch (error) {
|
||||
console.error('[System] Integrity run results error:', error);
|
||||
res.status(500).json({ error: 'Failed to get run results' });
|
||||
}
|
||||
});
|
||||
|
||||
// ============================================================
|
||||
// AUTO-FIX ENDPOINTS
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* GET /api/system/fix/routines
|
||||
* Get available fix routines
|
||||
*/
|
||||
router.get('/fix/routines', (_req: Request, res: Response) => {
|
||||
try {
|
||||
const routines = autoFix.getAvailableRoutines();
|
||||
res.json(routines);
|
||||
} catch (error) {
|
||||
    console.error('[System] Get routines error:', error);
    res.status(500).json({ error: 'Failed to get routines' });
  }
});

/**
 * POST /api/system/fix/:routine
 * Run a fix routine
 */
router.post('/fix/:routine', async (req: Request, res: Response) => {
  try {
    const routineName = req.params.routine;
    const dryRun = req.body.dryRun === true;
    const triggeredBy = req.body.triggeredBy || 'api';

    const result = await autoFix.runRoutine(routineName as any, triggeredBy, { dryRun });
    res.json(result);
  } catch (error) {
    console.error('[System] Fix routine error:', error);
    res.status(500).json({ error: 'Failed to run fix routine' });
  }
});

/**
 * GET /api/system/fix/runs
 * Get recent fix runs
 */
router.get('/fix/runs', async (req: Request, res: Response) => {
  try {
    const limit = req.query.limit ? parseInt(req.query.limit as string) : 20;
    const runs = await autoFix.getRecentRuns(limit);
    res.json(runs);
  } catch (error) {
    console.error('[System] Fix runs error:', error);
    res.status(500).json({ error: 'Failed to get fix runs' });
  }
});

// ============================================================
// ALERTS ENDPOINTS
// ============================================================

/**
 * GET /api/system/alerts
 * List alerts
 */
router.get('/alerts', async (req: Request, res: Response) => {
  try {
    const options = {
      status: req.query.status as any,
      severity: req.query.severity as any,
      type: req.query.type as string,
      limit: req.query.limit ? parseInt(req.query.limit as string) : 50,
      offset: req.query.offset ? parseInt(req.query.offset as string) : 0,
    };

    const result = await alerts.listAlerts(options);
    res.json(result);
  } catch (error) {
    console.error('[System] Alerts list error:', error);
    res.status(500).json({ error: 'Failed to list alerts' });
  }
});

/**
 * GET /api/system/alerts/active
 * Get active alerts
 */
router.get('/alerts/active', async (_req: Request, res: Response) => {
  try {
    const activeAlerts = await alerts.getActiveAlerts();
    res.json(activeAlerts);
  } catch (error) {
    console.error('[System] Active alerts error:', error);
    res.status(500).json({ error: 'Failed to get active alerts' });
  }
});

/**
 * GET /api/system/alerts/summary
 * Get alert summary
 */
router.get('/alerts/summary', async (_req: Request, res: Response) => {
  try {
    const summary = await alerts.getSummary();
    res.json(summary);
  } catch (error) {
    console.error('[System] Alerts summary error:', error);
    res.status(500).json({ error: 'Failed to get alerts summary' });
  }
});

/**
 * POST /api/system/alerts/:id/acknowledge
 * Acknowledge an alert
 */
router.post('/alerts/:id/acknowledge', async (req: Request, res: Response) => {
  try {
    const alertId = parseInt(req.params.id);
    const acknowledgedBy = req.body.acknowledgedBy || 'api';
    const success = await alerts.acknowledgeAlert(alertId, acknowledgedBy);
    res.json({ success });
  } catch (error) {
    console.error('[System] Acknowledge alert error:', error);
    res.status(500).json({ error: 'Failed to acknowledge alert' });
  }
});

/**
 * POST /api/system/alerts/:id/resolve
 * Resolve an alert
 */
router.post('/alerts/:id/resolve', async (req: Request, res: Response) => {
  try {
    const alertId = parseInt(req.params.id);
    const resolvedBy = req.body.resolvedBy || 'api';
    const success = await alerts.resolveAlert(alertId, resolvedBy);
    res.json({ success });
  } catch (error) {
    console.error('[System] Resolve alert error:', error);
    res.status(500).json({ error: 'Failed to resolve alert' });
  }
});

/**
 * POST /api/system/alerts/bulk-acknowledge
 * Bulk acknowledge alerts
 */
router.post('/alerts/bulk-acknowledge', async (req: Request, res: Response) => {
  try {
    const { ids, acknowledgedBy } = req.body;
    if (!ids || !Array.isArray(ids)) {
      return res.status(400).json({ error: 'ids array is required' });
    }
    const count = await alerts.bulkAcknowledge(ids, acknowledgedBy || 'api');
    res.json({ acknowledged: count });
  } catch (error) {
    console.error('[System] Bulk acknowledge error:', error);
    res.status(500).json({ error: 'Failed to bulk acknowledge' });
  }
});

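A minimal sketch of how the bulk-acknowledge endpoint above can be called; the ids and the acknowledger value are illustrative, not taken from this diff:

```ts
// Hypothetical client call against POST /api/system/alerts/bulk-acknowledge.
// The endpoint responds 400 unless `ids` is an array.
await fetch('/api/system/alerts/bulk-acknowledge', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ ids: [101, 102, 103], acknowledgedBy: 'ops@example.com' }),
}); // → { acknowledged: 3 } on success
```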
// ============================================================
// METRICS ENDPOINTS
// ============================================================

/**
 * GET /api/system/metrics
 * Get all current metrics
 */
router.get('/metrics', async (_req: Request, res: Response) => {
  try {
    const allMetrics = await metrics.getAllMetrics();
    res.json(allMetrics);
  } catch (error) {
    console.error('[System] Metrics error:', error);
    res.status(500).json({ error: 'Failed to get metrics' });
  }
});

/**
 * GET /api/system/metrics/:name
 * Get a specific metric
 */
router.get('/metrics/:name', async (req: Request, res: Response) => {
  try {
    const metric = await metrics.getMetric(req.params.name);
    if (!metric) {
      return res.status(404).json({ error: 'Metric not found' });
    }
    res.json(metric);
  } catch (error) {
    console.error('[System] Metric error:', error);
    res.status(500).json({ error: 'Failed to get metric' });
  }
});

/**
 * GET /api/system/metrics/:name/history
 * Get metric time series
 */
router.get('/metrics/:name/history', async (req: Request, res: Response) => {
  try {
    const hours = req.query.hours ? parseInt(req.query.hours as string) : 24;
    const history = await metrics.getMetricHistory(req.params.name, hours);
    res.json(history);
  } catch (error) {
    console.error('[System] Metric history error:', error);
    res.status(500).json({ error: 'Failed to get metric history' });
  }
});

/**
 * GET /api/system/errors
 * Get error summary
 */
router.get('/errors', async (_req: Request, res: Response) => {
  try {
    const summary = await metrics.getErrorSummary();
    res.json(summary);
  } catch (error) {
    console.error('[System] Error summary error:', error);
    res.status(500).json({ error: 'Failed to get error summary' });
  }
});

/**
 * GET /api/system/errors/recent
 * Get recent errors
 */
router.get('/errors/recent', async (req: Request, res: Response) => {
  try {
    const limit = req.query.limit ? parseInt(req.query.limit as string) : 50;
    const errorType = req.query.type as string;
    const errors = await metrics.getRecentErrors(limit, errorType);
    res.json(errors);
  } catch (error) {
    console.error('[System] Recent errors error:', error);
    res.status(500).json({ error: 'Failed to get recent errors' });
  }
});

/**
 * POST /api/system/errors/acknowledge
 * Acknowledge errors
 */
router.post('/errors/acknowledge', async (req: Request, res: Response) => {
  try {
    const { ids, acknowledgedBy } = req.body;
    if (!ids || !Array.isArray(ids)) {
      return res.status(400).json({ error: 'ids array is required' });
    }
    const count = await metrics.acknowledgeErrors(ids, acknowledgedBy || 'api');
    res.json({ acknowledged: count });
  } catch (error) {
    console.error('[System] Acknowledge errors error:', error);
    res.status(500).json({ error: 'Failed to acknowledge errors' });
  }
});

  return router;
}

/**
 * Create Prometheus metrics endpoint (standalone)
 */
export function createPrometheusRouter(pool: Pool): Router {
  const router = Router();
  const metrics = new MetricsService(pool);

  /**
   * GET /metrics
   * Prometheus-compatible metrics endpoint
   */
  router.get('/', async (_req: Request, res: Response) => {
    try {
      const prometheusOutput = await metrics.getPrometheusMetrics();
      res.set('Content-Type', 'text/plain; version=0.0.4');
      res.send(prometheusOutput);
    } catch (error) {
      console.error('[Prometheus] Metrics error:', error);
      res.status(500).send('# Error generating metrics');
    }
  });

  return router;
}
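For orientation, a sketch of exercising the fix-routine endpoint defined earlier in this file; BASE_URL and the token are assumptions, not part of this diff:

```ts
// Hypothetical client for POST /api/system/fix/:routine.
const BASE_URL = 'http://localhost:3000'; // assumed server address

async function dryRunFix(routine: string, token: string) {
  const res = await fetch(`${BASE_URL}/api/system/fix/${routine}`, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${token}`,
      'Content-Type': 'application/json',
    },
    // dryRun=true exercises the routine without applying changes
    body: JSON.stringify({ dryRun: true, triggeredBy: 'manual-check' }),
  });
  return res.json();
}
```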
@@ -32,6 +32,7 @@ const TRUSTED_ORIGINS = [

// Pattern-based trusted origins (wildcards)
const TRUSTED_ORIGIN_PATTERNS = [
  /^https:\/\/.*\.cannabrands\.app$/, // *.cannabrands.app
  /^https:\/\/.*\.cannaiq\.co$/,      // *.cannaiq.co
];

// Trusted IPs for internal pod-to-pod communication
@@ -152,7 +153,53 @@ export async function authenticateUser(email: string, password: string): Promise
}

export async function authMiddleware(req: AuthRequest, res: Response, next: NextFunction) {
  // Allow trusted origins/IPs to bypass auth (internal services, same-origin)
  const authHeader = req.headers.authorization;

  // If a Bearer token is provided, always try to use it first (logged-in user)
  if (authHeader && authHeader.startsWith('Bearer ')) {
    const token = authHeader.substring(7);

    // Try JWT first
    const jwtUser = verifyToken(token);

    if (jwtUser) {
      req.user = jwtUser;
      return next();
    }

    // If JWT fails, try API token
    try {
      const result = await pool.query(`
        SELECT id, name, rate_limit, active, expires_at, allowed_endpoints
        FROM api_tokens
        WHERE token = $1
      `, [token]);

      if (result.rows.length > 0) {
        const apiToken = result.rows[0];
        if (!apiToken.active) {
          return res.status(401).json({ error: 'API token is inactive' });
        }
        if (apiToken.expires_at && new Date(apiToken.expires_at) < new Date()) {
          return res.status(401).json({ error: 'API token has expired' });
        }
        req.user = {
          id: 0,
          email: `api:${apiToken.name}`,
          role: 'api_token'
        };
        req.apiToken = apiToken;
        return next();
      }
    } catch (err) {
      console.error('API token lookup error:', err);
    }

    // Token provided but invalid
    return res.status(401).json({ error: 'Invalid token' });
  }

  // No token provided - check trusted origins for API access (WordPress, etc.)
  if (isTrustedRequest(req)) {
    req.user = {
      id: 0,
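The precedence this rewritten middleware enforces, condensed into a test-style sketch; `app` and the use of supertest are assumptions for illustration only:

```ts
// Sketch: an invalid Bearer token now fails with 401 even from a trusted
// origin, because a provided token is always evaluated before origin checks.
import request from 'supertest'; // assumed test dependency

await request(app)
  .get('/api/anything')
  .set('Origin', 'https://cannaiq.co')
  .set('Authorization', 'Bearer not-a-real-token')
  .expect(401);

// With no token at all, a trusted origin is allowed through.
await request(app)
  .get('/api/anything')
  .set('Origin', 'https://cannaiq.co')
  .expect(200);
```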
@@ -162,80 +209,10 @@ export async function authMiddleware(req: AuthRequest, res: Response, next: Next
    return next();
  }

  const authHeader = req.headers.authorization;

  if (!authHeader || !authHeader.startsWith('Bearer ')) {
    return res.status(401).json({ error: 'No token provided' });
  }

  const token = authHeader.substring(7);

  // Try JWT first
  const jwtUser = verifyToken(token);

  if (jwtUser) {
    req.user = jwtUser;
    return next();
  }

  // If JWT fails, try API token
  try {
    const result = await pool.query(`
      SELECT id, name, rate_limit, active, expires_at, allowed_endpoints
      FROM api_tokens
      WHERE token = $1
    `, [token]);

    if (result.rows.length === 0) {
      return res.status(401).json({ error: 'Invalid token' });
    }

    const apiToken = result.rows[0];

    // Check if token is active
    if (!apiToken.active) {
      return res.status(401).json({ error: 'Token is disabled' });
    }

    // Check if token is expired
    if (apiToken.expires_at && new Date(apiToken.expires_at) < new Date()) {
      return res.status(401).json({ error: 'Token has expired' });
    }

    // Check allowed endpoints
    if (apiToken.allowed_endpoints && apiToken.allowed_endpoints.length > 0) {
      const isAllowed = apiToken.allowed_endpoints.some((pattern: string) => {
        // Simple wildcard matching
        const regex = new RegExp('^' + pattern.replace('*', '.*') + '$');
        return regex.test(req.path);
      });

      if (!isAllowed) {
        return res.status(403).json({ error: 'Endpoint not allowed for this token' });
      }
    }

    // Set API token on request for tracking
    req.apiToken = {
      id: apiToken.id,
      name: apiToken.name,
      rate_limit: apiToken.rate_limit
    };

    // Set a generic user for compatibility with existing code
    req.user = {
      id: apiToken.id,
      email: `api-token-${apiToken.id}@system`,
      role: 'api'
    };

    next();
  } catch (error) {
    console.error('Error verifying API token:', error);
    return res.status(500).json({ error: 'Authentication failed' });
  }
  return res.status(401).json({ error: 'No token provided' });
}


/**
 * Require specific role(s) to access endpoint.
 *

@@ -172,6 +172,9 @@ export async function runFullDiscovery(
    console.log(`Errors: ${totalErrors}`);
  }

  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
  let newDispensaryIds: number[] = [];

  // Step 4: Auto-validate and promote discovered locations
  if (!dryRun && totalLocationsUpserted > 0) {
    console.log('\n[Discovery] Step 4: Auto-promoting discovered locations...');
@@ -180,6 +183,13 @@ export async function runFullDiscovery(
    console.log(`  Created: ${promotionResult.created} new dispensaries`);
    console.log(`  Updated: ${promotionResult.updated} existing dispensaries`);
    console.log(`  Rejected: ${promotionResult.rejected} (validation failed)`);

    // Per TASK_WORKFLOW_2024-12-10.md: Capture new IDs for task chaining
    newDispensaryIds = promotionResult.newDispensaryIds;
    if (newDispensaryIds.length > 0) {
      console.log(`  New store IDs for crawl: [${newDispensaryIds.join(', ')}]`);
    }

    if (promotionResult.rejectedRecords.length > 0) {
      console.log(`  Rejection reasons:`);
      promotionResult.rejectedRecords.slice(0, 5).forEach(r => {
@@ -214,6 +224,8 @@ export async function runFullDiscovery(
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
    // Per TASK_WORKFLOW_2024-12-10.md: Return new IDs for task chaining
    newDispensaryIds,
  };
}


@@ -127,6 +127,8 @@ export interface PromotionSummary {
    errors: string[];
  }>;
  durationMs: number;
  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
  newDispensaryIds: number[];
}

/**
@@ -469,6 +471,8 @@ export async function promoteDiscoveredLocations(

  const results: PromotionResult[] = [];
  const rejectedRecords: PromotionSummary['rejectedRecords'] = [];
  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
  const newDispensaryIds: number[] = [];
  let created = 0;
  let updated = 0;
  let skipped = 0;
@@ -525,6 +529,8 @@ export async function promoteDiscoveredLocations(

      if (promotionResult.action === 'created') {
        created++;
        // Per TASK_WORKFLOW_2024-12-10.md: Track new IDs for task chaining
        newDispensaryIds.push(promotionResult.dispensaryId);
      } else {
        updated++;
      }
@@ -548,6 +554,8 @@ export async function promoteDiscoveredLocations(
    results,
    rejectedRecords,
    durationMs: Date.now() - startTime,
    // Per TASK_WORKFLOW_2024-12-10.md: Return new IDs for task chaining
    newDispensaryIds,
  };
}


@@ -211,6 +211,8 @@ export interface FullDiscoveryResult {
  totalLocationsFound: number;
  totalLocationsUpserted: number;
  durationMs: number;
  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
  newDispensaryIds?: number[];
}

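A sketch of the task-chaining pattern these new fields enable; `enqueueCrawlTask` is hypothetical, since the actual scheduler API is not shown in this diff:

```ts
// Newly promoted stores get a first crawl queued immediately,
// instead of waiting for the next scheduled discovery pass.
const discovery = await runFullDiscovery(/* ...options elided... */);

for (const dispensaryId of discovery.newDispensaryIds ?? []) {
  await enqueueCrawlTask({ dispensaryId, reason: 'post-discovery' }); // hypothetical helper
}
```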
// ============================================================

@@ -90,7 +90,7 @@ export async function upsertStoreProducts(
        name_raw, brand_name_raw, category_raw, subcategory_raw,
        price_rec, price_med, price_rec_special, price_med_special,
        is_on_special, discount_percent,
        is_in_stock, stock_status,
        is_in_stock, stock_status, stock_quantity, total_quantity_available,
        thc_percent, cbd_percent,
        image_url,
        first_seen_at, last_seen_at, updated_at
@@ -99,9 +99,9 @@ export async function upsertStoreProducts(
        $5, $6, $7, $8,
        $9, $10, $11, $12,
        $13, $14,
        $15, $16,
        $17, $18,
        $19,
        $15, $16, $17, $17,
        $18, $19,
        $20,
        NOW(), NOW(), NOW()
      )
      ON CONFLICT (dispensary_id, provider, provider_product_id)
@@ -118,6 +118,8 @@ export async function upsertStoreProducts(
        discount_percent = EXCLUDED.discount_percent,
        is_in_stock = EXCLUDED.is_in_stock,
        stock_status = EXCLUDED.stock_status,
        stock_quantity = EXCLUDED.stock_quantity,
        total_quantity_available = EXCLUDED.total_quantity_available,
        thc_percent = EXCLUDED.thc_percent,
        cbd_percent = EXCLUDED.cbd_percent,
        image_url = EXCLUDED.image_url,
@@ -141,6 +143,7 @@ export async function upsertStoreProducts(
      productPricing?.discountPercent,
      productAvailability?.inStock ?? true,
      productAvailability?.stockStatus || 'unknown',
      productAvailability?.quantity ?? null, // stock_quantity and total_quantity_available
      // Clamp THC/CBD to valid percentage range (0-100) - some products report mg as %
      product.thcPercent !== null && product.thcPercent <= 100 ? product.thcPercent : null,
      product.cbdPercent !== null && product.cbdPercent <= 100 ? product.cbdPercent : null,

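Note how the hunk above maps a single JS value to two columns by repeating a placeholder: $17 feeds both stock_quantity and total_quantity_available. A self-contained illustration of the same node-postgres trick, with made-up table and column names:

```ts
import { Pool } from 'pg';

const pool = new Pool(); // connection settings assumed via PG* env vars

// One bound parameter, two columns — pg allows reusing $n placeholders.
await pool.query(
  `INSERT INTO example (a, b) VALUES ($1, $1)`,
  [42],
);
```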
@@ -6,6 +6,8 @@ import { initializeMinio, isMinioEnabled } from './utils/minio';
import { initializeImageStorage } from './utils/image-storage';
import { logger } from './services/logger';
import { cleanupOrphanedJobs } from './services/proxyTestQueue';
// Per TASK_WORKFLOW_2024-12-10.md: Database-driven task scheduler
import { taskScheduler } from './services/task-scheduler';
import { runAutoMigrations } from './db/auto-migrate';
import { getPool } from './db/pool';
import healthRoutes from './routes/health';
@@ -107,7 +109,7 @@ import scraperMonitorRoutes from './routes/scraper-monitor';
import apiTokensRoutes from './routes/api-tokens';
import apiPermissionsRoutes from './routes/api-permissions';
import parallelScrapeRoutes from './routes/parallel-scrape';
import crawlerSandboxRoutes from './routes/crawler-sandbox';
// crawler-sandbox moved to _deprecated
import versionRoutes from './routes/version';
import deployStatusRoutes from './routes/deploy-status';
import publicApiRoutes from './routes/public-api';
@@ -142,6 +144,9 @@ import seoRoutes from './routes/seo';
import priceAnalyticsRoutes from './routes/price-analytics';
import tasksRoutes from './routes/tasks';
import workerRegistryRoutes from './routes/worker-registry';
// Per TASK_WORKFLOW_2024-12-10.md: Raw payload access API
import payloadsRoutes from './routes/payloads';
import k8sRoutes from './routes/k8s';

// Mark requests from trusted domains (cannaiq.co, findagram.co, findadispo.com)
// These domains can access the API without authentication
@@ -182,7 +187,7 @@ app.use('/api/scraper-monitor', scraperMonitorRoutes);
app.use('/api/api-tokens', apiTokensRoutes);
app.use('/api/api-permissions', apiPermissionsRoutes);
app.use('/api/parallel-scrape', parallelScrapeRoutes);
app.use('/api/crawler-sandbox', crawlerSandboxRoutes);
// crawler-sandbox moved to _deprecated
app.use('/api/version', versionRoutes);
app.use('/api/admin/deploy-status', deployStatusRoutes);
console.log('[DeployStatus] Routes registered at /api/admin/deploy-status');
@@ -222,6 +227,14 @@ console.log('[Tasks] Routes registered at /api/tasks');
app.use('/api/worker-registry', workerRegistryRoutes);
console.log('[WorkerRegistry] Routes registered at /api/worker-registry');

// Per TASK_WORKFLOW_2024-12-10.md: Raw payload access API
app.use('/api/payloads', payloadsRoutes);
console.log('[Payloads] Routes registered at /api/payloads');

// K8s control routes - worker scaling from admin UI
app.use('/api/k8s', k8sRoutes);
console.log('[K8s] Routes registered at /api/k8s');

// Phase 3: Analytics V2 - Enhanced analytics with rec/med state segmentation
try {
  const analyticsV2Router = createAnalyticsV2Router(getPool());
@@ -326,6 +339,17 @@ async function startServer() {
  // Clean up any orphaned proxy test jobs from previous server runs
  await cleanupOrphanedJobs();

  // Per TASK_WORKFLOW_2024-12-10.md: Start database-driven task scheduler
  // This replaces node-cron - schedules are stored in DB and survive restarts
  // Uses SELECT FOR UPDATE SKIP LOCKED for multi-replica safety
  try {
    await taskScheduler.start();
    logger.info('system', 'Task scheduler started');
  } catch (err: any) {
    // Non-fatal - scheduler can recover on next poll
    logger.warn('system', `Task scheduler startup warning: ${err.message}`);
  }

  app.listen(PORT, () => {
    logger.info('system', `Server running on port ${PORT}`);
    console.log(`🚀 Server running on port ${PORT}`);

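The startup comment above names SELECT FOR UPDATE SKIP LOCKED as the multi-replica safety mechanism. A minimal sketch of that claim pattern with node-postgres — table and column names are assumptions, not the scheduler's actual schema:

```ts
import { Pool } from 'pg';

const pool = new Pool(); // connection config assumed from PG* env vars

async function claimDueTask() {
  const client = await pool.connect();
  try {
    await client.query('BEGIN');
    // SKIP LOCKED lets concurrent replicas pass over rows this
    // transaction has locked, so each due task is claimed exactly once.
    const { rows } = await client.query(`
      SELECT id, task_type
      FROM scheduled_tasks
      WHERE next_run_at <= NOW()
      ORDER BY next_run_at
      FOR UPDATE SKIP LOCKED
      LIMIT 1
    `);
    // ... mark the claimed row as running before releasing the lock ...
    await client.query('COMMIT');
    return rows[0] ?? null; // null: nothing due, or another replica got there first
  } catch (err) {
    await client.query('ROLLBACK');
    throw err;
  } finally {
    client.release();
  }
}
```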
@@ -5,8 +5,8 @@ import { Request, Response, NextFunction } from 'express';
 * These are our own frontends that should have unrestricted access.
 */
const TRUSTED_DOMAINS = [
  'cannaiq.co',
  'www.cannaiq.co',
  '*.cannaiq.co',
  '*.cannabrands.app',
  'findagram.co',
  'www.findagram.co',
  'findadispo.com',
@@ -32,6 +32,24 @@ function extractDomain(header: string): string | null {
  }
}

/**
 * Checks if a domain matches any trusted domain (supports *.domain.com wildcards)
 */
function isTrustedDomain(domain: string): boolean {
  for (const trusted of TRUSTED_DOMAINS) {
    if (trusted.startsWith('*.')) {
      // Wildcard: *.example.com matches example.com and any subdomain
      const baseDomain = trusted.slice(2);
      if (domain === baseDomain || domain.endsWith('.' + baseDomain)) {
        return true;
      }
    } else if (domain === trusted) {
      return true;
    }
  }
  return false;
}
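How the wildcard matcher above behaves, worked through directly from its definition:

```ts
isTrustedDomain('cannaiq.co');          // true  — exact entry in TRUSTED_DOMAINS
isTrustedDomain('app.cannabrands.app'); // true  — matches '*.cannabrands.app'
isTrustedDomain('cannabrands.app');     // true  — wildcard also covers the bare domain
isTrustedDomain('evil-cannaiq.co');     // false — suffix match requires a '.' boundary
```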
/**
 * Checks if the request comes from a trusted domain
 */
@@ -42,7 +60,7 @@ function isRequestFromTrustedDomain(req: Request): boolean {
  // Check Origin header first (preferred for CORS requests)
  if (origin) {
    const domain = extractDomain(origin);
    if (domain && TRUSTED_DOMAINS.includes(domain)) {
    if (domain && isTrustedDomain(domain)) {
      return true;
    }
  }
@@ -50,7 +68,7 @@ function isRequestFromTrustedDomain(req: Request): boolean {
  // Fallback to Referer header
  if (referer) {
    const domain = extractDomain(referer);
    if (domain && TRUSTED_DOMAINS.includes(domain)) {
    if (domain && isTrustedDomain(domain)) {
      return true;
    }
  }

@@ -702,12 +702,10 @@ export class StateQueryService {
  async getNationalSummary(): Promise<NationalSummary> {
    const stateMetrics = await this.getAllStateMetrics();

    // Get all states count and aggregate metrics
    const result = await this.pool.query(`
      SELECT
        COUNT(DISTINCT s.code) AS total_states,
        COUNT(DISTINCT CASE WHEN EXISTS (
          SELECT 1 FROM dispensaries d WHERE d.state = s.code AND d.menu_type IS NOT NULL
        ) THEN s.code END) AS active_states,
        (SELECT COUNT(*) FROM dispensaries WHERE state IS NOT NULL) AS total_stores,
        (SELECT COUNT(*) FROM store_products sp
         JOIN dispensaries d ON sp.dispensary_id = d.id
@@ -725,7 +723,7 @@ export class StateQueryService {

    return {
      totalStates: parseInt(data.total_states),
      activeStates: parseInt(data.active_states),
      activeStates: parseInt(data.total_states), // Same as totalStates - all states shown
      totalStores: parseInt(data.total_stores),
      totalProducts: parseInt(data.total_products),
      totalBrands: parseInt(data.total_brands),

|
||||
*
|
||||
* DO NOT MODIFY THIS FILE WITHOUT EXPLICIT AUTHORIZATION.
|
||||
*
|
||||
* This is the canonical HTTP client for all Dutchie communication.
|
||||
* All Dutchie workers (Alice, Bella, etc.) MUST use this client.
|
||||
* Updated: 2025-12-10 per workflow-12102025.md
|
||||
*
|
||||
* KEY BEHAVIORS (per workflow-12102025.md):
|
||||
* 1. startSession() gets identity from PROXY LOCATION, not task params
|
||||
* 2. On 403: immediately get new IP + new fingerprint, then retry
|
||||
* 3. After 3 consecutive 403s on same proxy → disable it (burned)
|
||||
* 4. Language is always English (en-US)
|
||||
*
|
||||
* IMPLEMENTATION:
|
||||
* - Uses curl via child_process.execSync (bypasses TLS fingerprinting)
|
||||
* - NO Puppeteer, NO axios, NO fetch
|
||||
* - Fingerprint rotation on 403
|
||||
* - Uses intoli/user-agents via CrawlRotator for realistic fingerprints
|
||||
* - Residential IP compatible
|
||||
*
|
||||
* USAGE:
|
||||
* import { curlPost, curlGet, executeGraphQL } from '@dutchie/client';
|
||||
* import { curlPost, curlGet, executeGraphQL, startSession } from '@dutchie/client';
|
||||
*
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
import { execSync } from 'child_process';
|
||||
import {
|
||||
buildOrderedHeaders,
|
||||
buildRefererFromMenuUrl,
|
||||
getCurlBinary,
|
||||
isCurlImpersonateAvailable,
|
||||
HeaderContext,
|
||||
BrowserType,
|
||||
} from '../../services/http-fingerprint';
|
||||
|
||||
// ============================================================
|
||||
// TYPES
|
||||
@@ -32,6 +45,8 @@ export interface CurlResponse {
|
||||
error?: string;
|
||||
}
|
||||
|
||||
// Per workflow-12102025.md: fingerprint comes from CrawlRotator's BrowserFingerprint
|
||||
// We keep a simplified interface here for header building
|
||||
export interface Fingerprint {
|
||||
userAgent: string;
|
||||
acceptLanguage: string;
|
||||
@@ -57,15 +72,13 @@ export const DUTCHIE_CONFIG = {
|
||||
|
||||
// ============================================================
|
||||
// PROXY SUPPORT
|
||||
// ============================================================
|
||||
// Integrates with the CrawlRotator system from proxy-rotator.ts
|
||||
// On 403 errors:
|
||||
// 1. Record failure on current proxy
|
||||
// 2. Rotate to next proxy
|
||||
// 3. Retry with new proxy
|
||||
// Per workflow-12102025.md:
|
||||
// - On 403: recordBlock() → increment consecutive_403_count
|
||||
// - After 3 consecutive 403s → proxy disabled
|
||||
// - Immediately rotate to new IP + new fingerprint on 403
|
||||
// ============================================================
|
||||
|
||||
import type { CrawlRotator, Proxy } from '../../services/crawl-rotator';
|
||||
import type { CrawlRotator, BrowserFingerprint } from '../../services/crawl-rotator';
|
||||
|
||||
let currentProxy: string | null = null;
|
||||
let crawlRotator: CrawlRotator | null = null;
|
||||
@@ -92,13 +105,12 @@ export function getProxy(): string | null {
|
||||
|
||||
/**
|
||||
* Set CrawlRotator for proxy rotation on 403s
|
||||
* This enables automatic proxy rotation when blocked
|
||||
* Per workflow-12102025.md: enables automatic rotation when blocked
|
||||
*/
|
||||
export function setCrawlRotator(rotator: CrawlRotator | null): void {
|
||||
crawlRotator = rotator;
|
||||
if (rotator) {
|
||||
console.log('[Dutchie Client] CrawlRotator attached - proxy rotation enabled');
|
||||
// Set initial proxy from rotator
|
||||
const proxy = rotator.proxy.getCurrent();
|
||||
if (proxy) {
|
||||
currentProxy = rotator.proxy.getProxyUrl(proxy);
|
||||
@@ -115,30 +127,41 @@ export function getCrawlRotator(): CrawlRotator | null {
|
||||
}
|
||||
|
||||
/**
|
||||
* Rotate to next proxy (called on 403)
|
||||
* Handle 403 block - per workflow-12102025.md:
|
||||
* 1. Record block on current proxy (increments consecutive_403_count)
|
||||
* 2. Immediately rotate to new proxy (new IP)
|
||||
* 3. Rotate fingerprint
|
||||
* Returns false if no more proxies available
|
||||
*/
|
||||
async function rotateProxyOn403(error?: string): Promise<boolean> {
|
||||
async function handle403Block(): Promise<boolean> {
|
||||
if (!crawlRotator) {
|
||||
console.warn('[Dutchie Client] No CrawlRotator - cannot handle 403');
|
||||
return false;
|
||||
}
|
||||
|
||||
// Record failure on current proxy
|
||||
await crawlRotator.recordFailure(error || '403 Forbidden');
|
||||
// Per workflow-12102025.md: record block (tracks consecutive 403s)
|
||||
const wasDisabled = await crawlRotator.recordBlock();
|
||||
if (wasDisabled) {
|
||||
console.log('[Dutchie Client] Current proxy was disabled (3 consecutive 403s)');
|
||||
}
|
||||
|
||||
// Per workflow-12102025.md: immediately get new IP + new fingerprint
|
||||
const { proxy: nextProxy, fingerprint } = crawlRotator.rotateBoth();
|
||||
|
||||
// Rotate to next proxy
|
||||
const nextProxy = crawlRotator.rotateProxy();
|
||||
if (nextProxy) {
|
||||
currentProxy = crawlRotator.proxy.getProxyUrl(nextProxy);
|
||||
console.log(`[Dutchie Client] Rotated proxy: ${currentProxy.replace(/:[^:@]+@/, ':***@')}`);
|
||||
console.log(`[Dutchie Client] Rotated to new proxy: ${currentProxy.replace(/:[^:@]+@/, ':***@')}`);
|
||||
console.log(`[Dutchie Client] New fingerprint: ${fingerprint.userAgent.slice(0, 50)}...`);
|
||||
return true;
|
||||
}
|
||||
|
||||
console.warn('[Dutchie Client] No more proxies available');
|
||||
console.error('[Dutchie Client] No more proxies available!');
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Record success on current proxy
|
||||
* Per workflow-12102025.md: resets consecutive_403_count
|
||||
*/
|
||||
async function recordProxySuccess(responseTimeMs?: number): Promise<void> {
|
||||
if (crawlRotator) {
|
||||
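Before the next hunk, the 403 contract the code above implements, condensed into one place as comments (no new behavior, just the control flow restated):

```ts
// 403 received
//   → crawlRotator.recordBlock()   // consecutive_403_count++
//       (3rd consecutive 403 → proxy disabled as burned)
//   → crawlRotator.rotateBoth()    // new IP *and* new fingerprint together
//   → retry the request
// 200 received
//   → recordProxySuccess(ms)       // resets consecutive_403_count
```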
@@ -162,163 +185,69 @@ export const GRAPHQL_HASHES = {
  GetAllCitiesByState: 'ae547a0466ace5a48f91e55bf6699eacd87e3a42841560f0c0eabed5a0a920e6',
};

// ============================================================
// FINGERPRINTS - Browser profiles for anti-detect
// ============================================================

const FINGERPRINTS: Fingerprint[] = [
  // Chrome Windows (latest) - typical residential user, use first
  {
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    secChUaPlatform: '"Windows"',
    secChUaMobile: '?0',
  },
  // Chrome Mac (latest)
  {
    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    secChUaPlatform: '"macOS"',
    secChUaMobile: '?0',
  },
  // Chrome Windows (120)
  {
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
    secChUaPlatform: '"Windows"',
    secChUaMobile: '?0',
  },
  // Firefox Windows
  {
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
    acceptLanguage: 'en-US,en;q=0.5',
  },
  // Safari Mac
  {
    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    acceptLanguage: 'en-US,en;q=0.9',
  },
  // Edge Windows
  {
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
    acceptLanguage: 'en-US,en;q=0.9',
    secChUa: '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    secChUaPlatform: '"Windows"',
    secChUaMobile: '?0',
  },
];

let currentFingerprintIndex = 0;

// Forward declaration for session (actual CrawlSession interface defined later)
let currentSession: {
  sessionId: string;
  fingerprint: Fingerprint;
  proxyUrl: string | null;
  stateCode?: string;
  timezone?: string;
  startedAt: Date;
} | null = null;

/**
 * Get current fingerprint - returns session fingerprint if active, otherwise default
 */
export function getFingerprint(): Fingerprint {
  // Use session fingerprint if a session is active
  if (currentSession) {
    return currentSession.fingerprint;
  }
  return FINGERPRINTS[currentFingerprintIndex];
}

export function rotateFingerprint(): Fingerprint {
  currentFingerprintIndex = (currentFingerprintIndex + 1) % FINGERPRINTS.length;
  const fp = FINGERPRINTS[currentFingerprintIndex];
  console.log(`[Dutchie Client] Rotated to fingerprint: ${fp.userAgent.slice(0, 50)}...`);
  return fp;
}

export function resetFingerprint(): void {
  currentFingerprintIndex = 0;
}

/**
 * Get a random fingerprint from the pool
 */
export function getRandomFingerprint(): Fingerprint {
  const index = Math.floor(Math.random() * FINGERPRINTS.length);
  return FINGERPRINTS[index];
}

// ============================================================
// SESSION MANAGEMENT
// Per-session fingerprint rotation for stealth
// Per workflow-12102025.md:
// - Session identity comes from PROXY LOCATION
// - NOT from task params (no stateCode/timezone params)
// - Language is always English
// ============================================================

export interface CrawlSession {
  sessionId: string;
  fingerprint: Fingerprint;
  fingerprint: BrowserFingerprint;
  proxyUrl: string | null;
  stateCode?: string;
  timezone?: string;
  proxyTimezone?: string;
  proxyState?: string;
  startedAt: Date;
  // Per workflow-12102025.md: Dynamic Referer per dispensary
  menuUrl?: string;
  referer: string;
}

// Note: currentSession variable declared earlier in file for proper scoping
let currentSession: CrawlSession | null = null;

/**
 * Timezone to Accept-Language mapping
 * US timezones all use en-US but this can be extended for international
 * Start a new crawl session
 *
 * Per workflow-12102025.md:
 * - NO state/timezone params - identity comes from proxy location
 * - Gets fingerprint from CrawlRotator (uses intoli/user-agents)
 * - Language is always English (en-US)
 * - Dynamic Referer per dispensary (from menuUrl)
 *
 * @param menuUrl - The dispensary's menu URL for dynamic Referer header
 */
const TIMEZONE_TO_LOCALE: Record<string, string> = {
  'America/Phoenix': 'en-US,en;q=0.9',
  'America/Los_Angeles': 'en-US,en;q=0.9',
  'America/Denver': 'en-US,en;q=0.9',
  'America/Chicago': 'en-US,en;q=0.9',
  'America/New_York': 'en-US,en;q=0.9',
  'America/Detroit': 'en-US,en;q=0.9',
  'America/Anchorage': 'en-US,en;q=0.9',
  'Pacific/Honolulu': 'en-US,en;q=0.9',
};
export function startSession(menuUrl?: string): CrawlSession {
  if (!crawlRotator) {
    throw new Error('[Dutchie Client] Cannot start session without CrawlRotator');
  }

/**
 * Get Accept-Language header for a given timezone
 */
export function getLocaleForTimezone(timezone?: string): string {
  if (!timezone) return 'en-US,en;q=0.9';
  return TIMEZONE_TO_LOCALE[timezone] || 'en-US,en;q=0.9';
}
  // Per workflow-12102025.md: get identity from proxy location
  const proxyLocation = crawlRotator.getProxyLocation();
  const fingerprint = crawlRotator.userAgent.getCurrent();

/**
 * Start a new crawl session with a random fingerprint
 * Call this before crawling a store to get a fresh identity
 */
export function startSession(stateCode?: string, timezone?: string): CrawlSession {
  const baseFp = getRandomFingerprint();

  // Override Accept-Language based on timezone for geographic consistency
  const fingerprint: Fingerprint = {
    ...baseFp,
    acceptLanguage: getLocaleForTimezone(timezone),
  };
  // Per workflow-12102025.md: Dynamic Referer per dispensary
  const referer = buildRefererFromMenuUrl(menuUrl);

  currentSession = {
    sessionId: `session_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
    fingerprint,
    proxyUrl: currentProxy,
    stateCode,
    timezone,
    proxyTimezone: proxyLocation?.timezone,
    proxyState: proxyLocation?.state,
    startedAt: new Date(),
    menuUrl,
    referer,
  };

  console.log(`[Dutchie Client] Started session ${currentSession.sessionId}`);
  console.log(`[Dutchie Client] Fingerprint: ${fingerprint.userAgent.slice(0, 50)}...`);
  console.log(`[Dutchie Client] Accept-Language: ${fingerprint.acceptLanguage}`);
  if (timezone) {
    console.log(`[Dutchie Client] Timezone: ${timezone}`);
  console.log(`[Dutchie Client] Browser: ${fingerprint.browserName} (${fingerprint.deviceCategory})`);
  console.log(`[Dutchie Client] DNT: ${fingerprint.httpFingerprint.hasDNT ? 'enabled' : 'disabled'}`);
  console.log(`[Dutchie Client] TLS: ${fingerprint.httpFingerprint.curlImpersonateBinary}`);
  console.log(`[Dutchie Client] Referer: ${referer}`);
  if (proxyLocation?.timezone) {
    console.log(`[Dutchie Client] Proxy: ${proxyLocation.state || 'unknown'} (${proxyLocation.timezone})`);
  }

  return currentSession;
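A hypothetical call sequence for the new session API; the menu URL below is made up, and per the comments above, state and timezone now come from the proxy rather than from parameters:

```ts
setCrawlRotator(rotator); // rotator construction not shown in this diff
const session = startSession('https://dutchie.com/dispensary/example-store');
// session.referer is derived from the menu URL; session.fingerprint,
// proxyState and proxyTimezone all come from the rotator / proxy location.
```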
@@ -347,48 +276,80 @@ export function getCurrentSession(): CrawlSession | null {
// ============================================================

/**
 * Build headers for Dutchie requests
 * Per workflow-12102025.md: Build headers using HTTP fingerprint system
 * Returns headers in browser-specific order with all natural variations
 */
export function buildHeaders(refererPath: string, fingerprint?: Fingerprint): Record<string, string> {
  const fp = fingerprint || getFingerprint();
  const refererUrl = `https://dutchie.com${refererPath}`;

  const headers: Record<string, string> = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': fp.acceptLanguage,
    'content-type': 'application/json',
    'origin': 'https://dutchie.com',
    'referer': refererUrl,
    'user-agent': fp.userAgent,
    'apollographql-client-name': 'Marketplace (production)',
  };

  if (fp.secChUa) {
    headers['sec-ch-ua'] = fp.secChUa;
    headers['sec-ch-ua-mobile'] = fp.secChUaMobile || '?0';
    headers['sec-ch-ua-platform'] = fp.secChUaPlatform || '"Windows"';
    headers['sec-fetch-dest'] = 'empty';
    headers['sec-fetch-mode'] = 'cors';
    headers['sec-fetch-site'] = 'same-site';
export function buildHeaders(isPost: boolean, contentLength?: number): { headers: Record<string, string>; orderedHeaders: string[] } {
  if (!currentSession || !crawlRotator) {
    throw new Error('[Dutchie Client] Cannot build headers without active session');
  }

  return headers;
  const fp = currentSession.fingerprint;
  const httpFp = fp.httpFingerprint;

  // Per workflow-12102025.md: Build context for ordered headers
  const context: HeaderContext = {
    userAgent: fp.userAgent,
    secChUa: fp.secChUa,
    secChUaPlatform: fp.secChUaPlatform,
    secChUaMobile: fp.secChUaMobile,
    referer: currentSession.referer,
    isPost,
    contentLength,
  };

  // Per workflow-12102025.md: Get ordered headers from HTTP fingerprint service
  return buildOrderedHeaders(httpFp, context);
}

/**
 * Execute HTTP POST using curl (bypasses TLS fingerprinting)
 * Per workflow-12102025.md: Get curl binary for current session's browser
 * Uses curl-impersonate for TLS fingerprint matching
 */
export function curlPost(url: string, body: any, headers: Record<string, string>, timeout = 30000): CurlResponse {
  const filteredHeaders = Object.entries(headers)
    .filter(([k]) => k.toLowerCase() !== 'accept-encoding')
    .map(([k, v]) => `-H '${k}: ${v}'`)
function getCurlBinaryForSession(): string {
  if (!currentSession) {
    return 'curl'; // Fallback to standard curl
  }

  const browserType = currentSession.fingerprint.browserName as BrowserType;

  // Per workflow-12102025.md: Check if curl-impersonate is available
  if (isCurlImpersonateAvailable(browserType)) {
    return getCurlBinary(browserType);
  }

  // Fallback to standard curl with warning
  console.warn(`[Dutchie Client] curl-impersonate not available for ${browserType}, using standard curl`);
  return 'curl';
}

/**
 * Per workflow-12102025.md: Execute HTTP POST using curl/curl-impersonate
 * - Uses browser-specific TLS fingerprint via curl-impersonate
 * - Headers sent in browser-specific order
 * - Dynamic Referer per dispensary
 */
export function curlPost(url: string, body: any, timeout = 30000): CurlResponse {
  const bodyJson = JSON.stringify(body);

  // Per workflow-12102025.md: Build ordered headers for POST request
  const { headers, orderedHeaders } = buildHeaders(true, bodyJson.length);

  // Per workflow-12102025.md: Build header args in browser-specific order
  const headerArgs = orderedHeaders
    .filter(h => h !== 'Host' && h !== 'Content-Length') // curl handles these
    .map(h => `-H '${h}: ${headers[h]}'`)
    .join(' ');

  const bodyJson = JSON.stringify(body).replace(/'/g, "'\\''");
  const bodyEscaped = bodyJson.replace(/'/g, "'\\''");
  const timeoutSec = Math.ceil(timeout / 1000);
  const separator = '___HTTP_STATUS___';
  const proxyArg = getProxyArg();
  const cmd = `curl -s --compressed ${proxyArg} -w '${separator}%{http_code}' --max-time ${timeoutSec} ${filteredHeaders} -d '${bodyJson}' '${url}'`;

  // Per workflow-12102025.md: Use curl-impersonate for TLS fingerprint matching
  const curlBinary = getCurlBinaryForSession();

  const cmd = `${curlBinary} -s --compressed ${proxyArg} -w '${separator}%{http_code}' --max-time ${timeoutSec} ${headerArgs} -d '${bodyEscaped}' '${url}'`;

  try {
    const output = execSync(cmd, {
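How the `___HTTP_STATUS___` sentinel written by curl's `-w` flag is split back apart — a standalone sketch of the parsing the surrounding code performs, with an example output string:

```ts
const separator = '___HTTP_STATUS___';
const output = '{"ok":true}___HTTP_STATUS___200'; // example curl stdout
const idx = output.lastIndexOf(separator);
const body = output.slice(0, idx);                               // '{"ok":true}'
const status = parseInt(output.slice(idx + separator.length), 10); // 200
```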
@@ -427,19 +388,29 @@ export function curlPost(url: string, body: any, headers: Record<string, string>
}

/**
 * Execute HTTP GET using curl (bypasses TLS fingerprinting)
 * Returns HTML or JSON depending on response content-type
 * Per workflow-12102025.md: Execute HTTP GET using curl/curl-impersonate
 * - Uses browser-specific TLS fingerprint via curl-impersonate
 * - Headers sent in browser-specific order
 * - Dynamic Referer per dispensary
 */
export function curlGet(url: string, headers: Record<string, string>, timeout = 30000): CurlResponse {
  const filteredHeaders = Object.entries(headers)
    .filter(([k]) => k.toLowerCase() !== 'accept-encoding')
    .map(([k, v]) => `-H '${k}: ${v}'`)
export function curlGet(url: string, timeout = 30000): CurlResponse {
  // Per workflow-12102025.md: Build ordered headers for GET request
  const { headers, orderedHeaders } = buildHeaders(false);

  // Per workflow-12102025.md: Build header args in browser-specific order
  const headerArgs = orderedHeaders
    .filter(h => h !== 'Host' && h !== 'Content-Length') // curl handles these
    .map(h => `-H '${h}: ${headers[h]}'`)
    .join(' ');

  const timeoutSec = Math.ceil(timeout / 1000);
  const separator = '___HTTP_STATUS___';
  const proxyArg = getProxyArg();
  const cmd = `curl -s --compressed ${proxyArg} -w '${separator}%{http_code}' --max-time ${timeoutSec} ${filteredHeaders} '${url}'`;

  // Per workflow-12102025.md: Use curl-impersonate for TLS fingerprint matching
  const curlBinary = getCurlBinaryForSession();

  const cmd = `${curlBinary} -s --compressed ${proxyArg} -w '${separator}%{http_code}' --max-time ${timeoutSec} ${headerArgs} '${url}'`;

  try {
    const output = execSync(cmd, {
@@ -459,7 +430,6 @@ export function curlGet(url: string, headers: Record<string, string>, timeout =
    const responseBody = output.slice(0, separatorIndex);
    const statusCode = parseInt(output.slice(separatorIndex + separator.length).trim(), 10);

    // Try to parse as JSON, otherwise return as string (HTML)
    try {
      return { status: statusCode, data: JSON.parse(responseBody) };
    } catch {
@@ -476,16 +446,22 @@ export function curlGet(url: string, headers: Record<string, string>, timeout =

// ============================================================
// GRAPHQL EXECUTION
// Per workflow-12102025.md:
// - On 403: immediately rotate IP + fingerprint (no delay first)
// - Then retry
// ============================================================

export interface ExecuteGraphQLOptions {
  maxRetries?: number;
  retryOn403?: boolean;
  cName?: string; // Optional - used for Referer header, defaults to 'cities'
  cName?: string;
}

/**
 * Execute GraphQL query with curl (bypasses TLS fingerprinting)
 * Per workflow-12102025.md: Execute GraphQL query with curl/curl-impersonate
 * - Uses browser-specific TLS fingerprint
 * - Headers in browser-specific order
 * - On 403: immediately rotate IP + fingerprint, then retry
 */
export async function executeGraphQL(
  operationName: string,
@@ -493,7 +469,12 @@ export async function executeGraphQL(
  hash: string,
  options: ExecuteGraphQLOptions
): Promise<any> {
  const { maxRetries = 3, retryOn403 = true, cName = 'cities' } = options;
  const { maxRetries = 3, retryOn403 = true } = options;

  // Per workflow-12102025.md: Session must be active for requests
  if (!currentSession) {
    throw new Error('[Dutchie Client] Cannot execute GraphQL without active session - call startSession() first');
  }

  const body = {
    operationName,
@@ -507,14 +488,14 @@ export async function executeGraphQL(
  let attempt = 0;

  while (attempt <= maxRetries) {
    const fingerprint = getFingerprint();
    const headers = buildHeaders(`/embedded-menu/${cName}`, fingerprint);

    console.log(`[Dutchie Client] curl POST ${operationName} (attempt ${attempt + 1}/${maxRetries + 1})`);

    const response = curlPost(DUTCHIE_CONFIG.graphqlEndpoint, body, headers, DUTCHIE_CONFIG.timeout);
    const startTime = Date.now();
    // Per workflow-12102025.md: curlPost now uses ordered headers and curl-impersonate
    const response = curlPost(DUTCHIE_CONFIG.graphqlEndpoint, body, DUTCHIE_CONFIG.timeout);
    const responseTime = Date.now() - startTime;

    console.log(`[Dutchie Client] Response status: ${response.status}`);
    console.log(`[Dutchie Client] Response status: ${response.status} (${responseTime}ms)`);

    if (response.error) {
      console.error(`[Dutchie Client] curl error: ${response.error}`);
@@ -527,6 +508,9 @@ export async function executeGraphQL(
    }

    if (response.status === 200) {
      // Per workflow-12102025.md: success resets consecutive 403 count
      await recordProxySuccess(responseTime);

      if (response.data?.errors?.length > 0) {
        console.warn(`[Dutchie Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`);
      }
@@ -534,11 +518,20 @@ export async function executeGraphQL(
    }

    if (response.status === 403 && retryOn403) {
      console.warn(`[Dutchie Client] 403 blocked - rotating proxy and fingerprint...`);
      await rotateProxyOn403('403 Forbidden on GraphQL');
      rotateFingerprint();
      // Per workflow-12102025.md: immediately rotate IP + fingerprint
      console.warn(`[Dutchie Client] 403 blocked - immediately rotating proxy + fingerprint...`);
      const hasMoreProxies = await handle403Block();

      if (!hasMoreProxies) {
        throw new Error('All proxies exhausted - no more IPs available');
      }

      // Per workflow-12102025.md: Update session referer after rotation
      currentSession.referer = buildRefererFromMenuUrl(currentSession.menuUrl);

      attempt++;
      await sleep(1000 * attempt);
      // Per workflow-12102025.md: small backoff after rotation
      await sleep(500);
      continue;
    }

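A hedged usage sketch of the updated executeGraphQL with one of the persisted-query hashes defined earlier; the variables object is an assumed shape, not a documented schema:

```ts
startSession('https://dutchie.com/dispensary/example-store'); // menu URL is illustrative
const data = await executeGraphQL(
  'GetAllCitiesByState',
  { state: 'AZ' }, // assumed variable shape
  GRAPHQL_HASHES.GetAllCitiesByState,
  { maxRetries: 3, retryOn403: true },
);
```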
@@ -567,8 +560,10 @@ export interface FetchPageOptions {
}

/**
 * Fetch HTML page from Dutchie (for city pages, dispensary pages, etc.)
 * Returns raw HTML string
 * Per workflow-12102025.md: Fetch HTML page from Dutchie
 * - Uses browser-specific TLS fingerprint
 * - Headers in browser-specific order
 * - Same 403 handling as GraphQL
 */
export async function fetchPage(
  path: string,
@@ -577,32 +572,22 @@ export async function fetchPage(
  const { maxRetries = 3, retryOn403 = true } = options;
  const url = `${DUTCHIE_CONFIG.baseUrl}${path}`;

  // Per workflow-12102025.md: Session must be active for requests
  if (!currentSession) {
    throw new Error('[Dutchie Client] Cannot fetch page without active session - call startSession() first');
  }

  let attempt = 0;

  while (attempt <= maxRetries) {
    const fingerprint = getFingerprint();
    const headers: Record<string, string> = {
      'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
      'accept-language': fingerprint.acceptLanguage,
      'user-agent': fingerprint.userAgent,
    };

    if (fingerprint.secChUa) {
      headers['sec-ch-ua'] = fingerprint.secChUa;
      headers['sec-ch-ua-mobile'] = fingerprint.secChUaMobile || '?0';
      headers['sec-ch-ua-platform'] = fingerprint.secChUaPlatform || '"Windows"';
      headers['sec-fetch-dest'] = 'document';
      headers['sec-fetch-mode'] = 'navigate';
      headers['sec-fetch-site'] = 'none';
      headers['sec-fetch-user'] = '?1';
      headers['upgrade-insecure-requests'] = '1';
    }

    // Per workflow-12102025.md: curlGet now uses ordered headers and curl-impersonate
    console.log(`[Dutchie Client] curl GET ${path} (attempt ${attempt + 1}/${maxRetries + 1})`);

    const response = curlGet(url, headers, DUTCHIE_CONFIG.timeout);
    const startTime = Date.now();
    const response = curlGet(url, DUTCHIE_CONFIG.timeout);
    const responseTime = Date.now() - startTime;

    console.log(`[Dutchie Client] Response status: ${response.status}`);
    console.log(`[Dutchie Client] Response status: ${response.status} (${responseTime}ms)`);

    if (response.error) {
      console.error(`[Dutchie Client] curl error: ${response.error}`);
@@ -614,15 +599,26 @@ export async function fetchPage(
    }

    if (response.status === 200) {
      // Per workflow-12102025.md: success resets consecutive 403 count
      await recordProxySuccess(responseTime);
      return { html: response.data, status: response.status };
    }

    if (response.status === 403 && retryOn403) {
      console.warn(`[Dutchie Client] 403 blocked - rotating proxy and fingerprint...`);
      await rotateProxyOn403('403 Forbidden on page fetch');
      rotateFingerprint();
      // Per workflow-12102025.md: immediately rotate IP + fingerprint
      console.warn(`[Dutchie Client] 403 blocked - immediately rotating proxy + fingerprint...`);
      const hasMoreProxies = await handle403Block();

      if (!hasMoreProxies) {
        throw new Error('All proxies exhausted - no more IPs available');
      }

      // Per workflow-12102025.md: Update session after rotation
      currentSession.referer = buildRefererFromMenuUrl(currentSession.menuUrl);

      attempt++;
      await sleep(1000 * attempt);
      // Per workflow-12102025.md: small backoff after rotation
      await sleep(500);
      continue;
    }


@@ -6,22 +6,17 @@
 */

export {
  // HTTP Client
  // HTTP Client (per workflow-12102025.md: uses curl-impersonate + ordered headers)
  curlPost,
  curlGet,
  executeGraphQL,
  fetchPage,
  extractNextData,

  // Headers & Fingerprints
  // Headers (per workflow-12102025.md: browser-specific ordering)
  buildHeaders,
  getFingerprint,
  rotateFingerprint,
  resetFingerprint,
  getRandomFingerprint,
  getLocaleForTimezone,

  // Session Management (per-store fingerprint rotation)
  // Session Management (per workflow-12102025.md: menuUrl for dynamic Referer)
  startSession,
  endSession,
  getCurrentSession,

@@ -7,15 +7,23 @@
|
||||
* Routes are prefixed with /api/analytics/v2
|
||||
*
|
||||
* Phase 3: Analytics Engine + Rec/Med by State
|
||||
*
|
||||
* SECURITY: All routes require authentication via authMiddleware.
|
||||
* Access is granted to:
|
||||
* - Trusted origins (cannaiq.co, findadispo.com, etc.)
|
||||
* - Trusted IPs (localhost, internal pods)
|
||||
* - Valid JWT or API tokens
|
||||
*/
|
||||
|
||||
import { Router, Request, Response } from 'express';
|
||||
import { Pool } from 'pg';
|
||||
import { authMiddleware } from '../auth/middleware';
|
||||
import { PriceAnalyticsService } from '../services/analytics/PriceAnalyticsService';
|
||||
import { BrandPenetrationService } from '../services/analytics/BrandPenetrationService';
|
||||
import { CategoryAnalyticsService } from '../services/analytics/CategoryAnalyticsService';
|
||||
import { StoreAnalyticsService } from '../services/analytics/StoreAnalyticsService';
|
||||
import { StateAnalyticsService } from '../services/analytics/StateAnalyticsService';
|
||||
import { BrandIntelligenceService } from '../services/analytics/BrandIntelligenceService';
|
||||
import { TimeWindow, LegalType } from '../services/analytics/types';
|
||||
|
||||
function parseTimeWindow(window?: string): TimeWindow {
|
||||
@@ -35,12 +43,17 @@ function parseLegalType(legalType?: string): LegalType {
|
||||
export function createAnalyticsV2Router(pool: Pool): Router {
|
||||
const router = Router();
|
||||
|
||||
// SECURITY: Apply auth middleware to ALL routes
|
||||
// This gate ensures only authenticated requests can access analytics data
|
||||
router.use(authMiddleware);
|
||||
|
||||
// Initialize services
|
||||
const priceService = new PriceAnalyticsService(pool);
|
||||
const brandService = new BrandPenetrationService(pool);
|
||||
const categoryService = new CategoryAnalyticsService(pool);
|
||||
const storeService = new StoreAnalyticsService(pool);
|
||||
const stateService = new StateAnalyticsService(pool);
|
||||
const brandIntelligenceService = new BrandIntelligenceService(pool);
|
||||
|
||||
// ============================================================
|
||||
// PRICE ANALYTICS
|
||||
@@ -231,6 +244,76 @@ export function createAnalyticsV2Router(pool: Pool): Router {
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /brand/:name/promotions
|
||||
* Get brand promotional history - tracks specials, discounts, duration, and sales estimates
|
||||
*
|
||||
* Query params:
|
||||
* - window: 7d|30d|90d (default: 90d)
|
||||
* - state: state code filter (e.g., AZ)
|
||||
* - category: category filter (e.g., Flower)
|
||||
*/
|
||||
router.get('/brand/:name/promotions', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const brandName = decodeURIComponent(req.params.name);
|
||||
const window = parseTimeWindow(req.query.window as string) || '90d';
|
||||
const stateCode = req.query.state as string | undefined;
|
||||
const category = req.query.category as string | undefined;
|
||||
|
||||
const result = await brandService.getBrandPromotionalHistory(brandName, {
|
||||
window,
|
||||
stateCode,
|
||||
category,
|
||||
});
|
||||
res.json(result);
|
||||
} catch (error) {
|
||||
console.error('[AnalyticsV2] Brand promotions error:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch brand promotional history' });
|
||||
}
|
||||
});
|
||||

  /**
   * GET /brand/:name/intelligence
   * Get comprehensive B2B brand intelligence dashboard data
   *
   * Returns all brand metrics in a single unified response:
   * - Performance Snapshot (active SKUs, revenue, stores, market share)
   * - Alerts/Slippage (lost stores, delisted SKUs, competitor takeovers)
   * - Product Velocity (daily rates, velocity status)
   * - Retail Footprint (penetration, whitespace opportunities)
   * - Competitive Landscape (price position, market share trend)
   * - Inventory Health (days of stock, risk levels)
   * - Promotion Effectiveness (baseline vs promo velocity, ROI)
   *
   * Query params:
   * - window: 7d|30d|90d (default: 30d)
   * - state: state code filter (e.g., AZ)
   * - category: category filter (e.g., Flower)
   */
  router.get('/brand/:name/intelligence', async (req: Request, res: Response) => {
    try {
      const brandName = decodeURIComponent(req.params.name);
      const window = parseTimeWindow(req.query.window as string);
      const stateCode = req.query.state as string | undefined;
      const category = req.query.category as string | undefined;

      const result = await brandIntelligenceService.getBrandIntelligence(brandName, {
        window,
        stateCode,
        category,
      });

      if (!result) {
        return res.status(404).json({ error: 'Brand not found' });
      }

      res.json(result);
    } catch (error) {
      console.error('[AnalyticsV2] Brand intelligence error:', error);
      res.status(500).json({ error: 'Failed to fetch brand intelligence' });
    }
  });
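
  // --- Illustrative response shape (editor's sketch, not part of the diff) ---
  // The doc comment above lists the dashboard sections; a client might model
  // the unified response roughly like this. Field names are assumptions, not
  // the service's actual contract.
  interface BrandIntelligenceResponseSketch {
    performanceSnapshot: unknown;    // active SKUs, revenue, stores, market share
    alerts: unknown;                 // lost stores, delisted SKUs, competitor takeovers
    productVelocity: unknown;        // daily rates, velocity status
    retailFootprint: unknown;        // penetration, whitespace opportunities
    competitiveLandscape: unknown;   // price position, market share trend
    inventoryHealth: unknown;        // days of stock, risk levels
    promotionEffectiveness: unknown; // baseline vs promo velocity, ROI
  }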

  // ============================================================
  // CATEGORY ANALYTICS
  // ============================================================
@@ -400,6 +483,31 @@ export function createAnalyticsV2Router(pool: Pool): Router {
    }
  });

  /**
   * GET /store/:id/quantity-changes
   * Get quantity changes for a store (increases/decreases)
   * Useful for estimating sales (decreases) or restocks (increases)
   *
   * Query params:
   * - window: 7d|30d|90d (default: 7d)
   * - direction: increase|decrease|all (default: all)
   * - limit: number (default: 100)
   */
  router.get('/store/:id/quantity-changes', async (req: Request, res: Response) => {
    try {
      const dispensaryId = parseInt(req.params.id);
      const window = parseTimeWindow(req.query.window as string);
      const direction = (req.query.direction as 'increase' | 'decrease' | 'all') || 'all';
      const limit = req.query.limit ? parseInt(req.query.limit as string) : 100;

      const result = await storeService.getQuantityChanges(dispensaryId, { window, direction, limit });
      res.json(result);
    } catch (error) {
      console.error('[AnalyticsV2] Store quantity changes error:', error);
      res.status(500).json({ error: 'Failed to fetch store quantity changes' });
    }
  });
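
  // --- Illustrative usage (editor's sketch, not part of the diff) ---
  // As the doc comment notes, decreases approximate sales. A client could sum
  // them like this; the mount path and response field names (changes, delta)
  // are assumptions.
  async function estimateUnitsSold(storeId: number, token: string): Promise<number> {
    const res = await fetch(
      `/api/analytics-v2/store/${storeId}/quantity-changes?window=7d&direction=decrease`,
      { headers: { Authorization: `Bearer ${token}` } },
    );
    const body = await res.json();
    // Sum the magnitude of each decrease for a rough weekly sales estimate.
    return (body.changes ?? []).reduce(
      (sum: number, c: { delta: number }) => sum + Math.abs(c.delta),
      0,
    );
  }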

  /**
   * GET /store/:id/inventory
   * Get store inventory composition

@@ -47,4 +47,27 @@ router.post('/refresh', authMiddleware, async (req: AuthRequest, res) => {
  res.json({ token });
});

// Verify password for sensitive actions (requires current user to be authenticated)
router.post('/verify-password', authMiddleware, async (req: AuthRequest, res) => {
  try {
    const { password } = req.body;

    if (!password) {
      return res.status(400).json({ error: 'Password required' });
    }

    // Re-authenticate the current user with the provided password
    const user = await authenticateUser(req.user!.email, password);

    if (!user) {
      return res.status(401).json({ error: 'Invalid password', verified: false });
    }

    res.json({ verified: true });
  } catch (error) {
    console.error('Password verification error:', error);
    res.status(500).json({ error: 'Internal server error' });
  }
});

export default router;
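
// --- Illustrative usage (editor's sketch, not part of the diff) ---
// Re-confirming the current user's password before a sensitive action.
// The mount path (/api/auth) is an assumption.
async function confirmPassword(password: string, token: string): Promise<boolean> {
  const res = await fetch('/api/auth/verify-password', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${token}`,
    },
    body: JSON.stringify({ password }),
  });
  // 401 responses also carry { verified: false }, so fall back to res.ok.
  const body = await res.json().catch(() => ({ verified: false }));
  return res.ok && body.verified === true;
}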

@@ -14,35 +14,56 @@ router.use(authMiddleware);
/**
 * GET /api/admin/intelligence/brands
 * List all brands with state presence, store counts, and pricing
 * Query params:
 * - state: Filter by state (e.g., "AZ")
 * - limit: Max results (default 500)
 * - offset: Pagination offset
 */
router.get('/brands', async (req: Request, res: Response) => {
  try {
    const { limit = '500', offset = '0' } = req.query;
    const { limit = '500', offset = '0', state } = req.query;
    const limitNum = Math.min(parseInt(limit as string, 10), 1000);
    const offsetNum = parseInt(offset as string, 10);

    // Build WHERE clause based on state filter
    let stateFilter = '';
    const params: any[] = [limitNum, offsetNum];
    if (state && state !== 'all') {
      stateFilter = 'AND d.state = $3';
      params.push(state);
    }

    const { rows } = await pool.query(`
      SELECT
        sp.brand_name_raw as brand_name,
        array_agg(DISTINCT d.state) FILTER (WHERE d.state IS NOT NULL) as states,
        COUNT(DISTINCT d.id) as store_count,
        COUNT(DISTINCT sp.id) as sku_count,
        ROUND(AVG(sp.price_rec)::numeric, 2) FILTER (WHERE sp.price_rec > 0) as avg_price_rec,
        ROUND(AVG(sp.price_med)::numeric, 2) FILTER (WHERE sp.price_med > 0) as avg_price_med
        ROUND(AVG(sp.price_rec) FILTER (WHERE sp.price_rec > 0)::numeric, 2) as avg_price_rec,
        ROUND(AVG(sp.price_med) FILTER (WHERE sp.price_med > 0)::numeric, 2) as avg_price_med
      FROM store_products sp
      JOIN dispensaries d ON sp.dispensary_id = d.id
      WHERE sp.brand_name_raw IS NOT NULL AND sp.brand_name_raw != ''
      ${stateFilter}
      GROUP BY sp.brand_name_raw
      ORDER BY store_count DESC, sku_count DESC
      LIMIT $1 OFFSET $2
    `, [limitNum, offsetNum]);
    `, params);
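
    // --- Editor's note on the avg_price change above (not part of the diff) ---
    // PostgreSQL's FILTER clause attaches to the aggregate itself, so
    //   ROUND(AVG(x)::numeric, 2) FILTER (WHERE x > 0)   -- rejected by the parser
    // is invalid, while
    //   ROUND(AVG(x) FILTER (WHERE x > 0)::numeric, 2)   -- aggregate filtered, then rounded
    // is the accepted form the replacement lines use.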

    // Get total count
    // Get total count with same state filter
    const countParams: any[] = [];
    let countStateFilter = '';
    if (state && state !== 'all') {
      countStateFilter = 'AND d.state = $1';
      countParams.push(state);
    }
    const { rows: countRows } = await pool.query(`
      SELECT COUNT(DISTINCT brand_name_raw) as total
      FROM store_products
      WHERE brand_name_raw IS NOT NULL AND brand_name_raw != ''
    `);
      SELECT COUNT(DISTINCT sp.brand_name_raw) as total
      FROM store_products sp
      JOIN dispensaries d ON sp.dispensary_id = d.id
      WHERE sp.brand_name_raw IS NOT NULL AND sp.brand_name_raw != ''
      ${countStateFilter}
    `, countParams);

    res.json({
      brands: rows.map((r: any) => ({
@@ -147,29 +168,63 @@ router.get('/brands/:brandName/penetration', async (req: Request, res: Response)
/**
 * GET /api/admin/intelligence/pricing
 * Get pricing analytics by category
 * Query params:
 * - state: Filter by state (e.g., "AZ")
 */
router.get('/pricing', async (req: Request, res: Response) => {
  try {
    const { rows: categoryRows } = await pool.query(`
      SELECT
        sp.category_raw as category,
        ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,
        MIN(sp.price_rec) FILTER (WHERE sp.price_rec > 0) as min_price,
        MAX(sp.price_rec) as max_price,
        ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_rec)::numeric, 2)
          FILTER (WHERE sp.price_rec > 0) as median_price,
        COUNT(*) as product_count
      FROM store_products sp
      WHERE sp.category_raw IS NOT NULL AND sp.price_rec > 0
      GROUP BY sp.category_raw
      ORDER BY product_count DESC
    `);
    const { state } = req.query;

    // Build WHERE clause based on state filter
    let stateFilter = '';
    const categoryParams: any[] = [];
    const stateQueryParams: any[] = [];
    const overallParams: any[] = [];

    if (state && state !== 'all') {
      stateFilter = 'AND d.state = $1';
      categoryParams.push(state);
      overallParams.push(state);
    }

    // Category pricing with optional state filter
    const categoryQuery = state && state !== 'all'
      ? `
        SELECT
          sp.category_raw as category,
          ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,
          MIN(sp.price_rec) as min_price,
          MAX(sp.price_rec) as max_price,
          ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_rec)::numeric, 2) as median_price,
          COUNT(*) as product_count
        FROM store_products sp
        JOIN dispensaries d ON sp.dispensary_id = d.id
        WHERE sp.category_raw IS NOT NULL AND sp.price_rec > 0 ${stateFilter}
        GROUP BY sp.category_raw
        ORDER BY product_count DESC
      `
      : `
        SELECT
          sp.category_raw as category,
          ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,
          MIN(sp.price_rec) as min_price,
          MAX(sp.price_rec) as max_price,
          ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_rec)::numeric, 2) as median_price,
          COUNT(*) as product_count
        FROM store_products sp
        WHERE sp.category_raw IS NOT NULL AND sp.price_rec > 0
        GROUP BY sp.category_raw
        ORDER BY product_count DESC
      `;

    const { rows: categoryRows } = await pool.query(categoryQuery, categoryParams);

    // State pricing
    const { rows: stateRows } = await pool.query(`
      SELECT
        d.state,
        ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,
        MIN(sp.price_rec) FILTER (WHERE sp.price_rec > 0) as min_price,
        MIN(sp.price_rec) as min_price,
        MAX(sp.price_rec) as max_price,
        COUNT(DISTINCT sp.id) as product_count
      FROM store_products sp
@@ -179,6 +234,31 @@ router.get('/pricing', async (req: Request, res: Response) => {
      ORDER BY avg_price DESC
    `);

    // Overall stats with optional state filter
    const overallQuery = state && state !== 'all'
      ? `
        SELECT
          ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,
          MIN(sp.price_rec) as min_price,
          MAX(sp.price_rec) as max_price,
          COUNT(*) as total_products
        FROM store_products sp
        JOIN dispensaries d ON sp.dispensary_id = d.id
        WHERE sp.price_rec > 0 ${stateFilter}
      `
      : `
        SELECT
          ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,
          MIN(sp.price_rec) as min_price,
          MAX(sp.price_rec) as max_price,
          COUNT(*) as total_products
        FROM store_products sp
        WHERE sp.price_rec > 0
      `;

    const { rows: overallRows } = await pool.query(overallQuery, overallParams);
    const overall = overallRows[0];

    res.json({
      byCategory: categoryRows.map((r: any) => ({
        category: r.category,
@@ -195,6 +275,12 @@ router.get('/pricing', async (req: Request, res: Response) => {
        maxPrice: r.max_price ? parseFloat(r.max_price) : null,
        productCount: parseInt(r.product_count, 10),
      })),
      overall: {
        avgPrice: overall?.avg_price ? parseFloat(overall.avg_price) : null,
        minPrice: overall?.min_price ? parseFloat(overall.min_price) : null,
        maxPrice: overall?.max_price ? parseFloat(overall.max_price) : null,
        totalProducts: parseInt(overall?.total_products || '0', 10),
      },
    });
  } catch (error: any) {
    console.error('[Intelligence] Error fetching pricing:', error.message);
@@ -205,9 +291,23 @@ router.get('/pricing', async (req: Request, res: Response) => {
/**
 * GET /api/admin/intelligence/stores
 * Get store intelligence summary
 * Query params:
 * - state: Filter by state (e.g., "AZ")
 * - limit: Max results (default 200)
 */
router.get('/stores', async (req: Request, res: Response) => {
  try {
    const { state, limit = '200' } = req.query;
    const limitNum = Math.min(parseInt(limit as string, 10), 500);

    // Build WHERE clause based on state filter
    let stateFilter = '';
    const params: any[] = [limitNum];
    if (state && state !== 'all') {
      stateFilter = 'AND d.state = $2';
      params.push(state);
    }

    const { rows: storeRows } = await pool.query(`
      SELECT
        d.id,
@@ -217,17 +317,22 @@ router.get('/stores', async (req: Request, res: Response) => {
        d.state,
        d.menu_type,
        d.crawl_enabled,
        COUNT(DISTINCT sp.id) as product_count,
        c.name as chain_name,
        COUNT(DISTINCT sp.id) as sku_count,
        COUNT(DISTINCT sp.brand_name_raw) as brand_count,
        ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,
        MAX(sp.updated_at) as last_product_update
        MAX(sp.updated_at) as last_crawl,
        (SELECT COUNT(*) FROM store_product_snapshots sps
         WHERE sps.store_product_id IN (SELECT id FROM store_products WHERE dispensary_id = d.id)) as snapshot_count
      FROM dispensaries d
      LEFT JOIN store_products sp ON sp.dispensary_id = d.id
      WHERE d.state IS NOT NULL
      GROUP BY d.id, d.name, d.dba_name, d.city, d.state, d.menu_type, d.crawl_enabled
      ORDER BY product_count DESC
      LIMIT 200
    `);
      LEFT JOIN chains c ON d.chain_id = c.id
      WHERE d.state IS NOT NULL AND d.crawl_enabled = true
      ${stateFilter}
      GROUP BY d.id, d.name, d.dba_name, d.city, d.state, d.menu_type, d.crawl_enabled, c.name
      ORDER BY sku_count DESC
      LIMIT $1
    `, params);

    res.json({
      stores: storeRows.map((r: any) => ({
@@ -238,10 +343,13 @@
        state: r.state,
        menuType: r.menu_type,
        crawlEnabled: r.crawl_enabled,
        productCount: parseInt(r.product_count || '0', 10),
        chainName: r.chain_name || null,
        skuCount: parseInt(r.sku_count || '0', 10),
        snapshotCount: parseInt(r.snapshot_count || '0', 10),
        brandCount: parseInt(r.brand_count || '0', 10),
        avgPrice: r.avg_price ? parseFloat(r.avg_price) : null,
        lastProductUpdate: r.last_product_update,
        lastCrawl: r.last_crawl,
        crawlFrequencyHours: 4, // Default crawl frequency
      })),
      total: storeRows.length,
    });

@@ -543,6 +543,9 @@ router.post('/bulk-priority', async (req: Request, res: Response) => {

/**
 * POST /api/job-queue/enqueue - Add a new job to the queue
 *
 * 2024-12-10: Rewired to use worker_tasks via taskService.
 * Legacy dispensary_crawl_jobs code commented out below.
 */
router.post('/enqueue', async (req: Request, res: Response) => {
  try {
@@ -552,6 +555,59 @@
      return res.status(400).json({ success: false, error: 'dispensary_id is required' });
    }

    // 2024-12-10: Map legacy job_type to new task role
    const roleMap: Record<string, string> = {
      'dutchie_product_crawl': 'product_refresh',
      'menu_detection': 'entry_point_discovery',
      'menu_detection_single': 'entry_point_discovery',
      'product_discovery': 'product_discovery',
      'store_discovery': 'store_discovery',
    };
    const role = roleMap[job_type] || 'product_refresh';

    // 2024-12-10: Use taskService to create task in worker_tasks table
    const { taskService } = await import('../tasks/task-service');

    // Check if task already pending for this dispensary
    const existingTasks = await taskService.listTasks({
      dispensary_id,
      role: role as any,
      status: ['pending', 'claimed', 'running'],
      limit: 1,
    });

    if (existingTasks.length > 0) {
      return res.json({
        success: true,
        task_id: existingTasks[0].id,
        message: 'Task already queued'
      });
    }

    const task = await taskService.createTask({
      role: role as any,
      dispensary_id,
      priority,
    });

    res.json({ success: true, task_id: task.id, message: 'Task enqueued' });
  } catch (error: any) {
    console.error('[JobQueue] Error enqueuing task:', error);
    res.status(500).json({ success: false, error: error.message });
  }
});
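
// --- Illustrative usage (editor's sketch, not part of the diff) ---
// Enqueuing a product refresh for one dispensary. Body fields follow the
// handler above; the host and mount path are assumptions.
async function enqueueRefresh(dispensaryId: number): Promise<void> {
  const res = await fetch('/api/job-queue/enqueue', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      dispensary_id: dispensaryId,
      job_type: 'dutchie_product_crawl', // mapped to the 'product_refresh' role
      priority: 5,
    }),
  });
  const body = await res.json();
  console.log(body.message, body.task_id); // 'Task enqueued' or 'Task already queued'
}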

/*
 * LEGACY CODE - 2024-12-10: Commented out, was using orphaned dispensary_crawl_jobs table
 *
router.post('/enqueue', async (req: Request, res: Response) => {
  try {
    const { dispensary_id, job_type = 'dutchie_product_crawl', priority = 0 } = req.body;

    if (!dispensary_id) {
      return res.status(400).json({ success: false, error: 'dispensary_id is required' });
    }

    // Check if job already pending for this dispensary
    const existing = await pool.query(`
      SELECT id FROM dispensary_crawl_jobs
@@ -585,6 +641,7 @@ router.post('/enqueue', async (req: Request, res: Response) => {
    res.status(500).json({ success: false, error: error.message });
  }
});
*/

/**
 * POST /api/job-queue/pause - Pause queue processing
@@ -612,6 +669,8 @@ router.get('/paused', async (_req: Request, res: Response) => {
/**
 * POST /api/job-queue/enqueue-batch - Queue multiple dispensaries at once
 * Body: { dispensary_ids: number[], job_type?: string, priority?: number }
 *
 * 2024-12-10: Rewired to use worker_tasks via taskService.
 */
router.post('/enqueue-batch', async (req: Request, res: Response) => {
  try {
@@ -625,35 +684,30 @@ router.post('/enqueue-batch', async (req: Request, res: Response) => {
      return res.status(400).json({ success: false, error: 'Maximum 500 dispensaries per batch' });
    }

    // Insert jobs, skipping duplicates
    const { rows } = await pool.query(`
      INSERT INTO dispensary_crawl_jobs (dispensary_id, job_type, priority, trigger_type, status, created_at)
      SELECT
        d.id,
        $2::text,
        $3::integer,
        'api_batch',
        'pending',
        NOW()
      FROM dispensaries d
      WHERE d.id = ANY($1::int[])
        AND d.crawl_enabled = true
        AND d.platform_dispensary_id IS NOT NULL
        AND NOT EXISTS (
          SELECT 1 FROM dispensary_crawl_jobs cj
          WHERE cj.dispensary_id = d.id
            AND cj.job_type = $2::text
            AND cj.status IN ('pending', 'running')
        )
      RETURNING id, dispensary_id
    `, [dispensary_ids, job_type, priority]);
    // 2024-12-10: Map legacy job_type to new task role
    const roleMap: Record<string, string> = {
      'dutchie_product_crawl': 'product_refresh',
      'menu_detection': 'entry_point_discovery',
      'product_discovery': 'product_discovery',
    };
    const role = roleMap[job_type] || 'product_refresh';

    // 2024-12-10: Use taskService to create tasks in worker_tasks table
    const { taskService } = await import('../tasks/task-service');

    const tasks = dispensary_ids.map(dispensary_id => ({
      role: role as any,
      dispensary_id,
      priority,
    }));

    const createdCount = await taskService.createTasks(tasks);

    res.json({
      success: true,
      queued: rows.length,
      queued: createdCount,
      requested: dispensary_ids.length,
      job_ids: rows.map(r => r.id),
      message: `Queued ${rows.length} of ${dispensary_ids.length} dispensaries`
      message: `Queued ${createdCount} of ${dispensary_ids.length} dispensaries`
    });
  } catch (error: any) {
    console.error('[JobQueue] Error batch enqueuing:', error);
@@ -664,6 +718,8 @@ router.post('/enqueue-batch', async (req: Request, res: Response) => {
/**
 * POST /api/job-queue/enqueue-state - Queue all crawl-enabled dispensaries for a state
 * Body: { state_code: string, job_type?: string, priority?: number, limit?: number }
 *
 * 2024-12-10: Rewired to use worker_tasks via taskService.
 */
router.post('/enqueue-state', async (req: Request, res: Response) => {
  try {
@@ -673,52 +729,55 @@ router.post('/enqueue-state', async (req: Request, res: Response) => {
      return res.status(400).json({ success: false, error: 'state_code is required (e.g., "AZ")' });
    }

    // Get state_id and queue jobs
    const { rows } = await pool.query(`
      WITH target_state AS (
        SELECT id FROM states WHERE code = $1
      )
      INSERT INTO dispensary_crawl_jobs (dispensary_id, job_type, priority, trigger_type, status, created_at)
      SELECT
        d.id,
        $2::text,
        $3::integer,
        'api_state',
        'pending',
        NOW()
      FROM dispensaries d, target_state
      WHERE d.state_id = target_state.id
    // 2024-12-10: Map legacy job_type to new task role
    const roleMap: Record<string, string> = {
      'dutchie_product_crawl': 'product_refresh',
      'menu_detection': 'entry_point_discovery',
      'product_discovery': 'product_discovery',
    };
    const role = roleMap[job_type] || 'product_refresh';

    // Get dispensary IDs for the state
    const dispensaryResult = await pool.query(`
      SELECT d.id
      FROM dispensaries d
      JOIN states s ON s.id = d.state_id
      WHERE s.code = $1
        AND d.crawl_enabled = true
        AND d.platform_dispensary_id IS NOT NULL
        AND NOT EXISTS (
          SELECT 1 FROM dispensary_crawl_jobs cj
          WHERE cj.dispensary_id = d.id
            AND cj.job_type = $2::text
            AND cj.status IN ('pending', 'running')
        )
      LIMIT $4::integer
      RETURNING id, dispensary_id
    `, [state_code.toUpperCase(), job_type, priority, limit]);
      LIMIT $2
    `, [state_code.toUpperCase(), limit]);

    const dispensary_ids = dispensaryResult.rows.map((r: any) => r.id);

    // 2024-12-10: Use taskService to create tasks in worker_tasks table
    const { taskService } = await import('../tasks/task-service');

    const tasks = dispensary_ids.map((dispensary_id: number) => ({
      role: role as any,
      dispensary_id,
      priority,
    }));

    const createdCount = await taskService.createTasks(tasks);

    // Get total available count
    const countResult = await pool.query(`
      WITH target_state AS (
        SELECT id FROM states WHERE code = $1
      )
      SELECT COUNT(*) as total
      FROM dispensaries d, target_state
      WHERE d.state_id = target_state.id
      FROM dispensaries d
      JOIN states s ON s.id = d.state_id
      WHERE s.code = $1
        AND d.crawl_enabled = true
        AND d.platform_dispensary_id IS NOT NULL
    `, [state_code.toUpperCase()]);

    res.json({
      success: true,
      queued: rows.length,
      queued: createdCount,
      total_available: parseInt(countResult.rows[0].total),
      state: state_code.toUpperCase(),
      job_type,
      message: `Queued ${rows.length} dispensaries for ${state_code.toUpperCase()}`
      role,
      message: `Queued ${createdCount} dispensaries for ${state_code.toUpperCase()}`
    });
  } catch (error: any) {
    console.error('[JobQueue] Error enqueuing state:', error);

backend/src/routes/k8s.ts (new file, 140 lines)
@@ -0,0 +1,140 @@
/**
 * Kubernetes Control Routes
 *
 * Provides admin UI control over k8s resources like worker scaling.
 * Uses in-cluster config when running in k8s, or kubeconfig locally.
 */

import { Router, Request, Response } from 'express';
import * as k8s from '@kubernetes/client-node';

const router = Router();

// K8s client setup - lazy initialization
let appsApi: k8s.AppsV1Api | null = null;
let k8sError: string | null = null;

function getK8sClient(): k8s.AppsV1Api | null {
  if (appsApi) return appsApi;
  if (k8sError) return null;

  try {
    const kc = new k8s.KubeConfig();

    // Try in-cluster config first (when running in k8s)
    try {
      kc.loadFromCluster();
      console.log('[K8s] Loaded in-cluster config');
    } catch {
      // Fall back to default kubeconfig (local dev)
      try {
        kc.loadFromDefault();
        console.log('[K8s] Loaded default kubeconfig');
      } catch (e) {
        k8sError = 'No k8s config available';
        console.log('[K8s] No config available - k8s routes disabled');
        return null;
      }
    }

    appsApi = kc.makeApiClient(k8s.AppsV1Api);
    return appsApi;
  } catch (e: any) {
    k8sError = e.message;
    console.error('[K8s] Failed to initialize client:', e.message);
    return null;
  }
}

const NAMESPACE = process.env.K8S_NAMESPACE || 'dispensary-scraper';
const WORKER_DEPLOYMENT = 'scraper-worker';

/**
 * GET /api/k8s/workers
 * Get current worker deployment status
 */
router.get('/workers', async (_req: Request, res: Response) => {
  const client = getK8sClient();

  if (!client) {
    return res.json({
      success: true,
      available: false,
      error: k8sError || 'K8s not available',
      replicas: 0,
      readyReplicas: 0,
    });
  }

  try {
    const deployment = await client.readNamespacedDeployment({
      name: WORKER_DEPLOYMENT,
      namespace: NAMESPACE,
    });

    res.json({
      success: true,
      available: true,
      replicas: deployment.spec?.replicas || 0,
      readyReplicas: deployment.status?.readyReplicas || 0,
      availableReplicas: deployment.status?.availableReplicas || 0,
      updatedReplicas: deployment.status?.updatedReplicas || 0,
    });
  } catch (e: any) {
    console.error('[K8s] Error getting deployment:', e.message);
    res.status(500).json({
      success: false,
      error: e.message,
    });
  }
});

/**
 * POST /api/k8s/workers/scale
 * Scale worker deployment
 * Body: { replicas: number }
 */
router.post('/workers/scale', async (req: Request, res: Response) => {
  const client = getK8sClient();

  if (!client) {
    return res.status(503).json({
      success: false,
      error: k8sError || 'K8s not available',
    });
  }

  const { replicas } = req.body;

  if (typeof replicas !== 'number' || replicas < 0 || replicas > 50) {
    return res.status(400).json({
      success: false,
      error: 'replicas must be a number between 0 and 50',
    });
  }

  try {
    // Patch the deployment to set replicas
    await client.patchNamespacedDeploymentScale({
      name: WORKER_DEPLOYMENT,
      namespace: NAMESPACE,
      body: { spec: { replicas } },
    });

    console.log(`[K8s] Scaled ${WORKER_DEPLOYMENT} to ${replicas} replicas`);

    res.json({
      success: true,
      replicas,
      message: `Scaled to ${replicas} workers`,
    });
  } catch (e: any) {
    console.error('[K8s] Error scaling deployment:', e.message);
    res.status(500).json({
      success: false,
      error: e.message,
    });
  }
});

export default router;
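
// --- Illustrative usage (editor's sketch, not part of the diff) ---
// Scaling the worker deployment from an admin UI. The handler validates
// that replicas is a number in [0, 50]; the mount path is an assumption.
async function scaleWorkers(replicas: number): Promise<void> {
  const res = await fetch('/api/k8s/workers/scale', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ replicas }),
  });
  if (res.status === 503) throw new Error('K8s not available in this environment');
  const body = await res.json();
  console.log(body.message); // e.g. "Scaled to 8 workers"
}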
@@ -291,6 +291,107 @@ router.get('/stores/:id/summary', async (req: Request, res: Response) => {
  }
});

/**
 * GET /api/markets/stores/:id/crawl-history
 * Get crawl history for a specific store
 */
router.get('/stores/:id/crawl-history', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    const { limit = '50' } = req.query;
    const dispensaryId = parseInt(id, 10);
    const limitNum = Math.min(parseInt(limit as string, 10), 100);

    // Get crawl history from crawl_orchestration_traces
    const { rows: historyRows } = await pool.query(`
      SELECT
        id,
        run_id,
        profile_key,
        crawler_module,
        state_at_start,
        state_at_end,
        total_steps,
        duration_ms,
        success,
        error_message,
        products_found,
        started_at,
        completed_at
      FROM crawl_orchestration_traces
      WHERE dispensary_id = $1
      ORDER BY started_at DESC
      LIMIT $2
    `, [dispensaryId, limitNum]);

    // Get next scheduled crawl if available
    const { rows: scheduleRows } = await pool.query(`
      SELECT
        js.id as schedule_id,
        js.job_name,
        js.enabled,
        js.base_interval_minutes,
        js.jitter_minutes,
        js.next_run_at,
        js.last_run_at,
        js.last_status
      FROM job_schedules js
      WHERE js.enabled = true
        AND js.job_config->>'dispensaryId' = $1::text
      ORDER BY js.next_run_at
      LIMIT 1
    `, [dispensaryId.toString()]);

    // Get dispensary info for slug
    const { rows: dispRows } = await pool.query(`
      SELECT
        id,
        name,
        dba_name,
        slug,
        state,
        city,
        menu_type,
        platform_dispensary_id,
        last_menu_scrape
      FROM dispensaries
      WHERE id = $1
    `, [dispensaryId]);

    res.json({
      dispensary: dispRows[0] || null,
      history: historyRows.map(row => ({
        id: row.id,
        runId: row.run_id,
        profileKey: row.profile_key,
        crawlerModule: row.crawler_module,
        stateAtStart: row.state_at_start,
        stateAtEnd: row.state_at_end,
        totalSteps: row.total_steps,
        durationMs: row.duration_ms,
        success: row.success,
        errorMessage: row.error_message,
        productsFound: row.products_found,
        startedAt: row.started_at?.toISOString() || null,
        completedAt: row.completed_at?.toISOString() || null,
      })),
      nextSchedule: scheduleRows[0] ? {
        scheduleId: scheduleRows[0].schedule_id,
        jobName: scheduleRows[0].job_name,
        enabled: scheduleRows[0].enabled,
        baseIntervalMinutes: scheduleRows[0].base_interval_minutes,
        jitterMinutes: scheduleRows[0].jitter_minutes,
        nextRunAt: scheduleRows[0].next_run_at?.toISOString() || null,
        lastRunAt: scheduleRows[0].last_run_at?.toISOString() || null,
        lastStatus: scheduleRows[0].last_status,
      } : null,
    });
  } catch (error: any) {
    console.error('[Markets] Error fetching crawl history:', error.message);
    res.status(500).json({ error: error.message });
  }
});

/**
 * GET /api/markets/stores/:id/products
 * Get products for a store with filtering and pagination

@@ -78,14 +78,14 @@ router.get('/metrics', async (_req: Request, res: Response) => {

/**
 * GET /api/admin/orchestrator/states
 * Returns array of states with at least one known dispensary
 * Returns array of states with at least one crawl-enabled dispensary
 */
router.get('/states', async (_req: Request, res: Response) => {
  try {
    const { rows } = await pool.query(`
      SELECT DISTINCT state, COUNT(*) as store_count
      FROM dispensaries
      WHERE state IS NOT NULL
      WHERE state IS NOT NULL AND crawl_enabled = true
      GROUP BY state
      ORDER BY state
    `);

backend/src/routes/payloads.ts (new file, 334 lines)
@@ -0,0 +1,334 @@
/**
 * Payload Routes
 *
 * Per TASK_WORKFLOW_2024-12-10.md: API access to raw crawl payloads.
 *
 * Endpoints:
 * - GET /api/payloads - List payload metadata (paginated)
 * - GET /api/payloads/:id - Get payload metadata by ID
 * - GET /api/payloads/:id/data - Get full payload JSON
 * - GET /api/payloads/store/:dispensaryId - List payloads for a store
 * - GET /api/payloads/store/:dispensaryId/latest - Get latest payload for a store
 * - GET /api/payloads/store/:dispensaryId/diff - Diff two payloads
 */

import { Router, Request, Response } from 'express';
import { getPool } from '../db/pool';
import {
  loadRawPayloadById,
  getLatestPayload,
  getRecentPayloads,
  listPayloadMetadata,
} from '../utils/payload-storage';
import { Pool } from 'pg';

const router = Router();

// Get pool instance for queries
const getDbPool = (): Pool => getPool() as unknown as Pool;

/**
 * GET /api/payloads
 * List payload metadata (paginated)
 */
router.get('/', async (req: Request, res: Response) => {
  try {
    const pool = getDbPool();
    const limit = Math.min(parseInt(req.query.limit as string) || 50, 100);
    const offset = parseInt(req.query.offset as string) || 0;
    const dispensaryId = req.query.dispensary_id ? parseInt(req.query.dispensary_id as string) : undefined;

    const payloads = await listPayloadMetadata(pool, {
      dispensaryId,
      limit,
      offset,
    });

    res.json({
      success: true,
      payloads,
      pagination: { limit, offset },
    });
  } catch (error: any) {
    console.error('[Payloads] List error:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});

/**
 * GET /api/payloads/:id
 * Get payload metadata by ID
 */
router.get('/:id', async (req: Request, res: Response) => {
  try {
    const pool = getDbPool();
    const id = parseInt(req.params.id);

    const result = await pool.query(`
      SELECT
        p.id,
        p.dispensary_id,
        p.crawl_run_id,
        p.storage_path,
        p.product_count,
        p.size_bytes,
        p.size_bytes_raw,
        p.fetched_at,
        p.processed_at,
        p.checksum_sha256,
        d.name as dispensary_name
      FROM raw_crawl_payloads p
      LEFT JOIN dispensaries d ON d.id = p.dispensary_id
      WHERE p.id = $1
    `, [id]);

    if (result.rows.length === 0) {
      return res.status(404).json({ success: false, error: 'Payload not found' });
    }

    res.json({
      success: true,
      payload: result.rows[0],
    });
  } catch (error: any) {
    console.error('[Payloads] Get error:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});

/**
 * GET /api/payloads/:id/data
 * Get full payload JSON (decompressed from disk)
 */
router.get('/:id/data', async (req: Request, res: Response) => {
  try {
    const pool = getDbPool();
    const id = parseInt(req.params.id);

    const result = await loadRawPayloadById(pool, id);

    if (!result) {
      return res.status(404).json({ success: false, error: 'Payload not found' });
    }

    res.json({
      success: true,
      metadata: result.metadata,
      data: result.payload,
    });
  } catch (error: any) {
    console.error('[Payloads] Get data error:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});

/**
 * GET /api/payloads/store/:dispensaryId
 * List payloads for a specific store
 */
router.get('/store/:dispensaryId', async (req: Request, res: Response) => {
  try {
    const pool = getDbPool();
    const dispensaryId = parseInt(req.params.dispensaryId);
    const limit = Math.min(parseInt(req.query.limit as string) || 20, 100);
    const offset = parseInt(req.query.offset as string) || 0;

    const payloads = await listPayloadMetadata(pool, {
      dispensaryId,
      limit,
      offset,
    });

    res.json({
      success: true,
      dispensaryId,
      payloads,
      pagination: { limit, offset },
    });
  } catch (error: any) {
    console.error('[Payloads] Store list error:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});

/**
 * GET /api/payloads/store/:dispensaryId/latest
 * Get the latest payload for a store (with full data)
 */
router.get('/store/:dispensaryId/latest', async (req: Request, res: Response) => {
  try {
    const pool = getDbPool();
    const dispensaryId = parseInt(req.params.dispensaryId);

    const result = await getLatestPayload(pool, dispensaryId);

    if (!result) {
      return res.status(404).json({
        success: false,
        error: `No payloads found for dispensary ${dispensaryId}`,
      });
    }

    res.json({
      success: true,
      metadata: result.metadata,
      data: result.payload,
    });
  } catch (error: any) {
    console.error('[Payloads] Latest error:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});

/**
 * GET /api/payloads/store/:dispensaryId/diff
 * Compare two payloads for a store
 *
 * Query params:
 * - from: payload ID (older)
 * - to: payload ID (newer) - optional, defaults to latest
 */
router.get('/store/:dispensaryId/diff', async (req: Request, res: Response) => {
  try {
    const pool = getDbPool();
    const dispensaryId = parseInt(req.params.dispensaryId);
    const fromId = req.query.from ? parseInt(req.query.from as string) : undefined;
    const toId = req.query.to ? parseInt(req.query.to as string) : undefined;

    let fromPayload: any;
    let toPayload: any;

    if (fromId && toId) {
      // Load specific payloads
      const [from, to] = await Promise.all([
        loadRawPayloadById(pool, fromId),
        loadRawPayloadById(pool, toId),
      ]);
      fromPayload = from;
      toPayload = to;
    } else {
      // Load two most recent
      const recent = await getRecentPayloads(pool, dispensaryId, 2);
      if (recent.length < 2) {
        return res.status(400).json({
          success: false,
          error: 'Need at least 2 payloads to diff. Only found ' + recent.length,
        });
      }
      toPayload = recent[0]; // Most recent
      fromPayload = recent[1]; // Previous
    }

    if (!fromPayload || !toPayload) {
      return res.status(404).json({ success: false, error: 'One or both payloads not found' });
    }

    // Build product maps by ID
    const fromProducts = new Map<string, any>();
    const toProducts = new Map<string, any>();

    for (const p of fromPayload.payload.products || []) {
      const id = p._id || p.id;
      if (id) fromProducts.set(id, p);
    }

    for (const p of toPayload.payload.products || []) {
      const id = p._id || p.id;
      if (id) toProducts.set(id, p);
    }

    // Find differences
    const added: any[] = [];
    const removed: any[] = [];
    const priceChanges: any[] = [];
    const stockChanges: any[] = [];

    // Products in "to" but not in "from" = added
    for (const [id, product] of toProducts) {
      if (!fromProducts.has(id)) {
        added.push({
          id,
          name: product.name,
          brand: product.brand?.name,
          price: product.Prices?.[0]?.price,
        });
      }
    }

    // Products in "from" but not in "to" = removed
    for (const [id, product] of fromProducts) {
      if (!toProducts.has(id)) {
        removed.push({
          id,
          name: product.name,
          brand: product.brand?.name,
          price: product.Prices?.[0]?.price,
        });
      }
    }

    // Products in both - check for changes
    for (const [id, toProduct] of toProducts) {
      const fromProduct = fromProducts.get(id);
      if (!fromProduct) continue;

      const fromPrice = fromProduct.Prices?.[0]?.price;
      const toPrice = toProduct.Prices?.[0]?.price;

      if (fromPrice !== toPrice) {
        priceChanges.push({
          id,
          name: toProduct.name,
          brand: toProduct.brand?.name,
          oldPrice: fromPrice,
          newPrice: toPrice,
          change: toPrice && fromPrice ? toPrice - fromPrice : null,
        });
      }

      const fromStock = fromProduct.Status || fromProduct.status;
      const toStock = toProduct.Status || toProduct.status;

      if (fromStock !== toStock) {
        stockChanges.push({
          id,
          name: toProduct.name,
          brand: toProduct.brand?.name,
          oldStatus: fromStock,
          newStatus: toStock,
        });
      }
    }

    res.json({
      success: true,
      from: {
        id: fromPayload.metadata.id,
        fetchedAt: fromPayload.metadata.fetchedAt,
        productCount: fromPayload.metadata.productCount,
      },
      to: {
        id: toPayload.metadata.id,
        fetchedAt: toPayload.metadata.fetchedAt,
        productCount: toPayload.metadata.productCount,
      },
      diff: {
        added: added.length,
        removed: removed.length,
        priceChanges: priceChanges.length,
        stockChanges: stockChanges.length,
      },
      details: {
        added,
        removed,
        priceChanges,
        stockChanges,
      },
    });
  } catch (error: any) {
    console.error('[Payloads] Diff error:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});

export default router;
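
// --- Illustrative usage (editor's sketch, not part of the diff) ---
// Diffing the two most recent payloads for a store (omitting from/to gets
// that default), then reading the summary counts the handler returns.
async function summarizeLatestDiff(dispensaryId: number): Promise<void> {
  const res = await fetch(`/api/payloads/store/${dispensaryId}/diff`);
  const body = await res.json();
  if (!body.success) throw new Error(body.error);
  const { added, removed, priceChanges, stockChanges } = body.diff;
  console.log(`+${added} / -${removed} products, ${priceChanges} price and ${stockChanges} stock changes`);
}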
@@ -183,8 +183,8 @@ router.post('/test-all', requireRole('superadmin', 'admin'), async (req, res) =>
      return res.status(400).json({ error: 'Concurrency must be between 1 and 50' });
    }

    const jobId = await createProxyTestJob(mode, concurrency);
    res.json({ jobId, mode, concurrency, message: `Proxy test job started (mode: ${mode}, concurrency: ${concurrency})` });
    const { jobId, totalProxies } = await createProxyTestJob(mode, concurrency);
    res.json({ jobId, total: totalProxies, mode, concurrency, message: `Proxy test job started (mode: ${mode}, concurrency: ${concurrency})` });
  } catch (error: any) {
    console.error('Error starting proxy test job:', error);
    res.status(500).json({ error: error.message || 'Failed to start proxy test job' });
@@ -195,8 +195,8 @@ router.post('/test-all', requireRole('superadmin', 'admin'), async (req, res) =>
router.post('/test-failed', requireRole('superadmin', 'admin'), async (req, res) => {
  try {
    const concurrency = parseInt(req.query.concurrency as string) || 10;
    const jobId = await createProxyTestJob('failed', concurrency);
    res.json({ jobId, mode: 'failed', concurrency, message: 'Retesting failed proxies...' });
    const { jobId, totalProxies } = await createProxyTestJob('failed', concurrency);
    res.json({ jobId, total: totalProxies, mode: 'failed', concurrency, message: 'Retesting failed proxies...' });
  } catch (error: any) {
    console.error('Error starting failed proxy test:', error);
    res.status(500).json({ error: error.message || 'Failed to start proxy test job' });
@@ -278,7 +278,7 @@ router.post('/update-locations', requireRole('superadmin', 'admin'), async (req,

    // Run in background
    updateAllProxyLocations().catch(err => {
      console.error('❌ Location update failed:', err);
      console.error('Location update failed:', err);
    });

    res.json({ message: 'Location update job started' });

@@ -130,6 +130,12 @@ const CONSUMER_TRUSTED_ORIGINS = [
  'http://localhost:3002',
];

// Wildcard trusted origin patterns (*.domain.com)
const CONSUMER_TRUSTED_PATTERNS = [
  /^https:\/\/([a-z0-9-]+\.)?cannaiq\.co$/,
  /^https:\/\/([a-z0-9-]+\.)?cannabrands\.app$/,
];
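
// --- Editor's note on the patterns above (not part of the diff) ---
// The optional group ([a-z0-9-]+\.)? makes each regex match both the apex
// domain and any single-level subdomain, over https only:
//   /^https:\/\/([a-z0-9-]+\.)?cannaiq\.co$/.test('https://cannaiq.co')     // true
//   /^https:\/\/([a-z0-9-]+\.)?cannaiq\.co$/.test('https://app.cannaiq.co') // true
//   /^https:\/\/([a-z0-9-]+\.)?cannaiq\.co$/.test('https://a.b.cannaiq.co') // false (two levels)
//   /^https:\/\/([a-z0-9-]+\.)?cannaiq\.co$/.test('http://cannaiq.co')      // false (not https)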

// Trusted IPs for local development (bypass API key auth)
const TRUSTED_IPS = ['127.0.0.1', '::1', '::ffff:127.0.0.1'];

@@ -150,8 +156,17 @@ function isConsumerTrustedRequest(req: Request): boolean {
    return true;
  }
  const origin = req.headers.origin;
  if (origin && CONSUMER_TRUSTED_ORIGINS.includes(origin)) {
    return true;
  if (origin) {
    // Check exact matches
    if (CONSUMER_TRUSTED_ORIGINS.includes(origin)) {
      return true;
    }
    // Check wildcard patterns
    for (const pattern of CONSUMER_TRUSTED_PATTERNS) {
      if (pattern.test(origin)) {
        return true;
      }
    }
  }
  const referer = req.headers.referer;
  if (referer) {
@@ -160,6 +175,18 @@ function isConsumerTrustedRequest(req: Request): boolean {
      return true;
    }
  }
    // Check wildcard patterns against referer origin
    try {
      const refererUrl = new URL(referer);
      const refererOrigin = refererUrl.origin;
      for (const pattern of CONSUMER_TRUSTED_PATTERNS) {
        if (pattern.test(refererOrigin)) {
          return true;
        }
      }
    } catch {
      // Invalid referer URL, ignore
    }
  }
  return false;
}

@@ -3,6 +3,24 @@
 *
 * Endpoints for managing worker tasks, viewing capacity metrics,
 * and generating batch tasks.
 *
 * SCHEDULE MANAGEMENT (added 2025-12-12):
 * This file now contains the canonical schedule management endpoints.
 * The job_schedules table has been deprecated and all schedule management
 * is now consolidated into task_schedules:
 *
 * Schedule endpoints:
 *   GET    /api/tasks/schedules             - List all schedules
 *   POST   /api/tasks/schedules             - Create new schedule
 *   GET    /api/tasks/schedules/:id         - Get schedule by ID
 *   PUT    /api/tasks/schedules/:id         - Update schedule
 *   DELETE /api/tasks/schedules/:id         - Delete schedule
 *   DELETE /api/tasks/schedules             - Bulk delete schedules
 *   POST   /api/tasks/schedules/:id/run-now - Trigger schedule immediately
 *   POST   /api/tasks/schedules/:id/toggle  - Toggle schedule enabled/disabled
 *
 * Note: Schedule routes are defined BEFORE /:id to avoid route conflicts
 * (Express matches routes in order, and "schedules" would match /:id otherwise)
 */

import { Router, Request, Response } from 'express';
@@ -13,6 +31,12 @@ import {
  TaskFilter,
} from '../tasks/task-service';
import { pool } from '../db/pool';
import {
  isTaskPoolPaused,
  pauseTaskPool,
  resumeTaskPool,
  getTaskPoolStatus,
} from '../tasks/task-pool-state';

const router = Router();

@@ -125,6 +149,520 @@ router.get('/capacity/:role', async (req: Request, res: Response) => {
  }
});

// ============================================================
// SCHEDULE MANAGEMENT ROUTES
// (Must be before /:id to avoid route conflicts)
// ============================================================

/**
 * GET /api/tasks/schedules
 * List all task schedules
 *
 * Returns schedules with is_immutable flag - immutable schedules can only
 * have their interval_hours, priority, and enabled fields updated (not deleted).
 */
router.get('/schedules', async (req: Request, res: Response) => {
  try {
    const enabledOnly = req.query.enabled === 'true';

    let query = `
      SELECT id, name, role, description, enabled, interval_hours,
             priority, state_code, platform, method,
             COALESCE(is_immutable, false) as is_immutable,
             last_run_at, next_run_at,
             last_task_count, last_error, created_at, updated_at
      FROM task_schedules
    `;

    if (enabledOnly) {
      query += ` WHERE enabled = true`;
    }

    query += ` ORDER BY
      CASE role
        WHEN 'store_discovery' THEN 1
        WHEN 'product_discovery' THEN 2
        WHEN 'analytics_refresh' THEN 3
        ELSE 4
      END,
      state_code NULLS FIRST,
      name`;

    const result = await pool.query(query);
    res.json({ schedules: result.rows });
  } catch (error: unknown) {
    console.error('Error listing schedules:', error);
    res.status(500).json({ error: 'Failed to list schedules' });
  }
});

/**
 * DELETE /api/tasks/schedules
 * Bulk delete schedules
 *
 * Immutable schedules are automatically skipped (not deleted).
 *
 * Body:
 * - ids: number[] (required) - array of schedule IDs to delete
 * - all: boolean (optional) - if true, delete all non-immutable schedules (ids ignored)
 */
router.delete('/schedules', async (req: Request, res: Response) => {
  try {
    const { ids, all } = req.body;

    let result;
    let skippedImmutable: { id: number; name: string }[] = [];

    if (all === true) {
      // First, find immutable schedules that will be skipped
      const immutableResult = await pool.query(`
        SELECT id, name FROM task_schedules WHERE is_immutable = true
      `);
      skippedImmutable = immutableResult.rows;

      // Delete all non-immutable schedules
      result = await pool.query(`
        DELETE FROM task_schedules
        WHERE COALESCE(is_immutable, false) = false
        RETURNING id, name
      `);
    } else if (Array.isArray(ids) && ids.length > 0) {
      // First, find which of the requested IDs are immutable
      const immutableResult = await pool.query(`
        SELECT id, name FROM task_schedules
        WHERE id = ANY($1) AND is_immutable = true
      `, [ids]);
      skippedImmutable = immutableResult.rows;

      // Delete only non-immutable schedules from the requested IDs
      result = await pool.query(`
        DELETE FROM task_schedules
        WHERE id = ANY($1) AND COALESCE(is_immutable, false) = false
        RETURNING id, name
      `, [ids]);
    } else {
      return res.status(400).json({
        error: 'Either provide ids array or set all=true',
      });
    }

    res.json({
      success: true,
      deleted_count: result.rowCount,
      deleted: result.rows,
      skipped_immutable_count: skippedImmutable.length,
      skipped_immutable: skippedImmutable,
      message: skippedImmutable.length > 0
        ? `Deleted ${result.rowCount} schedule(s), skipped ${skippedImmutable.length} immutable schedule(s)`
        : `Deleted ${result.rowCount} schedule(s)`,
    });
  } catch (error: unknown) {
    console.error('Error bulk deleting schedules:', error);
    res.status(500).json({ error: 'Failed to delete schedules' });
  }
});

/**
 * POST /api/tasks/schedules
 * Create a new schedule
 *
 * Body:
 * - name: string (required, unique)
 * - role: TaskRole (required)
 * - description: string (optional)
 * - enabled: boolean (default true)
 * - interval_hours: number (required)
 * - priority: number (default 0)
 * - state_code: string (optional)
 * - platform: string (optional)
 */
router.post('/schedules', async (req: Request, res: Response) => {
  try {
    const {
      name,
      role,
      description,
      enabled = true,
      interval_hours,
      priority = 0,
      state_code,
      platform,
    } = req.body;

    if (!name || !role || !interval_hours) {
      return res.status(400).json({
        error: 'name, role, and interval_hours are required',
      });
    }

    // Calculate next_run_at based on interval
    const nextRunAt = new Date(Date.now() + interval_hours * 60 * 60 * 1000);

    const result = await pool.query(`
      INSERT INTO task_schedules
        (name, role, description, enabled, interval_hours, priority, state_code, platform, next_run_at)
      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
      RETURNING id, name, role, description, enabled, interval_hours,
                priority, state_code, platform, last_run_at, next_run_at,
                last_task_count, last_error, created_at, updated_at
    `, [name, role, description, enabled, interval_hours, priority, state_code, platform, nextRunAt]);

    res.status(201).json(result.rows[0]);
  } catch (error: any) {
    if (error.code === '23505') {
      // Unique constraint violation
      return res.status(409).json({ error: 'A schedule with this name already exists' });
    }
    console.error('Error creating schedule:', error);
    res.status(500).json({ error: 'Failed to create schedule' });
  }
});
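
// --- Illustrative usage (editor's sketch, not part of the diff) ---
// Creating a nightly product discovery schedule for Arizona via the endpoint
// above; field names follow the handler's body contract, the host is assumed.
async function createAzDiscoverySchedule(): Promise<void> {
  const res = await fetch('/api/tasks/schedules', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      name: 'az-product-discovery',
      role: 'product_discovery',
      interval_hours: 24,
      priority: 10,
      state_code: 'AZ',
    }),
  });
  if (res.status === 409) throw new Error('Schedule name already taken');
  const schedule = await res.json();
  console.log(schedule.next_run_at); // first run roughly 24h out
}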
|
||||
/**
|
||||
* GET /api/tasks/schedules/:id
|
||||
* Get a specific schedule by ID
|
||||
*/
|
||||
router.get('/schedules/:id', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const scheduleId = parseInt(req.params.id, 10);
|
||||
|
||||
const result = await pool.query(`
|
||||
SELECT id, name, role, description, enabled, interval_hours,
|
||||
priority, state_code, platform, last_run_at, next_run_at,
|
||||
last_task_count, last_error, created_at, updated_at
|
||||
FROM task_schedules
|
||||
WHERE id = $1
|
||||
`, [scheduleId]);
|
||||
|
||||
if (result.rows.length === 0) {
|
||||
return res.status(404).json({ error: 'Schedule not found' });
|
||||
}
|
||||
|
||||
res.json(result.rows[0]);
|
||||
} catch (error: unknown) {
|
||||
console.error('Error getting schedule:', error);
|
||||
res.status(500).json({ error: 'Failed to get schedule' });
|
||||
}
|
});

/**
 * PUT /api/tasks/schedules/:id
 * Update an existing schedule
 *
 * For IMMUTABLE schedules, only these fields can be updated:
 * - enabled (turn on/off)
 * - interval_hours (change frequency)
 * - priority (change priority)
 *
 * For regular schedules, all fields can be updated.
 */
router.put('/schedules/:id', async (req: Request, res: Response) => {
  try {
    const scheduleId = parseInt(req.params.id, 10);
    const {
      name,
      role,
      description,
      enabled,
      interval_hours,
      priority,
      state_code,
      platform,
    } = req.body;

    // First check if schedule exists and if it's immutable
    const checkResult = await pool.query(`
      SELECT id, name, COALESCE(is_immutable, false) as is_immutable
      FROM task_schedules WHERE id = $1
    `, [scheduleId]);

    if (checkResult.rows.length === 0) {
      return res.status(404).json({ error: 'Schedule not found' });
    }

    const schedule = checkResult.rows[0];
    const isImmutable = schedule.is_immutable;

    // For immutable schedules, reject attempts to change protected fields
    if (isImmutable) {
      const protectedFields: string[] = [];
      if (name !== undefined) protectedFields.push('name');
      if (role !== undefined) protectedFields.push('role');
      if (description !== undefined) protectedFields.push('description');
      if (state_code !== undefined) protectedFields.push('state_code');
      if (platform !== undefined) protectedFields.push('platform');

      if (protectedFields.length > 0) {
        return res.status(403).json({
          error: 'Cannot modify protected fields on immutable schedule',
          message: `Schedule "${schedule.name}" is immutable. Only enabled, interval_hours, and priority can be changed.`,
          protected_fields: protectedFields,
          allowed_fields: ['enabled', 'interval_hours', 'priority'],
        });
      }
    }

    // Build dynamic update query
    const updates: string[] = [];
    const values: any[] = [];
    let paramIndex = 1;

    // These fields can only be updated on non-immutable schedules
    if (!isImmutable) {
      if (name !== undefined) {
        updates.push(`name = $${paramIndex++}`);
        values.push(name);
      }
      if (role !== undefined) {
        updates.push(`role = $${paramIndex++}`);
        values.push(role);
      }
      if (description !== undefined) {
        updates.push(`description = $${paramIndex++}`);
        values.push(description);
      }
      if (state_code !== undefined) {
        updates.push(`state_code = $${paramIndex++}`);
        values.push(state_code || null);
      }
      if (platform !== undefined) {
        updates.push(`platform = $${paramIndex++}`);
        values.push(platform || null);
      }
    }

    // These fields can be updated on ALL schedules (including immutable)
    if (enabled !== undefined) {
      updates.push(`enabled = $${paramIndex++}`);
      values.push(enabled);
    }
    if (interval_hours !== undefined) {
      updates.push(`interval_hours = $${paramIndex++}`);
      values.push(interval_hours);

      // Recalculate next_run_at if interval changed
      const nextRunAt = new Date(Date.now() + interval_hours * 60 * 60 * 1000);
      updates.push(`next_run_at = $${paramIndex++}`);
      values.push(nextRunAt);
    }
    if (priority !== undefined) {
      updates.push(`priority = $${paramIndex++}`);
      values.push(priority);
    }

    if (updates.length === 0) {
      return res.status(400).json({ error: 'No fields to update' });
    }

    updates.push('updated_at = NOW()');
    values.push(scheduleId);

    const result = await pool.query(`
      UPDATE task_schedules
      SET ${updates.join(', ')}
      WHERE id = $${paramIndex}
      RETURNING id, name, role, description, enabled, interval_hours,
                priority, state_code, platform, method,
                COALESCE(is_immutable, false) as is_immutable,
                last_run_at, next_run_at,
                last_task_count, last_error, created_at, updated_at
    `, values);

    res.json(result.rows[0]);
  } catch (error: any) {
    if (error.code === '23505') {
      return res.status(409).json({ error: 'A schedule with this name already exists' });
    }
    console.error('Error updating schedule:', error);
    res.status(500).json({ error: 'Failed to update schedule' });
  }
});
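
// Example (illustrative sketch, not part of the diff): updating only the
// mutable fields on an immutable schedule from an admin client. The schedule
// id and base URL are hypothetical.
//
//   await fetch('/api/tasks/schedules/42', {
//     method: 'PUT',
//     headers: { 'Content-Type': 'application/json' },
//     body: JSON.stringify({ enabled: true, interval_hours: 6, priority: 20 }),
//   });
//
// Sending { name: 'new-name' } for an immutable schedule would instead return
// 403 with protected_fields: ['name'].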

/**
 * DELETE /api/tasks/schedules/:id
 * Delete a schedule
 *
 * Immutable schedules cannot be deleted - they can only be disabled.
 */
router.delete('/schedules/:id', async (req: Request, res: Response) => {
  try {
    const scheduleId = parseInt(req.params.id, 10);

    // First check if schedule exists and is immutable
    const checkResult = await pool.query(`
      SELECT id, name, COALESCE(is_immutable, false) as is_immutable
      FROM task_schedules WHERE id = $1
    `, [scheduleId]);

    if (checkResult.rows.length === 0) {
      return res.status(404).json({ error: 'Schedule not found' });
    }

    const schedule = checkResult.rows[0];

    // Prevent deletion of immutable schedules
    if (schedule.is_immutable) {
      return res.status(403).json({
        error: 'Cannot delete immutable schedule',
        message: `Schedule "${schedule.name}" is immutable and cannot be deleted. You can disable it instead.`,
        schedule_id: scheduleId,
        is_immutable: true,
      });
    }

    // Delete the schedule
    await pool.query(`DELETE FROM task_schedules WHERE id = $1`, [scheduleId]);

    res.json({
      success: true,
      message: `Schedule "${schedule.name}" deleted`,
    });
  } catch (error: unknown) {
    console.error('Error deleting schedule:', error);
    res.status(500).json({ error: 'Failed to delete schedule' });
  }
});
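
// Illustrative fallback (an assumption, not from this diff): when DELETE
// returns 403 because the schedule is immutable, a client can disable it
// instead, since enabled stays editable via PUT.
//
//   const del = await fetch(`/api/tasks/schedules/${id}`, { method: 'DELETE' });
//   if (del.status === 403) {
//     await fetch(`/api/tasks/schedules/${id}`, {
//       method: 'PUT',
//       headers: { 'Content-Type': 'application/json' },
//       body: JSON.stringify({ enabled: false }),
//     });
//   }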

/**
 * POST /api/tasks/schedules/:id/run-now
 * Manually trigger a scheduled task to run immediately
 *
 * For product_discovery schedules with state_code, this creates individual
 * tasks for each store in that state (fans out properly).
 */
router.post('/schedules/:id/run-now', async (req: Request, res: Response) => {
  try {
    const scheduleId = parseInt(req.params.id, 10);

    // Get the full schedule
    const scheduleResult = await pool.query(`
      SELECT id, name, role, state_code, platform, priority, interval_hours, method
      FROM task_schedules WHERE id = $1
    `, [scheduleId]);

    if (scheduleResult.rows.length === 0) {
      return res.status(404).json({ error: 'Schedule not found' });
    }

    const schedule = scheduleResult.rows[0];
    let tasksCreated = 0;

    // For product_discovery with state_code, fan out to individual stores
    if (schedule.role === 'product_discovery' && schedule.state_code) {
      // Find stores in this state needing refresh
      const storeResult = await pool.query(`
        SELECT d.id
        FROM dispensaries d
        JOIN states s ON d.state_id = s.id
        WHERE d.crawl_enabled = true
          AND d.platform_dispensary_id IS NOT NULL
          AND s.code = $1
          -- No pending/running product_discovery task already
          AND NOT EXISTS (
            SELECT 1 FROM worker_tasks t
            WHERE t.dispensary_id = d.id
              AND t.role = 'product_discovery'
              AND t.status IN ('pending', 'claimed', 'running')
          )
        ORDER BY d.last_fetch_at NULLS FIRST, d.id
      `, [schedule.state_code]);

      const dispensaryIds = storeResult.rows.map((r: { id: number }) => r.id);

      if (dispensaryIds.length > 0) {
        // Create staggered tasks for all stores
        const result = await taskService.createStaggeredTasks(
          dispensaryIds,
          'product_discovery',
          15, // 15 seconds stagger
          schedule.platform || 'dutchie',
          schedule.method || 'http'
        );
        tasksCreated = result.created;
      } else {
        // No stores need refresh - return early with message
        return res.json({
          success: true,
          message: `No ${schedule.state_code} stores need refresh at this time`,
          tasksCreated: 0,
          stateCode: schedule.state_code,
        });
      }
    } else if (schedule.role !== 'product_discovery') {
      // For other schedules (store_discovery, analytics_refresh), create a single task
      await taskService.createTask({
        role: schedule.role,
        platform: schedule.platform,
        priority: schedule.priority + 10,
        method: schedule.method,
      });
      tasksCreated = 1;
    } else {
      // product_discovery without state_code - shouldn't happen, reject
      return res.status(400).json({
        error: 'product_discovery schedules require a state_code',
      });
    }

    // Update last_run_at on the schedule
    await pool.query(`
      UPDATE task_schedules
      SET last_run_at = NOW(),
          next_run_at = NOW() + (interval_hours || ' hours')::interval,
          last_task_count = $2,
          updated_at = NOW()
      WHERE id = $1
    `, [scheduleId, tasksCreated]);

    res.json({
      success: true,
      message: `Schedule "${schedule.name}" triggered`,
      tasksCreated,
      stateCode: schedule.state_code,
    });
  } catch (error: unknown) {
    console.error('Error running schedule:', error);
    res.status(500).json({ error: 'Failed to run schedule' });
  }
});
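
// Hypothetical trigger call for the fan-out path above: a product_discovery
// schedule scoped to a state creates one task per eligible store, so
// tasksCreated reflects the store count rather than 1. Schedule id and
// counts are made up.
//
//   const res = await fetch('/api/tasks/schedules/7/run-now', { method: 'POST' });
//   const body = await res.json();
//   console.log(body.tasksCreated, body.stateCode); // e.g. 38 'AZ'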

/**
 * POST /api/tasks/schedules/:id/toggle
 * Toggle a schedule's enabled status
 */
router.post('/schedules/:id/toggle', async (req: Request, res: Response) => {
  try {
    const scheduleId = parseInt(req.params.id, 10);

    const result = await pool.query(`
      UPDATE task_schedules
      SET enabled = NOT enabled,
          updated_at = NOW()
      WHERE id = $1
      RETURNING id, name, enabled
    `, [scheduleId]);

    if (result.rows.length === 0) {
      return res.status(404).json({ error: 'Schedule not found' });
    }

    res.json({
      success: true,
      schedule: result.rows[0],
      message: result.rows[0].enabled
        ? `Schedule "${result.rows[0].name}" enabled`
        : `Schedule "${result.rows[0].name}" disabled`,
    });
  } catch (error: unknown) {
    console.error('Error toggling schedule:', error);
    res.status(500).json({ error: 'Failed to toggle schedule' });
  }
});

// ============================================================
// TASK-SPECIFIC ROUTES (with :id parameter)
// ============================================================

/**
 * GET /api/tasks/:id
 * Get a specific task by ID

@@ -592,4 +1130,378 @@ router.post('/migration/full-migrate', async (req: Request, res: Response) => {
  }
});

// ============================================================
// STAGGERED BATCH TASK CREATION
// ============================================================

/**
 * POST /api/tasks/batch/staggered
 * Create multiple tasks with staggered start times
 *
 * This endpoint prevents resource contention when creating many tasks by
 * staggering their scheduled_for timestamps. Each task becomes eligible
 * for claiming only after its scheduled time.
 *
 * WORKFLOW:
 * 1. Tasks created with scheduled_for = NOW() + (index * stagger_seconds)
 * 2. Worker claims task only when scheduled_for <= NOW()
 * 3. Worker runs preflight on EVERY task claim
 * 4. If preflight passes, worker executes task
 * 5. If preflight fails, task released back to pending for another worker
 *
 * Body:
 * - dispensary_ids: number[] (required) - Array of dispensary IDs
 * - role: TaskRole (required) - 'product_refresh' | 'product_discovery'
 * - stagger_seconds: number (default: 15) - Seconds between each task start
 * - platform: string (default: 'dutchie')
 * - method: 'curl' | 'http' | null (default: null)
 */
router.post('/batch/staggered', async (req: Request, res: Response) => {
  try {
    const {
      dispensary_ids,
      role,
      stagger_seconds = 15,
      platform = 'dutchie',
      method = null,
    } = req.body;

    if (!dispensary_ids || !Array.isArray(dispensary_ids) || dispensary_ids.length === 0) {
      return res.status(400).json({ error: 'dispensary_ids array is required' });
    }

    if (!role) {
      return res.status(400).json({ error: 'role is required' });
    }

    const result = await taskService.createStaggeredTasks(
      dispensary_ids,
      role as TaskRole,
      stagger_seconds,
      platform,
      method
    );

    const totalDuration = (dispensary_ids.length - 1) * stagger_seconds;
    const estimatedEndTime = new Date(Date.now() + totalDuration * 1000);

    res.status(201).json({
      success: true,
      created: result.created,
      task_ids: result.taskIds,
      stagger_seconds,
      total_duration_seconds: totalDuration,
      estimated_completion: estimatedEndTime.toISOString(),
      message: `Created ${result.created} staggered ${role} tasks (${stagger_seconds}s apart, ~${Math.ceil(totalDuration / 60)} min total)`,
    });
  } catch (error: unknown) {
    console.error('Error creating staggered tasks:', error);
    res.status(500).json({ error: 'Failed to create staggered tasks' });
  }
});
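
// Worked example of the stagger math above (a sketch; the values are assumed):
// with 24 dispensary_ids and stagger_seconds = 15, task i becomes claimable
// at NOW() + i * 15s, so the last task starts (24 - 1) * 15 = 345s (~6 min) out.
//
//   const dispensaryIds = Array.from({ length: 24 }, (_, i) => i + 1);
//   const staggerSeconds = 15;
//   const startTimes = dispensaryIds.map(
//     (_, i) => new Date(Date.now() + i * staggerSeconds * 1000)
//   );
//   console.log(startTimes[23].getTime() - startTimes[0].getTime()); // 345000 ms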

/**
 * POST /api/tasks/batch/az-stores
 * Convenience endpoint to create staggered tasks for Arizona stores
 *
 * Body:
 * - total_tasks: number (default: 24) - Total tasks to create
 * - stagger_seconds: number (default: 15) - Seconds between each task
 * - split_roles: boolean (default: true) - Split between product_refresh and product_discovery
 */
router.post('/batch/az-stores', async (req: Request, res: Response) => {
  try {
    const {
      total_tasks = 24,
      stagger_seconds = 15,
      split_roles = true,
    } = req.body;

    const result = await taskService.createAZStoreTasks(
      total_tasks,
      stagger_seconds,
      split_roles
    );

    const totalDuration = (result.total - 1) * stagger_seconds;
    const estimatedEndTime = new Date(Date.now() + totalDuration * 1000);

    res.status(201).json({
      success: true,
      total: result.total,
      product_refresh: result.product_refresh,
      product_discovery: result.product_discovery,
      task_ids: result.taskIds,
      stagger_seconds,
      total_duration_seconds: totalDuration,
      estimated_completion: estimatedEndTime.toISOString(),
      message: `Created ${result.total} staggered tasks for AZ stores (${result.product_refresh} refresh, ${result.product_discovery} discovery)`,
    });
  } catch (error: unknown) {
    console.error('Error creating AZ store tasks:', error);
    res.status(500).json({ error: 'Failed to create AZ store tasks' });
  }
});

/**
 * POST /api/tasks/batch/entry-point-discovery
 * Create entry_point_discovery tasks for stores missing platform_dispensary_id
 *
 * This is idempotent - stores that already have platform_dispensary_id are skipped.
 * Only creates tasks for stores with menu_url set and crawl_enabled = true.
 *
 * Body (optional):
 * - state_code: string (optional) - Filter by state code
 * - stagger_seconds: number (default: 5) - Seconds between tasks
 * - force: boolean (default: false) - Re-run even for previously failed stores
 */
router.post('/batch/entry-point-discovery', async (req: Request, res: Response) => {
  try {
    const {
      state_code,
      stagger_seconds = 5,
      force = false,
    } = req.body;

    // Find stores that need entry point discovery
    const storeResult = await pool.query(`
      SELECT d.id, d.name, d.menu_url
      FROM dispensaries d
      JOIN states s ON d.state_id = s.id
      WHERE d.crawl_enabled = true
        AND d.menu_url IS NOT NULL
        AND d.platform_dispensary_id IS NULL
        ${state_code ? 'AND s.code = $1' : ''}
        ${!force ? "AND (d.id_resolution_status IS NULL OR d.id_resolution_status = 'pending')" : ''}
        -- No pending/running entry_point_discovery task already
        AND NOT EXISTS (
          SELECT 1 FROM worker_tasks t
          WHERE t.dispensary_id = d.id
            AND t.role = 'entry_point_discovery'
            AND t.status IN ('pending', 'claimed', 'running')
        )
      ORDER BY d.id
    `, state_code ? [state_code.toUpperCase()] : []);

    const dispensaryIds = storeResult.rows.map((r: { id: number }) => r.id);

    if (dispensaryIds.length === 0) {
      return res.json({
        success: true,
        message: state_code
          ? `No ${state_code.toUpperCase()} stores need entry point discovery`
          : 'No stores need entry point discovery',
        tasks_created: 0,
      });
    }

    // Create staggered tasks
    const taskIds: number[] = [];
    for (let i = 0; i < dispensaryIds.length; i++) {
      const scheduledFor = new Date(Date.now() + i * stagger_seconds * 1000);
      const result = await pool.query(`
        INSERT INTO worker_tasks (role, dispensary_id, priority, scheduled_for, method)
        VALUES ('entry_point_discovery', $1, 10, $2, 'http')
        RETURNING id
      `, [dispensaryIds[i], scheduledFor]);
      taskIds.push(result.rows[0].id);
    }

    const totalDuration = dispensaryIds.length * stagger_seconds;
    const estimatedEndTime = new Date(Date.now() + totalDuration * 1000);

    res.json({
      success: true,
      tasks_created: taskIds.length,
      task_ids: taskIds,
      stores: storeResult.rows.map((r: { id: number; name: string }) => ({ id: r.id, name: r.name })),
      stagger_seconds,
      total_duration_seconds: totalDuration,
      estimated_completion: estimatedEndTime.toISOString(),
      message: `Created ${taskIds.length} entry_point_discovery tasks${state_code ? ` for ${state_code.toUpperCase()}` : ''}`,
    });
  } catch (error: unknown) {
    console.error('Error creating entry point discovery tasks:', error);
    res.status(500).json({ error: 'Failed to create entry point discovery tasks' });
  }
});
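
// Sketch of invoking the idempotent batch above (state code and defaults are
// assumed): re-running it is safe, because stores that already have a
// platform_dispensary_id or an active entry_point_discovery task are
// excluded by the query.
//
//   await fetch('/api/tasks/batch/entry-point-discovery', {
//     method: 'POST',
//     headers: { 'Content-Type': 'application/json' },
//     body: JSON.stringify({ state_code: 'az', stagger_seconds: 5 }),
//   }); // state_code is upper-cased server-side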

// ============================================================
// STATE-BASED CRAWL ENDPOINTS
// ============================================================

/**
 * POST /api/tasks/crawl-state/:stateCode
 * Create product_discovery tasks for all stores in a state
 *
 * This is the primary endpoint for triggering crawls by state.
 * Creates staggered tasks for all crawl-enabled stores in the specified state.
 *
 * Params:
 * - stateCode: State code (e.g., 'AZ', 'CA', 'CO')
 *
 * Body (optional):
 * - stagger_seconds: number (default: 15) - Seconds between each task
 * - priority: number (default: 10) - Task priority
 * - method: 'curl' | 'http' | null (default: 'http')
 *
 * Returns:
 * - tasks_created: Number of tasks created
 * - stores_in_state: Total stores found for the state
 * - skipped: Number skipped (already have active tasks)
 */
router.post('/crawl-state/:stateCode', async (req: Request, res: Response) => {
  try {
    const stateCode = req.params.stateCode.toUpperCase();
    const {
      stagger_seconds = 15,
      priority = 10,
      method = 'http',
    } = req.body;

    // Verify state exists
    const stateResult = await pool.query(`
      SELECT id, code, name FROM states WHERE code = $1
    `, [stateCode]);

    if (stateResult.rows.length === 0) {
      return res.status(404).json({
        error: 'State not found',
        state_code: stateCode,
      });
    }

    const state = stateResult.rows[0];

    // Get all crawl-enabled dispensaries in this state
    const dispensariesResult = await pool.query(`
      SELECT d.id, d.name
      FROM dispensaries d
      WHERE d.state_id = $1
        AND d.crawl_enabled = true
        AND d.platform_dispensary_id IS NOT NULL
      ORDER BY d.last_fetch_at NULLS FIRST, d.id
    `, [state.id]);

    if (dispensariesResult.rows.length === 0) {
      return res.status(200).json({
        success: true,
        message: `No crawl-enabled stores found in ${state.name}`,
        state_code: stateCode,
        state_name: state.name,
        tasks_created: 0,
        stores_in_state: 0,
      });
    }

    const dispensaryIds = dispensariesResult.rows.map((d: { id: number }) => d.id);

    // Create staggered tasks
    const result = await taskService.createStaggeredTasks(
      dispensaryIds,
      'product_discovery',
      stagger_seconds,
      'dutchie',
      method
    );

    const totalDuration = (result.created - 1) * stagger_seconds;
    const estimatedEndTime = new Date(Date.now() + totalDuration * 1000);

    res.status(201).json({
      success: true,
      state_code: stateCode,
      state_name: state.name,
      tasks_created: result.created,
      stores_in_state: dispensariesResult.rows.length,
      skipped: dispensariesResult.rows.length - result.created,
      stagger_seconds,
      total_duration_seconds: totalDuration,
      estimated_completion: estimatedEndTime.toISOString(),
      message: `Created ${result.created} product_discovery tasks for ${state.name} (${stagger_seconds}s apart, ~${Math.ceil(totalDuration / 60)} min total)`,
    });
  } catch (error: unknown) {
    console.error('Error creating state crawl tasks:', error);
    res.status(500).json({ error: 'Failed to create state crawl tasks' });
  }
});
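
// Hypothetical client call for the state crawl above; stores that already
// have a pending/claimed/running task show up in `skipped`.
//
//   const res = await fetch('/api/tasks/crawl-state/AZ', {
//     method: 'POST',
//     headers: { 'Content-Type': 'application/json' },
//     body: JSON.stringify({ stagger_seconds: 15, method: 'http' }),
//   });
//   const { tasks_created, skipped, estimated_completion } = await res.json();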

/**
 * GET /api/tasks/states
 * List all states with their store counts and crawl status
 */
router.get('/states', async (_req: Request, res: Response) => {
  try {
    const result = await pool.query(`
      SELECT
        s.code,
        s.name,
        COUNT(d.id)::int as total_stores,
        COUNT(d.id) FILTER (WHERE d.crawl_enabled = true AND d.platform_dispensary_id IS NOT NULL)::int as crawl_enabled_stores,
        COUNT(d.id) FILTER (WHERE d.crawl_enabled = true AND d.platform_dispensary_id IS NULL)::int as missing_platform_id,
        MAX(d.last_fetch_at) as last_crawl_at,
        (SELECT COUNT(*) FROM worker_tasks t
         JOIN dispensaries d2 ON t.dispensary_id = d2.id
         WHERE d2.state_id = s.id
           AND t.role = 'product_discovery'
           AND t.status IN ('pending', 'claimed', 'running'))::int as active_tasks
      FROM states s
      LEFT JOIN dispensaries d ON d.state_id = s.id
      GROUP BY s.id, s.code, s.name
      HAVING COUNT(d.id) > 0
      ORDER BY COUNT(d.id) DESC
    `);

    res.json({
      states: result.rows,
      total_states: result.rows.length,
    });
  } catch (error: unknown) {
    console.error('Error listing states:', error);
    res.status(500).json({ error: 'Failed to list states' });
  }
});

// ============================================================
// TASK POOL MANAGEMENT
// ============================================================

/**
 * GET /api/tasks/pool/status
 * Check if task pool is paused
 */
router.get('/pool/status', async (_req: Request, res: Response) => {
  const status = getTaskPoolStatus();
  res.json({
    success: true,
    ...status,
  });
});

/**
 * POST /api/tasks/pool/pause
 * Pause the task pool - workers won't pick up new tasks
 */
router.post('/pool/pause', async (_req: Request, res: Response) => {
  pauseTaskPool();
  res.json({
    success: true,
    paused: true,
    message: 'Task pool paused - workers will not pick up new tasks',
  });
});

/**
 * POST /api/tasks/pool/resume
 * Resume the task pool - workers will pick up tasks again
 */
router.post('/pool/resume', async (_req: Request, res: Response) => {
  resumeTaskPool();
  res.json({
    success: true,
    paused: false,
    message: 'Task pool resumed - workers will pick up new tasks',
  });
});
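
// Illustrative maintenance flow (a sketch, assuming getTaskPoolStatus exposes
// a `paused` flag): pause the pool before a deploy, verify via /pool/status,
// then resume. Workers finish in-flight tasks; they just stop claiming new
// ones while paused.
//
//   await fetch('/api/tasks/pool/pause', { method: 'POST' });
//   const { paused } = await (await fetch('/api/tasks/pool/status')).json();
//   // ...deploy...
//   await fetch('/api/tasks/pool/resume', { method: 'POST' });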

export default router;

@@ -14,23 +14,36 @@ router.get('/', async (req: AuthRequest, res) => {
  try {
    const { search, domain } = req.query;

    let query = `
      SELECT id, email, role, first_name, last_name, phone, domain, created_at, updated_at
      FROM users
      WHERE 1=1
    `;
    // Check which columns exist (schema-tolerant)
    const columnsResult = await pool.query(`
      SELECT column_name FROM information_schema.columns
      WHERE table_name = 'users' AND column_name IN ('first_name', 'last_name', 'phone', 'domain')
    `);
    const existingColumns = new Set(columnsResult.rows.map((r: any) => r.column_name));

    // Build column list based on what exists
    const selectCols = ['id', 'email', 'role', 'created_at', 'updated_at'];
    if (existingColumns.has('first_name')) selectCols.push('first_name');
    if (existingColumns.has('last_name')) selectCols.push('last_name');
    if (existingColumns.has('phone')) selectCols.push('phone');
    if (existingColumns.has('domain')) selectCols.push('domain');

    let query = `SELECT ${selectCols.join(', ')} FROM users WHERE 1=1`;
    const params: any[] = [];
    let paramIndex = 1;

    // Search by email, first_name, or last_name
    // Search by email (and optionally first_name, last_name if they exist)
    if (search && typeof search === 'string') {
      query += ` AND (email ILIKE $${paramIndex} OR first_name ILIKE $${paramIndex} OR last_name ILIKE $${paramIndex})`;
      const searchClauses = ['email ILIKE $' + paramIndex];
      if (existingColumns.has('first_name')) searchClauses.push('first_name ILIKE $' + paramIndex);
      if (existingColumns.has('last_name')) searchClauses.push('last_name ILIKE $' + paramIndex);
      query += ` AND (${searchClauses.join(' OR ')})`;
      params.push(`%${search}%`);
      paramIndex++;
    }

    // Filter by domain
    if (domain && typeof domain === 'string') {
    // Filter by domain (if column exists)
    if (domain && typeof domain === 'string' && existingColumns.has('domain')) {
      query += ` AND domain = $${paramIndex}`;
      params.push(domain);
      paramIndex++;

@@ -50,8 +63,22 @@ router.get('/', async (req: AuthRequest, res) => {
router.get('/:id', async (req: AuthRequest, res) => {
  try {
    const { id } = req.params;

    // Check which columns exist (schema-tolerant)
    const columnsResult = await pool.query(`
      SELECT column_name FROM information_schema.columns
      WHERE table_name = 'users' AND column_name IN ('first_name', 'last_name', 'phone', 'domain')
    `);
    const existingColumns = new Set(columnsResult.rows.map((r: any) => r.column_name));

    const selectCols = ['id', 'email', 'role', 'created_at', 'updated_at'];
    if (existingColumns.has('first_name')) selectCols.push('first_name');
    if (existingColumns.has('last_name')) selectCols.push('last_name');
    if (existingColumns.has('phone')) selectCols.push('phone');
    if (existingColumns.has('domain')) selectCols.push('domain');

    const result = await pool.query(`
      SELECT id, email, role, first_name, last_name, phone, domain, created_at, updated_at
      SELECT ${selectCols.join(', ')}
      FROM users
      WHERE id = $1
    `, [id]);

@@ -23,6 +23,8 @@
import { Router, Request, Response } from 'express';
import { pool } from '../db/pool';
import os from 'os';
import { runPuppeteerPreflightWithRetry } from '../services/puppeteer-preflight';
import { CrawlRotator } from '../services/crawl-rotator';

const router = Router();

@@ -70,21 +72,20 @@ router.post('/register', async (req: Request, res: Response) => {
    );

    if (existing.rows.length > 0) {
      // Re-activate existing worker
      // Re-activate existing worker - keep existing pod_name (fantasy name), don't overwrite with K8s name
      const { rows } = await pool.query(`
        UPDATE worker_registry
        SET status = 'active',
            role = $1,
            pod_name = $2,
            hostname = $3,
            ip_address = $4,
            hostname = $2,
            ip_address = $3,
            last_heartbeat_at = NOW(),
            started_at = NOW(),
            metadata = $5,
            metadata = $4,
            updated_at = NOW()
        WHERE worker_id = $6
        RETURNING id, worker_id, friendly_name, role
      `, [role, pod_name, finalHostname, clientIp, metadata, finalWorkerId]);
        WHERE worker_id = $5
        RETURNING id, worker_id, friendly_name, pod_name, role
      `, [role, finalHostname, clientIp, metadata, finalWorkerId]);

      const worker = rows[0];
      const roleMsg = role ? `for ${role}` : 'as role-agnostic';

@@ -105,13 +106,13 @@ router.post('/register', async (req: Request, res: Response) => {
      const nameResult = await pool.query('SELECT assign_worker_name($1) as name', [finalWorkerId]);
      const friendlyName = nameResult.rows[0].name;

      // Register the worker
      // Register the worker - use friendlyName as pod_name (not K8s name)
      const { rows } = await pool.query(`
        INSERT INTO worker_registry (
          worker_id, friendly_name, role, pod_name, hostname, ip_address, status, metadata
        ) VALUES ($1, $2, $3, $4, $5, $6, 'active', $7)
        RETURNING id, worker_id, friendly_name, role
      `, [finalWorkerId, friendlyName, role, pod_name, finalHostname, clientIp, metadata]);
        RETURNING id, worker_id, friendly_name, pod_name, role
      `, [finalWorkerId, friendlyName, role, friendlyName, finalHostname, clientIp, metadata]);

      const worker = rows[0];
      const roleMsg = role ? `for ${role}` : 'as role-agnostic';

@@ -138,17 +139,46 @@ router.post('/register', async (req: Request, res: Response) => {
 *
 * Body:
 * - worker_id: string (required)
 * - current_task_id: number (optional) - task currently being processed
 * - current_task_id: number (optional) - task currently being processed (primary task)
 * - current_task_ids: number[] (optional) - all tasks currently being processed (concurrent)
 * - active_task_count: number (optional) - number of tasks currently running
 * - max_concurrent_tasks: number (optional) - max concurrent tasks this worker can handle
 * - status: string (optional) - 'active', 'idle'
 * - resources: object (optional) - memory_mb, cpu_user_ms, cpu_system_ms, etc.
 */
router.post('/heartbeat', async (req: Request, res: Response) => {
  try {
    const { worker_id, current_task_id, status = 'active', resources } = req.body;
    const {
      worker_id,
      current_task_id,
      current_task_ids,
      active_task_count,
      max_concurrent_tasks,
      status = 'active',
      resources,
      // Step tracking fields
      current_step,
      current_step_detail,
      current_step_started_at,
      task_steps,
    } = req.body;

    if (!worker_id) {
      return res.status(400).json({ success: false, error: 'worker_id is required' });
    }

    // Build metadata object with all the new fields
    const metadata: Record<string, unknown> = {};
    if (resources) Object.assign(metadata, resources);
    if (current_task_ids) metadata.current_task_ids = current_task_ids;
    if (active_task_count !== undefined) metadata.active_task_count = active_task_count;
    if (max_concurrent_tasks !== undefined) metadata.max_concurrent_tasks = max_concurrent_tasks;
    // Step tracking
    if (current_step) metadata.current_step = current_step;
    if (current_step_detail) metadata.current_step_detail = current_step_detail;
    if (current_step_started_at) metadata.current_step_started_at = current_step_started_at;
    if (task_steps) metadata.task_steps = task_steps;

    // Store the merged metadata (resources + concurrency + step tracking) in the metadata jsonb column
    const { rows } = await pool.query(`
      UPDATE worker_registry
@@ -159,7 +189,7 @@ router.post('/heartbeat', async (req: Request, res: Response) => {
          updated_at = NOW()
      WHERE worker_id = $3
      RETURNING id, friendly_name, status
    `, [current_task_id || null, status, worker_id, resources ? JSON.stringify(resources) : null]);
    `, [current_task_id || null, status, worker_id, Object.keys(metadata).length > 0 ? JSON.stringify(metadata) : null]);

    if (rows.length === 0) {
      return res.status(404).json({ success: false, error: 'Worker not found - please register first' });
@@ -232,12 +262,9 @@ router.post('/deregister', async (req: Request, res: Response) => {
    // Release the name back to the pool
    await pool.query('SELECT release_worker_name($1)', [worker_id]);

    // Mark as terminated
    // Delete the worker entry (clean shutdown)
    const { rows } = await pool.query(`
      UPDATE worker_registry
      SET status = 'terminated',
          current_task_id = NULL,
          updated_at = NOW()
      DELETE FROM worker_registry
      WHERE worker_id = $1
      RETURNING id, friendly_name
    `, [worker_id]);
@@ -273,6 +300,29 @@
 */
router.get('/workers', async (req: Request, res: Response) => {
  try {
    // Check if worker_registry table exists
    const tableCheck = await pool.query(`
      SELECT EXISTS (
        SELECT FROM information_schema.tables
        WHERE table_name = 'worker_registry'
      ) as exists
    `);

    if (!tableCheck.rows[0].exists) {
      // Return empty result if table doesn't exist yet
      return res.json({
        success: true,
        workers: [],
        summary: {
          active_count: 0,
          idle_count: 0,
          offline_count: 0,
          total_count: 0,
          active_roles: 0
        }
      });
    }

    const { status, role, include_terminated = 'false' } = req.query;

    let whereClause = include_terminated === 'true' ? 'WHERE 1=1' : "WHERE status != 'terminated'";
@@ -307,12 +357,27 @@ router.get('/workers', async (req: Request, res: Response) => {
        tasks_completed,
        tasks_failed,
        current_task_id,
        -- Concurrent task fields from metadata
        (metadata->>'current_task_ids')::jsonb as current_task_ids,
        (metadata->>'active_task_count')::int as active_task_count,
        (metadata->>'max_concurrent_tasks')::int as max_concurrent_tasks,
        -- Decommission fields
        COALESCE(decommission_requested, false) as decommission_requested,
        decommission_reason,
        -- Preflight fields (dual-transport verification)
        curl_ip,
        http_ip,
        preflight_status,
        preflight_at,
        fingerprint_data,
        -- Full metadata for resources
        metadata,
        EXTRACT(EPOCH FROM (NOW() - last_heartbeat_at)) as seconds_since_heartbeat,
        CASE
          WHEN status = 'offline' OR status = 'terminated' THEN status
          WHEN last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
          WHEN current_task_id IS NOT NULL THEN 'busy'
          WHEN (metadata->>'active_task_count')::int > 0 THEN 'busy'
          ELSE 'ready'
        END as health_status,
        created_at
@@ -649,4 +714,217 @@ router.get('/capacity', async (_req: Request, res: Response) => {
  }
});

// ============================================================
// WORKER LIFECYCLE MANAGEMENT
// ============================================================

/**
 * POST /api/worker-registry/workers/:workerId/decommission
 * Request graceful decommission of a worker (will stop after current task)
 */
router.post('/workers/:workerId/decommission', async (req: Request, res: Response) => {
  try {
    const { workerId } = req.params;
    const { reason, issued_by } = req.body;

    // Update worker_registry to flag for decommission
    const result = await pool.query(
      `UPDATE worker_registry
       SET decommission_requested = true,
           decommission_reason = $2,
           decommission_requested_at = NOW()
       WHERE worker_id = $1
       RETURNING friendly_name, status, current_task_id`,
      [workerId, reason || 'Manual decommission from admin']
    );

    if (result.rows.length === 0) {
      return res.status(404).json({ success: false, error: 'Worker not found' });
    }

    const worker = result.rows[0];

    // Also log to worker_commands for audit trail
    await pool.query(
      `INSERT INTO worker_commands (worker_id, command, reason, issued_by)
       VALUES ($1, 'decommission', $2, $3)
       ON CONFLICT DO NOTHING`,
      [workerId, reason || 'Manual decommission', issued_by || 'admin']
    ).catch(() => {
      // Table might not exist yet - ignore
    });

    res.json({
      success: true,
      message: worker.current_task_id
        ? `Worker ${worker.friendly_name} will stop after completing task #${worker.current_task_id}`
        : `Worker ${worker.friendly_name} will stop on next poll`,
      worker: {
        friendly_name: worker.friendly_name,
        status: worker.status,
        current_task_id: worker.current_task_id,
        decommission_requested: true
      }
    });
  } catch (error: any) {
    res.status(500).json({ success: false, error: error.message });
  }
});
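
// Hypothetical admin call for the graceful decommission above; the worker
// finishes its current task (if any) before stopping, and the request stays
// reversible via cancel-decommission until the worker actually exits. The
// worker id is made up.
//
//   await fetch('/api/worker-registry/workers/wkr-123/decommission', {
//     method: 'POST',
//     headers: { 'Content-Type': 'application/json' },
//     body: JSON.stringify({ reason: 'rolling restart', issued_by: 'ops' }),
//   });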

/**
 * POST /api/worker-registry/workers/:workerId/cancel-decommission
 * Cancel a pending decommission request
 */
router.post('/workers/:workerId/cancel-decommission', async (req: Request, res: Response) => {
  try {
    const { workerId } = req.params;

    const result = await pool.query(
      `UPDATE worker_registry
       SET decommission_requested = false,
           decommission_reason = NULL,
           decommission_requested_at = NULL
       WHERE worker_id = $1
       RETURNING friendly_name`,
      [workerId]
    );

    if (result.rows.length === 0) {
      return res.status(404).json({ success: false, error: 'Worker not found' });
    }

    res.json({
      success: true,
      message: `Decommission cancelled for ${result.rows[0].friendly_name}`
    });
  } catch (error: any) {
    res.status(500).json({ success: false, error: error.message });
  }
});

/**
 * POST /api/worker-registry/spawn
 * Spawn a new worker in the current pod (only works in multi-worker-per-pod mode)
 * For now, this is a placeholder - actual spawning requires the pod supervisor
 */
router.post('/spawn', async (req: Request, res: Response) => {
  try {
    const { pod_name, role } = req.body;

    // For now, we can't actually spawn workers from the API.
    // This would require a supervisor process in each pod that listens for spawn commands.
    // Instead, return instructions for how to scale.
    res.json({
      success: false,
      error: 'Direct worker spawning not yet implemented',
      instructions: 'To add workers, scale the K8s deployment: kubectl scale deployment/scraper-worker --replicas=N'
    });
  } catch (error: any) {
    res.status(500).json({ success: false, error: error.message });
  }
});

/**
 * GET /api/worker-registry/pods
 * Get workers grouped by pod
 */
router.get('/pods', async (_req: Request, res: Response) => {
  try {
    const { rows } = await pool.query(`
      SELECT
        COALESCE(pod_name, 'Unknown') as pod_name,
        COUNT(*) as worker_count,
        COUNT(*) FILTER (WHERE current_task_id IS NOT NULL) as busy_count,
        COUNT(*) FILTER (WHERE current_task_id IS NULL) as idle_count,
        SUM(tasks_completed) as total_completed,
        SUM(tasks_failed) as total_failed,
        SUM((metadata->>'memory_rss_mb')::int) as total_memory_mb,
        array_agg(json_build_object(
          'worker_id', worker_id,
          'friendly_name', friendly_name,
          'status', status,
          'current_task_id', current_task_id,
          'tasks_completed', tasks_completed,
          'tasks_failed', tasks_failed,
          'decommission_requested', COALESCE(decommission_requested, false),
          'last_heartbeat_at', last_heartbeat_at
        )) as workers
      FROM worker_registry
      WHERE status NOT IN ('offline', 'terminated')
      GROUP BY pod_name
      ORDER BY pod_name
    `);

    res.json({
      success: true,
      pods: rows.map(row => ({
        pod_name: row.pod_name,
        worker_count: parseInt(row.worker_count),
        busy_count: parseInt(row.busy_count),
        idle_count: parseInt(row.idle_count),
        total_completed: parseInt(row.total_completed) || 0,
        total_failed: parseInt(row.total_failed) || 0,
        total_memory_mb: parseInt(row.total_memory_mb) || 0,
        workers: row.workers
      }))
    });
  } catch (error: any) {
    res.status(500).json({ success: false, error: error.message });
  }
});

// ============================================================
// PREFLIGHT SMOKE TEST
// ============================================================

/**
 * POST /api/worker-registry/preflight-test
 * Run an HTTP (Puppeteer) preflight test and return results
 *
 * This is a smoke test endpoint to verify the preflight system works.
 * Returns IP, fingerprint data, bot detection results, and products fetched.
 */
router.post('/preflight-test', async (_req: Request, res: Response) => {
  try {
    console.log('[PreflightTest] Starting HTTP preflight smoke test...');

    // Create a temporary CrawlRotator for the test
    const crawlRotator = new CrawlRotator();

    // Run the Puppeteer preflight (with 1 retry)
    const startTime = Date.now();
    const result = await runPuppeteerPreflightWithRetry(crawlRotator, 1);
    const duration = Date.now() - startTime;

    console.log(`[PreflightTest] Completed in ${duration}ms - passed: ${result.passed}`);

    res.json({
      success: true,
      test: 'http_preflight',
      duration_ms: duration,
      result: {
        passed: result.passed,
        proxy_ip: result.proxyIp,
        fingerprint: result.fingerprint,
        bot_detection: result.botDetection,
        products_returned: result.productsReturned,
        browser_user_agent: result.browserUserAgent,
        ip_verified: result.ipVerified,
        proxy_available: result.proxyAvailable,
        proxy_connected: result.proxyConnected,
        antidetect_ready: result.antidetectReady,
        response_time_ms: result.responseTimeMs,
        error: result.error
      }
    });
  } catch (error: any) {
    console.error('[PreflightTest] Error:', error.message);
    res.status(500).json({
      success: false,
      test: 'http_preflight',
      error: error.message
    });
  }
});
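
// Sketch of exercising the smoke test above (endpoint path from this file;
// interpretation of a pass is an assumption): a passing run implies the proxy
// connected, the fingerprint looked clean, and products came back.
//
//   const res = await fetch('/api/worker-registry/preflight-test', { method: 'POST' });
//   const { result } = await res.json();
//   if (!result.passed) console.warn('preflight failed:', result.error);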

export default router;

@@ -4,10 +4,25 @@
 * Provider-agnostic worker management and job monitoring.
 * Replaces legacy /api/dutchie-az/admin/schedules and /api/dutchie-az/monitor/* routes.
 *
 * DEPRECATION NOTE (2025-12-12):
 * This file still queries job_schedules for backwards compatibility with
 * the /api/workers endpoints that display worker status. However, the
 * job_schedules table is DEPRECATED - all entries have been disabled.
 *
 * Schedule management has been consolidated into task_schedules:
 * - Use /api/tasks/schedules for schedule CRUD operations
 * - Use TasksDashboard.tsx (/admin/tasks) for schedule management UI
 * - task_schedules uses interval_hours (simpler than base_interval_minutes + jitter)
 *
 * The /api/workers endpoints remain useful for:
 * - Monitoring active workers and job status
 * - K8s scaling controls
 * - Job history and logs
 *
 * Endpoints:
 *   GET /api/workers - List all workers/schedules
 *   GET /api/workers/active - List currently active workers
 *   GET /api/workers/schedule - Get all job schedules
 *   GET /api/workers/schedule - Get all job schedules (DEPRECATED - use /api/tasks/schedules)
 *   GET /api/workers/:workerName - Get specific worker details
 *   GET /api/workers/:workerName/scope - Get worker's scope (states, etc.)
 *   GET /api/workers/:workerName/stats - Get worker statistics
@@ -17,13 +32,234 @@
 *   GET /api/monitor/jobs - Get recent job history
 *   GET /api/monitor/active-jobs - Get currently running jobs
 *   GET /api/monitor/summary - Get monitoring summary
 *
 * K8s Scaling (added 2024-12-10):
 *   GET /api/workers/k8s/replicas - Get current replica count
 *   POST /api/workers/k8s/scale - Scale worker replicas up/down
 */

import { Router, Request, Response } from 'express';
import { pool } from '../db/pool';
import * as k8s from '@kubernetes/client-node';

const router = Router();

// ============================================================
// K8S SCALING CONFIGURATION (added 2024-12-10)
// Per TASK_WORKFLOW_2024-12-10.md: Admin can scale workers from UI
// ============================================================

const K8S_NAMESPACE = process.env.K8S_NAMESPACE || 'dispensary-scraper';
const K8S_DEPLOYMENT_NAME = process.env.K8S_WORKER_DEPLOYMENT || 'scraper-worker';

// Initialize K8s client - uses in-cluster config when running in K8s,
// or kubeconfig when running locally
let k8sAppsApi: k8s.AppsV1Api | null = null;

function getK8sClient(): k8s.AppsV1Api | null {
  if (k8sAppsApi) return k8sAppsApi;

  try {
    const kc = new k8s.KubeConfig();

    // Try in-cluster config first (when running as a pod)
    // Falls back to default kubeconfig (~/.kube/config) for local dev
    try {
      kc.loadFromCluster();
    } catch {
      kc.loadFromDefault();
    }

    k8sAppsApi = kc.makeApiClient(k8s.AppsV1Api);
    return k8sAppsApi;
  } catch (err: any) {
    console.warn('[Workers] K8s client not available:', err.message);
    return null;
  }
}

// ============================================================
// K8S SCALING ROUTES (added 2024-12-10)
// Per TASK_WORKFLOW_2024-12-10.md: Admin can scale workers from UI
// ============================================================

/**
 * GET /api/workers/k8s/replicas - Get current worker replica count
 * Returns current and desired replica counts from the Deployment
 */
router.get('/k8s/replicas', async (_req: Request, res: Response) => {
  const client = getK8sClient();

  if (!client) {
    return res.status(503).json({
      success: false,
      error: 'K8s client not available (not running in cluster or no kubeconfig)',
      replicas: null,
    });
  }

  try {
    const response = await client.readNamespacedDeployment({
      name: K8S_DEPLOYMENT_NAME,
      namespace: K8S_NAMESPACE,
    });

    const deployment = response;
    res.json({
      success: true,
      replicas: {
        current: deployment.status?.readyReplicas || 0,
        desired: deployment.spec?.replicas || 0,
        available: deployment.status?.availableReplicas || 0,
        updated: deployment.status?.updatedReplicas || 0,
      },
      deployment: K8S_DEPLOYMENT_NAME,
      namespace: K8S_NAMESPACE,
    });
  } catch (err: any) {
    console.error('[Workers] K8s replicas error:', err.body?.message || err.message);
    res.status(500).json({
      success: false,
      error: err.body?.message || err.message,
    });
  }
});

/**
 * POST /api/workers/k8s/scale - Scale worker replicas
 * Body: { replicas: number } - desired replica count (0-20)
 */
router.post('/k8s/scale', async (req: Request, res: Response) => {
  const client = getK8sClient();

  if (!client) {
    return res.status(503).json({
      success: false,
      error: 'K8s client not available (not running in cluster or no kubeconfig)',
    });
  }

  const { replicas } = req.body;

  // Validate replica count
  if (typeof replicas !== 'number' || replicas < 0 || replicas > 20) {
    return res.status(400).json({
      success: false,
      error: 'replicas must be a number between 0 and 20',
    });
  }

  try {
    // Get current state first
    const currentResponse = await client.readNamespacedDeploymentScale({
      name: K8S_DEPLOYMENT_NAME,
      namespace: K8S_NAMESPACE,
    });
    const currentReplicas = currentResponse.spec?.replicas || 0;

    // Update scale using replaceNamespacedDeploymentScale
    await client.replaceNamespacedDeploymentScale({
      name: K8S_DEPLOYMENT_NAME,
      namespace: K8S_NAMESPACE,
      body: {
        apiVersion: 'autoscaling/v1',
        kind: 'Scale',
        metadata: {
          name: K8S_DEPLOYMENT_NAME,
          namespace: K8S_NAMESPACE,
        },
        spec: {
          replicas: replicas,
        },
      },
    });

    console.log(`[Workers] Scaled ${K8S_DEPLOYMENT_NAME} from ${currentReplicas} to ${replicas} replicas`);

    res.json({
      success: true,
      message: `Scaled from ${currentReplicas} to ${replicas} replicas`,
      previous: currentReplicas,
      desired: replicas,
      deployment: K8S_DEPLOYMENT_NAME,
      namespace: K8S_NAMESPACE,
    });
  } catch (err: any) {
    console.error('[Workers] K8s scale error:', err.body?.message || err.message);
    res.status(500).json({
      success: false,
      error: err.body?.message || err.message,
    });
  }
});
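
// Illustrative scale call (a sketch): the route validates 0-20, reads the
// current Scale subresource, then replaces it, mirroring
// `kubectl scale deployment/scraper-worker --replicas=N`.
//
//   await fetch('/api/workers/k8s/scale', {
//     method: 'POST',
//     headers: { 'Content-Type': 'application/json' },
//     body: JSON.stringify({ replicas: 5 }),
//   });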

/**
 * POST /api/workers/k8s/scale-up - Scale up worker replicas by 1
 * Convenience endpoint for adding a single worker
 */
router.post('/k8s/scale-up', async (_req: Request, res: Response) => {
  const client = getK8sClient();

  if (!client) {
    return res.status(503).json({
      success: false,
      error: 'K8s client not available (not running in cluster or no kubeconfig)',
    });
  }

  try {
    // Get current replica count
    const currentResponse = await client.readNamespacedDeploymentScale({
      name: K8S_DEPLOYMENT_NAME,
      namespace: K8S_NAMESPACE,
    });
    const currentReplicas = currentResponse.spec?.replicas || 0;
    const newReplicas = currentReplicas + 1;

    // Cap at 20 replicas
    if (newReplicas > 20) {
      return res.status(400).json({
        success: false,
        error: 'Maximum replica count (20) reached',
      });
    }

    // Scale up by 1
    await client.replaceNamespacedDeploymentScale({
      name: K8S_DEPLOYMENT_NAME,
      namespace: K8S_NAMESPACE,
      body: {
        apiVersion: 'autoscaling/v1',
        kind: 'Scale',
        metadata: {
          name: K8S_DEPLOYMENT_NAME,
          namespace: K8S_NAMESPACE,
        },
        spec: {
          replicas: newReplicas,
        },
      },
    });

    console.log(`[Workers] Scaled up ${K8S_DEPLOYMENT_NAME} from ${currentReplicas} to ${newReplicas} replicas`);

    res.json({
      success: true,
      message: `Added worker (${currentReplicas} → ${newReplicas} replicas)`,
      previous: currentReplicas,
      desired: newReplicas,
      deployment: K8S_DEPLOYMENT_NAME,
      namespace: K8S_NAMESPACE,
    });
  } catch (err: any) {
    console.error('[Workers] K8s scale-up error:', err.body?.message || err.message);
    res.status(500).json({
      success: false,
      error: err.body?.message || err.message,
    });
  }
});

// ============================================================
// STATIC ROUTES (must come before parameterized routes)
// ============================================================

@@ -16,10 +16,11 @@ import {
  executeGraphQL,
  startSession,
  endSession,
  getFingerprint,
  setCrawlRotator,
  GRAPHQL_HASHES,
  DUTCHIE_CONFIG,
} from '../platforms/dutchie';
import { CrawlRotator } from '../services/crawl-rotator';

dotenv.config();

@@ -108,19 +109,27 @@ async function main() {

  // ============================================================
  // STEP 2: Start stealth session
  // Per workflow-12102025.md: Initialize CrawlRotator and start session with menuUrl
  // ============================================================
  console.log('┌─────────────────────────────────────────────────────────────┐');
  console.log('│ STEP 2: Start Stealth Session │');
  console.log('└─────────────────────────────────────────────────────────────┘');

  // Use Arizona timezone for this store
  const session = startSession(disp.state || 'AZ', 'America/Phoenix');
  // Per workflow-12102025.md: Initialize CrawlRotator (required for sessions)
  const rotator = new CrawlRotator();
  setCrawlRotator(rotator);

  const fp = getFingerprint();
  // Per workflow-12102025.md: startSession takes menuUrl for dynamic Referer
  const session = startSession(disp.menu_url);

  const fp = session.fingerprint;
  console.log(` Session ID: ${session.sessionId}`);
  console.log(` Browser: ${fp.browserName} (${fp.deviceCategory})`);
  console.log(` User-Agent: ${fp.userAgent.slice(0, 60)}...`);
  console.log(` Accept-Language: ${fp.acceptLanguage}`);
  console.log(` Sec-CH-UA: ${fp.secChUa || '(not set)'}`);
  console.log(` Referer: ${session.referer}`);
  console.log(` DNT: ${fp.httpFingerprint.hasDNT ? 'enabled' : 'disabled'}`);
  console.log(` TLS: ${fp.httpFingerprint.curlImpersonateBinary}`);
  console.log('');

  // ============================================================

backend/src/scripts/import-proxies.ts (new file, 284 lines)
@@ -0,0 +1,284 @@

/**
 * Bulk Proxy Import Script
 *
 * Imports proxies from various formats into the proxies table.
 * Supports:
 * - Standard format: http://user:pass@host:port
 * - Colon format: http://host:port:user:pass
 * - Simple format: host:port:user:pass (defaults to http)
 *
 * Usage:
 *   npx tsx src/scripts/import-proxies.ts < proxies.txt
 *   echo "http://host:port:user:pass" | npx tsx src/scripts/import-proxies.ts
 *   npx tsx src/scripts/import-proxies.ts --file proxies.txt
 *   npx tsx src/scripts/import-proxies.ts --url "http://host:port:user:pass"
 *
 * Options:
 *   --file <path>       Read proxies from file (one per line)
 *   --url <url>         Import a single proxy URL
 *   --max-connections   Set max_connections for all imported proxies (default: 1)
 *   --dry-run           Parse and show what would be imported without inserting
 */

import { getPool } from '../db/pool';
import * as fs from 'fs';
import * as readline from 'readline';

interface ParsedProxy {
  protocol: string;
  host: string;
  port: number;
  username?: string;
  password?: string;
  rawUrl: string;
}

/**
 * Parse a proxy URL in various formats
 */
function parseProxyUrl(input: string): ParsedProxy | null {
  const trimmed = input.trim();
  if (!trimmed || trimmed.startsWith('#')) return null;

  // Format 1: Standard URL format - http://user:pass@host:port
  const standardMatch = trimmed.match(/^(https?|socks5):\/\/([^:]+):([^@]+)@([^:]+):(\d+)$/);
  if (standardMatch) {
    return {
      protocol: standardMatch[1],
      username: standardMatch[2],
      password: standardMatch[3],
      host: standardMatch[4],
      port: parseInt(standardMatch[5], 10),
      rawUrl: trimmed,
    };
  }

  // Format 2: Standard URL without auth - http://host:port
  const noAuthMatch = trimmed.match(/^(https?|socks5):\/\/([^:]+):(\d+)$/);
  if (noAuthMatch) {
    return {
      protocol: noAuthMatch[1],
      host: noAuthMatch[2],
      port: parseInt(noAuthMatch[3], 10),
      rawUrl: trimmed,
    };
  }

  // Format 3: Colon format with protocol - http://host:port:user:pass
  const colonWithProtocolMatch = trimmed.match(/^(https?|socks5):\/\/([^:]+):(\d+):([^:]+):(.+)$/);
  if (colonWithProtocolMatch) {
    return {
      protocol: colonWithProtocolMatch[1],
      host: colonWithProtocolMatch[2],
      port: parseInt(colonWithProtocolMatch[3], 10),
      username: colonWithProtocolMatch[4],
      password: colonWithProtocolMatch[5],
      rawUrl: trimmed, // Keep raw URL for non-standard format
    };
  }

  // Format 4: Colon format without protocol - host:port:user:pass
  const colonMatch = trimmed.match(/^([^:]+):(\d+):([^:]+):(.+)$/);
  if (colonMatch) {
    return {
      protocol: 'http',
      host: colonMatch[1],
      port: parseInt(colonMatch[2], 10),
      username: colonMatch[3],
      password: colonMatch[4],
      rawUrl: `http://${trimmed}`, // Construct raw URL
    };
  }

  // Format 5: Simple host:port
  const simpleMatch = trimmed.match(/^([^:]+):(\d+)$/);
  if (simpleMatch) {
    return {
      protocol: 'http',
      host: simpleMatch[1],
      port: parseInt(simpleMatch[2], 10),
      rawUrl: `http://${trimmed}`,
    };
  }

  console.error(`[ImportProxies] Could not parse: ${trimmed}`);
  return null;
}
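
// Illustrative inputs for the parser above and the formats they hit
// (hosts and credentials are made up):
//
//   parseProxyUrl('http://alice:s3cret@10.0.0.5:8080');   // Format 1
//   parseProxyUrl('socks5://10.0.0.5:1080');              // Format 2
//   parseProxyUrl('http://10.0.0.5:8080:alice:s3cret');   // Format 3 (colon)
//   parseProxyUrl('10.0.0.5:8080:alice:s3cret');          // Format 4, defaults to http
//   parseProxyUrl('10.0.0.5:8080');                       // Format 5
//   parseProxyUrl('# comment');                           // null (skipped)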
|
||||
|
||||
/**
|
||||
* Check if proxy URL is in non-standard format (needs proxy_url column)
|
||||
*/
|
||||
function isNonStandardFormat(rawUrl: string): boolean {
|
||||
// Colon format: protocol://host:port:user:pass
|
||||
return /^(https?|socks5):\/\/[^:]+:\d+:[^:]+:.+$/.test(rawUrl);
|
||||
}

async function importProxies(proxies: ParsedProxy[], maxConnections: number, dryRun: boolean) {
  if (dryRun) {
    console.log('\n[ImportProxies] DRY RUN - Would import:');
    for (const p of proxies) {
      const needsRawUrl = isNonStandardFormat(p.rawUrl);
      console.log(`  ${p.host}:${p.port} (${p.protocol}) user=${p.username || 'none'} needsProxyUrl=${needsRawUrl}`);
    }
    console.log(`\nTotal: ${proxies.length} proxies`);
    return;
  }

  const pool = getPool();
  let inserted = 0;
  let skipped = 0;

  for (const proxy of proxies) {
    try {
      // Determine if we need to store the raw URL (non-standard format)
      const needsRawUrl = isNonStandardFormat(proxy.rawUrl);

      // Use different conflict resolution based on format:
      // - Non-standard format: unique by proxy_url (session-based residential proxies)
      // - Standard format: unique by host/port/protocol
      const query = needsRawUrl
        ? `
          INSERT INTO proxies (host, port, protocol, username, password, max_connections, proxy_url, active)
          VALUES ($1, $2, $3, $4, $5, $6, $7, true)
          ON CONFLICT (proxy_url) WHERE proxy_url IS NOT NULL
          DO UPDATE SET
            max_connections = EXCLUDED.max_connections,
            active = true,
            updated_at = NOW()
          RETURNING id, (xmax = 0) as is_insert
        `
        : `
          INSERT INTO proxies (host, port, protocol, username, password, max_connections, proxy_url, active)
          VALUES ($1, $2, $3, $4, $5, $6, $7, true)
          ON CONFLICT (host, port, protocol)
          DO UPDATE SET
            username = EXCLUDED.username,
            password = EXCLUDED.password,
            max_connections = EXCLUDED.max_connections,
            proxy_url = EXCLUDED.proxy_url,
            active = true,
            updated_at = NOW()
          RETURNING id, (xmax = 0) as is_insert
        `;

      const result = await pool.query(query, [
        proxy.host,
        proxy.port,
        proxy.protocol,
        proxy.username || null,
        proxy.password || null,
        maxConnections,
        needsRawUrl ? proxy.rawUrl : null,
      ]);

      const isInsert = result.rows[0]?.is_insert;
      const sessionId = proxy.password?.match(/session-([A-Z0-9]+)/)?.[1] || '';
      const displayName = sessionId ? `session ${sessionId}` : `${proxy.host}:${proxy.port}`;

      if (isInsert) {
        inserted++;
        console.log(`[ImportProxies] Inserted: ${displayName}`);
      } else {
        console.log(`[ImportProxies] Updated: ${displayName}`);
        inserted++; // Count updates too
      }
    } catch (err: any) {
      const sessionId = proxy.password?.match(/session-([A-Z0-9]+)/)?.[1] || '';
      const displayName = sessionId ? `session ${sessionId}` : `${proxy.host}:${proxy.port}`;
      console.error(`[ImportProxies] Error inserting ${displayName}: ${err.message}`);
      skipped++;
    }
  }

  console.log(`\n[ImportProxies] Complete: ${inserted} imported, ${skipped} skipped`);

  // Notify any listening workers
  try {
    await pool.query(`NOTIFY proxy_added, 'bulk import'`);
    console.log('[ImportProxies] Sent proxy_added notification to workers');
  } catch {
    // Ignore notification errors
  }
}
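
// Both upsert branches in importProxies above rely on a matching unique
// constraint, and the RETURNING (xmax = 0) trick reports whether the row was
// freshly inserted (xmax is 0) or conflict-updated. Below is a minimal sketch
// of indexes that would satisfy the two ON CONFLICT targets; the index names
// are assumptions, not taken from the repo's migrations.
async function ensureProxyIndexes(): Promise<void> {
  const pool = getPool();
  // Partial unique index backing ON CONFLICT (proxy_url) WHERE proxy_url IS NOT NULL;
  // rows with NULL proxy_url are exempt, so standard-format proxies can coexist.
  await pool.query(`
    CREATE UNIQUE INDEX IF NOT EXISTS proxies_proxy_url_uniq
    ON proxies (proxy_url) WHERE proxy_url IS NOT NULL
  `);
  // Composite unique index backing ON CONFLICT (host, port, protocol).
  await pool.query(`
    CREATE UNIQUE INDEX IF NOT EXISTS proxies_host_port_protocol_uniq
    ON proxies (host, port, protocol)
  `);
}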

async function readFromStdin(): Promise<string[]> {
  return new Promise((resolve) => {
    const lines: string[] = [];
    const rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout,
      terminal: false,
    });

    rl.on('line', (line) => {
      lines.push(line);
    });

    rl.on('close', () => {
      resolve(lines);
    });
  });
}

async function main() {
  const args = process.argv.slice(2);
  let lines: string[] = [];
  let maxConnections = 1;
  let dryRun = false;

  // Parse arguments
  for (let i = 0; i < args.length; i++) {
    if (args[i] === '--file' && args[i + 1]) {
      const content = fs.readFileSync(args[i + 1], 'utf-8');
      lines.push(...content.split('\n'));
      i++;
    } else if (args[i] === '--url' && args[i + 1]) {
      lines.push(args[i + 1]);
      i++;
    } else if (args[i] === '--max-connections' && args[i + 1]) {
      maxConnections = parseInt(args[i + 1], 10);
      i++;
    } else if (args[i] === '--dry-run') {
      dryRun = true;
    } else if (!args[i].startsWith('--')) {
      // Treat as URL directly
      lines.push(args[i]);
    }
  }

  // If no lines yet, read from stdin
  if (lines.length === 0) {
    console.log('[ImportProxies] Reading from stdin...');
    lines = await readFromStdin();
  }

  // Parse all lines
  const proxies: ParsedProxy[] = [];
  for (const line of lines) {
    const parsed = parseProxyUrl(line);
    if (parsed) {
      proxies.push(parsed);
    }
  }

  if (proxies.length === 0) {
    console.error('[ImportProxies] No valid proxies found');
    console.error('\nUsage:');
    console.error('  npx tsx src/scripts/import-proxies.ts --url "http://host:port:user:pass"');
    console.error('  npx tsx src/scripts/import-proxies.ts --file proxies.txt');
    console.error('  echo "host:port:user:pass" | npx tsx src/scripts/import-proxies.ts');
    console.error('\nSupported formats:');
    console.error('  http://user:pass@host:port   (standard)');
    console.error('  http://host:port:user:pass   (colon format)');
    console.error('  host:port:user:pass          (simple)');
    process.exit(1);
  }

  console.log(`[ImportProxies] Parsed ${proxies.length} proxies (max_connections=${maxConnections})`);
  await importProxies(proxies, maxConnections, dryRun);
}

main().catch((err) => {
  console.error('[ImportProxies] Fatal error:', err);
  process.exit(1);
});
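
The NOTIFY at the end of importProxies only matters if a worker is listening on the same channel. A hedged sketch of the listening side, using the pg driver directly (the channel name matches the script; the callback wiring is illustrative):

import { Client } from 'pg';

async function listenForProxyAdds(connectionString: string, onAdded: (payload?: string) => void) {
  const client = new Client({ connectionString });
  await client.connect();
  await client.query('LISTEN proxy_added');
  client.on('notification', (msg) => {
    if (msg.channel === 'proxy_added') {
      onAdded(msg.payload); // payload is 'bulk import' when sent by this script
    }
  });
}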
@@ -1,10 +1,10 @@
/**
 * Test script for stealth session management
 *
 * Tests:
 * 1. Per-session fingerprint rotation
 * 2. Geographic consistency (timezone → Accept-Language)
 * 3. Proxy location loading from database
 * Per workflow-12102025.md:
 * - Tests HTTP fingerprinting (browser-specific headers + ordering)
 * - Tests UA generation (device distribution, browser filtering)
 * - Tests dynamic Referer per dispensary
 *
 * Usage:
 *   npx tsx src/scripts/test-stealth-session.ts
@@ -14,104 +14,142 @@ import {
  startSession,
  endSession,
  getCurrentSession,
  getFingerprint,
  getRandomFingerprint,
  getLocaleForTimezone,
  buildHeaders,
  setCrawlRotator,
} from '../platforms/dutchie';

import { CrawlRotator } from '../services/crawl-rotator';
import {
  generateHTTPFingerprint,
  buildRefererFromMenuUrl,
  BrowserType,
} from '../services/http-fingerprint';

console.log('='.repeat(60));
console.log('STEALTH SESSION TEST');
console.log('STEALTH SESSION TEST (per workflow-12102025.md)');
console.log('='.repeat(60));

// Test 1: Timezone to Locale mapping
console.log('\n[Test 1] Timezone to Locale Mapping:');
const testTimezones = [
  'America/Phoenix',
  'America/Los_Angeles',
  'America/New_York',
  'America/Chicago',
// Initialize CrawlRotator (required for sessions)
console.log('\n[Setup] Initializing CrawlRotator...');
const rotator = new CrawlRotator();
setCrawlRotator(rotator);
console.log('  CrawlRotator initialized');

// Test 1: HTTP Fingerprint Generation
console.log('\n[Test 1] HTTP Fingerprint Generation:');
const browsers: BrowserType[] = ['Chrome', 'Firefox', 'Safari', 'Edge'];

for (const browser of browsers) {
  const httpFp = generateHTTPFingerprint(browser);
  console.log(`  ${browser}:`);
  console.log(`    TLS binary: ${httpFp.curlImpersonateBinary}`);
  console.log(`    DNT: ${httpFp.hasDNT ? 'enabled' : 'disabled'}`);
  console.log(`    Header order: ${httpFp.headerOrder.slice(0, 5).join(', ')}...`);
}

// Test 2: Dynamic Referer from menu URLs
console.log('\n[Test 2] Dynamic Referer from Menu URLs:');
const testUrls = [
  'https://dutchie.com/embedded-menu/harvest-of-tempe',
  'https://dutchie.com/dispensary/zen-leaf-mesa',
  '/embedded-menu/deeply-rooted',
  '/dispensary/curaleaf-phoenix',
  null,
  undefined,
  'Invalid/Timezone',
];

for (const tz of testTimezones) {
  const locale = getLocaleForTimezone(tz);
  console.log(`  ${tz || '(undefined)'} → ${locale}`);
for (const url of testUrls) {
  const referer = buildRefererFromMenuUrl(url);
  console.log(`  ${url || '(null/undefined)'}`);
  console.log(`    → ${referer}`);
}

// Test 2: Random fingerprint selection
console.log('\n[Test 2] Random Fingerprint Selection (5 samples):');
for (let i = 0; i < 5; i++) {
  const fp = getRandomFingerprint();
  console.log(`  ${i + 1}. ${fp.userAgent.slice(0, 60)}...`);
// Test 3: Session with Dynamic Referer
console.log('\n[Test 3] Session with Dynamic Referer:');
const testMenuUrl = 'https://dutchie.com/dispensary/harvest-of-tempe';
console.log(`  Starting session with menuUrl: ${testMenuUrl}`);

const session1 = startSession(testMenuUrl);
console.log(`  Session ID: ${session1.sessionId}`);
console.log(`  Browser: ${session1.fingerprint.browserName}`);
console.log(`  Device: ${session1.fingerprint.deviceCategory}`);
console.log(`  Referer: ${session1.referer}`);
console.log(`  DNT: ${session1.fingerprint.httpFingerprint.hasDNT ? 'enabled' : 'disabled'}`);
console.log(`  TLS: ${session1.fingerprint.httpFingerprint.curlImpersonateBinary}`);

// Test 4: Build Headers (browser-specific order)
console.log('\n[Test 4] Build Headers (browser-specific order):');
const { headers, orderedHeaders } = buildHeaders(true, 1000);
console.log(`  Headers built for ${session1.fingerprint.browserName}:`);
console.log(`  Order: ${orderedHeaders.join(' → ')}`);
console.log(`  Sample headers:`);
console.log(`    User-Agent: ${headers['User-Agent']?.slice(0, 50)}...`);
console.log(`    Accept: ${headers['Accept']}`);
console.log(`    Accept-Language: ${headers['Accept-Language']}`);
console.log(`    Referer: ${headers['Referer']}`);
if (headers['sec-ch-ua']) {
  console.log(`    sec-ch-ua: ${headers['sec-ch-ua']}`);
}
if (headers['DNT']) {
  console.log(`    DNT: ${headers['DNT']}`);
}

// Test 3: Session Management
console.log('\n[Test 3] Session Management:');

// Before session - should use default fingerprint
console.log('  Before session:');
const beforeFp = getFingerprint();
console.log(`    getFingerprint(): ${beforeFp.userAgent.slice(0, 50)}...`);
console.log(`    getCurrentSession(): ${getCurrentSession()}`);

// Start session with Arizona timezone
console.log('\n  Starting session (AZ, America/Phoenix):');
const session1 = startSession('AZ', 'America/Phoenix');
console.log(`    Session ID: ${session1.sessionId}`);
console.log(`    Fingerprint UA: ${session1.fingerprint.userAgent.slice(0, 50)}...`);
console.log(`    Accept-Language: ${session1.fingerprint.acceptLanguage}`);
console.log(`    Timezone: ${session1.timezone}`);

// During session - should use session fingerprint
console.log('\n  During session:');
const duringFp = getFingerprint();
console.log(`    getFingerprint(): ${duringFp.userAgent.slice(0, 50)}...`);
console.log(`    Same as session? ${duringFp.userAgent === session1.fingerprint.userAgent}`);

// Test buildHeaders with session
console.log('\n  buildHeaders() during session:');
const headers = buildHeaders('/embedded-menu/test-store');
console.log(`    User-Agent: ${headers['user-agent'].slice(0, 50)}...`);
console.log(`    Accept-Language: ${headers['accept-language']}`);
console.log(`    Origin: ${headers['origin']}`);
console.log(`    Referer: ${headers['referer']}`);

// End session
console.log('\n  Ending session:');
endSession();
console.log(`    getCurrentSession(): ${getCurrentSession()}`);

// Test 4: Multiple sessions should have different fingerprints
console.log('\n[Test 4] Multiple Sessions (fingerprint variety):');
const fingerprints: string[] = [];
// Test 5: Multiple Sessions (UA variety)
console.log('\n[Test 5] Multiple Sessions (UA & fingerprint variety):');
const sessions: {
  browser: string;
  device: string;
  hasDNT: boolean;
}[] = [];

for (let i = 0; i < 10; i++) {
  const session = startSession('CA', 'America/Los_Angeles');
  fingerprints.push(session.fingerprint.userAgent);
  const session = startSession(`/dispensary/store-${i}`);
  sessions.push({
    browser: session.fingerprint.browserName,
    device: session.fingerprint.deviceCategory,
    hasDNT: session.fingerprint.httpFingerprint.hasDNT,
  });
  endSession();
}

const uniqueCount = new Set(fingerprints).size;
console.log(`  10 sessions created, ${uniqueCount} unique fingerprints`);
console.log(`  Variety: ${uniqueCount >= 3 ? '✅ Good' : '⚠️ Low - may need more fingerprint options'}`);
// Count distribution
const browserCounts: Record<string, number> = {};
const deviceCounts: Record<string, number> = {};
let dntCount = 0;

// Test 5: Geographic consistency check
console.log('\n[Test 5] Geographic Consistency:');
const geoTests = [
  { state: 'AZ', tz: 'America/Phoenix' },
  { state: 'CA', tz: 'America/Los_Angeles' },
  { state: 'NY', tz: 'America/New_York' },
  { state: 'IL', tz: 'America/Chicago' },
];
for (const s of sessions) {
  browserCounts[s.browser] = (browserCounts[s.browser] || 0) + 1;
  deviceCounts[s.device] = (deviceCounts[s.device] || 0) + 1;
  if (s.hasDNT) dntCount++;
}

for (const { state, tz } of geoTests) {
  const session = startSession(state, tz);
  const consistent = session.fingerprint.acceptLanguage.includes('en-US');
  console.log(`  ${state} (${tz}): Accept-Language=${session.fingerprint.acceptLanguage} ${consistent ? '✅' : '❌'}`);
console.log(`  10 sessions created:`);
console.log(`    Browsers: ${JSON.stringify(browserCounts)}`);
console.log(`    Devices: ${JSON.stringify(deviceCounts)}`);
console.log(`    DNT enabled: ${dntCount}/10 (expected ~30%)`);

// Test 6: Device distribution check (per workflow-12102025.md: 62/36/2)
console.log('\n[Test 6] Device Distribution (larger sample):');
const deviceSamples: string[] = [];

for (let i = 0; i < 100; i++) {
  const session = startSession();
  deviceSamples.push(session.fingerprint.deviceCategory);
  endSession();
}

const mobileCount = deviceSamples.filter(d => d === 'mobile').length;
const desktopCount = deviceSamples.filter(d => d === 'desktop').length;
const tabletCount = deviceSamples.filter(d => d === 'tablet').length;

console.log(`  100 sessions (expected: 62% mobile, 36% desktop, 2% tablet):`);
console.log(`    Mobile: ${mobileCount}%`);
console.log(`    Desktop: ${desktopCount}%`);
console.log(`    Tablet: ${tabletCount}%`);
console.log(`  Distribution: ${Math.abs(mobileCount - 62) < 15 && Math.abs(desktopCount - 36) < 15 ? '✅ Reasonable' : '⚠️ Off target'}`);

console.log('\n' + '='.repeat(60));
console.log('TEST COMPLETE');
console.log('='.repeat(60));
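
Test 6 expects a 62/36/2 mobile/desktop/tablet split, which implies weighted random sampling along the lines of the sketch below (an illustration of the idea only; the repo's actual UA generator may implement it differently):

type DeviceCategory = 'mobile' | 'desktop' | 'tablet';

// Target distribution per workflow-12102025.md: 62% mobile, 36% desktop, 2% tablet.
const DEVICE_WEIGHTS: Array<[DeviceCategory, number]> = [
  ['mobile', 0.62],
  ['desktop', 0.36],
  ['tablet', 0.02],
];

function pickDeviceCategory(): DeviceCategory {
  let r = Math.random();
  for (const [device, weight] of DEVICE_WEIGHTS) {
    r -= weight;
    if (r < 0) return device;
  }
  return 'desktop'; // guard against floating-point rounding
}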
backend/src/services/analytics/BrandIntelligenceService.ts (new file, 1202 lines): file diff suppressed because it is too large.
Some files were not shown because too many files have changed in this diff.