Compare commits


1 Commit

Author   SHA1         Message                                               Date
kelly    2e22b439e0   Merge branch 'master' into fix/analytics-v2-queries   2025-12-11 02:55:13 +00:00
143 changed files with 4335 additions and 14744 deletions


@@ -1,3 +1,6 @@
+when:
+  - event: [push, pull_request]
 steps:
   # ===========================================
   # PR VALIDATION: Parallel type checks (PRs only)
@@ -69,7 +72,6 @@ steps:
   # ===========================================
   # MASTER DEPLOY: Parallel Docker builds
-  # NOTE: cache_from/cache_to removed due to plugin bug splitting on commas
   # ===========================================
   docker-backend:
     image: woodpeckerci/plugin-docker-buildx
@@ -161,7 +163,7 @@ steps:
       event: push
   # ===========================================
-  # STAGE 3: Deploy and Run Migrations
+  # STAGE 3: Deploy (after Docker builds)
   # ===========================================
   deploy:
     image: bitnami/kubectl:latest
@@ -172,17 +174,12 @@ steps:
       - mkdir -p ~/.kube
       - echo "$KUBECONFIG_CONTENT" | tr -d '[:space:]' | base64 -d > ~/.kube/config
       - chmod 600 ~/.kube/config
-      # Deploy backend first
       - kubectl set image deployment/scraper scraper=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
-      - kubectl rollout status deployment/scraper -n dispensary-scraper --timeout=300s
-      # Note: Migrations run automatically at startup via auto-migrate
-      # Deploy remaining services
-      # Resilience: ensure workers are scaled up if at 0
-      - REPLICAS=$(kubectl get deployment scraper-worker -n dispensary-scraper -o jsonpath='{.spec.replicas}'); if [ "$REPLICAS" = "0" ]; then echo "Scaling workers from 0 to 5"; kubectl scale deployment/scraper-worker --replicas=5 -n dispensary-scraper; fi
       - kubectl set image deployment/scraper-worker worker=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
       - kubectl set image deployment/cannaiq-frontend cannaiq-frontend=code.cannabrands.app/creationshop/cannaiq-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
       - kubectl set image deployment/findadispo-frontend findadispo-frontend=code.cannabrands.app/creationshop/findadispo-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
       - kubectl set image deployment/findagram-frontend findagram-frontend=code.cannabrands.app/creationshop/findagram-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
+      - kubectl rollout status deployment/scraper -n dispensary-scraper --timeout=300s
       - kubectl rollout status deployment/cannaiq-frontend -n dispensary-scraper --timeout=120s
     depends_on:
       - docker-backend


@@ -1,191 +0,0 @@
steps:
  # ===========================================
  # PR VALIDATION: Only typecheck changed projects
  # ===========================================
  typecheck-backend:
    image: code.cannabrands.app/creationshop/node:20
    commands:
      - npm config set cache /npm-cache/backend --global
      - cd backend
      - npm ci --prefer-offline
      - npx tsc --noEmit
    volumes:
      - npm-cache:/npm-cache
    depends_on: []
    when:
      event: pull_request
      path:
        include: ['backend/**']
  typecheck-cannaiq:
    image: code.cannabrands.app/creationshop/node:20
    commands:
      - npm config set cache /npm-cache/cannaiq --global
      - cd cannaiq
      - npm ci --prefer-offline
      - npx tsc --noEmit
    volumes:
      - npm-cache:/npm-cache
    depends_on: []
    when:
      event: pull_request
      path:
        include: ['cannaiq/**']
  # findadispo/findagram typechecks skipped - they have || true anyway
  # ===========================================
  # AUTO-MERGE: Merge PR after all checks pass
  # ===========================================
  auto-merge:
    image: alpine:latest
    environment:
      GITEA_TOKEN:
        from_secret: gitea_token
    commands:
      - apk add --no-cache curl
      - |
        echo "Merging PR #${CI_COMMIT_PULL_REQUEST}..."
        curl -s -X POST \
          -H "Authorization: token $GITEA_TOKEN" \
          -H "Content-Type: application/json" \
          -d '{"Do":"merge"}' \
          "https://code.cannabrands.app/api/v1/repos/Creationshop/dispensary-scraper/pulls/${CI_COMMIT_PULL_REQUEST}/merge"
    depends_on:
      - typecheck-backend
      - typecheck-cannaiq
    when:
      event: pull_request
  # ===========================================
  # MASTER DEPLOY: Parallel Docker builds
  # ===========================================
  docker-backend:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/dispensary-scraper
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: backend/Dockerfile
      context: backend
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
      cache_from: type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache
      cache_to: type=registry,ref=code.cannabrands.app/creationshop/dispensary-scraper:cache,mode=max
      build_args:
        APP_BUILD_VERSION: ${CI_COMMIT_SHA:0:8}
        APP_GIT_SHA: ${CI_COMMIT_SHA}
        APP_BUILD_TIME: ${CI_PIPELINE_CREATED}
        CONTAINER_IMAGE_TAG: ${CI_COMMIT_SHA:0:8}
    depends_on: []
    when:
      branch: master
      event: push
  docker-cannaiq:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/cannaiq-frontend
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: cannaiq/Dockerfile
      context: cannaiq
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
      cache_from: type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache
      cache_to: type=registry,ref=code.cannabrands.app/creationshop/cannaiq-frontend:cache,mode=max
    depends_on: []
    when:
      branch: master
      event: push
  docker-findadispo:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/findadispo-frontend
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: findadispo/frontend/Dockerfile
      context: findadispo/frontend
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
      cache_from: type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache
      cache_to: type=registry,ref=code.cannabrands.app/creationshop/findadispo-frontend:cache,mode=max
    depends_on: []
    when:
      branch: master
      event: push
  docker-findagram:
    image: woodpeckerci/plugin-docker-buildx
    settings:
      registry: code.cannabrands.app
      repo: code.cannabrands.app/creationshop/findagram-frontend
      tags:
        - latest
        - ${CI_COMMIT_SHA:0:8}
      dockerfile: findagram/frontend/Dockerfile
      context: findagram/frontend
      username:
        from_secret: registry_username
      password:
        from_secret: registry_password
      platforms: linux/amd64
      provenance: false
      cache_from: type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache
      cache_to: type=registry,ref=code.cannabrands.app/creationshop/findagram-frontend:cache,mode=max
    depends_on: []
    when:
      branch: master
      event: push
  # ===========================================
  # STAGE 3: Deploy and Run Migrations
  # ===========================================
  deploy:
    image: bitnami/kubectl:latest
    environment:
      KUBECONFIG_CONTENT:
        from_secret: kubeconfig_data
    commands:
      - mkdir -p ~/.kube
      - echo "$KUBECONFIG_CONTENT" | tr -d '[:space:]' | base64 -d > ~/.kube/config
      - chmod 600 ~/.kube/config
      # Deploy backend first
      - kubectl set image deployment/scraper scraper=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl rollout status deployment/scraper -n dispensary-scraper --timeout=300s
      # Note: Migrations run automatically at startup via auto-migrate
      # Deploy remaining services
      # Resilience: ensure workers are scaled up if at 0
      - REPLICAS=$(kubectl get deployment scraper-worker -n dispensary-scraper -o jsonpath='{.spec.replicas}'); if [ "$REPLICAS" = "0" ]; then echo "Scaling workers from 0 to 5"; kubectl scale deployment/scraper-worker --replicas=5 -n dispensary-scraper; fi
      - kubectl set image deployment/scraper-worker worker=code.cannabrands.app/creationshop/dispensary-scraper:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/cannaiq-frontend cannaiq-frontend=code.cannabrands.app/creationshop/cannaiq-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/findadispo-frontend findadispo-frontend=code.cannabrands.app/creationshop/findadispo-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl set image deployment/findagram-frontend findagram-frontend=code.cannabrands.app/creationshop/findagram-frontend:${CI_COMMIT_SHA:0:8} -n dispensary-scraper
      - kubectl rollout status deployment/cannaiq-frontend -n dispensary-scraper --timeout=120s
    depends_on:
      - docker-backend
      - docker-cannaiq
      - docker-findadispo
      - docker-findagram
    when:
      branch: master
      event: push

CLAUDE.md (1535 changes)

File diff suppressed because it is too large.


@@ -25,9 +25,8 @@ ENV APP_GIT_SHA=${APP_GIT_SHA}
 ENV APP_BUILD_TIME=${APP_BUILD_TIME}
 ENV CONTAINER_IMAGE_TAG=${CONTAINER_IMAGE_TAG}
-# Install Chromium dependencies and curl for HTTP requests
+# Install Chromium dependencies
 RUN apt-get update && apt-get install -y \
-    curl \
     chromium \
     fonts-liberation \
     libnss3 \


@@ -1,218 +0,0 @@
# CannaiQ Backend Codebase Map
**Last Updated:** 2025-12-12
**Purpose:** Help Claude and developers understand which code is current vs deprecated
---
## Quick Reference: What to Use
### For Crawling/Scraping
| Task | Use This | NOT This |
|------|----------|----------|
| Fetch products | `src/tasks/handlers/payload-fetch.ts` | `src/hydration/*` |
| Process products | `src/tasks/handlers/product-refresh.ts` | `src/scraper-v2/*` |
| GraphQL client | `src/platforms/dutchie/client.ts` | `src/dutchie-az/services/graphql-client.ts` |
| Worker system | `src/tasks/task-worker.ts` | `src/dutchie-az/services/worker.ts` |
### For Database
| Task | Use This | NOT This |
|------|----------|----------|
| Get DB pool | `src/db/pool.ts` | `src/dutchie-az/db/connection.ts` |
| Run migrations | `src/db/migrate.ts` (CLI only) | Never import at runtime |
| Query products | `store_products` table | `products`, `dutchie_products` |
| Query stores | `dispensaries` table | `stores` table |
### For Discovery
| Task | Use This |
|------|----------|
| Discover stores | `src/discovery/*.ts` |
| Run discovery | `npx tsx src/scripts/run-discovery.ts` |
---
## Directory Status
### ACTIVE DIRECTORIES (Use These)
```
src/
├── auth/ # JWT/session auth, middleware
├── db/ # Database pool, migrations
├── discovery/ # Dutchie store discovery pipeline
├── middleware/ # Express middleware
├── multi-state/ # Multi-state query support
├── platforms/ # Platform-specific clients (Dutchie, Jane, etc)
│ └── dutchie/ # THE Dutchie client - use this one
├── routes/ # Express API routes
├── services/ # Core services (logger, scheduler, etc)
├── tasks/ # Task system (workers, handlers, scheduler)
│ └── handlers/ # Task handlers (payload_fetch, product_refresh, etc)
├── types/ # TypeScript types
└── utils/ # Utilities (storage, image processing)
```
### DEPRECATED DIRECTORIES (DO NOT USE)
```
src/
├── hydration/ # DEPRECATED - Old pipeline approach
├── scraper-v2/ # DEPRECATED - Old scraper engine
├── canonical-hydration/# DEPRECATED - Merged into tasks/handlers
├── dutchie-az/ # PARTIAL - Some parts deprecated, some active
│ ├── db/ # DEPRECATED - Use src/db/pool.ts
│ └── services/ # PARTIAL - worker.ts still runs, graphql-client.ts deprecated
├── portals/ # FUTURE - Not yet implemented
├── seo/ # PARTIAL - Settings work, templates WIP
└── system/ # DEPRECATED - Old orchestration system
```
### DEPRECATED FILES (DO NOT USE)
```
src/dutchie-az/db/connection.ts # Use src/db/pool.ts instead
src/dutchie-az/services/graphql-client.ts # Use src/platforms/dutchie/client.ts
src/hydration/*.ts # Entire directory deprecated
src/scraper-v2/*.ts # Entire directory deprecated
```
---
## Key Files Reference
### Entry Points
| File | Purpose | Status |
|------|---------|--------|
| `src/index.ts` | Main Express server | ACTIVE |
| `src/dutchie-az/services/worker.ts` | Worker process entry | ACTIVE |
| `src/tasks/task-worker.ts` | Task worker (new system) | ACTIVE |
### Dutchie Integration
| File | Purpose | Status |
|------|---------|--------|
| `src/platforms/dutchie/client.ts` | GraphQL client, hashes, curl | **PRIMARY** |
| `src/platforms/dutchie/queries.ts` | High-level query functions | ACTIVE |
| `src/platforms/dutchie/index.ts` | Re-exports | ACTIVE |
### Task Handlers
| File | Purpose | Status |
|------|---------|--------|
| `src/tasks/handlers/payload-fetch.ts` | Fetch products from Dutchie | **PRIMARY** |
| `src/tasks/handlers/product-refresh.ts` | Process payload into DB | **PRIMARY** |
| `src/tasks/handlers/menu-detection.ts` | Detect menu type | ACTIVE |
| `src/tasks/handlers/id-resolution.ts` | Resolve platform IDs | ACTIVE |
| `src/tasks/handlers/image-download.ts` | Download product images | ACTIVE |
### Database
| File | Purpose | Status |
|------|---------|--------|
| `src/db/pool.ts` | Canonical DB pool | **PRIMARY** |
| `src/db/migrate.ts` | Migration runner (CLI only) | CLI ONLY |
| `src/db/auto-migrate.ts` | Auto-run migrations on startup | ACTIVE |
### Configuration
| File | Purpose | Status |
|------|---------|--------|
| `.env` | Environment variables | ACTIVE |
| `package.json` | Dependencies | ACTIVE |
| `tsconfig.json` | TypeScript config | ACTIVE |
---
## GraphQL Hashes (CRITICAL)
The correct hashes are in `src/platforms/dutchie/client.ts`:
```typescript
export const GRAPHQL_HASHES = {
  FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
  GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
  ConsumerDispensaries: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
  GetAllCitiesByState: 'ae547a0466ace5a48f91e55bf6699eacd87e3a42841560f0c0eabed5a0a920e6',
};
```
**ALWAYS** use `Status: 'Active'` for FilteredProducts (not `null` or `'All'`).
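For illustration, a minimal sketch of a persisted-query request built from these hashes. The request shape mirrors the organic scraping guide; the import path is illustrative, and the production client in `src/platforms/dutchie/client.ts` remains the source of truth:
```typescript
// Sketch only - endpoint and parameters follow examples elsewhere in these docs.
import { GRAPHQL_HASHES } from './client';

async function fetchFilteredProductsPage(platformId: string, page = 0) {
  const variables = {
    productsFilter: {
      dispensaryId: platformId,
      pricingType: 'rec',
      Status: 'Active', // never null (0 products) or 'All' (HTTP 400)
    },
    page,
    perPage: 100,
  };
  const extensions = {
    persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.FilteredProducts },
  };
  const qs = new URLSearchParams({
    operationName: 'FilteredProducts',
    variables: JSON.stringify(variables),
    extensions: JSON.stringify(extensions),
  });
  const res = await fetch(`https://dutchie.com/api-3/graphql?${qs}`, {
    headers: { Accept: 'application/json' },
  });
  if (!res.ok) throw new Error(`FilteredProducts failed: HTTP ${res.status}`);
  return (await res.json())?.data?.filteredProducts;
}
```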
---
## Scripts Reference
### Useful Scripts (in `src/scripts/`)
| Script | Purpose |
|--------|---------|
| `run-discovery.ts` | Run Dutchie discovery |
| `crawl-single-store.ts` | Test crawl a single store |
| `test-dutchie-graphql.ts` | Test GraphQL queries |
### One-Off Scripts (probably don't need)
| Script | Purpose |
|--------|---------|
| `harmonize-az-dispensaries.ts` | One-time data cleanup |
| `bootstrap-stores-for-dispensaries.ts` | One-time migration |
| `backfill-*.ts` | Historical backfill scripts |
---
## API Routes
### Active Routes (in `src/routes/`)
| Route File | Mount Point | Purpose |
|------------|-------------|---------|
| `auth.ts` | `/api/auth` | Login/logout/session |
| `stores.ts` | `/api/stores` | Store CRUD |
| `dashboard.ts` | `/api/dashboard` | Dashboard stats |
| `workers.ts` | `/api/workers` | Worker monitoring |
| `pipeline.ts` | `/api/pipeline` | Crawl triggers |
| `discovery.ts` | `/api/discovery` | Discovery management |
| `analytics.ts` | `/api/analytics` | Analytics queries |
| `wordpress.ts` | `/api/v1/wordpress` | WordPress plugin API |
---
## Documentation Files
### Current Docs (in `backend/docs/`)
| Doc | Purpose | Currency |
|-----|---------|----------|
| `TASK_WORKFLOW_2024-12-10.md` | Task system architecture | CURRENT |
| `WORKER_TASK_ARCHITECTURE.md` | Worker/task design | CURRENT |
| `CRAWL_PIPELINE.md` | Crawl pipeline overview | CURRENT |
| `ORGANIC_SCRAPING_GUIDE.md` | Browser-based scraping | CURRENT |
| `CODEBASE_MAP.md` | This file | CURRENT |
| `ANALYTICS_V2_EXAMPLES.md` | Analytics API examples | CURRENT |
| `BRAND_INTELLIGENCE_API.md` | Brand API docs | CURRENT |
### Root Docs
| Doc | Purpose | Currency |
|-----|---------|----------|
| `CLAUDE.md` | Claude instructions | **PRIMARY** |
| `README.md` | Project overview | NEEDS UPDATE |
---
## Common Mistakes to Avoid
1. **Don't use `src/hydration/`** - It's an old approach that was superseded by the task system
2. **Don't use `src/dutchie-az/db/connection.ts`** - Use `src/db/pool.ts` instead (see the sketch after this list)
3. **Don't import `src/db/migrate.ts` at runtime** - It will crash. Only use for CLI migrations.
4. **Don't query `stores` table** - It's empty. Use `dispensaries`.
5. **Don't query `products` table** - It's empty. Use `store_products`.
6. **Don't use wrong GraphQL hash** - Always get hash from `GRAPHQL_HASHES` in client.ts
7. **Don't use `Status: null`** - It returns 0 products. Use `Status: 'Active'`.
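Putting mistakes 2, 4, and 5 together, a correct product query goes through the canonical pool and the populated tables. A minimal sketch, assuming `src/db/pool.ts` exports a node-postgres `Pool` named `pool` and that `store_products` carries a `dispensary_id` column:
```typescript
import { pool } from './db/pool'; // assumed export; adjust to the actual module shape

// Query store_products (not products) joined to dispensaries (not stores).
async function productsForDispensary(dispensaryId: number) {
  const { rows } = await pool.query(
    `SELECT sp.*
       FROM store_products sp
       JOIN dispensaries d ON d.id = sp.dispensary_id
      WHERE d.id = $1`,
    [dispensaryId],
  );
  return rows;
}
```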
---
## When in Doubt
1. Check if the file is imported in `src/index.ts` - if not, it may be deprecated
2. Check the last modified date - older files may be stale
3. Look for `DEPRECATED` comments in the code
4. Ask: "Is there a newer version of this in `src/tasks/` or `src/platforms/`?"
5. Read the relevant doc in `docs/` before modifying code


@@ -500,18 +500,17 @@ CREATE TABLE proxies (
 Proxies are mandatory. There is no environment variable to disable them. Workers will refuse to start without active proxies in the database.
-### User-Agent Generation
-See `workflow-12102025.md` for full specification.
-**Summary:**
-- Uses `intoli/user-agents` library (daily-updated market share data)
-- Device distribution: Mobile 62%, Desktop 36%, Tablet 2%
-- Browser whitelist: Chrome, Safari, Edge, Firefox only
-- UA sticks until IP rotates (403 or manual rotation)
-- Failure = alert admin + stop crawl (no fallback)
+### Fingerprints Available
+The client includes 6 browser fingerprints:
+- Chrome 131 on Windows
+- Chrome 131 on macOS
+- Chrome 120 on Windows
+- Firefox 133 on Windows
+- Safari 17.2 on macOS
+- Edge 131 on Windows
+Each fingerprint includes proper `sec-ch-ua`, `sec-ch-ua-platform`, and `sec-ch-ua-mobile` headers.
 ---


@@ -362,148 +362,6 @@ SET status = 'pending', retry_count = retry_count + 1
WHERE status = 'failed' AND retry_count < max_retries;
```
## Concurrent Task Processing (Added 2024-12)
Workers can now process multiple tasks concurrently within a single worker instance. This improves throughput by utilizing async I/O efficiently.
### Architecture
```
┌─────────────────────────────────────────────────────────────┐
│                          Pod (K8s)                          │
│                                                             │
│  ┌─────────────────────────────────────────────────────┐    │
│  │                     TaskWorker                      │    │
│  │                                                     │    │
│  │  ┌─────────┐  ┌─────────┐  ┌─────────┐              │    │
│  │  │ Task 1  │  │ Task 2  │  │ Task 3  │ (concurrent) │    │
│  │  └─────────┘  └─────────┘  └─────────┘              │    │
│  │                                                     │    │
│  │  Resource Monitor                                   │    │
│  │  ├── Memory: 65% (threshold: 85%)                   │    │
│  │  ├── CPU: 45% (threshold: 90%)                      │    │
│  │  └── Status: Normal                                 │    │
│  └─────────────────────────────────────────────────────┘    │
└─────────────────────────────────────────────────────────────┘
```
### Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
| `MAX_CONCURRENT_TASKS` | 3 | Maximum tasks a worker will run concurrently |
| `MEMORY_BACKOFF_THRESHOLD` | 0.85 | Back off when heap memory exceeds 85% |
| `CPU_BACKOFF_THRESHOLD` | 0.90 | Back off when CPU exceeds 90% |
| `BACKOFF_DURATION_MS` | 10000 | How long to wait when backing off (10s) |
### How It Works
1. **Main Loop**: Worker continuously tries to fill up to `MAX_CONCURRENT_TASKS` (sketched after this list)
2. **Resource Monitoring**: Before claiming a new task, worker checks memory and CPU
3. **Backoff**: If resources exceed thresholds, worker pauses and stops claiming new tasks
4. **Concurrent Execution**: Tasks run concurrently as unawaited Promises, so they don't block each other
5. **Graceful Shutdown**: On SIGTERM/decommission, worker stops claiming but waits for active tasks
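A condensed sketch of that loop (names are illustrative; the real implementation is `src/tasks/task-worker.ts:462-516`):
```typescript
const MAX_CONCURRENT_TASKS = Number(process.env.MAX_CONCURRENT_TASKS ?? 3);
const BACKOFF_DURATION_MS = Number(process.env.BACKOFF_DURATION_MS ?? 10_000);

// Assumed helpers defined elsewhere in the worker:
declare function claimNextTask(): Promise<{ id: number } | null>;
declare function executeTask(task: { id: number }): Promise<void>;
declare function shouldBackOff(): boolean;

const sleep = (ms: number) => new Promise<void>((r) => setTimeout(r, ms));
const active = new Set<Promise<void>>();
let shuttingDown = false;

async function mainLoop(): Promise<void> {
  while (!shuttingDown) {
    if (shouldBackOff()) {
      await sleep(BACKOFF_DURATION_MS); // stop claiming; running tasks continue
      continue;
    }
    // Fill up to the concurrency limit; each task runs as an unawaited promise.
    while (active.size < MAX_CONCURRENT_TASKS) {
      const task = await claimNextTask(); // SELECT ... FOR UPDATE SKIP LOCKED
      if (!task) break;
      const run = executeTask(task).finally(() => active.delete(run));
      active.add(run);
    }
    await sleep(1_000); // poll interval between fill attempts
  }
  await Promise.allSettled([...active]); // graceful shutdown: drain active tasks
}
```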
### Resource Monitoring
```typescript
// ResourceStats interface
interface ResourceStats {
  memoryPercent: number;  // Current heap usage as decimal (0.0-1.0)
  memoryMb: number;       // Current heap used in MB
  memoryTotalMb: number;  // Total heap available in MB
  cpuPercent: number;     // CPU usage as percentage (0-100)
  isBackingOff: boolean;  // True if worker is in backoff state
  backoffReason: string;  // Why the worker is backing off
}
```
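The memory half of these stats maps directly onto Node's `process.memoryUsage()`; a plausible sketch (the real method is `src/tasks/task-worker.ts:149-179`):
```typescript
function getMemoryStats(): Pick<ResourceStats, 'memoryPercent' | 'memoryMb' | 'memoryTotalMb'> {
  const { heapUsed, heapTotal } = process.memoryUsage();
  return {
    memoryPercent: heapUsed / heapTotal, // 0.0-1.0, compared against MEMORY_BACKOFF_THRESHOLD
    memoryMb: Math.round(heapUsed / 1024 / 1024),
    memoryTotalMb: Math.round(heapTotal / 1024 / 1024),
  };
}
```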
### Heartbeat Data
Workers report the following in their heartbeat:
```json
{
  "worker_id": "worker-abc123",
  "current_task_id": 456,
  "current_task_ids": [456, 457, 458],
  "active_task_count": 3,
  "max_concurrent_tasks": 3,
  "status": "active",
  "resources": {
    "memory_mb": 256,
    "memory_total_mb": 512,
    "memory_rss_mb": 320,
    "memory_percent": 50,
    "cpu_user_ms": 12500,
    "cpu_system_ms": 3200,
    "cpu_percent": 45,
    "is_backing_off": false,
    "backoff_reason": null
  }
}
```
### Backoff Behavior
When resources exceed thresholds:
1. Worker logs the backoff reason:
```
[TaskWorker] MyWorker backing off: Memory at 87.3% (threshold: 85%)
```
2. Worker stops claiming new tasks but continues existing tasks
3. After `BACKOFF_DURATION_MS`, worker rechecks resources
4. When resources return to normal:
```
[TaskWorker] MyWorker resuming normal operation
```
### UI Display
The Workers Dashboard shows:
- **Tasks Column**: `2/3 tasks` (active/max concurrent)
- **Resources Column**: Memory % and CPU % with color coding
- Green: < 50%
- Yellow: 50-74%
- Amber: 75-89%
- Red: 90%+
- **Backing Off**: Orange warning badge when worker is in backoff state
### Task Count Badge Details
```
┌─────────────────────────────────────────────┐
│ Worker: "MyWorker"                          │
│ Tasks: 2/3 tasks  #456, #457                │
│ Resources: 🧠 65%  💻 45%                   │
│ Status: ● Active                            │
└─────────────────────────────────────────────┘
```
### Best Practices
1. **Start Conservative**: Use `MAX_CONCURRENT_TASKS=3` initially
2. **Monitor Resources**: Watch for frequent backoffs in logs
3. **Tune Per Workload**: I/O-bound tasks benefit from higher concurrency
4. **Scale Horizontally**: Add more pods rather than cranking concurrency too high
### Code References
| File | Purpose |
|------|---------|
| `src/tasks/task-worker.ts:68-71` | Concurrency environment variables |
| `src/tasks/task-worker.ts:104-111` | ResourceStats interface |
| `src/tasks/task-worker.ts:149-179` | getResourceStats() method |
| `src/tasks/task-worker.ts:184-196` | shouldBackOff() method |
| `src/tasks/task-worker.ts:462-516` | mainLoop() with concurrent claiming |
| `src/routes/worker-registry.ts:148-195` | Heartbeat endpoint handling |
| `cannaiq/src/pages/WorkersDashboard.tsx:233-305` | UI components for resources |
## Monitoring
### Logs


@@ -1,297 +0,0 @@
# Organic Browser-Based Scraping Guide
**Last Updated:** 2025-12-12
**Status:** Production-ready proof of concept
---
## Overview
This document describes the "organic" browser-based approach to scraping Dutchie dispensary menus. Unlike direct curl/axios requests, this method uses a real browser session to make API calls, making requests appear natural and reducing detection risk.
---
## Why Organic Scraping?
| Approach | Detection Risk | Speed | Complexity |
|----------|---------------|-------|------------|
| Direct curl | Higher | Fast | Low |
| curl-impersonate | Medium | Fast | Medium |
| **Browser-based (organic)** | **Lowest** | Slower | Higher |
Direct curl requests can be fingerprinted via:
- TLS fingerprint (cipher suites, extensions)
- Header order and values
- Missing cookies/session data
- Request patterns
Browser-based requests inherit:
- Real Chrome TLS fingerprint
- Session cookies from page visit
- Natural header order
- JavaScript execution environment
---
## Implementation
### Dependencies
```bash
npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
```
### Core Script: `test-intercept.js`
Located at: `backend/test-intercept.js`
```javascript
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const fs = require('fs');

puppeteer.use(StealthPlugin());

async function capturePayload(config) {
  const { dispensaryId, platformId, cName, outputPath } = config;
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });
  const page = await browser.newPage();

  // STEP 1: Establish session by visiting the menu
  const embedUrl = `https://dutchie.com/embedded-menu/${cName}?menuType=rec`;
  await page.goto(embedUrl, { waitUntil: 'networkidle2', timeout: 60000 });

  // STEP 2: Fetch ALL products using GraphQL from browser context
  const result = await page.evaluate(async (platformId) => {
    const allProducts = [];
    let pageNum = 0;
    const perPage = 100;
    let totalCount = 0;
    const sessionId = 'browser-session-' + Date.now();
    while (pageNum < 30) {
      const variables = {
        includeEnterpriseSpecials: false,
        productsFilter: {
          dispensaryId: platformId,
          pricingType: 'rec',
          Status: 'Active', // CRITICAL: Must be 'Active', not null
          types: [],
          useCache: true,
          isDefaultSort: true,
          sortBy: 'popularSortIdx',
          sortDirection: 1,
          bypassOnlineThresholds: true,
          isKioskMenu: false,
          removeProductsBelowOptionThresholds: false,
        },
        page: pageNum,
        perPage: perPage,
      };
      const extensions = {
        persistedQuery: {
          version: 1,
          sha256Hash: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0'
        }
      };
      const qs = new URLSearchParams({
        operationName: 'FilteredProducts',
        variables: JSON.stringify(variables),
        extensions: JSON.stringify(extensions)
      });
      const response = await fetch(`https://dutchie.com/api-3/graphql?${qs}`, {
        method: 'GET',
        headers: {
          'Accept': 'application/json',
          'content-type': 'application/json',
          'x-dutchie-session': sessionId,
          'apollographql-client-name': 'Marketplace (production)',
        },
        credentials: 'include'
      });
      const json = await response.json();
      const data = json?.data?.filteredProducts;
      if (!data?.products) break;
      allProducts.push(...data.products);
      if (pageNum === 0) totalCount = data.queryInfo?.totalCount || 0;
      if (allProducts.length >= totalCount) break;
      pageNum++;
      await new Promise(r => setTimeout(r, 200)); // Polite delay
    }
    return { products: allProducts, totalCount };
  }, platformId);

  await browser.close();

  // STEP 3: Save payload
  const payload = {
    dispensaryId,
    platformId,
    cName,
    fetchedAt: new Date().toISOString(),
    productCount: result.products.length,
    products: result.products,
  };
  fs.writeFileSync(outputPath, JSON.stringify(payload, null, 2));
  return payload;
}
```
---
## Critical Parameters
### GraphQL Hash (FilteredProducts)
```
ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0
```
**WARNING:** Using the wrong hash returns HTTP 400.
### Status Parameter
| Value | Result |
|-------|--------|
| `'Active'` | Returns in-stock products (1019 in test) |
| `null` | Returns 0 products |
| `'All'` | Returns HTTP 400 |
**ALWAYS use `Status: 'Active'`**
### Required Headers
```javascript
{
  'Accept': 'application/json',
  'content-type': 'application/json',
  'x-dutchie-session': 'unique-session-id',
  'apollographql-client-name': 'Marketplace (production)',
}
```
### Endpoint
```
https://dutchie.com/api-3/graphql
```
---
## Performance Benchmarks
Test store: AZ-Deeply-Rooted (1019 products)
| Metric | Value |
|--------|-------|
| Total products | 1019 |
| Time | 18.5 seconds |
| Payload size | 11.8 MB |
| Pages fetched | 11 (100 per page) |
| Success rate | 100% |
---
## Payload Format
The output matches the existing `payload-fetch.ts` handler format:
```json
{
  "dispensaryId": 123,
  "platformId": "6405ef617056e8014d79101b",
  "cName": "AZ-Deeply-Rooted",
  "fetchedAt": "2025-12-12T05:05:19.837Z",
  "productCount": 1019,
  "products": [
    {
      "id": "6927508db4851262f629a869",
      "Name": "Product Name",
      "brand": { "name": "Brand Name", ... },
      "type": "Flower",
      "THC": "25%",
      "Prices": [...],
      "Options": [...],
      ...
    }
  ]
}
```
---
## Integration Points
### As a Task Handler
The organic approach can be integrated as an alternative to curl-based fetching:
```typescript
// In src/tasks/handlers/organic-payload-fetch.ts
export async function handleOrganicPayloadFetch(ctx: TaskContext): Promise<TaskResult> {
  // Use puppeteer-based capture
  // Save to same payload storage
  // Queue product_refresh task
}
```
### Worker Configuration
Add to job_schedules:
```sql
INSERT INTO job_schedules (name, role, cron_expression)
VALUES ('organic_product_crawl', 'organic_payload_fetch', '0 */6 * * *');
```
---
## Troubleshooting
### HTTP 400 Bad Request
- Check hash is correct: `ee29c060...`
- Verify Status is `'Active'` (string, not null)
### 0 Products Returned
- Status was likely `null` or `'All'` - use `'Active'`
- Check platformId is valid MongoDB ObjectId
### Session Not Established
- Increase timeout on initial page.goto()
- Check cName is valid (matches embedded-menu URL)
### Detection/Blocking
- StealthPlugin should handle most cases
- Add random delays between pages
- Use headless: 'new' (not true/false)
---
## Files Reference
| File | Purpose |
|------|---------|
| `backend/test-intercept.js` | Proof of concept script |
| `backend/src/platforms/dutchie/client.ts` | GraphQL hashes, curl implementation |
| `backend/src/tasks/handlers/payload-fetch.ts` | Current curl-based handler |
| `backend/src/utils/payload-storage.ts` | Payload save/load utilities |
---
## See Also
- `DUTCHIE_CRAWL_WORKFLOW.md` - Full crawl pipeline documentation
- `TASK_WORKFLOW_2024-12-10.md` - Task system architecture
- `CLAUDE.md` - Project rules and constraints


@@ -1,25 +0,0 @@
# ARCHIVED DOCUMENTATION
**WARNING: These docs may be outdated or inaccurate.**
The code has evolved significantly. These docs are kept for historical reference only.
## What to Use Instead
**The single source of truth is:**
- `CLAUDE.md` (root) - Essential rules and quick reference
- `docs/CODEBASE_MAP.md` - Current file/directory reference
## Why Archive?
These docs were written during development iterations and may reference:
- Old file paths that no longer exist
- Deprecated approaches (hydration, scraper-v2)
- APIs that have changed
- Database schemas that evolved
## If You Need Details
1. First check CODEBASE_MAP.md for current file locations
2. Then read the actual source code
3. Only use archive docs as a last resort for historical context


@@ -1,584 +0,0 @@
# Task Workflow Documentation
**Date: 2024-12-10**
This document describes the complete task/job processing architecture after the 2024-12-10 rewrite.
---
## Complete Architecture
```
┌─────────────────────────────────────────────────────────────────────────────────┐
│ KUBERNETES CLUSTER │
├─────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ API SERVER POD (scraper) │ │
│ │ │ │
│ │ ┌──────────────────┐ ┌────────────────────────────────────────┐ │ │
│ │ │ Express API │ │ TaskScheduler │ │ │
│ │ │ │ │ (src/services/task-scheduler.ts) │ │ │
│ │ │ /api/job-queue │ │ │ │ │
│ │ │ /api/tasks │ │ • Polls every 60s │ │ │
│ │ │ /api/schedules │ │ • Checks task_schedules table │ │ │
│ │ └────────┬─────────┘ │ • SELECT FOR UPDATE SKIP LOCKED │ │ │
│ │ │ │ • Generates tasks when due │ │ │
│ │ │ └──────────────────┬─────────────────────┘ │ │
│ │ │ │ │ │
│ └────────────┼──────────────────────────────────┼──────────────────────────┘ │
│ │ │ │
│ │ ┌────────────────────────┘ │
│ │ │ │
│ ▼ ▼ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ POSTGRESQL DATABASE │ │
│ │ │ │
│ │ ┌─────────────────────┐ ┌─────────────────────┐ │ │
│ │ │ task_schedules │ │ worker_tasks │ │ │
│ │ │ │ │ │ │ │
│ │ │ • product_refresh │───────►│ • pending tasks │ │ │
│ │ │ • store_discovery │ create │ • claimed tasks │ │ │
│ │ │ • analytics_refresh │ tasks │ • running tasks │ │ │
│ │ │ │ │ • completed tasks │ │ │
│ │ │ next_run_at │ │ │ │ │
│ │ │ last_run_at │ │ role, dispensary_id │ │ │
│ │ │ interval_hours │ │ priority, status │ │ │
│ │ └─────────────────────┘ └──────────┬──────────┘ │ │
│ │ │ │ │
│ └─────────────────────────────────────────────┼────────────────────────────┘ │
│ │ │
│ ┌──────────────────────┘ │
│ │ Workers poll for tasks │
│ │ (SELECT FOR UPDATE SKIP LOCKED) │
│ ▼ │
│ ┌─────────────────────────────────────────────────────────────────────────┐ │
│ │ WORKER PODS (StatefulSet: scraper-worker) │ │
│ │ │ │
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
│ │ │ Worker 0 │ │ Worker 1 │ │ Worker 2 │ │ Worker N │ │ │
│ │ │ │ │ │ │ │ │ │ │ │
│ │ │ task-worker │ │ task-worker │ │ task-worker │ │ task-worker │ │ │
│ │ │ .ts │ │ .ts │ │ .ts │ │ .ts │ │ │
│ │ └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ │ │
│ │ │ │
│ └──────────────────────────────────────────────────────────────────────────┘ │
│ │
└──────────────────────────────────────────────────────────────────────────────────┘
```
---
## Startup Sequence
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ API SERVER STARTUP │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ 1. Express app initializes │
│ │ │
│ ▼ │
│ 2. runAutoMigrations() │
│ • Runs pending migrations (including 079_task_schedules.sql) │
│ │ │
│ ▼ │
│ 3. initializeMinio() / initializeImageStorage() │
│ │ │
│ ▼ │
│ 4. cleanupOrphanedJobs() │
│ │ │
│ ▼ │
│ 5. taskScheduler.start() ◄─── NEW (per TASK_WORKFLOW_2024-12-10.md) │
│ │ │
│ ├── Recover stale tasks (workers that died) │
│ ├── Ensure default schedules exist in task_schedules │
│ ├── Check and run any due schedules immediately │
│ └── Start 60-second poll interval │
│ │ │
│ ▼ │
│ 6. app.listen(PORT) │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────────────┐
│ WORKER POD STARTUP │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ 1. K8s starts pod from StatefulSet │
│ │ │
│ ▼ │
│ 2. TaskWorker.constructor() │
│ • Create DB pool │
│ • Create CrawlRotator │
│ │ │
│ ▼ │
│ 3. initializeStealth() │
│ • Load proxies from DB (REQUIRED - fails if none) │
│ • Wire rotator to Dutchie client │
│ │ │
│ ▼ │
│ 4. register() with API │
│ • Optional - continues if fails │
│ │ │
│ ▼ │
│ 5. startRegistryHeartbeat() every 30s │
│ │ │
│ ▼ │
│ 6. processNextTask() loop │
│ │ │
│ ├── Poll for pending task (FOR UPDATE SKIP LOCKED) │
│ ├── Claim task atomically │
│ ├── Execute handler (product_refresh, store_discovery, etc.) │
│ ├── Mark complete/failed │
│ ├── Chain next task if applicable │
│ └── Loop │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
---
## Schedule Flow
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ SCHEDULER POLL (every 60 seconds) │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ BEGIN TRANSACTION │
│ │ │
│ ▼ │
│ SELECT * FROM task_schedules │
│ WHERE enabled = true AND next_run_at <= NOW() │
│ FOR UPDATE SKIP LOCKED ◄─── Prevents duplicate execution across replicas │
│ │ │
│ ▼ │
│ For each due schedule: │
│ │ │
│ ├── product_refresh_all │
│ │ └─► Query dispensaries needing crawl │
│ │ └─► Create product_refresh tasks in worker_tasks │
│ │ │
│ ├── store_discovery_dutchie │
│ │ └─► Create single store_discovery task │
│ │ │
│ └── analytics_refresh │
│ └─► Create single analytics_refresh task │
│ │ │
│ ▼ │
│ UPDATE task_schedules SET │
│ last_run_at = NOW(), │
│ next_run_at = NOW() + interval_hours │
│ │ │
│ ▼ │
│ COMMIT │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
---
## Task Lifecycle
```
┌──────────┐
│ SCHEDULE │
│   DUE    │
└────┬─────┘
     │ creates tasks
     ▼
┌──────────────┐    claim    ┌──────────────┐    start    ┌──────────────┐
│   PENDING    │────────────►│   CLAIMED    │────────────►│   RUNNING    │
└──────────────┘             └──────────────┘             └──────┬───────┘
       ▲                                                         │
       │ retry                             ┌──────────────┬──────┴──────┐
       │ (if retries < max)                │              │             │
       │                                   ▼              ▼             ▼
       │                             ┌──────────┐   ┌──────────┐   ┌──────────┐
       └─────────────────────────────│  FAILED  │   │ COMPLETED│   │  STALE   │
                                     └──────────┘   └──────────┘   └────┬─────┘
                                                                        │ recover_stale_tasks()
                                                                        ▼
                                                                  ┌──────────┐
                                                                  │ PENDING  │
                                                                  └──────────┘
```
---
## Database Tables
### task_schedules (NEW - migration 079)
Stores schedule definitions. Survives restarts.
```sql
CREATE TABLE task_schedules (
id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL UNIQUE,
role VARCHAR(50) NOT NULL, -- product_refresh, store_discovery, etc.
enabled BOOLEAN DEFAULT TRUE,
interval_hours INTEGER NOT NULL, -- How often to run
priority INTEGER DEFAULT 0, -- Task priority when created
state_code VARCHAR(2), -- Optional filter
last_run_at TIMESTAMPTZ, -- When it last ran
next_run_at TIMESTAMPTZ, -- When it's due next
last_task_count INTEGER, -- Tasks created last run
last_error TEXT -- Error message if failed
);
```
### worker_tasks (migration 074)
The task queue. Workers pull from here.
```sql
CREATE TABLE worker_tasks (
id SERIAL PRIMARY KEY,
role task_role NOT NULL, -- What type of work
dispensary_id INTEGER, -- Which store (if applicable)
platform VARCHAR(50), -- Which platform
status task_status DEFAULT 'pending',
priority INTEGER DEFAULT 0, -- Higher = process first
scheduled_for TIMESTAMP, -- Don't process before this time
worker_id VARCHAR(100), -- Which worker claimed it
claimed_at TIMESTAMP,
started_at TIMESTAMP,
completed_at TIMESTAMP,
last_heartbeat_at TIMESTAMP, -- For stale detection
result JSONB,
error_message TEXT,
retry_count INTEGER DEFAULT 0,
max_retries INTEGER DEFAULT 3
);
```
---
## Default Schedules
| Name | Role | Interval | Priority | Description |
|------|------|----------|----------|-------------|
| `payload_fetch_all` | payload_fetch | 4 hours | 0 | Fetch payloads from Dutchie API (chains to product_refresh) |
| `store_discovery_dutchie` | store_discovery | 24 hours | 5 | Find new Dutchie stores |
| `analytics_refresh` | analytics_refresh | 6 hours | 0 | Refresh materialized views |
---
## Task Roles
| Role | Description | Creates Tasks For |
|------|-------------|-------------------|
| `payload_fetch` | **NEW** - Fetch from Dutchie API, save to disk | Each dispensary needing crawl |
| `product_refresh` | **CHANGED** - Read local payload, normalize, upsert to DB | Chained from payload_fetch |
| `store_discovery` | Find new dispensaries, returns newStoreIds[] | Single task per platform |
| `entry_point_discovery` | **DEPRECATED** - Resolve platform IDs | No longer used |
| `product_discovery` | Initial product fetch for new stores | Chained from store_discovery |
| `analytics_refresh` | Refresh materialized views | Single global task |
### Payload/Refresh Separation (2024-12-10)
The crawl workflow is now split into two phases:
```
payload_fetch (scheduled every 4h)
└─► Hit Dutchie GraphQL API
└─► Save raw JSON to /storage/payloads/{year}/{month}/{day}/store_{id}_{ts}.json.gz
└─► Record metadata in raw_crawl_payloads table
└─► Queue product_refresh task with payload_id
product_refresh (chained from payload_fetch)
└─► Load payload from filesystem (NOT from API)
└─► Normalize via DutchieNormalizer
└─► Upsert to store_products
└─► Create snapshots
└─► Track missing products
└─► Download images
```
**Benefits:**
- **Retry-friendly**: If normalize fails, re-run product_refresh without re-crawling
- **Replay-able**: Run product_refresh against any historical payload
- **Faster refreshes**: Local file read vs network call
- **Historical diffs**: Compare payloads to see what changed between crawls
- **Less API pressure**: Only payload_fetch hits Dutchie
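In code, the payload_fetch half reduces to roughly the following sketch. Helper names and the insert shape are illustrative; the real handler is `src/tasks/handlers/payload-fetch.ts`, the metadata columns come from migration 080, and the `payload` chaining column from migration 081:
```typescript
import { gzipSync } from 'node:zlib';
import { mkdirSync, writeFileSync } from 'node:fs';
import path from 'node:path';

declare const pool: import('pg').Pool;                              // assumed DB pool
declare function fetchAllProducts(id: number): Promise<unknown[]>;  // hits Dutchie GraphQL
declare const taskService: { createTask(t: object): Promise<void> };

async function handlePayloadFetch(dispensaryId: number): Promise<void> {
  const products = await fetchAllProducts(dispensaryId);
  const now = new Date();
  const dir = path.join(
    '/storage/payloads',
    String(now.getUTCFullYear()),
    String(now.getUTCMonth() + 1).padStart(2, '0'),
    String(now.getUTCDate()).padStart(2, '0'),
  );
  mkdirSync(dir, { recursive: true });
  const file = path.join(dir, `store_${dispensaryId}_${Math.floor(now.getTime() / 1000)}.json.gz`);
  writeFileSync(file, gzipSync(JSON.stringify({ dispensaryId, fetchedAt: now.toISOString(), products })));

  // Record metadata, then chain product_refresh via the payload column.
  const { rows } = await pool.query(
    `INSERT INTO raw_crawl_payloads (dispensary_id, storage_path, product_count)
     VALUES ($1, $2, $3) RETURNING id`,
    [dispensaryId, file, products.length],
  );
  await taskService.createTask({
    role: 'product_refresh',
    dispensary_id: dispensaryId,
    payload: { payload_id: rows[0].id },
  });
}
```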
---
## Task Chaining
Tasks automatically queue follow-up tasks upon successful completion. This creates two main flows:
### Discovery Flow (New Stores)
When `store_discovery` finds new dispensaries, they automatically get their initial product data:
```
store_discovery
└─► Discovers new locations via Dutchie GraphQL
└─► Auto-promotes valid locations to dispensaries table
└─► Collects newDispensaryIds[] from promotions
└─► Returns { newStoreIds: [...] } in result
chainNextTask() detects newStoreIds
└─► Creates product_discovery task for each new store
product_discovery
└─► Calls handlePayloadFetch() internally
└─► payload_fetch hits Dutchie API
└─► Saves raw JSON to /storage/payloads/
└─► Queues product_refresh task with payload_id
product_refresh
└─► Loads payload from filesystem
└─► Normalizes and upserts to store_products
└─► Creates snapshots, downloads images
```
**Complete Discovery Chain:**
```
store_discovery → product_discovery → payload_fetch → product_refresh
(internal call) (queues next)
```
### Scheduled Flow (Existing Stores)
For existing stores, `payload_fetch_all` schedule runs every 4 hours:
```
TaskScheduler (every 60s)
└─► Checks task_schedules for due schedules
└─► payload_fetch_all is due
└─► Generates payload_fetch task for each dispensary
payload_fetch
└─► Hits Dutchie GraphQL API
└─► Saves raw JSON to /storage/payloads/
└─► Queues product_refresh task with payload_id
product_refresh
└─► Loads payload from filesystem (NOT API)
└─► Normalizes via DutchieNormalizer
└─► Upserts to store_products
└─► Creates snapshots
```
**Complete Scheduled Chain:**
```
payload_fetch → product_refresh
(queues) (reads local)
```
### Chaining Implementation
Task chaining is handled in three places (a sketch of the external case follows this list):
1. **Internal chaining (handler calls handler):**
- `product_discovery` calls `handlePayloadFetch()` directly
2. **External chaining (chainNextTask() in task-service.ts):**
- Called after task completion
- `store_discovery` → queues `product_discovery` for each newStoreId
3. **Queue-based chaining (taskService.createTask):**
- `payload_fetch` queues `product_refresh` with `payload: { payload_id }`
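A sketch of the external case (2); the actual logic lives in `chainNextTask()` in `task-service.ts`:
```typescript
interface CompletedTask {
  role: string;
  result?: { newStoreIds?: number[] };
}

declare const taskService: { createTask(t: object): Promise<void> };

// Called after a task is marked complete; queues follow-up work.
async function chainNextTask(task: CompletedTask): Promise<void> {
  if (task.role === 'store_discovery') {
    for (const dispensaryId of task.result?.newStoreIds ?? []) {
      await taskService.createTask({ role: 'product_discovery', dispensary_id: dispensaryId });
    }
  }
}
```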
---
## Payload API Endpoints
Raw crawl payloads can be accessed via the Payloads API:
| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/payloads` | GET | List payload metadata (paginated) |
| `GET /api/payloads/:id` | GET | Get payload metadata by ID |
| `GET /api/payloads/:id/data` | GET | Get full payload JSON (decompressed) |
| `GET /api/payloads/store/:dispensaryId` | GET | List payloads for a store |
| `GET /api/payloads/store/:dispensaryId/latest` | GET | Get latest payload for a store |
| `GET /api/payloads/store/:dispensaryId/diff` | GET | Diff two payloads for changes |
### Payload Diff Response
The diff endpoint returns:
```json
{
  "success": true,
  "from": { "id": 123, "fetchedAt": "...", "productCount": 100 },
  "to": { "id": 456, "fetchedAt": "...", "productCount": 105 },
  "diff": {
    "added": 10,
    "removed": 5,
    "priceChanges": 8,
    "stockChanges": 12
  },
  "details": {
    "added": [...],
    "removed": [...],
    "priceChanges": [...],
    "stockChanges": [...]
  }
}
```
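A sketch of how such a diff can be computed from two stored payloads. Field names follow the payload format documented above; the endpoint's actual implementation may differ, and stock changes would compare `Options` the same way prices are compared here:
```typescript
type Product = { id: string; Prices?: unknown };

function diffPayloads(from: Product[], to: Product[]) {
  const fromById = new Map(from.map((p) => [p.id, p]));
  const toById = new Map(to.map((p) => [p.id, p]));
  const added = to.filter((p) => !fromById.has(p.id));
  const removed = from.filter((p) => !toById.has(p.id));
  const priceChanges = to.filter((p) => {
    const prev = fromById.get(p.id);
    return !!prev && JSON.stringify(prev.Prices) !== JSON.stringify(p.Prices);
  });
  return {
    diff: { added: added.length, removed: removed.length, priceChanges: priceChanges.length },
    details: { added, removed, priceChanges },
  };
}
```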
---
## API Endpoints
### Schedules (NEW)
| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/schedules` | GET | List all schedules |
| `PUT /api/schedules/:id` | PUT | Update schedule |
| `POST /api/schedules/:id/trigger` | POST | Run schedule immediately |
### Task Creation (rewired 2024-12-10)
| Endpoint | Method | Description |
|----------|--------|-------------|
| `POST /api/job-queue/enqueue` | POST | Create single task |
| `POST /api/job-queue/enqueue-batch` | POST | Create batch tasks |
| `POST /api/job-queue/enqueue-state` | POST | Create tasks for state |
| `POST /api/tasks` | POST | Direct task creation |
### Task Management
| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/tasks` | GET | List tasks |
| `GET /api/tasks/:id` | GET | Get single task |
| `GET /api/tasks/counts` | GET | Task counts by status |
| `POST /api/tasks/recover-stale` | POST | Recover stale tasks |
---
## Key Files
| File | Purpose |
|------|---------|
| `src/services/task-scheduler.ts` | **NEW** - DB-driven scheduler |
| `src/tasks/task-worker.ts` | Worker that processes tasks |
| `src/tasks/task-service.ts` | Task CRUD operations |
| `src/tasks/handlers/payload-fetch.ts` | **NEW** - Fetches from API, saves to disk |
| `src/tasks/handlers/product-refresh.ts` | **CHANGED** - Reads from disk, processes to DB |
| `src/utils/payload-storage.ts` | **NEW** - Payload save/load utilities |
| `src/routes/tasks.ts` | Task API endpoints |
| `src/routes/job-queue.ts` | Job Queue UI endpoints (rewired) |
| `migrations/079_task_schedules.sql` | Schedule table |
| `migrations/080_raw_crawl_payloads.sql` | Payload metadata table |
| `migrations/081_payload_fetch_columns.sql` | payload, last_fetch_at columns |
| `migrations/074_worker_task_queue.sql` | Task queue table |
---
## Legacy Code (DEPRECATED)
| File | Status | Replacement |
|------|--------|-------------|
| `src/services/scheduler.ts` | DEPRECATED | `task-scheduler.ts` |
| `dispensary_crawl_jobs` table | ORPHANED | `worker_tasks` |
| `job_schedules` table | LEGACY | `task_schedules` |
---
## Dashboard Integration
Both pages remain wired to the dashboard:
| Page | Data Source | Actions |
|------|-------------|---------|
| **Job Queue** | `worker_tasks`, `task_schedules` | Create tasks, view schedules |
| **Task Queue** | `worker_tasks` | View tasks, recover stale |
---
## Multi-Replica Safety
The scheduler uses `SELECT FOR UPDATE SKIP LOCKED` to ensure:
1. **Only one replica** executes a schedule at a time
2. **No duplicate tasks** created
3. **Survives pod restarts** - state in DB, not memory
4. **Self-healing** - recovers stale tasks on startup
```sql
-- This query is atomic across all API server replicas
SELECT * FROM task_schedules
WHERE enabled = true AND next_run_at <= NOW()
FOR UPDATE SKIP LOCKED
```
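Wrapped in a transaction, the scheduler's poll comes down to roughly this sketch (node-postgres assumed; the real code is `src/services/task-scheduler.ts`):
```typescript
import type { Pool } from 'pg';

declare const pool: Pool;
declare function generateTasksFor(schedule: { id: number }): Promise<number>;

async function pollSchedules(): Promise<void> {
  const client = await pool.connect();
  try {
    await client.query('BEGIN');
    // Rows locked here are skipped by other replicas until COMMIT.
    const { rows: due } = await client.query(
      `SELECT * FROM task_schedules
        WHERE enabled = true AND next_run_at <= NOW()
        FOR UPDATE SKIP LOCKED`,
    );
    for (const schedule of due) {
      const count = await generateTasksFor(schedule); // inserts into worker_tasks
      await client.query(
        `UPDATE task_schedules
            SET last_run_at = NOW(),
                next_run_at = NOW() + make_interval(hours => interval_hours),
                last_task_count = $2
          WHERE id = $1`,
        [schedule.id, count],
      );
    }
    await client.query('COMMIT');
  } catch (err) {
    await client.query('ROLLBACK');
    throw err;
  } finally {
    client.release();
  }
}
```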
---
## Worker Scaling (K8s)
Workers run as a StatefulSet in Kubernetes. You can scale from the admin UI or CLI.
### From Admin UI
The Workers page (`/admin/workers`) provides:
- Current replica count display
- Scale up/down buttons
- Target replica input
### API Endpoints
| Endpoint | Method | Description |
|----------|--------|-------------|
| `GET /api/workers/k8s/replicas` | GET | Get current/desired replica counts |
| `POST /api/workers/k8s/scale` | POST | Scale to N replicas (body: `{ replicas: N }`) |
### From CLI
```bash
# View current replicas
kubectl get statefulset scraper-worker -n dispensary-scraper
# Scale to 10 workers
kubectl scale statefulset scraper-worker -n dispensary-scraper --replicas=10
# Scale down to 3 workers
kubectl scale statefulset scraper-worker -n dispensary-scraper --replicas=3
```
### Configuration
Environment variables for the API server:
| Variable | Default | Description |
|----------|---------|-------------|
| `K8S_NAMESPACE` | `dispensary-scraper` | Kubernetes namespace |
| `K8S_WORKER_STATEFULSET` | `scraper-worker` | StatefulSet name |
### RBAC Requirements
The API server pod needs these K8s permissions:
```yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: worker-scaler
  namespace: dispensary-scraper
rules:
  - apiGroups: ["apps"]
    resources: ["statefulsets"]
    verbs: ["get", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: scraper-worker-scaler
  namespace: dispensary-scraper
subjects:
  - kind: ServiceAccount
    name: default
    namespace: dispensary-scraper
roleRef:
  kind: Role
  name: worker-scaler
  apiGroup: rbac.authorization.k8s.io
```


@@ -1,77 +0,0 @@
apiVersion: v1
kind: Service
metadata:
  name: scraper-worker
  namespace: dispensary-scraper
  labels:
    app: scraper-worker
spec:
  clusterIP: None  # Headless service required for StatefulSet
  selector:
    app: scraper-worker
  ports:
    - port: 3010
      name: http
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: scraper-worker
  namespace: dispensary-scraper
spec:
  serviceName: scraper-worker
  replicas: 8
  podManagementPolicy: Parallel  # Start all pods at once
  updateStrategy:
    type: OnDelete  # Pods only update when manually deleted - no automatic restarts
  selector:
    matchLabels:
      app: scraper-worker
  template:
    metadata:
      labels:
        app: scraper-worker
    spec:
      terminationGracePeriodSeconds: 60
      imagePullSecrets:
        - name: regcred
      containers:
        - name: worker
          image: code.cannabrands.app/creationshop/dispensary-scraper:latest
          imagePullPolicy: Always
          command: ["node"]
          args: ["dist/tasks/task-worker.js"]
          env:
            - name: WORKER_MODE
              value: "true"
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: MAX_CONCURRENT_TASKS
              value: "50"
            - name: API_BASE_URL
              value: http://scraper
            - name: NODE_OPTIONS
              value: --max-old-space-size=1500
          envFrom:
            - configMapRef:
                name: scraper-config
            - secretRef:
                name: scraper-secrets
          resources:
            requests:
              cpu: 100m
              memory: 1Gi
            limits:
              cpu: 500m
              memory: 2Gi
          livenessProbe:
            exec:
              command:
                - /bin/sh
                - -c
                - pgrep -f 'task-worker' > /dev/null
            initialDelaySeconds: 10
            periodSeconds: 30
            failureThreshold: 3


@@ -1,27 +0,0 @@
-- Migration: Worker Commands Table
-- Purpose: Store commands for workers (decommission, etc.)
-- Workers poll this table after each task to check for commands
CREATE TABLE IF NOT EXISTS worker_commands (
id SERIAL PRIMARY KEY,
worker_id TEXT NOT NULL,
command TEXT NOT NULL, -- 'decommission', 'pause', 'resume'
reason TEXT,
issued_by TEXT,
issued_at TIMESTAMPTZ DEFAULT NOW(),
acknowledged_at TIMESTAMPTZ,
executed_at TIMESTAMPTZ,
status TEXT DEFAULT 'pending' -- 'pending', 'acknowledged', 'executed', 'cancelled'
);
-- Index for worker lookups
CREATE INDEX IF NOT EXISTS idx_worker_commands_worker_id ON worker_commands(worker_id);
CREATE INDEX IF NOT EXISTS idx_worker_commands_pending ON worker_commands(worker_id, status) WHERE status = 'pending';
-- Add decommission_requested column to worker_registry for quick checks
ALTER TABLE worker_registry ADD COLUMN IF NOT EXISTS decommission_requested BOOLEAN DEFAULT FALSE;
ALTER TABLE worker_registry ADD COLUMN IF NOT EXISTS decommission_reason TEXT;
ALTER TABLE worker_registry ADD COLUMN IF NOT EXISTS decommission_requested_at TIMESTAMPTZ;
-- Comment
COMMENT ON TABLE worker_commands IS 'Commands issued to workers (decommission after task, pause, etc.)';
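A sketch of the poll a worker might run between tasks against this table (column names as above; helper names illustrative):
```typescript
import type { Pool } from 'pg';

declare const pool: Pool;
declare function beginGracefulShutdown(reason: string | null): void;

async function checkForCommands(workerId: string): Promise<void> {
  // Claim the oldest pending command and mark it acknowledged in one statement.
  const { rows } = await pool.query(
    `UPDATE worker_commands
        SET status = 'acknowledged', acknowledged_at = NOW()
      WHERE id = (
        SELECT id FROM worker_commands
         WHERE worker_id = $1 AND status = 'pending'
         ORDER BY issued_at
         LIMIT 1)
      RETURNING command, reason`,
    [workerId],
  );
  if (rows[0]?.command === 'decommission') {
    beginGracefulShutdown(rows[0].reason); // finish active tasks, then exit
  }
}
```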


@@ -1,8 +0,0 @@
-- Migration 078: Add consecutive_403_count to proxies table
-- Per workflow-12102025.md: Track consecutive 403s per proxy
-- After 3 consecutive 403s with different fingerprints → disable proxy
ALTER TABLE proxies ADD COLUMN IF NOT EXISTS consecutive_403_count INTEGER DEFAULT 0;
-- Add comment explaining the column
COMMENT ON COLUMN proxies.consecutive_403_count IS 'Tracks consecutive 403 blocks. Reset to 0 on success. Proxy disabled at 3.';


@@ -1,49 +0,0 @@
-- Migration 079: Task Schedules for Database-Driven Scheduler
-- Per TASK_WORKFLOW_2024-12-10.md: Replaces node-cron with DB-driven scheduling
--
-- 2024-12-10: Created for reliable, multi-replica-safe task scheduling
-- task_schedules: Stores schedule definitions and state
CREATE TABLE IF NOT EXISTS task_schedules (
id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL UNIQUE,
role VARCHAR(50) NOT NULL, -- TaskRole: product_refresh, store_discovery, etc.
description TEXT,
-- Schedule configuration
enabled BOOLEAN DEFAULT TRUE,
interval_hours INTEGER NOT NULL DEFAULT 4,
priority INTEGER DEFAULT 0,
-- Optional scope filters
state_code VARCHAR(2), -- NULL = all states
platform VARCHAR(50), -- NULL = all platforms
-- Execution state (updated by scheduler)
last_run_at TIMESTAMPTZ,
next_run_at TIMESTAMPTZ,
last_task_count INTEGER DEFAULT 0,
last_error TEXT,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Indexes for scheduler queries
CREATE INDEX IF NOT EXISTS idx_task_schedules_enabled ON task_schedules(enabled) WHERE enabled = TRUE;
CREATE INDEX IF NOT EXISTS idx_task_schedules_next_run ON task_schedules(next_run_at) WHERE enabled = TRUE;
-- Insert default schedules
INSERT INTO task_schedules (name, role, interval_hours, priority, description, next_run_at)
VALUES
('product_refresh_all', 'product_refresh', 4, 0, 'Generate product refresh tasks for all crawl-enabled stores every 4 hours', NOW()),
('store_discovery_dutchie', 'store_discovery', 24, 5, 'Discover new Dutchie stores daily', NOW()),
('analytics_refresh', 'analytics_refresh', 6, 0, 'Refresh analytics materialized views every 6 hours', NOW())
ON CONFLICT (name) DO NOTHING;
-- Comment for documentation
COMMENT ON TABLE task_schedules IS 'Database-driven task scheduler configuration. Per TASK_WORKFLOW_2024-12-10.md:
- Schedules persist in DB (survive restarts)
- Uses SELECT FOR UPDATE SKIP LOCKED for multi-replica safety
- Scheduler polls every 60s and executes due schedules
- Creates tasks in worker_tasks for task-worker.ts to process';


@@ -1,58 +0,0 @@
-- Migration 080: Raw Crawl Payloads Metadata Table
-- Per TASK_WORKFLOW_2024-12-10.md: Store full GraphQL payloads for historical analysis
--
-- Design Pattern: Metadata/Payload Separation
-- - Metadata (this table): Small, indexed, queryable
-- - Payload (filesystem): Gzipped JSON at storage_path
--
-- Benefits:
-- - Compare any two crawls to see what changed
-- - Replay/re-normalize historical data if logic changes
-- - Debug issues by seeing exactly what the API returned
-- - DB stays small, backups stay fast
--
-- Storage location: /storage/payloads/{year}/{month}/{day}/store_{id}_{timestamp}.json.gz
-- Compression: ~90% reduction (1.5MB -> 150KB per crawl)
CREATE TABLE IF NOT EXISTS raw_crawl_payloads (
id SERIAL PRIMARY KEY,
-- Links to crawl tracking
crawl_run_id INTEGER REFERENCES crawl_runs(id) ON DELETE SET NULL,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
-- File location (gzipped JSON)
storage_path TEXT NOT NULL,
-- Metadata for quick queries without loading file
product_count INTEGER NOT NULL DEFAULT 0,
size_bytes INTEGER, -- Compressed size
size_bytes_raw INTEGER, -- Uncompressed size
-- Timestamps
fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Optional: checksum for integrity verification
checksum_sha256 VARCHAR(64)
);
-- Indexes for common queries
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_dispensary
ON raw_crawl_payloads(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_dispensary_fetched
ON raw_crawl_payloads(dispensary_id, fetched_at DESC);
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_fetched
ON raw_crawl_payloads(fetched_at DESC);
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_crawl_run
ON raw_crawl_payloads(crawl_run_id)
WHERE crawl_run_id IS NOT NULL;
-- Comments
COMMENT ON TABLE raw_crawl_payloads IS 'Metadata for raw GraphQL payloads stored on filesystem. Per TASK_WORKFLOW_2024-12-10.md: Full payloads enable historical diffs and replay.';
COMMENT ON COLUMN raw_crawl_payloads.storage_path IS 'Path to gzipped JSON file, e.g. /storage/payloads/2024/12/10/store_123_1702234567.json.gz';
COMMENT ON COLUMN raw_crawl_payloads.size_bytes IS 'Compressed file size in bytes';
COMMENT ON COLUMN raw_crawl_payloads.size_bytes_raw IS 'Uncompressed payload size in bytes';


@@ -1,37 +0,0 @@
-- Migration 081: Payload Fetch Columns
-- Per TASK_WORKFLOW_2024-12-10.md: Separates API fetch from data processing
--
-- New architecture:
-- - payload_fetch: Hits Dutchie API, saves raw payload to disk
-- - product_refresh: Reads local payload, normalizes, upserts to DB
--
-- This migration adds:
-- 1. payload column to worker_tasks (for task chaining data)
-- 2. processed_at column to raw_crawl_payloads (track when payload was processed)
-- 3. last_fetch_at column to dispensaries (track when last payload was fetched)
-- Add payload column to worker_tasks for task chaining
-- Used by payload_fetch to pass payload_id to product_refresh
ALTER TABLE worker_tasks
ADD COLUMN IF NOT EXISTS payload JSONB DEFAULT NULL;
COMMENT ON COLUMN worker_tasks.payload IS 'Per TASK_WORKFLOW_2024-12-10.md: Task chaining data (e.g., payload_id from payload_fetch to product_refresh)';
-- Add processed_at to raw_crawl_payloads
-- Tracks when the payload was processed by product_refresh
ALTER TABLE raw_crawl_payloads
ADD COLUMN IF NOT EXISTS processed_at TIMESTAMPTZ DEFAULT NULL;
COMMENT ON COLUMN raw_crawl_payloads.processed_at IS 'When this payload was processed by product_refresh handler';
-- Index for finding unprocessed payloads
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_unprocessed
ON raw_crawl_payloads(dispensary_id, fetched_at DESC)
WHERE processed_at IS NULL;
-- Add last_fetch_at to dispensaries
-- Tracks when the last payload was fetched (separate from last_crawl_at which is when processing completed)
ALTER TABLE dispensaries
ADD COLUMN IF NOT EXISTS last_fetch_at TIMESTAMPTZ DEFAULT NULL;
COMMENT ON COLUMN dispensaries.last_fetch_at IS 'Per TASK_WORKFLOW_2024-12-10.md: When last payload was fetched from API (separate from last_crawl_at which is when processing completed)';
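
Taken together, these three columns enable a fetch-then-process chain. A sketch of both halves, assuming a `pg` Pool (the enqueue shape is illustrative, not the repo's handler code):

```typescript
import { Pool } from "pg";

const pool = new Pool();

// End of a payload_fetch task: record the fetch time and chain a
// product_refresh task that carries payload_id via the new payload column.
async function chainProductRefresh(dispensaryId: number, payloadId: number): Promise<void> {
  await pool.query(
    `INSERT INTO worker_tasks (role, dispensary_id, status, payload)
     VALUES ('product_refresh', $1, 'pending', $2::jsonb)`,
    [dispensaryId, JSON.stringify({ payload_id: payloadId })]
  );
  await pool.query(`UPDATE dispensaries SET last_fetch_at = NOW() WHERE id = $1`, [dispensaryId]);
}

// The partial index makes "what still needs processing?" a cheap query.
async function unprocessedPayloads(dispensaryId: number) {
  const { rows } = await pool.query(
    `SELECT id, storage_path
       FROM raw_crawl_payloads
      WHERE dispensary_id = $1 AND processed_at IS NULL
      ORDER BY fetched_at DESC`,
    [dispensaryId]
  );
  return rows;
}
```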

@@ -1,27 +0,0 @@
-- Migration: 082_proxy_notification_trigger
-- Date: 2024-12-11
-- Description: Add PostgreSQL NOTIFY trigger to alert workers when proxies are added
-- Create function to notify workers when active proxy is added/activated
CREATE OR REPLACE FUNCTION notify_proxy_added()
RETURNS TRIGGER AS $$
BEGIN
-- Only notify if proxy is active
IF NEW.active = true THEN
PERFORM pg_notify('proxy_added', NEW.id::text);
END IF;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Drop existing trigger if any
DROP TRIGGER IF EXISTS proxy_added_trigger ON proxies;
-- Create trigger on insert and update of active column
CREATE TRIGGER proxy_added_trigger
AFTER INSERT OR UPDATE OF active ON proxies
FOR EACH ROW
EXECUTE FUNCTION notify_proxy_added();
COMMENT ON FUNCTION notify_proxy_added() IS
'Sends PostgreSQL NOTIFY to proxy_added channel when an active proxy is added or activated. Workers LISTEN on this channel to wake up immediately.';
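
On the worker side, consuming this trigger means one dedicated connection that stays open and LISTENs; a minimal sketch with `pg`:

```typescript
import { Client } from "pg";

// One dedicated, long-lived connection per process; pooled connections
// are unsuitable for LISTEN because the session must stay open.
async function listenForProxies(onProxy: (id: number) => void): Promise<void> {
  const client = new Client();
  await client.connect();
  await client.query("LISTEN proxy_added");
  client.on("notification", (msg) => {
    if (msg.channel === "proxy_added" && msg.payload) {
      onProxy(Number(msg.payload)); // payload is the new proxy's id
    }
  });
}

listenForProxies((id) => console.log(`proxy ${id} is active, waking up`)).catch(console.error);
```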

@@ -1,88 +0,0 @@
-- Migration 083: Discovery Run Tracking
-- Tracks progress of store discovery runs step-by-step
-- Main discovery runs table
CREATE TABLE IF NOT EXISTS discovery_runs (
id SERIAL PRIMARY KEY,
platform VARCHAR(50) NOT NULL DEFAULT 'dutchie',
status VARCHAR(20) NOT NULL DEFAULT 'running', -- running, completed, failed
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
finished_at TIMESTAMPTZ,
task_id INTEGER REFERENCES worker_task_queue(id),
-- Totals
states_total INTEGER DEFAULT 0,
states_completed INTEGER DEFAULT 0,
locations_discovered INTEGER DEFAULT 0,
locations_promoted INTEGER DEFAULT 0,
new_store_ids INTEGER[] DEFAULT '{}',
-- Error info
error_message TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Per-state progress within a run
CREATE TABLE IF NOT EXISTS discovery_run_states (
id SERIAL PRIMARY KEY,
run_id INTEGER NOT NULL REFERENCES discovery_runs(id) ON DELETE CASCADE,
state_code VARCHAR(2) NOT NULL,
status VARCHAR(20) NOT NULL DEFAULT 'pending', -- pending, running, completed, failed
started_at TIMESTAMPTZ,
finished_at TIMESTAMPTZ,
-- Results
cities_found INTEGER DEFAULT 0,
locations_found INTEGER DEFAULT 0,
locations_upserted INTEGER DEFAULT 0,
new_dispensary_ids INTEGER[] DEFAULT '{}',
-- Error info
error_message TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(run_id, state_code)
);
-- Step-by-step log for detailed progress tracking
CREATE TABLE IF NOT EXISTS discovery_run_steps (
id SERIAL PRIMARY KEY,
run_id INTEGER NOT NULL REFERENCES discovery_runs(id) ON DELETE CASCADE,
state_code VARCHAR(2),
step_name VARCHAR(100) NOT NULL,
status VARCHAR(20) NOT NULL DEFAULT 'started', -- started, completed, failed
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
finished_at TIMESTAMPTZ,
-- Details (JSON for flexibility)
details JSONB DEFAULT '{}',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Indexes for querying
CREATE INDEX IF NOT EXISTS idx_discovery_runs_status ON discovery_runs(status);
CREATE INDEX IF NOT EXISTS idx_discovery_runs_platform ON discovery_runs(platform);
CREATE INDEX IF NOT EXISTS idx_discovery_runs_started_at ON discovery_runs(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_discovery_run_states_run_id ON discovery_run_states(run_id);
CREATE INDEX IF NOT EXISTS idx_discovery_run_steps_run_id ON discovery_run_steps(run_id);
-- View for latest run status per platform
CREATE OR REPLACE VIEW v_latest_discovery_runs AS
SELECT DISTINCT ON (platform)
id,
platform,
status,
started_at,
finished_at,
states_total,
states_completed,
locations_discovered,
locations_promoted,
array_length(new_store_ids, 1) as new_stores_count,
error_message,
EXTRACT(EPOCH FROM (COALESCE(finished_at, NOW()) - started_at)) as duration_seconds
FROM discovery_runs
ORDER BY platform, started_at DESC;
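
Dashboards can read run progress straight from the view; for example (assuming a `pg` Pool):

```typescript
import { Pool } from "pg";

const pool = new Pool();

// DISTINCT ON in the view guarantees at most one row per platform.
async function latestDiscoveryRun(platform = "dutchie") {
  const { rows } = await pool.query(
    `SELECT status, states_completed, states_total,
            locations_discovered, new_stores_count, duration_seconds
       FROM v_latest_discovery_runs
      WHERE platform = $1`,
    [platform]
  );
  return rows[0] ?? null;
}
```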

@@ -1,253 +0,0 @@
-- Migration 084: Dual Transport Preflight System
-- Workers run both curl and http (Puppeteer) preflights on startup
-- Tasks can require a specific transport method
-- ===================================================================
-- PART 1: Add preflight columns to worker_registry
-- ===================================================================
-- Preflight status for curl/axios transport (proxy-based)
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS preflight_curl_status VARCHAR(20) DEFAULT 'pending';
-- Preflight status for http/Puppeteer transport (browser-based)
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS preflight_http_status VARCHAR(20) DEFAULT 'pending';
-- Timestamps for when each preflight completed
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS preflight_curl_at TIMESTAMPTZ;
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS preflight_http_at TIMESTAMPTZ;
-- Error messages for failed preflights
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS preflight_curl_error TEXT;
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS preflight_http_error TEXT;
-- Response time for successful preflights (ms)
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS preflight_curl_ms INTEGER;
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS preflight_http_ms INTEGER;
-- Constraints for preflight status values
ALTER TABLE worker_registry
DROP CONSTRAINT IF EXISTS valid_preflight_curl_status;
ALTER TABLE worker_registry
ADD CONSTRAINT valid_preflight_curl_status
CHECK (preflight_curl_status IN ('pending', 'passed', 'failed', 'skipped'));
ALTER TABLE worker_registry
DROP CONSTRAINT IF EXISTS valid_preflight_http_status;
ALTER TABLE worker_registry
ADD CONSTRAINT valid_preflight_http_status
CHECK (preflight_http_status IN ('pending', 'passed', 'failed', 'skipped'));
-- ===================================================================
-- PART 2: Add method column to worker_tasks
-- ===================================================================
-- Transport method requirement for the task
-- NULL = no preference (any worker can claim)
-- 'curl' = requires curl/axios transport (proxy-based, fast)
-- 'http' = requires http/Puppeteer transport (browser-based, anti-detect)
ALTER TABLE worker_tasks
ADD COLUMN IF NOT EXISTS method VARCHAR(10);
-- Constraint for valid method values
ALTER TABLE worker_tasks
DROP CONSTRAINT IF EXISTS valid_task_method;
ALTER TABLE worker_tasks
ADD CONSTRAINT valid_task_method
CHECK (method IS NULL OR method IN ('curl', 'http'));
-- Index for method-based task claiming
CREATE INDEX IF NOT EXISTS idx_worker_tasks_method
ON worker_tasks(method)
WHERE status = 'pending';
-- Set default method for all existing pending tasks to 'http'
-- ALL current tasks require Puppeteer/browser-based transport
UPDATE worker_tasks
SET method = 'http'
WHERE method IS NULL;
-- ===================================================================
-- PART 3: Update claim_task function for method compatibility
-- ===================================================================
CREATE OR REPLACE FUNCTION claim_task(
p_role VARCHAR(50),
p_worker_id VARCHAR(100),
p_curl_passed BOOLEAN DEFAULT TRUE,
p_http_passed BOOLEAN DEFAULT FALSE
) RETURNS worker_tasks AS $$
DECLARE
claimed_task worker_tasks;
BEGIN
UPDATE worker_tasks
SET
status = 'claimed',
worker_id = p_worker_id,
claimed_at = NOW(),
updated_at = NOW()
WHERE id = (
SELECT id FROM worker_tasks
WHERE role = p_role
AND status = 'pending'
AND (scheduled_for IS NULL OR scheduled_for <= NOW())
-- Method compatibility: worker must have passed the required preflight
AND (
method IS NULL -- No preference, any worker can claim
OR (method = 'curl' AND p_curl_passed = TRUE)
OR (method = 'http' AND p_http_passed = TRUE)
)
-- Exclude stores that already have an active task
AND (dispensary_id IS NULL OR dispensary_id NOT IN (
SELECT dispensary_id FROM worker_tasks
WHERE status IN ('claimed', 'running')
AND dispensary_id IS NOT NULL
))
ORDER BY priority DESC, created_at ASC
LIMIT 1
FOR UPDATE SKIP LOCKED
)
RETURNING * INTO claimed_task;
RETURN claimed_task;
END;
$$ LANGUAGE plpgsql;
-- ===================================================================
-- PART 4: Update v_active_workers view
-- ===================================================================
DROP VIEW IF EXISTS v_active_workers;
CREATE VIEW v_active_workers AS
SELECT
wr.id,
wr.worker_id,
wr.friendly_name,
wr.role,
wr.status,
wr.pod_name,
wr.hostname,
wr.started_at,
wr.last_heartbeat_at,
wr.last_task_at,
wr.tasks_completed,
wr.tasks_failed,
wr.current_task_id,
-- Preflight status
wr.preflight_curl_status,
wr.preflight_http_status,
wr.preflight_curl_at,
wr.preflight_http_at,
wr.preflight_curl_error,
wr.preflight_http_error,
wr.preflight_curl_ms,
wr.preflight_http_ms,
-- Computed fields
EXTRACT(EPOCH FROM (NOW() - wr.last_heartbeat_at)) as seconds_since_heartbeat,
CASE
WHEN wr.status = 'offline' THEN 'offline'
WHEN wr.last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
WHEN wr.current_task_id IS NOT NULL THEN 'busy'
ELSE 'ready'
END as health_status,
-- Capability flags (can this worker handle curl/http tasks?)
(wr.preflight_curl_status = 'passed') as can_curl,
(wr.preflight_http_status = 'passed') as can_http
FROM worker_registry wr
WHERE wr.status != 'terminated'
ORDER BY wr.status = 'active' DESC, wr.last_heartbeat_at DESC;
-- ===================================================================
-- PART 5: View for task queue with method info
-- ===================================================================
DROP VIEW IF EXISTS v_task_history;
CREATE VIEW v_task_history AS
SELECT
t.id,
t.role,
t.dispensary_id,
d.name as dispensary_name,
t.platform,
t.status,
t.priority,
t.method,
t.worker_id,
t.scheduled_for,
t.claimed_at,
t.started_at,
t.completed_at,
t.error_message,
t.retry_count,
t.created_at,
EXTRACT(EPOCH FROM (t.completed_at - t.started_at)) as duration_sec
FROM worker_tasks t
LEFT JOIN dispensaries d ON d.id = t.dispensary_id
ORDER BY t.created_at DESC;
-- ===================================================================
-- PART 6: Helper function to update worker preflight status
-- ===================================================================
CREATE OR REPLACE FUNCTION update_worker_preflight(
p_worker_id VARCHAR(100),
p_transport VARCHAR(10), -- 'curl' or 'http'
p_status VARCHAR(20), -- 'passed', 'failed', 'skipped'
p_response_ms INTEGER DEFAULT NULL,
p_error TEXT DEFAULT NULL
) RETURNS VOID AS $$
BEGIN
IF p_transport = 'curl' THEN
UPDATE worker_registry
SET
preflight_curl_status = p_status,
preflight_curl_at = NOW(),
preflight_curl_ms = p_response_ms,
preflight_curl_error = p_error,
updated_at = NOW()
WHERE worker_id = p_worker_id;
ELSIF p_transport = 'http' THEN
UPDATE worker_registry
SET
preflight_http_status = p_status,
preflight_http_at = NOW(),
preflight_http_ms = p_response_ms,
preflight_http_error = p_error,
updated_at = NOW()
WHERE worker_id = p_worker_id;
END IF;
END;
$$ LANGUAGE plpgsql;
-- ===================================================================
-- Comments
-- ===================================================================
COMMENT ON COLUMN worker_registry.preflight_curl_status IS 'Status of curl/axios preflight: pending, passed, failed, skipped';
COMMENT ON COLUMN worker_registry.preflight_http_status IS 'Status of http/Puppeteer preflight: pending, passed, failed, skipped';
COMMENT ON COLUMN worker_registry.preflight_curl_at IS 'When curl preflight completed';
COMMENT ON COLUMN worker_registry.preflight_http_at IS 'When http preflight completed';
COMMENT ON COLUMN worker_registry.preflight_curl_error IS 'Error message if curl preflight failed';
COMMENT ON COLUMN worker_registry.preflight_http_error IS 'Error message if http preflight failed';
COMMENT ON COLUMN worker_registry.preflight_curl_ms IS 'Response time of successful curl preflight (ms)';
COMMENT ON COLUMN worker_registry.preflight_http_ms IS 'Response time of successful http preflight (ms)';
COMMENT ON COLUMN worker_tasks.method IS 'Transport method required: NULL=any, curl=proxy-based, http=browser-based';
COMMENT ON FUNCTION claim_task IS 'Atomically claim a task, respecting method requirements and per-store locking';
COMMENT ON FUNCTION update_worker_preflight IS 'Update a worker''s preflight status for a given transport';
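
A worker's claim loop would pass its own preflight results into `claim_task`, so it is only handed tasks it can serve; a sketch (the role string and return handling are illustrative):

```typescript
import { Pool } from "pg";

const pool = new Pool();

// claim_task only hands back tasks whose required method this worker passed
// preflight for (or tasks with no method preference at all).
async function claimNext(role: string, workerId: string, canCurl: boolean, canHttp: boolean) {
  const { rows } = await pool.query(
    `SELECT * FROM claim_task($1, $2, $3, $4)`,
    [role, workerId, canCurl, canHttp]
  );
  // claim_task returns a worker_tasks row; it is all-NULL when nothing was claimable.
  return rows[0]?.id != null ? rows[0] : null;
}
```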

@@ -1,168 +0,0 @@
-- Migration 085: Add IP and fingerprint columns for preflight reporting
-- These columns were missing from migration 084
-- ===================================================================
-- PART 1: Add IP address columns to worker_registry
-- ===================================================================
-- IP address detected during curl/axios preflight
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS curl_ip VARCHAR(45);
-- IP address detected during http/Puppeteer preflight
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS http_ip VARCHAR(45);
-- ===================================================================
-- PART 2: Add fingerprint data column
-- ===================================================================
-- Browser fingerprint data captured during Puppeteer preflight
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS fingerprint_data JSONB;
-- ===================================================================
-- PART 3: Add combined preflight status/timestamp for convenience
-- ===================================================================
-- Overall preflight status (computed from both transports)
-- Values: 'pending', 'passed', 'partial', 'failed'
-- - 'pending': neither transport tested
-- - 'passed': both transports passed (or http passed for browser-only)
-- - 'partial': at least one passed
-- - 'failed': no transport passed
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS preflight_status VARCHAR(20) DEFAULT 'pending';
-- Most recent preflight completion timestamp
ALTER TABLE worker_registry
ADD COLUMN IF NOT EXISTS preflight_at TIMESTAMPTZ;
-- ===================================================================
-- PART 4: Update function to set preflight status
-- ===================================================================
CREATE OR REPLACE FUNCTION update_worker_preflight(
p_worker_id VARCHAR(100),
p_transport VARCHAR(10), -- 'curl' or 'http'
p_status VARCHAR(20), -- 'passed', 'failed', 'skipped'
p_ip VARCHAR(45) DEFAULT NULL,
p_response_ms INTEGER DEFAULT NULL,
p_error TEXT DEFAULT NULL,
p_fingerprint JSONB DEFAULT NULL
) RETURNS VOID AS $$
DECLARE
v_curl_status VARCHAR(20);
v_http_status VARCHAR(20);
v_overall_status VARCHAR(20);
BEGIN
IF p_transport = 'curl' THEN
UPDATE worker_registry
SET
preflight_curl_status = p_status,
preflight_curl_at = NOW(),
preflight_curl_ms = p_response_ms,
preflight_curl_error = p_error,
curl_ip = p_ip,
updated_at = NOW()
WHERE worker_id = p_worker_id;
ELSIF p_transport = 'http' THEN
UPDATE worker_registry
SET
preflight_http_status = p_status,
preflight_http_at = NOW(),
preflight_http_ms = p_response_ms,
preflight_http_error = p_error,
http_ip = p_ip,
fingerprint_data = COALESCE(p_fingerprint, fingerprint_data),
updated_at = NOW()
WHERE worker_id = p_worker_id;
END IF;
-- Update overall preflight status
SELECT preflight_curl_status, preflight_http_status
INTO v_curl_status, v_http_status
FROM worker_registry
WHERE worker_id = p_worker_id;
-- Compute overall status
IF v_curl_status = 'passed' AND v_http_status = 'passed' THEN
v_overall_status := 'passed';
ELSIF v_curl_status = 'passed' OR v_http_status = 'passed' THEN
v_overall_status := 'partial';
ELSIF v_curl_status = 'failed' OR v_http_status = 'failed' THEN
v_overall_status := 'failed';
ELSE
v_overall_status := 'pending';
END IF;
UPDATE worker_registry
SET
preflight_status = v_overall_status,
preflight_at = NOW()
WHERE worker_id = p_worker_id;
END;
$$ LANGUAGE plpgsql;
-- ===================================================================
-- PART 5: Update v_active_workers view
-- ===================================================================
DROP VIEW IF EXISTS v_active_workers;
CREATE VIEW v_active_workers AS
SELECT
wr.id,
wr.worker_id,
wr.friendly_name,
wr.role,
wr.status,
wr.pod_name,
wr.hostname,
wr.started_at,
wr.last_heartbeat_at,
wr.last_task_at,
wr.tasks_completed,
wr.tasks_failed,
wr.current_task_id,
-- IP addresses from preflights
wr.curl_ip,
wr.http_ip,
-- Combined preflight status
wr.preflight_status,
wr.preflight_at,
-- Detailed preflight status per transport
wr.preflight_curl_status,
wr.preflight_http_status,
wr.preflight_curl_at,
wr.preflight_http_at,
wr.preflight_curl_error,
wr.preflight_http_error,
wr.preflight_curl_ms,
wr.preflight_http_ms,
-- Fingerprint data
wr.fingerprint_data,
-- Computed fields
EXTRACT(EPOCH FROM (NOW() - wr.last_heartbeat_at)) as seconds_since_heartbeat,
CASE
WHEN wr.status = 'offline' THEN 'offline'
WHEN wr.last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
WHEN wr.current_task_id IS NOT NULL THEN 'busy'
ELSE 'ready'
END as health_status,
-- Capability flags (can this worker handle curl/http tasks?)
(wr.preflight_curl_status = 'passed') as can_curl,
(wr.preflight_http_status = 'passed') as can_http
FROM worker_registry wr
WHERE wr.status != 'terminated'
ORDER BY wr.status = 'active' DESC, wr.last_heartbeat_at DESC;
-- ===================================================================
-- Comments
-- ===================================================================
COMMENT ON COLUMN worker_registry.curl_ip IS 'IP address detected during curl/axios preflight';
COMMENT ON COLUMN worker_registry.http_ip IS 'IP address detected during Puppeteer preflight';
COMMENT ON COLUMN worker_registry.fingerprint_data IS 'Browser fingerprint captured during Puppeteer preflight';
COMMENT ON COLUMN worker_registry.preflight_status IS 'Overall preflight status: pending, passed, partial, failed';
COMMENT ON COLUMN worker_registry.preflight_at IS 'Most recent preflight completion timestamp';
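
Reporting a preflight result from a worker is then a single function call; a sketch matching the signature above:

```typescript
import { Pool } from "pg";

const pool = new Pool();

async function reportPreflight(
  workerId: string,
  transport: "curl" | "http",
  ok: boolean,
  ip: string | null,
  responseMs: number | null,
  error: string | null = null,
  fingerprint: Record<string, unknown> | null = null
): Promise<void> {
  // Updates the per-transport columns and recomputes the combined
  // preflight_status (pending / passed / partial / failed).
  await pool.query(
    `SELECT update_worker_preflight($1, $2, $3, $4, $5, $6, $7)`,
    [workerId, transport, ok ? "passed" : "failed", ip, responseMs, error,
     fingerprint ? JSON.stringify(fingerprint) : null]
  );
}
```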

@@ -1,10 +0,0 @@
-- Migration 086: Add proxy_url column for alternative URL formats
-- Some proxy providers use non-standard URL formats (e.g., host:port:user:pass)
-- This column allows storing the raw URL directly
-- Add proxy_url column - if set, used directly instead of constructing from parts
ALTER TABLE proxies
ADD COLUMN IF NOT EXISTS proxy_url TEXT;
-- Add comment
COMMENT ON COLUMN proxies.proxy_url IS 'Raw proxy URL (if provider uses non-standard format). Takes precedence over constructed URL from host/port/user/pass.';
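
Normalizing a provider's `host:port:user:pass` string into something storable in `proxy_url` is straightforward; a hypothetical helper (not repo code):

```typescript
// Hypothetical helper: normalize a provider's host:port:user:pass string
// into a standard URL before storing it in proxies.proxy_url.
function toProxyUrl(raw: string, scheme = "http"): string {
  const [host, port, user, pass] = raw.split(":");
  return user && pass
    ? `${scheme}://${encodeURIComponent(user)}:${encodeURIComponent(pass)}@${host}:${port}`
    : `${scheme}://${host}:${port}`;
}

// toProxyUrl("proxy1.example.net:8080:alice:s3cret")
//   -> "http://alice:s3cret@proxy1.example.net:8080"
```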

@@ -1,30 +0,0 @@
-- Migration 088: Extend raw_crawl_payloads for discovery payloads
--
-- Enables saving raw store data from Dutchie discovery crawls.
-- Store discovery returns raw dispensary objects - save them for historical analysis.
-- Add payload_type to distinguish product crawls from discovery crawls
ALTER TABLE raw_crawl_payloads
ADD COLUMN IF NOT EXISTS payload_type VARCHAR(32) NOT NULL DEFAULT 'product';
-- Add state_code for discovery payloads (null for product payloads)
ALTER TABLE raw_crawl_payloads
ADD COLUMN IF NOT EXISTS state_code VARCHAR(10);
-- Add store_count for discovery payloads (alternative to product_count)
ALTER TABLE raw_crawl_payloads
ADD COLUMN IF NOT EXISTS store_count INTEGER;
-- Make dispensary_id nullable for discovery payloads
ALTER TABLE raw_crawl_payloads
ALTER COLUMN dispensary_id DROP NOT NULL;
-- Add index for discovery payload queries
CREATE INDEX IF NOT EXISTS idx_raw_crawl_payloads_type_state
ON raw_crawl_payloads(payload_type, state_code)
WHERE payload_type = 'store_discovery';
-- Comments
COMMENT ON COLUMN raw_crawl_payloads.payload_type IS 'Type: product (default), store_discovery';
COMMENT ON COLUMN raw_crawl_payloads.state_code IS 'State code for discovery payloads (e.g., AZ, MI)';
COMMENT ON COLUMN raw_crawl_payloads.store_count IS 'Number of stores in discovery payload';
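
The partial index targets lookups of the newest discovery snapshot per state, e.g. (assuming a `pg` Pool):

```typescript
import { Pool } from "pg";

const pool = new Pool();

// The shape of query the partial index above is built for.
async function latestDiscoveryPayload(stateCode: string) {
  const { rows } = await pool.query(
    `SELECT id, storage_path, store_count, fetched_at
       FROM raw_crawl_payloads
      WHERE payload_type = 'store_discovery' AND state_code = $1
      ORDER BY fetched_at DESC
      LIMIT 1`,
    [stateCode]
  );
  return rows[0] ?? null;
}
```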

@@ -1,105 +0,0 @@
-- Migration 089: Immutable Schedules with Per-State Product Discovery
--
-- Key changes:
-- 1. Add is_immutable column - schedules can be edited but not deleted
-- 2. Add method column - all tasks use 'http' (Puppeteer transport)
-- 3. Store discovery weekly (168h)
-- 4. Per-state product_discovery schedules (4h default)
-- 5. Remove old payload_fetch schedules
-- =====================================================
-- 1) Add new columns to task_schedules
-- =====================================================
ALTER TABLE task_schedules
ADD COLUMN IF NOT EXISTS is_immutable BOOLEAN DEFAULT FALSE;
ALTER TABLE task_schedules
ADD COLUMN IF NOT EXISTS method VARCHAR(10) DEFAULT 'http';
-- =====================================================
-- 2) Update store_discovery to weekly and immutable
-- =====================================================
UPDATE task_schedules
SET interval_hours = 168, -- 7 days
is_immutable = TRUE,
method = 'http',
description = 'Discover new Dutchie stores weekly (HTTP transport)'
WHERE name = 'store_discovery_dutchie';
-- Insert if it doesn't exist
INSERT INTO task_schedules (name, role, interval_hours, priority, description, is_immutable, method, platform, next_run_at)
VALUES ('store_discovery_dutchie', 'store_discovery', 168, 5, 'Discover new Dutchie stores weekly (HTTP transport)', TRUE, 'http', 'dutchie', NOW())
ON CONFLICT (name) DO UPDATE SET
interval_hours = 168,
is_immutable = TRUE,
method = 'http',
description = 'Discover new Dutchie stores weekly (HTTP transport)';
-- =====================================================
-- 3) Remove old payload_fetch and product_refresh_all schedules
-- =====================================================
DELETE FROM task_schedules WHERE name IN ('payload_fetch_all', 'product_refresh_all');
-- =====================================================
-- 4) Create per-state product_discovery schedules
-- =====================================================
-- One schedule per state that has dispensaries with active cannabis programs
INSERT INTO task_schedules (name, role, state_code, interval_hours, priority, description, is_immutable, method, enabled, next_run_at)
SELECT
'product_discovery_' || lower(s.code) AS name,
'product_discovery' AS role,
s.code AS state_code,
4 AS interval_hours, -- 4 hours default, editable
10 AS priority,
'Product discovery for ' || s.name || ' dispensaries (HTTP transport)' AS description,
TRUE AS is_immutable, -- Can edit but not delete
'http' AS method,
CASE WHEN s.is_active THEN TRUE ELSE FALSE END AS enabled,
-- Stagger start times: each state starts 5 minutes after the previous
NOW() + (ROW_NUMBER() OVER (ORDER BY s.code) * INTERVAL '5 minutes') AS next_run_at
FROM states s
WHERE EXISTS (
SELECT 1 FROM dispensaries d
WHERE d.state_id = s.id AND d.crawl_enabled = true
)
ON CONFLICT (name) DO UPDATE SET
is_immutable = TRUE,
method = 'http',
description = EXCLUDED.description;
-- Also create schedules for states that might have stores discovered later
INSERT INTO task_schedules (name, role, state_code, interval_hours, priority, description, is_immutable, method, enabled, next_run_at)
SELECT
'product_discovery_' || lower(s.code) AS name,
'product_discovery' AS role,
s.code AS state_code,
4 AS interval_hours,
10 AS priority,
'Product discovery for ' || s.name || ' dispensaries (HTTP transport)' AS description,
TRUE AS is_immutable,
'http' AS method,
FALSE AS enabled, -- Disabled until stores exist
NOW() + INTERVAL '1 hour'
FROM states s
WHERE NOT EXISTS (
SELECT 1 FROM task_schedules ts WHERE ts.name = 'product_discovery_' || lower(s.code)
)
ON CONFLICT (name) DO NOTHING;
-- =====================================================
-- 5) Make analytics_refresh immutable
-- =====================================================
UPDATE task_schedules
SET is_immutable = TRUE, method = 'http'
WHERE name = 'analytics_refresh';
-- =====================================================
-- 6) Add index for schedule lookups
-- =====================================================
CREATE INDEX IF NOT EXISTS idx_task_schedules_state_code
ON task_schedules(state_code)
WHERE state_code IS NOT NULL;
-- Comments
COMMENT ON COLUMN task_schedules.is_immutable IS 'If TRUE, schedule cannot be deleted (only edited)';
COMMENT ON COLUMN task_schedules.method IS 'Transport method: http (Puppeteer/browser) or curl (axios)';
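
Note that `is_immutable` is advisory: Postgres itself does not block the DELETE, so the API layer has to. A hypothetical service-layer guard:

```typescript
import { Pool } from "pg";

const pool = new Pool();

async function deleteSchedule(id: number): Promise<void> {
  const { rows } = await pool.query(
    `SELECT is_immutable FROM task_schedules WHERE id = $1`,
    [id]
  );
  if (!rows[0]) throw new Error("schedule not found");
  if (rows[0].is_immutable) {
    throw new Error("schedule is immutable: edit its interval or enabled flag instead of deleting");
  }
  await pool.query(`DELETE FROM task_schedules WHERE id = $1`, [id]);
}
```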

@@ -1,6 +1,6 @@
{
"name": "dutchie-menus-backend",
- "version": "1.6.0",
+ "version": "1.5.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
@@ -46,97 +46,6 @@
"resolved": "https://registry.npmjs.org/@ioredis/commands/-/commands-1.4.0.tgz", "resolved": "https://registry.npmjs.org/@ioredis/commands/-/commands-1.4.0.tgz",
"integrity": "sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ==" "integrity": "sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ=="
}, },
"node_modules/@jsep-plugin/assignment": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/@jsep-plugin/assignment/-/assignment-1.3.0.tgz",
"integrity": "sha512-VVgV+CXrhbMI3aSusQyclHkenWSAm95WaiKrMxRFam3JSUiIaQjoMIw2sEs/OX4XifnqeQUN4DYbJjlA8EfktQ==",
"engines": {
"node": ">= 10.16.0"
},
"peerDependencies": {
"jsep": "^0.4.0||^1.0.0"
}
},
"node_modules/@jsep-plugin/regex": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/@jsep-plugin/regex/-/regex-1.0.4.tgz",
"integrity": "sha512-q7qL4Mgjs1vByCaTnDFcBnV9HS7GVPJX5vyVoCgZHNSC9rjwIlmbXG5sUuorR5ndfHAIlJ8pVStxvjXHbNvtUg==",
"engines": {
"node": ">= 10.16.0"
},
"peerDependencies": {
"jsep": "^0.4.0||^1.0.0"
}
},
"node_modules/@kubernetes/client-node": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/@kubernetes/client-node/-/client-node-1.4.0.tgz",
"integrity": "sha512-Zge3YvF7DJi264dU1b3wb/GmzR99JhUpqTvp+VGHfwZT+g7EOOYNScDJNZwXy9cszyIGPIs0VHr+kk8e95qqrA==",
"dependencies": {
"@types/js-yaml": "^4.0.1",
"@types/node": "^24.0.0",
"@types/node-fetch": "^2.6.13",
"@types/stream-buffers": "^3.0.3",
"form-data": "^4.0.0",
"hpagent": "^1.2.0",
"isomorphic-ws": "^5.0.0",
"js-yaml": "^4.1.0",
"jsonpath-plus": "^10.3.0",
"node-fetch": "^2.7.0",
"openid-client": "^6.1.3",
"rfc4648": "^1.3.0",
"socks-proxy-agent": "^8.0.4",
"stream-buffers": "^3.0.2",
"tar-fs": "^3.0.9",
"ws": "^8.18.2"
}
},
"node_modules/@kubernetes/client-node/node_modules/@types/node": {
"version": "24.10.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.3.tgz",
"integrity": "sha512-gqkrWUsS8hcm0r44yn7/xZeV1ERva/nLgrLxFRUGb7aoNMIJfZJ3AC261zDQuOAKC7MiXai1WCpYc48jAHoShQ==",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"node_modules/@kubernetes/client-node/node_modules/tar-fs": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz",
"integrity": "sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==",
"dependencies": {
"pump": "^3.0.0",
"tar-stream": "^3.1.5"
},
"optionalDependencies": {
"bare-fs": "^4.0.1",
"bare-path": "^3.0.0"
}
},
"node_modules/@kubernetes/client-node/node_modules/undici-types": {
"version": "7.16.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
"integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="
},
"node_modules/@kubernetes/client-node/node_modules/ws": {
"version": "8.18.3",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
"integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
"engines": {
"node": ">=10.0.0"
},
"peerDependencies": {
"bufferutil": "^4.0.1",
"utf-8-validate": ">=5.0.2"
},
"peerDependenciesMeta": {
"bufferutil": {
"optional": true
},
"utf-8-validate": {
"optional": true
}
}
},
"node_modules/@mapbox/node-pre-gyp": { "node_modules/@mapbox/node-pre-gyp": {
"version": "1.0.11", "version": "1.0.11",
"resolved": "https://registry.npmjs.org/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz", "resolved": "https://registry.npmjs.org/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz",
@@ -342,11 +251,6 @@
"integrity": "sha512-r8Tayk8HJnX0FztbZN7oVqGccWgw98T/0neJphO91KkmOzug1KkofZURD4UaD5uH8AqcFLfdPErnBod0u71/qg==", "integrity": "sha512-r8Tayk8HJnX0FztbZN7oVqGccWgw98T/0neJphO91KkmOzug1KkofZURD4UaD5uH8AqcFLfdPErnBod0u71/qg==",
"dev": true "dev": true
}, },
"node_modules/@types/js-yaml": {
"version": "4.0.9",
"resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz",
"integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg=="
},
"node_modules/@types/jsonwebtoken": { "node_modules/@types/jsonwebtoken": {
"version": "9.0.10", "version": "9.0.10",
"resolved": "https://registry.npmjs.org/@types/jsonwebtoken/-/jsonwebtoken-9.0.10.tgz", "resolved": "https://registry.npmjs.org/@types/jsonwebtoken/-/jsonwebtoken-9.0.10.tgz",
@@ -372,6 +276,7 @@
"version": "20.19.25", "version": "20.19.25",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.25.tgz", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.25.tgz",
"integrity": "sha512-ZsJzA5thDQMSQO788d7IocwwQbI8B5OPzmqNvpf3NY/+MHDAS759Wo0gd2WQeXYt5AAAQjzcrTVC6SKCuYgoCQ==", "integrity": "sha512-ZsJzA5thDQMSQO788d7IocwwQbI8B5OPzmqNvpf3NY/+MHDAS759Wo0gd2WQeXYt5AAAQjzcrTVC6SKCuYgoCQ==",
"devOptional": true,
"dependencies": { "dependencies": {
"undici-types": "~6.21.0" "undici-types": "~6.21.0"
} }
@@ -382,15 +287,6 @@
"integrity": "sha512-0ikrnug3/IyneSHqCBeslAhlK2aBfYek1fGo4bP4QnZPmiqSGRK+Oy7ZMisLWkesffJvQ1cqAcBnJC+8+nxIAg==", "integrity": "sha512-0ikrnug3/IyneSHqCBeslAhlK2aBfYek1fGo4bP4QnZPmiqSGRK+Oy7ZMisLWkesffJvQ1cqAcBnJC+8+nxIAg==",
"dev": true "dev": true
}, },
"node_modules/@types/node-fetch": {
"version": "2.6.13",
"resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.13.tgz",
"integrity": "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==",
"dependencies": {
"@types/node": "*",
"form-data": "^4.0.4"
}
},
"node_modules/@types/pg": { "node_modules/@types/pg": {
"version": "8.15.6", "version": "8.15.6",
"resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.15.6.tgz", "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.15.6.tgz",
@@ -444,14 +340,6 @@
"@types/node": "*" "@types/node": "*"
} }
}, },
"node_modules/@types/stream-buffers": {
"version": "3.0.8",
"resolved": "https://registry.npmjs.org/@types/stream-buffers/-/stream-buffers-3.0.8.tgz",
"integrity": "sha512-J+7VaHKNvlNPJPEJXX/fKa9DZtR/xPMwuIbe+yNOwp1YB+ApUOBv2aUpEoBJEi8nJgbgs1x8e73ttg0r1rSUdw==",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/uuid": { "node_modules/@types/uuid": {
"version": "9.0.8", "version": "9.0.8",
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz", "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz",
@@ -632,78 +520,6 @@
}
}
},
"node_modules/bare-fs": {
"version": "4.5.2",
"resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz",
"integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==",
"optional": true,
"dependencies": {
"bare-events": "^2.5.4",
"bare-path": "^3.0.0",
"bare-stream": "^2.6.4",
"bare-url": "^2.2.2",
"fast-fifo": "^1.3.2"
},
"engines": {
"bare": ">=1.16.0"
},
"peerDependencies": {
"bare-buffer": "*"
},
"peerDependenciesMeta": {
"bare-buffer": {
"optional": true
}
}
},
"node_modules/bare-os": {
"version": "3.6.2",
"resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz",
"integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==",
"optional": true,
"engines": {
"bare": ">=1.14.0"
}
},
"node_modules/bare-path": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
"integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
"optional": true,
"dependencies": {
"bare-os": "^3.0.1"
}
},
"node_modules/bare-stream": {
"version": "2.7.0",
"resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz",
"integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==",
"optional": true,
"dependencies": {
"streamx": "^2.21.0"
},
"peerDependencies": {
"bare-buffer": "*",
"bare-events": "*"
},
"peerDependenciesMeta": {
"bare-buffer": {
"optional": true
},
"bare-events": {
"optional": true
}
}
},
"node_modules/bare-url": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz",
"integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==",
"optional": true,
"dependencies": {
"bare-path": "^3.0.0"
}
},
"node_modules/base64-js": { "node_modules/base64-js": {
"version": "1.5.1", "version": "1.5.1",
"resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
@@ -2203,14 +2019,6 @@
"node": ">=16.0.0" "node": ">=16.0.0"
} }
}, },
"node_modules/hpagent": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/hpagent/-/hpagent-1.2.0.tgz",
"integrity": "sha512-A91dYTeIB6NoXG+PxTQpCCDDnfHsW9kc06Lvpu1TEe9gnd6ZFeiBoRO9JvzEv6xK7EX97/dUE8g/vBMTqTS3CA==",
"engines": {
"node": ">=14"
}
},
"node_modules/htmlparser2": { "node_modules/htmlparser2": {
"version": "10.0.0", "version": "10.0.0",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz", "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz",
@@ -2574,22 +2382,6 @@
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/isomorphic-ws": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/isomorphic-ws/-/isomorphic-ws-5.0.0.tgz",
"integrity": "sha512-muId7Zzn9ywDsyXgTIafTry2sV3nySZeUDe6YedVd1Hvuuep5AsIlqK+XefWpYTyJG5e503F2xIuT2lcU6rCSw==",
"peerDependencies": {
"ws": "*"
}
},
"node_modules/jose": {
"version": "6.1.3",
"resolved": "https://registry.npmjs.org/jose/-/jose-6.1.3.tgz",
"integrity": "sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==",
"funding": {
"url": "https://github.com/sponsors/panva"
}
},
"node_modules/js-tokens": { "node_modules/js-tokens": {
"version": "4.0.0", "version": "4.0.0",
"resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
@@ -2606,14 +2398,6 @@
"js-yaml": "bin/js-yaml.js" "js-yaml": "bin/js-yaml.js"
} }
}, },
"node_modules/jsep": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/jsep/-/jsep-1.4.0.tgz",
"integrity": "sha512-B7qPcEVE3NVkmSJbaYxvv4cHkVW7DQsZz13pUMrfS8z8Q/BuShN+gcTXrUlPiGqM2/t/EEaI030bpxMqY8gMlw==",
"engines": {
"node": ">= 10.16.0"
}
},
"node_modules/json-parse-even-better-errors": { "node_modules/json-parse-even-better-errors": {
"version": "2.3.1", "version": "2.3.1",
"resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz", "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz",
@@ -2635,23 +2419,6 @@
"graceful-fs": "^4.1.6" "graceful-fs": "^4.1.6"
} }
}, },
"node_modules/jsonpath-plus": {
"version": "10.3.0",
"resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz",
"integrity": "sha512-8TNmfeTCk2Le33A3vRRwtuworG/L5RrgMvdjhKZxvyShO+mBu2fP50OWUjRLNtvw344DdDarFh9buFAZs5ujeA==",
"dependencies": {
"@jsep-plugin/assignment": "^1.3.0",
"@jsep-plugin/regex": "^1.0.4",
"jsep": "^1.4.0"
},
"bin": {
"jsonpath": "bin/jsonpath-cli.js",
"jsonpath-plus": "bin/jsonpath-cli.js"
},
"engines": {
"node": ">=18.0.0"
}
},
"node_modules/jsonwebtoken": { "node_modules/jsonwebtoken": {
"version": "9.0.2", "version": "9.0.2",
"resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.2.tgz", "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.2.tgz",
@@ -2726,11 +2493,6 @@
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
"integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==" "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg=="
}, },
"node_modules/lodash.clonedeep": {
"version": "4.5.0",
"resolved": "https://registry.npmjs.org/lodash.clonedeep/-/lodash.clonedeep-4.5.0.tgz",
"integrity": "sha512-H5ZhCF25riFd9uB5UCkVKo61m3S/xZk1x4wA6yp/L3RFP6Z/eHH1ymQcGLo7J3GMPfm0V/7m1tryHuGVxpqEBQ=="
},
"node_modules/lodash.defaults": { "node_modules/lodash.defaults": {
"version": "4.2.0", "version": "4.2.0",
"resolved": "https://registry.npmjs.org/lodash.defaults/-/lodash.defaults-4.2.0.tgz", "resolved": "https://registry.npmjs.org/lodash.defaults/-/lodash.defaults-4.2.0.tgz",
@@ -3180,14 +2942,6 @@
"url": "https://github.com/fb55/nth-check?sponsor=1" "url": "https://github.com/fb55/nth-check?sponsor=1"
} }
}, },
"node_modules/oauth4webapi": {
"version": "3.8.3",
"resolved": "https://registry.npmjs.org/oauth4webapi/-/oauth4webapi-3.8.3.tgz",
"integrity": "sha512-pQ5BsX3QRTgnt5HxgHwgunIRaDXBdkT23tf8dfzmtTIL2LTpdmxgbpbBm0VgFWAIDlezQvQCTgnVIUmHupXHxw==",
"funding": {
"url": "https://github.com/sponsors/panva"
}
},
"node_modules/object-assign": { "node_modules/object-assign": {
"version": "4.1.1", "version": "4.1.1",
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
@@ -3226,18 +2980,6 @@
"wrappy": "1" "wrappy": "1"
} }
}, },
"node_modules/openid-client": {
"version": "6.8.1",
"resolved": "https://registry.npmjs.org/openid-client/-/openid-client-6.8.1.tgz",
"integrity": "sha512-VoYT6enBo6Vj2j3Q5Ec0AezS+9YGzQo1f5Xc42lreMGlfP4ljiXPKVDvCADh+XHCV/bqPu/wWSiCVXbJKvrODw==",
"dependencies": {
"jose": "^6.1.0",
"oauth4webapi": "^3.8.2"
},
"funding": {
"url": "https://github.com/sponsors/panva"
}
},
"node_modules/pac-proxy-agent": { "node_modules/pac-proxy-agent": {
"version": "7.2.0", "version": "7.2.0",
"resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz", "resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
@@ -4141,11 +3883,6 @@
"url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
} }
}, },
"node_modules/rfc4648": {
"version": "1.5.4",
"resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.4.tgz",
"integrity": "sha512-rRg/6Lb+IGfJqO05HZkN50UtY7K/JhxJag1kP23+zyMfrvoB0B7RWv06MbOzoc79RgCdNTiUaNsTT1AJZ7Z+cg=="
},
"node_modules/rimraf": { "node_modules/rimraf": {
"version": "3.0.2", "version": "3.0.2",
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz",
@@ -4576,14 +4313,6 @@
"node": ">= 0.8" "node": ">= 0.8"
} }
}, },
"node_modules/stream-buffers": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/stream-buffers/-/stream-buffers-3.0.3.tgz",
"integrity": "sha512-pqMqwQCso0PBJt2PQmDO0cFj0lyqmiwOMiMSkVtRokl7e+ZTRYgDHKnuZNbqjiJXgsg4nuqtD/zxuo9KqTp0Yw==",
"engines": {
"node": ">= 0.10.0"
}
},
"node_modules/streamx": { "node_modules/streamx": {
"version": "2.23.0", "version": "2.23.0",
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz", "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
@@ -4803,7 +4532,8 @@
"node_modules/undici-types": { "node_modules/undici-types": {
"version": "6.21.0", "version": "6.21.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
"integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==" "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
"devOptional": true
}, },
"node_modules/universalify": { "node_modules/universalify": {
"version": "2.0.1", "version": "2.0.1",
@@ -4826,14 +4556,6 @@
"resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz", "resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz",
"integrity": "sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==" "integrity": "sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg=="
}, },
"node_modules/user-agents": {
"version": "1.1.669",
"resolved": "https://registry.npmjs.org/user-agents/-/user-agents-1.1.669.tgz",
"integrity": "sha512-pbIzG+AOqCaIpySKJ4IAm1l0VyE4jMnK4y1thV8lm8PYxI+7X5uWcppOK7zY79TCKKTAnJH3/4gaVIZHsjrmJA==",
"dependencies": {
"lodash.clonedeep": "^4.5.0"
}
},
"node_modules/util": { "node_modules/util": {
"version": "0.12.5", "version": "0.12.5",
"resolved": "https://registry.npmjs.org/util/-/util-0.12.5.tgz", "resolved": "https://registry.npmjs.org/util/-/util-0.12.5.tgz",

@@ -1,14 +1,13 @@
{
"name": "dutchie-menus-backend",
- "version": "1.6.0",
+ "version": "1.5.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "dutchie-menus-backend",
- "version": "1.6.0",
+ "version": "1.5.1",
"dependencies": {
"@kubernetes/client-node": "^1.4.0",
"@types/bcryptjs": "^3.0.0", "@types/bcryptjs": "^3.0.0",
"axios": "^1.6.2", "axios": "^1.6.2",
"bcrypt": "^5.1.1", "bcrypt": "^5.1.1",
@@ -35,7 +34,6 @@
"puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-extra-plugin-stealth": "^2.11.2",
"sharp": "^0.32.0", "sharp": "^0.32.0",
"socks-proxy-agent": "^8.0.2", "socks-proxy-agent": "^8.0.2",
"user-agents": "^1.1.669",
"uuid": "^9.0.1", "uuid": "^9.0.1",
"zod": "^3.22.4" "zod": "^3.22.4"
}, },
@@ -494,97 +492,6 @@
"resolved": "https://registry.npmjs.org/@ioredis/commands/-/commands-1.4.0.tgz", "resolved": "https://registry.npmjs.org/@ioredis/commands/-/commands-1.4.0.tgz",
"integrity": "sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ==" "integrity": "sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ=="
}, },
"node_modules/@jsep-plugin/assignment": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/@jsep-plugin/assignment/-/assignment-1.3.0.tgz",
"integrity": "sha512-VVgV+CXrhbMI3aSusQyclHkenWSAm95WaiKrMxRFam3JSUiIaQjoMIw2sEs/OX4XifnqeQUN4DYbJjlA8EfktQ==",
"engines": {
"node": ">= 10.16.0"
},
"peerDependencies": {
"jsep": "^0.4.0||^1.0.0"
}
},
"node_modules/@jsep-plugin/regex": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/@jsep-plugin/regex/-/regex-1.0.4.tgz",
"integrity": "sha512-q7qL4Mgjs1vByCaTnDFcBnV9HS7GVPJX5vyVoCgZHNSC9rjwIlmbXG5sUuorR5ndfHAIlJ8pVStxvjXHbNvtUg==",
"engines": {
"node": ">= 10.16.0"
},
"peerDependencies": {
"jsep": "^0.4.0||^1.0.0"
}
},
"node_modules/@kubernetes/client-node": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/@kubernetes/client-node/-/client-node-1.4.0.tgz",
"integrity": "sha512-Zge3YvF7DJi264dU1b3wb/GmzR99JhUpqTvp+VGHfwZT+g7EOOYNScDJNZwXy9cszyIGPIs0VHr+kk8e95qqrA==",
"dependencies": {
"@types/js-yaml": "^4.0.1",
"@types/node": "^24.0.0",
"@types/node-fetch": "^2.6.13",
"@types/stream-buffers": "^3.0.3",
"form-data": "^4.0.0",
"hpagent": "^1.2.0",
"isomorphic-ws": "^5.0.0",
"js-yaml": "^4.1.0",
"jsonpath-plus": "^10.3.0",
"node-fetch": "^2.7.0",
"openid-client": "^6.1.3",
"rfc4648": "^1.3.0",
"socks-proxy-agent": "^8.0.4",
"stream-buffers": "^3.0.2",
"tar-fs": "^3.0.9",
"ws": "^8.18.2"
}
},
"node_modules/@kubernetes/client-node/node_modules/@types/node": {
"version": "24.10.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.3.tgz",
"integrity": "sha512-gqkrWUsS8hcm0r44yn7/xZeV1ERva/nLgrLxFRUGb7aoNMIJfZJ3AC261zDQuOAKC7MiXai1WCpYc48jAHoShQ==",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"node_modules/@kubernetes/client-node/node_modules/tar-fs": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz",
"integrity": "sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==",
"dependencies": {
"pump": "^3.0.0",
"tar-stream": "^3.1.5"
},
"optionalDependencies": {
"bare-fs": "^4.0.1",
"bare-path": "^3.0.0"
}
},
"node_modules/@kubernetes/client-node/node_modules/undici-types": {
"version": "7.16.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
"integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="
},
"node_modules/@kubernetes/client-node/node_modules/ws": {
"version": "8.18.3",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
"integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
"engines": {
"node": ">=10.0.0"
},
"peerDependencies": {
"bufferutil": "^4.0.1",
"utf-8-validate": ">=5.0.2"
},
"peerDependenciesMeta": {
"bufferutil": {
"optional": true
},
"utf-8-validate": {
"optional": true
}
}
},
"node_modules/@mapbox/node-pre-gyp": { "node_modules/@mapbox/node-pre-gyp": {
"version": "1.0.11", "version": "1.0.11",
"resolved": "https://registry.npmjs.org/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz", "resolved": "https://registry.npmjs.org/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz",
@@ -850,11 +757,6 @@
"integrity": "sha512-r8Tayk8HJnX0FztbZN7oVqGccWgw98T/0neJphO91KkmOzug1KkofZURD4UaD5uH8AqcFLfdPErnBod0u71/qg==", "integrity": "sha512-r8Tayk8HJnX0FztbZN7oVqGccWgw98T/0neJphO91KkmOzug1KkofZURD4UaD5uH8AqcFLfdPErnBod0u71/qg==",
"dev": true "dev": true
}, },
"node_modules/@types/js-yaml": {
"version": "4.0.9",
"resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz",
"integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg=="
},
"node_modules/@types/jsonwebtoken": { "node_modules/@types/jsonwebtoken": {
"version": "9.0.10", "version": "9.0.10",
"resolved": "https://registry.npmjs.org/@types/jsonwebtoken/-/jsonwebtoken-9.0.10.tgz", "resolved": "https://registry.npmjs.org/@types/jsonwebtoken/-/jsonwebtoken-9.0.10.tgz",
@@ -880,6 +782,7 @@
"version": "20.19.25", "version": "20.19.25",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.25.tgz", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.25.tgz",
"integrity": "sha512-ZsJzA5thDQMSQO788d7IocwwQbI8B5OPzmqNvpf3NY/+MHDAS759Wo0gd2WQeXYt5AAAQjzcrTVC6SKCuYgoCQ==", "integrity": "sha512-ZsJzA5thDQMSQO788d7IocwwQbI8B5OPzmqNvpf3NY/+MHDAS759Wo0gd2WQeXYt5AAAQjzcrTVC6SKCuYgoCQ==",
"devOptional": true,
"dependencies": { "dependencies": {
"undici-types": "~6.21.0" "undici-types": "~6.21.0"
} }
@@ -890,15 +793,6 @@
"integrity": "sha512-0ikrnug3/IyneSHqCBeslAhlK2aBfYek1fGo4bP4QnZPmiqSGRK+Oy7ZMisLWkesffJvQ1cqAcBnJC+8+nxIAg==", "integrity": "sha512-0ikrnug3/IyneSHqCBeslAhlK2aBfYek1fGo4bP4QnZPmiqSGRK+Oy7ZMisLWkesffJvQ1cqAcBnJC+8+nxIAg==",
"dev": true "dev": true
}, },
"node_modules/@types/node-fetch": {
"version": "2.6.13",
"resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.13.tgz",
"integrity": "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==",
"dependencies": {
"@types/node": "*",
"form-data": "^4.0.4"
}
},
"node_modules/@types/pg": { "node_modules/@types/pg": {
"version": "8.15.6", "version": "8.15.6",
"resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.15.6.tgz", "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.15.6.tgz",
@@ -952,14 +846,6 @@
"@types/node": "*" "@types/node": "*"
} }
}, },
"node_modules/@types/stream-buffers": {
"version": "3.0.8",
"resolved": "https://registry.npmjs.org/@types/stream-buffers/-/stream-buffers-3.0.8.tgz",
"integrity": "sha512-J+7VaHKNvlNPJPEJXX/fKa9DZtR/xPMwuIbe+yNOwp1YB+ApUOBv2aUpEoBJEi8nJgbgs1x8e73ttg0r1rSUdw==",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/uuid": { "node_modules/@types/uuid": {
"version": "9.0.8", "version": "9.0.8",
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz", "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz",
@@ -1140,78 +1026,6 @@
}
}
},
"node_modules/bare-fs": {
"version": "4.5.2",
"resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz",
"integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==",
"optional": true,
"dependencies": {
"bare-events": "^2.5.4",
"bare-path": "^3.0.0",
"bare-stream": "^2.6.4",
"bare-url": "^2.2.2",
"fast-fifo": "^1.3.2"
},
"engines": {
"bare": ">=1.16.0"
},
"peerDependencies": {
"bare-buffer": "*"
},
"peerDependenciesMeta": {
"bare-buffer": {
"optional": true
}
}
},
"node_modules/bare-os": {
"version": "3.6.2",
"resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz",
"integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==",
"optional": true,
"engines": {
"bare": ">=1.14.0"
}
},
"node_modules/bare-path": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
"integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
"optional": true,
"dependencies": {
"bare-os": "^3.0.1"
}
},
"node_modules/bare-stream": {
"version": "2.7.0",
"resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz",
"integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==",
"optional": true,
"dependencies": {
"streamx": "^2.21.0"
},
"peerDependencies": {
"bare-buffer": "*",
"bare-events": "*"
},
"peerDependenciesMeta": {
"bare-buffer": {
"optional": true
},
"bare-events": {
"optional": true
}
}
},
"node_modules/bare-url": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz",
"integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==",
"optional": true,
"dependencies": {
"bare-path": "^3.0.0"
}
},
"node_modules/base64-js": { "node_modules/base64-js": {
"version": "1.5.1", "version": "1.5.1",
"resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
@@ -2725,14 +2539,6 @@
"node": ">=16.0.0" "node": ">=16.0.0"
} }
}, },
"node_modules/hpagent": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/hpagent/-/hpagent-1.2.0.tgz",
"integrity": "sha512-A91dYTeIB6NoXG+PxTQpCCDDnfHsW9kc06Lvpu1TEe9gnd6ZFeiBoRO9JvzEv6xK7EX97/dUE8g/vBMTqTS3CA==",
"engines": {
"node": ">=14"
}
},
"node_modules/htmlparser2": { "node_modules/htmlparser2": {
"version": "10.0.0", "version": "10.0.0",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz", "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz",
@@ -3096,22 +2902,6 @@
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/isomorphic-ws": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/isomorphic-ws/-/isomorphic-ws-5.0.0.tgz",
"integrity": "sha512-muId7Zzn9ywDsyXgTIafTry2sV3nySZeUDe6YedVd1Hvuuep5AsIlqK+XefWpYTyJG5e503F2xIuT2lcU6rCSw==",
"peerDependencies": {
"ws": "*"
}
},
"node_modules/jose": {
"version": "6.1.3",
"resolved": "https://registry.npmjs.org/jose/-/jose-6.1.3.tgz",
"integrity": "sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==",
"funding": {
"url": "https://github.com/sponsors/panva"
}
},
"node_modules/js-tokens": { "node_modules/js-tokens": {
"version": "4.0.0", "version": "4.0.0",
"resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
@@ -3128,14 +2918,6 @@
"js-yaml": "bin/js-yaml.js" "js-yaml": "bin/js-yaml.js"
} }
}, },
"node_modules/jsep": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/jsep/-/jsep-1.4.0.tgz",
"integrity": "sha512-B7qPcEVE3NVkmSJbaYxvv4cHkVW7DQsZz13pUMrfS8z8Q/BuShN+gcTXrUlPiGqM2/t/EEaI030bpxMqY8gMlw==",
"engines": {
"node": ">= 10.16.0"
}
},
"node_modules/json-parse-even-better-errors": { "node_modules/json-parse-even-better-errors": {
"version": "2.3.1", "version": "2.3.1",
"resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz", "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz",
@@ -3157,23 +2939,6 @@
"graceful-fs": "^4.1.6" "graceful-fs": "^4.1.6"
} }
}, },
"node_modules/jsonpath-plus": {
"version": "10.3.0",
"resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz",
"integrity": "sha512-8TNmfeTCk2Le33A3vRRwtuworG/L5RrgMvdjhKZxvyShO+mBu2fP50OWUjRLNtvw344DdDarFh9buFAZs5ujeA==",
"dependencies": {
"@jsep-plugin/assignment": "^1.3.0",
"@jsep-plugin/regex": "^1.0.4",
"jsep": "^1.4.0"
},
"bin": {
"jsonpath": "bin/jsonpath-cli.js",
"jsonpath-plus": "bin/jsonpath-cli.js"
},
"engines": {
"node": ">=18.0.0"
}
},
"node_modules/jsonwebtoken": { "node_modules/jsonwebtoken": {
"version": "9.0.2", "version": "9.0.2",
"resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.2.tgz", "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.2.tgz",
@@ -3248,11 +3013,6 @@
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
"integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==" "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg=="
}, },
"node_modules/lodash.clonedeep": {
"version": "4.5.0",
"resolved": "https://registry.npmjs.org/lodash.clonedeep/-/lodash.clonedeep-4.5.0.tgz",
"integrity": "sha512-H5ZhCF25riFd9uB5UCkVKo61m3S/xZk1x4wA6yp/L3RFP6Z/eHH1ymQcGLo7J3GMPfm0V/7m1tryHuGVxpqEBQ=="
},
"node_modules/lodash.defaults": { "node_modules/lodash.defaults": {
"version": "4.2.0", "version": "4.2.0",
"resolved": "https://registry.npmjs.org/lodash.defaults/-/lodash.defaults-4.2.0.tgz", "resolved": "https://registry.npmjs.org/lodash.defaults/-/lodash.defaults-4.2.0.tgz",
@@ -3702,14 +3462,6 @@
"url": "https://github.com/fb55/nth-check?sponsor=1" "url": "https://github.com/fb55/nth-check?sponsor=1"
} }
}, },
"node_modules/oauth4webapi": {
"version": "3.8.3",
"resolved": "https://registry.npmjs.org/oauth4webapi/-/oauth4webapi-3.8.3.tgz",
"integrity": "sha512-pQ5BsX3QRTgnt5HxgHwgunIRaDXBdkT23tf8dfzmtTIL2LTpdmxgbpbBm0VgFWAIDlezQvQCTgnVIUmHupXHxw==",
"funding": {
"url": "https://github.com/sponsors/panva"
}
},
"node_modules/object-assign": { "node_modules/object-assign": {
"version": "4.1.1", "version": "4.1.1",
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
@@ -3748,18 +3500,6 @@
"wrappy": "1" "wrappy": "1"
} }
}, },
"node_modules/openid-client": {
"version": "6.8.1",
"resolved": "https://registry.npmjs.org/openid-client/-/openid-client-6.8.1.tgz",
"integrity": "sha512-VoYT6enBo6Vj2j3Q5Ec0AezS+9YGzQo1f5Xc42lreMGlfP4ljiXPKVDvCADh+XHCV/bqPu/wWSiCVXbJKvrODw==",
"dependencies": {
"jose": "^6.1.0",
"oauth4webapi": "^3.8.2"
},
"funding": {
"url": "https://github.com/sponsors/panva"
}
},
"node_modules/pac-proxy-agent": { "node_modules/pac-proxy-agent": {
"version": "7.2.0", "version": "7.2.0",
"resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz", "resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
@@ -4676,11 +4416,6 @@
"url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
} }
}, },
"node_modules/rfc4648": {
"version": "1.5.4",
"resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.4.tgz",
"integrity": "sha512-rRg/6Lb+IGfJqO05HZkN50UtY7K/JhxJag1kP23+zyMfrvoB0B7RWv06MbOzoc79RgCdNTiUaNsTT1AJZ7Z+cg=="
},
"node_modules/rimraf": { "node_modules/rimraf": {
"version": "3.0.2", "version": "3.0.2",
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz",
@@ -5111,14 +4846,6 @@
"node": ">= 0.8" "node": ">= 0.8"
} }
}, },
"node_modules/stream-buffers": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/stream-buffers/-/stream-buffers-3.0.3.tgz",
"integrity": "sha512-pqMqwQCso0PBJt2PQmDO0cFj0lyqmiwOMiMSkVtRokl7e+ZTRYgDHKnuZNbqjiJXgsg4nuqtD/zxuo9KqTp0Yw==",
"engines": {
"node": ">= 0.10.0"
}
},
"node_modules/streamx": { "node_modules/streamx": {
"version": "2.23.0", "version": "2.23.0",
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz", "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
@@ -5338,7 +5065,8 @@
"node_modules/undici-types": { "node_modules/undici-types": {
"version": "6.21.0", "version": "6.21.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
"integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==" "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
"devOptional": true
}, },
"node_modules/universalify": { "node_modules/universalify": {
"version": "2.0.1", "version": "2.0.1",
@@ -5361,14 +5089,6 @@
"resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz", "resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz",
"integrity": "sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==" "integrity": "sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg=="
}, },
"node_modules/user-agents": {
"version": "1.1.669",
"resolved": "https://registry.npmjs.org/user-agents/-/user-agents-1.1.669.tgz",
"integrity": "sha512-pbIzG+AOqCaIpySKJ4IAm1l0VyE4jMnK4y1thV8lm8PYxI+7X5uWcppOK7zY79TCKKTAnJH3/4gaVIZHsjrmJA==",
"dependencies": {
"lodash.clonedeep": "^4.5.0"
}
},
"node_modules/util": { "node_modules/util": {
"version": "0.12.5", "version": "0.12.5",
"resolved": "https://registry.npmjs.org/util/-/util-0.12.5.tgz", "resolved": "https://registry.npmjs.org/util/-/util-0.12.5.tgz",

View File

@@ -22,7 +22,6 @@
     "seed:dt:cities:bulk": "tsx src/scripts/seed-dt-cities-bulk.ts"
   },
   "dependencies": {
-    "@kubernetes/client-node": "^1.4.0",
     "@types/bcryptjs": "^3.0.0",
     "axios": "^1.6.2",
     "bcrypt": "^5.1.1",
@@ -49,7 +48,6 @@
     "puppeteer-extra-plugin-stealth": "^2.11.2",
     "sharp": "^0.32.0",
     "socks-proxy-agent": "^8.0.2",
-    "user-agents": "^1.1.669",
     "uuid": "^9.0.1",
     "zod": "^3.22.4"
   },

View File

@@ -1,46 +0,0 @@
# DEPRECATED CODE - DO NOT USE
**These directories contain OLD, ABANDONED code.**
## What's Here
| Directory | What It Was | Why Deprecated |
|-----------|-------------|----------------|
| `hydration/` | Old pipeline for processing crawl data | Replaced by `src/tasks/handlers/` |
| `scraper-v2/` | Old Puppeteer-based scraper engine | Replaced by curl-based `src/platforms/dutchie/client.ts` |
| `canonical-hydration/` | Intermediate step toward canonical schema | Merged into task handlers |
## What to Use Instead
| Old (DON'T USE) | New (USE THIS) |
|----------------|----------------|
| `hydration/normalizers/dutchie.ts` | `src/tasks/handlers/product-refresh.ts` |
| `hydration/producer.ts` | `src/tasks/handlers/payload-fetch.ts` |
| `scraper-v2/engine.ts` | `src/platforms/dutchie/client.ts` |
| `scraper-v2/scheduler.ts` | `src/services/task-scheduler.ts` |
## Why Keep This Code?
- Historical reference only
- Some patterns may be useful for debugging
- Will be deleted once confirmed not needed
## Claude Instructions
**IF YOU ARE CLAUDE:**
1. NEVER import from `src/_deprecated/`
2. NEVER reference these files as examples
3. NEVER try to "fix" or "update" code in here
4. If you see imports from these directories, suggest replacing them
**Correct imports:**
```typescript
// GOOD
import { executeGraphQL } from '../platforms/dutchie/client';
import { pool } from '../db/pool';
// BAD - DO NOT USE
import { something } from '../_deprecated/hydration/...';
import { something } from '../_deprecated/scraper-v2/...';
```
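
For orientation, a minimal sketch of the replacement path, assuming the `executeGraphQL` and `GRAPHQL_HASHES` signatures shown in `src/platforms/dutchie/client.ts` (the variables shape is illustrative):

```typescript
import { executeGraphQL, GRAPHQL_HASHES } from '../platforms/dutchie/client';

// Fetch city data for one state through the canonical curl-based client.
export async function listCitiesForState(state: string): Promise<unknown> {
  return executeGraphQL(
    'GetAllCitiesByState',
    { state },                            // illustrative variables shape
    GRAPHQL_HASHES.GetAllCitiesByState,
    { maxRetries: 3, retryOn403: true }
  );
}
```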

View File

@@ -1,584 +0,0 @@
/**
* System API Routes
*
* Provides REST API endpoints for system monitoring and control:
* - /api/system/sync/* - Sync orchestrator
* - /api/system/dlq/* - Dead-letter queue
* - /api/system/integrity/* - Integrity checks
* - /api/system/fix/* - Auto-fix routines
* - /api/system/alerts/* - System alerts
* - /metrics - Prometheus metrics
*
* Phase 5: Full Production Sync + Monitoring
*/
import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import {
SyncOrchestrator,
MetricsService,
DLQService,
AlertService,
IntegrityService,
AutoFixService,
} from '../services';
export function createSystemRouter(pool: Pool): Router {
const router = Router();
// Initialize services
const metrics = new MetricsService(pool);
const dlq = new DLQService(pool);
const alerts = new AlertService(pool);
const integrity = new IntegrityService(pool, alerts);
const autoFix = new AutoFixService(pool, alerts);
const orchestrator = new SyncOrchestrator(pool, metrics, dlq, alerts);
// ============================================================
// SYNC ORCHESTRATOR ENDPOINTS
// ============================================================
/**
* GET /api/system/sync/status
* Get current sync status
*/
router.get('/sync/status', async (_req: Request, res: Response) => {
try {
const status = await orchestrator.getStatus();
res.json(status);
} catch (error) {
console.error('[System] Sync status error:', error);
res.status(500).json({ error: 'Failed to get sync status' });
}
});
/**
* POST /api/system/sync/run
* Trigger a sync run
*/
router.post('/sync/run', async (req: Request, res: Response) => {
try {
const triggeredBy = req.body.triggeredBy || 'api';
const result = await orchestrator.runSync();
res.json({
success: true,
triggeredBy,
metrics: result,
});
} catch (error) {
console.error('[System] Sync run error:', error);
res.status(500).json({
success: false,
error: error instanceof Error ? error.message : 'Sync run failed',
});
}
});
/**
* GET /api/system/sync/queue-depth
* Get queue depth information
*/
router.get('/sync/queue-depth', async (_req: Request, res: Response) => {
try {
const depth = await orchestrator.getQueueDepth();
res.json(depth);
} catch (error) {
console.error('[System] Queue depth error:', error);
res.status(500).json({ error: 'Failed to get queue depth' });
}
});
/**
* GET /api/system/sync/health
* Get sync health status
*/
router.get('/sync/health', async (_req: Request, res: Response) => {
try {
const health = await orchestrator.getHealth();
res.status(health.healthy ? 200 : 503).json(health);
} catch (error) {
console.error('[System] Health check error:', error);
res.status(500).json({ healthy: false, error: 'Health check failed' });
}
});
/**
* POST /api/system/sync/pause
* Pause the orchestrator
*/
router.post('/sync/pause', async (req: Request, res: Response) => {
try {
const reason = req.body.reason || 'Manual pause';
await orchestrator.pause(reason);
res.json({ success: true, message: 'Orchestrator paused' });
} catch (error) {
console.error('[System] Pause error:', error);
res.status(500).json({ error: 'Failed to pause orchestrator' });
}
});
/**
* POST /api/system/sync/resume
* Resume the orchestrator
*/
router.post('/sync/resume', async (_req: Request, res: Response) => {
try {
await orchestrator.resume();
res.json({ success: true, message: 'Orchestrator resumed' });
} catch (error) {
console.error('[System] Resume error:', error);
res.status(500).json({ error: 'Failed to resume orchestrator' });
}
});
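// Illustrative operator calls for the sync endpoints above (host/port and
// auth are deployment-specific assumptions, not defined in this file):
//   curl localhost:3000/api/system/sync/status
//   curl -X POST localhost:3000/api/system/sync/run -H 'Content-Type: application/json' -d '{"triggeredBy":"ops"}'
//   curl -X POST localhost:3000/api/system/sync/pause -H 'Content-Type: application/json' -d '{"reason":"maintenance"}'
//   curl -X POST localhost:3000/api/system/sync/resume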
// ============================================================
// DLQ ENDPOINTS
// ============================================================
/**
* GET /api/system/dlq
* List DLQ payloads
*/
router.get('/dlq', async (req: Request, res: Response) => {
try {
const options = {
status: req.query.status as string,
errorType: req.query.errorType as string,
dispensaryId: req.query.dispensaryId ? parseInt(req.query.dispensaryId as string) : undefined,
limit: req.query.limit ? parseInt(req.query.limit as string) : 50,
offset: req.query.offset ? parseInt(req.query.offset as string) : 0,
};
const result = await dlq.listPayloads(options);
res.json(result);
} catch (error) {
console.error('[System] DLQ list error:', error);
res.status(500).json({ error: 'Failed to list DLQ payloads' });
}
});
/**
* GET /api/system/dlq/stats
* Get DLQ statistics
*/
router.get('/dlq/stats', async (_req: Request, res: Response) => {
try {
const stats = await dlq.getStats();
res.json(stats);
} catch (error) {
console.error('[System] DLQ stats error:', error);
res.status(500).json({ error: 'Failed to get DLQ stats' });
}
});
/**
* GET /api/system/dlq/summary
* Get DLQ summary by error type
*/
router.get('/dlq/summary', async (_req: Request, res: Response) => {
try {
const summary = await dlq.getSummary();
res.json(summary);
} catch (error) {
console.error('[System] DLQ summary error:', error);
res.status(500).json({ error: 'Failed to get DLQ summary' });
}
});
/**
* GET /api/system/dlq/:id
* Get a specific DLQ payload
*/
router.get('/dlq/:id', async (req: Request, res: Response) => {
try {
const payload = await dlq.getPayload(req.params.id);
if (!payload) {
return res.status(404).json({ error: 'Payload not found' });
}
res.json(payload);
} catch (error) {
console.error('[System] DLQ get error:', error);
res.status(500).json({ error: 'Failed to get DLQ payload' });
}
});
/**
* POST /api/system/dlq/:id/retry
* Retry a DLQ payload
*/
router.post('/dlq/:id/retry', async (req: Request, res: Response) => {
try {
const result = await dlq.retryPayload(req.params.id);
if (result.success) {
res.json(result);
} else {
res.status(400).json(result);
}
} catch (error) {
console.error('[System] DLQ retry error:', error);
res.status(500).json({ error: 'Failed to retry payload' });
}
});
/**
* POST /api/system/dlq/:id/abandon
* Abandon a DLQ payload
*/
router.post('/dlq/:id/abandon', async (req: Request, res: Response) => {
try {
const reason = req.body.reason || 'Manually abandoned';
const abandonedBy = req.body.abandonedBy || 'api';
const success = await dlq.abandonPayload(req.params.id, reason, abandonedBy);
res.json({ success });
} catch (error) {
console.error('[System] DLQ abandon error:', error);
res.status(500).json({ error: 'Failed to abandon payload' });
}
});
/**
* POST /api/system/dlq/bulk-retry
* Bulk retry payloads by error type
*/
router.post('/dlq/bulk-retry', async (req: Request, res: Response) => {
try {
const { errorType } = req.body;
if (!errorType) {
return res.status(400).json({ error: 'errorType is required' });
}
const result = await dlq.bulkRetryByErrorType(errorType);
res.json(result);
} catch (error) {
console.error('[System] DLQ bulk retry error:', error);
res.status(500).json({ error: 'Failed to bulk retry' });
}
});
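// Illustrative DLQ triage flow using the endpoints above (host/port assumed;
// the payload id and errorType values are examples only):
//   curl localhost:3000/api/system/dlq/stats
//   curl 'localhost:3000/api/system/dlq?status=pending&limit=10'
//   curl -X POST localhost:3000/api/system/dlq/<payload-id>/retry
//   curl -X POST localhost:3000/api/system/dlq/bulk-retry -H 'Content-Type: application/json' -d '{"errorType":"validation_error"}'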
// ============================================================
// INTEGRITY CHECK ENDPOINTS
// ============================================================
/**
* POST /api/system/integrity/run
* Run all integrity checks
*/
router.post('/integrity/run', async (req: Request, res: Response) => {
try {
const triggeredBy = req.body.triggeredBy || 'api';
const result = await integrity.runAllChecks(triggeredBy);
res.json(result);
} catch (error) {
console.error('[System] Integrity run error:', error);
res.status(500).json({ error: 'Failed to run integrity checks' });
}
});
/**
* GET /api/system/integrity/runs
* Get recent integrity check runs
*/
router.get('/integrity/runs', async (req: Request, res: Response) => {
try {
const limit = req.query.limit ? parseInt(req.query.limit as string) : 10;
const runs = await integrity.getRecentRuns(limit);
res.json(runs);
} catch (error) {
console.error('[System] Integrity runs error:', error);
res.status(500).json({ error: 'Failed to get integrity runs' });
}
});
/**
* GET /api/system/integrity/runs/:runId
* Get results for a specific integrity run
*/
router.get('/integrity/runs/:runId', async (req: Request, res: Response) => {
try {
const results = await integrity.getRunResults(req.params.runId);
res.json(results);
} catch (error) {
console.error('[System] Integrity run results error:', error);
res.status(500).json({ error: 'Failed to get run results' });
}
});
// ============================================================
// AUTO-FIX ENDPOINTS
// ============================================================
/**
* GET /api/system/fix/routines
* Get available fix routines
*/
router.get('/fix/routines', (_req: Request, res: Response) => {
try {
const routines = autoFix.getAvailableRoutines();
res.json(routines);
} catch (error) {
console.error('[System] Get routines error:', error);
res.status(500).json({ error: 'Failed to get routines' });
}
});
/**
* POST /api/system/fix/:routine
* Run a fix routine
*/
router.post('/fix/:routine', async (req: Request, res: Response) => {
try {
const routineName = req.params.routine;
const dryRun = req.body.dryRun === true;
const triggeredBy = req.body.triggeredBy || 'api';
const result = await autoFix.runRoutine(routineName as any, triggeredBy, { dryRun });
res.json(result);
} catch (error) {
console.error('[System] Fix routine error:', error);
res.status(500).json({ error: 'Failed to run fix routine' });
}
});
/**
* GET /api/system/fix/runs
* Get recent fix runs
*/
router.get('/fix/runs', async (req: Request, res: Response) => {
try {
const limit = req.query.limit ? parseInt(req.query.limit as string) : 20;
const runs = await autoFix.getRecentRuns(limit);
res.json(runs);
} catch (error) {
console.error('[System] Fix runs error:', error);
res.status(500).json({ error: 'Failed to get fix runs' });
}
});
// ============================================================
// ALERTS ENDPOINTS
// ============================================================
/**
* GET /api/system/alerts
* List alerts
*/
router.get('/alerts', async (req: Request, res: Response) => {
try {
const options = {
status: req.query.status as any,
severity: req.query.severity as any,
type: req.query.type as string,
limit: req.query.limit ? parseInt(req.query.limit as string) : 50,
offset: req.query.offset ? parseInt(req.query.offset as string) : 0,
};
const result = await alerts.listAlerts(options);
res.json(result);
} catch (error) {
console.error('[System] Alerts list error:', error);
res.status(500).json({ error: 'Failed to list alerts' });
}
});
/**
* GET /api/system/alerts/active
* Get active alerts
*/
router.get('/alerts/active', async (_req: Request, res: Response) => {
try {
const activeAlerts = await alerts.getActiveAlerts();
res.json(activeAlerts);
} catch (error) {
console.error('[System] Active alerts error:', error);
res.status(500).json({ error: 'Failed to get active alerts' });
}
});
/**
* GET /api/system/alerts/summary
* Get alert summary
*/
router.get('/alerts/summary', async (_req: Request, res: Response) => {
try {
const summary = await alerts.getSummary();
res.json(summary);
} catch (error) {
console.error('[System] Alerts summary error:', error);
res.status(500).json({ error: 'Failed to get alerts summary' });
}
});
/**
* POST /api/system/alerts/:id/acknowledge
* Acknowledge an alert
*/
router.post('/alerts/:id/acknowledge', async (req: Request, res: Response) => {
try {
const alertId = parseInt(req.params.id);
const acknowledgedBy = req.body.acknowledgedBy || 'api';
const success = await alerts.acknowledgeAlert(alertId, acknowledgedBy);
res.json({ success });
} catch (error) {
console.error('[System] Acknowledge alert error:', error);
res.status(500).json({ error: 'Failed to acknowledge alert' });
}
});
/**
* POST /api/system/alerts/:id/resolve
* Resolve an alert
*/
router.post('/alerts/:id/resolve', async (req: Request, res: Response) => {
try {
const alertId = parseInt(req.params.id);
const resolvedBy = req.body.resolvedBy || 'api';
const success = await alerts.resolveAlert(alertId, resolvedBy);
res.json({ success });
} catch (error) {
console.error('[System] Resolve alert error:', error);
res.status(500).json({ error: 'Failed to resolve alert' });
}
});
/**
* POST /api/system/alerts/bulk-acknowledge
* Bulk acknowledge alerts
*/
router.post('/alerts/bulk-acknowledge', async (req: Request, res: Response) => {
try {
const { ids, acknowledgedBy } = req.body;
if (!ids || !Array.isArray(ids)) {
return res.status(400).json({ error: 'ids array is required' });
}
const count = await alerts.bulkAcknowledge(ids, acknowledgedBy || 'api');
res.json({ acknowledged: count });
} catch (error) {
console.error('[System] Bulk acknowledge error:', error);
res.status(500).json({ error: 'Failed to bulk acknowledge' });
}
});
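// Illustrative alert lifecycle against the endpoints above (host and the
// alert id 42 are assumptions):
//   curl localhost:3000/api/system/alerts/active
//   curl -X POST localhost:3000/api/system/alerts/42/acknowledge -H 'Content-Type: application/json' -d '{"acknowledgedBy":"ops"}'
//   curl -X POST localhost:3000/api/system/alerts/42/resolve -H 'Content-Type: application/json' -d '{"resolvedBy":"ops"}'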
// ============================================================
// METRICS ENDPOINTS
// ============================================================
/**
* GET /api/system/metrics
* Get all current metrics
*/
router.get('/metrics', async (_req: Request, res: Response) => {
try {
const allMetrics = await metrics.getAllMetrics();
res.json(allMetrics);
} catch (error) {
console.error('[System] Metrics error:', error);
res.status(500).json({ error: 'Failed to get metrics' });
}
});
/**
* GET /api/system/metrics/:name
* Get a specific metric
*/
router.get('/metrics/:name', async (req: Request, res: Response) => {
try {
const metric = await metrics.getMetric(req.params.name);
if (!metric) {
return res.status(404).json({ error: 'Metric not found' });
}
res.json(metric);
} catch (error) {
console.error('[System] Metric error:', error);
res.status(500).json({ error: 'Failed to get metric' });
}
});
/**
* GET /api/system/metrics/:name/history
* Get metric time series
*/
router.get('/metrics/:name/history', async (req: Request, res: Response) => {
try {
const hours = req.query.hours ? parseInt(req.query.hours as string) : 24;
const history = await metrics.getMetricHistory(req.params.name, hours);
res.json(history);
} catch (error) {
console.error('[System] Metric history error:', error);
res.status(500).json({ error: 'Failed to get metric history' });
}
});
/**
* GET /api/system/errors
* Get error summary
*/
router.get('/errors', async (_req: Request, res: Response) => {
try {
const summary = await metrics.getErrorSummary();
res.json(summary);
} catch (error) {
console.error('[System] Error summary error:', error);
res.status(500).json({ error: 'Failed to get error summary' });
}
});
/**
* GET /api/system/errors/recent
* Get recent errors
*/
router.get('/errors/recent', async (req: Request, res: Response) => {
try {
const limit = req.query.limit ? parseInt(req.query.limit as string) : 50;
const errorType = req.query.type as string;
const errors = await metrics.getRecentErrors(limit, errorType);
res.json(errors);
} catch (error) {
console.error('[System] Recent errors error:', error);
res.status(500).json({ error: 'Failed to get recent errors' });
}
});
/**
* POST /api/system/errors/acknowledge
* Acknowledge errors
*/
router.post('/errors/acknowledge', async (req: Request, res: Response) => {
try {
const { ids, acknowledgedBy } = req.body;
if (!ids || !Array.isArray(ids)) {
return res.status(400).json({ error: 'ids array is required' });
}
const count = await metrics.acknowledgeErrors(ids, acknowledgedBy || 'api');
res.json({ acknowledged: count });
} catch (error) {
console.error('[System] Acknowledge errors error:', error);
res.status(500).json({ error: 'Failed to acknowledge errors' });
}
});
return router;
}
/**
* Create Prometheus metrics endpoint (standalone)
*/
export function createPrometheusRouter(pool: Pool): Router {
const router = Router();
const metrics = new MetricsService(pool);
/**
* GET /metrics
* Prometheus-compatible metrics endpoint
*/
router.get('/', async (_req: Request, res: Response) => {
try {
const prometheusOutput = await metrics.getPrometheusMetrics();
res.set('Content-Type', 'text/plain; version=0.0.4');
res.send(prometheusOutput);
} catch (error) {
console.error('[Prometheus] Metrics error:', error);
res.status(500).send('# Error generating metrics');
}
});
return router;
}
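
For context, wiring these routers into the main Express app would look roughly like the sketch below (the module path and pool construction are assumptions, not shown in this file):

```typescript
import express from 'express';
import { Pool } from 'pg';
import { createSystemRouter, createPrometheusRouter } from './routes/system';

const app = express();
app.use(express.json());
const pool = new Pool({ connectionString: process.env.DATABASE_URL });

// Monitoring/control REST API under /api/system/*, as documented above
app.use('/api/system', createSystemRouter(pool));
// Standalone Prometheus scrape target at /metrics
app.use('/metrics', createPrometheusRouter(pool));
```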

View File

@@ -172,9 +172,6 @@ export async function runFullDiscovery(
     console.log(`Errors: ${totalErrors}`);
   }

-  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
-  let newDispensaryIds: number[] = [];
-
   // Step 4: Auto-validate and promote discovered locations
   if (!dryRun && totalLocationsUpserted > 0) {
     console.log('\n[Discovery] Step 4: Auto-promoting discovered locations...');
@@ -183,13 +180,6 @@
     console.log(`  Created: ${promotionResult.created} new dispensaries`);
     console.log(`  Updated: ${promotionResult.updated} existing dispensaries`);
     console.log(`  Rejected: ${promotionResult.rejected} (validation failed)`);
-
-    // Per TASK_WORKFLOW_2024-12-10.md: Capture new IDs for task chaining
-    newDispensaryIds = promotionResult.newDispensaryIds;
-    if (newDispensaryIds.length > 0) {
-      console.log(`  New store IDs for crawl: [${newDispensaryIds.join(', ')}]`);
-    }
-
     if (promotionResult.rejectedRecords.length > 0) {
       console.log(`  Rejection reasons:`);
       promotionResult.rejectedRecords.slice(0, 5).forEach(r => {
@@ -224,8 +214,6 @@
     totalLocationsFound,
     totalLocationsUpserted,
     durationMs,
-    // Per TASK_WORKFLOW_2024-12-10.md: Return new IDs for task chaining
-    newDispensaryIds,
   };
 }

View File

@@ -127,8 +127,6 @@ export interface PromotionSummary {
     errors: string[];
   }>;
   durationMs: number;
-  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
-  newDispensaryIds: number[];
 }

 /**
@@ -471,8 +469,6 @@ export async function promoteDiscoveredLocations(
   const results: PromotionResult[] = [];
   const rejectedRecords: PromotionSummary['rejectedRecords'] = [];

-  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
-  const newDispensaryIds: number[] = [];
-
   let created = 0;
   let updated = 0;
   let skipped = 0;
@@ -529,8 +525,6 @@
       if (promotionResult.action === 'created') {
         created++;
-        // Per TASK_WORKFLOW_2024-12-10.md: Track new IDs for task chaining
-        newDispensaryIds.push(promotionResult.dispensaryId);
       } else {
         updated++;
       }
@@ -554,8 +548,6 @@
     results,
     rejectedRecords,
     durationMs: Date.now() - startTime,
-    // Per TASK_WORKFLOW_2024-12-10.md: Return new IDs for task chaining
-    newDispensaryIds,
   };
 }

View File

@@ -211,8 +211,6 @@ export interface FullDiscoveryResult {
   totalLocationsFound: number;
   totalLocationsUpserted: number;
   durationMs: number;
-  // Per TASK_WORKFLOW_2024-12-10.md: Track new dispensary IDs for task chaining
-  newDispensaryIds?: number[];
 }

 // ============================================================

View File

@@ -6,8 +6,6 @@ import { initializeMinio, isMinioEnabled } from './utils/minio';
 import { initializeImageStorage } from './utils/image-storage';
 import { logger } from './services/logger';
 import { cleanupOrphanedJobs } from './services/proxyTestQueue';
-// Per TASK_WORKFLOW_2024-12-10.md: Database-driven task scheduler
-import { taskScheduler } from './services/task-scheduler';
 import { runAutoMigrations } from './db/auto-migrate';
 import { getPool } from './db/pool';
 import healthRoutes from './routes/health';
@@ -109,7 +107,7 @@ import scraperMonitorRoutes from './routes/scraper-monitor';
 import apiTokensRoutes from './routes/api-tokens';
 import apiPermissionsRoutes from './routes/api-permissions';
 import parallelScrapeRoutes from './routes/parallel-scrape';
-// crawler-sandbox moved to _deprecated
+import crawlerSandboxRoutes from './routes/crawler-sandbox';
 import versionRoutes from './routes/version';
 import deployStatusRoutes from './routes/deploy-status';
 import publicApiRoutes from './routes/public-api';
@@ -144,9 +142,6 @@ import seoRoutes from './routes/seo';
 import priceAnalyticsRoutes from './routes/price-analytics';
 import tasksRoutes from './routes/tasks';
 import workerRegistryRoutes from './routes/worker-registry';
-// Per TASK_WORKFLOW_2024-12-10.md: Raw payload access API
-import payloadsRoutes from './routes/payloads';
-import k8sRoutes from './routes/k8s';

 // Mark requests from trusted domains (cannaiq.co, findagram.co, findadispo.com)
 // These domains can access the API without authentication
@@ -187,7 +182,7 @@ app.use('/api/scraper-monitor', scraperMonitorRoutes);
 app.use('/api/api-tokens', apiTokensRoutes);
 app.use('/api/api-permissions', apiPermissionsRoutes);
 app.use('/api/parallel-scrape', parallelScrapeRoutes);
-// crawler-sandbox moved to _deprecated
+app.use('/api/crawler-sandbox', crawlerSandboxRoutes);
 app.use('/api/version', versionRoutes);
 app.use('/api/admin/deploy-status', deployStatusRoutes);
 console.log('[DeployStatus] Routes registered at /api/admin/deploy-status');
@@ -227,14 +222,6 @@ console.log('[Tasks] Routes registered at /api/tasks');
 app.use('/api/worker-registry', workerRegistryRoutes);
 console.log('[WorkerRegistry] Routes registered at /api/worker-registry');

-// Per TASK_WORKFLOW_2024-12-10.md: Raw payload access API
-app.use('/api/payloads', payloadsRoutes);
-console.log('[Payloads] Routes registered at /api/payloads');
-
-// K8s control routes - worker scaling from admin UI
-app.use('/api/k8s', k8sRoutes);
-console.log('[K8s] Routes registered at /api/k8s');
-
 // Phase 3: Analytics V2 - Enhanced analytics with rec/med state segmentation
 try {
   const analyticsV2Router = createAnalyticsV2Router(getPool());
@@ -339,17 +326,6 @@ async function startServer() {
   // Clean up any orphaned proxy test jobs from previous server runs
   await cleanupOrphanedJobs();

-  // Per TASK_WORKFLOW_2024-12-10.md: Start database-driven task scheduler
-  // This replaces node-cron - schedules are stored in DB and survive restarts
-  // Uses SELECT FOR UPDATE SKIP LOCKED for multi-replica safety
-  try {
-    await taskScheduler.start();
-    logger.info('system', 'Task scheduler started');
-  } catch (err: any) {
-    // Non-fatal - scheduler can recover on next poll
-    logger.warn('system', `Task scheduler startup warning: ${err.message}`);
-  }
-
   app.listen(PORT, () => {
     logger.info('system', `Server running on port ${PORT}`);
     console.log(`🚀 Server running on port ${PORT}`);

View File

@@ -702,10 +702,12 @@ export class StateQueryService {
   async getNationalSummary(): Promise<NationalSummary> {
     const stateMetrics = await this.getAllStateMetrics();

-    // Get all states count and aggregate metrics
     const result = await this.pool.query(`
       SELECT
         COUNT(DISTINCT s.code) AS total_states,
+        COUNT(DISTINCT CASE WHEN EXISTS (
+          SELECT 1 FROM dispensaries d WHERE d.state = s.code AND d.menu_type IS NOT NULL
+        ) THEN s.code END) AS active_states,
         (SELECT COUNT(*) FROM dispensaries WHERE state IS NOT NULL) AS total_stores,
         (SELECT COUNT(*) FROM store_products sp
          JOIN dispensaries d ON sp.dispensary_id = d.id
@@ -723,7 +725,7 @@
     return {
       totalStates: parseInt(data.total_states),
-      activeStates: parseInt(data.total_states), // Same as totalStates - all states shown
+      activeStates: parseInt(data.active_states),
       totalStores: parseInt(data.total_stores),
       totalProducts: parseInt(data.total_products),
       totalBrands: parseInt(data.total_brands),

View File

@@ -5,35 +5,22 @@
  *
  * DO NOT MODIFY THIS FILE WITHOUT EXPLICIT AUTHORIZATION.
  *
- * Updated: 2025-12-10 per workflow-12102025.md
- *
- * KEY BEHAVIORS (per workflow-12102025.md):
- * 1. startSession() gets identity from PROXY LOCATION, not task params
- * 2. On 403: immediately get new IP + new fingerprint, then retry
- * 3. After 3 consecutive 403s on same proxy → disable it (burned)
- * 4. Language is always English (en-US)
+ * This is the canonical HTTP client for all Dutchie communication.
+ * All Dutchie workers (Alice, Bella, etc.) MUST use this client.
  *
  * IMPLEMENTATION:
  * - Uses curl via child_process.execSync (bypasses TLS fingerprinting)
  * - NO Puppeteer, NO axios, NO fetch
- * - Uses intoli/user-agents via CrawlRotator for realistic fingerprints
+ * - Fingerprint rotation on 403
  * - Residential IP compatible
  *
  * USAGE:
- * import { curlPost, curlGet, executeGraphQL, startSession } from '@dutchie/client';
+ * import { curlPost, curlGet, executeGraphQL } from '@dutchie/client';
  *
  * ============================================================
 */

 import { execSync } from 'child_process';
-import {
-  buildOrderedHeaders,
-  buildRefererFromMenuUrl,
-  getCurlBinary,
-  isCurlImpersonateAvailable,
-  HeaderContext,
-  BrowserType,
-} from '../../services/http-fingerprint';

 // ============================================================
 // TYPES
@@ -45,8 +32,6 @@ export interface CurlResponse {
   error?: string;
 }

-// Per workflow-12102025.md: fingerprint comes from CrawlRotator's BrowserFingerprint
-// We keep a simplified interface here for header building
 export interface Fingerprint {
   userAgent: string;
   acceptLanguage: string;
@@ -72,13 +57,15 @@ export const DUTCHIE_CONFIG = {
 // ============================================================
 // PROXY SUPPORT
-// Per workflow-12102025.md:
-// - On 403: recordBlock() → increment consecutive_403_count
-// - After 3 consecutive 403s → proxy disabled
-// - Immediately rotate to new IP + new fingerprint on 403
+// ============================================================
+// Integrates with the CrawlRotator system from proxy-rotator.ts
+// On 403 errors:
+// 1. Record failure on current proxy
+// 2. Rotate to next proxy
+// 3. Retry with new proxy
 // ============================================================

-import type { CrawlRotator, BrowserFingerprint } from '../../services/crawl-rotator';
+import type { CrawlRotator, Proxy } from '../../services/crawl-rotator';

 let currentProxy: string | null = null;
 let crawlRotator: CrawlRotator | null = null;
@@ -105,12 +92,13 @@ export function getProxy(): string | null {
 /**
  * Set CrawlRotator for proxy rotation on 403s
- * Per workflow-12102025.md: enables automatic rotation when blocked
+ * This enables automatic proxy rotation when blocked
  */
 export function setCrawlRotator(rotator: CrawlRotator | null): void {
   crawlRotator = rotator;
   if (rotator) {
     console.log('[Dutchie Client] CrawlRotator attached - proxy rotation enabled');
+    // Set initial proxy from rotator
     const proxy = rotator.proxy.getCurrent();
     if (proxy) {
       currentProxy = rotator.proxy.getProxyUrl(proxy);
@@ -127,41 +115,30 @@
 }

 /**
- * Handle 403 block - per workflow-12102025.md:
- * 1. Record block on current proxy (increments consecutive_403_count)
- * 2. Immediately rotate to new proxy (new IP)
- * 3. Rotate fingerprint
- * Returns false if no more proxies available
+ * Rotate to next proxy (called on 403)
 */
-async function handle403Block(): Promise<boolean> {
+async function rotateProxyOn403(error?: string): Promise<boolean> {
   if (!crawlRotator) {
-    console.warn('[Dutchie Client] No CrawlRotator - cannot handle 403');
     return false;
   }

-  // Per workflow-12102025.md: record block (tracks consecutive 403s)
-  const wasDisabled = await crawlRotator.recordBlock();
-  if (wasDisabled) {
-    console.log('[Dutchie Client] Current proxy was disabled (3 consecutive 403s)');
-  }
-
-  // Per workflow-12102025.md: immediately get new IP + new fingerprint
-  const { proxy: nextProxy, fingerprint } = crawlRotator.rotateBoth();
+  // Record failure on current proxy
+  await crawlRotator.recordFailure(error || '403 Forbidden');
+
+  // Rotate to next proxy
+  const nextProxy = crawlRotator.rotateProxy();
   if (nextProxy) {
     currentProxy = crawlRotator.proxy.getProxyUrl(nextProxy);
-    console.log(`[Dutchie Client] Rotated to new proxy: ${currentProxy.replace(/:[^:@]+@/, ':***@')}`);
-    console.log(`[Dutchie Client] New fingerprint: ${fingerprint.userAgent.slice(0, 50)}...`);
+    console.log(`[Dutchie Client] Rotated proxy: ${currentProxy.replace(/:[^:@]+@/, ':***@')}`);
     return true;
   }

-  console.error('[Dutchie Client] No more proxies available!');
+  console.warn('[Dutchie Client] No more proxies available');
   return false;
 }

 /**
  * Record success on current proxy
- * Per workflow-12102025.md: resets consecutive_403_count
 */
 async function recordProxySuccess(responseTimeMs?: number): Promise<void> {
   if (crawlRotator) {
@@ -185,69 +162,163 @@ export const GRAPHQL_HASHES = {
   GetAllCitiesByState: 'ae547a0466ace5a48f91e55bf6699eacd87e3a42841560f0c0eabed5a0a920e6',
 };

+// ============================================================
+// FINGERPRINTS - Browser profiles for anti-detect
+// ============================================================
+
+const FINGERPRINTS: Fingerprint[] = [
+  // Chrome Windows (latest) - typical residential user, use first
+  {
+    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
+    acceptLanguage: 'en-US,en;q=0.9',
+    secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
+    secChUaPlatform: '"Windows"',
+    secChUaMobile: '?0',
+  },
+  // Chrome Mac (latest)
+  {
+    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
+    acceptLanguage: 'en-US,en;q=0.9',
+    secChUa: '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
+    secChUaPlatform: '"macOS"',
+    secChUaMobile: '?0',
+  },
+  // Chrome Windows (120)
+  {
+    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+    acceptLanguage: 'en-US,en;q=0.9',
+    secChUa: '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
+    secChUaPlatform: '"Windows"',
+    secChUaMobile: '?0',
+  },
+  // Firefox Windows
+  {
+    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
+    acceptLanguage: 'en-US,en;q=0.5',
+  },
+  // Safari Mac
+  {
+    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
+    acceptLanguage: 'en-US,en;q=0.9',
+  },
+  // Edge Windows
+  {
+    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
+    acceptLanguage: 'en-US,en;q=0.9',
+    secChUa: '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
+    secChUaPlatform: '"Windows"',
+    secChUaMobile: '?0',
+  },
+];
+
+let currentFingerprintIndex = 0;
+
+// Forward declaration for session (actual CrawlSession interface defined later)
+let currentSession: {
+  sessionId: string;
+  fingerprint: Fingerprint;
+  proxyUrl: string | null;
+  stateCode?: string;
+  timezone?: string;
+  startedAt: Date;
+} | null = null;
+
+/**
+ * Get current fingerprint - returns session fingerprint if active, otherwise default
+ */
+export function getFingerprint(): Fingerprint {
+  // Use session fingerprint if a session is active
+  if (currentSession) {
+    return currentSession.fingerprint;
+  }
+  return FINGERPRINTS[currentFingerprintIndex];
+}
+
+export function rotateFingerprint(): Fingerprint {
+  currentFingerprintIndex = (currentFingerprintIndex + 1) % FINGERPRINTS.length;
+  const fp = FINGERPRINTS[currentFingerprintIndex];
+  console.log(`[Dutchie Client] Rotated to fingerprint: ${fp.userAgent.slice(0, 50)}...`);
+  return fp;
+}
+
+export function resetFingerprint(): void {
+  currentFingerprintIndex = 0;
+}
+
+/**
+ * Get a random fingerprint from the pool
+ */
+export function getRandomFingerprint(): Fingerprint {
+  const index = Math.floor(Math.random() * FINGERPRINTS.length);
+  return FINGERPRINTS[index];
+}
+
 // ============================================================
 // SESSION MANAGEMENT
-// Per workflow-12102025.md:
-// - Session identity comes from PROXY LOCATION
-// - NOT from task params (no stateCode/timezone params)
-// - Language is always English
+// Per-session fingerprint rotation for stealth
 // ============================================================

 export interface CrawlSession {
   sessionId: string;
-  fingerprint: BrowserFingerprint;
+  fingerprint: Fingerprint;
   proxyUrl: string | null;
-  proxyTimezone?: string;
-  proxyState?: string;
+  stateCode?: string;
+  timezone?: string;
   startedAt: Date;
-  // Per workflow-12102025.md: Dynamic Referer per dispensary
-  menuUrl?: string;
-  referer: string;
 }

-let currentSession: CrawlSession | null = null;
+// Note: currentSession variable declared earlier in file for proper scoping

 /**
- * Start a new crawl session
- *
- * Per workflow-12102025.md:
- * - NO state/timezone params - identity comes from proxy location
- * - Gets fingerprint from CrawlRotator (uses intoli/user-agents)
- * - Language is always English (en-US)
- * - Dynamic Referer per dispensary (from menuUrl)
- *
- * @param menuUrl - The dispensary's menu URL for dynamic Referer header
+ * Timezone to Accept-Language mapping
+ * US timezones all use en-US but this can be extended for international
 */
-export function startSession(menuUrl?: string): CrawlSession {
-  if (!crawlRotator) {
-    throw new Error('[Dutchie Client] Cannot start session without CrawlRotator');
-  }
+const TIMEZONE_TO_LOCALE: Record<string, string> = {
+  'America/Phoenix': 'en-US,en;q=0.9',
+  'America/Los_Angeles': 'en-US,en;q=0.9',
+  'America/Denver': 'en-US,en;q=0.9',
+  'America/Chicago': 'en-US,en;q=0.9',
+  'America/New_York': 'en-US,en;q=0.9',
+  'America/Detroit': 'en-US,en;q=0.9',
+  'America/Anchorage': 'en-US,en;q=0.9',
+  'Pacific/Honolulu': 'en-US,en;q=0.9',
+};

-  // Per workflow-12102025.md: get identity from proxy location
-  const proxyLocation = crawlRotator.getProxyLocation();
-  const fingerprint = crawlRotator.userAgent.getCurrent();
+/**
+ * Get Accept-Language header for a given timezone
+ */
+export function getLocaleForTimezone(timezone?: string): string {
+  if (!timezone) return 'en-US,en;q=0.9';
+  return TIMEZONE_TO_LOCALE[timezone] || 'en-US,en;q=0.9';
+}

-  // Per workflow-12102025.md: Dynamic Referer per dispensary
-  const referer = buildRefererFromMenuUrl(menuUrl);
+/**
+ * Start a new crawl session with a random fingerprint
+ * Call this before crawling a store to get a fresh identity
+ */
+export function startSession(stateCode?: string, timezone?: string): CrawlSession {
+  const baseFp = getRandomFingerprint();
+  // Override Accept-Language based on timezone for geographic consistency
+  const fingerprint: Fingerprint = {
+    ...baseFp,
+    acceptLanguage: getLocaleForTimezone(timezone),
+  };

   currentSession = {
     sessionId: `session_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
     fingerprint,
     proxyUrl: currentProxy,
-    proxyTimezone: proxyLocation?.timezone,
-    proxyState: proxyLocation?.state,
+    stateCode,
+    timezone,
     startedAt: new Date(),
-    menuUrl,
-    referer,
   };

   console.log(`[Dutchie Client] Started session ${currentSession.sessionId}`);
-  console.log(`[Dutchie Client] Browser: ${fingerprint.browserName} (${fingerprint.deviceCategory})`);
-  console.log(`[Dutchie Client] DNT: ${fingerprint.httpFingerprint.hasDNT ? 'enabled' : 'disabled'}`);
-  console.log(`[Dutchie Client] TLS: ${fingerprint.httpFingerprint.curlImpersonateBinary}`);
-  console.log(`[Dutchie Client] Referer: ${referer}`);
-  if (proxyLocation?.timezone) {
-    console.log(`[Dutchie Client] Proxy: ${proxyLocation.state || 'unknown'} (${proxyLocation.timezone})`);
+  console.log(`[Dutchie Client] Fingerprint: ${fingerprint.userAgent.slice(0, 50)}...`);
+  console.log(`[Dutchie Client] Accept-Language: ${fingerprint.acceptLanguage}`);
+  if (timezone) {
+    console.log(`[Dutchie Client] Timezone: ${timezone}`);
   }

   return currentSession;
@@ -276,80 +347,48 @@ export function getCurrentSession(): CrawlSession | null {
 // ============================================================

 /**
- * Per workflow-12102025.md: Build headers using HTTP fingerprint system
- * Returns headers in browser-specific order with all natural variations
+ * Build headers for Dutchie requests
 */
-export function buildHeaders(isPost: boolean, contentLength?: number): { headers: Record<string, string>; orderedHeaders: string[] } {
-  if (!currentSession || !crawlRotator) {
-    throw new Error('[Dutchie Client] Cannot build headers without active session');
-  }
-
-  const fp = currentSession.fingerprint;
-  const httpFp = fp.httpFingerprint;
-
-  // Per workflow-12102025.md: Build context for ordered headers
-  const context: HeaderContext = {
-    userAgent: fp.userAgent,
-    secChUa: fp.secChUa,
-    secChUaPlatform: fp.secChUaPlatform,
-    secChUaMobile: fp.secChUaMobile,
-    referer: currentSession.referer,
-    isPost,
-    contentLength,
-  };
-
-  // Per workflow-12102025.md: Get ordered headers from HTTP fingerprint service
-  return buildOrderedHeaders(httpFp, context);
+export function buildHeaders(refererPath: string, fingerprint?: Fingerprint): Record<string, string> {
+  const fp = fingerprint || getFingerprint();
+  const refererUrl = `https://dutchie.com${refererPath}`;
+
+  const headers: Record<string, string> = {
+    'accept': 'application/json, text/plain, */*',
+    'accept-language': fp.acceptLanguage,
+    'content-type': 'application/json',
+    'origin': 'https://dutchie.com',
+    'referer': refererUrl,
+    'user-agent': fp.userAgent,
+    'apollographql-client-name': 'Marketplace (production)',
+  };
+
+  if (fp.secChUa) {
+    headers['sec-ch-ua'] = fp.secChUa;
+    headers['sec-ch-ua-mobile'] = fp.secChUaMobile || '?0';
+    headers['sec-ch-ua-platform'] = fp.secChUaPlatform || '"Windows"';
+    headers['sec-fetch-dest'] = 'empty';
+    headers['sec-fetch-mode'] = 'cors';
+    headers['sec-fetch-site'] = 'same-site';
+  }
+
+  return headers;
 }

 /**
- * Per workflow-12102025.md: Get curl binary for current session's browser
- * Uses curl-impersonate for TLS fingerprint matching
+ * Execute HTTP POST using curl (bypasses TLS fingerprinting)
 */
-function getCurlBinaryForSession(): string {
-  if (!currentSession) {
-    return 'curl'; // Fallback to standard curl
-  }
-
-  const browserType = currentSession.fingerprint.browserName as BrowserType;
-
-  // Per workflow-12102025.md: Check if curl-impersonate is available
-  if (isCurlImpersonateAvailable(browserType)) {
-    return getCurlBinary(browserType);
-  }
-
-  // Fallback to standard curl with warning
-  console.warn(`[Dutchie Client] curl-impersonate not available for ${browserType}, using standard curl`);
-  return 'curl';
-}
-
-/**
- * Per workflow-12102025.md: Execute HTTP POST using curl/curl-impersonate
- * - Uses browser-specific TLS fingerprint via curl-impersonate
- * - Headers sent in browser-specific order
- * - Dynamic Referer per dispensary
- */
-export function curlPost(url: string, body: any, timeout = 30000): CurlResponse {
-  const bodyJson = JSON.stringify(body);
-
-  // Per workflow-12102025.md: Build ordered headers for POST request
-  const { headers, orderedHeaders } = buildHeaders(true, bodyJson.length);
-
-  // Per workflow-12102025.md: Build header args in browser-specific order
-  const headerArgs = orderedHeaders
-    .filter(h => h !== 'Host' && h !== 'Content-Length') // curl handles these
-    .map(h => `-H '${h}: ${headers[h]}'`)
+export function curlPost(url: string, body: any, headers: Record<string, string>, timeout = 30000): CurlResponse {
+  const filteredHeaders = Object.entries(headers)
+    .filter(([k]) => k.toLowerCase() !== 'accept-encoding')
+    .map(([k, v]) => `-H '${k}: ${v}'`)
     .join(' ');

-  const bodyEscaped = bodyJson.replace(/'/g, "'\\''");
+  const bodyJson = JSON.stringify(body).replace(/'/g, "'\\''");
   const timeoutSec = Math.ceil(timeout / 1000);
   const separator = '___HTTP_STATUS___';
   const proxyArg = getProxyArg();
-
-  // Per workflow-12102025.md: Use curl-impersonate for TLS fingerprint matching
-  const curlBinary = getCurlBinaryForSession();
-  const cmd = `${curlBinary} -s --compressed ${proxyArg} -w '${separator}%{http_code}' --max-time ${timeoutSec} ${headerArgs} -d '${bodyEscaped}' '${url}'`;
+  const cmd = `curl -s --compressed ${proxyArg} -w '${separator}%{http_code}' --max-time ${timeoutSec} ${filteredHeaders} -d '${bodyJson}' '${url}'`;

   try {
     const output = execSync(cmd, {
@@ -388,29 +427,19 @@ export function curlPost(url: string, body: any, timeout = 30000): CurlResponse
 }

 /**
- * Per workflow-12102025.md: Execute HTTP GET using curl/curl-impersonate
- * - Uses browser-specific TLS fingerprint via curl-impersonate
- * - Headers sent in browser-specific order
- * - Dynamic Referer per dispensary
+ * Execute HTTP GET using curl (bypasses TLS fingerprinting)
+ * Returns HTML or JSON depending on response content-type
 */
-export function curlGet(url: string, timeout = 30000): CurlResponse {
-  // Per workflow-12102025.md: Build ordered headers for GET request
-  const { headers, orderedHeaders } = buildHeaders(false);
-
-  // Per workflow-12102025.md: Build header args in browser-specific order
-  const headerArgs = orderedHeaders
-    .filter(h => h !== 'Host' && h !== 'Content-Length') // curl handles these
-    .map(h => `-H '${h}: ${headers[h]}'`)
+export function curlGet(url: string, headers: Record<string, string>, timeout = 30000): CurlResponse {
+  const filteredHeaders = Object.entries(headers)
+    .filter(([k]) => k.toLowerCase() !== 'accept-encoding')
+    .map(([k, v]) => `-H '${k}: ${v}'`)
     .join(' ');

   const timeoutSec = Math.ceil(timeout / 1000);
   const separator = '___HTTP_STATUS___';
   const proxyArg = getProxyArg();
-
-  // Per workflow-12102025.md: Use curl-impersonate for TLS fingerprint matching
-  const curlBinary = getCurlBinaryForSession();
-  const cmd = `${curlBinary} -s --compressed ${proxyArg} -w '${separator}%{http_code}' --max-time ${timeoutSec} ${headerArgs} '${url}'`;
+  const cmd = `curl -s --compressed ${proxyArg} -w '${separator}%{http_code}' --max-time ${timeoutSec} ${filteredHeaders} '${url}'`;

   try {
     const output = execSync(cmd, {
@@ -430,6 +459,7 @@ export function curlGet(url: string, timeout = 30000): CurlResponse {
   const responseBody = output.slice(0, separatorIndex);
   const statusCode = parseInt(output.slice(separatorIndex + separator.length).trim(), 10);

+  // Try to parse as JSON, otherwise return as string (HTML)
   try {
     return { status: statusCode, data: JSON.parse(responseBody) };
   } catch {
@@ -446,22 +476,16 @@
 // ============================================================
 // GRAPHQL EXECUTION
-// Per workflow-12102025.md:
-// - On 403: immediately rotate IP + fingerprint (no delay first)
-// - Then retry
 // ============================================================

 export interface ExecuteGraphQLOptions {
   maxRetries?: number;
   retryOn403?: boolean;
-  cName?: string;
+  cName?: string; // Optional - used for Referer header, defaults to 'cities'
 }

 /**
- * Per workflow-12102025.md: Execute GraphQL query with curl/curl-impersonate
- * - Uses browser-specific TLS fingerprint
- * - Headers in browser-specific order
- * - On 403: immediately rotate IP + fingerprint, then retry
+ * Execute GraphQL query with curl (bypasses TLS fingerprinting)
 */
 export async function executeGraphQL(
   operationName: string,
@@ -469,12 +493,7 @@
   hash: string,
   options: ExecuteGraphQLOptions
 ): Promise<any> {
-  const { maxRetries = 3, retryOn403 = true } = options;
-
-  // Per workflow-12102025.md: Session must be active for requests
-  if (!currentSession) {
-    throw new Error('[Dutchie Client] Cannot execute GraphQL without active session - call startSession() first');
-  }
+  const { maxRetries = 3, retryOn403 = true, cName = 'cities' } = options;

   const body = {
     operationName,
@@ -488,14 +507,14 @@
   let attempt = 0;

   while (attempt <= maxRetries) {
+    const fingerprint = getFingerprint();
+    const headers = buildHeaders(`/embedded-menu/${cName}`, fingerprint);
     console.log(`[Dutchie Client] curl POST ${operationName} (attempt ${attempt + 1}/${maxRetries + 1})`);

-    const startTime = Date.now();
-    // Per workflow-12102025.md: curlPost now uses ordered headers and curl-impersonate
-    const response = curlPost(DUTCHIE_CONFIG.graphqlEndpoint, body, DUTCHIE_CONFIG.timeout);
-    const responseTime = Date.now() - startTime;
+    const response = curlPost(DUTCHIE_CONFIG.graphqlEndpoint, body, headers, DUTCHIE_CONFIG.timeout);

-    console.log(`[Dutchie Client] Response status: ${response.status} (${responseTime}ms)`);
+    console.log(`[Dutchie Client] Response status: ${response.status}`);

     if (response.error) {
       console.error(`[Dutchie Client] curl error: ${response.error}`);
@@ -508,9 +527,6 @@
     }

     if (response.status === 200) {
-      // Per workflow-12102025.md: success resets consecutive 403 count
-      await recordProxySuccess(responseTime);
-
       if (response.data?.errors?.length > 0) {
         console.warn(`[Dutchie Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`);
       }
@@ -518,20 +534,11 @@
     }

     if (response.status === 403 && retryOn403) {
-      // Per workflow-12102025.md: immediately rotate IP + fingerprint
-      console.warn(`[Dutchie Client] 403 blocked - immediately rotating proxy + fingerprint...`);
-      const hasMoreProxies = await handle403Block();
-      if (!hasMoreProxies) {
-        throw new Error('All proxies exhausted - no more IPs available');
-      }
-
-      // Per workflow-12102025.md: Update session referer after rotation
-      currentSession.referer = buildRefererFromMenuUrl(currentSession.menuUrl);
+      console.warn(`[Dutchie Client] 403 blocked - rotating proxy and fingerprint...`);
+      await rotateProxyOn403('403 Forbidden on GraphQL');
+      rotateFingerprint();

       attempt++;
-      // Per workflow-12102025.md: small backoff after rotation
-      await sleep(500);
+      await sleep(1000 * attempt);
       continue;
     }
@@ -560,10 +567,8 @@ export interface FetchPageOptions {
}

/**
- * Per workflow-12102025.md: Fetch HTML page from Dutchie
- * - Uses browser-specific TLS fingerprint
- * - Headers in browser-specific order
- * - Same 403 handling as GraphQL
+ * Fetch HTML page from Dutchie (for city pages, dispensary pages, etc.)
+ * Returns raw HTML string
 */
export async function fetchPage(
  path: string,

@@ -572,22 +577,32 @@ export async function fetchPage(
  const { maxRetries = 3, retryOn403 = true } = options;
  const url = `${DUTCHIE_CONFIG.baseUrl}${path}`;

-  // Per workflow-12102025.md: Session must be active for requests
-  if (!currentSession) {
-    throw new Error('[Dutchie Client] Cannot fetch page without active session - call startSession() first');
-  }

  let attempt = 0;
  while (attempt <= maxRetries) {
-    // Per workflow-12102025.md: curlGet now uses ordered headers and curl-impersonate
+    const fingerprint = getFingerprint();
+    const headers: Record<string, string> = {
+      'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+      'accept-language': fingerprint.acceptLanguage,
+      'user-agent': fingerprint.userAgent,
+    };
+    if (fingerprint.secChUa) {
+      headers['sec-ch-ua'] = fingerprint.secChUa;
+      headers['sec-ch-ua-mobile'] = fingerprint.secChUaMobile || '?0';
+      headers['sec-ch-ua-platform'] = fingerprint.secChUaPlatform || '"Windows"';
+      headers['sec-fetch-dest'] = 'document';
+      headers['sec-fetch-mode'] = 'navigate';
+      headers['sec-fetch-site'] = 'none';
+      headers['sec-fetch-user'] = '?1';
+      headers['upgrade-insecure-requests'] = '1';
+    }

    console.log(`[Dutchie Client] curl GET ${path} (attempt ${attempt + 1}/${maxRetries + 1})`);
-    const startTime = Date.now();
-    const response = curlGet(url, DUTCHIE_CONFIG.timeout);
-    const responseTime = Date.now() - startTime;
-    console.log(`[Dutchie Client] Response status: ${response.status} (${responseTime}ms)`);
+    const response = curlGet(url, headers, DUTCHIE_CONFIG.timeout);
+    console.log(`[Dutchie Client] Response status: ${response.status}`);

    if (response.error) {
      console.error(`[Dutchie Client] curl error: ${response.error}`);

@@ -599,26 +614,15 @@ export async function fetchPage(
    }

    if (response.status === 200) {
-      // Per workflow-12102025.md: success resets consecutive 403 count
-      await recordProxySuccess(responseTime);
      return { html: response.data, status: response.status };
    }

    if (response.status === 403 && retryOn403) {
-      // Per workflow-12102025.md: immediately rotate IP + fingerprint
-      console.warn(`[Dutchie Client] 403 blocked - immediately rotating proxy + fingerprint...`);
-      const hasMoreProxies = await handle403Block();
-      if (!hasMoreProxies) {
-        throw new Error('All proxies exhausted - no more IPs available');
-      }
-      // Per workflow-12102025.md: Update session after rotation
-      currentSession.referer = buildRefererFromMenuUrl(currentSession.menuUrl);
+      console.warn(`[Dutchie Client] 403 blocked - rotating proxy and fingerprint...`);
+      await rotateProxyOn403('403 Forbidden on page fetch');
+      rotateFingerprint();
      attempt++;
-      // Per workflow-12102025.md: small backoff after rotation
-      await sleep(500);
+      await sleep(1000 * attempt);
      continue;
    }
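Reviewer note: both `executeGraphQL` and `fetchPage` now converge on the same 403 recovery shape - rotate the proxy, rotate the fingerprint, back off linearly, retry. A minimal sketch of that loop, assuming this module's `rotateProxyOn403`/`rotateFingerprint` helpers and its curl response shape:

```typescript
// Sketch only - helper names, import path, and the response shape are assumed from this module.
import { rotateProxyOn403, rotateFingerprint } from './dutchie-client';

interface CurlResponse<T> { status: number; data: T; error?: string }
const sleep = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms));

async function requestWithRotation<T>(
  doRequest: () => CurlResponse<T>,
  maxRetries = 3
): Promise<T> {
  let attempt = 0;
  while (attempt <= maxRetries) {
    const response = doRequest();
    if (response.status === 200) return response.data;
    if (response.status === 403) {
      await rotateProxyOn403('403 Forbidden'); // new exit IP
      rotateFingerprint();                     // new UA/TLS identity
      attempt++;
      await sleep(1000 * attempt);             // linear backoff: 1s, 2s, 3s
      continue;
    }
    throw new Error(`Unexpected status ${response.status}`);
  }
  throw new Error(`Still blocked after ${maxRetries} retries`);
}
```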

View File

@@ -6,17 +6,22 @@
 */
export {
-  // HTTP Client (per workflow-12102025.md: uses curl-impersonate + ordered headers)
+  // HTTP Client
  curlPost,
  curlGet,
  executeGraphQL,
  fetchPage,
  extractNextData,
-  // Headers (per workflow-12102025.md: browser-specific ordering)
+  // Headers & Fingerprints
  buildHeaders,
+  getFingerprint,
+  rotateFingerprint,
+  resetFingerprint,
+  getRandomFingerprint,
+  getLocaleForTimezone,
-  // Session Management (per workflow-12102025.md: menuUrl for dynamic Referer)
+  // Session Management (per-store fingerprint rotation)
  startSession,
  endSession,
  getCurrentSession,
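Reviewer note: the newly exported fingerprint helpers compose like this (illustrative consumer; the import path is assumed):

```typescript
// Illustrative consumer of the new exports - import path assumed.
import { rotateFingerprint, getFingerprint, buildHeaders } from './platforms/dutchie';

function headersForStore(cName: string): Record<string, string> {
  rotateFingerprint(); // fresh per-store identity
  const fingerprint = getFingerprint();
  return buildHeaders(`/embedded-menu/${cName}`, fingerprint);
}
```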

View File

@@ -47,27 +47,4 @@ router.post('/refresh', authMiddleware, async (req: AuthRequest, res) => {
  res.json({ token });
});

-// Verify password for sensitive actions (requires current user to be authenticated)
-router.post('/verify-password', authMiddleware, async (req: AuthRequest, res) => {
-  try {
-    const { password } = req.body;
-    if (!password) {
-      return res.status(400).json({ error: 'Password required' });
-    }
-    // Re-authenticate the current user with the provided password
-    const user = await authenticateUser(req.user!.email, password);
-    if (!user) {
-      return res.status(401).json({ error: 'Invalid password', verified: false });
-    }
-    res.json({ verified: true });
-  } catch (error) {
-    console.error('Password verification error:', error);
-    res.status(500).json({ error: 'Internal server error' });
-  }
-});

export default router;
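Reviewer note: for the record, a client exercised the now-removed endpoint roughly like this (illustrative only; the `/api/auth` mount path is assumed, and a 401 means the password did not match):

```typescript
// Illustrative only - the route is removed in this diff; /api/auth mount path assumed.
async function verifyPassword(token: string, password: string): Promise<boolean> {
  const res = await fetch('/api/auth/verify-password', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` },
    body: JSON.stringify({ password }),
  });
  if (res.status === 401) return false; // wrong password
  if (!res.ok) throw new Error(`verify-password failed: ${res.status}`);
  const { verified } = await res.json();
  return verified === true;
}
```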

View File

@@ -14,25 +14,13 @@ router.use(authMiddleware);
/**
 * GET /api/admin/intelligence/brands
 * List all brands with state presence, store counts, and pricing
- * Query params:
- * - state: Filter by state (e.g., "AZ")
- * - limit: Max results (default 500)
- * - offset: Pagination offset
 */
router.get('/brands', async (req: Request, res: Response) => {
  try {
-    const { limit = '500', offset = '0', state } = req.query;
+    const { limit = '500', offset = '0' } = req.query;
    const limitNum = Math.min(parseInt(limit as string, 10), 1000);
    const offsetNum = parseInt(offset as string, 10);

-    // Build WHERE clause based on state filter
-    let stateFilter = '';
-    const params: any[] = [limitNum, offsetNum];
-    if (state && state !== 'all') {
-      stateFilter = 'AND d.state = $3';
-      params.push(state);
-    }

    const { rows } = await pool.query(`
      SELECT
        sp.brand_name_raw as brand_name,

@@ -44,26 +32,17 @@ router.get('/brands', async (req: Request, res: Response) => {
      FROM store_products sp
      JOIN dispensaries d ON sp.dispensary_id = d.id
      WHERE sp.brand_name_raw IS NOT NULL AND sp.brand_name_raw != ''
-        ${stateFilter}
      GROUP BY sp.brand_name_raw
      ORDER BY store_count DESC, sku_count DESC
      LIMIT $1 OFFSET $2
-    `, params);
+    `, [limitNum, offsetNum]);

-    // Get total count with same state filter
-    const countParams: any[] = [];
-    let countStateFilter = '';
-    if (state && state !== 'all') {
-      countStateFilter = 'AND d.state = $1';
-      countParams.push(state);
-    }
+    // Get total count
    const { rows: countRows } = await pool.query(`
-      SELECT COUNT(DISTINCT sp.brand_name_raw) as total
-      FROM store_products sp
-      JOIN dispensaries d ON sp.dispensary_id = d.id
-      WHERE sp.brand_name_raw IS NOT NULL AND sp.brand_name_raw != ''
-        ${countStateFilter}
-    `, countParams);
+      SELECT COUNT(DISTINCT brand_name_raw) as total
+      FROM store_products
+      WHERE brand_name_raw IS NOT NULL AND brand_name_raw != ''
+    `);

    res.json({
      brands: rows.map((r: any) => ({
@@ -168,42 +147,10 @@ router.get('/brands/:brandName/penetration', async (req: Request, res: Response)
/**
 * GET /api/admin/intelligence/pricing
 * Get pricing analytics by category
- * Query params:
- * - state: Filter by state (e.g., "AZ")
 */
router.get('/pricing', async (req: Request, res: Response) => {
  try {
-    const { state } = req.query;
-
-    // Build WHERE clause based on state filter
-    let stateFilter = '';
-    const categoryParams: any[] = [];
-    const stateQueryParams: any[] = [];
-    const overallParams: any[] = [];
-    if (state && state !== 'all') {
-      stateFilter = 'AND d.state = $1';
-      categoryParams.push(state);
-      overallParams.push(state);
-    }
-
-    // Category pricing with optional state filter
-    const categoryQuery = state && state !== 'all'
-      ? `
-        SELECT
-          sp.category_raw as category,
-          ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,
-          MIN(sp.price_rec) as min_price,
-          MAX(sp.price_rec) as max_price,
-          ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sp.price_rec)::numeric, 2) as median_price,
-          COUNT(*) as product_count
-        FROM store_products sp
-        JOIN dispensaries d ON sp.dispensary_id = d.id
-        WHERE sp.category_raw IS NOT NULL AND sp.price_rec > 0 ${stateFilter}
-        GROUP BY sp.category_raw
-        ORDER BY product_count DESC
-      `
-      : `
+    const { rows: categoryRows } = await pool.query(`
      SELECT
        sp.category_raw as category,
        ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,

@@ -215,11 +162,8 @@ router.get('/pricing', async (req: Request, res: Response) => {
      WHERE sp.category_raw IS NOT NULL AND sp.price_rec > 0
      GROUP BY sp.category_raw
      ORDER BY product_count DESC
-    `;
-    const { rows: categoryRows } = await pool.query(categoryQuery, categoryParams);
-
-    // State pricing
+    `);

    const { rows: stateRows } = await pool.query(`
      SELECT
        d.state,

@@ -234,31 +178,6 @@ router.get('/pricing', async (req: Request, res: Response) => {
      ORDER BY avg_price DESC
    `);

-    // Overall stats with optional state filter
-    const overallQuery = state && state !== 'all'
-      ? `
-        SELECT
-          ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,
-          MIN(sp.price_rec) as min_price,
-          MAX(sp.price_rec) as max_price,
-          COUNT(*) as total_products
-        FROM store_products sp
-        JOIN dispensaries d ON sp.dispensary_id = d.id
-        WHERE sp.price_rec > 0 ${stateFilter}
-      `
-      : `
-        SELECT
-          ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,
-          MIN(sp.price_rec) as min_price,
-          MAX(sp.price_rec) as max_price,
-          COUNT(*) as total_products
-        FROM store_products sp
-        WHERE sp.price_rec > 0
-      `;
-    const { rows: overallRows } = await pool.query(overallQuery, overallParams);
-    const overall = overallRows[0];

    res.json({
      byCategory: categoryRows.map((r: any) => ({
        category: r.category,

@@ -275,12 +194,6 @@ router.get('/pricing', async (req: Request, res: Response) => {
        maxPrice: r.max_price ? parseFloat(r.max_price) : null,
        productCount: parseInt(r.product_count, 10),
      })),
-      overall: {
-        avgPrice: overall?.avg_price ? parseFloat(overall.avg_price) : null,
-        minPrice: overall?.min_price ? parseFloat(overall.min_price) : null,
-        maxPrice: overall?.max_price ? parseFloat(overall.max_price) : null,
-        totalProducts: parseInt(overall?.total_products || '0', 10),
-      },
    });
  } catch (error: any) {
    console.error('[Intelligence] Error fetching pricing:', error.message);
@@ -291,23 +204,9 @@
/**
 * GET /api/admin/intelligence/stores
 * Get store intelligence summary
- * Query params:
- * - state: Filter by state (e.g., "AZ")
- * - limit: Max results (default 200)
 */
router.get('/stores', async (req: Request, res: Response) => {
  try {
-    const { state, limit = '200' } = req.query;
-    const limitNum = Math.min(parseInt(limit as string, 10), 500);
-
-    // Build WHERE clause based on state filter
-    let stateFilter = '';
-    const params: any[] = [limitNum];
-    if (state && state !== 'all') {
-      stateFilter = 'AND d.state = $2';
-      params.push(state);
-    }

    const { rows: storeRows } = await pool.query(`
      SELECT
        d.id,

@@ -317,22 +216,17 @@
        d.state,
        d.menu_type,
        d.crawl_enabled,
-        c.name as chain_name,
-        COUNT(DISTINCT sp.id) as sku_count,
+        COUNT(DISTINCT sp.id) as product_count,
        COUNT(DISTINCT sp.brand_name_raw) as brand_count,
        ROUND(AVG(sp.price_rec)::numeric, 2) as avg_price,
-        MAX(sp.updated_at) as last_crawl,
-        (SELECT COUNT(*) FROM store_product_snapshots sps
-         WHERE sps.store_product_id IN (SELECT id FROM store_products WHERE dispensary_id = d.id)) as snapshot_count
+        MAX(sp.updated_at) as last_product_update
      FROM dispensaries d
      LEFT JOIN store_products sp ON sp.dispensary_id = d.id
-      LEFT JOIN chains c ON d.chain_id = c.id
-      WHERE d.state IS NOT NULL AND d.crawl_enabled = true
-        ${stateFilter}
-      GROUP BY d.id, d.name, d.dba_name, d.city, d.state, d.menu_type, d.crawl_enabled, c.name
-      ORDER BY sku_count DESC
-      LIMIT $1
-    `, params);
+      WHERE d.state IS NOT NULL
+      GROUP BY d.id, d.name, d.dba_name, d.city, d.state, d.menu_type, d.crawl_enabled
+      ORDER BY product_count DESC
+      LIMIT 200
+    `);

    res.json({
      stores: storeRows.map((r: any) => ({

@@ -343,13 +237,10 @@
        state: r.state,
        menuType: r.menu_type,
        crawlEnabled: r.crawl_enabled,
-        chainName: r.chain_name || null,
-        skuCount: parseInt(r.sku_count || '0', 10),
-        snapshotCount: parseInt(r.snapshot_count || '0', 10),
+        productCount: parseInt(r.product_count || '0', 10),
        brandCount: parseInt(r.brand_count || '0', 10),
        avgPrice: r.avg_price ? parseFloat(r.avg_price) : null,
-        lastCrawl: r.last_crawl,
-        crawlFrequencyHours: 4, // Default crawl frequency
+        lastProductUpdate: r.last_product_update,
      })),
      total: storeRows.length,
    });
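Reviewer note: all three removed filters followed the same positional-parameter pattern, where the `AND d.state = $n` fragment and the values array must grow in lockstep. A small sketch of that pattern in isolation (names are illustrative):

```typescript
// Illustrative reconstruction of the removed optional-filter pattern.
function stateFilter(state: string | undefined, nextIndex: number) {
  if (state && state !== 'all') {
    return { clause: `AND d.state = $${nextIndex}`, params: [state] };
  }
  return { clause: '', params: [] as string[] };
}

// Base params [limit, offset] take $1/$2, so the filter binds at $3:
const { clause, params } = stateFilter('AZ', 3);
const values = [500, 0, ...params];
// `... WHERE ... ${clause} LIMIT $1 OFFSET $2` is then queried with `values`.
```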

View File

@@ -543,9 +543,6 @@ router.post('/bulk-priority', async (req: Request, res: Response) => {
/**
 * POST /api/job-queue/enqueue - Add a new job to the queue
- *
- * 2024-12-10: Rewired to use worker_tasks via taskService.
- * Legacy dispensary_crawl_jobs code commented out below.
 */
router.post('/enqueue', async (req: Request, res: Response) => {
  try {

@@ -555,59 +552,6 @@ router.post('/enqueue', async (req: Request, res: Response) => {
      return res.status(400).json({ success: false, error: 'dispensary_id is required' });
    }

-    // 2024-12-10: Map legacy job_type to new task role
-    const roleMap: Record<string, string> = {
-      'dutchie_product_crawl': 'product_refresh',
-      'menu_detection': 'entry_point_discovery',
-      'menu_detection_single': 'entry_point_discovery',
-      'product_discovery': 'product_discovery',
-      'store_discovery': 'store_discovery',
-    };
-    const role = roleMap[job_type] || 'product_refresh';
-
-    // 2024-12-10: Use taskService to create task in worker_tasks table
-    const { taskService } = await import('../tasks/task-service');
-
-    // Check if task already pending for this dispensary
-    const existingTasks = await taskService.listTasks({
-      dispensary_id,
-      role: role as any,
-      status: ['pending', 'claimed', 'running'],
-      limit: 1,
-    });
-
-    if (existingTasks.length > 0) {
-      return res.json({
-        success: true,
-        task_id: existingTasks[0].id,
-        message: 'Task already queued'
-      });
-    }
-
-    const task = await taskService.createTask({
-      role: role as any,
-      dispensary_id,
-      priority,
-    });
-
-    res.json({ success: true, task_id: task.id, message: 'Task enqueued' });
-  } catch (error: any) {
-    console.error('[JobQueue] Error enqueuing task:', error);
-    res.status(500).json({ success: false, error: error.message });
-  }
-});
-
-/*
- * LEGACY CODE - 2024-12-10: Commented out, was using orphaned dispensary_crawl_jobs table
- *
-router.post('/enqueue', async (req: Request, res: Response) => {
-  try {
-    const { dispensary_id, job_type = 'dutchie_product_crawl', priority = 0 } = req.body;
-    if (!dispensary_id) {
-      return res.status(400).json({ success: false, error: 'dispensary_id is required' });
-    }

    // Check if job already pending for this dispensary
    const existing = await pool.query(`
      SELECT id FROM dispensary_crawl_jobs

@@ -641,7 +585,6 @@ router.post('/enqueue', async (req: Request, res: Response) => {
      res.status(500).json({ success: false, error: error.message });
    }
  });
-*/
/**
 * POST /api/job-queue/pause - Pause queue processing

@@ -669,8 +612,6 @@ router.get('/paused', async (_req: Request, res: Response) => {
/**
 * POST /api/job-queue/enqueue-batch - Queue multiple dispensaries at once
 * Body: { dispensary_ids: number[], job_type?: string, priority?: number }
- *
- * 2024-12-10: Rewired to use worker_tasks via taskService.
 */
router.post('/enqueue-batch', async (req: Request, res: Response) => {
  try {

@@ -684,30 +625,35 @@ router.post('/enqueue-batch', async (req: Request, res: Response) => {
      return res.status(400).json({ success: false, error: 'Maximum 500 dispensaries per batch' });
    }

-    // 2024-12-10: Map legacy job_type to new task role
-    const roleMap: Record<string, string> = {
-      'dutchie_product_crawl': 'product_refresh',
-      'menu_detection': 'entry_point_discovery',
-      'product_discovery': 'product_discovery',
-    };
-    const role = roleMap[job_type] || 'product_refresh';
-
-    // 2024-12-10: Use taskService to create tasks in worker_tasks table
-    const { taskService } = await import('../tasks/task-service');
-
-    const tasks = dispensary_ids.map(dispensary_id => ({
-      role: role as any,
-      dispensary_id,
-      priority,
-    }));
-
-    const createdCount = await taskService.createTasks(tasks);
+    // Insert jobs, skipping duplicates
+    const { rows } = await pool.query(`
+      INSERT INTO dispensary_crawl_jobs (dispensary_id, job_type, priority, trigger_type, status, created_at)
+      SELECT
+        d.id,
+        $2::text,
+        $3::integer,
+        'api_batch',
+        'pending',
+        NOW()
+      FROM dispensaries d
+      WHERE d.id = ANY($1::int[])
+        AND d.crawl_enabled = true
+        AND d.platform_dispensary_id IS NOT NULL
+        AND NOT EXISTS (
+          SELECT 1 FROM dispensary_crawl_jobs cj
+          WHERE cj.dispensary_id = d.id
+            AND cj.job_type = $2::text
+            AND cj.status IN ('pending', 'running')
+        )
+      RETURNING id, dispensary_id
+    `, [dispensary_ids, job_type, priority]);

    res.json({
      success: true,
-      queued: createdCount,
+      queued: rows.length,
      requested: dispensary_ids.length,
-      message: `Queued ${createdCount} of ${dispensary_ids.length} dispensaries`
+      job_ids: rows.map(r => r.id),
+      message: `Queued ${rows.length} of ${dispensary_ids.length} dispensaries`
    });
  } catch (error: any) {
    console.error('[JobQueue] Error batch enqueuing:', error);
@@ -718,8 +664,6 @@ router.post('/enqueue-batch', async (req: Request, res: Response) => {
/**
 * POST /api/job-queue/enqueue-state - Queue all crawl-enabled dispensaries for a state
 * Body: { state_code: string, job_type?: string, priority?: number, limit?: number }
- *
- * 2024-12-10: Rewired to use worker_tasks via taskService.
 */
router.post('/enqueue-state', async (req: Request, res: Response) => {
  try {

@@ -729,55 +673,52 @@ router.post('/enqueue-state', async (req: Request, res: Response) => {
      return res.status(400).json({ success: false, error: 'state_code is required (e.g., "AZ")' });
    }

-    // 2024-12-10: Map legacy job_type to new task role
-    const roleMap: Record<string, string> = {
-      'dutchie_product_crawl': 'product_refresh',
-      'menu_detection': 'entry_point_discovery',
-      'product_discovery': 'product_discovery',
-    };
-    const role = roleMap[job_type] || 'product_refresh';
-
-    // Get dispensary IDs for the state
-    const dispensaryResult = await pool.query(`
-      SELECT d.id
-      FROM dispensaries d
-      JOIN states s ON s.id = d.state_id
-      WHERE s.code = $1
-        AND d.crawl_enabled = true
-        AND d.platform_dispensary_id IS NOT NULL
-      LIMIT $2
-    `, [state_code.toUpperCase(), limit]);
-
-    const dispensary_ids = dispensaryResult.rows.map((r: any) => r.id);
-
-    // 2024-12-10: Use taskService to create tasks in worker_tasks table
-    const { taskService } = await import('../tasks/task-service');
-
-    const tasks = dispensary_ids.map((dispensary_id: number) => ({
-      role: role as any,
-      dispensary_id,
-      priority,
-    }));
-
-    const createdCount = await taskService.createTasks(tasks);
+    // Get state_id and queue jobs
+    const { rows } = await pool.query(`
+      WITH target_state AS (
+        SELECT id FROM states WHERE code = $1
+      )
+      INSERT INTO dispensary_crawl_jobs (dispensary_id, job_type, priority, trigger_type, status, created_at)
+      SELECT
+        d.id,
+        $2::text,
+        $3::integer,
+        'api_state',
+        'pending',
+        NOW()
+      FROM dispensaries d, target_state
+      WHERE d.state_id = target_state.id
+        AND d.crawl_enabled = true
+        AND d.platform_dispensary_id IS NOT NULL
+        AND NOT EXISTS (
+          SELECT 1 FROM dispensary_crawl_jobs cj
+          WHERE cj.dispensary_id = d.id
+            AND cj.job_type = $2::text
+            AND cj.status IN ('pending', 'running')
+        )
+      LIMIT $4::integer
+      RETURNING id, dispensary_id
+    `, [state_code.toUpperCase(), job_type, priority, limit]);

    // Get total available count
    const countResult = await pool.query(`
+      WITH target_state AS (
+        SELECT id FROM states WHERE code = $1
+      )
      SELECT COUNT(*) as total
-      FROM dispensaries d
-      JOIN states s ON s.id = d.state_id
-      WHERE s.code = $1
+      FROM dispensaries d, target_state
+      WHERE d.state_id = target_state.id
        AND d.crawl_enabled = true
        AND d.platform_dispensary_id IS NOT NULL
    `, [state_code.toUpperCase()]);

    res.json({
      success: true,
-      queued: createdCount,
+      queued: rows.length,
      total_available: parseInt(countResult.rows[0].total),
      state: state_code.toUpperCase(),
-      role,
-      message: `Queued ${createdCount} dispensaries for ${state_code.toUpperCase()}`
+      job_type,
+      message: `Queued ${rows.length} dispensaries for ${state_code.toUpperCase()}`
    });
  } catch (error: any) {
    console.error('[JobQueue] Error enqueuing state:', error);
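Reviewer note: the restored SQL path can be exercised like this (illustrative call; duplicates already pending or running are skipped server-side, so `queued` can be less than `requested`):

```typescript
// Illustrative call against the restored batch endpoint.
const res = await fetch('/api/job-queue/enqueue-batch', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    dispensary_ids: [101, 102, 103], // hypothetical IDs
    job_type: 'dutchie_product_crawl',
    priority: 5,
  }),
});
const { queued, requested, job_ids } = await res.json();
console.log(`queued ${queued}/${requested}`, job_ids);
```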

View File

@@ -1,140 +0,0 @@
/**
* Kubernetes Control Routes
*
* Provides admin UI control over k8s resources like worker scaling.
* Uses in-cluster config when running in k8s, or kubeconfig locally.
*/
import { Router, Request, Response } from 'express';
import * as k8s from '@kubernetes/client-node';
const router = Router();
// K8s client setup - lazy initialization
let appsApi: k8s.AppsV1Api | null = null;
let k8sError: string | null = null;
function getK8sClient(): k8s.AppsV1Api | null {
if (appsApi) return appsApi;
if (k8sError) return null;
try {
const kc = new k8s.KubeConfig();
// Try in-cluster config first (when running in k8s)
try {
kc.loadFromCluster();
console.log('[K8s] Loaded in-cluster config');
} catch {
// Fall back to default kubeconfig (local dev)
try {
kc.loadFromDefault();
console.log('[K8s] Loaded default kubeconfig');
} catch (e) {
k8sError = 'No k8s config available';
console.log('[K8s] No config available - k8s routes disabled');
return null;
}
}
appsApi = kc.makeApiClient(k8s.AppsV1Api);
return appsApi;
} catch (e: any) {
k8sError = e.message;
console.error('[K8s] Failed to initialize client:', e.message);
return null;
}
}
const NAMESPACE = process.env.K8S_NAMESPACE || 'dispensary-scraper';
const WORKER_DEPLOYMENT = 'scraper-worker';
/**
* GET /api/k8s/workers
* Get current worker deployment status
*/
router.get('/workers', async (_req: Request, res: Response) => {
const client = getK8sClient();
if (!client) {
return res.json({
success: true,
available: false,
error: k8sError || 'K8s not available',
replicas: 0,
readyReplicas: 0,
});
}
try {
const deployment = await client.readNamespacedDeployment({
name: WORKER_DEPLOYMENT,
namespace: NAMESPACE,
});
res.json({
success: true,
available: true,
replicas: deployment.spec?.replicas || 0,
readyReplicas: deployment.status?.readyReplicas || 0,
availableReplicas: deployment.status?.availableReplicas || 0,
updatedReplicas: deployment.status?.updatedReplicas || 0,
});
} catch (e: any) {
console.error('[K8s] Error getting deployment:', e.message);
res.status(500).json({
success: false,
error: e.message,
});
}
});
/**
* POST /api/k8s/workers/scale
* Scale worker deployment
* Body: { replicas: number }
*/
router.post('/workers/scale', async (req: Request, res: Response) => {
const client = getK8sClient();
if (!client) {
return res.status(503).json({
success: false,
error: k8sError || 'K8s not available',
});
}
const { replicas } = req.body;
if (typeof replicas !== 'number' || replicas < 0 || replicas > 50) {
return res.status(400).json({
success: false,
error: 'replicas must be a number between 0 and 50',
});
}
try {
// Patch the deployment to set replicas
await client.patchNamespacedDeploymentScale({
name: WORKER_DEPLOYMENT,
namespace: NAMESPACE,
body: { spec: { replicas } },
});
console.log(`[K8s] Scaled ${WORKER_DEPLOYMENT} to ${replicas} replicas`);
res.json({
success: true,
replicas,
message: `Scaled to ${replicas} workers`,
});
} catch (e: any) {
console.error('[K8s] Error scaling deployment:', e.message);
res.status(500).json({
success: false,
error: e.message,
});
}
});
export default router;
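Reviewer note: while this router was mounted, the admin UI drove scaling with a call shaped like the following (illustrative; the `/api/k8s` mount path is assumed from the route names, and the server rejects `replicas` outside 0-50):

```typescript
// Illustrative only - this router is removed in the diff; /api/k8s mount path assumed.
async function scaleWorkers(replicas: number): Promise<void> {
  const res = await fetch('/api/k8s/workers/scale', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ replicas }), // server validates 0-50
  });
  if (!res.ok) throw new Error(`Scale failed: ${res.status}`);
}
```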

View File

@@ -291,107 +291,6 @@ router.get('/stores/:id/summary', async (req: Request, res: Response) => {
  }
});
/**
* GET /api/markets/stores/:id/crawl-history
* Get crawl history for a specific store
*/
router.get('/stores/:id/crawl-history', async (req: Request, res: Response) => {
try {
const { id } = req.params;
const { limit = '50' } = req.query;
const dispensaryId = parseInt(id, 10);
const limitNum = Math.min(parseInt(limit as string, 10), 100);
// Get crawl history from crawl_orchestration_traces
const { rows: historyRows } = await pool.query(`
SELECT
id,
run_id,
profile_key,
crawler_module,
state_at_start,
state_at_end,
total_steps,
duration_ms,
success,
error_message,
products_found,
started_at,
completed_at
FROM crawl_orchestration_traces
WHERE dispensary_id = $1
ORDER BY started_at DESC
LIMIT $2
`, [dispensaryId, limitNum]);
// Get next scheduled crawl if available
const { rows: scheduleRows } = await pool.query(`
SELECT
js.id as schedule_id,
js.job_name,
js.enabled,
js.base_interval_minutes,
js.jitter_minutes,
js.next_run_at,
js.last_run_at,
js.last_status
FROM job_schedules js
WHERE js.enabled = true
AND js.job_config->>'dispensaryId' = $1::text
ORDER BY js.next_run_at
LIMIT 1
`, [dispensaryId.toString()]);
// Get dispensary info for slug
const { rows: dispRows } = await pool.query(`
SELECT
id,
name,
dba_name,
slug,
state,
city,
menu_type,
platform_dispensary_id,
last_menu_scrape
FROM dispensaries
WHERE id = $1
`, [dispensaryId]);
res.json({
dispensary: dispRows[0] || null,
history: historyRows.map(row => ({
id: row.id,
runId: row.run_id,
profileKey: row.profile_key,
crawlerModule: row.crawler_module,
stateAtStart: row.state_at_start,
stateAtEnd: row.state_at_end,
totalSteps: row.total_steps,
durationMs: row.duration_ms,
success: row.success,
errorMessage: row.error_message,
productsFound: row.products_found,
startedAt: row.started_at?.toISOString() || null,
completedAt: row.completed_at?.toISOString() || null,
})),
nextSchedule: scheduleRows[0] ? {
scheduleId: scheduleRows[0].schedule_id,
jobName: scheduleRows[0].job_name,
enabled: scheduleRows[0].enabled,
baseIntervalMinutes: scheduleRows[0].base_interval_minutes,
jitterMinutes: scheduleRows[0].jitter_minutes,
nextRunAt: scheduleRows[0].next_run_at?.toISOString() || null,
lastRunAt: scheduleRows[0].last_run_at?.toISOString() || null,
lastStatus: scheduleRows[0].last_status,
} : null,
});
} catch (error: any) {
console.error('[Markets] Error fetching crawl history:', error.message);
res.status(500).json({ error: error.message });
}
});
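Reviewer note: the response shape this removed route produced, reconstructed from the `res.json()` call above (field types inferred, not authoritative):

```typescript
// Inferred from the removed handler above - best-effort types, not authoritative.
interface CrawlHistoryResponse {
  dispensary: Record<string, unknown> | null;
  history: Array<{
    id: number;
    runId: string;
    profileKey: string | null;
    crawlerModule: string | null;
    stateAtStart: string | null;
    stateAtEnd: string | null;
    totalSteps: number | null;
    durationMs: number | null;
    success: boolean;
    errorMessage: string | null;
    productsFound: number | null;
    startedAt: string | null;   // ISO timestamp
    completedAt: string | null; // ISO timestamp
  }>;
  nextSchedule: {
    scheduleId: number;
    jobName: string;
    enabled: boolean;
    baseIntervalMinutes: number;
    jitterMinutes: number;
    nextRunAt: string | null;
    lastRunAt: string | null;
    lastStatus: string | null;
  } | null;
}
```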
/**
 * GET /api/markets/stores/:id/products
 * Get products for a store with filtering and pagination

View File

@@ -78,14 +78,14 @@ router.get('/metrics', async (_req: Request, res: Response) => {
/**
 * GET /api/admin/orchestrator/states
- * Returns array of states with at least one crawl-enabled dispensary
+ * Returns array of states with at least one known dispensary
 */
router.get('/states', async (_req: Request, res: Response) => {
  try {
    const { rows } = await pool.query(`
      SELECT DISTINCT state, COUNT(*) as store_count
      FROM dispensaries
-      WHERE state IS NOT NULL AND crawl_enabled = true
+      WHERE state IS NOT NULL
      GROUP BY state
      ORDER BY state
    `);

View File

@@ -1,334 +0,0 @@
/**
* Payload Routes
*
* Per TASK_WORKFLOW_2024-12-10.md: API access to raw crawl payloads.
*
* Endpoints:
* - GET /api/payloads - List payload metadata (paginated)
* - GET /api/payloads/:id - Get payload metadata by ID
* - GET /api/payloads/:id/data - Get full payload JSON
* - GET /api/payloads/store/:dispensaryId - List payloads for a store
* - GET /api/payloads/store/:dispensaryId/latest - Get latest payload for a store
* - GET /api/payloads/store/:dispensaryId/diff - Diff two payloads
*/
import { Router, Request, Response } from 'express';
import { getPool } from '../db/pool';
import {
loadRawPayloadById,
getLatestPayload,
getRecentPayloads,
listPayloadMetadata,
} from '../utils/payload-storage';
import { Pool } from 'pg';
const router = Router();
// Get pool instance for queries
const getDbPool = (): Pool => getPool() as unknown as Pool;
/**
* GET /api/payloads
* List payload metadata (paginated)
*/
router.get('/', async (req: Request, res: Response) => {
try {
const pool = getDbPool();
const limit = Math.min(parseInt(req.query.limit as string) || 50, 100);
const offset = parseInt(req.query.offset as string) || 0;
const dispensaryId = req.query.dispensary_id ? parseInt(req.query.dispensary_id as string) : undefined;
const payloads = await listPayloadMetadata(pool, {
dispensaryId,
limit,
offset,
});
res.json({
success: true,
payloads,
pagination: { limit, offset },
});
} catch (error: any) {
console.error('[Payloads] List error:', error.message);
res.status(500).json({ success: false, error: error.message });
}
});
/**
* GET /api/payloads/:id
* Get payload metadata by ID
*/
router.get('/:id', async (req: Request, res: Response) => {
try {
const pool = getDbPool();
const id = parseInt(req.params.id);
const result = await pool.query(`
SELECT
p.id,
p.dispensary_id,
p.crawl_run_id,
p.storage_path,
p.product_count,
p.size_bytes,
p.size_bytes_raw,
p.fetched_at,
p.processed_at,
p.checksum_sha256,
d.name as dispensary_name
FROM raw_crawl_payloads p
LEFT JOIN dispensaries d ON d.id = p.dispensary_id
WHERE p.id = $1
`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ success: false, error: 'Payload not found' });
}
res.json({
success: true,
payload: result.rows[0],
});
} catch (error: any) {
console.error('[Payloads] Get error:', error.message);
res.status(500).json({ success: false, error: error.message });
}
});
/**
* GET /api/payloads/:id/data
* Get full payload JSON (decompressed from disk)
*/
router.get('/:id/data', async (req: Request, res: Response) => {
try {
const pool = getDbPool();
const id = parseInt(req.params.id);
const result = await loadRawPayloadById(pool, id);
if (!result) {
return res.status(404).json({ success: false, error: 'Payload not found' });
}
res.json({
success: true,
metadata: result.metadata,
data: result.payload,
});
} catch (error: any) {
console.error('[Payloads] Get data error:', error.message);
res.status(500).json({ success: false, error: error.message });
}
});
/**
* GET /api/payloads/store/:dispensaryId
* List payloads for a specific store
*/
router.get('/store/:dispensaryId', async (req: Request, res: Response) => {
try {
const pool = getDbPool();
const dispensaryId = parseInt(req.params.dispensaryId);
const limit = Math.min(parseInt(req.query.limit as string) || 20, 100);
const offset = parseInt(req.query.offset as string) || 0;
const payloads = await listPayloadMetadata(pool, {
dispensaryId,
limit,
offset,
});
res.json({
success: true,
dispensaryId,
payloads,
pagination: { limit, offset },
});
} catch (error: any) {
console.error('[Payloads] Store list error:', error.message);
res.status(500).json({ success: false, error: error.message });
}
});
/**
* GET /api/payloads/store/:dispensaryId/latest
* Get the latest payload for a store (with full data)
*/
router.get('/store/:dispensaryId/latest', async (req: Request, res: Response) => {
try {
const pool = getDbPool();
const dispensaryId = parseInt(req.params.dispensaryId);
const result = await getLatestPayload(pool, dispensaryId);
if (!result) {
return res.status(404).json({
success: false,
error: `No payloads found for dispensary ${dispensaryId}`,
});
}
res.json({
success: true,
metadata: result.metadata,
data: result.payload,
});
} catch (error: any) {
console.error('[Payloads] Latest error:', error.message);
res.status(500).json({ success: false, error: error.message });
}
});
/**
* GET /api/payloads/store/:dispensaryId/diff
* Compare two payloads for a store
*
* Query params:
* - from: payload ID (older)
* - to: payload ID (newer) - optional, defaults to latest
*/
router.get('/store/:dispensaryId/diff', async (req: Request, res: Response) => {
try {
const pool = getDbPool();
const dispensaryId = parseInt(req.params.dispensaryId);
const fromId = req.query.from ? parseInt(req.query.from as string) : undefined;
const toId = req.query.to ? parseInt(req.query.to as string) : undefined;
let fromPayload: any;
let toPayload: any;
if (fromId && toId) {
// Load specific payloads
const [from, to] = await Promise.all([
loadRawPayloadById(pool, fromId),
loadRawPayloadById(pool, toId),
]);
fromPayload = from;
toPayload = to;
} else {
// Load two most recent
const recent = await getRecentPayloads(pool, dispensaryId, 2);
if (recent.length < 2) {
return res.status(400).json({
success: false,
error: 'Need at least 2 payloads to diff. Only found ' + recent.length,
});
}
toPayload = recent[0]; // Most recent
fromPayload = recent[1]; // Previous
}
if (!fromPayload || !toPayload) {
return res.status(404).json({ success: false, error: 'One or both payloads not found' });
}
// Build product maps by ID
const fromProducts = new Map<string, any>();
const toProducts = new Map<string, any>();
for (const p of fromPayload.payload.products || []) {
const id = p._id || p.id;
if (id) fromProducts.set(id, p);
}
for (const p of toPayload.payload.products || []) {
const id = p._id || p.id;
if (id) toProducts.set(id, p);
}
// Find differences
const added: any[] = [];
const removed: any[] = [];
const priceChanges: any[] = [];
const stockChanges: any[] = [];
// Products in "to" but not in "from" = added
for (const [id, product] of toProducts) {
if (!fromProducts.has(id)) {
added.push({
id,
name: product.name,
brand: product.brand?.name,
price: product.Prices?.[0]?.price,
});
}
}
// Products in "from" but not in "to" = removed
for (const [id, product] of fromProducts) {
if (!toProducts.has(id)) {
removed.push({
id,
name: product.name,
brand: product.brand?.name,
price: product.Prices?.[0]?.price,
});
}
}
// Products in both - check for changes
for (const [id, toProduct] of toProducts) {
const fromProduct = fromProducts.get(id);
if (!fromProduct) continue;
const fromPrice = fromProduct.Prices?.[0]?.price;
const toPrice = toProduct.Prices?.[0]?.price;
if (fromPrice !== toPrice) {
priceChanges.push({
id,
name: toProduct.name,
brand: toProduct.brand?.name,
oldPrice: fromPrice,
newPrice: toPrice,
change: toPrice && fromPrice ? toPrice - fromPrice : null,
});
}
const fromStock = fromProduct.Status || fromProduct.status;
const toStock = toProduct.Status || toProduct.status;
if (fromStock !== toStock) {
stockChanges.push({
id,
name: toProduct.name,
brand: toProduct.brand?.name,
oldStatus: fromStock,
newStatus: toStock,
});
}
}
res.json({
success: true,
from: {
id: fromPayload.metadata.id,
fetchedAt: fromPayload.metadata.fetchedAt,
productCount: fromPayload.metadata.productCount,
},
to: {
id: toPayload.metadata.id,
fetchedAt: toPayload.metadata.fetchedAt,
productCount: toPayload.metadata.productCount,
},
diff: {
added: added.length,
removed: removed.length,
priceChanges: priceChanges.length,
stockChanges: stockChanges.length,
},
details: {
added,
removed,
priceChanges,
stockChanges,
},
});
} catch (error: any) {
console.error('[Payloads] Diff error:', error.message);
res.status(500).json({ success: false, error: error.message });
}
});
export default router;
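Reviewer note: before its removal, the diff endpoint was the most involved of these routes; an illustrative client call (the `/api/payloads` mount path is assumed):

```typescript
// Illustrative only - router removed in this diff; /api/payloads mount path assumed.
async function diffPayloads(dispensaryId: number, fromId?: number, toId?: number) {
  const qs = new URLSearchParams();
  if (fromId !== undefined) qs.set('from', String(fromId));
  if (toId !== undefined) qs.set('to', String(toId));
  const res = await fetch(`/api/payloads/store/${dispensaryId}/diff?${qs}`);
  if (!res.ok) throw new Error(`Diff failed: ${res.status}`);
  const { diff, details } = await res.json();
  return { diff, details }; // diff = counts; details = per-product added/removed/price/stock changes
}
```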

View File

@@ -278,7 +278,7 @@ router.post('/update-locations', requireRole('superadmin', 'admin'), async (req,
    // Run in background
    updateAllProxyLocations().catch(err => {
      console.error('Location update failed:', err);
    });

    res.json({ message: 'Location update job started' });

View File

@@ -3,24 +3,6 @@
 *
 * Endpoints for managing worker tasks, viewing capacity metrics,
 * and generating batch tasks.
*
* SCHEDULE MANAGEMENT (added 2025-12-12):
* This file now contains the canonical schedule management endpoints.
* The job_schedules table has been deprecated and all schedule management
* is now consolidated into task_schedules:
*
* Schedule endpoints:
* GET /api/tasks/schedules - List all schedules
* POST /api/tasks/schedules - Create new schedule
* GET /api/tasks/schedules/:id - Get schedule by ID
* PUT /api/tasks/schedules/:id - Update schedule
* DELETE /api/tasks/schedules/:id - Delete schedule
* DELETE /api/tasks/schedules - Bulk delete schedules
* POST /api/tasks/schedules/:id/run-now - Trigger schedule immediately
* POST /api/tasks/schedules/:id/toggle - Toggle schedule enabled/disabled
*
* Note: Schedule routes are defined BEFORE /:id to avoid route conflicts
* (Express matches routes in order, and "schedules" would match /:id otherwise)
*/ */
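Reviewer note: the route-ordering caveat in the removed comment is worth keeping in mind even after the consolidation is undone; the hazard in miniature (sketch only, handlers are stand-ins):

```typescript
// Sketch of the ordering hazard the removed comment describes; handlers are stand-ins.
import { Router, Request, Response } from 'express';
const demo = Router();

demo.get('/schedules', (_req: Request, res: Response) => res.json({ schedules: [] })); // literal segment first
demo.get('/:id', (req: Request, res: Response) => res.json({ id: req.params.id }));
// Registered the other way around, GET /schedules would match /:id
// with req.params.id === 'schedules'.
```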
import { Router, Request, Response } from 'express';
@@ -31,12 +13,6 @@ import {
  TaskFilter,
} from '../tasks/task-service';
import { pool } from '../db/pool';
import {
isTaskPoolPaused,
pauseTaskPool,
resumeTaskPool,
getTaskPoolStatus,
} from '../tasks/task-pool-state';
const router = Router();
@@ -149,464 +125,6 @@ router.get('/capacity/:role', async (req: Request, res: Response) => {
  }
});
// ============================================================
// SCHEDULE MANAGEMENT ROUTES
// (Must be before /:id to avoid route conflicts)
// ============================================================
/**
* GET /api/tasks/schedules
* List all task schedules
*
* Returns schedules with is_immutable flag - immutable schedules can only
* have their interval_hours, priority, and enabled fields updated (not deleted).
*/
router.get('/schedules', async (req: Request, res: Response) => {
try {
const enabledOnly = req.query.enabled === 'true';
let query = `
SELECT id, name, role, description, enabled, interval_hours,
priority, state_code, platform, method,
COALESCE(is_immutable, false) as is_immutable,
last_run_at, next_run_at,
last_task_count, last_error, created_at, updated_at
FROM task_schedules
`;
if (enabledOnly) {
query += ` WHERE enabled = true`;
}
query += ` ORDER BY
CASE role
WHEN 'store_discovery' THEN 1
WHEN 'product_discovery' THEN 2
WHEN 'analytics_refresh' THEN 3
ELSE 4
END,
state_code NULLS FIRST,
name`;
const result = await pool.query(query);
res.json({ schedules: result.rows });
} catch (error: unknown) {
console.error('Error listing schedules:', error);
res.status(500).json({ error: 'Failed to list schedules' });
}
});
/**
* DELETE /api/tasks/schedules
* Bulk delete schedules
*
* Immutable schedules are automatically skipped (not deleted).
*
* Body:
* - ids: number[] (required) - array of schedule IDs to delete
* - all: boolean (optional) - if true, delete all non-immutable schedules (ids ignored)
*/
router.delete('/schedules', async (req: Request, res: Response) => {
try {
const { ids, all } = req.body;
let result;
let skippedImmutable: { id: number; name: string }[] = [];
if (all === true) {
// First, find immutable schedules that will be skipped
const immutableResult = await pool.query(`
SELECT id, name FROM task_schedules WHERE is_immutable = true
`);
skippedImmutable = immutableResult.rows;
// Delete all non-immutable schedules
result = await pool.query(`
DELETE FROM task_schedules
WHERE COALESCE(is_immutable, false) = false
RETURNING id, name
`);
} else if (Array.isArray(ids) && ids.length > 0) {
// First, find which of the requested IDs are immutable
const immutableResult = await pool.query(`
SELECT id, name FROM task_schedules
WHERE id = ANY($1) AND is_immutable = true
`, [ids]);
skippedImmutable = immutableResult.rows;
// Delete only non-immutable schedules from the requested IDs
result = await pool.query(`
DELETE FROM task_schedules
WHERE id = ANY($1) AND COALESCE(is_immutable, false) = false
RETURNING id, name
`, [ids]);
} else {
return res.status(400).json({
error: 'Either provide ids array or set all=true',
});
}
res.json({
success: true,
deleted_count: result.rowCount,
deleted: result.rows,
skipped_immutable_count: skippedImmutable.length,
skipped_immutable: skippedImmutable,
message: skippedImmutable.length > 0
? `Deleted ${result.rowCount} schedule(s), skipped ${skippedImmutable.length} immutable schedule(s)`
: `Deleted ${result.rowCount} schedule(s)`,
});
} catch (error: unknown) {
console.error('Error bulk deleting schedules:', error);
res.status(500).json({ error: 'Failed to delete schedules' });
}
});
/**
* POST /api/tasks/schedules
* Create a new schedule
*
* Body:
* - name: string (required, unique)
* - role: TaskRole (required)
* - description: string (optional)
* - enabled: boolean (default true)
* - interval_hours: number (required)
* - priority: number (default 0)
* - state_code: string (optional)
* - platform: string (optional)
*/
router.post('/schedules', async (req: Request, res: Response) => {
try {
const {
name,
role,
description,
enabled = true,
interval_hours,
priority = 0,
state_code,
platform,
} = req.body;
if (!name || !role || !interval_hours) {
return res.status(400).json({
error: 'name, role, and interval_hours are required',
});
}
// Calculate next_run_at based on interval
const nextRunAt = new Date(Date.now() + interval_hours * 60 * 60 * 1000);
const result = await pool.query(`
INSERT INTO task_schedules
(name, role, description, enabled, interval_hours, priority, state_code, platform, next_run_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
RETURNING id, name, role, description, enabled, interval_hours,
priority, state_code, platform, last_run_at, next_run_at,
last_task_count, last_error, created_at, updated_at
`, [name, role, description, enabled, interval_hours, priority, state_code, platform, nextRunAt]);
res.status(201).json(result.rows[0]);
} catch (error: any) {
if (error.code === '23505') {
// Unique constraint violation
return res.status(409).json({ error: 'A schedule with this name already exists' });
}
console.error('Error creating schedule:', error);
res.status(500).json({ error: 'Failed to create schedule' });
}
});
/**
* GET /api/tasks/schedules/:id
* Get a specific schedule by ID
*/
router.get('/schedules/:id', async (req: Request, res: Response) => {
try {
const scheduleId = parseInt(req.params.id, 10);
const result = await pool.query(`
SELECT id, name, role, description, enabled, interval_hours,
priority, state_code, platform, last_run_at, next_run_at,
last_task_count, last_error, created_at, updated_at
FROM task_schedules
WHERE id = $1
`, [scheduleId]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Schedule not found' });
}
res.json(result.rows[0]);
} catch (error: unknown) {
console.error('Error getting schedule:', error);
res.status(500).json({ error: 'Failed to get schedule' });
}
});
/**
* PUT /api/tasks/schedules/:id
* Update an existing schedule
*
* For IMMUTABLE schedules, only these fields can be updated:
* - enabled (turn on/off)
* - interval_hours (change frequency)
* - priority (change priority)
*
* For regular schedules, all fields can be updated.
*/
router.put('/schedules/:id', async (req: Request, res: Response) => {
try {
const scheduleId = parseInt(req.params.id, 10);
const {
name,
role,
description,
enabled,
interval_hours,
priority,
state_code,
platform,
} = req.body;
// First check if schedule exists and if it's immutable
const checkResult = await pool.query(`
SELECT id, name, COALESCE(is_immutable, false) as is_immutable
FROM task_schedules WHERE id = $1
`, [scheduleId]);
if (checkResult.rows.length === 0) {
return res.status(404).json({ error: 'Schedule not found' });
}
const schedule = checkResult.rows[0];
const isImmutable = schedule.is_immutable;
// For immutable schedules, reject attempts to change protected fields
if (isImmutable) {
const protectedFields: string[] = [];
if (name !== undefined) protectedFields.push('name');
if (role !== undefined) protectedFields.push('role');
if (description !== undefined) protectedFields.push('description');
if (state_code !== undefined) protectedFields.push('state_code');
if (platform !== undefined) protectedFields.push('platform');
if (protectedFields.length > 0) {
return res.status(403).json({
error: 'Cannot modify protected fields on immutable schedule',
message: `Schedule "${schedule.name}" is immutable. Only enabled, interval_hours, and priority can be changed.`,
protected_fields: protectedFields,
allowed_fields: ['enabled', 'interval_hours', 'priority'],
});
}
}
// Build dynamic update query
const updates: string[] = [];
const values: any[] = [];
let paramIndex = 1;
// These fields can only be updated on non-immutable schedules
if (!isImmutable) {
if (name !== undefined) {
updates.push(`name = $${paramIndex++}`);
values.push(name);
}
if (role !== undefined) {
updates.push(`role = $${paramIndex++}`);
values.push(role);
}
if (description !== undefined) {
updates.push(`description = $${paramIndex++}`);
values.push(description);
}
if (state_code !== undefined) {
updates.push(`state_code = $${paramIndex++}`);
values.push(state_code || null);
}
if (platform !== undefined) {
updates.push(`platform = $${paramIndex++}`);
values.push(platform || null);
}
}
// These fields can be updated on ALL schedules (including immutable)
if (enabled !== undefined) {
updates.push(`enabled = $${paramIndex++}`);
values.push(enabled);
}
if (interval_hours !== undefined) {
updates.push(`interval_hours = $${paramIndex++}`);
values.push(interval_hours);
// Recalculate next_run_at if interval changed
const nextRunAt = new Date(Date.now() + interval_hours * 60 * 60 * 1000);
updates.push(`next_run_at = $${paramIndex++}`);
values.push(nextRunAt);
}
if (priority !== undefined) {
updates.push(`priority = $${paramIndex++}`);
values.push(priority);
}
if (updates.length === 0) {
return res.status(400).json({ error: 'No fields to update' });
}
updates.push('updated_at = NOW()');
values.push(scheduleId);
const result = await pool.query(`
UPDATE task_schedules
SET ${updates.join(', ')}
WHERE id = $${paramIndex}
RETURNING id, name, role, description, enabled, interval_hours,
priority, state_code, platform, method,
COALESCE(is_immutable, false) as is_immutable,
last_run_at, next_run_at,
last_task_count, last_error, created_at, updated_at
`, values);
res.json(result.rows[0]);
} catch (error: any) {
if (error.code === '23505') {
return res.status(409).json({ error: 'A schedule with this name already exists' });
}
console.error('Error updating schedule:', error);
res.status(500).json({ error: 'Failed to update schedule' });
}
});
/**
* DELETE /api/tasks/schedules/:id
* Delete a schedule
*
* Immutable schedules cannot be deleted - they can only be disabled.
*/
router.delete('/schedules/:id', async (req: Request, res: Response) => {
try {
const scheduleId = parseInt(req.params.id, 10);
// First check if schedule exists and is immutable
const checkResult = await pool.query(`
SELECT id, name, COALESCE(is_immutable, false) as is_immutable
FROM task_schedules WHERE id = $1
`, [scheduleId]);
if (checkResult.rows.length === 0) {
return res.status(404).json({ error: 'Schedule not found' });
}
const schedule = checkResult.rows[0];
// Prevent deletion of immutable schedules
if (schedule.is_immutable) {
return res.status(403).json({
error: 'Cannot delete immutable schedule',
message: `Schedule "${schedule.name}" is immutable and cannot be deleted. You can disable it instead.`,
schedule_id: scheduleId,
is_immutable: true,
});
}
// Delete the schedule
await pool.query(`DELETE FROM task_schedules WHERE id = $1`, [scheduleId]);
res.json({
success: true,
message: `Schedule "${schedule.name}" deleted`,
});
} catch (error: unknown) {
console.error('Error deleting schedule:', error);
res.status(500).json({ error: 'Failed to delete schedule' });
}
});
/**
* POST /api/tasks/schedules/:id/run-now
* Manually trigger a scheduled task to run immediately
*/
router.post('/schedules/:id/run-now', async (req: Request, res: Response) => {
try {
const scheduleId = parseInt(req.params.id, 10);
// Get the schedule
const scheduleResult = await pool.query(`
SELECT id, name, role, state_code, platform, priority
FROM task_schedules WHERE id = $1
`, [scheduleId]);
if (scheduleResult.rows.length === 0) {
return res.status(404).json({ error: 'Schedule not found' });
}
const schedule = scheduleResult.rows[0];
// Create a task based on the schedule
const task = await taskService.createTask({
role: schedule.role,
platform: schedule.platform,
priority: schedule.priority + 10, // Boost priority for manual runs
});
// Update last_run_at on the schedule
await pool.query(`
UPDATE task_schedules
SET last_run_at = NOW(),
next_run_at = NOW() + (interval_hours || ' hours')::interval,
updated_at = NOW()
WHERE id = $1
`, [scheduleId]);
res.json({
success: true,
message: `Schedule "${schedule.name}" triggered`,
task,
});
} catch (error: unknown) {
console.error('Error running schedule:', error);
res.status(500).json({ error: 'Failed to run schedule' });
}
});
/**
* POST /api/tasks/schedules/:id/toggle
* Toggle a schedule's enabled status
*/
router.post('/schedules/:id/toggle', async (req: Request, res: Response) => {
try {
const scheduleId = parseInt(req.params.id, 10);
const result = await pool.query(`
UPDATE task_schedules
SET enabled = NOT enabled,
updated_at = NOW()
WHERE id = $1
RETURNING id, name, enabled
`, [scheduleId]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Schedule not found' });
}
res.json({
success: true,
schedule: result.rows[0],
message: result.rows[0].enabled
? `Schedule "${result.rows[0].name}" enabled`
: `Schedule "${result.rows[0].name}" disabled`,
});
} catch (error: unknown) {
console.error('Error toggling schedule:', error);
res.status(500).json({ error: 'Failed to toggle schedule' });
}
});
// ============================================================
// TASK-SPECIFIC ROUTES (with :id parameter)
// ============================================================
/**
 * GET /api/tasks/:id
 * Get a specific task by ID
@@ -1074,159 +592,4 @@ router.post('/migration/full-migrate', async (req: Request, res: Response) => {
  }
});
// ============================================================
// STAGGERED BATCH TASK CREATION
// ============================================================
/**
* POST /api/tasks/batch/staggered
* Create multiple tasks with staggered start times
*
* This endpoint prevents resource contention when creating many tasks by
* staggering their scheduled_for timestamps. Each task becomes eligible
* for claiming only after its scheduled time.
*
* WORKFLOW:
* 1. Tasks created with scheduled_for = NOW() + (index * stagger_seconds)
* 2. Worker claims task only when scheduled_for <= NOW()
* 3. Worker runs preflight on EVERY task claim
* 4. If preflight passes, worker executes task
* 5. If preflight fails, task released back to pending for another worker
*
* Body:
* - dispensary_ids: number[] (required) - Array of dispensary IDs
* - role: TaskRole (required) - 'product_refresh' | 'product_discovery'
* - stagger_seconds: number (default: 15) - Seconds between each task start
* - platform: string (default: 'dutchie')
* - method: 'curl' | 'http' | null (default: null)
*/
router.post('/batch/staggered', async (req: Request, res: Response) => {
try {
const {
dispensary_ids,
role,
stagger_seconds = 15,
platform = 'dutchie',
method = null,
} = req.body;
if (!dispensary_ids || !Array.isArray(dispensary_ids) || dispensary_ids.length === 0) {
return res.status(400).json({ error: 'dispensary_ids array is required' });
}
if (!role) {
return res.status(400).json({ error: 'role is required' });
}
const result = await taskService.createStaggeredTasks(
dispensary_ids,
role as TaskRole,
stagger_seconds,
platform,
method
);
const totalDuration = (dispensary_ids.length - 1) * stagger_seconds;
const estimatedEndTime = new Date(Date.now() + totalDuration * 1000);
res.status(201).json({
success: true,
created: result.created,
task_ids: result.taskIds,
stagger_seconds,
total_duration_seconds: totalDuration,
estimated_completion: estimatedEndTime.toISOString(),
message: `Created ${result.created} staggered ${role} tasks (${stagger_seconds}s apart, ~${Math.ceil(totalDuration / 60)} min total)`,
});
} catch (error: unknown) {
console.error('Error creating staggered tasks:', error);
res.status(500).json({ error: 'Failed to create staggered tasks' });
}
});
/**
* POST /api/tasks/batch/az-stores
* Convenience endpoint to create staggered tasks for Arizona stores
*
* Body:
* - total_tasks: number (default: 24) - Total tasks to create
* - stagger_seconds: number (default: 15) - Seconds between each task
* - split_roles: boolean (default: true) - Split between product_refresh and product_discovery
*/
router.post('/batch/az-stores', async (req: Request, res: Response) => {
try {
const {
total_tasks = 24,
stagger_seconds = 15,
split_roles = true,
} = req.body;
const result = await taskService.createAZStoreTasks(
total_tasks,
stagger_seconds,
split_roles
);
const totalDuration = (result.total - 1) * stagger_seconds;
const estimatedEndTime = new Date(Date.now() + totalDuration * 1000);
res.status(201).json({
success: true,
total: result.total,
product_refresh: result.product_refresh,
product_discovery: result.product_discovery,
task_ids: result.taskIds,
stagger_seconds,
total_duration_seconds: totalDuration,
estimated_completion: estimatedEndTime.toISOString(),
message: `Created ${result.total} staggered tasks for AZ stores (${result.product_refresh} refresh, ${result.product_discovery} discovery)`,
});
} catch (error: unknown) {
console.error('Error creating AZ store tasks:', error);
res.status(500).json({ error: 'Failed to create AZ store tasks' });
}
});
// ============================================================
// TASK POOL MANAGEMENT
// ============================================================
/**
* GET /api/tasks/pool/status
* Check if task pool is paused
*/
router.get('/pool/status', async (_req: Request, res: Response) => {
const status = getTaskPoolStatus();
res.json({
success: true,
...status,
});
});
/**
* POST /api/tasks/pool/pause
* Pause the task pool - workers won't pick up new tasks
*/
router.post('/pool/pause', async (_req: Request, res: Response) => {
pauseTaskPool();
res.json({
success: true,
paused: true,
message: 'Task pool paused - workers will not pick up new tasks',
});
});
/**
* POST /api/tasks/pool/resume
* Resume the task pool - workers will pick up tasks again
*/
router.post('/pool/resume', async (_req: Request, res: Response) => {
resumeTaskPool();
res.json({
success: true,
paused: false,
message: 'Task pool resumed - workers will pick up new tasks',
});
});
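/*
 * Illustrative deploy-window flow (sketch; `base` is an assumed base URL):
 * pause the pool, roll out new worker images, then resume so idle workers
 * start claiming tasks again.
 *
 *   await fetch(`${base}/api/tasks/pool/pause`, { method: 'POST' });
 *   // ... roll out new images ...
 *   await fetch(`${base}/api/tasks/pool/resume`, { method: 'POST' });
 */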
export default router;

View File

@@ -23,8 +23,6 @@
import { Router, Request, Response } from 'express'; import { Router, Request, Response } from 'express';
import { pool } from '../db/pool'; import { pool } from '../db/pool';
import os from 'os'; import os from 'os';
import { runPuppeteerPreflightWithRetry } from '../services/puppeteer-preflight';
import { CrawlRotator } from '../services/crawl-rotator';
const router = Router(); const router = Router();
@@ -72,20 +70,21 @@ router.post('/register', async (req: Request, res: Response) => {
); );
if (existing.rows.length > 0) { if (existing.rows.length > 0) {
// Re-activate existing worker - keep existing pod_name (fantasy name), don't overwrite with K8s name // Re-activate existing worker
const { rows } = await pool.query(` const { rows } = await pool.query(`
UPDATE worker_registry UPDATE worker_registry
SET status = 'active', SET status = 'active',
role = $1, role = $1,
hostname = $2, pod_name = $2,
ip_address = $3, hostname = $3,
ip_address = $4,
last_heartbeat_at = NOW(), last_heartbeat_at = NOW(),
started_at = NOW(), started_at = NOW(),
metadata = $4, metadata = $5,
updated_at = NOW() updated_at = NOW()
WHERE worker_id = $5 WHERE worker_id = $6
RETURNING id, worker_id, friendly_name, pod_name, role RETURNING id, worker_id, friendly_name, role
`, [role, finalHostname, clientIp, metadata, finalWorkerId]); `, [role, pod_name, finalHostname, clientIp, metadata, finalWorkerId]);
const worker = rows[0]; const worker = rows[0];
const roleMsg = role ? `for ${role}` : 'as role-agnostic'; const roleMsg = role ? `for ${role}` : 'as role-agnostic';
@@ -106,13 +105,13 @@ router.post('/register', async (req: Request, res: Response) => {
const nameResult = await pool.query('SELECT assign_worker_name($1) as name', [finalWorkerId]); const nameResult = await pool.query('SELECT assign_worker_name($1) as name', [finalWorkerId]);
const friendlyName = nameResult.rows[0].name; const friendlyName = nameResult.rows[0].name;
// Register the worker - use friendlyName as pod_name (not K8s name) // Register the worker
const { rows } = await pool.query(` const { rows } = await pool.query(`
INSERT INTO worker_registry ( INSERT INTO worker_registry (
worker_id, friendly_name, role, pod_name, hostname, ip_address, status, metadata worker_id, friendly_name, role, pod_name, hostname, ip_address, status, metadata
) VALUES ($1, $2, $3, $4, $5, $6, 'active', $7) ) VALUES ($1, $2, $3, $4, $5, $6, 'active', $7)
RETURNING id, worker_id, friendly_name, pod_name, role RETURNING id, worker_id, friendly_name, role
`, [finalWorkerId, friendlyName, role, friendlyName, finalHostname, clientIp, metadata]); `, [finalWorkerId, friendlyName, role, pod_name, finalHostname, clientIp, metadata]);
const worker = rows[0]; const worker = rows[0];
const roleMsg = role ? `for ${role}` : 'as role-agnostic'; const roleMsg = role ? `for ${role}` : 'as role-agnostic';
@@ -139,36 +138,17 @@ router.post('/register', async (req: Request, res: Response) => {
* *
* Body: * Body:
* - worker_id: string (required) * - worker_id: string (required)
* - current_task_id: number (optional) - task currently being processed (primary task) * - current_task_id: number (optional) - task currently being processed
* - current_task_ids: number[] (optional) - all tasks currently being processed (concurrent)
* - active_task_count: number (optional) - number of tasks currently running
* - max_concurrent_tasks: number (optional) - max concurrent tasks this worker can handle
* - status: string (optional) - 'active', 'idle' * - status: string (optional) - 'active', 'idle'
* - resources: object (optional) - memory_mb, cpu_user_ms, cpu_system_ms, etc.
*/ */
router.post('/heartbeat', async (req: Request, res: Response) => { router.post('/heartbeat', async (req: Request, res: Response) => {
try { try {
const { const { worker_id, current_task_id, status = 'active', resources } = req.body;
worker_id,
current_task_id,
current_task_ids,
active_task_count,
max_concurrent_tasks,
status = 'active',
resources
} = req.body;
if (!worker_id) { if (!worker_id) {
return res.status(400).json({ success: false, error: 'worker_id is required' }); return res.status(400).json({ success: false, error: 'worker_id is required' });
} }
// Build metadata object with all the new fields
const metadata: Record<string, unknown> = {};
if (resources) Object.assign(metadata, resources);
if (current_task_ids) metadata.current_task_ids = current_task_ids;
if (active_task_count !== undefined) metadata.active_task_count = active_task_count;
if (max_concurrent_tasks !== undefined) metadata.max_concurrent_tasks = max_concurrent_tasks;
// Store resources in metadata jsonb column // Store resources in metadata jsonb column
const { rows } = await pool.query(` const { rows } = await pool.query(`
UPDATE worker_registry UPDATE worker_registry
@@ -179,7 +159,7 @@ router.post('/heartbeat', async (req: Request, res: Response) => {
updated_at = NOW() updated_at = NOW()
WHERE worker_id = $3 WHERE worker_id = $3
RETURNING id, friendly_name, status RETURNING id, friendly_name, status
`, [current_task_id || null, status, worker_id, Object.keys(metadata).length > 0 ? JSON.stringify(metadata) : null]); `, [current_task_id || null, status, worker_id, resources ? JSON.stringify(resources) : null]);
if (rows.length === 0) { if (rows.length === 0) {
return res.status(404).json({ success: false, error: 'Worker not found - please register first' }); return res.status(404).json({ success: false, error: 'Worker not found - please register first' });
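/*
 * Illustrative worker-side heartbeat (sketch): field names follow the doc
 * comment above; the interval, base URL, and `activeTasks` are assumptions.
 *
 *   setInterval(() => {
 *     fetch(`${base}/api/worker-registry/heartbeat`, {
 *       method: 'POST',
 *       headers: { 'Content-Type': 'application/json' },
 *       body: JSON.stringify({
 *         worker_id: workerId,
 *         current_task_id: activeTasks[0] ?? null,
 *         current_task_ids: activeTasks,           // concurrent-task fields
 *         active_task_count: activeTasks.length,   // stored in metadata jsonb
 *         status: activeTasks.length > 0 ? 'active' : 'idle',
 *       }),
 *     }).catch(() => {});  // heartbeat failures are non-fatal
 *   }, 30_000);
 */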
@@ -252,9 +232,12 @@ router.post('/deregister', async (req: Request, res: Response) => {
// Release the name back to the pool // Release the name back to the pool
await pool.query('SELECT release_worker_name($1)', [worker_id]); await pool.query('SELECT release_worker_name($1)', [worker_id]);
// Delete the worker entry (clean shutdown) // Mark as terminated
const { rows } = await pool.query(` const { rows } = await pool.query(`
DELETE FROM worker_registry UPDATE worker_registry
SET status = 'terminated',
current_task_id = NULL,
updated_at = NOW()
WHERE worker_id = $1 WHERE worker_id = $1
RETURNING id, friendly_name RETURNING id, friendly_name
`, [worker_id]); `, [worker_id]);
@@ -347,27 +330,12 @@ router.get('/workers', async (req: Request, res: Response) => {
tasks_completed, tasks_completed,
tasks_failed, tasks_failed,
current_task_id, current_task_id,
-- Concurrent task fields from metadata
(metadata->>'current_task_ids')::jsonb as current_task_ids,
(metadata->>'active_task_count')::int as active_task_count,
(metadata->>'max_concurrent_tasks')::int as max_concurrent_tasks,
-- Decommission fields
COALESCE(decommission_requested, false) as decommission_requested,
decommission_reason,
-- Preflight fields (dual-transport verification)
curl_ip,
http_ip,
preflight_status,
preflight_at,
fingerprint_data,
-- Full metadata for resources
metadata, metadata,
EXTRACT(EPOCH FROM (NOW() - last_heartbeat_at)) as seconds_since_heartbeat, EXTRACT(EPOCH FROM (NOW() - last_heartbeat_at)) as seconds_since_heartbeat,
CASE CASE
WHEN status = 'offline' OR status = 'terminated' THEN status WHEN status = 'offline' OR status = 'terminated' THEN status
WHEN last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale' WHEN last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
WHEN current_task_id IS NOT NULL THEN 'busy' WHEN current_task_id IS NOT NULL THEN 'busy'
WHEN (metadata->>'active_task_count')::int > 0 THEN 'busy'
ELSE 'ready' ELSE 'ready'
END as health_status, END as health_status,
created_at created_at
@@ -704,217 +672,4 @@ router.get('/capacity', async (_req: Request, res: Response) => {
} }
}); });
// ============================================================
// WORKER LIFECYCLE MANAGEMENT
// ============================================================
/**
* POST /api/worker-registry/workers/:workerId/decommission
* Request graceful decommission of a worker (will stop after current task)
*/
router.post('/workers/:workerId/decommission', async (req: Request, res: Response) => {
try {
const { workerId } = req.params;
const { reason, issued_by } = req.body;
// Update worker_registry to flag for decommission
const result = await pool.query(
`UPDATE worker_registry
SET decommission_requested = true,
decommission_reason = $2,
decommission_requested_at = NOW()
WHERE worker_id = $1
RETURNING friendly_name, status, current_task_id`,
[workerId, reason || 'Manual decommission from admin']
);
if (result.rows.length === 0) {
return res.status(404).json({ success: false, error: 'Worker not found' });
}
const worker = result.rows[0];
// Also log to worker_commands for audit trail
await pool.query(
`INSERT INTO worker_commands (worker_id, command, reason, issued_by)
VALUES ($1, 'decommission', $2, $3)
ON CONFLICT DO NOTHING`,
[workerId, reason || 'Manual decommission', issued_by || 'admin']
).catch(() => {
// Table might not exist yet - ignore
});
res.json({
success: true,
message: worker.current_task_id
? `Worker ${worker.friendly_name} will stop after completing task #${worker.current_task_id}`
: `Worker ${worker.friendly_name} will stop on next poll`,
worker: {
friendly_name: worker.friendly_name,
status: worker.status,
current_task_id: worker.current_task_id,
decommission_requested: true
}
});
} catch (error: any) {
res.status(500).json({ success: false, error: error.message });
}
});
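/*
 * Illustrative worker-side check (sketch): a worker polls its own registry
 * row between tasks and honors the flag set above. Column names follow the
 * UPDATE above; the helpers and cadence are assumptions.
 *
 *   const { rows } = await pool.query(
 *     'SELECT decommission_requested FROM worker_registry WHERE worker_id = $1',
 *     [workerId]
 *   );
 *   if (rows[0]?.decommission_requested) {
 *     await finishCurrentTask();   // hypothetical helper
 *     process.exit(0);             // clean shutdown after current task
 *   }
 */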
/**
* POST /api/worker-registry/workers/:workerId/cancel-decommission
* Cancel a pending decommission request
*/
router.post('/workers/:workerId/cancel-decommission', async (req: Request, res: Response) => {
try {
const { workerId } = req.params;
const result = await pool.query(
`UPDATE worker_registry
SET decommission_requested = false,
decommission_reason = NULL,
decommission_requested_at = NULL
WHERE worker_id = $1
RETURNING friendly_name`,
[workerId]
);
if (result.rows.length === 0) {
return res.status(404).json({ success: false, error: 'Worker not found' });
}
res.json({
success: true,
message: `Decommission cancelled for ${result.rows[0].friendly_name}`
});
} catch (error: any) {
res.status(500).json({ success: false, error: error.message });
}
});
/**
* POST /api/worker-registry/spawn
* Spawn a new worker in the current pod (only works in multi-worker-per-pod mode)
* For now, this is a placeholder - actual spawning requires the pod supervisor
*/
router.post('/spawn', async (req: Request, res: Response) => {
try {
const { pod_name, role } = req.body;
// For now, we can't actually spawn workers from the API
// This would require a supervisor process in each pod that listens for spawn commands
// Instead, return instructions for how to scale
res.json({
success: false,
error: 'Direct worker spawning not yet implemented',
instructions: 'To add workers, scale the K8s deployment: kubectl scale deployment/scraper-worker --replicas=N'
});
} catch (error: any) {
res.status(500).json({ success: false, error: error.message });
}
});
/**
* GET /api/worker-registry/pods
* Get workers grouped by pod
*/
router.get('/pods', async (_req: Request, res: Response) => {
try {
const { rows } = await pool.query(`
SELECT
COALESCE(pod_name, 'Unknown') as pod_name,
COUNT(*) as worker_count,
COUNT(*) FILTER (WHERE current_task_id IS NOT NULL) as busy_count,
COUNT(*) FILTER (WHERE current_task_id IS NULL) as idle_count,
SUM(tasks_completed) as total_completed,
SUM(tasks_failed) as total_failed,
SUM((metadata->>'memory_rss_mb')::int) as total_memory_mb,
array_agg(json_build_object(
'worker_id', worker_id,
'friendly_name', friendly_name,
'status', status,
'current_task_id', current_task_id,
'tasks_completed', tasks_completed,
'tasks_failed', tasks_failed,
'decommission_requested', COALESCE(decommission_requested, false),
'last_heartbeat_at', last_heartbeat_at
)) as workers
FROM worker_registry
WHERE status NOT IN ('offline', 'terminated')
GROUP BY pod_name
ORDER BY pod_name
`);
res.json({
success: true,
pods: rows.map(row => ({
pod_name: row.pod_name,
worker_count: parseInt(row.worker_count),
busy_count: parseInt(row.busy_count),
idle_count: parseInt(row.idle_count),
total_completed: parseInt(row.total_completed) || 0,
total_failed: parseInt(row.total_failed) || 0,
total_memory_mb: parseInt(row.total_memory_mb) || 0,
workers: row.workers
}))
});
} catch (error: any) {
res.status(500).json({ success: false, error: error.message });
}
});
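/*
 * Illustrative /pods response (sketch, values invented):
 *   { success: true, pods: [ { pod_name: 'scraper-worker-abc12',
 *       worker_count: 3, busy_count: 2, idle_count: 1,
 *       total_completed: 412, total_failed: 7, total_memory_mb: 1536,
 *       workers: [ ... ] } ] }
 */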
// ============================================================
// PREFLIGHT SMOKE TEST
// ============================================================
/**
* POST /api/worker-registry/preflight-test
* Run an HTTP (Puppeteer) preflight test and return results
*
* This is a smoke test endpoint to verify the preflight system works.
* Returns IP, fingerprint data, bot detection results, and products fetched.
*/
router.post('/preflight-test', async (_req: Request, res: Response) => {
try {
console.log('[PreflightTest] Starting HTTP preflight smoke test...');
// Create a temporary CrawlRotator for the test
const crawlRotator = new CrawlRotator();
// Run the Puppeteer preflight (with 1 retry)
const startTime = Date.now();
const result = await runPuppeteerPreflightWithRetry(crawlRotator, 1);
const duration = Date.now() - startTime;
console.log(`[PreflightTest] Completed in ${duration}ms - passed: ${result.passed}`);
res.json({
success: true,
test: 'http_preflight',
duration_ms: duration,
result: {
passed: result.passed,
proxy_ip: result.proxyIp,
fingerprint: result.fingerprint,
bot_detection: result.botDetection,
products_returned: result.productsReturned,
browser_user_agent: result.browserUserAgent,
ip_verified: result.ipVerified,
proxy_available: result.proxyAvailable,
proxy_connected: result.proxyConnected,
antidetect_ready: result.antidetectReady,
response_time_ms: result.responseTimeMs,
error: result.error
}
});
} catch (error: any) {
console.error('[PreflightTest] Error:', error.message);
res.status(500).json({
success: false,
test: 'http_preflight',
error: error.message
});
}
});
export default router;

View File

@@ -4,25 +4,10 @@
* Provider-agnostic worker management and job monitoring. * Provider-agnostic worker management and job monitoring.
* Replaces legacy /api/dutchie-az/admin/schedules and /api/dutchie-az/monitor/* routes. * Replaces legacy /api/dutchie-az/admin/schedules and /api/dutchie-az/monitor/* routes.
* *
* DEPRECATION NOTE (2025-12-12):
* This file still queries job_schedules for backwards compatibility with
* the /api/workers endpoints that display worker status. However, the
* job_schedules table is DEPRECATED - all entries have been disabled.
*
* Schedule management has been consolidated into task_schedules:
* - Use /api/tasks/schedules for schedule CRUD operations
* - Use TasksDashboard.tsx (/admin/tasks) for schedule management UI
* - task_schedules uses interval_hours (simpler than base_interval_minutes + jitter)
*
* The /api/workers endpoints remain useful for:
* - Monitoring active workers and job status
* - K8s scaling controls
* - Job history and logs
*
* Endpoints: * Endpoints:
* GET /api/workers - List all workers/schedules * GET /api/workers - List all workers/schedules
* GET /api/workers/active - List currently active workers * GET /api/workers/active - List currently active workers
* GET /api/workers/schedule - Get all job schedules (DEPRECATED - use /api/tasks/schedules) * GET /api/workers/schedule - Get all job schedules
* GET /api/workers/:workerName - Get specific worker details * GET /api/workers/:workerName - Get specific worker details
* GET /api/workers/:workerName/scope - Get worker's scope (states, etc.) * GET /api/workers/:workerName/scope - Get worker's scope (states, etc.)
* GET /api/workers/:workerName/stats - Get worker statistics * GET /api/workers/:workerName/stats - Get worker statistics
@@ -32,234 +17,13 @@
* GET /api/monitor/jobs - Get recent job history * GET /api/monitor/jobs - Get recent job history
* GET /api/monitor/active-jobs - Get currently running jobs * GET /api/monitor/active-jobs - Get currently running jobs
* GET /api/monitor/summary - Get monitoring summary * GET /api/monitor/summary - Get monitoring summary
*
* K8s Scaling (added 2024-12-10):
* GET /api/workers/k8s/replicas - Get current replica count
* POST /api/workers/k8s/scale - Scale worker replicas up/down
*/ */
import { Router, Request, Response } from 'express'; import { Router, Request, Response } from 'express';
import { pool } from '../db/pool'; import { pool } from '../db/pool';
import * as k8s from '@kubernetes/client-node';
const router = Router(); const router = Router();
// ============================================================
// K8S SCALING CONFIGURATION (added 2024-12-10)
// Per TASK_WORKFLOW_2024-12-10.md: Admin can scale workers from UI
// ============================================================
const K8S_NAMESPACE = process.env.K8S_NAMESPACE || 'dispensary-scraper';
const K8S_DEPLOYMENT_NAME = process.env.K8S_WORKER_DEPLOYMENT || 'scraper-worker';
// Initialize K8s client - uses in-cluster config when running in K8s,
// or kubeconfig when running locally
let k8sAppsApi: k8s.AppsV1Api | null = null;
function getK8sClient(): k8s.AppsV1Api | null {
if (k8sAppsApi) return k8sAppsApi;
try {
const kc = new k8s.KubeConfig();
// Try in-cluster config first (when running as a pod)
// Falls back to default kubeconfig (~/.kube/config) for local dev
try {
kc.loadFromCluster();
} catch {
kc.loadFromDefault();
}
k8sAppsApi = kc.makeApiClient(k8s.AppsV1Api);
return k8sAppsApi;
} catch (err: any) {
console.warn('[Workers] K8s client not available:', err.message);
return null;
}
}
// ============================================================
// K8S SCALING ROUTES (added 2024-12-10)
// Per TASK_WORKFLOW_2024-12-10.md: Admin can scale workers from UI
// ============================================================
/**
* GET /api/workers/k8s/replicas - Get current worker replica count
* Returns current and desired replica counts from the Deployment
*/
router.get('/k8s/replicas', async (_req: Request, res: Response) => {
const client = getK8sClient();
if (!client) {
return res.status(503).json({
success: false,
error: 'K8s client not available (not running in cluster or no kubeconfig)',
replicas: null,
});
}
try {
const response = await client.readNamespacedDeployment({
name: K8S_DEPLOYMENT_NAME,
namespace: K8S_NAMESPACE,
});
const deployment = response;
res.json({
success: true,
replicas: {
current: deployment.status?.readyReplicas || 0,
desired: deployment.spec?.replicas || 0,
available: deployment.status?.availableReplicas || 0,
updated: deployment.status?.updatedReplicas || 0,
},
deployment: K8S_DEPLOYMENT_NAME,
namespace: K8S_NAMESPACE,
});
} catch (err: any) {
console.error('[Workers] K8s replicas error:', err.body?.message || err.message);
res.status(500).json({
success: false,
error: err.body?.message || err.message,
});
}
});
/**
* POST /api/workers/k8s/scale - Scale worker replicas
* Body: { replicas: number } - desired replica count (0-20)
*/
router.post('/k8s/scale', async (req: Request, res: Response) => {
const client = getK8sClient();
if (!client) {
return res.status(503).json({
success: false,
error: 'K8s client not available (not running in cluster or no kubeconfig)',
});
}
const { replicas } = req.body;
// Validate replica count
if (typeof replicas !== 'number' || replicas < 0 || replicas > 20) {
return res.status(400).json({
success: false,
error: 'replicas must be a number between 0 and 20',
});
}
try {
// Get current state first
const currentResponse = await client.readNamespacedDeploymentScale({
name: K8S_DEPLOYMENT_NAME,
namespace: K8S_NAMESPACE,
});
const currentReplicas = currentResponse.spec?.replicas || 0;
// Update scale using replaceNamespacedDeploymentScale
await client.replaceNamespacedDeploymentScale({
name: K8S_DEPLOYMENT_NAME,
namespace: K8S_NAMESPACE,
body: {
apiVersion: 'autoscaling/v1',
kind: 'Scale',
metadata: {
name: K8S_DEPLOYMENT_NAME,
namespace: K8S_NAMESPACE,
},
spec: {
replicas: replicas,
},
},
});
console.log(`[Workers] Scaled ${K8S_DEPLOYMENT_NAME} from ${currentReplicas} to ${replicas} replicas`);
res.json({
success: true,
message: `Scaled from ${currentReplicas} to ${replicas} replicas`,
previous: currentReplicas,
desired: replicas,
deployment: K8S_DEPLOYMENT_NAME,
namespace: K8S_NAMESPACE,
});
} catch (err: any) {
console.error('[Workers] K8s scale error:', err.body?.message || err.message);
res.status(500).json({
success: false,
error: err.body?.message || err.message,
});
}
});
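/*
 * Illustrative admin call (sketch; host is an assumption). Requests outside
 * the validated 0-20 range are rejected with a 400 before touching K8s.
 *
 *   await fetch('http://localhost:3000/api/workers/k8s/scale', {
 *     method: 'POST',
 *     headers: { 'Content-Type': 'application/json' },
 *     body: JSON.stringify({ replicas: 5 }),
 *   });
 */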
/**
* POST /api/workers/k8s/scale-up - Scale up worker replicas by 1
* Convenience endpoint for adding a single worker
*/
router.post('/k8s/scale-up', async (_req: Request, res: Response) => {
const client = getK8sClient();
if (!client) {
return res.status(503).json({
success: false,
error: 'K8s client not available (not running in cluster or no kubeconfig)',
});
}
try {
// Get current replica count
const currentResponse = await client.readNamespacedDeploymentScale({
name: K8S_DEPLOYMENT_NAME,
namespace: K8S_NAMESPACE,
});
const currentReplicas = currentResponse.spec?.replicas || 0;
const newReplicas = currentReplicas + 1;
// Cap at 20 replicas
if (newReplicas > 20) {
return res.status(400).json({
success: false,
error: 'Maximum replica count (20) reached',
});
}
// Scale up by 1
await client.replaceNamespacedDeploymentScale({
name: K8S_DEPLOYMENT_NAME,
namespace: K8S_NAMESPACE,
body: {
apiVersion: 'autoscaling/v1',
kind: 'Scale',
metadata: {
name: K8S_DEPLOYMENT_NAME,
namespace: K8S_NAMESPACE,
},
spec: {
replicas: newReplicas,
},
},
});
console.log(`[Workers] Scaled up ${K8S_DEPLOYMENT_NAME} from ${currentReplicas} to ${newReplicas} replicas`);
res.json({
success: true,
      message: `Added worker (${currentReplicas} → ${newReplicas} replicas)`,
previous: currentReplicas,
desired: newReplicas,
deployment: K8S_DEPLOYMENT_NAME,
namespace: K8S_NAMESPACE,
});
} catch (err: any) {
console.error('[Workers] K8s scale-up error:', err.body?.message || err.message);
res.status(500).json({
success: false,
error: err.body?.message || err.message,
});
}
});
// ============================================================ // ============================================================
// STATIC ROUTES (must come before parameterized routes) // STATIC ROUTES (must come before parameterized routes)
// ============================================================ // ============================================================

View File

@@ -16,11 +16,10 @@ import {
executeGraphQL, executeGraphQL,
startSession, startSession,
endSession, endSession,
setCrawlRotator, getFingerprint,
GRAPHQL_HASHES, GRAPHQL_HASHES,
DUTCHIE_CONFIG, DUTCHIE_CONFIG,
} from '../platforms/dutchie'; } from '../platforms/dutchie';
import { CrawlRotator } from '../services/crawl-rotator';
dotenv.config(); dotenv.config();
@@ -109,27 +108,19 @@ async function main() {
// ============================================================ // ============================================================
// STEP 2: Start stealth session // STEP 2: Start stealth session
// Per workflow-12102025.md: Initialize CrawlRotator and start session with menuUrl
// ============================================================ // ============================================================
console.log('┌─────────────────────────────────────────────────────────────┐'); console.log('┌─────────────────────────────────────────────────────────────┐');
console.log('│ STEP 2: Start Stealth Session │'); console.log('│ STEP 2: Start Stealth Session │');
console.log('└─────────────────────────────────────────────────────────────┘'); console.log('└─────────────────────────────────────────────────────────────┘');
// Per workflow-12102025.md: Initialize CrawlRotator (required for sessions) // Use Arizona timezone for this store
const rotator = new CrawlRotator(); const session = startSession(disp.state || 'AZ', 'America/Phoenix');
setCrawlRotator(rotator);
// Per workflow-12102025.md: startSession takes menuUrl for dynamic Referer const fp = getFingerprint();
const session = startSession(disp.menu_url);
const fp = session.fingerprint;
console.log(` Session ID: ${session.sessionId}`); console.log(` Session ID: ${session.sessionId}`);
console.log(` Browser: ${fp.browserName} (${fp.deviceCategory})`);
console.log(` User-Agent: ${fp.userAgent.slice(0, 60)}...`); console.log(` User-Agent: ${fp.userAgent.slice(0, 60)}...`);
console.log(` Accept-Language: ${fp.acceptLanguage}`); console.log(` Accept-Language: ${fp.acceptLanguage}`);
console.log(` Referer: ${session.referer}`); console.log(` Sec-CH-UA: ${fp.secChUa || '(not set)'}`);
console.log(` DNT: ${fp.httpFingerprint.hasDNT ? 'enabled' : 'disabled'}`);
console.log(` TLS: ${fp.httpFingerprint.curlImpersonateBinary}`);
console.log(''); console.log('');
// ============================================================ // ============================================================

View File

@@ -1,284 +0,0 @@
/**
* Bulk Proxy Import Script
*
* Imports proxies from various formats into the proxies table.
* Supports:
* - Standard format: http://user:pass@host:port
* - Colon format: http://host:port:user:pass
* - Simple format: host:port:user:pass (defaults to http)
*
* Usage:
* npx tsx src/scripts/import-proxies.ts < proxies.txt
* echo "http://host:port:user:pass" | npx tsx src/scripts/import-proxies.ts
* npx tsx src/scripts/import-proxies.ts --file proxies.txt
* npx tsx src/scripts/import-proxies.ts --url "http://host:port:user:pass"
*
* Options:
* --file <path> Read proxies from file (one per line)
* --url <url> Import a single proxy URL
* --max-connections Set max_connections for all imported proxies (default: 1)
* --dry-run Parse and show what would be imported without inserting
*/
import { getPool } from '../db/pool';
import * as fs from 'fs';
import * as readline from 'readline';
interface ParsedProxy {
protocol: string;
host: string;
port: number;
username?: string;
password?: string;
rawUrl: string;
}
/**
* Parse a proxy URL in various formats
*/
function parseProxyUrl(input: string): ParsedProxy | null {
const trimmed = input.trim();
if (!trimmed || trimmed.startsWith('#')) return null;
// Format 1: Standard URL format - http://user:pass@host:port
const standardMatch = trimmed.match(/^(https?|socks5):\/\/([^:]+):([^@]+)@([^:]+):(\d+)$/);
if (standardMatch) {
return {
protocol: standardMatch[1],
username: standardMatch[2],
password: standardMatch[3],
host: standardMatch[4],
port: parseInt(standardMatch[5], 10),
rawUrl: trimmed,
};
}
// Format 2: Standard URL without auth - http://host:port
const noAuthMatch = trimmed.match(/^(https?|socks5):\/\/([^:]+):(\d+)$/);
if (noAuthMatch) {
return {
protocol: noAuthMatch[1],
host: noAuthMatch[2],
port: parseInt(noAuthMatch[3], 10),
rawUrl: trimmed,
};
}
// Format 3: Colon format with protocol - http://host:port:user:pass
const colonWithProtocolMatch = trimmed.match(/^(https?|socks5):\/\/([^:]+):(\d+):([^:]+):(.+)$/);
if (colonWithProtocolMatch) {
return {
protocol: colonWithProtocolMatch[1],
host: colonWithProtocolMatch[2],
port: parseInt(colonWithProtocolMatch[3], 10),
username: colonWithProtocolMatch[4],
password: colonWithProtocolMatch[5],
rawUrl: trimmed, // Keep raw URL for non-standard format
};
}
// Format 4: Colon format without protocol - host:port:user:pass
const colonMatch = trimmed.match(/^([^:]+):(\d+):([^:]+):(.+)$/);
if (colonMatch) {
return {
protocol: 'http',
host: colonMatch[1],
port: parseInt(colonMatch[2], 10),
username: colonMatch[3],
password: colonMatch[4],
rawUrl: `http://${trimmed}`, // Construct raw URL
};
}
// Format 5: Simple host:port
const simpleMatch = trimmed.match(/^([^:]+):(\d+)$/);
if (simpleMatch) {
return {
protocol: 'http',
host: simpleMatch[1],
port: parseInt(simpleMatch[2], 10),
rawUrl: `http://${trimmed}`,
};
}
console.error(`[ImportProxies] Could not parse: ${trimmed}`);
return null;
}
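/*
 * Illustrative parses of the formats above (inputs invented):
 *   parseProxyUrl('http://user:pass@10.0.0.1:8080')
 *     → { protocol: 'http', username: 'user', password: 'pass',
 *         host: '10.0.0.1', port: 8080, rawUrl: 'http://user:pass@10.0.0.1:8080' }
 *   parseProxyUrl('10.0.0.1:8080:user:pass')
 *     → { protocol: 'http', host: '10.0.0.1', port: 8080,
 *         username: 'user', password: 'pass', rawUrl: 'http://10.0.0.1:8080:user:pass' }
 */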
/**
* Check if proxy URL is in non-standard format (needs proxy_url column)
*/
function isNonStandardFormat(rawUrl: string): boolean {
// Colon format: protocol://host:port:user:pass
return /^(https?|socks5):\/\/[^:]+:\d+:[^:]+:.+$/.test(rawUrl);
}
async function importProxies(proxies: ParsedProxy[], maxConnections: number, dryRun: boolean) {
if (dryRun) {
console.log('\n[ImportProxies] DRY RUN - Would import:');
for (const p of proxies) {
const needsRawUrl = isNonStandardFormat(p.rawUrl);
console.log(` ${p.host}:${p.port} (${p.protocol}) user=${p.username || 'none'} needsProxyUrl=${needsRawUrl}`);
}
console.log(`\nTotal: ${proxies.length} proxies`);
return;
}
const pool = getPool();
let inserted = 0;
let skipped = 0;
for (const proxy of proxies) {
try {
// Determine if we need to store the raw URL (non-standard format)
const needsRawUrl = isNonStandardFormat(proxy.rawUrl);
// Use different conflict resolution based on format
// Non-standard format: unique by proxy_url (session-based residential proxies)
// Standard format: unique by host/port/protocol
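      // (xmax = 0) distinguishes inserts from upsert-updates: Postgres sets
      // xmax to a nonzero transaction id on rows touched by DO UPDATE.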
const query = needsRawUrl
? `
INSERT INTO proxies (host, port, protocol, username, password, max_connections, proxy_url, active)
VALUES ($1, $2, $3, $4, $5, $6, $7, true)
ON CONFLICT (proxy_url) WHERE proxy_url IS NOT NULL
DO UPDATE SET
max_connections = EXCLUDED.max_connections,
active = true,
updated_at = NOW()
RETURNING id, (xmax = 0) as is_insert
`
: `
INSERT INTO proxies (host, port, protocol, username, password, max_connections, proxy_url, active)
VALUES ($1, $2, $3, $4, $5, $6, $7, true)
ON CONFLICT (host, port, protocol)
DO UPDATE SET
username = EXCLUDED.username,
password = EXCLUDED.password,
max_connections = EXCLUDED.max_connections,
proxy_url = EXCLUDED.proxy_url,
active = true,
updated_at = NOW()
RETURNING id, (xmax = 0) as is_insert
`;
const result = await pool.query(query, [
proxy.host,
proxy.port,
proxy.protocol,
proxy.username || null,
proxy.password || null,
maxConnections,
needsRawUrl ? proxy.rawUrl : null,
]);
const isInsert = result.rows[0]?.is_insert;
const sessionId = proxy.password?.match(/session-([A-Z0-9]+)/)?.[1] || '';
const displayName = sessionId ? `session ${sessionId}` : `${proxy.host}:${proxy.port}`;
if (isInsert) {
inserted++;
console.log(`[ImportProxies] Inserted: ${displayName}`);
} else {
console.log(`[ImportProxies] Updated: ${displayName}`);
inserted++; // Count updates too
}
} catch (err: any) {
const sessionId = proxy.password?.match(/session-([A-Z0-9]+)/)?.[1] || '';
const displayName = sessionId ? `session ${sessionId}` : `${proxy.host}:${proxy.port}`;
console.error(`[ImportProxies] Error inserting ${displayName}: ${err.message}`);
skipped++;
}
}
console.log(`\n[ImportProxies] Complete: ${inserted} imported, ${skipped} skipped`);
// Notify any listening workers
try {
await pool.query(`NOTIFY proxy_added, 'bulk import'`);
console.log('[ImportProxies] Sent proxy_added notification to workers');
} catch {
// Ignore notification errors
}
}
async function readFromStdin(): Promise<string[]> {
return new Promise((resolve) => {
const lines: string[] = [];
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout,
terminal: false,
});
rl.on('line', (line) => {
lines.push(line);
});
rl.on('close', () => {
resolve(lines);
});
});
}
async function main() {
const args = process.argv.slice(2);
let lines: string[] = [];
let maxConnections = 1;
let dryRun = false;
// Parse arguments
for (let i = 0; i < args.length; i++) {
if (args[i] === '--file' && args[i + 1]) {
const content = fs.readFileSync(args[i + 1], 'utf-8');
lines.push(...content.split('\n'));
i++;
} else if (args[i] === '--url' && args[i + 1]) {
lines.push(args[i + 1]);
i++;
} else if (args[i] === '--max-connections' && args[i + 1]) {
maxConnections = parseInt(args[i + 1], 10);
i++;
} else if (args[i] === '--dry-run') {
dryRun = true;
} else if (!args[i].startsWith('--')) {
// Treat as URL directly
lines.push(args[i]);
}
}
// If no lines yet, read from stdin
if (lines.length === 0) {
console.log('[ImportProxies] Reading from stdin...');
lines = await readFromStdin();
}
// Parse all lines
const proxies: ParsedProxy[] = [];
for (const line of lines) {
const parsed = parseProxyUrl(line);
if (parsed) {
proxies.push(parsed);
}
}
if (proxies.length === 0) {
console.error('[ImportProxies] No valid proxies found');
console.error('\nUsage:');
console.error(' npx tsx src/scripts/import-proxies.ts --url "http://host:port:user:pass"');
console.error(' npx tsx src/scripts/import-proxies.ts --file proxies.txt');
console.error(' echo "host:port:user:pass" | npx tsx src/scripts/import-proxies.ts');
console.error('\nSupported formats:');
console.error(' http://user:pass@host:port (standard)');
console.error(' http://host:port:user:pass (colon format)');
console.error(' host:port:user:pass (simple)');
process.exit(1);
}
console.log(`[ImportProxies] Parsed ${proxies.length} proxies (max_connections=${maxConnections})`);
await importProxies(proxies, maxConnections, dryRun);
}
main().catch((err) => {
console.error('[ImportProxies] Fatal error:', err);
process.exit(1);
});

View File

@@ -1,10 +1,10 @@
/** /**
* Test script for stealth session management * Test script for stealth session management
* *
* Per workflow-12102025.md: * Tests:
* - Tests HTTP fingerprinting (browser-specific headers + ordering) * 1. Per-session fingerprint rotation
* - Tests UA generation (device distribution, browser filtering) * 2. Geographic consistency (timezone → Accept-Language)
* - Tests dynamic Referer per dispensary * 3. Proxy location loading from database
* *
* Usage: * Usage:
* npx tsx src/scripts/test-stealth-session.ts * npx tsx src/scripts/test-stealth-session.ts
@@ -14,142 +14,104 @@ import {
startSession, startSession,
endSession, endSession,
getCurrentSession, getCurrentSession,
getFingerprint,
getRandomFingerprint,
getLocaleForTimezone,
buildHeaders, buildHeaders,
setCrawlRotator,
} from '../platforms/dutchie'; } from '../platforms/dutchie';
import { CrawlRotator } from '../services/crawl-rotator';
import {
generateHTTPFingerprint,
buildRefererFromMenuUrl,
BrowserType,
} from '../services/http-fingerprint';
console.log('='.repeat(60)); console.log('='.repeat(60));
console.log('STEALTH SESSION TEST (per workflow-12102025.md)'); console.log('STEALTH SESSION TEST');
console.log('='.repeat(60)); console.log('='.repeat(60));
// Initialize CrawlRotator (required for sessions) // Test 1: Timezone to Locale mapping
console.log('\n[Setup] Initializing CrawlRotator...'); console.log('\n[Test 1] Timezone to Locale Mapping:');
const rotator = new CrawlRotator(); const testTimezones = [
setCrawlRotator(rotator); 'America/Phoenix',
console.log(' CrawlRotator initialized'); 'America/Los_Angeles',
'America/New_York',
// Test 1: HTTP Fingerprint Generation 'America/Chicago',
console.log('\n[Test 1] HTTP Fingerprint Generation:');
const browsers: BrowserType[] = ['Chrome', 'Firefox', 'Safari', 'Edge'];
for (const browser of browsers) {
const httpFp = generateHTTPFingerprint(browser);
console.log(` ${browser}:`);
console.log(` TLS binary: ${httpFp.curlImpersonateBinary}`);
console.log(` DNT: ${httpFp.hasDNT ? 'enabled' : 'disabled'}`);
console.log(` Header order: ${httpFp.headerOrder.slice(0, 5).join(', ')}...`);
}
// Test 2: Dynamic Referer from menu URLs
console.log('\n[Test 2] Dynamic Referer from Menu URLs:');
const testUrls = [
'https://dutchie.com/embedded-menu/harvest-of-tempe',
'https://dutchie.com/dispensary/zen-leaf-mesa',
'/embedded-menu/deeply-rooted',
'/dispensary/curaleaf-phoenix',
null,
undefined, undefined,
'Invalid/Timezone',
]; ];
for (const url of testUrls) { for (const tz of testTimezones) {
const referer = buildRefererFromMenuUrl(url); const locale = getLocaleForTimezone(tz);
  console.log(`    ${url || '(null/undefined)'}`);  console.log(`  ${tz || '(undefined)'} → ${locale}`);
console.log(`${referer}`);
} }
// Test 3: Session with Dynamic Referer // Test 2: Random fingerprint selection
console.log('\n[Test 3] Session with Dynamic Referer:'); console.log('\n[Test 2] Random Fingerprint Selection (5 samples):');
const testMenuUrl = 'https://dutchie.com/dispensary/harvest-of-tempe'; for (let i = 0; i < 5; i++) {
console.log(` Starting session with menuUrl: ${testMenuUrl}`); const fp = getRandomFingerprint();
console.log(` ${i + 1}. ${fp.userAgent.slice(0, 60)}...`);
}
const session1 = startSession(testMenuUrl); // Test 3: Session Management
console.log('\n[Test 3] Session Management:');
// Before session - should use default fingerprint
console.log(' Before session:');
const beforeFp = getFingerprint();
console.log(` getFingerprint(): ${beforeFp.userAgent.slice(0, 50)}...`);
console.log(` getCurrentSession(): ${getCurrentSession()}`);
// Start session with Arizona timezone
console.log('\n Starting session (AZ, America/Phoenix):');
const session1 = startSession('AZ', 'America/Phoenix');
console.log(` Session ID: ${session1.sessionId}`); console.log(` Session ID: ${session1.sessionId}`);
console.log(` Browser: ${session1.fingerprint.browserName}`); console.log(` Fingerprint UA: ${session1.fingerprint.userAgent.slice(0, 50)}...`);
console.log(` Device: ${session1.fingerprint.deviceCategory}`); console.log(` Accept-Language: ${session1.fingerprint.acceptLanguage}`);
console.log(` Referer: ${session1.referer}`); console.log(` Timezone: ${session1.timezone}`);
console.log(` DNT: ${session1.fingerprint.httpFingerprint.hasDNT ? 'enabled' : 'disabled'}`);
console.log(` TLS: ${session1.fingerprint.httpFingerprint.curlImpersonateBinary}`);
// Test 4: Build Headers (browser-specific order) // During session - should use session fingerprint
console.log('\n[Test 4] Build Headers (browser-specific order):'); console.log('\n During session:');
const { headers, orderedHeaders } = buildHeaders(true, 1000); const duringFp = getFingerprint();
console.log(` Headers built for ${session1.fingerprint.browserName}:`); console.log(` getFingerprint(): ${duringFp.userAgent.slice(0, 50)}...`);
console.log(` Order: ${orderedHeaders.join(' → ')}`); console.log(` Same as session? ${duringFp.userAgent === session1.fingerprint.userAgent}`);
console.log(` Sample headers:`);
console.log(` User-Agent: ${headers['User-Agent']?.slice(0, 50)}...`);
console.log(` Accept: ${headers['Accept']}`);
console.log(` Accept-Language: ${headers['Accept-Language']}`);
console.log(` Referer: ${headers['Referer']}`);
if (headers['sec-ch-ua']) {
console.log(` sec-ch-ua: ${headers['sec-ch-ua']}`);
}
if (headers['DNT']) {
console.log(` DNT: ${headers['DNT']}`);
}
// Test buildHeaders with session
console.log('\n buildHeaders() during session:');
const headers = buildHeaders('/embedded-menu/test-store');
console.log(` User-Agent: ${headers['user-agent'].slice(0, 50)}...`);
console.log(` Accept-Language: ${headers['accept-language']}`);
console.log(` Origin: ${headers['origin']}`);
console.log(` Referer: ${headers['referer']}`);
// End session
console.log('\n Ending session:');
endSession(); endSession();
console.log(` getCurrentSession(): ${getCurrentSession()}`);
// Test 5: Multiple Sessions (UA variety) // Test 4: Multiple sessions should have different fingerprints
console.log('\n[Test 5] Multiple Sessions (UA & fingerprint variety):'); console.log('\n[Test 4] Multiple Sessions (fingerprint variety):');
const sessions: { const fingerprints: string[] = [];
browser: string;
device: string;
hasDNT: boolean;
}[] = [];
for (let i = 0; i < 10; i++) { for (let i = 0; i < 10; i++) {
const session = startSession(`/dispensary/store-${i}`); const session = startSession('CA', 'America/Los_Angeles');
sessions.push({ fingerprints.push(session.fingerprint.userAgent);
browser: session.fingerprint.browserName,
device: session.fingerprint.deviceCategory,
hasDNT: session.fingerprint.httpFingerprint.hasDNT,
});
endSession(); endSession();
} }
// Count distribution const uniqueCount = new Set(fingerprints).size;
const browserCounts: Record<string, number> = {}; console.log(` 10 sessions created, ${uniqueCount} unique fingerprints`);
const deviceCounts: Record<string, number> = {}; console.log(` Variety: ${uniqueCount >= 3 ? '✅ Good' : '⚠️ Low - may need more fingerprint options'}`);
let dntCount = 0;
for (const s of sessions) { // Test 5: Geographic consistency check
browserCounts[s.browser] = (browserCounts[s.browser] || 0) + 1; console.log('\n[Test 5] Geographic Consistency:');
deviceCounts[s.device] = (deviceCounts[s.device] || 0) + 1; const geoTests = [
if (s.hasDNT) dntCount++; { state: 'AZ', tz: 'America/Phoenix' },
} { state: 'CA', tz: 'America/Los_Angeles' },
{ state: 'NY', tz: 'America/New_York' },
{ state: 'IL', tz: 'America/Chicago' },
];
console.log(` 10 sessions created:`); for (const { state, tz } of geoTests) {
console.log(` Browsers: ${JSON.stringify(browserCounts)}`); const session = startSession(state, tz);
console.log(` Devices: ${JSON.stringify(deviceCounts)}`); const consistent = session.fingerprint.acceptLanguage.includes('en-US');
console.log(` DNT enabled: ${dntCount}/10 (expected ~30%)`); console.log(` ${state} (${tz}): Accept-Language=${session.fingerprint.acceptLanguage} ${consistent ? '✅' : '❌'}`);
// Test 6: Device distribution check (per workflow-12102025.md: 62/36/2)
console.log('\n[Test 6] Device Distribution (larger sample):');
const deviceSamples: string[] = [];
for (let i = 0; i < 100; i++) {
const session = startSession();
deviceSamples.push(session.fingerprint.deviceCategory);
endSession(); endSession();
} }
const mobileCount = deviceSamples.filter(d => d === 'mobile').length;
const desktopCount = deviceSamples.filter(d => d === 'desktop').length;
const tabletCount = deviceSamples.filter(d => d === 'tablet').length;
console.log(` 100 sessions (expected: 62% mobile, 36% desktop, 2% tablet):`);
console.log(` Mobile: ${mobileCount}%`);
console.log(` Desktop: ${desktopCount}%`);
console.log(` Tablet: ${tabletCount}%`);
console.log(` Distribution: ${Math.abs(mobileCount - 62) < 15 && Math.abs(desktopCount - 36) < 15 ? '✅ Reasonable' : '⚠️ Off target'}`);
console.log('\n' + '='.repeat(60)); console.log('\n' + '='.repeat(60));
console.log('TEST COMPLETE'); console.log('TEST COMPLETE');
console.log('='.repeat(60)); console.log('='.repeat(60));

View File

@@ -1,53 +1,49 @@
/** /**
* Crawl Rotator - Proxy & User Agent Rotation for Crawlers * Crawl Rotator - Proxy & User Agent Rotation for Crawlers
* *
* Updated: 2025-12-10 per workflow-12102025.md * Manages rotation of proxies and user agents to avoid blocks.
* * Used by platform-specific crawlers (Dutchie, Jane, etc.)
* KEY BEHAVIORS (per workflow-12102025.md):
* 1. Task determines WHAT work to do, proxy determines SESSION IDENTITY
* 2. Proxy location (timezone) sets Accept-Language headers (always English)
* 3. On 403: immediately get new IP, new fingerprint, retry
* 4. After 3 consecutive 403s on same proxy with different fingerprints → disable proxy
*
* USER-AGENT GENERATION (per workflow-12102025.md):
* - Device distribution: Mobile 62%, Desktop 36%, Tablet 2%
* - Browser whitelist: Chrome, Safari, Edge, Firefox only
* - UA sticks until IP rotates
* - Failure = alert admin + stop crawl (no fallback)
*
* Uses intoli/user-agents for realistic UA generation with daily-updated data.
* *
* Canonical location: src/services/crawl-rotator.ts * Canonical location: src/services/crawl-rotator.ts
*/ */
import { Pool } from 'pg'; import { Pool } from 'pg';
import UserAgent from 'user-agents';
import {
HTTPFingerprint,
generateHTTPFingerprint,
BrowserType,
} from './http-fingerprint';
// ============================================================ // ============================================================
// UA CONSTANTS (per workflow-12102025.md) // USER AGENT CONFIGURATION
// ============================================================ // ============================================================
/** /**
* Per workflow-12102025.md: Device category distribution (hardcoded) * Modern browser user agents (Chrome, Firefox, Safari, Edge on various platforms)
* Mobile: 62%, Desktop: 36%, Tablet: 2% * Updated: 2024
*/ */
const DEVICE_WEIGHTS = { export const USER_AGENTS = [
mobile: 62, // Chrome on Windows
desktop: 36, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
tablet: 2, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
} as const; 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
/** // Chrome on macOS
* Per workflow-12102025.md: Browser whitelist 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
* Only Chrome (67%), Safari (20%), Edge (6%), Firefox (3%) 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
* Samsung Internet, Opera, and other niche browsers are filtered out
*/ // Firefox on Windows
const ALLOWED_BROWSERS = ['Chrome', 'Safari', 'Edge', 'Firefox'] as const; 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
// Firefox on macOS
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
// Safari on macOS
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
// Edge on Windows
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
// Chrome on Linux
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
];
// ============================================================ // ============================================================
// PROXY TYPES // PROXY TYPES
@@ -65,23 +61,13 @@ export interface Proxy {
failureCount: number; failureCount: number;
successCount: number; successCount: number;
avgResponseTimeMs: number | null; avgResponseTimeMs: number | null;
maxConnections: number; maxConnections: number; // Number of concurrent connections allowed (for rotating proxies)
/** // Location info (if known)
* Per workflow-12102025.md: Track consecutive 403s with different fingerprints.
* After 3 consecutive 403s → disable proxy (it's burned).
*/
consecutive403Count: number;
// Location info - determines session headers per workflow-12102025.md
city?: string; city?: string;
state?: string; state?: string;
country?: string; country?: string;
countryCode?: string; countryCode?: string;
timezone?: string; timezone?: string;
/**
* Raw proxy URL override. If set, used directly instead of constructing from parts.
* Supports non-standard formats like: http://host:port:user:pass
*/
proxyUrl?: string;
} }
export interface ProxyStats { export interface ProxyStats {
@@ -91,40 +77,6 @@ export interface ProxyStats {
avgSuccessRate: number; avgSuccessRate: number;
} }
// ============================================================
// FINGERPRINT TYPE
// Per workflow-12102025.md: Full browser fingerprint from user-agents
// ============================================================
export interface BrowserFingerprint {
userAgent: string;
platform: string;
screenWidth: number;
screenHeight: number;
viewportWidth: number;
viewportHeight: number;
deviceCategory: string;
browserName: string; // Per workflow-12102025.md: for session logging
// Derived headers for anti-detect
acceptLanguage: string;
secChUa?: string;
secChUaPlatform?: string;
secChUaMobile?: string;
// Per workflow-12102025.md: HTTP Fingerprinting section
httpFingerprint: HTTPFingerprint;
}
/**
* Per workflow-12102025.md: Session log entry for debugging blocked sessions
*/
export interface UASessionLog {
deviceCategory: string;
browserName: string;
userAgent: string;
proxyIp: string | null;
sessionStartedAt: Date;
}
// ============================================================ // ============================================================
// PROXY ROTATOR CLASS // PROXY ROTATOR CLASS
// ============================================================ // ============================================================
@@ -134,26 +86,18 @@ export class ProxyRotator {
private proxies: Proxy[] = []; private proxies: Proxy[] = [];
private currentIndex: number = 0; private currentIndex: number = 0;
private lastRotation: Date = new Date(); private lastRotation: Date = new Date();
private lastReloadAt: Date = new Date();
// Proxy reload interval - how often to check for proxy changes (default: 60 seconds)
private reloadIntervalMs: number = 60000;
constructor(pool?: Pool) { constructor(pool?: Pool) {
this.pool = pool || null; this.pool = pool || null;
} }
/**
* Initialize with database pool
*/
setPool(pool: Pool): void { setPool(pool: Pool): void {
this.pool = pool; this.pool = pool;
} }
/**
* Set the reload interval for periodic proxy checks
*/
setReloadInterval(ms: number): void {
this.reloadIntervalMs = ms;
}
/** /**
* Load proxies from database * Load proxies from database
*/ */
@@ -178,87 +122,35 @@ export class ProxyRotator {
0 as "successCount", 0 as "successCount",
response_time_ms as "avgResponseTimeMs", response_time_ms as "avgResponseTimeMs",
COALESCE(max_connections, 1) as "maxConnections", COALESCE(max_connections, 1) as "maxConnections",
COALESCE(consecutive_403_count, 0) as "consecutive403Count",
city, city,
state, state,
country, country,
country_code as "countryCode", country_code as "countryCode",
timezone, timezone
proxy_url as "proxyUrl"
FROM proxies FROM proxies
WHERE active = true WHERE active = true
ORDER BY failure_count ASC, last_tested_at ASC NULLS FIRST ORDER BY failure_count ASC, last_tested_at ASC NULLS FIRST
`); `);
this.proxies = result.rows; this.proxies = result.rows;
this.lastReloadAt = new Date();
// Calculate total concurrent capacity
const totalCapacity = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0); const totalCapacity = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0);
console.log(`[ProxyRotator] Loaded ${this.proxies.length} active proxies (${totalCapacity} max concurrent connections / threads)`); console.log(`[ProxyRotator] Loaded ${this.proxies.length} active proxies (${totalCapacity} max concurrent connections)`);
} catch (error) { } catch (error) {
// Table might not exist - that's okay
console.warn(`[ProxyRotator] Could not load proxies: ${error}`); console.warn(`[ProxyRotator] Could not load proxies: ${error}`);
this.proxies = []; this.proxies = [];
} }
} }
/**
* Check if proxy list is stale and needs reload
*/
isStale(): boolean {
const elapsed = Date.now() - this.lastReloadAt.getTime();
return elapsed > this.reloadIntervalMs;
}
/**
* Reload proxies if the cache is stale.
* This ensures workers pick up new proxies or see disabled proxies removed.
* Returns true if proxies were reloaded.
*/
async reloadIfStale(): Promise<boolean> {
if (!this.isStale()) {
return false;
}
const oldCount = this.proxies.length;
const oldCapacity = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0);
const oldIds = new Set(this.proxies.map(p => p.id));
await this.loadProxies();
const newCount = this.proxies.length;
const newCapacity = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0);
const newIds = new Set(this.proxies.map(p => p.id));
// Log changes
const added = this.proxies.filter(p => !oldIds.has(p.id));
const removed = [...oldIds].filter(id => !newIds.has(id));
if (added.length > 0 || removed.length > 0 || oldCapacity !== newCapacity) {
      console.log(`[ProxyRotator] Reloaded proxies: ${oldCount} → ${newCount} proxies, ${oldCapacity} → ${newCapacity} threads`);
if (added.length > 0) {
console.log(`[ProxyRotator] Added: ${added.map(p => `${p.host}:${p.port} (${p.maxConnections} threads)`).join(', ')}`);
}
if (removed.length > 0) {
console.log(`[ProxyRotator] Removed: ${removed.join(', ')}`);
}
}
return true;
}
/**
* Get time since last reload in seconds
*/
getSecondsSinceReload(): number {
return Math.floor((Date.now() - this.lastReloadAt.getTime()) / 1000);
}
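/*
 * Illustrative use of the staleness helpers above (sketch): a worker calls
 * reloadIfStale() before claiming work so proxies added or disabled in the
 * database are picked up within one reload interval, without a restart.
 *
 *   await proxyRotator.reloadIfStale();   // no-op if reloaded recently
 *   const proxy = proxyRotator.getNext();
 */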
/** /**
* Get next proxy in rotation * Get next proxy in rotation
*/ */
getNext(): Proxy | null { getNext(): Proxy | null {
if (this.proxies.length === 0) return null; if (this.proxies.length === 0) return null;
// Round-robin rotation
this.currentIndex = (this.currentIndex + 1) % this.proxies.length; this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
this.lastRotation = new Date(); this.lastRotation = new Date();
@@ -293,68 +185,23 @@ export class ProxyRotator {
} }
/** /**
* Mark proxy as blocked (403 received) * Mark proxy as failed (temporarily remove from rotation)
* Per workflow-12102025.md:
* - Increment consecutive_403_count
* - After 3 consecutive 403s with different fingerprints → disable proxy
* - This is separate from general failures (timeouts, etc.)
*/
async markBlocked(proxyId: number): Promise<boolean> {
const proxy = this.proxies.find(p => p.id === proxyId);
let shouldDisable = false;
if (proxy) {
proxy.consecutive403Count++;
// Per workflow-12102025.md: 3 consecutive 403s → proxy is burned
if (proxy.consecutive403Count >= 3) {
proxy.isActive = false;
this.proxies = this.proxies.filter(p => p.id !== proxyId);
console.log(`[ProxyRotator] Proxy ${proxyId} DISABLED after ${proxy.consecutive403Count} consecutive 403s (burned)`);
shouldDisable = true;
} else {
console.log(`[ProxyRotator] Proxy ${proxyId} blocked (403 #${proxy.consecutive403Count}/3)`);
}
}
// Update database
if (this.pool) {
try {
await this.pool.query(`
UPDATE proxies
SET
consecutive_403_count = COALESCE(consecutive_403_count, 0) + 1,
last_failure_at = NOW(),
test_result = '403 Forbidden',
active = CASE WHEN COALESCE(consecutive_403_count, 0) >= 2 THEN false ELSE active END,
updated_at = NOW()
WHERE id = $1
`, [proxyId]);
} catch (err) {
console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
}
}
return shouldDisable;
}
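/*
 * Illustrative 403 handling around markBlocked (sketch; `proxyRotator` and
 * the retry wiring are assumptions):
 *
 *   if (response.status === 403) {
 *     const burned = await proxyRotator.markBlocked(proxy.id);
 *     const nextProxy = burned ? proxyRotator.getNext() : proxy;
 *     // retry with a fresh fingerprint on nextProxy
 *   }
 */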
/**
* Mark proxy as failed (general error - timeout, connection error, etc.)
* Separate from 403 blocking per workflow-12102025.md
*/ */
async markFailed(proxyId: number, error?: string): Promise<void> { async markFailed(proxyId: number, error?: string): Promise<void> {
// Update in-memory
const proxy = this.proxies.find(p => p.id === proxyId); const proxy = this.proxies.find(p => p.id === proxyId);
if (proxy) { if (proxy) {
proxy.failureCount++; proxy.failureCount++;
// Deactivate if too many general failures // Deactivate if too many failures
if (proxy.failureCount >= 5) { if (proxy.failureCount >= 5) {
proxy.isActive = false; proxy.isActive = false;
this.proxies = this.proxies.filter(p => p.id !== proxyId); this.proxies = this.proxies.filter(p => p.id !== proxyId);
console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} general failures`); console.log(`[ProxyRotator] Proxy ${proxyId} deactivated after ${proxy.failureCount} failures`);
} }
} }
// Update database
if (this.pool) { if (this.pool) {
try { try {
await this.pool.query(` await this.pool.query(`
@@ -373,22 +220,23 @@ export class ProxyRotator {
} }
/** /**
* Mark proxy as successful - resets consecutive 403 count * Mark proxy as successful
* Per workflow-12102025.md: successful request clears the 403 counter
*/ */
async markSuccess(proxyId: number, responseTimeMs?: number): Promise<void> { async markSuccess(proxyId: number, responseTimeMs?: number): Promise<void> {
// Update in-memory
const proxy = this.proxies.find(p => p.id === proxyId); const proxy = this.proxies.find(p => p.id === proxyId);
if (proxy) { if (proxy) {
proxy.successCount++; proxy.successCount++;
proxy.consecutive403Count = 0; // Reset on success per workflow-12102025.md
proxy.lastUsedAt = new Date(); proxy.lastUsedAt = new Date();
if (responseTimeMs !== undefined) { if (responseTimeMs !== undefined) {
// Rolling average
proxy.avgResponseTimeMs = proxy.avgResponseTimeMs proxy.avgResponseTimeMs = proxy.avgResponseTimeMs
? (proxy.avgResponseTimeMs * 0.8) + (responseTimeMs * 0.2) ? (proxy.avgResponseTimeMs * 0.8) + (responseTimeMs * 0.2)
: responseTimeMs; : responseTimeMs;
} }
} }
// Update database
if (this.pool) { if (this.pool) {
try { try {
await this.pool.query(` await this.pool.query(`
@@ -396,7 +244,6 @@ export class ProxyRotator {
SET SET
last_tested_at = NOW(), last_tested_at = NOW(),
test_result = 'success', test_result = 'success',
consecutive_403_count = 0,
response_time_ms = CASE response_time_ms = CASE
WHEN response_time_ms IS NULL THEN $2 WHEN response_time_ms IS NULL THEN $2
ELSE (response_time_ms * 0.8 + $2 * 0.2)::integer ELSE (response_time_ms * 0.8 + $2 * 0.2)::integer
@@ -412,24 +259,8 @@ export class ProxyRotator {
/**
* Get proxy URL for HTTP client
* If proxy.proxyUrl is set, uses it directly (supports non-standard formats).
* Otherwise constructs standard format: protocol://user:pass@host:port
*/
getProxyUrl(proxy: Proxy): string {
// If proxyUrl is set, check if it needs conversion from non-standard format
if (proxy.proxyUrl) {
// Check if it's in non-standard format: http://host:port:user:pass
const colonFormatMatch = proxy.proxyUrl.match(/^(https?):\/\/([^:]+):(\d+):([^:]+):(.+)$/);
if (colonFormatMatch) {
// Convert to standard format: http://user:pass@host:port
const [, protocol, host, port, username, password] = colonFormatMatch;
return `${protocol}://${encodeURIComponent(username)}:${encodeURIComponent(password)}@${host}:${port}`;
}
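// Worked example (illustrative values): 'http://10.0.0.1:8080:alice:p@ss'
// is rewritten to 'http://alice:p%40ss@10.0.0.1:8080' (credentials URL-encoded).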
// Already in standard format or unknown format - return as-is
return proxy.proxyUrl;
}
// Construct standard format from individual fields
const auth = proxy.username && proxy.password
? `${proxy.username}:${proxy.password}@`
: '';
@@ -441,8 +272,8 @@ export class ProxyRotator {
*/
getStats(): ProxyStats {
const totalProxies = this.proxies.length;
const activeProxies = this.proxies.reduce((sum, p) => sum + p.maxConnections, 0); // Total concurrent capacity
const blockedProxies = this.proxies.filter(p => p.failureCount >= 5 || p.consecutive403Count >= 3).length;
const successRates = this.proxies
.filter(p => p.successCount + p.failureCount > 0)
@@ -454,12 +285,15 @@ export class ProxyRotator {
return {
totalProxies,
activeProxies, // Total concurrent capacity across all proxies
blockedProxies,
avgSuccessRate,
};
}
/**
* Check if proxy pool has available proxies
*/
hasAvailableProxies(): boolean {
return this.proxies.length > 0;
}
@@ -467,194 +301,53 @@ export class ProxyRotator {
// ============================================================
// USER AGENT ROTATOR CLASS
// Per workflow-12102025.md: Uses intoli/user-agents for realistic fingerprints
// ============================================================
export class UserAgentRotator {
private currentFingerprint: BrowserFingerprint | null = null;
private sessionLog: UASessionLog | null = null;
private lastRotation: Date = new Date();
constructor() {
// Per workflow-12102025.md: Initialize with first fingerprint
this.rotate();
}
/**
* Per workflow-12102025.md: Roll device category based on distribution
* Mobile: 62%, Desktop: 36%, Tablet: 2%
*/ */
private rollDeviceCategory(): 'mobile' | 'desktop' | 'tablet' {
const roll = Math.random() * 100;
if (roll < DEVICE_WEIGHTS.mobile) {
return 'mobile';
} else if (roll < DEVICE_WEIGHTS.mobile + DEVICE_WEIGHTS.desktop) {
return 'desktop';
} else {
return 'tablet';
}
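// Worked example (illustrative): with weights 62/36/2, a roll of 30
// returns 'mobile', 75 returns 'desktop' (62 <= 75 < 98), 99 returns 'tablet'.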
}
/**
* Per workflow-12102025.md: Extract browser name from UA string
*/
private extractBrowserName(userAgent: string): string {
if (userAgent.includes('Edg/')) return 'Edge';
if (userAgent.includes('Firefox/')) return 'Firefox';
if (userAgent.includes('Safari/') && !userAgent.includes('Chrome/')) return 'Safari';
if (userAgent.includes('Chrome/')) return 'Chrome';
return 'Unknown';
}
/**
* Per workflow-12102025.md: Check if browser is in whitelist
*/
private isAllowedBrowser(userAgent: string): boolean {
const browserName = this.extractBrowserName(userAgent);
return ALLOWED_BROWSERS.includes(browserName as typeof ALLOWED_BROWSERS[number]);
}
/**
* Generate a new random fingerprint
* Per workflow-12102025.md:
* - Roll device category (62/36/2)
* - Filter to top 4 browsers only
* - Failure = alert admin + stop (no fallback)
*/ */
rotate(proxyIp?: string): BrowserFingerprint {
// Per workflow-12102025.md: Roll device category
const deviceCategory = this.rollDeviceCategory();
// Per workflow-12102025.md: Generate UA filtered to device category
const generator = new UserAgent({ deviceCategory });
// Per workflow-12102025.md: Try to get an allowed browser (max 50 attempts)
let ua: ReturnType<typeof generator>;
let attempts = 0;
const maxAttempts = 50;
do {
ua = generator();
attempts++;
} while (!this.isAllowedBrowser(ua.data.userAgent) && attempts < maxAttempts);
// Per workflow-12102025.md: If we can't get allowed browser, this is a failure
if (!this.isAllowedBrowser(ua.data.userAgent)) {
const errorMsg = `[UserAgentRotator] CRITICAL: Failed to generate allowed browser after ${maxAttempts} attempts. Device: ${deviceCategory}. Last UA: ${ua.data.userAgent}`;
console.error(errorMsg);
// Per workflow-12102025.md: Alert admin + stop crawl
// TODO: Post alert to admin dashboard
throw new Error(errorMsg);
}
const data = ua.data;
const browserName = this.extractBrowserName(data.userAgent);
// Build sec-ch-ua headers from user agent string
const secChUa = this.buildSecChUa(data.userAgent, deviceCategory);
// Per workflow-12102025.md: HTTP Fingerprinting - generate full HTTP fingerprint
const httpFingerprint = generateHTTPFingerprint(browserName as BrowserType);
this.currentFingerprint = {
userAgent: data.userAgent,
platform: data.platform,
screenWidth: data.screenWidth,
screenHeight: data.screenHeight,
viewportWidth: data.viewportWidth,
viewportHeight: data.viewportHeight,
deviceCategory: data.deviceCategory,
browserName, // Per workflow-12102025.md: for session logging
// Per workflow-12102025.md: always English
acceptLanguage: 'en-US,en;q=0.9',
...secChUa,
// Per workflow-12102025.md: HTTP Fingerprinting section
httpFingerprint,
};
// Per workflow-12102025.md: Log session data
this.sessionLog = {
deviceCategory,
browserName,
userAgent: data.userAgent,
proxyIp: proxyIp || null,
sessionStartedAt: new Date(),
};
console.log(`[UserAgentRotator] New fingerprint: device=${deviceCategory}, browser=${browserName}, UA=${data.userAgent.slice(0, 50)}...`);
return this.currentFingerprint;
}
/**
* Get current fingerprint without rotating
*/
getCurrent(): BrowserFingerprint {
if (!this.currentFingerprint) {
return this.rotate();
}
return this.currentFingerprint;
}
/**
* Get a random fingerprint (rotates and returns)
*/
getRandom(proxyIp?: string): BrowserFingerprint {
return this.rotate(proxyIp);
}
/**
* Per workflow-12102025.md: Get session log for debugging
*/
getSessionLog(): UASessionLog | null {
return this.sessionLog;
}
/**
* Build sec-ch-ua headers from user agent string
* Per workflow-12102025.md: Include mobile indicator based on device category
*/
private buildSecChUa(userAgent: string, deviceCategory: string): { secChUa?: string; secChUaPlatform?: string; secChUaMobile?: string } {
const isMobile = deviceCategory === 'mobile' || deviceCategory === 'tablet';
// Extract Chrome version if present
const chromeMatch = userAgent.match(/Chrome\/(\d+)/);
const edgeMatch = userAgent.match(/Edg\/(\d+)/);
if (edgeMatch) {
const version = edgeMatch[1];
return {
secChUa: `"Microsoft Edge";v="${version}", "Chromium";v="${version}", "Not_A Brand";v="24"`,
secChUaPlatform: userAgent.includes('Windows') ? '"Windows"' : userAgent.includes('Android') ? '"Android"' : '"macOS"',
secChUaMobile: isMobile ? '?1' : '?0',
};
}
if (chromeMatch) {
const version = chromeMatch[1];
let platform = '"Linux"';
if (userAgent.includes('Windows')) platform = '"Windows"';
else if (userAgent.includes('Mac')) platform = '"macOS"';
else if (userAgent.includes('Android')) platform = '"Android"';
else if (userAgent.includes('iPhone') || userAgent.includes('iPad')) platform = '"iOS"';
return {
secChUa: `"Google Chrome";v="${version}", "Chromium";v="${version}", "Not_A Brand";v="24"`,
secChUaPlatform: platform,
secChUaMobile: isMobile ? '?1' : '?0',
};
}
// Firefox/Safari don't send sec-ch-ua
return {};
}
getCount(): number {
return 1; // user-agents generates dynamically
}
}
// ============================================================
// COMBINED ROTATOR
// Per workflow-12102025.md: Coordinates proxy + fingerprint rotation
// ============================================================
export class CrawlRotator {
@@ -666,68 +359,49 @@ export class CrawlRotator {
this.userAgent = new UserAgentRotator();
}
/**
* Initialize rotator (load proxies from DB)
*/
async initialize(): Promise<void> {
await this.proxy.loadProxies();
}
/**
* Reload proxy list if stale.
* Workers should call this periodically to pick up proxy changes.
* Returns true if proxies were reloaded.
*/
async reloadIfStale(): Promise<boolean> {
return this.proxy.reloadIfStale();
}
/**
* Set proxy reload interval in milliseconds.
* Default is 60 seconds.
*/
setProxyReloadInterval(ms: number): void {
this.proxy.setReloadInterval(ms);
}
/**
* Rotate proxy only (get new IP)
*/
rotateProxy(): Proxy | null {
return this.proxy.getNext();
}
/**
* Rotate fingerprint only (new UA, screen size, etc.)
*/
rotateFingerprint(): BrowserFingerprint {
return this.userAgent.rotate();
}
/**
* Rotate both proxy and fingerprint
* Per workflow-12102025.md: called on 403 for fresh identity
* Passes proxy IP to UA rotation for session logging
*/
rotateBoth(): { proxy: Proxy | null; fingerprint: BrowserFingerprint } {
const proxy = this.proxy.getNext();
const proxyIp = proxy ? proxy.host : undefined;
return {
proxy,
fingerprint: this.userAgent.rotate(proxyIp),
};
}
/**
* Get current proxy and fingerprint without rotating
*/
getCurrent(): { proxy: Proxy | null; fingerprint: BrowserFingerprint } {
return {
proxy: this.proxy.getCurrent(),
fingerprint: this.userAgent.getCurrent(),
};
}
/**
* Record success for current proxy
* Per workflow-12102025.md: resets consecutive 403 count
*/
async recordSuccess(responseTimeMs?: number): Promise<void> {
const current = this.proxy.getCurrent();
@@ -737,20 +411,7 @@ export class CrawlRotator {
}
/**
* Record 403 block for current proxy
* Per workflow-12102025.md: increments consecutive_403_count, disables after 3
* Returns true if proxy was disabled
*/
async recordBlock(): Promise<boolean> {
const current = this.proxy.getCurrent();
if (current) {
return await this.proxy.markBlocked(current.id);
}
return false;
}
/**
* Record general failure (not 403)
*/ */
async recordFailure(error?: string): Promise<void> {
const current = this.proxy.getCurrent();
@@ -760,13 +421,14 @@ export class CrawlRotator {
}
/**
* Get current proxy location info
* Per workflow-12102025.md: proxy location determines session headers
* Note: For rotating proxies (like IPRoyal), the actual exit location varies per request
*/
getProxyLocation(): { city?: string; state?: string; country?: string; timezone?: string; isRotating: boolean } | null {
const current = this.proxy.getCurrent();
if (!current) return null;
// Check if this is a rotating proxy (max_connections > 1 usually indicates rotating)
const isRotating = current.maxConnections > 1;
return { return {
@@ -777,127 +439,6 @@ export class CrawlRotator {
isRotating
};
}
/**
* Get timezone from current proxy
* Per workflow-12102025.md: used for Accept-Language header
*/
getProxyTimezone(): string | undefined {
const current = this.proxy.getCurrent();
return current?.timezone;
}
/**
* Preflight check - verifies proxy and anti-detect are working
* MUST be called before any task execution to ensure anonymity.
*
* Tests:
* 1. Proxy available - a proxy must be loaded and active
* 2. Proxy connectivity - makes HTTP request through proxy to verify connection
* 3. Anti-detect headers - verifies fingerprint is set with required headers
*
* @returns Promise<PreflightResult> with pass/fail status and details
*/
async preflight(): Promise<PreflightResult> {
const result: PreflightResult = {
passed: false,
proxyAvailable: false,
proxyConnected: false,
antidetectReady: false,
proxyIp: null,
fingerprint: null,
error: null,
responseTimeMs: null,
};
// Step 1: Check proxy is available
const currentProxy = this.proxy.getCurrent();
if (!currentProxy) {
result.error = 'No proxy available';
console.log('[Preflight] FAILED - No proxy available');
return result;
}
result.proxyAvailable = true;
result.proxyIp = currentProxy.host;
// Step 2: Check fingerprint/anti-detect is ready
const fingerprint = this.userAgent.getCurrent();
if (!fingerprint || !fingerprint.userAgent) {
result.error = 'Anti-detect fingerprint not initialized';
console.log('[Preflight] FAILED - No fingerprint');
return result;
}
result.antidetectReady = true;
result.fingerprint = {
userAgent: fingerprint.userAgent,
browserName: fingerprint.browserName,
deviceCategory: fingerprint.deviceCategory,
};
// Step 3: Test proxy connectivity with an actual HTTP request
// Use httpbin.org/ip to verify request goes through proxy
const proxyUrl = this.proxy.getProxyUrl(currentProxy);
const testUrl = 'https://httpbin.org/ip';
try {
const { default: axios } = await import('axios');
const { HttpsProxyAgent } = await import('https-proxy-agent');
const agent = new HttpsProxyAgent(proxyUrl);
const startTime = Date.now();
const response = await axios.get(testUrl, {
httpsAgent: agent,
timeout: 15000, // 15 second timeout
headers: {
'User-Agent': fingerprint.userAgent,
'Accept-Language': fingerprint.acceptLanguage,
...(fingerprint.secChUa && { 'sec-ch-ua': fingerprint.secChUa }),
...(fingerprint.secChUaPlatform && { 'sec-ch-ua-platform': fingerprint.secChUaPlatform }),
...(fingerprint.secChUaMobile && { 'sec-ch-ua-mobile': fingerprint.secChUaMobile }),
},
});
result.responseTimeMs = Date.now() - startTime;
result.proxyConnected = true;
result.passed = true;
// Mark success on proxy stats
await this.proxy.markSuccess(currentProxy.id, result.responseTimeMs);
console.log(`[Preflight] PASSED - Proxy ${currentProxy.host} connected (${result.responseTimeMs}ms), UA: ${fingerprint.browserName}/${fingerprint.deviceCategory}`);
} catch (err: any) {
result.error = `Proxy connection failed: ${err.message || 'Unknown error'}`;
console.log(`[Preflight] FAILED - Proxy connection error: ${err.message}`);
// Mark failure on proxy stats
await this.proxy.markFailed(currentProxy.id, err.message);
}
return result;
}
}
/**
* Result from preflight check
*/
export interface PreflightResult {
/** Overall pass/fail */
passed: boolean;
/** Step 1: Is a proxy loaded? */
proxyAvailable: boolean;
/** Step 3: Did HTTP request through proxy succeed? */
proxyConnected: boolean;
/** Step 2: Is fingerprint/anti-detect ready? */
antidetectReady: boolean;
/** Current proxy IP */
proxyIp: string | null;
/** Fingerprint summary */
fingerprint: { userAgent: string; browserName: string; deviceCategory: string } | null;
/** Error message if failed */
error: string | null;
/** Proxy response time in ms */
responseTimeMs: number | null;
}
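// Illustrative preflight gate (hypothetical wiring; not part of the original file):
//
//   const check = await rotator.preflight();
//   if (!check.passed) throw new Error(`Preflight failed: ${check.error}`);
//   console.log(`Exit IP ${check.proxyIp} verified in ${check.responseTimeMs}ms`);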
// ============================================================

View File

@@ -1,100 +0,0 @@
/**
* Curl Preflight - Verify curl/axios transport works through proxy
*
* Tests:
* 1. Proxy is available and active
* 2. HTTP request through proxy succeeds
* 3. Anti-detect headers are properly set
*
* Use case: Fast, simple API requests that don't need browser fingerprint
*/
import axios from 'axios';
import { HttpsProxyAgent } from 'https-proxy-agent';
import { CrawlRotator, PreflightResult } from './crawl-rotator';
export interface CurlPreflightResult extends PreflightResult {
method: 'curl';
}
/**
* Run curl preflight check
* Tests proxy connectivity using axios/curl through the proxy
*/
export async function runCurlPreflight(
crawlRotator: CrawlRotator
): Promise<CurlPreflightResult> {
const result: CurlPreflightResult = {
method: 'curl',
passed: false,
proxyAvailable: false,
proxyConnected: false,
antidetectReady: false,
proxyIp: null,
fingerprint: null,
error: null,
responseTimeMs: null,
};
// Step 1: Check proxy is available
const currentProxy = crawlRotator.proxy.getCurrent();
if (!currentProxy) {
result.error = 'No proxy available';
console.log('[CurlPreflight] FAILED - No proxy available');
return result;
}
result.proxyAvailable = true;
result.proxyIp = currentProxy.host;
// Step 2: Check fingerprint/anti-detect is ready
const fingerprint = crawlRotator.userAgent.getCurrent();
if (!fingerprint || !fingerprint.userAgent) {
result.error = 'Anti-detect fingerprint not initialized';
console.log('[CurlPreflight] FAILED - No fingerprint');
return result;
}
result.antidetectReady = true;
result.fingerprint = {
userAgent: fingerprint.userAgent,
browserName: fingerprint.browserName,
deviceCategory: fingerprint.deviceCategory,
};
// Step 3: Test proxy connectivity with an actual HTTP request
const proxyUrl = crawlRotator.proxy.getProxyUrl(currentProxy);
const testUrl = 'https://httpbin.org/ip';
try {
const agent = new HttpsProxyAgent(proxyUrl);
const startTime = Date.now();
const response = await axios.get(testUrl, {
httpsAgent: agent,
timeout: 15000, // 15 second timeout
headers: {
'User-Agent': fingerprint.userAgent,
'Accept-Language': fingerprint.acceptLanguage,
...(fingerprint.secChUa && { 'sec-ch-ua': fingerprint.secChUa }),
...(fingerprint.secChUaPlatform && { 'sec-ch-ua-platform': fingerprint.secChUaPlatform }),
...(fingerprint.secChUaMobile && { 'sec-ch-ua-mobile': fingerprint.secChUaMobile }),
},
});
result.responseTimeMs = Date.now() - startTime;
result.proxyConnected = true;
result.passed = true;
// Mark success on proxy stats
await crawlRotator.proxy.markSuccess(currentProxy.id, result.responseTimeMs);
console.log(`[CurlPreflight] PASSED - Proxy ${currentProxy.host} connected (${result.responseTimeMs}ms), UA: ${fingerprint.browserName}/${fingerprint.deviceCategory}`);
} catch (err: any) {
result.error = `Proxy connection failed: ${err.message || 'Unknown error'}`;
console.log(`[CurlPreflight] FAILED - Proxy connection error: ${err.message}`);
// Mark failure on proxy stats
await crawlRotator.proxy.markFailed(currentProxy.id, err.message);
}
return result;
}
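// Illustrative usage (hypothetical wiring; not part of the original file):
//
//   const check = await runCurlPreflight(rotator);
//   if (!check.passed) {
//     // skip the curl transport and fall back to the browser-based one
//   }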

View File

@@ -1,315 +0,0 @@
/**
* HTTP Fingerprinting Service
*
* Per workflow-12102025.md - HTTP Fingerprinting section:
* - Full header set per browser type
* - Browser-specific header ordering
* - Natural randomization (DNT, Accept quality)
* - Dynamic Referer per dispensary
*
* Canonical location: src/services/http-fingerprint.ts
*/
// ============================================================
// TYPES
// ============================================================
export type BrowserType = 'Chrome' | 'Firefox' | 'Safari' | 'Edge';
/**
* Per workflow-12102025.md: Full HTTP fingerprint for a session
*/
export interface HTTPFingerprint {
browserType: BrowserType;
headers: Record<string, string>;
headerOrder: string[];
curlImpersonateBinary: string;
hasDNT: boolean;
}
/**
* Per workflow-12102025.md: Context for building headers
*/
export interface HeaderContext {
userAgent: string;
secChUa?: string;
secChUaPlatform?: string;
secChUaMobile?: string;
referer: string;
isPost: boolean;
contentLength?: number;
}
// ============================================================
// CONSTANTS (per workflow-12102025.md)
// ============================================================
/**
* Per workflow-12102025.md: DNT header distribution (~30% of users)
*/
const DNT_PROBABILITY = 0.30;
/**
* Per workflow-12102025.md: Accept header variations for natural traffic
*/
const ACCEPT_VARIATIONS = [
'application/json, text/plain, */*',
'application/json,text/plain,*/*',
'*/*',
];
/**
* Per workflow-12102025.md: Accept-Language variations
*/
const ACCEPT_LANGUAGE_VARIATIONS = [
'en-US,en;q=0.9',
'en-US,en;q=0.8',
'en-US;q=0.9,en;q=0.8',
];
/**
* Per workflow-12102025.md: curl-impersonate binaries per browser
*/
const CURL_IMPERSONATE_BINARIES: Record<BrowserType, string> = {
Chrome: 'curl_chrome131',
Edge: 'curl_chrome131', // Edge uses Chromium
Firefox: 'curl_ff133',
Safari: 'curl_safari17',
};
// ============================================================
// HEADER ORDERING (per workflow-12102025.md)
// ============================================================
/**
* Per workflow-12102025.md: Chrome header order for GraphQL requests
*/
const CHROME_HEADER_ORDER = [
'Host',
'Connection',
'Content-Length',
'sec-ch-ua',
'DNT',
'sec-ch-ua-mobile',
'User-Agent',
'sec-ch-ua-platform',
'Content-Type',
'Accept',
'Origin',
'sec-fetch-site',
'sec-fetch-mode',
'sec-fetch-dest',
'Referer',
'Accept-Encoding',
'Accept-Language',
];
/**
* Per workflow-12102025.md: Firefox header order for GraphQL requests
*/
const FIREFOX_HEADER_ORDER = [
'Host',
'User-Agent',
'Accept',
'Accept-Language',
'Accept-Encoding',
'Content-Type',
'Content-Length',
'Origin',
'DNT',
'Connection',
'Referer',
'sec-fetch-dest',
'sec-fetch-mode',
'sec-fetch-site',
];
/**
* Per workflow-12102025.md: Safari header order for GraphQL requests
*/
const SAFARI_HEADER_ORDER = [
'Host',
'Connection',
'Content-Length',
'Accept',
'User-Agent',
'Content-Type',
'Origin',
'Referer',
'Accept-Encoding',
'Accept-Language',
];
/**
* Per workflow-12102025.md: Edge uses Chrome order (Chromium-based)
*/
const HEADER_ORDERS: Record<BrowserType, string[]> = {
Chrome: CHROME_HEADER_ORDER,
Edge: CHROME_HEADER_ORDER,
Firefox: FIREFOX_HEADER_ORDER,
Safari: SAFARI_HEADER_ORDER,
};
// ============================================================
// FINGERPRINT GENERATION
// ============================================================
/**
* Per workflow-12102025.md: Generate HTTP fingerprint for a session
* Randomization is done once per session for consistency
*/
export function generateHTTPFingerprint(browserType: BrowserType): HTTPFingerprint {
// Per workflow-12102025.md: DNT randomized per session (~30%)
const hasDNT = Math.random() < DNT_PROBABILITY;
return {
browserType,
headers: {}, // Built dynamically per request
headerOrder: HEADER_ORDERS[browserType],
curlImpersonateBinary: CURL_IMPERSONATE_BINARIES[browserType],
hasDNT,
};
}
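// Illustrative usage (assumes the exports above):
//
//   const fp = generateHTTPFingerprint('Chrome');
//   // fp.headerOrder is CHROME_HEADER_ORDER, fp.curlImpersonateBinary is
//   // 'curl_chrome131', and fp.hasDNT is true for roughly 30% of sessions.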
/**
* Per workflow-12102025.md: Build complete headers for a request
* Returns headers in browser-specific order
*/
export function buildOrderedHeaders(
fingerprint: HTTPFingerprint,
context: HeaderContext
): { headers: Record<string, string>; orderedHeaders: string[] } {
const { browserType, hasDNT, headerOrder } = fingerprint;
const { userAgent, secChUa, secChUaPlatform, secChUaMobile, referer, isPost, contentLength } = context;
// Per workflow-12102025.md: Natural randomization for Accept
const accept = ACCEPT_VARIATIONS[Math.floor(Math.random() * ACCEPT_VARIATIONS.length)];
const acceptLanguage = ACCEPT_LANGUAGE_VARIATIONS[Math.floor(Math.random() * ACCEPT_LANGUAGE_VARIATIONS.length)];
// Build all possible headers
const allHeaders: Record<string, string> = {
'Connection': 'keep-alive',
'User-Agent': userAgent,
'Accept': accept,
'Accept-Language': acceptLanguage,
'Accept-Encoding': 'gzip, deflate, br',
};
// Per workflow-12102025.md: POST-only headers
if (isPost) {
allHeaders['Content-Type'] = 'application/json';
allHeaders['Origin'] = 'https://dutchie.com';
if (contentLength !== undefined) {
allHeaders['Content-Length'] = String(contentLength);
}
}
// Per workflow-12102025.md: Dynamic Referer per dispensary
allHeaders['Referer'] = referer;
// Per workflow-12102025.md: DNT randomized per session
if (hasDNT) {
allHeaders['DNT'] = '1';
}
// Per workflow-12102025.md: Chromium-only headers (Chrome, Edge)
if (browserType === 'Chrome' || browserType === 'Edge') {
if (secChUa) allHeaders['sec-ch-ua'] = secChUa;
if (secChUaMobile) allHeaders['sec-ch-ua-mobile'] = secChUaMobile;
if (secChUaPlatform) allHeaders['sec-ch-ua-platform'] = secChUaPlatform;
allHeaders['sec-fetch-site'] = 'same-origin';
allHeaders['sec-fetch-mode'] = 'cors';
allHeaders['sec-fetch-dest'] = 'empty';
}
// Per workflow-12102025.md: Firefox has sec-fetch but no sec-ch
if (browserType === 'Firefox') {
allHeaders['sec-fetch-site'] = 'same-origin';
allHeaders['sec-fetch-mode'] = 'cors';
allHeaders['sec-fetch-dest'] = 'empty';
}
// Per workflow-12102025.md: Safari has no sec-* headers
// Filter to only headers that exist and order them
const orderedHeaders: string[] = [];
const headers: Record<string, string> = {};
for (const headerName of headerOrder) {
if (allHeaders[headerName]) {
orderedHeaders.push(headerName);
headers[headerName] = allHeaders[headerName];
}
}
return { headers, orderedHeaders };
}
/**
* Per workflow-12102025.md: Build curl command arguments for headers
* Headers are added in browser-specific order
*/
export function buildCurlHeaderArgs(
fingerprint: HTTPFingerprint,
context: HeaderContext
): string[] {
const { headers, orderedHeaders } = buildOrderedHeaders(fingerprint, context);
const args: string[] = [];
for (const headerName of orderedHeaders) {
// Skip Host and Content-Length - curl handles these
if (headerName === 'Host' || headerName === 'Content-Length') continue;
args.push('-H', `${headerName}: ${headers[headerName]}`);
}
return args;
}
/**
* Per workflow-12102025.md: Extract Referer from dispensary menu_url
*/
export function buildRefererFromMenuUrl(menuUrl: string | null | undefined): string {
if (!menuUrl) {
return 'https://dutchie.com/';
}
// Extract slug from menu_url
// Formats: /embedded-menu/<slug> or /dispensary/<slug> or full URL
let slug: string | null = null;
const embeddedMatch = menuUrl.match(/\/embedded-menu\/([^/?]+)/);
const dispensaryMatch = menuUrl.match(/\/dispensary\/([^/?]+)/);
if (embeddedMatch) {
slug = embeddedMatch[1];
} else if (dispensaryMatch) {
slug = dispensaryMatch[1];
}
if (slug) {
return `https://dutchie.com/dispensary/${slug}`;
}
return 'https://dutchie.com/';
}
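// Worked examples (illustrative slugs):
//   buildRefererFromMenuUrl('/embedded-menu/green-leaf')       -> 'https://dutchie.com/dispensary/green-leaf'
//   buildRefererFromMenuUrl('https://x.io/dispensary/abc?tab') -> 'https://dutchie.com/dispensary/abc'
//   buildRefererFromMenuUrl(null)                              -> 'https://dutchie.com/'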
/**
* Per workflow-12102025.md: Get curl-impersonate binary for browser
*/
export function getCurlBinary(browserType: BrowserType): string {
return CURL_IMPERSONATE_BINARIES[browserType];
}
/**
* Per workflow-12102025.md: Check if curl-impersonate is available
*/
export function isCurlImpersonateAvailable(browserType: BrowserType): boolean {
const binary = CURL_IMPERSONATE_BINARIES[browserType];
try {
const { execSync } = require('child_process');
execSync(`which ${binary}`, { stdio: 'ignore' });
return true;
} catch {
return false;
}
}

View File

@@ -1,290 +0,0 @@
/**
* Puppeteer Preflight - Verify browser-based transport works with anti-detect
*
* Uses Puppeteer + StealthPlugin to:
* 1. Launch headless browser with stealth mode + PROXY
* 2. Visit fingerprint.com demo to verify anti-detect and confirm proxy IP
* 3. Establish session by visiting Dutchie embedded menu
* 4. Make GraphQL request from browser context
* 5. Verify we get a valid response (not blocked)
*
* Use case: Anti-detect scraping that needs real browser fingerprint through proxy
*
* Based on test-intercept.js which successfully captures 1000+ products
*/
import { PreflightResult, CrawlRotator } from './crawl-rotator';
// GraphQL hash for FilteredProducts query - MUST match CLAUDE.md
const FILTERED_PRODUCTS_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
// Test dispensary - AZ-Deeply-Rooted (known working)
const TEST_CNAME = 'AZ-Deeply-Rooted';
const TEST_PLATFORM_ID = '6405ef617056e8014d79101b';
// Anti-detect verification sites (primary + fallback)
const FINGERPRINT_DEMO_URL = 'https://demo.fingerprint.com/';
const AMIUNIQUE_URL = 'https://amiunique.org/fingerprint';
// IP geolocation API for timezone lookup (free, no key required)
const IP_API_URL = 'http://ip-api.com/json';
/**
* Look up timezone from IP address using ip-api.com
* Returns IANA timezone (e.g., 'America/New_York') or null on failure
*/
async function getTimezoneFromIp(ip: string): Promise<{ timezone: string; city?: string; region?: string } | null> {
try {
const axios = require('axios');
const response = await axios.get(`${IP_API_URL}/${ip}?fields=status,timezone,city,regionName`, {
timeout: 5000,
});
if (response.data?.status === 'success' && response.data?.timezone) {
return {
timezone: response.data.timezone,
city: response.data.city,
region: response.data.regionName,
};
}
return null;
} catch (err: any) {
console.log(`[PuppeteerPreflight] IP geolocation lookup failed: ${err.message}`);
return null;
}
}
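// Illustrative call (documentation IP; response values are hypothetical):
//   const geo = await getTimezoneFromIp('203.0.113.7');
//   // -> { timezone: 'America/Phoenix', city: 'Phoenix', region: 'Arizona' } or null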
export interface PuppeteerPreflightResult extends PreflightResult {
method: 'http';
/** Number of products returned (proves API access) */
productsReturned?: number;
/** Browser user agent used */
browserUserAgent?: string;
/** Bot detection result from fingerprint.com */
botDetection?: {
detected: boolean;
probability?: number;
type?: string;
};
/** Expected proxy IP (from pool) */
expectedProxyIp?: string;
/** Whether IP verification passed (detected IP matches proxy) */
ipVerified?: boolean;
/** Detected timezone from IP geolocation */
detectedTimezone?: string;
/** Detected location from IP geolocation */
detectedLocation?: {
city?: string;
region?: string;
};
}
/**
* Run Puppeteer preflight check with proxy
* Tests browser-based access with anti-detect verification via fingerprint.com
*
* @param crawlRotator - CrawlRotator instance to get proxy from pool
*/
export async function runPuppeteerPreflight(
crawlRotator?: CrawlRotator
): Promise<PuppeteerPreflightResult> {
const result: PuppeteerPreflightResult = {
method: 'http',
passed: false,
proxyAvailable: false,
proxyConnected: false,
antidetectReady: false,
proxyIp: null,
fingerprint: null,
error: null,
responseTimeMs: null,
productsReturned: 0,
ipVerified: false,
};
let browser: any = null;
try {
// Step 0: Get a proxy from the pool
let proxyUrl: string | null = null;
let expectedProxyHost: string | null = null;
if (crawlRotator) {
const currentProxy = crawlRotator.proxy.getCurrent();
if (currentProxy) {
result.proxyAvailable = true;
proxyUrl = crawlRotator.proxy.getProxyUrl(currentProxy);
expectedProxyHost = currentProxy.host;
result.expectedProxyIp = expectedProxyHost;
console.log(`[PuppeteerPreflight] Using proxy: ${currentProxy.host}:${currentProxy.port}`);
} else {
result.error = 'No proxy available from pool';
console.log(`[PuppeteerPreflight] FAILED - No proxy available`);
return result;
}
} else {
console.log(`[PuppeteerPreflight] WARNING: No CrawlRotator provided - using direct connection`);
result.proxyAvailable = true; // No proxy needed for direct
}
// Dynamic imports to avoid loading Puppeteer unless needed
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
const startTime = Date.now();
// Build browser args
const browserArgs = ['--no-sandbox', '--disable-setuid-sandbox'];
if (proxyUrl) {
// Extract host:port for Puppeteer (it handles auth separately)
const proxyUrlParsed = new URL(proxyUrl);
browserArgs.push(`--proxy-server=${proxyUrlParsed.host}`);
}
// Launch browser with stealth + proxy
browser = await puppeteer.launch({
headless: 'new',
args: browserArgs,
});
const page = await browser.newPage();
// If proxy has auth, set it up
if (proxyUrl) {
const proxyUrlParsed = new URL(proxyUrl);
if (proxyUrlParsed.username && proxyUrlParsed.password) {
await page.authenticate({
username: decodeURIComponent(proxyUrlParsed.username),
password: decodeURIComponent(proxyUrlParsed.password),
});
}
}
// Get browser user agent
const userAgent = await page.evaluate(() => navigator.userAgent);
result.browserUserAgent = userAgent;
result.fingerprint = {
userAgent,
browserName: 'Chrome (Puppeteer)',
deviceCategory: 'desktop',
};
// =========================================================================
// STEP 1a: Get IP address directly via simple API (more reliable than scraping)
// =========================================================================
console.log(`[PuppeteerPreflight] Getting proxy IP address...`);
try {
const ipApiResponse = await page.evaluate(async () => {
try {
const response = await fetch('https://api.ipify.org?format=json');
const data = await response.json();
return { ip: data.ip, error: null };
} catch (err: any) {
return { ip: null, error: err.message };
}
});
if (ipApiResponse.ip) {
result.proxyIp = ipApiResponse.ip;
result.proxyConnected = true;
console.log(`[PuppeteerPreflight] Detected proxy IP: ${ipApiResponse.ip}`);
// Look up timezone from IP
const geoData = await getTimezoneFromIp(ipApiResponse.ip);
if (geoData) {
result.detectedTimezone = geoData.timezone;
result.detectedLocation = { city: geoData.city, region: geoData.region };
console.log(`[PuppeteerPreflight] IP Geolocation: ${geoData.city}, ${geoData.region} (${geoData.timezone})`);
// Set browser timezone to match proxy location via CDP
try {
const client = await page.target().createCDPSession();
await client.send('Emulation.setTimezoneOverride', { timezoneId: geoData.timezone });
console.log(`[PuppeteerPreflight] Browser timezone set to: ${geoData.timezone}`);
} catch (tzErr: any) {
console.log(`[PuppeteerPreflight] Failed to set browser timezone: ${tzErr.message}`);
}
} else {
console.log(`[PuppeteerPreflight] WARNING: Could not determine timezone from IP - timezone mismatch possible`);
}
} else {
console.log(`[PuppeteerPreflight] IP lookup failed: ${ipApiResponse.error || 'unknown error'}`);
}
} catch (ipErr: any) {
console.log(`[PuppeteerPreflight] IP API error: ${ipErr.message}`);
}
// =========================================================================
// STEP 2: Preflight complete - proxy verified via ipify.org
// We skip heavy fingerprint.com/amiunique.org tests - just verify proxy works
// The actual Dutchie test happens at task time.
// =========================================================================
// If we got an IP from ipify.org, proxy is working
if (result.proxyIp) {
result.proxyConnected = true;
result.antidetectReady = true; // Assume stealth plugin is working
}
result.responseTimeMs = Date.now() - startTime;
// If we got here with proxyConnected=true and antidetectReady=true, we're good
if (result.proxyConnected && result.antidetectReady) {
result.passed = true;
console.log(
`[PuppeteerPreflight] PASSED - Proxy connected, anti-detect ready (${result.responseTimeMs}ms)`
);
if (result.proxyIp) {
console.log(`[PuppeteerPreflight] Browser IP via proxy: ${result.proxyIp}`);
}
} else if (result.proxyConnected) {
// Proxy works but anti-detect check failed - still pass (anti-detect is best-effort)
result.passed = true;
result.antidetectReady = true; // Assume ready since proxy works
console.log(
`[PuppeteerPreflight] PASSED - Proxy connected (anti-detect check skipped, ${result.responseTimeMs}ms)`
);
} else {
result.error = result.error || 'Proxy connection failed';
console.log(`[PuppeteerPreflight] FAILED - ${result.error}`);
}
} catch (err: any) {
result.error = `Browser error: ${err.message || 'Unknown error'}`;
console.log(`[PuppeteerPreflight] FAILED - ${result.error}`);
} finally {
if (browser) {
await browser.close().catch(() => {});
}
}
return result;
}
/**
* Run Puppeteer preflight with retry
* Retries once on failure to handle transient issues
*
* @param crawlRotator - CrawlRotator instance to get proxy from pool
* @param maxRetries - Number of retry attempts (default 1)
*/
export async function runPuppeteerPreflightWithRetry(
crawlRotator?: CrawlRotator,
maxRetries: number = 1
): Promise<PuppeteerPreflightResult> {
let lastResult: PuppeteerPreflightResult | null = null;
for (let attempt = 0; attempt <= maxRetries; attempt++) {
if (attempt > 0) {
console.log(`[PuppeteerPreflight] Retry attempt ${attempt}/${maxRetries}...`);
await new Promise((r) => setTimeout(r, 5000)); // Wait 5s between retries
}
lastResult = await runPuppeteerPreflight(crawlRotator);
if (lastResult.passed) {
return lastResult;
}
}
return lastResult!;
}
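// Illustrative gate around a task run (hypothetical wiring; not part of the original file):
//
//   const result = await runPuppeteerPreflightWithRetry(rotator, 1);
//   if (!result.passed) throw new Error(result.error ?? 'preflight failed');
//   // result.proxyIp / result.detectedTimezone describe the verified exit node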

View File

@@ -1,38 +1,116 @@
import cron from 'node-cron';
import { pool } from '../db/pool';
import { scrapeStore, scrapeCategory } from '../scraper-v2';
let scheduledJobs: cron.ScheduledTask[] = [];
async function getSettings(): Promise<{
scrapeIntervalHours: number;
scrapeSpecialsTime: string;
}> {
const result = await pool.query(`
SELECT key, value FROM settings
WHERE key IN ('scrape_interval_hours', 'scrape_specials_time')
`);
const settings: Record<string, string> = {};
result.rows.forEach((row: { key: string; value: string }) => {
settings[row.key] = row.value;
});
return {
scrapeIntervalHours: parseInt(settings.scrape_interval_hours || '4'),
scrapeSpecialsTime: settings.scrape_specials_time || '00:01'
};
}
async function scrapeAllStores(): Promise<void> {
console.log('🔄 Starting scheduled scrape for all stores...');
const result = await pool.query(`
SELECT id, name FROM stores WHERE active = true AND scrape_enabled = true
`);
for (const store of result.rows) {
try {
console.log(`Scraping store: ${store.name}`);
await scrapeStore(store.id);
} catch (error) {
console.error(`Failed to scrape store ${store.name}:`, error);
}
}
console.log('✅ Scheduled scrape completed');
}
async function scrapeSpecials(): Promise<void> {
console.log('🌟 Starting scheduled specials scrape...');
const result = await pool.query(`
SELECT s.id, s.name, c.id as category_id
FROM stores s
JOIN categories c ON c.store_id = s.id
WHERE s.active = true AND s.scrape_enabled = true
AND c.slug = 'specials' AND c.scrape_enabled = true
`);
for (const row of result.rows) {
try {
console.log(`Scraping specials for: ${row.name}`);
await scrapeCategory(row.id, row.category_id);
} catch (error) {
console.error(`Failed to scrape specials for ${row.name}:`, error);
}
}
console.log('✅ Specials scrape completed');
}
export async function startScheduler(): Promise<void> {
// Stop any existing jobs
stopScheduler();
const settings = await getSettings();
// Schedule regular store scrapes (every N hours)
const scrapeIntervalCron = `0 */${settings.scrapeIntervalHours} * * *`;
const storeJob = cron.schedule(scrapeIntervalCron, scrapeAllStores);
scheduledJobs.push(storeJob);
console.log(`📅 Scheduled store scraping: every ${settings.scrapeIntervalHours} hours`);
// Schedule specials scraping (daily at specified time)
const [hours, minutes] = settings.scrapeSpecialsTime.split(':');
const specialsCron = `${minutes} ${hours} * * *`;
const specialsJob = cron.schedule(specialsCron, scrapeSpecials);
scheduledJobs.push(specialsJob);
console.log(`📅 Scheduled specials scraping: daily at ${settings.scrapeSpecialsTime}`);
// Initial scrape on startup (after 10 seconds)
setTimeout(() => {
console.log('🚀 Running initial scrape...');
scrapeAllStores().catch(console.error);
}, 10000);
}
export function stopScheduler(): void {
scheduledJobs.forEach(job => job.stop());
scheduledJobs = [];
console.log('🛑 Scheduler stopped');
}
export async function restartScheduler(): Promise<void> {
console.log('🔄 Restarting scheduler...');
stopScheduler();
await startScheduler();
}
// Manual trigger functions for admin
export async function triggerStoreScrape(storeId: number): Promise<void> {
console.log(`🔧 Manual scrape triggered for store ID: ${storeId}`);
await scrapeStore(storeId);
}
export async function triggerAllStoresScrape(): Promise<void> {
console.log('🔧 Manual scrape triggered for all stores');
await scrapeAllStores();
}
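// Worked examples of the cron strings built above (default settings assumed):
//   scrape_interval_hours = 4       -> '0 */4 * * *' (minute 0 of every 4th hour)
//   scrape_specials_time  = '00:01' -> '1 0 * * *'   (daily at 00:01)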

View File

@@ -1,526 +0,0 @@
/**
* Database-Driven Task Scheduler
*
* Per TASK_WORKFLOW_2024-12-10.md:
* - Schedules stored in DB (survives restarts)
* - Uses SELECT FOR UPDATE to prevent duplicate execution across replicas
* - Polls every 60s to check if schedules are due
* - Generates tasks into worker_tasks table for task-worker.ts to process
*
* 2024-12-10: Created to replace legacy node-cron scheduler
*/
import { pool } from '../db/pool';
import { taskService, TaskRole } from '../tasks/task-service';
// Per TASK_WORKFLOW_2024-12-10.md: Poll interval for checking schedules
const POLL_INTERVAL_MS = 60_000; // 60 seconds
interface TaskSchedule {
id: number;
name: string;
role: TaskRole;
enabled: boolean;
interval_hours: number;
last_run_at: Date | null;
next_run_at: Date | null;
state_code: string | null;
priority: number;
method: 'curl' | 'http' | null;
is_immutable: boolean;
description: string | null;
platform: string | null;
last_task_count: number | null;
last_error: string | null;
}
class TaskScheduler {
private pollTimer: NodeJS.Timeout | null = null;
private isRunning = false;
/**
* Start the scheduler
* Per TASK_WORKFLOW_2024-12-10.md: Called on API server startup
*/
async start(): Promise<void> {
if (this.isRunning) {
console.log('[TaskScheduler] Already running');
return;
}
console.log('[TaskScheduler] Starting database-driven scheduler...');
this.isRunning = true;
// Per TASK_WORKFLOW_2024-12-10.md: On startup, recover stale tasks
try {
const recovered = await taskService.recoverStaleTasks(10);
if (recovered > 0) {
console.log(`[TaskScheduler] Recovered ${recovered} stale tasks from dead workers`);
}
} catch (err: any) {
console.error('[TaskScheduler] Failed to recover stale tasks:', err.message);
}
// Per TASK_WORKFLOW_2024-12-10.md: Ensure default schedules exist
await this.ensureDefaultSchedules();
// Per TASK_WORKFLOW_2024-12-10.md: Check immediately on startup
await this.checkAndRunDueSchedules();
// Per TASK_WORKFLOW_2024-12-10.md: Then poll every 60 seconds
this.pollTimer = setInterval(async () => {
await this.checkAndRunDueSchedules();
}, POLL_INTERVAL_MS);
console.log('[TaskScheduler] Started - polling every 60s');
}
/**
* Stop the scheduler
*/
stop(): void {
if (this.pollTimer) {
clearInterval(this.pollTimer);
this.pollTimer = null;
}
this.isRunning = false;
console.log('[TaskScheduler] Stopped');
}
/**
* Ensure default schedules exist in the database
* Per TASK_WORKFLOW_2024-12-10.md: Creates schedules if they don't exist
*
* NOTE: Per-state product_discovery schedules are created by migration 089.
* This only creates core immutable schedules that should exist regardless.
*/
private async ensureDefaultSchedules(): Promise<void> {
// Core schedules - all use HTTP transport for browser-based scraping
const defaults = [
{
name: 'store_discovery_dutchie',
role: 'store_discovery' as TaskRole,
interval_hours: 168, // Weekly
priority: 5,
description: 'Discover new Dutchie stores weekly (HTTP transport)',
method: 'http',
is_immutable: true,
platform: 'dutchie',
},
{
name: 'analytics_refresh',
role: 'analytics_refresh' as TaskRole,
interval_hours: 6,
priority: 0,
description: 'Refresh analytics materialized views every 6 hours',
method: 'http',
is_immutable: true,
platform: null,
},
];
for (const sched of defaults) {
try {
await pool.query(`
INSERT INTO task_schedules (name, role, interval_hours, priority, description, method, is_immutable, platform, enabled, next_run_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, true, NOW())
ON CONFLICT (name) DO UPDATE SET
method = EXCLUDED.method,
is_immutable = EXCLUDED.is_immutable
`, [sched.name, sched.role, sched.interval_hours, sched.priority, sched.description, sched.method, sched.is_immutable, sched.platform]);
} catch (err: any) {
// Table may not exist yet - will be created by migration
if (!err.message.includes('does not exist')) {
console.error(`[TaskScheduler] Failed to create default schedule ${sched.name}:`, err.message);
}
}
}
}
/**
* Check for and run any due schedules
* Per TASK_WORKFLOW_2024-12-10.md: Uses SELECT FOR UPDATE SKIP LOCKED to prevent duplicates
*/
private async checkAndRunDueSchedules(): Promise<void> {
const client = await pool.connect();
try {
await client.query('BEGIN');
// Per TASK_WORKFLOW_2024-12-10.md: Atomic claim of due schedules
const result = await client.query<TaskSchedule>(`
SELECT *
FROM task_schedules
WHERE enabled = true
AND (next_run_at IS NULL OR next_run_at <= NOW())
FOR UPDATE SKIP LOCKED
`);
for (const schedule of result.rows) {
console.log(`[TaskScheduler] Running schedule: ${schedule.name} (${schedule.role})`);
try {
const tasksCreated = await this.executeSchedule(schedule);
console.log(`[TaskScheduler] Schedule ${schedule.name} created ${tasksCreated} tasks`);
// Per TASK_WORKFLOW_2024-12-10.md: Update last_run_at and calculate next_run_at
await client.query(`
UPDATE task_schedules
SET
last_run_at = NOW(),
next_run_at = NOW() + ($1 || ' hours')::interval,
last_task_count = $2,
updated_at = NOW()
WHERE id = $3
`, [schedule.interval_hours, tasksCreated, schedule.id]);
} catch (err: any) {
console.error(`[TaskScheduler] Schedule ${schedule.name} failed:`, err.message);
// Still update next_run_at to prevent infinite retry loop
await client.query(`
UPDATE task_schedules
SET
next_run_at = NOW() + ($1 || ' hours')::interval,
last_error = $2,
updated_at = NOW()
WHERE id = $3
`, [schedule.interval_hours, err.message, schedule.id]);
}
}
await client.query('COMMIT');
} catch (err: any) {
await client.query('ROLLBACK');
console.error('[TaskScheduler] Failed to check schedules:', err.message);
} finally {
client.release();
}
}
/**
* Execute a schedule and create tasks
* Per TASK_WORKFLOW_2024-12-10.md: Different logic per role
*
* TRANSPORT MODES:
* - All schedules now use HTTP transport (Puppeteer/browser)
* - Per-state product_discovery schedules process one state at a time
* - Workers must pass HTTP preflight to claim HTTP tasks
*/
private async executeSchedule(schedule: TaskSchedule): Promise<number> {
switch (schedule.role) {
case 'product_discovery':
// Per-state product discovery using HTTP transport
return this.generateProductDiscoveryTasks(schedule);
case 'payload_fetch':
// DEPRECATED: Legacy payload_fetch redirects to product_discovery
console.log(`[TaskScheduler] payload_fetch is deprecated, using product_discovery instead`);
return this.generateProductDiscoveryTasks(schedule);
case 'product_refresh':
// DEPRECATED: Legacy product_refresh redirects to product_discovery
console.log(`[TaskScheduler] product_refresh is deprecated, using product_discovery instead`);
return this.generateProductDiscoveryTasks(schedule);
case 'store_discovery':
return this.generateStoreDiscoveryTasks(schedule);
case 'analytics_refresh':
return this.generateAnalyticsRefreshTasks(schedule);
default:
console.warn(`[TaskScheduler] Unknown role: ${schedule.role}`);
return 0;
}
}
/**
* Generate product_discovery tasks for stores in a specific state
* Uses HTTP transport (Puppeteer/browser) for all tasks
*
* Per-state scheduling allows:
* - Different crawl frequencies per state (e.g., AZ=4h, MI=6h)
* - Better rate limit management (one state at a time)
* - Easier debugging and monitoring per state
*/
private async generateProductDiscoveryTasks(schedule: TaskSchedule): Promise<number> {
// state_code is required for per-state schedules
if (!schedule.state_code) {
console.warn(`[TaskScheduler] Schedule ${schedule.name} has no state_code, skipping`);
return 0;
}
// Find stores in this state needing refresh
const result = await pool.query(`
SELECT d.id
FROM dispensaries d
JOIN states s ON d.state_id = s.id
WHERE d.crawl_enabled = true
AND d.platform_dispensary_id IS NOT NULL
AND s.code = $1
-- No pending/running product_discovery task already
AND NOT EXISTS (
SELECT 1 FROM worker_tasks t
WHERE t.dispensary_id = d.id
AND t.role = 'product_discovery'
AND t.status IN ('pending', 'claimed', 'running')
)
-- Never fetched OR last fetch > interval ago
AND (
d.last_fetch_at IS NULL
OR d.last_fetch_at < NOW() - ($2 || ' hours')::interval
)
ORDER BY d.last_fetch_at NULLS FIRST, d.id
`, [schedule.state_code, schedule.interval_hours]);
const dispensaryIds = result.rows.map((r: { id: number }) => r.id);
if (dispensaryIds.length === 0) {
console.log(`[TaskScheduler] No stores in ${schedule.state_code} need refresh`);
return 0;
}
console.log(`[TaskScheduler] Creating ${dispensaryIds.length} product_discovery tasks for ${schedule.state_code}`);
// Create product_discovery tasks with HTTP transport
// Stagger by 15 seconds to prevent overwhelming proxies
const { created } = await taskService.createStaggeredTasks(
dispensaryIds,
'product_discovery',
15, // 15 seconds apart
schedule.platform || 'dutchie',
'http' // Force HTTP transport
);
return created;
}
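// Illustrative effect of the 15s stagger (assuming createStaggeredTasks spaces
// eligibility by the given step): 40 stores -> the last task becomes claimable
// roughly 10 minutes after the first, smoothing load on the proxy pool.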
/**
* Generate store_discovery tasks
* Uses HTTP transport (Puppeteer/browser) for browser-based discovery
*/
private async generateStoreDiscoveryTasks(schedule: TaskSchedule): Promise<number> {
// Check if discovery task already pending
const existing = await taskService.listTasks({
role: 'store_discovery',
status: ['pending', 'claimed', 'running'],
limit: 1,
});
if (existing.length > 0) {
console.log('[TaskScheduler] Store discovery task already pending, skipping');
return 0;
}
await taskService.createTask({
role: 'store_discovery',
platform: schedule.platform || 'dutchie',
priority: schedule.priority,
method: 'http', // Force HTTP transport for browser-based discovery
});
return 1;
}
/**
* Generate analytics_refresh tasks
* Per TASK_WORKFLOW_2024-12-10.md: Single task to refresh all MVs
*/
private async generateAnalyticsRefreshTasks(schedule: TaskSchedule): Promise<number> {
// Check if analytics task already pending
const existing = await taskService.listTasks({
role: 'analytics_refresh',
status: ['pending', 'claimed', 'running'],
limit: 1,
});
if (existing.length > 0) {
console.log('[TaskScheduler] Analytics refresh task already pending, skipping');
return 0;
}
await taskService.createTask({
role: 'analytics_refresh',
priority: schedule.priority,
});
return 1;
}
/**
* Get all schedules for dashboard display
* Returns schedules with full metadata including immutability flag
*/
async getSchedules(): Promise<TaskSchedule[]> {
try {
const result = await pool.query(`
SELECT
id,
name,
role,
enabled,
interval_hours,
last_run_at,
next_run_at,
state_code,
priority,
method,
COALESCE(is_immutable, false) as is_immutable,
description,
platform,
last_task_count,
last_error,
created_at,
updated_at
FROM task_schedules
ORDER BY
CASE role
WHEN 'store_discovery' THEN 1
WHEN 'product_discovery' THEN 2
WHEN 'analytics_refresh' THEN 3
ELSE 4
END,
state_code NULLS FIRST,
name
`);
return result.rows as TaskSchedule[];
} catch {
return [];
}
}
/**
* Get a single schedule by ID
*/
async getSchedule(id: number): Promise<TaskSchedule | null> {
try {
const result = await pool.query(`
SELECT * FROM task_schedules WHERE id = $1
`, [id]);
return result.rows[0] as TaskSchedule || null;
} catch {
return null;
}
}
/**
* Update a schedule
* Allows updating: enabled, interval_hours, priority
* Does NOT allow updating: name, role, state_code, is_immutable
*/
async updateSchedule(id: number, updates: Partial<TaskSchedule>): Promise<void> {
const setClauses: string[] = [];
const values: any[] = [];
let paramIndex = 1;
if (updates.enabled !== undefined) {
setClauses.push(`enabled = $${paramIndex++}`);
values.push(updates.enabled);
}
if (updates.interval_hours !== undefined) {
setClauses.push(`interval_hours = $${paramIndex++}`);
values.push(updates.interval_hours);
}
if (updates.priority !== undefined) {
setClauses.push(`priority = $${paramIndex++}`);
values.push(updates.priority);
}
if (setClauses.length === 0) return;
setClauses.push('updated_at = NOW()');
values.push(id);
await pool.query(`
UPDATE task_schedules
SET ${setClauses.join(', ')}
WHERE id = $${paramIndex}
`, values);
}
/**
* Delete a schedule (only if not immutable)
* Returns true if deleted, false if immutable
*/
async deleteSchedule(id: number): Promise<{ deleted: boolean; reason?: string }> {
// Check if schedule is immutable
const result = await pool.query(`
SELECT name, is_immutable FROM task_schedules WHERE id = $1
`, [id]);
if (result.rows.length === 0) {
return { deleted: false, reason: 'Schedule not found' };
}
const schedule = result.rows[0];
if (schedule.is_immutable) {
return {
deleted: false,
reason: `Schedule "${schedule.name}" is immutable and cannot be deleted. You can disable it instead.`
};
}
await pool.query(`DELETE FROM task_schedules WHERE id = $1`, [id]);
return { deleted: true };
}
/**
* Trigger a schedule to run immediately
*/
async triggerSchedule(id: number): Promise<number> {
const result = await pool.query(`
SELECT * FROM task_schedules WHERE id = $1
`, [id]);
if (result.rows.length === 0) {
throw new Error(`Schedule ${id} not found`);
}
return this.executeSchedule(result.rows[0] as TaskSchedule);
}
/**
* Get schedule statistics for dashboard
*/
async getScheduleStats(): Promise<{
total: number;
enabled: number;
byRole: Record<string, number>;
byState: Record<string, number>;
}> {
try {
const result = await pool.query(`
SELECT
COUNT(*)::int as total,
SUM(CASE WHEN enabled THEN 1 ELSE 0 END)::int as enabled_count,
role,
state_code
FROM task_schedules
GROUP BY role, state_code
`);
let total = 0;
let enabled = 0;
const byRole: Record<string, number> = {};
const byState: Record<string, number> = {};
for (const row of result.rows) {
total += row.total;
enabled += row.enabled_count;
byRole[row.role] = (byRole[row.role] || 0) + row.total;
if (row.state_code) {
byState[row.state_code] = (byState[row.state_code] || 0) + row.total;
}
}
return { total, enabled, byRole, byState };
} catch {
return { total: 0, enabled: 0, byRole: {}, byState: {} };
}
}
}
// Per TASK_WORKFLOW_2024-12-10.md: Singleton instance
export const taskScheduler = new TaskScheduler();
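// Illustrative startup wiring (hypothetical server entry point):
//
//   import { taskScheduler } from './services/task-scheduler';
//   await taskScheduler.start(); // recovers stale tasks, seeds defaults, polls every 60s
//   process.on('SIGTERM', () => taskScheduler.stop());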

View File

@@ -1,30 +1,566 @@
/**
 * System API Routes (Stub)
 *
 * The full system routes depend on SyncOrchestrator which was moved to _deprecated.
 * This stub provides empty routers to maintain backward compatibility.
 *
 * Full implementation available at: src/_deprecated/system/routes/index.ts
 */
/**
 * System API Routes
 *
 * Provides REST API endpoints for system monitoring and control:
 * - /api/system/sync/* - Sync orchestrator
 * - /api/system/dlq/* - Dead-letter queue
 * - /api/system/integrity/* - Integrity checks
 * - /api/system/fix/* - Auto-fix routines
 * - /api/system/alerts/* - System alerts
 * - /metrics - Prometheus metrics
 *
 * Phase 5: Full Production Sync + Monitoring
 */
import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import { MetricsService } from '../services';
import {
  SyncOrchestrator,
  MetricsService,
  DLQService,
  AlertService,
  IntegrityService,
  AutoFixService,
} from '../services';
export function createSystemRouter(_pool: Pool): Router {
export function createSystemRouter(pool: Pool): Router {
  const router = Router();
  // Stub - full sync/dlq/integrity/fix/alerts routes moved to _deprecated
  router.get('/status', (_req: Request, res: Response) => {
    res.json({
      message: 'System routes temporarily disabled - see _deprecated/system/routes',
      status: 'stub',
    });
  });
  // Initialize services
  const metrics = new MetricsService(pool);
  const dlq = new DLQService(pool);
  const alerts = new AlertService(pool);
  const integrity = new IntegrityService(pool, alerts);
  const autoFix = new AutoFixService(pool, alerts);
  const orchestrator = new SyncOrchestrator(pool, metrics, dlq, alerts);
// ============================================================
// SYNC ORCHESTRATOR ENDPOINTS
// ============================================================
/**
* GET /api/system/sync/status
* Get current sync status
*/
router.get('/sync/status', async (_req: Request, res: Response) => {
try {
const status = await orchestrator.getStatus();
res.json(status);
} catch (error) {
console.error('[System] Sync status error:', error);
res.status(500).json({ error: 'Failed to get sync status' });
}
  });
/**
* POST /api/system/sync/run
* Trigger a sync run
*/
router.post('/sync/run', async (req: Request, res: Response) => {
try {
const triggeredBy = req.body.triggeredBy || 'api';
const result = await orchestrator.runSync();
res.json({
success: true,
triggeredBy,
metrics: result,
});
} catch (error) {
console.error('[System] Sync run error:', error);
res.status(500).json({
success: false,
error: error instanceof Error ? error.message : 'Sync run failed',
});
}
});
/**
* GET /api/system/sync/queue-depth
* Get queue depth information
*/
router.get('/sync/queue-depth', async (_req: Request, res: Response) => {
try {
const depth = await orchestrator.getQueueDepth();
res.json(depth);
} catch (error) {
console.error('[System] Queue depth error:', error);
res.status(500).json({ error: 'Failed to get queue depth' });
}
});
/**
* GET /api/system/sync/health
* Get sync health status
*/
router.get('/sync/health', async (_req: Request, res: Response) => {
try {
const health = await orchestrator.getHealth();
res.status(health.healthy ? 200 : 503).json(health);
} catch (error) {
console.error('[System] Health check error:', error);
res.status(500).json({ healthy: false, error: 'Health check failed' });
}
});
/**
* POST /api/system/sync/pause
* Pause the orchestrator
*/
router.post('/sync/pause', async (req: Request, res: Response) => {
try {
const reason = req.body.reason || 'Manual pause';
await orchestrator.pause(reason);
res.json({ success: true, message: 'Orchestrator paused' });
} catch (error) {
console.error('[System] Pause error:', error);
res.status(500).json({ error: 'Failed to pause orchestrator' });
}
});
/**
* POST /api/system/sync/resume
* Resume the orchestrator
*/
router.post('/sync/resume', async (_req: Request, res: Response) => {
try {
await orchestrator.resume();
res.json({ success: true, message: 'Orchestrator resumed' });
} catch (error) {
console.error('[System] Resume error:', error);
res.status(500).json({ error: 'Failed to resume orchestrator' });
}
});
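  // Illustrative client for the sync endpoints above (not part of the original
  // file; the base URL is an assumption):
  //
  //   const base = 'http://localhost:3000/api/system';
  //   await fetch(`${base}/sync/pause`, {
  //     method: 'POST',
  //     headers: { 'Content-Type': 'application/json' },
  //     body: JSON.stringify({ reason: 'maintenance window' }),
  //   });
  //   const health = await fetch(`${base}/sync/health`); // 200 when healthy, 503 otherwise
  //   await fetch(`${base}/sync/resume`, { method: 'POST' });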
// ============================================================
// DLQ ENDPOINTS
// ============================================================
/**
* GET /api/system/dlq
* List DLQ payloads
*/
router.get('/dlq', async (req: Request, res: Response) => {
try {
const options = {
status: req.query.status as string,
errorType: req.query.errorType as string,
dispensaryId: req.query.dispensaryId ? parseInt(req.query.dispensaryId as string) : undefined,
limit: req.query.limit ? parseInt(req.query.limit as string) : 50,
offset: req.query.offset ? parseInt(req.query.offset as string) : 0,
};
const result = await dlq.listPayloads(options);
res.json(result);
} catch (error) {
console.error('[System] DLQ list error:', error);
res.status(500).json({ error: 'Failed to list DLQ payloads' });
}
});
/**
* GET /api/system/dlq/stats
* Get DLQ statistics
*/
router.get('/dlq/stats', async (_req: Request, res: Response) => {
try {
const stats = await dlq.getStats();
res.json(stats);
} catch (error) {
console.error('[System] DLQ stats error:', error);
res.status(500).json({ error: 'Failed to get DLQ stats' });
}
});
/**
* GET /api/system/dlq/summary
* Get DLQ summary by error type
*/
router.get('/dlq/summary', async (_req: Request, res: Response) => {
try {
const summary = await dlq.getSummary();
res.json(summary);
} catch (error) {
console.error('[System] DLQ summary error:', error);
res.status(500).json({ error: 'Failed to get DLQ summary' });
}
});
/**
* GET /api/system/dlq/:id
* Get a specific DLQ payload
*/
router.get('/dlq/:id', async (req: Request, res: Response) => {
try {
const payload = await dlq.getPayload(req.params.id);
if (!payload) {
return res.status(404).json({ error: 'Payload not found' });
}
res.json(payload);
} catch (error) {
console.error('[System] DLQ get error:', error);
res.status(500).json({ error: 'Failed to get DLQ payload' });
}
});
/**
* POST /api/system/dlq/:id/retry
* Retry a DLQ payload
*/
router.post('/dlq/:id/retry', async (req: Request, res: Response) => {
try {
const result = await dlq.retryPayload(req.params.id);
if (result.success) {
res.json(result);
} else {
res.status(400).json(result);
}
} catch (error) {
console.error('[System] DLQ retry error:', error);
res.status(500).json({ error: 'Failed to retry payload' });
}
});
/**
* POST /api/system/dlq/:id/abandon
* Abandon a DLQ payload
*/
router.post('/dlq/:id/abandon', async (req: Request, res: Response) => {
try {
const reason = req.body.reason || 'Manually abandoned';
const abandonedBy = req.body.abandonedBy || 'api';
const success = await dlq.abandonPayload(req.params.id, reason, abandonedBy);
res.json({ success });
} catch (error) {
console.error('[System] DLQ abandon error:', error);
res.status(500).json({ error: 'Failed to abandon payload' });
}
});
/**
* POST /api/system/dlq/bulk-retry
* Bulk retry payloads by error type
*/
router.post('/dlq/bulk-retry', async (req: Request, res: Response) => {
try {
const { errorType } = req.body;
if (!errorType) {
return res.status(400).json({ error: 'errorType is required' });
}
const result = await dlq.bulkRetryByErrorType(errorType);
res.json(result);
} catch (error) {
console.error('[System] DLQ bulk retry error:', error);
res.status(500).json({ error: 'Failed to bulk retry' });
}
});
// ============================================================
// INTEGRITY CHECK ENDPOINTS
// ============================================================
/**
* POST /api/system/integrity/run
* Run all integrity checks
*/
router.post('/integrity/run', async (req: Request, res: Response) => {
try {
const triggeredBy = req.body.triggeredBy || 'api';
const result = await integrity.runAllChecks(triggeredBy);
res.json(result);
} catch (error) {
console.error('[System] Integrity run error:', error);
res.status(500).json({ error: 'Failed to run integrity checks' });
}
});
/**
* GET /api/system/integrity/runs
* Get recent integrity check runs
*/
router.get('/integrity/runs', async (req: Request, res: Response) => {
try {
const limit = req.query.limit ? parseInt(req.query.limit as string) : 10;
const runs = await integrity.getRecentRuns(limit);
res.json(runs);
} catch (error) {
console.error('[System] Integrity runs error:', error);
res.status(500).json({ error: 'Failed to get integrity runs' });
}
});
/**
* GET /api/system/integrity/runs/:runId
* Get results for a specific integrity run
*/
router.get('/integrity/runs/:runId', async (req: Request, res: Response) => {
try {
const results = await integrity.getRunResults(req.params.runId);
res.json(results);
} catch (error) {
console.error('[System] Integrity run results error:', error);
res.status(500).json({ error: 'Failed to get run results' });
}
});
// ============================================================
// AUTO-FIX ENDPOINTS
// ============================================================
/**
* GET /api/system/fix/routines
* Get available fix routines
*/
router.get('/fix/routines', (_req: Request, res: Response) => {
try {
const routines = autoFix.getAvailableRoutines();
res.json(routines);
} catch (error) {
console.error('[System] Get routines error:', error);
res.status(500).json({ error: 'Failed to get routines' });
}
});
/**
* POST /api/system/fix/:routine
* Run a fix routine
*/
router.post('/fix/:routine', async (req: Request, res: Response) => {
try {
const routineName = req.params.routine;
const dryRun = req.body.dryRun === true;
const triggeredBy = req.body.triggeredBy || 'api';
const result = await autoFix.runRoutine(routineName as any, triggeredBy, { dryRun });
res.json(result);
} catch (error) {
console.error('[System] Fix routine error:', error);
res.status(500).json({ error: 'Failed to run fix routine' });
}
});
/**
* GET /api/system/fix/runs
* Get recent fix runs
*/
router.get('/fix/runs', async (req: Request, res: Response) => {
try {
const limit = req.query.limit ? parseInt(req.query.limit as string) : 20;
const runs = await autoFix.getRecentRuns(limit);
res.json(runs);
} catch (error) {
console.error('[System] Fix runs error:', error);
res.status(500).json({ error: 'Failed to get fix runs' });
}
});
// ============================================================
// ALERTS ENDPOINTS
// ============================================================
/**
* GET /api/system/alerts
* List alerts
*/
router.get('/alerts', async (req: Request, res: Response) => {
try {
const options = {
status: req.query.status as any,
severity: req.query.severity as any,
type: req.query.type as string,
limit: req.query.limit ? parseInt(req.query.limit as string) : 50,
offset: req.query.offset ? parseInt(req.query.offset as string) : 0,
};
const result = await alerts.listAlerts(options);
res.json(result);
} catch (error) {
console.error('[System] Alerts list error:', error);
res.status(500).json({ error: 'Failed to list alerts' });
}
});
/**
* GET /api/system/alerts/active
* Get active alerts
*/
router.get('/alerts/active', async (_req: Request, res: Response) => {
try {
const activeAlerts = await alerts.getActiveAlerts();
res.json(activeAlerts);
} catch (error) {
console.error('[System] Active alerts error:', error);
res.status(500).json({ error: 'Failed to get active alerts' });
}
});
/**
* GET /api/system/alerts/summary
* Get alert summary
*/
router.get('/alerts/summary', async (_req: Request, res: Response) => {
try {
const summary = await alerts.getSummary();
res.json(summary);
} catch (error) {
console.error('[System] Alerts summary error:', error);
res.status(500).json({ error: 'Failed to get alerts summary' });
}
});
/**
* POST /api/system/alerts/:id/acknowledge
* Acknowledge an alert
*/
router.post('/alerts/:id/acknowledge', async (req: Request, res: Response) => {
try {
const alertId = parseInt(req.params.id);
const acknowledgedBy = req.body.acknowledgedBy || 'api';
const success = await alerts.acknowledgeAlert(alertId, acknowledgedBy);
res.json({ success });
} catch (error) {
console.error('[System] Acknowledge alert error:', error);
res.status(500).json({ error: 'Failed to acknowledge alert' });
}
});
/**
* POST /api/system/alerts/:id/resolve
* Resolve an alert
*/
router.post('/alerts/:id/resolve', async (req: Request, res: Response) => {
try {
const alertId = parseInt(req.params.id);
const resolvedBy = req.body.resolvedBy || 'api';
const success = await alerts.resolveAlert(alertId, resolvedBy);
res.json({ success });
} catch (error) {
console.error('[System] Resolve alert error:', error);
res.status(500).json({ error: 'Failed to resolve alert' });
}
});
/**
* POST /api/system/alerts/bulk-acknowledge
* Bulk acknowledge alerts
*/
router.post('/alerts/bulk-acknowledge', async (req: Request, res: Response) => {
try {
const { ids, acknowledgedBy } = req.body;
if (!ids || !Array.isArray(ids)) {
return res.status(400).json({ error: 'ids array is required' });
}
const count = await alerts.bulkAcknowledge(ids, acknowledgedBy || 'api');
res.json({ acknowledged: count });
} catch (error) {
console.error('[System] Bulk acknowledge error:', error);
res.status(500).json({ error: 'Failed to bulk acknowledge' });
}
});
// ============================================================
// METRICS ENDPOINTS
// ============================================================
/**
* GET /api/system/metrics
* Get all current metrics
*/
router.get('/metrics', async (_req: Request, res: Response) => {
try {
const allMetrics = await metrics.getAllMetrics();
res.json(allMetrics);
} catch (error) {
console.error('[System] Metrics error:', error);
res.status(500).json({ error: 'Failed to get metrics' });
}
});
/**
* GET /api/system/metrics/:name
* Get a specific metric
*/
router.get('/metrics/:name', async (req: Request, res: Response) => {
try {
const metric = await metrics.getMetric(req.params.name);
if (!metric) {
return res.status(404).json({ error: 'Metric not found' });
}
res.json(metric);
} catch (error) {
console.error('[System] Metric error:', error);
res.status(500).json({ error: 'Failed to get metric' });
}
});
/**
* GET /api/system/metrics/:name/history
* Get metric time series
*/
router.get('/metrics/:name/history', async (req: Request, res: Response) => {
try {
const hours = req.query.hours ? parseInt(req.query.hours as string) : 24;
const history = await metrics.getMetricHistory(req.params.name, hours);
res.json(history);
} catch (error) {
console.error('[System] Metric history error:', error);
res.status(500).json({ error: 'Failed to get metric history' });
}
});
/**
* GET /api/system/errors
* Get error summary
*/
router.get('/errors', async (_req: Request, res: Response) => {
try {
const summary = await metrics.getErrorSummary();
res.json(summary);
} catch (error) {
console.error('[System] Error summary error:', error);
res.status(500).json({ error: 'Failed to get error summary' });
}
});
/**
* GET /api/system/errors/recent
* Get recent errors
*/
router.get('/errors/recent', async (req: Request, res: Response) => {
try {
const limit = req.query.limit ? parseInt(req.query.limit as string) : 50;
const errorType = req.query.type as string;
const errors = await metrics.getRecentErrors(limit, errorType);
res.json(errors);
} catch (error) {
console.error('[System] Recent errors error:', error);
res.status(500).json({ error: 'Failed to get recent errors' });
}
});
/**
* POST /api/system/errors/acknowledge
* Acknowledge errors
*/
router.post('/errors/acknowledge', async (req: Request, res: Response) => {
try {
const { ids, acknowledgedBy } = req.body;
if (!ids || !Array.isArray(ids)) {
return res.status(400).json({ error: 'ids array is required' });
}
const count = await metrics.acknowledgeErrors(ids, acknowledgedBy || 'api');
res.json({ acknowledged: count });
} catch (error) {
console.error('[System] Acknowledge errors error:', error);
res.status(500).json({ error: 'Failed to acknowledge errors' });
}
  });
  return router;
}
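// Illustrative wiring in an Express app (not from this diff; app setup and the
// mount path are assumptions):
//   import express from 'express';
//   const app = express();
//   app.use(express.json());
//   app.use('/api/system', createSystemRouter(pool));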
/**
* Create Prometheus metrics endpoint (standalone)
*/
export function createPrometheusRouter(pool: Pool): Router {
  const router = Router();
  const metrics = new MetricsService(pool);

View File

@@ -4,7 +4,7 @@
 * Phase 5: Full Production Sync + Monitoring
 */
// SyncOrchestrator moved to _deprecated (depends on hydration module)
export { SyncOrchestrator, type SyncStatus, type QueueDepth, type SyncRunMetrics, type OrchestratorStatus } from './sync-orchestrator';
export { MetricsService, ERROR_TYPES, type Metric, type MetricTimeSeries, type ErrorBucket, type ErrorType } from './metrics';
export { DLQService, type DLQPayload, type DLQStats } from './dlq';
export { AlertService, type SystemAlert, type AlertSummary, type AlertSeverity, type AlertStatus } from './alerts';

View File

@@ -94,8 +94,7 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
    // ============================================================
    // STEP 3: Start stealth session
    // ============================================================
    // Per workflow-12102025.md: session identity comes from proxy location, not task params
    const session = startSession();
    const session = startSession(dispensary.state || 'AZ', 'America/Phoenix');
    console.log(`[EntryPointDiscovery] Session started: ${session.sessionId}`);
    try {

View File

@@ -2,18 +2,10 @@
 * Task Handlers Index
 *
 * Exports all task handlers for the task worker.
 *
 * Product Discovery:
 * - handleProductDiscoveryCurl: curl/axios based (for curl transport)
 * - handleProductDiscoveryHttp: Puppeteer browser-based (for http transport)
 */
export { handleProductDiscovery as handleProductDiscoveryCurl } from './product-discovery-curl';
export { handleProductDiscoveryHttp } from './product-discovery-http';
export { handlePayloadFetch as handlePayloadFetchCurl } from './payload-fetch-curl';
export { handleProductRefresh } from './product-refresh';
export { handleProductDiscovery } from './product-discovery';
export { handleStoreDiscovery } from './store-discovery';
export { handleStoreDiscoveryHttp } from './store-discovery-http';
export { handleEntryPointDiscovery } from './entry-point-discovery';
export { handleAnalyticsRefresh } from './analytics-refresh';
export { handleWhoami } from './whoami';

View File

@@ -1,221 +0,0 @@
/**
* Payload Fetch Handler
*
* Per TASK_WORKFLOW_2024-12-10.md: Separates API fetch from data processing.
*
* This handler ONLY:
* 1. Hits Dutchie GraphQL API
* 2. Saves raw payload to filesystem (gzipped)
* 3. Records metadata in raw_crawl_payloads table
* 4. Queues a product_refresh task to process the payload
*
* Benefits of separation:
* - Retry-friendly: If normalize fails, re-run refresh without re-crawling
* - Faster refreshes: Local file read vs network call
* - Replay-able: Run refresh against any historical payload
* - Less API pressure: Only this role hits Dutchie
*/
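// Illustrative chain (summarizing the docblock above; not part of the original file):
//   payload_fetch -> saveRawPayload()                            (API -> gzipped file + metadata row)
//                 -> createTask('product_refresh', { payload_id }) (file -> DB upsert)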
import { TaskContext, TaskResult } from '../task-worker';
import {
executeGraphQL,
startSession,
endSession,
GRAPHQL_HASHES,
DUTCHIE_CONFIG,
} from '../../platforms/dutchie';
import { saveRawPayload } from '../../utils/payload-storage';
import { taskService } from '../task-service';
export async function handlePayloadFetch(ctx: TaskContext): Promise<TaskResult> {
const { pool, task } = ctx;
const dispensaryId = task.dispensary_id;
if (!dispensaryId) {
return { success: false, error: 'No dispensary_id specified for payload_fetch task' };
}
try {
// ============================================================
// STEP 1: Load dispensary info
// ============================================================
const dispResult = await pool.query(`
SELECT
id, name, platform_dispensary_id, menu_url, menu_type, city, state
FROM dispensaries
WHERE id = $1 AND crawl_enabled = true
`, [dispensaryId]);
if (dispResult.rows.length === 0) {
return { success: false, error: `Dispensary ${dispensaryId} not found or not crawl_enabled` };
}
const dispensary = dispResult.rows[0];
const platformId = dispensary.platform_dispensary_id;
if (!platformId) {
return { success: false, error: `Dispensary ${dispensaryId} has no platform_dispensary_id` };
}
// Extract cName from menu_url
const cNameMatch = dispensary.menu_url?.match(/\/(?:embedded-menu|dispensary)\/([^/?]+)/);
const cName = cNameMatch ? cNameMatch[1] : 'dispensary';
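    // e.g. a menu_url like https://dutchie.com/embedded-menu/green-leaf-phx
    // yields cName "green-leaf-phx" (illustrative URL, not from the diff)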
console.log(`[PayloadFetch] Starting fetch for ${dispensary.name} (ID: ${dispensaryId})`);
console.log(`[PayloadFetch] Platform ID: ${platformId}, cName: ${cName}`);
// ============================================================
// STEP 2: Start stealth session
// ============================================================
const session = startSession();
console.log(`[PayloadFetch] Session started: ${session.sessionId}`);
await ctx.heartbeat();
// ============================================================
// STEP 3: Fetch products via GraphQL (Status: 'All')
// ============================================================
const allProducts: any[] = [];
let page = 0;
let totalCount = 0;
const perPage = DUTCHIE_CONFIG.perPage;
const maxPages = DUTCHIE_CONFIG.maxPages;
try {
while (page < maxPages) {
const variables = {
includeEnterpriseSpecials: false,
productsFilter: {
dispensaryId: platformId,
pricingType: 'rec',
Status: 'All',
types: [],
useCache: false,
isDefaultSort: true,
sortBy: 'popularSortIdx',
sortDirection: 1,
bypassOnlineThresholds: true,
isKioskMenu: false,
removeProductsBelowOptionThresholds: false,
},
page,
perPage,
};
console.log(`[PayloadFetch] Fetching page ${page + 1}...`);
const result = await executeGraphQL(
'FilteredProducts',
variables,
GRAPHQL_HASHES.FilteredProducts,
{ cName, maxRetries: 3 }
);
const data = result?.data?.filteredProducts;
if (!data || !data.products) {
if (page === 0) {
throw new Error('No product data returned from GraphQL');
}
break;
}
const products = data.products;
allProducts.push(...products);
if (page === 0) {
totalCount = data.queryInfo?.totalCount || products.length;
console.log(`[PayloadFetch] Total products reported: ${totalCount}`);
}
if (allProducts.length >= totalCount || products.length < perPage) {
break;
}
page++;
if (page < maxPages) {
await new Promise(r => setTimeout(r, DUTCHIE_CONFIG.pageDelayMs));
}
if (page % 5 === 0) {
await ctx.heartbeat();
}
}
console.log(`[PayloadFetch] Fetched ${allProducts.length} products in ${page + 1} pages`);
} finally {
endSession();
}
if (allProducts.length === 0) {
return {
success: false,
error: 'No products returned from GraphQL',
productsProcessed: 0,
};
}
await ctx.heartbeat();
// ============================================================
// STEP 4: Save raw payload to filesystem
// Per TASK_WORKFLOW_2024-12-10.md: Metadata/Payload separation
// ============================================================
const rawPayload = {
dispensaryId,
platformId,
cName,
fetchedAt: new Date().toISOString(),
productCount: allProducts.length,
products: allProducts,
};
const payloadResult = await saveRawPayload(
pool,
dispensaryId,
rawPayload,
null, // crawl_run_id - not using crawl_runs in new system
allProducts.length
);
console.log(`[PayloadFetch] Saved payload #${payloadResult.id} (${(payloadResult.sizeBytes / 1024).toFixed(1)}KB)`);
// ============================================================
// STEP 5: Update dispensary last_fetch_at
// ============================================================
await pool.query(`
UPDATE dispensaries
SET last_fetch_at = NOW()
WHERE id = $1
`, [dispensaryId]);
// ============================================================
// STEP 6: Queue product_refresh task to process the payload
// Per TASK_WORKFLOW_2024-12-10.md: Task chaining
// ============================================================
await taskService.createTask({
role: 'product_refresh',
dispensary_id: dispensaryId,
priority: task.priority || 0,
payload: { payload_id: payloadResult.id },
});
console.log(`[PayloadFetch] Queued product_refresh task for payload #${payloadResult.id}`);
return {
success: true,
payloadId: payloadResult.id,
productCount: allProducts.length,
sizeBytes: payloadResult.sizeBytes,
};
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
console.error(`[PayloadFetch] Error for dispensary ${dispensaryId}:`, errorMessage);
return {
success: false,
error: errorMessage,
};
}
}

View File

@@ -1,31 +0,0 @@
/**
* Product Discovery Handler
*
* Per TASK_WORKFLOW_2024-12-10.md: Initial product fetch for newly discovered stores.
*
* Flow:
* 1. Triggered after store_discovery promotes a new dispensary
* 2. Chains to payload_fetch to get initial product data
* 3. payload_fetch chains to product_refresh for DB upsert
*
* Chaining:
* store_discovery → (newStoreIds) → product_discovery → payload_fetch → product_refresh
*/
import { TaskContext, TaskResult } from '../task-worker';
import { handlePayloadFetch } from './payload-fetch-curl';
export async function handleProductDiscovery(ctx: TaskContext): Promise<TaskResult> {
const { task } = ctx;
const dispensaryId = task.dispensary_id;
if (!dispensaryId) {
return { success: false, error: 'No dispensary_id provided' };
}
console.log(`[ProductDiscovery] Starting initial product discovery for dispensary ${dispensaryId}`);
// Per TASK_WORKFLOW_2024-12-10.md: Chain to payload_fetch for API → disk
// payload_fetch will then chain to product_refresh for disk → DB
return handlePayloadFetch(ctx);
}

Some files were not shown because too many files have changed in this diff.