From 824d48fd856778265c5f98412fb57c0ead478cc3 Mon Sep 17 00:00:00 2001 From: Kelly Date: Wed, 10 Dec 2025 23:12:09 -0700 Subject: [PATCH] fix: Add curl to Docker, add active flag to worker_tasks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Install curl in Docker container for Dutchie HTTP requests - Add 'active' column to worker_tasks (default false) to prevent accidental task execution on startup - Update task-service to only claim tasks where active=true 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- backend/Dockerfile | 3 +- backend/node_modules/.package-lock.json | 286 ++++++++++++++++++- backend/src/tasks/task-service.ts | 1 + workflow-12102025.md | 365 ++++++++++++++++++++++++ 4 files changed, 650 insertions(+), 5 deletions(-) create mode 100644 workflow-12102025.md diff --git a/backend/Dockerfile b/backend/Dockerfile index cab76207..2c301677 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -25,8 +25,9 @@ ENV APP_GIT_SHA=${APP_GIT_SHA} ENV APP_BUILD_TIME=${APP_BUILD_TIME} ENV CONTAINER_IMAGE_TAG=${CONTAINER_IMAGE_TAG} -# Install Chromium dependencies +# Install Chromium dependencies and curl for HTTP requests RUN apt-get update && apt-get install -y \ + curl \ chromium \ fonts-liberation \ libnss3 \ diff --git a/backend/node_modules/.package-lock.json b/backend/node_modules/.package-lock.json index 8d3c398d..fc526ace 100644 --- a/backend/node_modules/.package-lock.json +++ b/backend/node_modules/.package-lock.json @@ -1,6 +1,6 @@ { "name": "dutchie-menus-backend", - "version": "1.5.1", + "version": "1.6.0", "lockfileVersion": 3, "requires": true, "packages": { @@ -46,6 +46,97 @@ "resolved": "https://registry.npmjs.org/@ioredis/commands/-/commands-1.4.0.tgz", "integrity": "sha512-aFT2yemJJo+TZCmieA7qnYGQooOS7QfNmYrzGtsYd3g9j5iDP8AimYYAesf79ohjbLG12XxC4nG5DyEnC88AsQ==" }, + "node_modules/@jsep-plugin/assignment": { + "version": "1.3.0", + "resolved": 
"https://registry.npmjs.org/@jsep-plugin/assignment/-/assignment-1.3.0.tgz", + "integrity": "sha512-VVgV+CXrhbMI3aSusQyclHkenWSAm95WaiKrMxRFam3JSUiIaQjoMIw2sEs/OX4XifnqeQUN4DYbJjlA8EfktQ==", + "engines": { + "node": ">= 10.16.0" + }, + "peerDependencies": { + "jsep": "^0.4.0||^1.0.0" + } + }, + "node_modules/@jsep-plugin/regex": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@jsep-plugin/regex/-/regex-1.0.4.tgz", + "integrity": "sha512-q7qL4Mgjs1vByCaTnDFcBnV9HS7GVPJX5vyVoCgZHNSC9rjwIlmbXG5sUuorR5ndfHAIlJ8pVStxvjXHbNvtUg==", + "engines": { + "node": ">= 10.16.0" + }, + "peerDependencies": { + "jsep": "^0.4.0||^1.0.0" + } + }, + "node_modules/@kubernetes/client-node": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/@kubernetes/client-node/-/client-node-1.4.0.tgz", + "integrity": "sha512-Zge3YvF7DJi264dU1b3wb/GmzR99JhUpqTvp+VGHfwZT+g7EOOYNScDJNZwXy9cszyIGPIs0VHr+kk8e95qqrA==", + "dependencies": { + "@types/js-yaml": "^4.0.1", + "@types/node": "^24.0.0", + "@types/node-fetch": "^2.6.13", + "@types/stream-buffers": "^3.0.3", + "form-data": "^4.0.0", + "hpagent": "^1.2.0", + "isomorphic-ws": "^5.0.0", + "js-yaml": "^4.1.0", + "jsonpath-plus": "^10.3.0", + "node-fetch": "^2.7.0", + "openid-client": "^6.1.3", + "rfc4648": "^1.3.0", + "socks-proxy-agent": "^8.0.4", + "stream-buffers": "^3.0.2", + "tar-fs": "^3.0.9", + "ws": "^8.18.2" + } + }, + "node_modules/@kubernetes/client-node/node_modules/@types/node": { + "version": "24.10.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.3.tgz", + "integrity": "sha512-gqkrWUsS8hcm0r44yn7/xZeV1ERva/nLgrLxFRUGb7aoNMIJfZJ3AC261zDQuOAKC7MiXai1WCpYc48jAHoShQ==", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "node_modules/@kubernetes/client-node/node_modules/tar-fs": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz", + "integrity": 
"sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==", + "dependencies": { + "pump": "^3.0.0", + "tar-stream": "^3.1.5" + }, + "optionalDependencies": { + "bare-fs": "^4.0.1", + "bare-path": "^3.0.0" + } + }, + "node_modules/@kubernetes/client-node/node_modules/undici-types": { + "version": "7.16.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", + "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==" + }, + "node_modules/@kubernetes/client-node/node_modules/ws": { + "version": "8.18.3", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz", + "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==", + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, "node_modules/@mapbox/node-pre-gyp": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz", @@ -251,6 +342,11 @@ "integrity": "sha512-r8Tayk8HJnX0FztbZN7oVqGccWgw98T/0neJphO91KkmOzug1KkofZURD4UaD5uH8AqcFLfdPErnBod0u71/qg==", "dev": true }, + "node_modules/@types/js-yaml": { + "version": "4.0.9", + "resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz", + "integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==" + }, "node_modules/@types/jsonwebtoken": { "version": "9.0.10", "resolved": "https://registry.npmjs.org/@types/jsonwebtoken/-/jsonwebtoken-9.0.10.tgz", @@ -276,7 +372,6 @@ "version": "20.19.25", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.25.tgz", "integrity": "sha512-ZsJzA5thDQMSQO788d7IocwwQbI8B5OPzmqNvpf3NY/+MHDAS759Wo0gd2WQeXYt5AAAQjzcrTVC6SKCuYgoCQ==", - 
"devOptional": true, "dependencies": { "undici-types": "~6.21.0" } @@ -287,6 +382,15 @@ "integrity": "sha512-0ikrnug3/IyneSHqCBeslAhlK2aBfYek1fGo4bP4QnZPmiqSGRK+Oy7ZMisLWkesffJvQ1cqAcBnJC+8+nxIAg==", "dev": true }, + "node_modules/@types/node-fetch": { + "version": "2.6.13", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.13.tgz", + "integrity": "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.4" + } + }, "node_modules/@types/pg": { "version": "8.15.6", "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.15.6.tgz", @@ -340,6 +444,14 @@ "@types/node": "*" } }, + "node_modules/@types/stream-buffers": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/@types/stream-buffers/-/stream-buffers-3.0.8.tgz", + "integrity": "sha512-J+7VaHKNvlNPJPEJXX/fKa9DZtR/xPMwuIbe+yNOwp1YB+ApUOBv2aUpEoBJEi8nJgbgs1x8e73ttg0r1rSUdw==", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/uuid": { "version": "9.0.8", "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz", @@ -520,6 +632,78 @@ } } }, + "node_modules/bare-fs": { + "version": "4.5.2", + "resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz", + "integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==", + "optional": true, + "dependencies": { + "bare-events": "^2.5.4", + "bare-path": "^3.0.0", + "bare-stream": "^2.6.4", + "bare-url": "^2.2.2", + "fast-fifo": "^1.3.2" + }, + "engines": { + "bare": ">=1.16.0" + }, + "peerDependencies": { + "bare-buffer": "*" + }, + "peerDependenciesMeta": { + "bare-buffer": { + "optional": true + } + } + }, + "node_modules/bare-os": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz", + "integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==", + 
"optional": true, + "engines": { + "bare": ">=1.14.0" + } + }, + "node_modules/bare-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz", + "integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==", + "optional": true, + "dependencies": { + "bare-os": "^3.0.1" + } + }, + "node_modules/bare-stream": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz", + "integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==", + "optional": true, + "dependencies": { + "streamx": "^2.21.0" + }, + "peerDependencies": { + "bare-buffer": "*", + "bare-events": "*" + }, + "peerDependenciesMeta": { + "bare-buffer": { + "optional": true + }, + "bare-events": { + "optional": true + } + } + }, + "node_modules/bare-url": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz", + "integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==", + "optional": true, + "dependencies": { + "bare-path": "^3.0.0" + } + }, "node_modules/base64-js": { "version": "1.5.1", "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", @@ -2019,6 +2203,14 @@ "node": ">=16.0.0" } }, + "node_modules/hpagent": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/hpagent/-/hpagent-1.2.0.tgz", + "integrity": "sha512-A91dYTeIB6NoXG+PxTQpCCDDnfHsW9kc06Lvpu1TEe9gnd6ZFeiBoRO9JvzEv6xK7EX97/dUE8g/vBMTqTS3CA==", + "engines": { + "node": ">=14" + } + }, "node_modules/htmlparser2": { "version": "10.0.0", "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.0.0.tgz", @@ -2382,6 +2574,22 @@ "node": ">=0.10.0" } }, + "node_modules/isomorphic-ws": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/isomorphic-ws/-/isomorphic-ws-5.0.0.tgz", + "integrity": 
"sha512-muId7Zzn9ywDsyXgTIafTry2sV3nySZeUDe6YedVd1Hvuuep5AsIlqK+XefWpYTyJG5e503F2xIuT2lcU6rCSw==", + "peerDependencies": { + "ws": "*" + } + }, + "node_modules/jose": { + "version": "6.1.3", + "resolved": "https://registry.npmjs.org/jose/-/jose-6.1.3.tgz", + "integrity": "sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==", + "funding": { + "url": "https://github.com/sponsors/panva" + } + }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -2398,6 +2606,14 @@ "js-yaml": "bin/js-yaml.js" } }, + "node_modules/jsep": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/jsep/-/jsep-1.4.0.tgz", + "integrity": "sha512-B7qPcEVE3NVkmSJbaYxvv4cHkVW7DQsZz13pUMrfS8z8Q/BuShN+gcTXrUlPiGqM2/t/EEaI030bpxMqY8gMlw==", + "engines": { + "node": ">= 10.16.0" + } + }, "node_modules/json-parse-even-better-errors": { "version": "2.3.1", "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz", @@ -2419,6 +2635,23 @@ "graceful-fs": "^4.1.6" } }, + "node_modules/jsonpath-plus": { + "version": "10.3.0", + "resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz", + "integrity": "sha512-8TNmfeTCk2Le33A3vRRwtuworG/L5RrgMvdjhKZxvyShO+mBu2fP50OWUjRLNtvw344DdDarFh9buFAZs5ujeA==", + "dependencies": { + "@jsep-plugin/assignment": "^1.3.0", + "@jsep-plugin/regex": "^1.0.4", + "jsep": "^1.4.0" + }, + "bin": { + "jsonpath": "bin/jsonpath-cli.js", + "jsonpath-plus": "bin/jsonpath-cli.js" + }, + "engines": { + "node": ">=18.0.0" + } + }, "node_modules/jsonwebtoken": { "version": "9.0.2", "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.2.tgz", @@ -2493,6 +2726,11 @@ "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==" }, + 
"node_modules/lodash.clonedeep": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.clonedeep/-/lodash.clonedeep-4.5.0.tgz", + "integrity": "sha512-H5ZhCF25riFd9uB5UCkVKo61m3S/xZk1x4wA6yp/L3RFP6Z/eHH1ymQcGLo7J3GMPfm0V/7m1tryHuGVxpqEBQ==" + }, "node_modules/lodash.defaults": { "version": "4.2.0", "resolved": "https://registry.npmjs.org/lodash.defaults/-/lodash.defaults-4.2.0.tgz", @@ -2942,6 +3180,14 @@ "url": "https://github.com/fb55/nth-check?sponsor=1" } }, + "node_modules/oauth4webapi": { + "version": "3.8.3", + "resolved": "https://registry.npmjs.org/oauth4webapi/-/oauth4webapi-3.8.3.tgz", + "integrity": "sha512-pQ5BsX3QRTgnt5HxgHwgunIRaDXBdkT23tf8dfzmtTIL2LTpdmxgbpbBm0VgFWAIDlezQvQCTgnVIUmHupXHxw==", + "funding": { + "url": "https://github.com/sponsors/panva" + } + }, "node_modules/object-assign": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", @@ -2980,6 +3226,18 @@ "wrappy": "1" } }, + "node_modules/openid-client": { + "version": "6.8.1", + "resolved": "https://registry.npmjs.org/openid-client/-/openid-client-6.8.1.tgz", + "integrity": "sha512-VoYT6enBo6Vj2j3Q5Ec0AezS+9YGzQo1f5Xc42lreMGlfP4ljiXPKVDvCADh+XHCV/bqPu/wWSiCVXbJKvrODw==", + "dependencies": { + "jose": "^6.1.0", + "oauth4webapi": "^3.8.2" + }, + "funding": { + "url": "https://github.com/sponsors/panva" + } + }, "node_modules/pac-proxy-agent": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz", @@ -3883,6 +4141,11 @@ "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" } }, + "node_modules/rfc4648": { + "version": "1.5.4", + "resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.4.tgz", + "integrity": "sha512-rRg/6Lb+IGfJqO05HZkN50UtY7K/JhxJag1kP23+zyMfrvoB0B7RWv06MbOzoc79RgCdNTiUaNsTT1AJZ7Z+cg==" + }, "node_modules/rimraf": { "version": "3.0.2", "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", @@ -4313,6 +4576,14 @@ "node": 
">= 0.8" } }, + "node_modules/stream-buffers": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/stream-buffers/-/stream-buffers-3.0.3.tgz", + "integrity": "sha512-pqMqwQCso0PBJt2PQmDO0cFj0lyqmiwOMiMSkVtRokl7e+ZTRYgDHKnuZNbqjiJXgsg4nuqtD/zxuo9KqTp0Yw==", + "engines": { + "node": ">= 0.10.0" + } + }, "node_modules/streamx": { "version": "2.23.0", "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz", @@ -4532,8 +4803,7 @@ "node_modules/undici-types": { "version": "6.21.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", - "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", - "devOptional": true + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==" }, "node_modules/universalify": { "version": "2.0.1", @@ -4556,6 +4826,14 @@ "resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz", "integrity": "sha512-H/A06tKD7sS1O1X2SshBVeA5FLycRpjqiBeqGKmBwBDBy28EnRjORxTNe269KSSr5un5qyWi1iL61wLxpd+ZOg==" }, + "node_modules/user-agents": { + "version": "1.1.669", + "resolved": "https://registry.npmjs.org/user-agents/-/user-agents-1.1.669.tgz", + "integrity": "sha512-pbIzG+AOqCaIpySKJ4IAm1l0VyE4jMnK4y1thV8lm8PYxI+7X5uWcppOK7zY79TCKKTAnJH3/4gaVIZHsjrmJA==", + "dependencies": { + "lodash.clonedeep": "^4.5.0" + } + }, "node_modules/util": { "version": "0.12.5", "resolved": "https://registry.npmjs.org/util/-/util-0.12.5.tgz", diff --git a/backend/src/tasks/task-service.ts b/backend/src/tasks/task-service.ts index 586ea5be..d9198690 100644 --- a/backend/src/tasks/task-service.ts +++ b/backend/src/tasks/task-service.ts @@ -170,6 +170,7 @@ class TaskService { WHERE id = ( SELECT id FROM worker_tasks WHERE status = 'pending' + AND active = true AND (scheduled_for IS NULL OR scheduled_for <= NOW()) -- Exclude stores that already have an active task AND (dispensary_id IS NULL 
OR dispensary_id NOT IN ( diff --git a/workflow-12102025.md b/workflow-12102025.md new file mode 100644 index 00000000..5e57c075 --- /dev/null +++ b/workflow-12102025.md @@ -0,0 +1,365 @@ +# Workflow Documentation - December 10, 2025 + +## Purpose + +This document captures the intended behavior for the CannaiQ crawl system, specifically around proxy rotation, fingerprinting, and anti-detection. + +--- + +## Stealth & Anti-Detection Requirements + +### 1. Task Determines Work, Proxy Determines Identity + +The task payload contains: +- `dispensary_id` - which store to crawl +- `role` - what type of work (product_resync, entry_point_discovery, etc.) + +The **proxy** determines the session identity: +- Proxy location (city, state, timezone) → sets the session timezone used when building headers +- Accept-Language is always English (`en-US,en;q=0.9`), regardless of proxy location + +**Flow:** +``` +Task claimed + │ + └─► Get proxy from rotation + │ + └─► Proxy has location (city, state, timezone) + │ + └─► Build headers using proxy's timezone + - Accept-Language: en-US,en;q=0.9 + - Timezone-consistent behavior +``` + +### 2. On 403 Block - Immediate Backoff + +When a 403 is received: + +1. **Immediately** stop using current IP +2. Get a new proxy (new IP) +3. Get a new UA/fingerprint +4. Retry the request + +**Per-proxy failure tracking:** +- Track UA rotation attempts per proxy +- After 3 UA/fingerprint rotations on the same proxy → disable that proxy +- This means: if we rotate UA 3 times and still get 403, the proxy is burned + +### 3. Fingerprint Rotation Rules + +Each request uses: +- Proxy (IP) +- User-Agent +- sec-ch-ua headers (Client Hints) +- Accept-Language (always `en-US,en;q=0.9`; set together with the proxy-derived session identity) + +On 403: +1. Record failure on current proxy +2. Rotate to new proxy +3. Pick new random fingerprint +4. If same proxy fails 3 times with different fingerprints → disable proxy + +### 4.
Proxy Table Schema + +```sql +CREATE TABLE proxies ( + id SERIAL PRIMARY KEY, + host VARCHAR(255) NOT NULL, + port INTEGER NOT NULL, + username VARCHAR(100), + password VARCHAR(100), + protocol VARCHAR(10) DEFAULT 'http', + active BOOLEAN DEFAULT true, + + -- Location (determines session headers) + city VARCHAR(100), + state VARCHAR(50), + country VARCHAR(100), + country_code VARCHAR(10), + timezone VARCHAR(50), + + -- Health tracking + failure_count INTEGER DEFAULT 0, + consecutive_403_count INTEGER DEFAULT 0, -- Track 403s specifically + last_used_at TIMESTAMPTZ, + last_failure_at TIMESTAMPTZ, + last_error TEXT, + + -- Performance + response_time_ms INTEGER, + max_connections INTEGER DEFAULT 1 +); +``` + +### 5. Failure Threshold + +- **3 consecutive 403s** with different fingerprints → disable proxy +- Reset `consecutive_403_count` to 0 on successful request +- General `failure_count` tracks all errors (timeouts, connection errors, etc.) + +--- + +## Implementation Status + +### COMPLETED - December 10, 2025 + +All code changes have been implemented per this specification: + +#### 1. crawl-rotator.ts ✅ + +- [x] Added `consecutive403Count` to Proxy interface +- [x] Added `markBlocked()` method that increments `consecutive_403_count` and disables proxy at 3 +- [x] Added `getProxyTimezone()` to return current proxy's timezone +- [x] `markSuccess()` now resets `consecutive_403_count` to 0 +- [x] Replaced hardcoded UA list with `intoli/user-agents` library for realistic fingerprints +- [x] `BrowserFingerprint` interface includes full fingerprint data (UA, platform, screen size, viewport, sec-ch-ua headers) + +#### 2. 
client.ts ✅ + +- [x] `startSession()` no longer takes state/timezone params +- [x] `startSession()` gets identity from proxy via `crawlRotator.getProxyLocation()` +- [x] Added `handle403Block()` that: + - Calls `crawlRotator.recordBlock()` (tracks consecutive 403s) + - Immediately rotates both proxy and fingerprint via `rotateBoth()` + - Returns false if no more proxies available +- [x] `executeGraphQL()` calls `handle403Block()` on 403 (not `rotateProxyOn403`) +- [x] `fetchPage()` uses same 403 handling +- [x] 500ms backoff after rotation (not linear delay) + +#### 3. Task Handlers ✅ + +- [x] `entry-point-discovery.ts`: `startSession()` called with no params +- [x] `product-refresh.ts`: `startSession()` called with no params + +#### 4. Dependencies ✅ + +- [x] Added `user-agents` npm package for realistic UA generation + +--- + +## Files Changed + +| File | Changes | +|------|---------| +| `backend/src/services/crawl-rotator.ts` | Complete rewrite with `consecutive403Count`, `markBlocked()`, `intoli/user-agents` | +| `backend/src/platforms/dutchie/client.ts` | `startSession()` uses proxy location, `handle403Block()` for 403 handling | +| `backend/src/tasks/handlers/entry-point-discovery.ts` | `startSession()` no params | +| `backend/src/tasks/handlers/product-refresh.ts` | `startSession()` no params | +| `backend/package.json` | Added `user-agents` dependency | + +--- + +## Migration Required + +The `proxies` table needs `consecutive_403_count` column if not already present: + +```sql +ALTER TABLE proxies ADD COLUMN IF NOT EXISTS consecutive_403_count INTEGER DEFAULT 0; +``` + +--- + +## Key Behaviors Summary + +| Behavior | Implementation | +|----------|----------------| +| Session identity | From proxy location (`getProxyLocation()`) | +| Language | Always `en-US,en;q=0.9` | +| 403 handling | `handle403Block()` → `recordBlock()` → `rotateBoth()` | +| Proxy disable | After 3 consecutive 403s (`consecutive403Count >= 3`) | +| Success reset | `markSuccess()` resets 
`consecutive403Count` to 0 | +| UA generation | `intoli/user-agents` library (daily updated, realistic fingerprints) | +| Fingerprint data | Full: UA, platform, screen size, viewport, sec-ch-ua headers | + +--- + +## User-Agent Generation + +### Data Source + +The `intoli/user-agents` npm library provides daily-updated market share data collected from Intoli's residential proxy network (millions of real users). The package auto-releases new versions daily to npm. + +### Device Category Distribution (hardcoded) + +| Category | Share | +|----------|-------| +| Mobile | 62% | +| Desktop | 36% | +| Tablet | 2% | + +### Browser Filter (whitelist only) + +Only these browsers are allowed: +- Chrome (67%) +- Safari (20%) +- Edge (6%) +- Firefox (3%) + +Samsung Internet, Opera, and other niche browsers are filtered out. + +### Desktop OS Distribution (from library) + +| OS | Share | +|----|-------| +| Windows | 72% | +| macOS | 17% | +| Linux | 4% | + +### UA Lifecycle + +1. **Session start** (new proxy IP obtained) → Roll device category (62/36/2) → Generate UA filtered to device + top 4 browsers → Store on session +2. **UA sticks** until IP rotates (403 block or manual rotation) +3. **IP rotation** triggers new UA generation + +### Failure Handling + +- If UA generation fails → Alert admin dashboard, **stop crawl immediately** +- No fallback to static UA list +- This forces investigation rather than silent degradation + +### Session Logging + +Each session logs: +- Device category (mobile/desktop/tablet) +- Full UA string +- Browser name (Chrome/Safari/Edge/Firefox) +- IP address (from proxy) +- Session start timestamp + +Logs are rotated monthly. 
+ +### Implementation + +Located in `backend/src/services/crawl-rotator.ts`: + +```typescript +// Per workflow-12102025.md: Device category distribution +const DEVICE_WEIGHTS = { mobile: 62, desktop: 36, tablet: 2 }; + +// Per workflow-12102025.md: Browser whitelist +const ALLOWED_BROWSERS = ['Chrome', 'Safari', 'Edge', 'Firefox']; +``` + +--- + +## HTTP Fingerprinting + +### Goal + +Make HTTP requests indistinguishable from real browser traffic. No repeatable footprint. + +### Components + +1. **Full Header Set** - All headers a real browser sends +2. **Header Ordering** - Browser-specific order (Chrome vs Firefox vs Safari) +3. **TLS Fingerprint** - Use `curl-impersonate` to match browser TLS signature +4. **Dynamic Referer** - Set per dispensary being crawled +5. **Natural Randomization** - Vary optional headers like real users + +### Required Headers + +| Header | Chrome | Firefox | Safari | Notes | +|--------|--------|---------|--------|-------| +| `User-Agent` | ✅ | ✅ | ✅ | From UA generation | +| `Accept` | ✅ | ✅ | ✅ | Content types | +| `Accept-Language` | ✅ | ✅ | ✅ | Always `en-US,en;q=0.9` | +| `Accept-Encoding` | ✅ | ✅ | ✅ | `gzip, deflate, br` | +| `Connection` | ✅ | ✅ | ✅ | `keep-alive` | +| `Origin` | ✅ | ✅ | ✅ | `https://dutchie.com` (POST only) | +| `Referer` | ✅ | ✅ | ✅ | Dynamic per dispensary | +| `sec-ch-ua` | ✅ | ❌ | ❌ | Chromium only | +| `sec-ch-ua-mobile` | ✅ | ❌ | ❌ | Chromium only | +| `sec-ch-ua-platform` | ✅ | ❌ | ❌ | Chromium only | +| `sec-fetch-dest` | ✅ | ✅ | ❌ | `empty` for XHR | +| `sec-fetch-mode` | ✅ | ✅ | ❌ | `cors` for XHR | +| `sec-fetch-site` | ✅ | ✅ | ❌ | `same-origin` | +| `Upgrade-Insecure-Requests` | ✅ | ✅ | ✅ | `1` (page loads only) | +| `DNT` | ~30% | ~30% | ~30% | Randomized per session | + +### Header Ordering + +Each browser sends headers in a specific order. Fingerprinting services detect mismatches. + +**Chrome order (GraphQL request):** +1. Host +2. Connection +3. Content-Length (POST) +4. sec-ch-ua +5. 
DNT (if enabled) +6. sec-ch-ua-mobile +7. User-Agent +8. sec-ch-ua-platform +9. Content-Type (POST) +10. Accept +11. Origin (POST) +12. sec-fetch-site +13. sec-fetch-mode +14. sec-fetch-dest +15. Referer +16. Accept-Encoding +17. Accept-Language + +**Firefox order (GraphQL request):** +1. Host +2. User-Agent +3. Accept +4. Accept-Language +5. Accept-Encoding +6. Content-Type (POST) +7. Content-Length (POST) +8. Origin (POST) +9. DNT (if enabled) +10. Connection +11. Referer +12. sec-fetch-dest +13. sec-fetch-mode +14. sec-fetch-site + +**Safari order (GraphQL request):** +1. Host +2. Connection +3. Content-Length (POST) +4. Accept +5. User-Agent +6. Content-Type (POST) +7. Origin (POST) +8. Referer +9. Accept-Encoding +10. Accept-Language + +### TLS Fingerprinting + +Use `curl-impersonate` instead of standard curl: +- `curl_chrome131` - Mimics Chrome 131 TLS handshake +- `curl_ff133` - Mimics Firefox 133 TLS handshake +- `curl_safari17` - Mimics Safari 17 TLS handshake + +Match the `curl-impersonate` binary to the browser family in the session's User-Agent. + +### Dynamic Referer + +Set Referer to the dispensary's actual page URL: + +``` +Crawling "harvest-of-tempe" → Referer: https://dutchie.com/dispensary/harvest-of-tempe +Crawling "zen-leaf-mesa" → Referer: https://dutchie.com/dispensary/zen-leaf-mesa +``` + +Derived from dispensary's `menu_url` field. + +### Natural Randomization + +Per-session randomization (set once when session starts, consistent for session): + +| Feature | Distribution | Implementation | +|---------|--------------|----------------| +| DNT header | 30% have it | `Math.random() < 0.30` | +| Accept quality values | Slight variation | `q=0.9` vs `q=0.8` | + +### Implementation Files + +| File | Purpose | +|------|---------| +| `backend/src/services/crawl-rotator.ts` | `BrowserFingerprint` includes full header config | +| `backend/src/platforms/dutchie/client.ts` | Build headers from fingerprint, use curl-impersonate | +| `backend/src/services/http-fingerprint.ts` | Header ordering per browser (NEW) |