fix(monitor): remove non-existent worker columns from job_run_logs query

The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Kelly
2025-12-03 18:45:05 -07:00
parent 54f40d26bb
commit 66e07b2009
466 changed files with 84988 additions and 9226 deletions
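The corrected monitor query itself is not visible in this excerpt. As a rough sketch only (column names taken from the job_run_logs schema added later in this commit; the function name and surrounding monitor code are assumptions), a query limited to orchestration columns would look like:

// Illustrative sketch, not the actual diff: job_run_logs carries orchestration
// data only; worker_id / worker_hostname live on dispensary_crawl_jobs.
async function recentJobRuns(pool, limit = 50) {
  const { rows } = await pool.query(`
    SELECT id, job_name, status, started_at, completed_at, duration_ms,
           items_processed, items_succeeded, items_failed, error_message
    FROM job_run_logs
    ORDER BY created_at DESC
    LIMIT $1
  `, [limit]);
  return rows;
}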


@@ -39,18 +39,66 @@ async function authenticateUser(email, password) {
role: user.role
};
}
function authMiddleware(req, res, next) {
async function authMiddleware(req, res, next) {
const authHeader = req.headers.authorization;
if (!authHeader || !authHeader.startsWith('Bearer ')) {
return res.status(401).json({ error: 'No token provided' });
}
const token = authHeader.substring(7);
const user = verifyToken(token);
if (!user) {
return res.status(401).json({ error: 'Invalid token' });
// Try JWT first
const jwtUser = verifyToken(token);
if (jwtUser) {
req.user = jwtUser;
return next();
}
// If JWT fails, try API token
try {
const result = await migrate_1.pool.query(`
SELECT id, name, rate_limit, active, expires_at, allowed_endpoints
FROM api_tokens
WHERE token = $1
`, [token]);
if (result.rows.length === 0) {
return res.status(401).json({ error: 'Invalid token' });
}
const apiToken = result.rows[0];
// Check if token is active
if (!apiToken.active) {
return res.status(401).json({ error: 'Token is disabled' });
}
// Check if token is expired
if (apiToken.expires_at && new Date(apiToken.expires_at) < new Date()) {
return res.status(401).json({ error: 'Token has expired' });
}
// Check allowed endpoints
if (apiToken.allowed_endpoints && apiToken.allowed_endpoints.length > 0) {
const isAllowed = apiToken.allowed_endpoints.some((pattern) => {
// Simple wildcard matching
const regex = new RegExp('^' + pattern.replace('*', '.*') + '$');
return regex.test(req.path);
});
if (!isAllowed) {
return res.status(403).json({ error: 'Endpoint not allowed for this token' });
}
}
// Set API token on request for tracking
req.apiToken = {
id: apiToken.id,
name: apiToken.name,
rate_limit: apiToken.rate_limit
};
// Set a generic user for compatibility with existing code
req.user = {
id: apiToken.id,
email: `api-token-${apiToken.id}@system`,
role: 'api'
};
next();
}
catch (error) {
console.error('Error verifying API token:', error);
return res.status(500).json({ error: 'Authentication failed' });
}
req.user = user;
next();
}
function requireRole(...roles) {
return (req, res, next) => {

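The API-token branch above gates requests with allowed_endpoints patterns, expanding a '*' to '.*' and anchoring the regex. Note that String.prototype.replace with a string pattern only replaces the first '*', so single-wildcard patterns such as '/api/products/*' behave as intended, while multi-wildcard patterns would need a global replace. A minimal standalone sketch of that check (hypothetical pattern and paths):

// Same wildcard logic as the middleware above, isolated for illustration.
const matchesPattern = (pattern, path) =>
  new RegExp('^' + pattern.replace('*', '.*') + '$').test(path);

console.log(matchesPattern('/api/products/*', '/api/products/123')); // true
console.log(matchesPattern('/api/products/*', '/api/admin/users'));  // false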

@@ -3,8 +3,14 @@ Object.defineProperty(exports, "__esModule", { value: true });
exports.pool = void 0;
exports.runMigrations = runMigrations;
const pg_1 = require("pg");
// Consolidated DB connection:
// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
// - Then DATABASE_URL (default)
const DATABASE_URL = process.env.CRAWLSY_DATABASE_URL ||
process.env.DATABASE_URL ||
'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
const pool = new pg_1.Pool({
connectionString: process.env.DATABASE_URL,
connectionString: DATABASE_URL,
});
exports.pool = pool;
async function runMigrations() {
@@ -94,6 +100,99 @@ async function runMigrations() {
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
`);
// Add variant column to products table (for different sizes/options of same product)
await client.query(`
ALTER TABLE products ADD COLUMN IF NOT EXISTS variant VARCHAR(255);
`);
// Add special tracking columns (DEPRECATED - not used with new approach)
await client.query(`
ALTER TABLE products ADD COLUMN IF NOT EXISTS special_ends_at TIMESTAMP;
ALTER TABLE products ADD COLUMN IF NOT EXISTS special_text TEXT;
ALTER TABLE products ADD COLUMN IF NOT EXISTS special_type VARCHAR(100);
`);
// ====== NEW SCHEMA ADDITIONS ======
// Add array columns for product attributes
await client.query(`
ALTER TABLE products ADD COLUMN IF NOT EXISTS terpenes TEXT[];
ALTER TABLE products ADD COLUMN IF NOT EXISTS effects TEXT[];
ALTER TABLE products ADD COLUMN IF NOT EXISTS flavors TEXT[];
`);
// Add new price columns (regular_price = market price, sale_price = discount price)
await client.query(`
ALTER TABLE products ADD COLUMN IF NOT EXISTS regular_price DECIMAL(10, 2);
ALTER TABLE products ADD COLUMN IF NOT EXISTS sale_price DECIMAL(10, 2);
`);
// Migrate existing price data
await client.query(`
UPDATE products
SET regular_price = original_price
WHERE regular_price IS NULL AND original_price IS NOT NULL;
`);
await client.query(`
UPDATE products
SET sale_price = price
WHERE sale_price IS NULL AND price IS NOT NULL AND original_price IS NOT NULL AND price < original_price;
`);
// Make slug NOT NULL and add unique constraint
await client.query(`
UPDATE products SET slug = dutchie_product_id WHERE slug IS NULL;
ALTER TABLE products ALTER COLUMN slug SET NOT NULL;
`);
// Drop old unique constraint and add new one on slug
await client.query(`
ALTER TABLE products DROP CONSTRAINT IF EXISTS products_store_id_dutchie_product_id_key;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'products_store_id_slug_unique') THEN
ALTER TABLE products ADD CONSTRAINT products_store_id_slug_unique UNIQUE (store_id, slug);
END IF;
END$$;
`);
// Product Categories (many-to-many) - products can appear in multiple categories
await client.query(`
CREATE TABLE IF NOT EXISTS product_categories (
id SERIAL PRIMARY KEY,
product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
category_slug VARCHAR(255) NOT NULL,
first_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
last_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(product_id, category_slug)
);
`);
await client.query(`
CREATE INDEX IF NOT EXISTS idx_product_categories_slug ON product_categories(category_slug, last_seen_at DESC);
CREATE INDEX IF NOT EXISTS idx_product_categories_product ON product_categories(product_id);
`);
// Price History - track regular and sale price changes over time
await client.query(`
CREATE TABLE IF NOT EXISTS price_history (
id SERIAL PRIMARY KEY,
product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
regular_price DECIMAL(10, 2),
sale_price DECIMAL(10, 2),
recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
`);
await client.query(`
CREATE INDEX IF NOT EXISTS idx_price_history_product ON price_history(product_id, recorded_at DESC);
CREATE INDEX IF NOT EXISTS idx_price_history_recorded ON price_history(recorded_at DESC);
`);
// Batch History - track cannabinoid/terpene changes (different batches)
await client.query(`
CREATE TABLE IF NOT EXISTS batch_history (
id SERIAL PRIMARY KEY,
product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
thc_percentage DECIMAL(5, 2),
cbd_percentage DECIMAL(5, 2),
terpenes TEXT[],
strain_type VARCHAR(100),
recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
`);
await client.query(`
CREATE INDEX IF NOT EXISTS idx_batch_history_product ON batch_history(product_id, recorded_at DESC);
CREATE INDEX IF NOT EXISTS idx_batch_history_recorded ON batch_history(recorded_at DESC);
`);
// Campaign products (many-to-many with ordering)
await client.query(`
@@ -138,10 +237,50 @@ async function runMigrations() {
last_tested_at TIMESTAMP,
test_result VARCHAR(50),
response_time_ms INTEGER,
failure_count INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(host, port, protocol)
);
`);
// Add failure_count column if it doesn't exist
await client.query(`
ALTER TABLE proxies ADD COLUMN IF NOT EXISTS failure_count INTEGER DEFAULT 0;
`);
// Failed proxies table
await client.query(`
CREATE TABLE IF NOT EXISTS failed_proxies (
id SERIAL PRIMARY KEY,
host VARCHAR(255) NOT NULL,
port INTEGER NOT NULL,
protocol VARCHAR(10) NOT NULL,
username VARCHAR(255),
password VARCHAR(255),
failure_count INTEGER NOT NULL,
last_error TEXT,
failed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(host, port, protocol)
);
`);
// Proxy test jobs table
await client.query(`
CREATE TABLE IF NOT EXISTS proxy_test_jobs (
id SERIAL PRIMARY KEY,
status VARCHAR(20) NOT NULL DEFAULT 'pending',
total_proxies INTEGER NOT NULL DEFAULT 0,
tested_proxies INTEGER NOT NULL DEFAULT 0,
passed_proxies INTEGER NOT NULL DEFAULT 0,
failed_proxies INTEGER NOT NULL DEFAULT 0,
started_at TIMESTAMP,
completed_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
`);
await client.query(`
CREATE INDEX IF NOT EXISTS idx_proxy_test_jobs_status ON proxy_test_jobs(status);
CREATE INDEX IF NOT EXISTS idx_proxy_test_jobs_created_at ON proxy_test_jobs(created_at DESC);
`);
// Settings table
await client.query(`

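The migration above introduces append-only price_history and batch_history tables keyed by product_id, alongside the regular_price / sale_price split on products. How they are populated is not shown in this excerpt; a plausible write path (illustrative only, hypothetical function and variable names) would append a row whenever a crawl observes a price change:

// Sketch only, not code from this commit: append a price point for a product.
async function recordPricePoint(client, productId, regularPrice, salePrice) {
  await client.query(`
    INSERT INTO price_history (product_id, regular_price, sale_price)
    VALUES ($1, $2, $3)
  `, [productId, regularPrice, salePrice]);
}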

@@ -0,0 +1,56 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("./migrate");
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
async function runNotificationsMigration() {
const client = await migrate_1.pool.connect();
try {
console.log('Running notifications migration...');
const migrationSQL = fs.readFileSync(path.join(__dirname, '../../migrations/005_notifications.sql'), 'utf-8');
await client.query(migrationSQL);
console.log('✅ Notifications migration completed successfully');
process.exit(0);
}
catch (error) {
console.error('❌ Migration failed:', error);
process.exit(1);
}
finally {
client.release();
}
}
runNotificationsMigration();


@@ -0,0 +1,106 @@
"use strict";
/**
* Dutchie Configuration
*
* Centralized configuration for Dutchie GraphQL API interaction.
* Update hashes here when Dutchie changes their persisted query system.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = exports.dutchieConfig = void 0;
exports.dutchieConfig = {
// ============================================================
// GRAPHQL ENDPOINT
// ============================================================
/** GraphQL endpoint - must be the api-3 graphql endpoint (NOT api-gw.dutchie.com which no longer exists) */
graphqlEndpoint: 'https://dutchie.com/api-3/graphql',
// ============================================================
// GRAPHQL PERSISTED QUERY HASHES
// ============================================================
//
// These hashes identify specific GraphQL operations.
// If Dutchie changes their schema, you may need to capture
// new hashes from live browser traffic (Network tab → graphql requests).
/** FilteredProducts - main product listing query */
filteredProductsHash: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
/** GetAddressBasedDispensaryData - resolve slug to internal ID */
getDispensaryDataHash: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
/**
* ConsumerDispensaries - geo-based discovery
* NOTE: This is a placeholder guess. If discovery fails, either:
* 1. Capture the real hash from live traffic
* 2. Rely on known AZDHS slugs instead (set useDiscovery: false)
*/
consumerDispensariesHash: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
// ============================================================
// BEHAVIOR FLAGS
// ============================================================
/** Enable geo-based discovery (false = use known AZDHS slugs only) */
useDiscovery: true,
/** Prefer GET requests (true) or POST (false). GET is default. */
preferGet: true,
/**
* Enable POST fallback when GET fails with 405 or blocked.
* If true, will retry failed GETs as POSTs.
*/
enablePostFallback: true,
// ============================================================
// PAGINATION & RETRY
// ============================================================
/** Products per page for pagination */
perPage: 100,
/** Maximum pages to fetch (safety limit) */
maxPages: 200,
/** Number of retries for failed page fetches */
maxRetries: 1,
/** Delay between pages in ms */
pageDelayMs: 500,
/** Delay between modes in ms */
modeDelayMs: 2000,
// ============================================================
// HTTP HEADERS
// ============================================================
/** Default headers to mimic browser requests */
defaultHeaders: {
'accept': 'application/json, text/plain, */*',
'accept-language': 'en-US,en;q=0.9',
'apollographql-client-name': 'Marketplace (production)',
},
/** User agent string */
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
// ============================================================
// BROWSER LAUNCH OPTIONS
// ============================================================
browserArgs: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
],
/** Navigation timeout in ms */
navigationTimeout: 60000,
/** Initial page load delay in ms */
pageLoadDelay: 2000,
};
/**
* Get GraphQL hashes object for backward compatibility
*/
exports.GRAPHQL_HASHES = {
FilteredProducts: exports.dutchieConfig.filteredProductsHash,
GetAddressBasedDispensaryData: exports.dutchieConfig.getDispensaryDataHash,
ConsumerDispensaries: exports.dutchieConfig.consumerDispensariesHash,
};
/**
* Arizona geo centerpoints for discovery scans
*/
exports.ARIZONA_CENTERPOINTS = [
{ name: 'Phoenix', lat: 33.4484, lng: -112.074 },
{ name: 'Tucson', lat: 32.2226, lng: -110.9747 },
{ name: 'Flagstaff', lat: 35.1983, lng: -111.6513 },
{ name: 'Mesa', lat: 33.4152, lng: -111.8315 },
{ name: 'Scottsdale', lat: 33.4942, lng: -111.9261 },
{ name: 'Tempe', lat: 33.4255, lng: -111.94 },
{ name: 'Yuma', lat: 32.6927, lng: -114.6277 },
{ name: 'Prescott', lat: 34.54, lng: -112.4685 },
{ name: 'Lake Havasu', lat: 34.4839, lng: -114.3224 },
{ name: 'Sierra Vista', lat: 31.5455, lng: -110.2773 },
];

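The config above assumes Apollo-style persisted queries: each operation is identified by a sha256 hash sent in the request's extensions parameter. The GraphQL client that consumes these values appears later in the commit and is not shown in full here; a minimal sketch of a persisted-query GET built from this config (function name and require path are hypothetical):

// Sketch of a persisted-query GET using the config values above.
const { dutchieConfig } = require('./dist/config/dutchie'); // hypothetical path
async function fetchFilteredProductsPage(variables) {
  const url = new URL(dutchieConfig.graphqlEndpoint);
  url.searchParams.set('operationName', 'FilteredProducts');
  url.searchParams.set('variables', JSON.stringify(variables));
  url.searchParams.set('extensions', JSON.stringify({
    persistedQuery: { version: 1, sha256Hash: dutchieConfig.filteredProductsHash },
  }));
  const res = await fetch(url.toString(), {
    headers: { ...dutchieConfig.defaultHeaders, 'user-agent': dutchieConfig.userAgent },
  });
  if (!res.ok) throw new Error(`GraphQL GET failed: ${res.status}`);
  return res.json();
}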

@@ -0,0 +1,79 @@
"use strict";
/**
* Dutchie AZ Database Connection
*
* Isolated database connection for Dutchie Arizona data.
* Uses a separate database/schema to prevent cross-contamination with main app data.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.getDutchieAZPool = getDutchieAZPool;
exports.query = query;
exports.getClient = getClient;
exports.closePool = closePool;
exports.healthCheck = healthCheck;
const pg_1 = require("pg");
// Consolidated DB naming:
// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
// - Then DUTCHIE_AZ_DATABASE_URL (legacy)
// - Finally DATABASE_URL (legacy main DB)
const DUTCHIE_AZ_DATABASE_URL = process.env.CRAWLSY_DATABASE_URL ||
process.env.DUTCHIE_AZ_DATABASE_URL ||
process.env.DATABASE_URL ||
'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
let pool = null;
/**
* Get the Dutchie AZ database pool (singleton)
*/
function getDutchieAZPool() {
if (!pool) {
pool = new pg_1.Pool({
connectionString: DUTCHIE_AZ_DATABASE_URL,
max: 10,
idleTimeoutMillis: 30000,
connectionTimeoutMillis: 5000,
});
pool.on('error', (err) => {
console.error('[DutchieAZ DB] Unexpected error on idle client:', err);
});
console.log('[DutchieAZ DB] Pool initialized');
}
return pool;
}
/**
* Execute a query on the Dutchie AZ database
*/
async function query(text, params) {
const p = getDutchieAZPool();
const result = await p.query(text, params);
return { rows: result.rows, rowCount: result.rowCount || 0 };
}
/**
* Get a client from the pool for transaction use
*/
async function getClient() {
const p = getDutchieAZPool();
return p.connect();
}
/**
* Close the pool connection
*/
async function closePool() {
if (pool) {
await pool.end();
pool = null;
console.log('[DutchieAZ DB] Pool closed');
}
}
/**
* Check if the database is accessible
*/
async function healthCheck() {
try {
const result = await query('SELECT 1 as ok');
return result.rows.length > 0 && result.rows[0].ok === 1;
}
catch (error) {
console.error('[DutchieAZ DB] Health check failed:', error);
return false;
}
}
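getClient() above hands out a raw pool client for transaction use; a typical wrapper (illustrative only, not part of this commit) pairs it with BEGIN/COMMIT/ROLLBACK and always releases the client:

// Sketch of transaction handling around getClient()/release().
const { getClient } = require('./dist/dutchie-az/db/connection'); // path per this commit
async function withTransaction(work) {
  const client = await getClient();
  try {
    await client.query('BEGIN');
    const result = await work(client);
    await client.query('COMMIT');
    return result;
  } catch (err) {
    await client.query('ROLLBACK');
    throw err;
  } finally {
    client.release();
  }
}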

backend/dist/dutchie-az/db/migrate.js (vendored, new file, 30 lines)

@@ -0,0 +1,30 @@
"use strict";
/**
* Dutchie AZ Schema Bootstrap
*
* Run this to create/update the dutchie_az tables (dutchie_products, dutchie_product_snapshots, etc.)
* in the AZ pipeline database. This is separate from the legacy schema.
*
* Usage:
* TS_NODE_TRANSPILE_ONLY=1 npx ts-node src/dutchie-az/db/migrate.ts
* or (after build)
* node dist/dutchie-az/db/migrate.js
*/
Object.defineProperty(exports, "__esModule", { value: true });
const schema_1 = require("./schema");
const connection_1 = require("./connection");
async function main() {
try {
console.log('[DutchieAZ] Running schema migration...');
await (0, schema_1.createSchema)();
console.log('[DutchieAZ] Schema migration complete.');
}
catch (err) {
console.error('[DutchieAZ] Schema migration failed:', err.message);
process.exitCode = 1;
}
finally {
await (0, connection_1.closePool)();
}
}
main();

backend/dist/dutchie-az/db/schema.js (vendored, new file, 405 lines)

@@ -0,0 +1,405 @@
"use strict";
/**
* Dutchie AZ Database Schema
*
* Creates all tables for the isolated Dutchie Arizona data pipeline.
* Run this to initialize the dutchie_az database.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.createSchema = createSchema;
exports.dropSchema = dropSchema;
exports.schemaExists = schemaExists;
exports.ensureSchema = ensureSchema;
const connection_1 = require("./connection");
/**
* SQL statements to create all tables
*/
const SCHEMA_SQL = `
-- ============================================================
-- DISPENSARIES TABLE
-- Stores discovered Dutchie dispensaries in Arizona
-- ============================================================
CREATE TABLE IF NOT EXISTS dispensaries (
id SERIAL PRIMARY KEY,
platform VARCHAR(20) NOT NULL DEFAULT 'dutchie',
name VARCHAR(255) NOT NULL,
slug VARCHAR(255) NOT NULL,
city VARCHAR(100) NOT NULL,
state VARCHAR(10) NOT NULL DEFAULT 'AZ',
postal_code VARCHAR(20),
address TEXT,
latitude DECIMAL(10, 7),
longitude DECIMAL(10, 7),
platform_dispensary_id VARCHAR(100),
is_delivery BOOLEAN DEFAULT false,
is_pickup BOOLEAN DEFAULT true,
raw_metadata JSONB,
last_crawled_at TIMESTAMPTZ,
product_count INTEGER DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
CONSTRAINT uk_dispensaries_platform_slug UNIQUE (platform, slug, city, state)
);
CREATE INDEX IF NOT EXISTS idx_dispensaries_platform ON dispensaries(platform);
CREATE INDEX IF NOT EXISTS idx_dispensaries_platform_id ON dispensaries(platform_dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_state ON dispensaries(state);
CREATE INDEX IF NOT EXISTS idx_dispensaries_city ON dispensaries(city);
-- ============================================================
-- DUTCHIE_PRODUCTS TABLE
-- Canonical product identity per store
-- ============================================================
CREATE TABLE IF NOT EXISTS dutchie_products (
id SERIAL PRIMARY KEY,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
platform VARCHAR(20) NOT NULL DEFAULT 'dutchie',
external_product_id VARCHAR(100) NOT NULL,
platform_dispensary_id VARCHAR(100) NOT NULL,
c_name VARCHAR(500),
name VARCHAR(500) NOT NULL,
-- Brand
brand_name VARCHAR(255),
brand_id VARCHAR(100),
brand_logo_url TEXT,
-- Classification
type VARCHAR(100),
subcategory VARCHAR(100),
strain_type VARCHAR(50),
provider VARCHAR(100),
-- Potency
thc DECIMAL(10, 4),
thc_content DECIMAL(10, 4),
cbd DECIMAL(10, 4),
cbd_content DECIMAL(10, 4),
cannabinoids_v2 JSONB,
effects JSONB,
-- Status / flags
status VARCHAR(50),
medical_only BOOLEAN DEFAULT false,
rec_only BOOLEAN DEFAULT false,
featured BOOLEAN DEFAULT false,
coming_soon BOOLEAN DEFAULT false,
certificate_of_analysis_enabled BOOLEAN DEFAULT false,
is_below_threshold BOOLEAN DEFAULT false,
is_below_kiosk_threshold BOOLEAN DEFAULT false,
options_below_threshold BOOLEAN DEFAULT false,
options_below_kiosk_threshold BOOLEAN DEFAULT false,
-- Derived stock status: 'in_stock', 'out_of_stock', 'unknown'
stock_status VARCHAR(20) DEFAULT 'unknown',
total_quantity_available INTEGER DEFAULT 0,
-- Images
primary_image_url TEXT,
images JSONB,
-- Misc
measurements JSONB,
weight VARCHAR(50),
past_c_names TEXT[],
created_at_dutchie TIMESTAMPTZ,
updated_at_dutchie TIMESTAMPTZ,
latest_raw_payload JSONB,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
CONSTRAINT uk_dutchie_products UNIQUE (dispensary_id, external_product_id)
);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_dispensary ON dutchie_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_external_id ON dutchie_products(external_product_id);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_platform_disp ON dutchie_products(platform_dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_brand ON dutchie_products(brand_name);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_type ON dutchie_products(type);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_subcategory ON dutchie_products(subcategory);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_status ON dutchie_products(status);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_strain ON dutchie_products(strain_type);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_stock_status ON dutchie_products(stock_status);
-- ============================================================
-- DUTCHIE_PRODUCT_SNAPSHOTS TABLE
-- Historical state per crawl, includes options[]
-- ============================================================
CREATE TABLE IF NOT EXISTS dutchie_product_snapshots (
id SERIAL PRIMARY KEY,
dutchie_product_id INTEGER NOT NULL REFERENCES dutchie_products(id) ON DELETE CASCADE,
dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
platform_dispensary_id VARCHAR(100) NOT NULL,
external_product_id VARCHAR(100) NOT NULL,
pricing_type VARCHAR(20) DEFAULT 'unknown',
crawl_mode VARCHAR(20) DEFAULT 'mode_a', -- 'mode_a' (UI parity) or 'mode_b' (max coverage)
status VARCHAR(50),
featured BOOLEAN DEFAULT false,
special BOOLEAN DEFAULT false,
medical_only BOOLEAN DEFAULT false,
rec_only BOOLEAN DEFAULT false,
-- Flag indicating if product was present in feed (false = missing_from_feed snapshot)
is_present_in_feed BOOLEAN DEFAULT true,
-- Derived stock status
stock_status VARCHAR(20) DEFAULT 'unknown',
-- Price summary (in cents)
rec_min_price_cents INTEGER,
rec_max_price_cents INTEGER,
rec_min_special_price_cents INTEGER,
med_min_price_cents INTEGER,
med_max_price_cents INTEGER,
med_min_special_price_cents INTEGER,
wholesale_min_price_cents INTEGER,
-- Inventory summary
total_quantity_available INTEGER,
total_kiosk_quantity_available INTEGER,
manual_inventory BOOLEAN DEFAULT false,
is_below_threshold BOOLEAN DEFAULT false,
is_below_kiosk_threshold BOOLEAN DEFAULT false,
-- Option-level data (from POSMetaData.children)
options JSONB,
-- Full raw product node
raw_payload JSONB NOT NULL,
crawled_at TIMESTAMPTZ NOT NULL,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_snapshots_product ON dutchie_product_snapshots(dutchie_product_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary ON dutchie_product_snapshots(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_crawled_at ON dutchie_product_snapshots(crawled_at);
CREATE INDEX IF NOT EXISTS idx_snapshots_platform_disp ON dutchie_product_snapshots(platform_dispensary_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON dutchie_product_snapshots(external_product_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_special ON dutchie_product_snapshots(special) WHERE special = true;
CREATE INDEX IF NOT EXISTS idx_snapshots_stock_status ON dutchie_product_snapshots(stock_status);
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_mode ON dutchie_product_snapshots(crawl_mode);
-- ============================================================
-- CRAWL_JOBS TABLE
-- Tracks crawl execution status
-- ============================================================
CREATE TABLE IF NOT EXISTS crawl_jobs (
id SERIAL PRIMARY KEY,
job_type VARCHAR(50) NOT NULL,
dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL,
status VARCHAR(20) NOT NULL DEFAULT 'pending',
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
error_message TEXT,
products_found INTEGER,
snapshots_created INTEGER,
metadata JSONB,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_type ON crawl_jobs(job_type);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_dispensary ON crawl_jobs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_created ON crawl_jobs(created_at);
-- ============================================================
-- JOB_SCHEDULES TABLE
-- Stores schedule configuration for recurring jobs with jitter support
-- Each job has independent timing that "wanders" over time
-- ============================================================
CREATE TABLE IF NOT EXISTS job_schedules (
id SERIAL PRIMARY KEY,
job_name VARCHAR(100) NOT NULL UNIQUE,
description TEXT,
enabled BOOLEAN DEFAULT true,
-- Timing configuration (jitter makes times "wander")
base_interval_minutes INTEGER NOT NULL DEFAULT 240, -- e.g., 4 hours
jitter_minutes INTEGER NOT NULL DEFAULT 30, -- e.g., ±30 min
-- Last run tracking
last_run_at TIMESTAMPTZ,
last_status VARCHAR(20), -- 'success', 'error', 'partial', 'running'
last_error_message TEXT,
last_duration_ms INTEGER,
-- Next run (calculated with jitter after each run)
next_run_at TIMESTAMPTZ,
-- Additional config
job_config JSONB, -- e.g., { pricingType: 'rec', useBothModes: true }
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_job_schedules_enabled ON job_schedules(enabled);
CREATE INDEX IF NOT EXISTS idx_job_schedules_next_run ON job_schedules(next_run_at);
-- ============================================================
-- JOB_RUN_LOGS TABLE
-- Stores history of job runs for monitoring
-- ============================================================
CREATE TABLE IF NOT EXISTS job_run_logs (
id SERIAL PRIMARY KEY,
schedule_id INTEGER NOT NULL REFERENCES job_schedules(id) ON DELETE CASCADE,
job_name VARCHAR(100) NOT NULL,
status VARCHAR(20) NOT NULL, -- 'pending', 'running', 'success', 'error', 'partial'
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
duration_ms INTEGER,
error_message TEXT,
-- Results summary
items_processed INTEGER,
items_succeeded INTEGER,
items_failed INTEGER,
metadata JSONB, -- Additional run details
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_schedule ON job_run_logs(schedule_id);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_job_name ON job_run_logs(job_name);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_status ON job_run_logs(status);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_created ON job_run_logs(created_at);
-- ============================================================
-- VIEWS FOR EASY QUERYING
-- ============================================================
-- Categories derived from products
CREATE OR REPLACE VIEW v_categories AS
SELECT
type,
subcategory,
COUNT(DISTINCT id) as product_count,
COUNT(DISTINCT dispensary_id) as dispensary_count,
AVG(thc) as avg_thc,
MIN(thc) as min_thc,
MAX(thc) as max_thc
FROM dutchie_products
WHERE type IS NOT NULL
GROUP BY type, subcategory
ORDER BY type, subcategory;
-- Brands derived from products
CREATE OR REPLACE VIEW v_brands AS
SELECT
brand_name,
brand_id,
MAX(brand_logo_url) as brand_logo_url,
COUNT(DISTINCT id) as product_count,
COUNT(DISTINCT dispensary_id) as dispensary_count,
ARRAY_AGG(DISTINCT type) FILTER (WHERE type IS NOT NULL) as product_types
FROM dutchie_products
WHERE brand_name IS NOT NULL
GROUP BY brand_name, brand_id
ORDER BY product_count DESC;
-- Latest snapshot per product (most recent crawl data)
CREATE OR REPLACE VIEW v_latest_snapshots AS
SELECT DISTINCT ON (dutchie_product_id)
s.*
FROM dutchie_product_snapshots s
ORDER BY dutchie_product_id, crawled_at DESC;
-- Dashboard stats
CREATE OR REPLACE VIEW v_dashboard_stats AS
SELECT
(SELECT COUNT(*) FROM dispensaries WHERE state = 'AZ') as dispensary_count,
(SELECT COUNT(*) FROM dutchie_products) as product_count,
(SELECT COUNT(*) FROM dutchie_product_snapshots WHERE crawled_at > NOW() - INTERVAL '24 hours') as snapshots_24h,
(SELECT MAX(crawled_at) FROM dutchie_product_snapshots) as last_crawl_time,
(SELECT COUNT(*) FROM crawl_jobs WHERE status = 'failed' AND created_at > NOW() - INTERVAL '24 hours') as failed_jobs_24h,
(SELECT COUNT(DISTINCT brand_name) FROM dutchie_products WHERE brand_name IS NOT NULL) as brand_count,
(SELECT COUNT(DISTINCT (type, subcategory)) FROM dutchie_products WHERE type IS NOT NULL) as category_count;
`;
/**
* Run the schema migration
*/
async function createSchema() {
console.log('[DutchieAZ Schema] Creating database schema...');
const client = await (0, connection_1.getClient)();
try {
await client.query('BEGIN');
// Split into individual statements and execute
const statements = SCHEMA_SQL
.split(';')
.map(s => s.trim())
.filter(s => s.length > 0 && !s.startsWith('--'));
for (const statement of statements) {
if (statement.trim()) {
await client.query(statement + ';');
}
}
await client.query('COMMIT');
console.log('[DutchieAZ Schema] Schema created successfully');
}
catch (error) {
await client.query('ROLLBACK');
console.error('[DutchieAZ Schema] Failed to create schema:', error);
throw error;
}
finally {
client.release();
}
}
/**
* Drop all tables (for development/testing)
*/
async function dropSchema() {
console.log('[DutchieAZ Schema] Dropping all tables...');
await (0, connection_1.query)(`
DROP VIEW IF EXISTS v_dashboard_stats CASCADE;
DROP VIEW IF EXISTS v_latest_snapshots CASCADE;
DROP VIEW IF EXISTS v_brands CASCADE;
DROP VIEW IF EXISTS v_categories CASCADE;
DROP TABLE IF EXISTS crawl_schedule CASCADE;
DROP TABLE IF EXISTS crawl_jobs CASCADE;
DROP TABLE IF EXISTS dutchie_product_snapshots CASCADE;
DROP TABLE IF EXISTS dutchie_products CASCADE;
DROP TABLE IF EXISTS dispensaries CASCADE;
`);
console.log('[DutchieAZ Schema] All tables dropped');
}
/**
* Check if schema exists
*/
async function schemaExists() {
try {
const result = await (0, connection_1.query)(`
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'dispensaries'
) as exists
`);
return result.rows[0]?.exists === true;
}
catch (error) {
return false;
}
}
/**
* Initialize schema if it doesn't exist
*/
async function ensureSchema() {
const exists = await schemaExists();
if (!exists) {
await createSchema();
}
else {
console.log('[DutchieAZ Schema] Schema already exists');
}
}
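The job_schedules table above stores base_interval_minutes and jitter_minutes so that each job's next_run_at drifts rather than firing at fixed times. The scheduler that computes it lives in a later file of this commit; a minimal sketch of interval-plus-jitter scheduling over those columns (illustrative only):

// Sketch, not the scheduler from this commit: next run = now + base interval
// plus a uniform offset in [-jitter_minutes, +jitter_minutes].
function computeNextRunAt(schedule, now = new Date()) {
  const jitter = (Math.random() * 2 - 1) * schedule.jitter_minutes;
  const offsetMinutes = schedule.base_interval_minutes + jitter;
  return new Date(now.getTime() + offsetMinutes * 60 * 1000);
}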

backend/dist/dutchie-az/index.js (vendored, new file, 95 lines)

@@ -0,0 +1,95 @@
"use strict";
/**
* Dutchie AZ Data Pipeline
*
* Isolated data pipeline for crawling and storing Dutchie Arizona dispensary data.
* This module is completely separate from the main application database.
*
* Features:
* - Two-mode crawling (Mode A: UI parity, Mode B: MAX COVERAGE)
* - Derived stockStatus field (in_stock, out_of_stock, unknown)
* - Full raw payload storage for 100% data preservation
* - AZDHS dispensary list as canonical source
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __exportStar = (this && this.__exportStar) || function(m, exports) {
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.dutchieAZRouter = exports.getImportStats = exports.importFromJSON = exports.importAZDHSDispensaries = exports.getRunLogs = exports.initializeDefaultSchedules = exports.triggerScheduleNow = exports.deleteSchedule = exports.updateSchedule = exports.createSchedule = exports.getScheduleById = exports.getAllSchedules = exports.crawlSingleDispensary = exports.getSchedulerStatus = exports.triggerImmediateCrawl = exports.stopScheduler = exports.startScheduler = exports.crawlAllArizonaDispensaries = exports.crawlDispensaryProducts = exports.normalizeSnapshot = exports.normalizeProduct = exports.getDispensariesWithPlatformIds = exports.getDispensaryById = exports.getAllDispensaries = exports.resolvePlatformDispensaryIds = exports.discoverAndSaveDispensaries = exports.importFromExistingDispensaries = exports.discoverDispensaries = exports.discoverArizonaDispensaries = exports.fetchAllProductsBothModes = exports.fetchAllProducts = exports.resolveDispensaryId = exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = exports.ensureSchema = exports.schemaExists = exports.dropSchema = exports.createSchema = exports.healthCheck = exports.closePool = exports.getClient = exports.query = exports.getDutchieAZPool = void 0;
// Types
__exportStar(require("./types"), exports);
// Database
var connection_1 = require("./db/connection");
Object.defineProperty(exports, "getDutchieAZPool", { enumerable: true, get: function () { return connection_1.getDutchieAZPool; } });
Object.defineProperty(exports, "query", { enumerable: true, get: function () { return connection_1.query; } });
Object.defineProperty(exports, "getClient", { enumerable: true, get: function () { return connection_1.getClient; } });
Object.defineProperty(exports, "closePool", { enumerable: true, get: function () { return connection_1.closePool; } });
Object.defineProperty(exports, "healthCheck", { enumerable: true, get: function () { return connection_1.healthCheck; } });
var schema_1 = require("./db/schema");
Object.defineProperty(exports, "createSchema", { enumerable: true, get: function () { return schema_1.createSchema; } });
Object.defineProperty(exports, "dropSchema", { enumerable: true, get: function () { return schema_1.dropSchema; } });
Object.defineProperty(exports, "schemaExists", { enumerable: true, get: function () { return schema_1.schemaExists; } });
Object.defineProperty(exports, "ensureSchema", { enumerable: true, get: function () { return schema_1.ensureSchema; } });
// Services - GraphQL Client
var graphql_client_1 = require("./services/graphql-client");
Object.defineProperty(exports, "GRAPHQL_HASHES", { enumerable: true, get: function () { return graphql_client_1.GRAPHQL_HASHES; } });
Object.defineProperty(exports, "ARIZONA_CENTERPOINTS", { enumerable: true, get: function () { return graphql_client_1.ARIZONA_CENTERPOINTS; } });
Object.defineProperty(exports, "resolveDispensaryId", { enumerable: true, get: function () { return graphql_client_1.resolveDispensaryId; } });
Object.defineProperty(exports, "fetchAllProducts", { enumerable: true, get: function () { return graphql_client_1.fetchAllProducts; } });
Object.defineProperty(exports, "fetchAllProductsBothModes", { enumerable: true, get: function () { return graphql_client_1.fetchAllProductsBothModes; } });
Object.defineProperty(exports, "discoverArizonaDispensaries", { enumerable: true, get: function () { return graphql_client_1.discoverArizonaDispensaries; } });
// Alias for backward compatibility
Object.defineProperty(exports, "discoverDispensaries", { enumerable: true, get: function () { return graphql_client_1.discoverArizonaDispensaries; } });
// Services - Discovery
var discovery_1 = require("./services/discovery");
Object.defineProperty(exports, "importFromExistingDispensaries", { enumerable: true, get: function () { return discovery_1.importFromExistingDispensaries; } });
Object.defineProperty(exports, "discoverAndSaveDispensaries", { enumerable: true, get: function () { return discovery_1.discoverDispensaries; } });
Object.defineProperty(exports, "resolvePlatformDispensaryIds", { enumerable: true, get: function () { return discovery_1.resolvePlatformDispensaryIds; } });
Object.defineProperty(exports, "getAllDispensaries", { enumerable: true, get: function () { return discovery_1.getAllDispensaries; } });
Object.defineProperty(exports, "getDispensaryById", { enumerable: true, get: function () { return discovery_1.getDispensaryById; } });
Object.defineProperty(exports, "getDispensariesWithPlatformIds", { enumerable: true, get: function () { return discovery_1.getDispensariesWithPlatformIds; } });
// Services - Product Crawler
var product_crawler_1 = require("./services/product-crawler");
Object.defineProperty(exports, "normalizeProduct", { enumerable: true, get: function () { return product_crawler_1.normalizeProduct; } });
Object.defineProperty(exports, "normalizeSnapshot", { enumerable: true, get: function () { return product_crawler_1.normalizeSnapshot; } });
Object.defineProperty(exports, "crawlDispensaryProducts", { enumerable: true, get: function () { return product_crawler_1.crawlDispensaryProducts; } });
Object.defineProperty(exports, "crawlAllArizonaDispensaries", { enumerable: true, get: function () { return product_crawler_1.crawlAllArizonaDispensaries; } });
// Services - Scheduler
var scheduler_1 = require("./services/scheduler");
Object.defineProperty(exports, "startScheduler", { enumerable: true, get: function () { return scheduler_1.startScheduler; } });
Object.defineProperty(exports, "stopScheduler", { enumerable: true, get: function () { return scheduler_1.stopScheduler; } });
Object.defineProperty(exports, "triggerImmediateCrawl", { enumerable: true, get: function () { return scheduler_1.triggerImmediateCrawl; } });
Object.defineProperty(exports, "getSchedulerStatus", { enumerable: true, get: function () { return scheduler_1.getSchedulerStatus; } });
Object.defineProperty(exports, "crawlSingleDispensary", { enumerable: true, get: function () { return scheduler_1.crawlSingleDispensary; } });
// Schedule config CRUD
Object.defineProperty(exports, "getAllSchedules", { enumerable: true, get: function () { return scheduler_1.getAllSchedules; } });
Object.defineProperty(exports, "getScheduleById", { enumerable: true, get: function () { return scheduler_1.getScheduleById; } });
Object.defineProperty(exports, "createSchedule", { enumerable: true, get: function () { return scheduler_1.createSchedule; } });
Object.defineProperty(exports, "updateSchedule", { enumerable: true, get: function () { return scheduler_1.updateSchedule; } });
Object.defineProperty(exports, "deleteSchedule", { enumerable: true, get: function () { return scheduler_1.deleteSchedule; } });
Object.defineProperty(exports, "triggerScheduleNow", { enumerable: true, get: function () { return scheduler_1.triggerScheduleNow; } });
Object.defineProperty(exports, "initializeDefaultSchedules", { enumerable: true, get: function () { return scheduler_1.initializeDefaultSchedules; } });
// Run logs
Object.defineProperty(exports, "getRunLogs", { enumerable: true, get: function () { return scheduler_1.getRunLogs; } });
// Services - AZDHS Import
var azdhs_import_1 = require("./services/azdhs-import");
Object.defineProperty(exports, "importAZDHSDispensaries", { enumerable: true, get: function () { return azdhs_import_1.importAZDHSDispensaries; } });
Object.defineProperty(exports, "importFromJSON", { enumerable: true, get: function () { return azdhs_import_1.importFromJSON; } });
Object.defineProperty(exports, "getImportStats", { enumerable: true, get: function () { return azdhs_import_1.getImportStats; } });
// Routes
var routes_1 = require("./routes");
Object.defineProperty(exports, "dutchieAZRouter", { enumerable: true, get: function () { return __importDefault(routes_1).default; } });

backend/dist/dutchie-az/routes/index.js (vendored, new file, 1610 lines; diff suppressed because it is too large)


@@ -0,0 +1,229 @@
"use strict";
/**
* AZDHS Import Service
*
* Imports Arizona dispensaries from the main database's dispensaries table
* (which was populated from AZDHS data) into the isolated Dutchie AZ database.
*
* This establishes the canonical list of AZ dispensaries to match against Dutchie.
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.importAZDHSDispensaries = importAZDHSDispensaries;
exports.importFromJSON = importFromJSON;
exports.getImportStats = getImportStats;
const pg_1 = require("pg");
const connection_1 = require("../db/connection");
// Main database connection (source of AZDHS data)
const MAIN_DATABASE_URL = process.env.DATABASE_URL ||
'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
/**
* Create a temporary connection to the main database
*/
function getMainDBPool() {
return new pg_1.Pool({
connectionString: MAIN_DATABASE_URL,
max: 5,
idleTimeoutMillis: 30000,
connectionTimeoutMillis: 5000,
});
}
/**
* Fetch all AZ dispensaries from the main database
*/
async function fetchAZDHSDispensaries() {
const pool = getMainDBPool();
try {
const result = await pool.query(`
SELECT
id, azdhs_id, name, company_name, address, city, state, zip,
latitude, longitude, dba_name, phone, email, website,
google_rating, google_review_count, slug,
menu_provider, product_provider,
created_at, updated_at
FROM dispensaries
WHERE state = 'AZ'
ORDER BY id
`);
return result.rows;
}
finally {
await pool.end();
}
}
/**
* Import a single dispensary into the Dutchie AZ database
*/
async function importDispensary(disp) {
const result = await (0, connection_1.query)(`
INSERT INTO dispensaries (
platform, name, slug, city, state, postal_code, address,
latitude, longitude, is_delivery, is_pickup, raw_metadata, updated_at
) VALUES (
$1, $2, $3, $4, $5, $6, $7,
$8, $9, $10, $11, $12, NOW()
)
ON CONFLICT (platform, slug, city, state) DO UPDATE SET
name = EXCLUDED.name,
postal_code = EXCLUDED.postal_code,
address = EXCLUDED.address,
latitude = EXCLUDED.latitude,
longitude = EXCLUDED.longitude,
raw_metadata = EXCLUDED.raw_metadata,
updated_at = NOW()
RETURNING id
`, [
'dutchie', // Will be updated when Dutchie match is found
disp.dba_name || disp.name,
disp.slug,
disp.city,
disp.state,
disp.zip,
disp.address,
disp.latitude,
disp.longitude,
false, // is_delivery - unknown
true, // is_pickup - assume true
JSON.stringify({
azdhs_id: disp.azdhs_id,
main_db_id: disp.id,
company_name: disp.company_name,
phone: disp.phone,
email: disp.email,
website: disp.website,
google_rating: disp.google_rating,
google_review_count: disp.google_review_count,
menu_provider: disp.menu_provider,
product_provider: disp.product_provider,
}),
]);
return result.rows[0].id;
}
/**
* Import all AZDHS dispensaries into the Dutchie AZ database
*/
async function importAZDHSDispensaries() {
console.log('[AZDHS Import] Starting import from main database...');
const result = {
total: 0,
imported: 0,
skipped: 0,
errors: [],
};
try {
const dispensaries = await fetchAZDHSDispensaries();
result.total = dispensaries.length;
console.log(`[AZDHS Import] Found ${dispensaries.length} AZ dispensaries in main DB`);
for (const disp of dispensaries) {
try {
const id = await importDispensary(disp);
result.imported++;
console.log(`[AZDHS Import] Imported: ${disp.name} (${disp.city}) -> id=${id}`);
}
catch (error) {
if (error.message.includes('duplicate')) {
result.skipped++;
}
else {
result.errors.push(`${disp.name}: ${error.message}`);
}
}
}
}
catch (error) {
result.errors.push(`Failed to fetch from main DB: ${error.message}`);
}
console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped, ${result.errors.length} errors`);
return result;
}
/**
* Import dispensaries from JSON file (backup export)
*/
async function importFromJSON(jsonPath) {
console.log(`[AZDHS Import] Importing from JSON: ${jsonPath}`);
const result = {
total: 0,
imported: 0,
skipped: 0,
errors: [],
};
try {
const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
const data = await fs.readFile(jsonPath, 'utf-8');
const dispensaries = JSON.parse(data);
result.total = dispensaries.length;
console.log(`[AZDHS Import] Found ${dispensaries.length} dispensaries in JSON file`);
for (const disp of dispensaries) {
try {
const id = await importDispensary(disp);
result.imported++;
}
catch (error) {
if (error.message.includes('duplicate')) {
result.skipped++;
}
else {
result.errors.push(`${disp.name}: ${error.message}`);
}
}
}
}
catch (error) {
result.errors.push(`Failed to read JSON file: ${error.message}`);
}
console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped`);
return result;
}
/**
* Get import statistics
*/
async function getImportStats() {
const { rows } = await (0, connection_1.query)(`
SELECT
COUNT(*) as total,
COUNT(platform_dispensary_id) as with_platform_id,
COUNT(*) - COUNT(platform_dispensary_id) as without_platform_id,
MAX(updated_at) as last_updated
FROM dispensaries
WHERE state = 'AZ'
`);
const stats = rows[0];
return {
totalDispensaries: parseInt(stats.total, 10),
withPlatformIds: parseInt(stats.with_platform_id, 10),
withoutPlatformIds: parseInt(stats.without_platform_id, 10),
lastImportedAt: stats.last_updated,
};
}
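Illustrative usage of the import service above, wired through the dutchie-az barrel exports shown earlier (the runner script itself is an assumption, not part of this commit):

// Sketch: run the AZDHS import and print the summary it returns.
const { importAZDHSDispensaries, getImportStats } = require('./dist/dutchie-az');
async function runImport() {
  const result = await importAZDHSDispensaries();
  console.log(`imported=${result.imported} skipped=${result.skipped} errors=${result.errors.length}`);
  console.log(await getImportStats());
}
runImport();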


@@ -0,0 +1,380 @@
"use strict";
/**
* Directory-Based Store Matcher
*
* Scrapes provider directory pages (Curaleaf, Sol, etc.) to get store lists,
* then matches them to existing dispensaries by fuzzy name/city/address matching.
*
* This allows us to:
* 1. Find specific store URLs for directory-style websites
* 2. Match stores confidently by name+city
* 3. Mark non-Dutchie providers as not_crawlable until we build crawlers
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.scrapeSolDirectory = scrapeSolDirectory;
exports.scrapeCuraleafDirectory = scrapeCuraleafDirectory;
exports.matchDirectoryToDispensaries = matchDirectoryToDispensaries;
exports.previewDirectoryMatches = previewDirectoryMatches;
exports.applyHighConfidenceMatches = applyHighConfidenceMatches;
const connection_1 = require("../db/connection");
// ============================================================
// NORMALIZATION FUNCTIONS
// ============================================================
/**
* Normalize a string for comparison:
* - Lowercase
* - Remove common suffixes (dispensary, cannabis, etc.)
* - Remove punctuation
* - Collapse whitespace
*/
function normalizeForComparison(str) {
if (!str)
return '';
return str
.toLowerCase()
.replace(/\s+(dispensary|cannabis|marijuana|medical|recreational|shop|store|flower|wellness)(\s|$)/gi, ' ')
.replace(/[^\w\s]/g, ' ') // Remove punctuation
.replace(/\s+/g, ' ') // Collapse whitespace
.trim();
}
/**
* Normalize city name for comparison
*/
function normalizeCity(city) {
if (!city)
return '';
return city
.toLowerCase()
.replace(/[^\w\s]/g, '')
.trim();
}
/**
* Calculate similarity between two strings (0-1)
* Uses Levenshtein distance normalized by max length
*/
function stringSimilarity(a, b) {
if (!a || !b)
return 0;
if (a === b)
return 1;
const longer = a.length > b.length ? a : b;
const shorter = a.length > b.length ? b : a;
if (longer.length === 0)
return 1;
const distance = levenshteinDistance(longer, shorter);
return (longer.length - distance) / longer.length;
}
/**
* Levenshtein distance between two strings
*/
function levenshteinDistance(a, b) {
const matrix = [];
for (let i = 0; i <= b.length; i++) {
matrix[i] = [i];
}
for (let j = 0; j <= a.length; j++) {
matrix[0][j] = j;
}
for (let i = 1; i <= b.length; i++) {
for (let j = 1; j <= a.length; j++) {
if (b.charAt(i - 1) === a.charAt(j - 1)) {
matrix[i][j] = matrix[i - 1][j - 1];
}
else {
matrix[i][j] = Math.min(matrix[i - 1][j - 1] + 1, // substitution
matrix[i][j - 1] + 1, // insertion
matrix[i - 1][j] + 1 // deletion
);
}
}
}
return matrix[b.length][a.length];
}
/**
* Check if string contains another (with normalization)
*/
function containsNormalized(haystack, needle) {
return normalizeForComparison(haystack).includes(normalizeForComparison(needle));
}
// ============================================================
// PROVIDER DIRECTORY SCRAPERS
// ============================================================
/**
* Sol Flower (livewithsol.com) - Static HTML, easy to scrape
*/
async function scrapeSolDirectory() {
console.log('[DirectoryMatcher] Scraping Sol Flower directory...');
try {
const response = await fetch('https://www.livewithsol.com/locations/', {
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
Accept: 'text/html',
},
});
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
const html = await response.text();
// Extract store entries from HTML
// Sol's structure: Each location has name, address in specific divs
const stores = [];
// Pattern to find location cards
// Format: <a href="/locations/slug/">NAME</a> with address nearby
const locationRegex = /<a[^>]+href="(\/locations\/[^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?(\d+[^<]+(?:Ave|St|Blvd|Dr|Rd|Way)[^<]*)/gi;
let match;
while ((match = locationRegex.exec(html)) !== null) {
const [, path, name, address] = match;
// Extract city from common Arizona cities
let city = 'Unknown';
const cityPatterns = [
{ pattern: /phoenix/i, city: 'Phoenix' },
{ pattern: /scottsdale/i, city: 'Scottsdale' },
{ pattern: /tempe/i, city: 'Tempe' },
{ pattern: /tucson/i, city: 'Tucson' },
{ pattern: /mesa/i, city: 'Mesa' },
{ pattern: /sun city/i, city: 'Sun City' },
{ pattern: /glendale/i, city: 'Glendale' },
];
for (const { pattern, city: cityName } of cityPatterns) {
if (pattern.test(name) || pattern.test(address)) {
city = cityName;
break;
}
}
stores.push({
name: name.trim(),
city,
state: 'AZ',
address: address.trim(),
storeUrl: `https://www.livewithsol.com${path}`,
});
}
// If regex didn't work, use known hardcoded values (fallback)
if (stores.length === 0) {
console.log('[DirectoryMatcher] Using hardcoded Sol locations');
return [
{ name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
{ name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
{ name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
{ name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
{ name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
{ name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
{ name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
{ name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
{ name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
];
}
console.log(`[DirectoryMatcher] Found ${stores.length} Sol Flower locations`);
return stores;
}
catch (error) {
console.error('[DirectoryMatcher] Error scraping Sol directory:', error.message);
// Return hardcoded fallback
return [
{ name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
{ name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
{ name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
{ name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
{ name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
{ name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
{ name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
{ name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
{ name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
];
}
}
/**
* Curaleaf - Has age-gate, so we need hardcoded AZ locations
* In production, this would use Playwright to bypass age-gate
*/
async function scrapeCuraleafDirectory() {
console.log('[DirectoryMatcher] Using hardcoded Curaleaf AZ locations (age-gate blocks simple fetch)...');
// Hardcoded Arizona Curaleaf locations from public knowledge
// These would be scraped via Playwright in production
return [
{ name: 'Curaleaf Phoenix Camelback', city: 'Phoenix', state: 'AZ', address: '4811 E Camelback Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-camelback' },
{ name: 'Curaleaf Phoenix Midtown', city: 'Phoenix', state: 'AZ', address: '1928 E Highland Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-midtown' },
{ name: 'Curaleaf Glendale East', city: 'Glendale', state: 'AZ', address: '5150 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-east' },
{ name: 'Curaleaf Glendale West', city: 'Glendale', state: 'AZ', address: '6501 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-west' },
{ name: 'Curaleaf Gilbert', city: 'Gilbert', state: 'AZ', address: '1736 E Williams Field Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-gilbert' },
{ name: 'Curaleaf Mesa', city: 'Mesa', state: 'AZ', address: '1540 S Power Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-mesa' },
{ name: 'Curaleaf Tempe', city: 'Tempe', state: 'AZ', address: '1815 E Broadway Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tempe' },
{ name: 'Curaleaf Scottsdale', city: 'Scottsdale', state: 'AZ', address: '8904 E Indian Bend Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-scottsdale' },
{ name: 'Curaleaf Tucson Prince', city: 'Tucson', state: 'AZ', address: '3955 W Prince Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-prince' },
{ name: 'Curaleaf Tucson Midvale', city: 'Tucson', state: 'AZ', address: '2936 N Midvale Park Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-midvale' },
{ name: 'Curaleaf Sedona', city: 'Sedona', state: 'AZ', address: '525 AZ-179', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-sedona' },
{ name: 'Curaleaf Youngtown', city: 'Youngtown', state: 'AZ', address: '11125 W Grand Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-youngtown' },
];
}
/**
* Match a directory store to an existing dispensary
*/
function matchStoreToDispensary(store, dispensaries) {
const normalizedStoreName = normalizeForComparison(store.name);
const normalizedStoreCity = normalizeCity(store.city);
let bestMatch = null;
let bestScore = 0;
let matchReason = '';
for (const disp of dispensaries) {
const normalizedDispName = normalizeForComparison(disp.name);
const normalizedDispCity = normalizeCity(disp.city || '');
let score = 0;
const reasons = [];
// 1. Name similarity (max 50 points)
const nameSimilarity = stringSimilarity(normalizedStoreName, normalizedDispName);
score += nameSimilarity * 50;
if (nameSimilarity > 0.8)
reasons.push(`name_match(${(nameSimilarity * 100).toFixed(0)}%)`);
// 2. City match (25 points for exact, 15 for partial)
if (normalizedStoreCity && normalizedDispCity) {
if (normalizedStoreCity === normalizedDispCity) {
score += 25;
reasons.push('city_exact');
}
else if (normalizedStoreCity.includes(normalizedDispCity) ||
normalizedDispCity.includes(normalizedStoreCity)) {
score += 15;
reasons.push('city_partial');
}
}
// 3. Address contains street name (15 points)
if (store.address && disp.address) {
const storeStreet = store.address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
const dispStreet = disp.address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
if (storeStreet && dispStreet && stringSimilarity(storeStreet, dispStreet) > 0.7) {
score += 15;
reasons.push('address_match');
}
}
// 4. Brand name in dispensary name (10 points)
const brandName = store.name.split(' ')[0].toLowerCase(); // e.g., "Curaleaf", "Sol"
if (disp.name.toLowerCase().includes(brandName)) {
score += 10;
reasons.push('brand_match');
}
if (score > bestScore) {
bestScore = score;
bestMatch = disp;
matchReason = reasons.join(', ');
}
}
// Determine confidence level
let confidence;
if (bestScore >= 70) {
confidence = 'high';
}
else if (bestScore >= 50) {
confidence = 'medium';
}
else if (bestScore >= 30) {
confidence = 'low';
}
else {
confidence = 'none';
}
return {
directoryStore: store,
dispensaryId: bestMatch?.id || null,
dispensaryName: bestMatch?.name || null,
confidence,
matchReason: matchReason || 'no_match',
};
}
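// --- Editor's illustrative sketch (not part of the original commit; never invoked) ---
// The records below are fabricated, but they show how the bands above add up:
// an exact city (25), a street overlap (15) and a brand hit (10) already give 50,
// so even a moderate name similarity pushes the total past the "high" cutoff of 70.
function exampleMatchScoring() {
    const store = { name: 'Curaleaf Gilbert', city: 'Gilbert', state: 'AZ', address: '1736 E Williams Field Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-gilbert' };
    const dispensaries = [
        { id: 42, name: 'Curaleaf Dispensary Gilbert', city: 'Gilbert', state: 'AZ', address: '1736 E Williams Field Rd' },
    ];
    // Expected (given the bands above): confidence 'high', dispensaryId 42
    return matchStoreToDispensary(store, dispensaries);
}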
// ============================================================
// MAIN FUNCTIONS
// ============================================================
/**
* Run directory matching for a provider and update database
* Only applies high-confidence matches automatically
*/
async function matchDirectoryToDispensaries(provider, dryRun = true) {
console.log(`[DirectoryMatcher] Running ${provider} directory matching (dryRun=${dryRun})...`);
// Get directory stores
let directoryStores;
if (provider === 'curaleaf') {
directoryStores = await scrapeCuraleafDirectory();
}
else if (provider === 'sol') {
directoryStores = await scrapeSolDirectory();
}
else {
throw new Error(`Unknown provider: ${provider}`);
}
// Get all AZ dispensaries from database
const { rows: dispensaries } = await (0, connection_1.query)(`SELECT id, name, city, state, address, menu_type, menu_url, website
FROM dispensaries
WHERE state = 'AZ'`);
console.log(`[DirectoryMatcher] Matching ${directoryStores.length} directory stores against ${dispensaries.length} dispensaries`);
// Match each directory store
const results = [];
for (const store of directoryStores) {
const match = matchStoreToDispensary(store, dispensaries);
results.push(match);
// Only apply high-confidence matches if not dry run
if (!dryRun && match.confidence === 'high' && match.dispensaryId) {
await applyDirectoryMatch(match.dispensaryId, provider, store);
}
}
// Count results
const report = {
provider,
totalDirectoryStores: directoryStores.length,
highConfidenceMatches: results.filter((r) => r.confidence === 'high').length,
mediumConfidenceMatches: results.filter((r) => r.confidence === 'medium').length,
lowConfidenceMatches: results.filter((r) => r.confidence === 'low').length,
unmatched: results.filter((r) => r.confidence === 'none').length,
results,
};
console.log(`[DirectoryMatcher] ${provider} matching complete:`);
console.log(` - High confidence: ${report.highConfidenceMatches}`);
console.log(` - Medium confidence: ${report.mediumConfidenceMatches}`);
console.log(` - Low confidence: ${report.lowConfidenceMatches}`);
console.log(` - Unmatched: ${report.unmatched}`);
return report;
}
/**
* Apply a directory match to a dispensary
*/
async function applyDirectoryMatch(dispensaryId, provider, store) {
console.log(`[DirectoryMatcher] Applying match: dispensary ${dispensaryId} -> ${store.storeUrl}`);
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = $1,
menu_url = $2,
platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $1::text,
'detection_method', 'directory_match'::text,
'detected_at', NOW(),
'directory_store_name', $3::text,
'directory_store_url', $2::text,
'directory_store_city', $4::text,
'directory_store_address', $5::text,
'not_crawlable', true,
'not_crawlable_reason', $6::text
),
updated_at = NOW()
WHERE id = $7
`, [
provider,
store.storeUrl,
store.name,
store.city,
store.address,
`${provider} proprietary menu - no crawler available`,
dispensaryId,
]);
}
/**
* Preview matches without applying them
*/
async function previewDirectoryMatches(provider) {
return matchDirectoryToDispensaries(provider, true);
}
/**
* Apply high-confidence matches
*/
async function applyHighConfidenceMatches(provider) {
return matchDirectoryToDispensaries(provider, false);
}
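// --- Editor's illustrative sketch (not part of the original commit; never invoked) ---
// Typical operator flow using the two wrappers above: dry-run preview first,
// inspect the report, then apply (which only writes high-confidence matches).
async function exampleSolMatchingFlow() {
    const preview = await previewDirectoryMatches('sol'); // dryRun=true, no DB writes
    console.log(`high=${preview.highConfidenceMatches} medium=${preview.mediumConfidenceMatches} unmatched=${preview.unmatched}`);
    if (preview.highConfidenceMatches > 0) {
        await applyHighConfidenceMatches('sol');
    }
}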

View File

@@ -0,0 +1,487 @@
"use strict";
/**
* Dutchie AZ Discovery Service
*
* Discovers and manages dispensaries from Dutchie for Arizona.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.importFromExistingDispensaries = importFromExistingDispensaries;
exports.discoverDispensaries = discoverDispensaries;
exports.extractCNameFromMenuUrl = extractCNameFromMenuUrl;
exports.resolvePlatformDispensaryIds = resolvePlatformDispensaryIds;
exports.getAllDispensaries = getAllDispensaries;
exports.mapDbRowToDispensary = mapDbRowToDispensary;
exports.getDispensaryById = getDispensaryById;
exports.getDispensariesWithPlatformIds = getDispensariesWithPlatformIds;
exports.reResolveDispensaryPlatformId = reResolveDispensaryPlatformId;
exports.updateMenuUrlAndResolve = updateMenuUrlAndResolve;
exports.markDispensaryNotCrawlable = markDispensaryNotCrawlable;
exports.getDispensaryCName = getDispensaryCName;
const connection_1 = require("../db/connection");
const graphql_client_1 = require("./graphql-client");
/**
* Upsert a dispensary record
*/
async function upsertDispensary(dispensary) {
const result = await (0, connection_1.query)(`
INSERT INTO dispensaries (
platform, name, slug, city, state, postal_code, address,
latitude, longitude, platform_dispensary_id,
is_delivery, is_pickup, raw_metadata, updated_at
) VALUES (
$1, $2, $3, $4, $5, $6, $7,
$8, $9, $10,
$11, $12, $13, NOW()
)
ON CONFLICT (platform, slug, city, state) DO UPDATE SET
name = EXCLUDED.name,
postal_code = EXCLUDED.postal_code,
address = EXCLUDED.address,
latitude = EXCLUDED.latitude,
longitude = EXCLUDED.longitude,
platform_dispensary_id = COALESCE(EXCLUDED.platform_dispensary_id, dispensaries.platform_dispensary_id),
is_delivery = EXCLUDED.is_delivery,
is_pickup = EXCLUDED.is_pickup,
raw_metadata = EXCLUDED.raw_metadata,
updated_at = NOW()
RETURNING id
`, [
dispensary.platform || 'dutchie',
dispensary.name,
dispensary.slug,
dispensary.city,
dispensary.state || 'AZ',
dispensary.postalCode,
dispensary.address,
dispensary.latitude,
dispensary.longitude,
dispensary.platformDispensaryId,
dispensary.isDelivery || false,
        dispensary.isPickup ?? true,
dispensary.rawMetadata ? JSON.stringify(dispensary.rawMetadata) : null,
]);
return result.rows[0].id;
}
/**
* Normalize a raw discovery result to Dispensary
*/
function normalizeDispensary(raw) {
return {
platform: 'dutchie',
name: raw.name || raw.Name || '',
slug: raw.slug || raw.cName || raw.id || '',
city: raw.city || raw.address?.city || '',
state: 'AZ',
postalCode: raw.postalCode || raw.address?.postalCode || raw.address?.zip,
address: raw.streetAddress || raw.address?.streetAddress,
latitude: raw.latitude || raw.location?.lat,
longitude: raw.longitude || raw.location?.lng,
platformDispensaryId: raw.dispensaryId || raw.id || null,
isDelivery: raw.isDelivery || raw.delivery || false,
        isPickup: raw.isPickup ?? raw.pickup ?? true,
rawMetadata: raw,
};
}
/**
* Import dispensaries from the existing dispensaries table (from AZDHS data)
* This creates records in the dutchie_az database for AZ dispensaries
*/
async function importFromExistingDispensaries() {
console.log('[Discovery] Importing from existing dispensaries table...');
// This is a workaround - we'll use the dispensaries we already know about
// and try to resolve their Dutchie IDs
const knownDispensaries = [
{ name: 'Deeply Rooted', slug: 'AZ-Deeply-Rooted', city: 'Phoenix', state: 'AZ' },
{ name: 'Curaleaf Gilbert', slug: 'curaleaf-gilbert', city: 'Gilbert', state: 'AZ' },
{ name: 'Zen Leaf Prescott', slug: 'AZ-zen-leaf-prescott', city: 'Prescott', state: 'AZ' },
// Add more known Dutchie stores here
];
let imported = 0;
for (const disp of knownDispensaries) {
try {
const id = await upsertDispensary({
platform: 'dutchie',
name: disp.name,
slug: disp.slug,
city: disp.city,
state: disp.state,
});
imported++;
console.log(`[Discovery] Imported: ${disp.name} (id=${id})`);
}
catch (error) {
console.error(`[Discovery] Failed to import ${disp.name}:`, error.message);
}
}
return { imported };
}
/**
* Discover all Arizona Dutchie dispensaries via GraphQL
*/
async function discoverDispensaries() {
console.log('[Discovery] Starting Arizona dispensary discovery...');
const errors = [];
let discovered = 0;
try {
const rawDispensaries = await (0, graphql_client_1.discoverArizonaDispensaries)();
console.log(`[Discovery] Found ${rawDispensaries.length} dispensaries from GraphQL`);
for (const raw of rawDispensaries) {
try {
const normalized = normalizeDispensary(raw);
if (normalized.name && normalized.slug && normalized.city) {
await upsertDispensary(normalized);
discovered++;
}
}
catch (error) {
errors.push(`${raw.name || raw.slug}: ${error.message}`);
}
}
}
catch (error) {
errors.push(`Discovery failed: ${error.message}`);
}
console.log(`[Discovery] Completed: ${discovered} dispensaries, ${errors.length} errors`);
return { discovered, errors };
}
/**
* Extract cName (slug) from a Dutchie menu_url
* Supports formats:
* - https://dutchie.com/embedded-menu/<cName>
* - https://dutchie.com/dispensary/<cName>
*/
function extractCNameFromMenuUrl(menuUrl) {
if (!menuUrl)
return null;
try {
const url = new URL(menuUrl);
const pathname = url.pathname;
// Match /embedded-menu/<cName> or /dispensary/<cName>
const embeddedMatch = pathname.match(/^\/embedded-menu\/([^/?]+)/);
if (embeddedMatch)
return embeddedMatch[1];
const dispensaryMatch = pathname.match(/^\/dispensary\/([^/?]+)/);
if (dispensaryMatch)
return dispensaryMatch[1];
return null;
}
catch {
return null;
}
}
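// --- Editor's illustrative examples (not part of the original commit) ---
//   extractCNameFromMenuUrl('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted')           // 'AZ-Deeply-Rooted'
//   extractCNameFromMenuUrl('https://dutchie.com/dispensary/curaleaf-gilbert?menuType=rec') // 'curaleaf-gilbert'
//   extractCNameFromMenuUrl('https://www.livewithsol.com/locations/sun-city/')              // null (not a Dutchie URL)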
/**
* Resolve platform dispensary IDs for all dispensaries that don't have one
* CRITICAL: Uses cName extracted from menu_url, NOT the slug column!
*
* Uses the new resolveDispensaryIdWithDetails which:
* 1. Extracts dispensaryId from window.reactEnv in the embedded menu page (preferred)
* 2. Falls back to GraphQL if reactEnv extraction fails
* 3. Returns HTTP status so we can mark 403/404 stores as not_crawlable
*/
async function resolvePlatformDispensaryIds() {
console.log('[Discovery] Resolving platform dispensary IDs...');
const { rows: dispensaries } = await (0, connection_1.query)(`
SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id, crawl_status
FROM dispensaries
WHERE menu_type = 'dutchie'
AND platform_dispensary_id IS NULL
AND menu_url IS NOT NULL
AND (crawl_status IS NULL OR crawl_status != 'not_crawlable')
ORDER BY id
`);
let resolved = 0;
let failed = 0;
let skipped = 0;
let notCrawlable = 0;
for (const dispensary of dispensaries) {
try {
// Extract cName from menu_url - this is the CORRECT way to get the Dutchie slug
const cName = extractCNameFromMenuUrl(dispensary.menu_url);
if (!cName) {
console.log(`[Discovery] Skipping ${dispensary.name}: Could not extract cName from menu_url: ${dispensary.menu_url}`);
skipped++;
continue;
}
console.log(`[Discovery] Resolving ID for: ${dispensary.name} (cName=${cName}, menu_url=${dispensary.menu_url})`);
// Use the new detailed resolver that extracts from reactEnv first
const result = await (0, graphql_client_1.resolveDispensaryIdWithDetails)(cName);
if (result.dispensaryId) {
// SUCCESS: Store resolved
await (0, connection_1.query)(`
UPDATE dispensaries
SET platform_dispensary_id = $1,
platform_dispensary_id_resolved_at = NOW(),
crawl_status = 'ready',
crawl_status_reason = $2,
crawl_status_updated_at = NOW(),
last_tested_menu_url = $3,
last_http_status = $4,
updated_at = NOW()
WHERE id = $5
`, [
result.dispensaryId,
`Resolved from ${result.source || 'page'}`,
dispensary.menu_url,
result.httpStatus,
dispensary.id,
]);
resolved++;
console.log(`[Discovery] Resolved: ${cName} -> ${result.dispensaryId} (source: ${result.source})`);
}
else if (result.httpStatus === 403 || result.httpStatus === 404) {
// NOT CRAWLABLE: Store removed or not accessible
await (0, connection_1.query)(`
UPDATE dispensaries
SET platform_dispensary_id = NULL,
crawl_status = 'not_crawlable',
crawl_status_reason = $1,
crawl_status_updated_at = NOW(),
last_tested_menu_url = $2,
last_http_status = $3,
updated_at = NOW()
WHERE id = $4
`, [
result.error || `HTTP ${result.httpStatus}: Removed from Dutchie`,
dispensary.menu_url,
result.httpStatus,
dispensary.id,
]);
notCrawlable++;
console.log(`[Discovery] Marked not crawlable: ${cName} (HTTP ${result.httpStatus})`);
}
else {
// FAILED: Could not resolve but page loaded
await (0, connection_1.query)(`
UPDATE dispensaries
SET crawl_status = 'not_ready',
crawl_status_reason = $1,
crawl_status_updated_at = NOW(),
last_tested_menu_url = $2,
last_http_status = $3,
updated_at = NOW()
WHERE id = $4
`, [
result.error || 'Could not extract dispensaryId from page',
dispensary.menu_url,
result.httpStatus,
dispensary.id,
]);
failed++;
console.log(`[Discovery] Could not resolve: ${cName} - ${result.error}`);
}
// Delay between requests
await new Promise((r) => setTimeout(r, 2000));
}
catch (error) {
failed++;
console.error(`[Discovery] Error resolving ${dispensary.name}:`, error.message);
}
}
console.log(`[Discovery] Completed: ${resolved} resolved, ${failed} failed, ${skipped} skipped, ${notCrawlable} not crawlable`);
return { resolved, failed, skipped, notCrawlable };
}
/**
* Get all dispensaries
*/
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
const DISPENSARY_COLUMNS = `
id, name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at
`;
async function getAllDispensaries() {
const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE menu_type = 'dutchie' ORDER BY name`);
return rows.map(mapDbRowToDispensary);
}
/**
* Map snake_case DB row to camelCase Dispensary object
* CRITICAL: DB returns snake_case (platform_dispensary_id) but TypeScript expects camelCase (platformDispensaryId)
* This function is exported for use in other modules that query dispensaries directly.
*
* NOTE: The consolidated dispensaries table column mappings:
* - zip → postalCode
* - menu_type → menuType (keep platform as 'dutchie')
* - last_crawl_at → lastCrawledAt
* - platform_dispensary_id → platformDispensaryId
*/
function mapDbRowToDispensary(row) {
// Extract website from raw_metadata if available (field may not exist in all environments)
let rawMetadata = undefined;
if (row.raw_metadata !== undefined) {
rawMetadata = typeof row.raw_metadata === 'string'
? JSON.parse(row.raw_metadata)
: row.raw_metadata;
}
const website = row.website || rawMetadata?.website || undefined;
return {
id: row.id,
platform: row.platform || 'dutchie', // keep platform as-is, default to 'dutchie'
name: row.name,
slug: row.slug,
city: row.city,
state: row.state,
postalCode: row.postalCode || row.zip || row.postal_code,
latitude: row.latitude ? parseFloat(row.latitude) : undefined,
longitude: row.longitude ? parseFloat(row.longitude) : undefined,
address: row.address,
platformDispensaryId: row.platformDispensaryId || row.platform_dispensary_id, // CRITICAL mapping!
isDelivery: row.is_delivery,
isPickup: row.is_pickup,
rawMetadata: rawMetadata,
lastCrawledAt: row.lastCrawledAt || row.last_crawl_at, // use last_crawl_at
productCount: row.product_count,
createdAt: row.created_at,
updatedAt: row.updated_at,
menuType: row.menuType || row.menu_type,
menuUrl: row.menuUrl || row.menu_url,
scrapeEnabled: row.scrapeEnabled ?? row.scrape_enabled,
providerDetectionData: row.provider_detection_data,
platformDispensaryIdResolvedAt: row.platform_dispensary_id_resolved_at,
website,
};
}
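// --- Editor's illustrative sketch (not part of the original commit; never invoked) ---
// Demonstrates the snake_case -> camelCase mapping for the fields the crawler
// depends on; the row values are made up.
function exampleMapRow() {
    const row = {
        id: 1,
        name: 'Example Dispensary',
        slug: 'example-dispensary',
        city: 'Phoenix',
        state: 'AZ',
        zip: '85001',
        menu_type: 'dutchie',
        menu_url: 'https://dutchie.com/embedded-menu/example-dispensary',
        platform_dispensary_id: '6405ef617056e8014d79101b',
    };
    const disp = mapDbRowToDispensary(row);
    // disp.menuType === 'dutchie', disp.postalCode === '85001',
    // disp.platformDispensaryId === '6405ef617056e8014d79101b'
    return disp;
}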
/**
* Get dispensary by ID
* NOTE: Uses SQL aliases to map snake_case → camelCase directly
*/
async function getDispensaryById(id) {
const { rows } = await (0, connection_1.query)(`
SELECT
id,
name,
slug,
city,
state,
zip AS "postalCode",
address,
latitude,
longitude,
menu_type AS "menuType",
menu_url AS "menuUrl",
platform_dispensary_id AS "platformDispensaryId",
website,
provider_detection_data AS "providerDetectionData",
created_at,
updated_at
FROM dispensaries
WHERE id = $1
`, [id]);
if (!rows[0])
return null;
return mapDbRowToDispensary(rows[0]);
}
/**
* Get dispensaries with platform IDs (ready for crawling)
*/
async function getDispensariesWithPlatformIds() {
const { rows } = await (0, connection_1.query)(`
SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
ORDER BY name
`);
return rows.map(mapDbRowToDispensary);
}
/**
* Re-resolve a single dispensary's platform ID
* Clears the existing ID and re-resolves from the menu_url cName
*/
async function reResolveDispensaryPlatformId(dispensaryId) {
console.log(`[Discovery] Re-resolving platform ID for dispensary ${dispensaryId}...`);
const dispensary = await getDispensaryById(dispensaryId);
if (!dispensary) {
return { success: false, platformId: null, cName: null, error: 'Dispensary not found' };
}
const cName = extractCNameFromMenuUrl(dispensary.menuUrl);
if (!cName) {
console.log(`[Discovery] Could not extract cName from menu_url: ${dispensary.menuUrl}`);
return {
success: false,
platformId: null,
cName: null,
error: `Could not extract cName from menu_url: ${dispensary.menuUrl}`,
};
}
console.log(`[Discovery] Extracted cName: ${cName} from menu_url: ${dispensary.menuUrl}`);
try {
const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName);
if (platformId) {
await (0, connection_1.query)(`
UPDATE dispensaries
SET platform_dispensary_id = $1,
platform_dispensary_id_resolved_at = NOW(),
updated_at = NOW()
WHERE id = $2
`, [platformId, dispensaryId]);
console.log(`[Discovery] Resolved: ${cName} -> ${platformId}`);
return { success: true, platformId, cName };
}
else {
// Clear the invalid platform ID and mark as not crawlable
await (0, connection_1.query)(`
UPDATE dispensaries
SET platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
'{"resolution_error": "cName no longer exists on Dutchie", "not_crawlable": true}'::jsonb,
updated_at = NOW()
WHERE id = $1
`, [dispensaryId]);
console.log(`[Discovery] Could not resolve: ${cName} - marked as not crawlable`);
return {
success: false,
platformId: null,
cName,
error: `cName "${cName}" no longer exists on Dutchie`,
};
}
}
catch (error) {
console.error(`[Discovery] Error resolving ${cName}:`, error.message);
return { success: false, platformId: null, cName, error: error.message };
}
}
/**
* Update menu_url for a dispensary and re-resolve platform ID
*/
async function updateMenuUrlAndResolve(dispensaryId, newMenuUrl) {
console.log(`[Discovery] Updating menu_url for dispensary ${dispensaryId} to: ${newMenuUrl}`);
const cName = extractCNameFromMenuUrl(newMenuUrl);
if (!cName) {
return {
success: false,
platformId: null,
cName: null,
error: `Could not extract cName from new menu_url: ${newMenuUrl}`,
};
}
// Update the menu_url first
await (0, connection_1.query)(`
UPDATE dispensaries
SET menu_url = $1,
menu_type = 'dutchie',
platform_dispensary_id = NULL,
updated_at = NOW()
WHERE id = $2
`, [newMenuUrl, dispensaryId]);
// Now resolve the platform ID with the new cName
return await reResolveDispensaryPlatformId(dispensaryId);
}
/**
* Mark a dispensary as not crawlable (when resolution fails permanently)
*/
async function markDispensaryNotCrawlable(dispensaryId, reason) {
await (0, connection_1.query)(`
UPDATE dispensaries
SET platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object('not_crawlable', true, 'not_crawlable_reason', $1::text, 'not_crawlable_at', NOW()::text),
updated_at = NOW()
WHERE id = $2
`, [reason, dispensaryId]);
console.log(`[Discovery] Marked dispensary ${dispensaryId} as not crawlable: ${reason}`);
}
/**
* Get the cName for a dispensary (extracted from menu_url)
*/
function getDispensaryCName(dispensary) {
return extractCNameFromMenuUrl(dispensary.menuUrl);
}

View File

@@ -0,0 +1,538 @@
"use strict";
/**
* Dutchie GraphQL Client
*
* Uses Puppeteer to establish a session (get CF cookies), then makes
* SERVER-SIDE fetch calls to api-gw.dutchie.com with those cookies.
*
* DUTCHIE FETCH RULES:
* 1. Server-side only - use axios (never browser fetch with CORS)
 * 2. Dispensary lookups use dispensaryFilter.cNameOrID; product queries use productsFilter.dispensaryId (see buildFilterVariables)
* 3. Headers must mimic Chrome: User-Agent, Origin, Referer
* 4. If 403, extract CF cookies from Puppeteer session and include them
* 5. Log status codes, error bodies, and product counts
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = void 0;
exports.resolveDispensaryId = resolveDispensaryId;
exports.resolveDispensaryIdWithDetails = resolveDispensaryIdWithDetails;
exports.discoverArizonaDispensaries = discoverArizonaDispensaries;
exports.fetchAllProducts = fetchAllProducts;
exports.fetchAllProductsBothModes = fetchAllProductsBothModes;
const axios_1 = __importDefault(require("axios"));
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const dutchie_1 = require("../config/dutchie");
Object.defineProperty(exports, "GRAPHQL_HASHES", { enumerable: true, get: function () { return dutchie_1.GRAPHQL_HASHES; } });
Object.defineProperty(exports, "ARIZONA_CENTERPOINTS", { enumerable: true, get: function () { return dutchie_1.ARIZONA_CENTERPOINTS; } });
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
/**
* Create a session by navigating to the embedded menu page
* and extracting CF clearance cookies for server-side requests.
* Also extracts dispensaryId from window.reactEnv if available.
*/
async function createSession(cName) {
const browser = await puppeteer_extra_1.default.launch({
headless: 'new',
args: dutchie_1.dutchieConfig.browserArgs,
});
const page = await browser.newPage();
const userAgent = dutchie_1.dutchieConfig.userAgent;
await page.setUserAgent(userAgent);
await page.setViewport({ width: 1920, height: 1080 });
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
window.chrome = { runtime: {} };
});
// Navigate to the embedded menu page for this dispensary
const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;
console.log(`[GraphQL Client] Loading ${embeddedMenuUrl} to get CF cookies...`);
let httpStatus;
let dispensaryId;
try {
const response = await page.goto(embeddedMenuUrl, {
waitUntil: 'networkidle2',
timeout: dutchie_1.dutchieConfig.navigationTimeout,
});
httpStatus = response?.status();
await new Promise((r) => setTimeout(r, dutchie_1.dutchieConfig.pageLoadDelay));
// Try to extract dispensaryId from window.reactEnv
try {
dispensaryId = await page.evaluate(() => {
return window.reactEnv?.dispensaryId || null;
});
if (dispensaryId) {
console.log(`[GraphQL Client] Extracted dispensaryId from reactEnv: ${dispensaryId}`);
}
}
catch (evalError) {
console.log(`[GraphQL Client] Could not extract dispensaryId from reactEnv: ${evalError.message}`);
}
}
catch (error) {
console.warn(`[GraphQL Client] Navigation warning: ${error.message}`);
// Continue anyway - we may have gotten cookies
}
// Extract cookies
const cookies = await page.cookies();
const cookieString = cookies.map((c) => `${c.name}=${c.value}`).join('; ');
console.log(`[GraphQL Client] Got ${cookies.length} cookies, HTTP status: ${httpStatus}`);
if (cookies.length > 0) {
console.log(`[GraphQL Client] Cookie names: ${cookies.map(c => c.name).join(', ')}`);
}
return { cookies: cookieString, userAgent, browser, page, dispensaryId, httpStatus };
}
/**
* Close session (browser)
*/
async function closeSession(session) {
await session.browser.close();
}
// ============================================================
// SERVER-SIDE GRAPHQL FETCH USING AXIOS
// ============================================================
/**
* Build headers that mimic a real browser request
*/
function buildHeaders(session, cName) {
const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;
return {
'accept': 'application/json, text/plain, */*',
'accept-language': 'en-US,en;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'content-type': 'application/json',
'origin': 'https://dutchie.com',
'referer': embeddedMenuUrl,
'user-agent': session.userAgent,
'apollographql-client-name': 'Marketplace (production)',
'sec-ch-ua': '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
...(session.cookies ? { 'cookie': session.cookies } : {}),
};
}
/**
* Execute GraphQL query server-side using axios
* Uses cookies from the browser session to bypass CF
*/
async function executeGraphQL(session, operationName, variables, hash, cName) {
const endpoint = dutchie_1.dutchieConfig.graphqlEndpoint;
const headers = buildHeaders(session, cName);
// Build request body for POST
const body = {
operationName,
variables,
extensions: {
persistedQuery: { version: 1, sha256Hash: hash },
},
};
console.log(`[GraphQL Client] POST: ${operationName} -> ${endpoint}`);
console.log(`[GraphQL Client] Variables: ${JSON.stringify(variables).slice(0, 300)}...`);
try {
const response = await axios_1.default.post(endpoint, body, {
headers,
timeout: 30000,
validateStatus: () => true, // Don't throw on non-2xx
});
// Log response details
console.log(`[GraphQL Client] Response status: ${response.status}`);
if (response.status !== 200) {
const bodyPreview = typeof response.data === 'string'
? response.data.slice(0, 500)
: JSON.stringify(response.data).slice(0, 500);
console.error(`[GraphQL Client] HTTP ${response.status}: ${bodyPreview}`);
throw new Error(`HTTP ${response.status}`);
}
// Check for GraphQL errors
if (response.data?.errors && response.data.errors.length > 0) {
console.error(`[GraphQL Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`);
}
return response.data;
}
catch (error) {
if (axios_1.default.isAxiosError(error)) {
const axiosError = error;
console.error(`[GraphQL Client] Axios error: ${axiosError.message}`);
if (axiosError.response) {
console.error(`[GraphQL Client] Response status: ${axiosError.response.status}`);
console.error(`[GraphQL Client] Response data: ${JSON.stringify(axiosError.response.data).slice(0, 500)}`);
}
if (axiosError.code) {
console.error(`[GraphQL Client] Error code: ${axiosError.code}`);
}
}
else {
console.error(`[GraphQL Client] Error: ${error.message}`);
}
throw error;
}
}
/**
* Resolve a dispensary slug to its internal platform ID.
*
* STRATEGY:
* 1. Navigate to embedded menu page and extract window.reactEnv.dispensaryId (preferred)
* 2. Fall back to GraphQL GetAddressBasedDispensaryData query if reactEnv fails
*
 * Returns the dispensaryId (platform_dispensary_id) or null if not found.
 * Does not throw on 403/404; use resolveDispensaryIdWithDetails to get the HTTP
 * status so the caller can mark the store as not_crawlable.
*/
async function resolveDispensaryId(slug) {
const result = await resolveDispensaryIdWithDetails(slug);
return result.dispensaryId;
}
/**
* Resolve a dispensary slug with full details (HTTP status, source, error).
* Use this when you need to know WHY resolution failed.
*/
async function resolveDispensaryIdWithDetails(slug) {
console.log(`[GraphQL Client] Resolving dispensary ID for slug: ${slug}`);
const session = await createSession(slug);
try {
// Check HTTP status first - if 403/404, the store is not crawlable
if (session.httpStatus && (session.httpStatus === 403 || session.httpStatus === 404)) {
console.log(`[GraphQL Client] Page returned HTTP ${session.httpStatus} for ${slug} - not crawlable`);
return {
dispensaryId: null,
httpStatus: session.httpStatus,
error: `HTTP ${session.httpStatus}: Store removed or not accessible`,
source: 'reactEnv',
};
}
// PREFERRED: Use dispensaryId from window.reactEnv (extracted during createSession)
if (session.dispensaryId) {
console.log(`[GraphQL Client] Resolved ${slug} -> ${session.dispensaryId} (from reactEnv)`);
return {
dispensaryId: session.dispensaryId,
httpStatus: session.httpStatus,
source: 'reactEnv',
};
}
// FALLBACK: Try GraphQL query
console.log(`[GraphQL Client] reactEnv.dispensaryId not found for ${slug}, trying GraphQL...`);
const variables = {
dispensaryFilter: {
cNameOrID: slug,
},
};
const result = await executeGraphQL(session, 'GetAddressBasedDispensaryData', variables, dutchie_1.GRAPHQL_HASHES.GetAddressBasedDispensaryData, slug);
const dispensaryId = result?.data?.dispensaryBySlug?.id ||
result?.data?.dispensary?.id ||
result?.data?.getAddressBasedDispensaryData?.dispensary?.id;
if (dispensaryId) {
console.log(`[GraphQL Client] Resolved ${slug} -> ${dispensaryId} (from GraphQL)`);
return {
dispensaryId,
httpStatus: session.httpStatus,
source: 'graphql',
};
}
console.log(`[GraphQL Client] Could not resolve ${slug}, GraphQL response:`, JSON.stringify(result).slice(0, 300));
return {
dispensaryId: null,
httpStatus: session.httpStatus,
error: 'Could not extract dispensaryId from reactEnv or GraphQL',
};
}
finally {
await closeSession(session);
}
}
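// --- Editor's illustrative sketch (not part of the original commit; never invoked) ---
// How a caller can branch on the detailed result; mirrors the handling in
// resolvePlatformDispensaryIds in the discovery service.
async function exampleResolveWithDetails() {
    const result = await resolveDispensaryIdWithDetails('AZ-Deeply-Rooted');
    if (result.dispensaryId) {
        console.log(`ready: ${result.dispensaryId} (source=${result.source})`);
    }
    else if (result.httpStatus === 403 || result.httpStatus === 404) {
        console.log(`not crawlable: HTTP ${result.httpStatus}`);
    }
    else {
        console.log(`not ready: ${result.error}`);
    }
}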
/**
* Discover Arizona dispensaries via geo-based query
*/
async function discoverArizonaDispensaries() {
console.log('[GraphQL Client] Discovering Arizona dispensaries...');
    // Establish the CF session via a known AZ store's embedded menu (Deeply Rooted, Phoenix)
const session = await createSession('AZ-Deeply-Rooted');
const allDispensaries = [];
const seenIds = new Set();
try {
for (const centerpoint of dutchie_1.ARIZONA_CENTERPOINTS) {
console.log(`[GraphQL Client] Scanning ${centerpoint.name}...`);
const variables = {
dispensariesFilter: {
latitude: centerpoint.lat,
longitude: centerpoint.lng,
distance: 100,
state: 'AZ',
},
};
try {
const result = await executeGraphQL(session, 'ConsumerDispensaries', variables, dutchie_1.GRAPHQL_HASHES.ConsumerDispensaries, 'AZ-Deeply-Rooted');
const dispensaries = result?.data?.consumerDispensaries || [];
for (const d of dispensaries) {
const id = d.id || d.dispensaryId;
if (id && !seenIds.has(id)) {
seenIds.add(id);
allDispensaries.push(d);
}
}
console.log(`[GraphQL Client] Found ${dispensaries.length} in ${centerpoint.name} (${allDispensaries.length} total unique)`);
}
catch (error) {
console.warn(`[GraphQL Client] Error scanning ${centerpoint.name}: ${error.message}`);
}
// Delay between requests
await new Promise((r) => setTimeout(r, 1000));
}
}
finally {
await closeSession(session);
}
console.log(`[GraphQL Client] Discovery complete: ${allDispensaries.length} dispensaries`);
return allDispensaries;
}
// ============================================================
// PRODUCT FILTERING VARIABLES
// ============================================================
/**
* Build filter variables for FilteredProducts query
*
* CRITICAL: Uses dispensaryId directly (the MongoDB ObjectId, e.g. "6405ef617056e8014d79101b")
* NOT dispensaryFilter.cNameOrID!
*
* The actual browser request structure is:
* {
* "productsFilter": {
* "dispensaryId": "6405ef617056e8014d79101b",
* "pricingType": "rec",
* "Status": "Active", // Mode A only
* "strainTypes": [],
* "subcategories": [],
* "types": [],
* "useCache": true,
* ...
* },
* "page": 0,
* "perPage": 100
* }
*
* Mode A = UI parity (Status: "Active")
* Mode B = MAX COVERAGE (no Status filter)
*/
function buildFilterVariables(platformDispensaryId, pricingType, crawlMode, page, perPage) {
const isModeA = crawlMode === 'mode_a';
// Per CLAUDE.md Rule #11: Use simple productsFilter with dispensaryId directly
// Do NOT use dispensaryFilter.cNameOrID - that's outdated
const productsFilter = {
dispensaryId: platformDispensaryId,
pricingType: pricingType,
};
// Mode A: Only active products (UI parity) - Status: "Active"
// Mode B: MAX COVERAGE (OOS/inactive) - omit Status or set to null
if (isModeA) {
productsFilter.Status = 'Active';
}
// Mode B: No Status filter = returns all products including OOS/inactive
return {
productsFilter,
page,
perPage,
};
}
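// --- Editor's illustrative note (not part of the original commit) ---
// buildFilterVariables('6405ef617056e8014d79101b', 'rec', 'mode_a', 0, 100) returns:
//   { productsFilter: { dispensaryId: '6405ef617056e8014d79101b', pricingType: 'rec', Status: 'Active' }, page: 0, perPage: 100 }
// With 'mode_b' the Status key is omitted entirely, which is what pulls in OOS/inactive products.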
// ============================================================
// PRODUCT FETCHING WITH PAGINATION
// ============================================================
/**
* Fetch products for a single mode with pagination
*/
async function fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode) {
const perPage = dutchie_1.dutchieConfig.perPage;
const maxPages = dutchie_1.dutchieConfig.maxPages;
const maxRetries = dutchie_1.dutchieConfig.maxRetries;
const pageDelayMs = dutchie_1.dutchieConfig.pageDelayMs;
const allProducts = [];
let pageNum = 0;
let totalCount = 0;
let consecutiveEmptyPages = 0;
console.log(`[GraphQL Client] Fetching products for ${cName} (platformId: ${platformDispensaryId}, ${pricingType}, ${crawlMode})...`);
while (pageNum < maxPages) {
const variables = buildFilterVariables(platformDispensaryId, pricingType, crawlMode, pageNum, perPage);
let result = null;
let lastError = null;
// Retry logic
for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
result = await executeGraphQL(session, 'FilteredProducts', variables, dutchie_1.GRAPHQL_HASHES.FilteredProducts, cName);
lastError = null;
break;
}
catch (error) {
lastError = error;
console.warn(`[GraphQL Client] Page ${pageNum} attempt ${attempt + 1} failed: ${error.message}`);
if (attempt < maxRetries) {
await new Promise((r) => setTimeout(r, 1000 * (attempt + 1)));
}
}
}
if (lastError) {
console.error(`[GraphQL Client] Page ${pageNum} failed after ${maxRetries + 1} attempts`);
break;
}
if (result?.errors) {
console.error('[GraphQL Client] GraphQL errors:', JSON.stringify(result.errors));
break;
}
// Log response shape on first page
if (pageNum === 0) {
console.log(`[GraphQL Client] Response keys: ${Object.keys(result || {}).join(', ')}`);
if (result?.data) {
console.log(`[GraphQL Client] data keys: ${Object.keys(result.data || {}).join(', ')}`);
}
if (!result?.data?.filteredProducts) {
console.log(`[GraphQL Client] WARNING: No filteredProducts in response!`);
console.log(`[GraphQL Client] Full response: ${JSON.stringify(result).slice(0, 1000)}`);
}
}
const products = result?.data?.filteredProducts?.products || [];
const queryInfo = result?.data?.filteredProducts?.queryInfo;
if (queryInfo?.totalCount) {
totalCount = queryInfo.totalCount;
}
console.log(`[GraphQL Client] Page ${pageNum}: ${products.length} products (total so far: ${allProducts.length + products.length}/${totalCount})`);
if (products.length === 0) {
consecutiveEmptyPages++;
if (consecutiveEmptyPages >= 2) {
console.log('[GraphQL Client] Multiple empty pages, stopping pagination');
break;
}
}
else {
consecutiveEmptyPages = 0;
allProducts.push(...products);
}
// Stop if incomplete page (last page)
if (products.length < perPage) {
console.log(`[GraphQL Client] Incomplete page (${products.length} < ${perPage}), stopping`);
break;
}
pageNum++;
await new Promise((r) => setTimeout(r, pageDelayMs));
}
console.log(`[GraphQL Client] Fetched ${allProducts.length} total products (${crawlMode})`);
return { products: allProducts, totalCount: totalCount || allProducts.length, crawlMode };
}
// ============================================================
// LEGACY SINGLE-MODE INTERFACE
// ============================================================
/**
* Fetch all products for a dispensary (single mode)
*/
async function fetchAllProducts(platformDispensaryId, pricingType = 'rec', options = {}) {
const { crawlMode = 'mode_a' } = options;
// cName is now REQUIRED - no default fallback to avoid using wrong store's session
const cName = options.cName;
if (!cName) {
throw new Error('[GraphQL Client] cName is required for fetchAllProducts - cannot use another store\'s session');
}
const session = await createSession(cName);
try {
return await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode);
}
finally {
await closeSession(session);
}
}
// ============================================================
// MODE A+B MERGING
// ============================================================
/**
* Merge POSMetaData.children arrays from Mode A and Mode B products
*/
function mergeProductOptions(modeAProduct, modeBProduct) {
const modeAChildren = modeAProduct.POSMetaData?.children || [];
const modeBChildren = modeBProduct.POSMetaData?.children || [];
const getOptionKey = (child) => {
return child.canonicalID || child.canonicalSKU || child.canonicalPackageId || child.option || '';
};
const mergedMap = new Map();
for (const child of modeAChildren) {
const key = getOptionKey(child);
if (key)
mergedMap.set(key, child);
}
for (const child of modeBChildren) {
const key = getOptionKey(child);
if (key && !mergedMap.has(key)) {
mergedMap.set(key, child);
}
}
return Array.from(mergedMap.values());
}
/**
* Merge a Mode A product with a Mode B product
*/
function mergeProducts(modeAProduct, modeBProduct) {
if (!modeBProduct) {
return modeAProduct;
}
const mergedChildren = mergeProductOptions(modeAProduct, modeBProduct);
return {
...modeAProduct,
POSMetaData: {
...modeAProduct.POSMetaData,
children: mergedChildren,
},
};
}
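// --- Editor's illustrative sketch (not part of the original commit; never invoked) ---
// Fabricated variants of one product: the '7g' option only appears in Mode B,
// and the merge unions it into the Mode A product's POSMetaData.children.
function exampleMergeProducts() {
    const modeA = { _id: 'p1', Name: 'Example Flower', POSMetaData: { children: [{ option: '3.5g', canonicalID: 'sku-35' }] } };
    const modeB = { _id: 'p1', Name: 'Example Flower', POSMetaData: { children: [{ option: '3.5g', canonicalID: 'sku-35' }, { option: '7g', canonicalID: 'sku-70' }] } };
    const merged = mergeProducts(modeA, modeB);
    return merged.POSMetaData.children.length; // 2 - the '7g' child is added from Mode B
}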
// ============================================================
// MAIN EXPORT: TWO-MODE CRAWL
// ============================================================
/**
* Fetch products using BOTH crawl modes with SINGLE session
* Runs Mode A then Mode B, merges results
*/
async function fetchAllProductsBothModes(platformDispensaryId, pricingType = 'rec', options = {}) {
// cName is now REQUIRED - no default fallback to avoid using wrong store's session
const cName = options.cName;
if (!cName) {
throw new Error('[GraphQL Client] cName is required for fetchAllProductsBothModes - cannot use another store\'s session');
}
console.log(`[GraphQL Client] Running two-mode crawl for ${cName} (${pricingType})...`);
console.log(`[GraphQL Client] Platform ID: ${platformDispensaryId}, cName: ${cName}`);
const session = await createSession(cName);
try {
// Mode A (UI parity)
const modeAResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_a');
// Delay between modes
await new Promise((r) => setTimeout(r, dutchie_1.dutchieConfig.modeDelayMs));
// Mode B (MAX COVERAGE)
const modeBResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_b');
// Merge results
const modeBMap = new Map();
for (const product of modeBResult.products) {
modeBMap.set(product._id, product);
}
const productMap = new Map();
// Add Mode A products, merging with Mode B if exists
for (const product of modeAResult.products) {
const modeBProduct = modeBMap.get(product._id);
const mergedProduct = mergeProducts(product, modeBProduct);
productMap.set(product._id, mergedProduct);
}
// Add Mode B products not in Mode A
for (const product of modeBResult.products) {
if (!productMap.has(product._id)) {
productMap.set(product._id, product);
}
}
const mergedProducts = Array.from(productMap.values());
console.log(`[GraphQL Client] Merged: ${mergedProducts.length} unique products`);
console.log(`[GraphQL Client] Mode A: ${modeAResult.products.length}, Mode B: ${modeBResult.products.length}`);
return {
modeA: { products: modeAResult.products, totalCount: modeAResult.totalCount },
modeB: { products: modeBResult.products, totalCount: modeBResult.totalCount },
merged: { products: mergedProducts, totalCount: mergedProducts.length },
};
}
finally {
await closeSession(session);
}
}
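// --- Editor's illustrative sketch (not part of the original commit; never invoked) ---
// The platform ID and cName below are placeholders; in practice they come from
// dispensaries.platform_dispensary_id and the cName extracted from menu_url.
async function exampleTwoModeCrawl() {
    const platformId = '6405ef617056e8014d79101b'; // placeholder MongoDB ObjectId
    const { modeA, modeB, merged } = await fetchAllProductsBothModes(platformId, 'rec', { cName: 'AZ-Deeply-Rooted' });
    console.log(`Mode A: ${modeA.products.length}, Mode B: ${modeB.products.length}, merged: ${merged.products.length}`);
}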

View File

@@ -0,0 +1,414 @@
"use strict";
/**
* Job Queue Service
*
* DB-backed job queue with claiming/locking for distributed workers.
* Ensures only one worker processes a given store at a time.
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.getWorkerId = getWorkerId;
exports.getWorkerHostname = getWorkerHostname;
exports.enqueueJob = enqueueJob;
exports.bulkEnqueueJobs = bulkEnqueueJobs;
exports.claimNextJob = claimNextJob;
exports.updateJobProgress = updateJobProgress;
exports.heartbeat = heartbeat;
exports.completeJob = completeJob;
exports.failJob = failJob;
exports.getQueueStats = getQueueStats;
exports.getActiveWorkers = getActiveWorkers;
exports.getRunningJobs = getRunningJobs;
exports.recoverStaleJobs = recoverStaleJobs;
exports.cleanupOldJobs = cleanupOldJobs;
const connection_1 = require("../db/connection");
const uuid_1 = require("uuid");
const os = __importStar(require("os"));
// ============================================================
// WORKER IDENTITY
// ============================================================
let _workerId = null;
/**
* Get or create a unique worker ID for this process
* In Kubernetes, uses POD_NAME for clarity; otherwise generates a unique ID
*/
function getWorkerId() {
if (!_workerId) {
// Prefer POD_NAME in K8s (set via fieldRef)
const podName = process.env.POD_NAME;
if (podName) {
_workerId = podName;
}
else {
const hostname = os.hostname();
const pid = process.pid;
const uuid = (0, uuid_1.v4)().slice(0, 8);
_workerId = `${hostname}-${pid}-${uuid}`;
}
}
return _workerId;
}
/**
* Get hostname for worker tracking
* In Kubernetes, uses POD_NAME; otherwise uses os.hostname()
*/
function getWorkerHostname() {
return process.env.POD_NAME || os.hostname();
}
// ============================================================
// JOB ENQUEUEING
// ============================================================
/**
* Enqueue a new job for processing
* Returns null if a pending/running job already exists for this dispensary
*/
async function enqueueJob(options) {
const { jobType, dispensaryId, priority = 0, metadata, maxRetries = 3, } = options;
// Check if there's already a pending/running job for this dispensary
if (dispensaryId) {
const { rows: existing } = await (0, connection_1.query)(`SELECT id FROM dispensary_crawl_jobs
WHERE dispensary_id = $1 AND status IN ('pending', 'running')
LIMIT 1`, [dispensaryId]);
if (existing.length > 0) {
console.log(`[JobQueue] Skipping enqueue - job already exists for dispensary ${dispensaryId}`);
return null;
}
}
const { rows } = await (0, connection_1.query)(`INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
VALUES ($1, $2, 'pending', $3, $4, $5, NOW())
RETURNING id`, [jobType, dispensaryId || null, priority, maxRetries, metadata ? JSON.stringify(metadata) : null]);
const jobId = rows[0].id;
console.log(`[JobQueue] Enqueued job ${jobId} (type=${jobType}, dispensary=${dispensaryId})`);
return jobId;
}
/**
* Bulk enqueue jobs for multiple dispensaries
* Skips dispensaries that already have pending/running jobs
*/
async function bulkEnqueueJobs(jobType, dispensaryIds, options = {}) {
const { priority = 0, metadata } = options;
// Get dispensaries that already have pending/running jobs
const { rows: existing } = await (0, connection_1.query)(`SELECT DISTINCT dispensary_id FROM dispensary_crawl_jobs
WHERE dispensary_id = ANY($1) AND status IN ('pending', 'running')`, [dispensaryIds]);
const existingSet = new Set(existing.map((r) => r.dispensary_id));
// Filter out dispensaries with existing jobs
const toEnqueue = dispensaryIds.filter(id => !existingSet.has(id));
if (toEnqueue.length === 0) {
return { enqueued: 0, skipped: dispensaryIds.length };
}
// Bulk insert - each row needs 4 params: job_type, dispensary_id, priority, metadata
const metadataJson = metadata ? JSON.stringify(metadata) : null;
const values = toEnqueue.map((_, i) => {
const offset = i * 4;
return `($${offset + 1}, $${offset + 2}, 'pending', $${offset + 3}, 3, $${offset + 4}, NOW())`;
}).join(', ');
const params = [];
toEnqueue.forEach(dispensaryId => {
params.push(jobType, dispensaryId, priority, metadataJson);
});
await (0, connection_1.query)(`INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
VALUES ${values}`, params);
console.log(`[JobQueue] Bulk enqueued ${toEnqueue.length} jobs, skipped ${existingSet.size}`);
return { enqueued: toEnqueue.length, skipped: existingSet.size };
}
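// --- Editor's illustrative note (not part of the original commit) ---
// For two dispensaries the generated statement looks like:
//   INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
//   VALUES ($1, $2, 'pending', $3, 3, $4, NOW()), ($5, $6, 'pending', $7, 3, $8, NOW())
// with params [jobType, id1, priority, metadataJson, jobType, id2, priority, metadataJson].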
// ============================================================
// JOB CLAIMING (with locking)
// ============================================================
/**
* Claim the next available job from the queue
* Uses SELECT FOR UPDATE SKIP LOCKED to prevent double-claims
*/
async function claimNextJob(options) {
const { workerId, jobTypes, lockDurationMinutes = 30 } = options;
const hostname = getWorkerHostname();
const client = await (0, connection_1.getClient)();
try {
await client.query('BEGIN');
// Build job type filter
let typeFilter = '';
const params = [workerId, hostname, lockDurationMinutes];
let paramIndex = 4;
if (jobTypes && jobTypes.length > 0) {
typeFilter = `AND job_type = ANY($${paramIndex})`;
params.push(jobTypes);
paramIndex++;
}
// Claim the next pending job using FOR UPDATE SKIP LOCKED
// This atomically selects and locks a row, skipping any already locked by other workers
const { rows } = await client.query(`UPDATE dispensary_crawl_jobs
SET
status = 'running',
claimed_by = $1,
claimed_at = NOW(),
worker_id = $1,
worker_hostname = $2,
started_at = NOW(),
locked_until = NOW() + ($3 || ' minutes')::INTERVAL,
last_heartbeat_at = NOW(),
updated_at = NOW()
WHERE id = (
SELECT id FROM dispensary_crawl_jobs
WHERE status = 'pending'
${typeFilter}
ORDER BY priority DESC, created_at ASC
FOR UPDATE SKIP LOCKED
LIMIT 1
)
RETURNING *`, params);
await client.query('COMMIT');
if (rows.length === 0) {
return null;
}
const job = mapDbRowToJob(rows[0]);
console.log(`[JobQueue] Worker ${workerId} claimed job ${job.id} (type=${job.jobType}, dispensary=${job.dispensaryId})`);
return job;
}
catch (error) {
await client.query('ROLLBACK');
throw error;
}
finally {
client.release();
}
}
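// --- Editor's illustrative sketch (not part of the original commit; never invoked) ---
// A minimal worker pass over the claim/heartbeat/complete/fail helpers in this
// module. runCrawl is a placeholder, and the job type name and heartbeat
// interval are assumptions.
async function exampleWorkerClaimOnce(runCrawl) {
    const workerId = getWorkerId();
    const job = await claimNextJob({ workerId, jobTypes: ['product_crawl'] });
    if (!job)
        return; // queue empty - caller decides when to poll again
    const keepAlive = setInterval(() => heartbeat(job.id).catch(() => { }), 60000);
    try {
        const result = await runCrawl(job); // placeholder for the real crawl
        await completeJob(job.id, result || {});
    }
    catch (error) {
        await failJob(job.id, error.message); // re-queues until max_retries is hit
    }
    finally {
        clearInterval(keepAlive);
    }
}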
// ============================================================
// JOB PROGRESS & COMPLETION
// ============================================================
/**
* Update job progress (for live monitoring)
*/
async function updateJobProgress(jobId, progress) {
const updates = ['last_heartbeat_at = NOW()', 'updated_at = NOW()'];
const params = [];
let paramIndex = 1;
if (progress.productsFound !== undefined) {
updates.push(`products_found = $${paramIndex++}`);
params.push(progress.productsFound);
}
if (progress.productsUpserted !== undefined) {
updates.push(`products_upserted = $${paramIndex++}`);
params.push(progress.productsUpserted);
}
if (progress.snapshotsCreated !== undefined) {
updates.push(`snapshots_created = $${paramIndex++}`);
params.push(progress.snapshotsCreated);
}
if (progress.currentPage !== undefined) {
updates.push(`current_page = $${paramIndex++}`);
params.push(progress.currentPage);
}
if (progress.totalPages !== undefined) {
updates.push(`total_pages = $${paramIndex++}`);
params.push(progress.totalPages);
}
params.push(jobId);
await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params);
}
/**
* Send heartbeat to keep job alive (prevents timeout)
*/
async function heartbeat(jobId) {
await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
SET last_heartbeat_at = NOW(), locked_until = NOW() + INTERVAL '30 minutes'
WHERE id = $1 AND status = 'running'`, [jobId]);
}
/**
* Mark job as completed
*/
async function completeJob(jobId, result) {
await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
SET
status = 'completed',
completed_at = NOW(),
products_found = COALESCE($2, products_found),
products_upserted = COALESCE($3, products_upserted),
snapshots_created = COALESCE($4, snapshots_created),
updated_at = NOW()
WHERE id = $1`, [jobId, result.productsFound, result.productsUpserted, result.snapshotsCreated]);
console.log(`[JobQueue] Job ${jobId} completed`);
}
/**
* Mark job as failed
*/
async function failJob(jobId, errorMessage) {
// Check if we should retry
const { rows } = await (0, connection_1.query)(`SELECT retry_count, max_retries FROM dispensary_crawl_jobs WHERE id = $1`, [jobId]);
if (rows.length === 0)
return false;
const { retry_count, max_retries } = rows[0];
if (retry_count < max_retries) {
// Re-queue for retry
await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
SET
status = 'pending',
retry_count = retry_count + 1,
claimed_by = NULL,
claimed_at = NULL,
worker_id = NULL,
worker_hostname = NULL,
started_at = NULL,
locked_until = NULL,
last_heartbeat_at = NULL,
error_message = $2,
updated_at = NOW()
WHERE id = $1`, [jobId, errorMessage]);
console.log(`[JobQueue] Job ${jobId} failed, re-queued for retry (${retry_count + 1}/${max_retries})`);
return true; // Will retry
}
else {
// Mark as failed permanently
await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
SET
status = 'failed',
completed_at = NOW(),
error_message = $2,
updated_at = NOW()
WHERE id = $1`, [jobId, errorMessage]);
console.log(`[JobQueue] Job ${jobId} failed permanently after ${retry_count} retries`);
return false; // No more retries
}
}
// ============================================================
// QUEUE MONITORING
// ============================================================
/**
* Get queue statistics
*/
async function getQueueStats() {
const { rows } = await (0, connection_1.query)(`SELECT * FROM v_queue_stats`);
const stats = rows[0] || {};
return {
pending: parseInt(stats.pending_jobs || '0', 10),
running: parseInt(stats.running_jobs || '0', 10),
completed1h: parseInt(stats.completed_1h || '0', 10),
failed1h: parseInt(stats.failed_1h || '0', 10),
activeWorkers: parseInt(stats.active_workers || '0', 10),
avgDurationSeconds: stats.avg_duration_seconds ? parseFloat(stats.avg_duration_seconds) : null,
};
}
/**
* Get active workers
*/
async function getActiveWorkers() {
const { rows } = await (0, connection_1.query)(`SELECT * FROM v_active_workers`);
return rows.map((row) => ({
workerId: row.worker_id,
hostname: row.worker_hostname,
currentJobs: parseInt(row.current_jobs || '0', 10),
totalProductsFound: parseInt(row.total_products_found || '0', 10),
totalProductsUpserted: parseInt(row.total_products_upserted || '0', 10),
totalSnapshots: parseInt(row.total_snapshots || '0', 10),
firstClaimedAt: new Date(row.first_claimed_at),
lastHeartbeat: row.last_heartbeat ? new Date(row.last_heartbeat) : null,
}));
}
/**
* Get running jobs with worker info
*/
async function getRunningJobs() {
const { rows } = await (0, connection_1.query)(`SELECT cj.*, d.name as dispensary_name, d.city
FROM dispensary_crawl_jobs cj
LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
WHERE cj.status = 'running'
ORDER BY cj.started_at DESC`);
return rows.map(mapDbRowToJob);
}
/**
* Recover stale jobs (workers that died without completing)
*/
async function recoverStaleJobs(staleMinutes = 15) {
const { rowCount } = await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
SET
status = 'pending',
claimed_by = NULL,
claimed_at = NULL,
worker_id = NULL,
worker_hostname = NULL,
started_at = NULL,
locked_until = NULL,
error_message = 'Recovered from stale worker',
retry_count = retry_count + 1,
updated_at = NOW()
WHERE status = 'running'
AND last_heartbeat_at < NOW() - ($1 || ' minutes')::INTERVAL
AND retry_count < max_retries`, [staleMinutes]);
if (rowCount && rowCount > 0) {
console.log(`[JobQueue] Recovered ${rowCount} stale jobs`);
}
return rowCount || 0;
}
/**
* Clean up old completed/failed jobs
*/
async function cleanupOldJobs(olderThanDays = 7) {
const { rowCount } = await (0, connection_1.query)(`DELETE FROM dispensary_crawl_jobs
WHERE status IN ('completed', 'failed')
AND completed_at < NOW() - ($1 || ' days')::INTERVAL`, [olderThanDays]);
if (rowCount && rowCount > 0) {
console.log(`[JobQueue] Cleaned up ${rowCount} old jobs`);
}
return rowCount || 0;
}
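// --- Editor's illustrative sketch (not part of the original commit; never invoked) ---
// A periodic maintenance tick combining the two helpers above; the interval
// and thresholds are assumptions.
async function exampleQueueMaintenanceTick() {
    await recoverStaleJobs(15); // requeue jobs whose worker stopped heartbeating
    await cleanupOldJobs(7); // prune completed/failed jobs older than 7 days
}
// e.g. setInterval(() => exampleQueueMaintenanceTick().catch(console.error), 5 * 60 * 1000);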
// ============================================================
// HELPERS
// ============================================================
function mapDbRowToJob(row) {
return {
id: row.id,
jobType: row.job_type,
dispensaryId: row.dispensary_id,
status: row.status,
priority: row.priority || 0,
retryCount: row.retry_count || 0,
maxRetries: row.max_retries || 3,
claimedBy: row.claimed_by,
claimedAt: row.claimed_at ? new Date(row.claimed_at) : null,
workerHostname: row.worker_hostname,
startedAt: row.started_at ? new Date(row.started_at) : null,
completedAt: row.completed_at ? new Date(row.completed_at) : null,
errorMessage: row.error_message,
productsFound: row.products_found || 0,
productsUpserted: row.products_upserted || 0,
snapshotsCreated: row.snapshots_created || 0,
currentPage: row.current_page || 0,
totalPages: row.total_pages,
lastHeartbeatAt: row.last_heartbeat_at ? new Date(row.last_heartbeat_at) : null,
metadata: row.metadata,
createdAt: new Date(row.created_at),
// Add extra fields from join if present
...(row.dispensary_name && { dispensaryName: row.dispensary_name }),
...(row.city && { city: row.city }),
};
}

View File

@@ -0,0 +1,837 @@
"use strict";
/**
* Menu Detection Service
*
* Detects menu provider (dutchie, treez, jane, etc.) from dispensary menu_url
* and resolves platform_dispensary_id for dutchie stores.
*
* This service:
* 1. Iterates dispensaries with unknown/missing menu_type or platform_dispensary_id
* 2. Detects provider from menu_url patterns
* 3. For dutchie: extracts cName and resolves platform_dispensary_id via GraphQL
* 4. Logs results to job_run_logs
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlWebsiteForMenuLinks = crawlWebsiteForMenuLinks;
exports.detectProviderFromUrl = detectProviderFromUrl;
exports.detectAndResolveDispensary = detectAndResolveDispensary;
exports.runBulkDetection = runBulkDetection;
exports.executeMenuDetectionJob = executeMenuDetectionJob;
exports.getDetectionStats = getDetectionStats;
exports.getDispensariesNeedingDetection = getDispensariesNeedingDetection;
const connection_1 = require("../db/connection");
const discovery_1 = require("./discovery");
const graphql_client_1 = require("./graphql-client");
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
const DISPENSARY_COLUMNS = `
id, name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at
`;
// ============================================================
// PROVIDER DETECTION PATTERNS
// ============================================================
const PROVIDER_URL_PATTERNS = [
// IMPORTANT: Curaleaf and Sol must come BEFORE dutchie to take precedence
// These stores have their own proprietary menu systems (not crawlable via Dutchie)
{
provider: 'curaleaf',
patterns: [
/curaleaf\.com\/stores\//i, // e.g., https://curaleaf.com/stores/curaleaf-az-glendale-east
/curaleaf\.com\/dispensary\//i, // e.g., https://curaleaf.com/dispensary/arizona
],
},
{
provider: 'sol',
patterns: [
/livewithsol\.com/i, // e.g., https://www.livewithsol.com/locations/sun-city/
/solflower\.com/i, // alternate domain if any
],
},
{
provider: 'dutchie',
patterns: [
/dutchie\.com/i,
/\/embedded-menu\//i,
/\/dispensary\/[A-Z]{2}-/i, // e.g., /dispensary/AZ-store-name
/dutchie-plus/i,
],
},
{
provider: 'treez',
patterns: [
/treez\.io/i,
/shop\.treez/i,
/treez-ecommerce/i,
],
},
{
provider: 'jane',
patterns: [
/jane\.co/i,
/iheartjane\.com/i,
/embed\.iheartjane/i,
],
},
{
provider: 'weedmaps',
patterns: [
/weedmaps\.com/i,
/menu\.weedmaps/i,
],
},
{
provider: 'leafly',
patterns: [
/leafly\.com/i,
/order\.leafly/i,
],
},
{
provider: 'meadow',
patterns: [
/getmeadow\.com/i,
/meadow\.co/i,
],
},
{
provider: 'blaze',
patterns: [
/blaze\.me/i,
/blazepos\.com/i,
],
},
{
provider: 'flowhub',
patterns: [
/flowhub\.com/i,
/flowhub\.co/i,
],
},
{
provider: 'dispense',
patterns: [
/dispense\.io/i,
/dispenseapp\.com/i,
],
},
];
/**
* Link patterns that suggest a menu or ordering page
*/
const MENU_LINK_PATTERNS = [
/\/menu/i,
/\/order/i,
/\/shop/i,
/\/products/i,
/\/dispensary/i,
/\/store/i,
/curaleaf\.com/i,
/dutchie\.com/i,
/treez\.io/i,
/jane\.co/i,
/iheartjane\.com/i,
/weedmaps\.com/i,
/leafly\.com/i,
/getmeadow\.com/i,
/blaze\.me/i,
/flowhub\.com/i,
/dispense\.io/i,
];
/**
* Check if a URL is a Curaleaf store URL
*/
function isCuraleafUrl(url) {
if (!url)
return false;
return /curaleaf\.com\/(stores|dispensary)\//i.test(url);
}
/**
 * Return the Curaleaf store URL unchanged when the input is already a
 * curaleaf.com /stores/ or /dispensary/ URL; otherwise return null
*/
function extractCuraleafStoreUrl(url) {
if (!url)
return null;
// If it's already a Curaleaf stores/dispensary URL, use it
if (isCuraleafUrl(url)) {
return url;
}
return null;
}
/**
* Fetch a page and extract all links
*/
async function fetchPageLinks(url, timeout = 10000) {
try {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), timeout);
const response = await fetch(url, {
signal: controller.signal,
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
},
redirect: 'follow',
});
clearTimeout(timeoutId);
if (!response.ok) {
return { links: [], error: `HTTP ${response.status}` };
}
const html = await response.text();
// Extract all href attributes from anchor tags
const linkRegex = /href=["']([^"']+)["']/gi;
const links = [];
let match;
while ((match = linkRegex.exec(html)) !== null) {
const href = match[1];
// Convert relative URLs to absolute
try {
const absoluteUrl = new URL(href, url).href;
links.push(absoluteUrl);
}
catch {
// Skip invalid URLs
}
}
        // Also scan src attributes (embedded menus often live in iframes; the regex matches any src, but only provider-matching URLs are kept below)
const iframeRegex = /src=["']([^"']+)["']/gi;
while ((match = iframeRegex.exec(html)) !== null) {
const src = match[1];
try {
const absoluteUrl = new URL(src, url).href;
// Only add if it matches a provider pattern
for (const { patterns } of PROVIDER_URL_PATTERNS) {
if (patterns.some(p => p.test(absoluteUrl))) {
links.push(absoluteUrl);
break;
}
}
}
catch {
// Skip invalid URLs
}
}
return { links: [...new Set(links)] }; // Deduplicate
}
catch (error) {
if (error.name === 'AbortError') {
return { links: [], error: 'Timeout' };
}
return { links: [], error: error.message };
}
}
/**
* Crawl a dispensary's website to find menu provider links
*
* Strategy:
* 1. Fetch the homepage and extract all links
* 2. Look for links that match known provider patterns (dutchie, treez, etc.)
* 3. If no direct match, look for menu/order/shop links and follow them (1-2 hops)
* 4. Check followed pages for provider patterns
*/
async function crawlWebsiteForMenuLinks(websiteUrl) {
console.log(`[WebsiteCrawl] Crawling ${websiteUrl} for menu links...`);
const result = {
menuUrl: null,
provider: 'unknown',
foundLinks: [],
crawledPages: [],
};
// Normalize URL
let baseUrl;
try {
baseUrl = new URL(websiteUrl);
if (!baseUrl.protocol.startsWith('http')) {
baseUrl = new URL(`https://${websiteUrl}`);
}
}
catch {
result.error = 'Invalid website URL';
return result;
}
// Step 1: Fetch the homepage
const homepage = baseUrl.href;
result.crawledPages.push(homepage);
const { links: homepageLinks, error: homepageError } = await fetchPageLinks(homepage);
if (homepageError) {
result.error = `Failed to fetch homepage: ${homepageError}`;
return result;
}
result.foundLinks = homepageLinks;
// Step 2: Check for direct provider matches in homepage links
for (const link of homepageLinks) {
for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
if (patterns.some(p => p.test(link))) {
console.log(`[WebsiteCrawl] Found ${provider} link on homepage: ${link}`);
result.menuUrl = link;
result.provider = provider;
return result;
}
}
}
// Step 3: Find menu/order/shop links to follow
const menuLinks = homepageLinks.filter(link => {
// Must be same domain or a known provider domain
try {
const linkUrl = new URL(link);
const isSameDomain = linkUrl.hostname === baseUrl.hostname ||
linkUrl.hostname.endsWith(`.${baseUrl.hostname}`);
const isProviderDomain = PROVIDER_URL_PATTERNS.some(({ patterns }) => patterns.some(p => p.test(link)));
const isMenuPath = MENU_LINK_PATTERNS.some(p => p.test(link));
return (isSameDomain && isMenuPath) || isProviderDomain;
}
catch {
return false;
}
});
console.log(`[WebsiteCrawl] Found ${menuLinks.length} potential menu links to follow`);
// Step 4: Follow menu links (limit to 3 to avoid excessive crawling)
for (const menuLink of menuLinks.slice(0, 3)) {
// Skip if we've already crawled this page
if (result.crawledPages.includes(menuLink))
continue;
// Check if this link itself is a provider URL
for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
if (patterns.some(p => p.test(menuLink))) {
console.log(`[WebsiteCrawl] Menu link is a ${provider} URL: ${menuLink}`);
result.menuUrl = menuLink;
result.provider = provider;
return result;
}
}
result.crawledPages.push(menuLink);
// Rate limit
await new Promise(r => setTimeout(r, 500));
const { links: pageLinks, error: pageError } = await fetchPageLinks(menuLink);
if (pageError) {
console.log(`[WebsiteCrawl] Failed to fetch ${menuLink}: ${pageError}`);
continue;
}
result.foundLinks.push(...pageLinks);
// Check for provider matches on this page
for (const link of pageLinks) {
for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
if (patterns.some(p => p.test(link))) {
console.log(`[WebsiteCrawl] Found ${provider} link on ${menuLink}: ${link}`);
result.menuUrl = link;
result.provider = provider;
return result;
}
}
}
}
console.log(`[WebsiteCrawl] No menu provider found on ${websiteUrl}`);
return result;
}
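// Illustrative result sketch (not part of the compiled output; the URLs below
// are hypothetical): for a site that embeds a Dutchie menu, the crawl would
// typically resolve like this:
//
//   const found = await crawlWebsiteForMenuLinks('https://example-dispensary.com');
//   // found.provider === 'dutchie'
//   // found.menuUrl  === 'https://dutchie.com/embedded-menu/AZ-Example-Store'
//   // found.crawledPages lists the homepage plus any followed menu/order/shop links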
// ============================================================
// CORE DETECTION FUNCTIONS
// ============================================================
/**
* Detect menu provider from a URL
*/
function detectProviderFromUrl(menuUrl) {
if (!menuUrl)
return 'unknown';
for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
for (const pattern of patterns) {
if (pattern.test(menuUrl)) {
return provider;
}
}
}
// Check if it's a custom website (has a domain but doesn't match known providers)
try {
const url = new URL(menuUrl);
if (url.hostname && !url.hostname.includes('localhost')) {
return 'custom';
}
}
catch {
// Invalid URL
}
return 'unknown';
}
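// Illustrative sketch (not part of the compiled output; example URLs are
// hypothetical): how the patterns above resolve a few representative inputs:
//
//   detectProviderFromUrl('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted');     // 'dutchie'
//   detectProviderFromUrl('https://curaleaf.com/stores/curaleaf-az-glendale-east');  // 'curaleaf'
//   detectProviderFromUrl('https://www.iheartjane.com/embed/stores/1234');           // 'jane'
//   detectProviderFromUrl('https://my-dispensary-site.com/menu');                    // 'custom'
//   detectProviderFromUrl('');                                                       // 'unknown'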
/**
* Detect provider and resolve platform ID for a single dispensary
*/
async function detectAndResolveDispensary(dispensaryId) {
console.log(`[MenuDetection] Processing dispensary ${dispensaryId}...`);
// Get dispensary record
const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [dispensaryId]);
if (rows.length === 0) {
return {
dispensaryId,
dispensaryName: 'Unknown',
previousMenuType: null,
detectedProvider: 'unknown',
cName: null,
platformDispensaryId: null,
success: false,
error: 'Dispensary not found',
};
}
const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]);
let menuUrl = dispensary.menuUrl;
const previousMenuType = dispensary.menuType || null;
const website = dispensary.website;
// ============================================================
// CURALEAF CHECK: If website is Curaleaf, override any stale Dutchie menu_url
// This prevents 60s Dutchie timeouts for stores that have migrated to Curaleaf's platform
// ============================================================
if (isCuraleafUrl(website)) {
console.log(`[MenuDetection] ${dispensary.name}: Website is Curaleaf - marking as curaleaf provider`);
// Use the Curaleaf website URL as the menu_url (clearing stale Dutchie URL if any)
// At this point we know website is defined since isCuraleafUrl returned true
const curaleafUrl = extractCuraleafStoreUrl(website) || website;
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'curaleaf',
menu_url = $1,
platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'curaleaf'::text,
'detection_method', 'website_pattern'::text,
'detected_at', NOW(),
'curaleaf_store_url', $1::text,
'stale_dutchie_url', $2::text,
'not_crawlable', true,
'not_crawlable_reason', 'Curaleaf proprietary menu - no Dutchie integration'::text
),
updated_at = NOW()
WHERE id = $3
`, [curaleafUrl, menuUrl || null, dispensaryId]);
return {
dispensaryId,
dispensaryName: dispensary.name,
previousMenuType,
detectedProvider: 'curaleaf',
cName: null,
platformDispensaryId: null,
success: true,
error: undefined,
};
}
// If menu_url is null or empty, try to discover it by crawling the dispensary website
if (!menuUrl || menuUrl.trim() === '') {
console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`);
// Check if website is available
if (!website || website.trim() === '') {
console.log(`[MenuDetection] ${dispensary.name}: No website available - marking as not crawlable`);
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'unknown',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'unknown'::text,
'detection_method', 'no_data'::text,
'detected_at', NOW(),
'resolution_error', 'No menu_url and no website available'::text,
'not_crawlable', true,
'website_crawl_attempted', false
),
updated_at = NOW()
WHERE id = $1
`, [dispensaryId]);
return {
dispensaryId,
dispensaryName: dispensary.name,
previousMenuType,
detectedProvider: 'unknown',
cName: null,
platformDispensaryId: null,
success: true,
error: 'No menu_url and no website available - marked as not crawlable',
};
}
// Crawl the website to find menu provider links
console.log(`[MenuDetection] ${dispensary.name}: Crawling website ${website} for menu links...`);
const crawlResult = await crawlWebsiteForMenuLinks(website);
if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
// SUCCESS: Found a menu URL from website crawl!
console.log(`[MenuDetection] ${dispensary.name}: Found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
menuUrl = crawlResult.menuUrl;
// Update the dispensary with the discovered menu_url
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_url = $1,
menu_type = $2,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $2::text,
'detection_method', 'website_crawl'::text,
'detected_at', NOW(),
'website_crawled', $3::text,
'website_crawl_pages', $4::jsonb,
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $5
`, [
crawlResult.menuUrl,
crawlResult.provider,
website,
JSON.stringify(crawlResult.crawledPages),
dispensaryId
]);
// Continue with full detection flow using the discovered menu_url
}
else {
// Website crawl failed to find a menu provider
const errorReason = crawlResult.error || 'No menu provider links found on website';
console.log(`[MenuDetection] ${dispensary.name}: Website crawl failed - ${errorReason}`);
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'unknown',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'unknown'::text,
'detection_method', 'website_crawl'::text,
'detected_at', NOW(),
'website_crawled', $1::text,
'website_crawl_pages', $2::jsonb,
'resolution_error', $3::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $4
`, [
website,
JSON.stringify(crawlResult.crawledPages),
errorReason,
dispensaryId
]);
return {
dispensaryId,
dispensaryName: dispensary.name,
previousMenuType,
detectedProvider: 'unknown',
cName: null,
platformDispensaryId: null,
success: true,
error: `Website crawl failed: ${errorReason}`,
};
}
}
// Detect provider from URL
const detectedProvider = detectProviderFromUrl(menuUrl);
console.log(`[MenuDetection] ${dispensary.name}: Detected provider = ${detectedProvider} from URL: ${menuUrl}`);
// Initialize result
const result = {
dispensaryId,
dispensaryName: dispensary.name,
previousMenuType,
detectedProvider,
cName: null,
platformDispensaryId: null,
success: false,
};
// If not dutchie, just update menu_type and return
if (detectedProvider !== 'dutchie') {
// Special handling for proprietary providers - mark as not_crawlable until we have crawlers
const PROPRIETARY_PROVIDERS = ['curaleaf', 'sol'];
const isProprietaryProvider = PROPRIETARY_PROVIDERS.includes(detectedProvider);
const notCrawlableReason = isProprietaryProvider
? `${detectedProvider} proprietary menu - no crawler available`
: null;
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = $1,
platform_dispensary_id = CASE WHEN $3 THEN NULL ELSE platform_dispensary_id END,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', $1::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'not_crawlable', $3,
'not_crawlable_reason', $4::text
),
updated_at = NOW()
WHERE id = $2
`, [detectedProvider, dispensaryId, isProprietaryProvider, notCrawlableReason]);
result.success = true;
console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}${isProprietaryProvider ? ' (not crawlable)' : ''}`);
return result;
}
// For dutchie: extract cName and resolve platform ID
const cName = (0, discovery_1.extractCNameFromMenuUrl)(menuUrl);
result.cName = cName;
if (!cName) {
result.error = `Could not extract cName from menu_url: ${menuUrl}`;
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'dutchie',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'resolution_error', $1::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $2
`, [result.error, dispensaryId]);
console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
return result;
}
// Resolve platform_dispensary_id from cName
console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`);
try {
const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName);
if (platformId) {
result.platformDispensaryId = platformId;
result.success = true;
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'dutchie',
platform_dispensary_id = $1,
platform_dispensary_id_resolved_at = NOW(),
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'cname_extracted', $2::text,
'platform_id_resolved', true,
'resolution_error', NULL::text,
'not_crawlable', false
),
updated_at = NOW()
WHERE id = $3
`, [platformId, cName, dispensaryId]);
console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
}
else {
result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`;
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'dutchie',
platform_dispensary_id = NULL,
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'cname_extracted', $1::text,
'platform_id_resolved', false,
'resolution_error', $2::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $3
`, [cName, result.error, dispensaryId]);
console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
}
}
catch (error) {
result.error = `Resolution failed: ${error.message}`;
await (0, connection_1.query)(`
UPDATE dispensaries SET
menu_type = 'dutchie',
provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
jsonb_build_object(
'detected_provider', 'dutchie'::text,
'detection_method', 'url_pattern'::text,
'detected_at', NOW(),
'cname_extracted', $1::text,
'platform_id_resolved', false,
'resolution_error', $2::text,
'not_crawlable', true
),
updated_at = NOW()
WHERE id = $3
`, [cName, result.error, dispensaryId]);
console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
}
return result;
}
/**
* Run bulk detection on all dispensaries with unknown/missing menu_type or platform_dispensary_id
* Also includes dispensaries with no menu_url but with a website (for website crawl discovery)
*/
async function runBulkDetection(options = {}) {
const { state, onlyUnknown = true, onlyMissingPlatformId = false, includeWebsiteCrawl = true, limit } = options;
console.log('[MenuDetection] Starting bulk detection...');
// Build query to find dispensaries needing detection
// Now includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
let whereClause = `WHERE (
menu_url IS NOT NULL
${includeWebsiteCrawl ? `OR (
menu_url IS NULL
AND website IS NOT NULL
AND website != ''
AND (provider_detection_data IS NULL OR NOT (provider_detection_data->>'not_crawlable')::boolean)
)` : ''}
)`;
const params = [];
let paramIndex = 1;
if (state) {
whereClause += ` AND state = $${paramIndex++}`;
params.push(state);
}
if (onlyUnknown) {
whereClause += ` AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')`;
}
if (onlyMissingPlatformId) {
whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
}
let query_str = `
SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
${whereClause}
ORDER BY name
`;
if (limit) {
query_str += ` LIMIT $${paramIndex}`;
params.push(limit);
}
const { rows: dispensaries } = await (0, connection_1.query)(query_str, params);
console.log(`[MenuDetection] Found ${dispensaries.length} dispensaries to process (includeWebsiteCrawl=${includeWebsiteCrawl})`);
const result = {
totalProcessed: 0,
totalSucceeded: 0,
totalFailed: 0,
totalSkipped: 0,
results: [],
errors: [],
};
for (const row of dispensaries) {
result.totalProcessed++;
try {
const detectionResult = await detectAndResolveDispensary(row.id);
result.results.push(detectionResult);
if (detectionResult.success) {
result.totalSucceeded++;
}
else {
result.totalFailed++;
if (detectionResult.error) {
result.errors.push(`${detectionResult.dispensaryName}: ${detectionResult.error}`);
}
}
// Rate limit between requests
await new Promise(r => setTimeout(r, 1000));
}
catch (error) {
result.totalFailed++;
result.errors.push(`${row.name || row.id}: ${error.message}`);
}
}
console.log(`[MenuDetection] Bulk detection complete: ${result.totalSucceeded} succeeded, ${result.totalFailed} failed`);
return result;
}
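// Illustrative usage sketch (not part of the compiled output; the limit of 25
// is an arbitrary example value): a one-off pass over AZ dispensaries that
// still lack a menu_type:
//
//   const summary = await runBulkDetection({ state: 'AZ', onlyUnknown: true, limit: 25 });
//   console.log(`${summary.totalSucceeded}/${summary.totalProcessed} detected`, summary.errors.slice(0, 3));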
// ============================================================
// SCHEDULED JOB EXECUTOR
// ============================================================
/**
* Execute the menu detection job (called by scheduler)
*/
async function executeMenuDetectionJob(config = {}) {
const state = config.state || 'AZ';
const onlyUnknown = config.onlyUnknown !== false;
const onlyMissingPlatformId = config.onlyMissingPlatformId || false;
console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);
try {
const result = await runBulkDetection({
state,
onlyUnknown,
onlyMissingPlatformId,
});
const status = result.totalFailed === 0 ? 'success' :
result.totalSucceeded === 0 ? 'error' : 'partial';
return {
status,
itemsProcessed: result.totalProcessed,
itemsSucceeded: result.totalSucceeded,
itemsFailed: result.totalFailed,
errorMessage: result.errors.length > 0 ? result.errors.slice(0, 5).join('; ') : undefined,
metadata: {
state,
onlyUnknown,
onlyMissingPlatformId,
providerCounts: countByProvider(result.results),
},
};
}
catch (error) {
return {
status: 'error',
itemsProcessed: 0,
itemsSucceeded: 0,
itemsFailed: 0,
errorMessage: error.message,
};
}
}
/**
* Count results by detected provider
*/
function countByProvider(results) {
const counts = {};
for (const r of results) {
counts[r.detectedProvider] = (counts[r.detectedProvider] || 0) + 1;
}
return counts;
}
// ============================================================
// UTILITY FUNCTIONS
// ============================================================
/**
* Get detection stats for dashboard
*/
async function getDetectionStats() {
const { rows } = await (0, connection_1.query)(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != '' AND menu_type != 'unknown') as with_menu_type,
COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
COUNT(*) FILTER (WHERE menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')) as needs_detection
FROM dispensaries
WHERE state = 'AZ'
`);
const stats = rows[0] || {};
// Get provider breakdown
const { rows: providerRows } = await (0, connection_1.query)(`
SELECT menu_type, COUNT(*) as count
FROM dispensaries
WHERE state = 'AZ' AND menu_type IS NOT NULL AND menu_type != ''
GROUP BY menu_type
ORDER BY count DESC
`);
const byProvider = {};
for (const row of providerRows) {
byProvider[row.menu_type] = parseInt(row.count, 10);
}
return {
totalDispensaries: parseInt(stats.total || '0', 10),
withMenuType: parseInt(stats.with_menu_type || '0', 10),
withPlatformId: parseInt(stats.with_platform_id || '0', 10),
needsDetection: parseInt(stats.needs_detection || '0', 10),
byProvider,
};
}
/**
* Get dispensaries needing detection
* Includes dispensaries with website but no menu_url for website crawl discovery
*/
async function getDispensariesNeedingDetection(options = {}) {
const { state = 'AZ', limit = 100, includeWebsiteCrawl = true } = options;
const { rows } = await (0, connection_1.query)(`
SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
WHERE state = $1
AND (
(menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown'
OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)))
${includeWebsiteCrawl ? `OR (
menu_url IS NULL
AND website IS NOT NULL
AND website != ''
AND (provider_detection_data IS NULL OR NOT (provider_detection_data->>'not_crawlable')::boolean)
)` : ''}
)
ORDER BY name
LIMIT $2
`, [state, limit]);
return rows.map(discovery_1.mapDbRowToDispensary);
}

View File

@@ -0,0 +1,843 @@
"use strict";
/**
* Dutchie AZ Product Crawler Service
*
* Crawls products from Dutchie dispensaries and stores them in the dutchie_az database.
* Handles normalization from GraphQL response to database entities.
*
* IMPORTANT: Uses chunked batch processing per CLAUDE.md Rule #15 to avoid OOM.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeProduct = normalizeProduct;
exports.normalizeSnapshot = normalizeSnapshot;
exports.crawlDispensaryProducts = crawlDispensaryProducts;
exports.crawlAllArizonaDispensaries = crawlAllArizonaDispensaries;
const connection_1 = require("../db/connection");
const graphql_client_1 = require("./graphql-client");
const discovery_1 = require("./discovery");
const types_1 = require("../types");
const image_storage_1 = require("../../utils/image-storage");
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
const DISPENSARY_COLUMNS = `
id, name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at
`;
// ============================================================
// BATCH PROCESSING CONFIGURATION
// ============================================================
/** Chunk size for batch DB writes (per CLAUDE.md Rule #15) */
const BATCH_CHUNK_SIZE = 100;
// ============================================================
// NORMALIZATION FUNCTIONS
// ============================================================
/**
* Convert price to cents
*/
function toCents(price) {
if (price === undefined || price === null)
return undefined;
return Math.round(price * 100);
}
/**
* Get min value from array of numbers
*/
function getMin(arr) {
if (!arr || arr.length === 0)
return undefined;
return Math.min(...arr.filter((n) => n !== null && n !== undefined));
}
/**
* Get max value from array of numbers
*/
function getMax(arr) {
if (!arr || arr.length === 0)
return undefined;
return Math.max(...arr.filter((n) => n !== null && n !== undefined));
}
/**
* Normalize a value to boolean
* Handles Dutchie API returning {} or [] or other non-boolean values
* that would cause "invalid input syntax for type boolean" errors
*/
function normBool(v, defaultVal = false) {
if (v === true)
return true;
if (v === false)
return false;
    // Log unexpected object/array values for debugging
if (v !== null && v !== undefined && typeof v === 'object') {
console.warn(`[normBool] Unexpected object value, coercing to ${defaultVal}:`, JSON.stringify(v));
}
return defaultVal;
}
/**
* Normalize a value to Date or undefined
* Handles Dutchie API returning {} or [] or other non-date values
* that would cause "invalid input syntax for type timestamp" errors
*/
function normDate(v) {
if (!v)
return undefined;
// Reject objects/arrays that aren't dates
if (typeof v === 'object' && !(v instanceof Date)) {
console.warn(`[normDate] Unexpected object value, ignoring:`, JSON.stringify(v));
return undefined;
}
// Try parsing
const d = new Date(v);
if (isNaN(d.getTime())) {
console.warn(`[normDate] Invalid date value, ignoring:`, v);
return undefined;
}
return d;
}
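// Illustrative sketch (not part of the compiled output): how the normalizers
// above coerce the odd shapes the Dutchie API can return:
//
//   normBool(true);                    // true
//   normBool({}, false);               // false (warned as unexpected object)
//   normDate('2024-01-05T00:00:00Z');  // Date instance
//   normDate([]);                      // undefined (warned as unexpected object)
//   normDate('not-a-date');            // undefined (warned as invalid date)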
/**
* Extract cName (Dutchie slug) from menuUrl or dispensary slug
* Handles URL formats:
* - https://dutchie.com/embedded-menu/AZ-Deeply-Rooted -> AZ-Deeply-Rooted
* - https://dutchie.com/dispensary/sol-flower-dispensary-mcclintock -> sol-flower-dispensary-mcclintock
* Falls back to dispensary.slug if menuUrl extraction fails
*/
function extractCName(dispensary) {
if (dispensary.menuUrl) {
try {
const url = new URL(dispensary.menuUrl);
// Extract last path segment: /embedded-menu/X or /dispensary/X
const segments = url.pathname.split('/').filter(Boolean);
if (segments.length >= 2) {
const cName = segments[segments.length - 1];
if (cName) {
console.log(`[ProductCrawler] Extracted cName "${cName}" from menuUrl`);
return cName;
}
}
}
catch (e) {
console.warn(`[ProductCrawler] Failed to parse menuUrl: ${dispensary.menuUrl}`);
}
}
// Fallback to slug
console.log(`[ProductCrawler] Using dispensary slug "${dispensary.slug}" as cName`);
return dispensary.slug;
}
/**
* Normalize a POSMetaData.children entry to DutchieProductOptionSnapshot
*/
function normalizeOption(child) {
return {
optionId: child.canonicalID || child.canonicalPackageId || child.canonicalSKU || child.option || 'unknown',
canonicalId: child.canonicalID,
canonicalPackageId: child.canonicalPackageId,
canonicalSKU: child.canonicalSKU,
canonicalName: child.canonicalName,
canonicalCategory: child.canonicalCategory,
canonicalCategoryId: child.canonicalCategoryId,
canonicalBrandId: child.canonicalBrandId,
canonicalBrandName: child.canonicalBrandName,
canonicalStrainId: child.canonicalStrainId,
canonicalVendorId: child.canonicalVendorId,
optionLabel: child.option,
packageQuantity: child.packageQuantity,
recEquivalent: child.recEquivalent,
standardEquivalent: child.standardEquivalent,
priceCents: toCents(child.price),
recPriceCents: toCents(child.recPrice),
medPriceCents: toCents(child.medPrice),
quantity: child.quantity,
quantityAvailable: child.quantityAvailable,
kioskQuantityAvailable: child.kioskQuantityAvailable,
activeBatchTags: child.activeBatchTags,
canonicalImgUrl: child.canonicalImgUrl,
canonicalLabResultUrl: child.canonicalLabResultUrl,
canonicalEffectivePotencyMg: child.canonicalEffectivePotencyMg,
rawChildPayload: child,
};
}
/**
* Normalize a raw Dutchie product to DutchieProduct (canonical identity)
*/
function normalizeProduct(raw, dispensaryId, platformDispensaryId) {
return {
dispensaryId,
platform: 'dutchie',
externalProductId: raw._id || raw.id || '',
platformDispensaryId,
cName: raw.cName,
name: raw.Name,
// Brand
brandName: raw.brandName || raw.brand?.name,
brandId: raw.brandId || raw.brand?.id,
brandLogoUrl: raw.brandLogo || raw.brand?.imageUrl,
// Classification
type: raw.type,
subcategory: raw.subcategory,
strainType: raw.strainType,
provider: raw.provider,
// Potency
thc: raw.THC,
thcContent: raw.THCContent?.range?.[0],
cbd: raw.CBD,
cbdContent: raw.CBDContent?.range?.[0],
cannabinoidsV2: raw.cannabinoidsV2,
effects: raw.effects,
// Status / flags
status: raw.Status,
medicalOnly: normBool(raw.medicalOnly, false),
recOnly: normBool(raw.recOnly, false),
featured: normBool(raw.featured, false),
comingSoon: normBool(raw.comingSoon, false),
certificateOfAnalysisEnabled: normBool(raw.certificateOfAnalysisEnabled, false),
isBelowThreshold: normBool(raw.isBelowThreshold, false),
isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false),
optionsBelowThreshold: normBool(raw.optionsBelowThreshold, false),
optionsBelowKioskThreshold: normBool(raw.optionsBelowKioskThreshold, false),
// Derived stock status
stockStatus: (0, types_1.deriveStockStatus)(raw),
totalQuantityAvailable: (0, types_1.calculateTotalQuantity)(raw),
// Images
primaryImageUrl: raw.Image || raw.images?.[0]?.url,
images: raw.images,
// Misc
measurements: raw.measurements,
weight: typeof raw.weight === 'number' ? String(raw.weight) : raw.weight,
pastCNames: raw.pastCNames,
createdAtDutchie: normDate(raw.createdAt),
updatedAtDutchie: normDate(raw.updatedAt),
latestRawPayload: raw,
};
}
/**
* Normalize a raw Dutchie product to DutchieProductSnapshot (time-series data)
*/
function normalizeSnapshot(raw, dutchieProductId, dispensaryId, platformDispensaryId, pricingType, crawlMode = 'mode_a') {
const children = raw.POSMetaData?.children || [];
const options = children.map(normalizeOption);
// Aggregate prices from various sources
const recPrices = raw.recPrices || [];
const medPrices = raw.medicalPrices || [];
const recSpecialPrices = raw.recSpecialPrices || [];
const medSpecialPrices = raw.medicalSpecialPrices || [];
const wholesalePrices = raw.wholesalePrices || [];
// Also consider child prices
const childRecPrices = children.map((c) => c.recPrice).filter((p) => p !== undefined);
const childMedPrices = children.map((c) => c.medPrice).filter((p) => p !== undefined);
const childPrices = children.map((c) => c.price).filter((p) => p !== undefined);
// Aggregate inventory - use calculateTotalQuantity for proper null handling
const totalQty = (0, types_1.calculateTotalQuantity)(raw);
const hasAnyKioskQty = children.some(c => typeof c.kioskQuantityAvailable === 'number');
const totalKioskQty = hasAnyKioskQty
? children.reduce((sum, c) => sum + (c.kioskQuantityAvailable || 0), 0)
: null;
// Determine if on special
const isOnSpecial = raw.special === true ||
(raw.specialData?.saleSpecials && raw.specialData.saleSpecials.length > 0) ||
(recSpecialPrices.length > 0 && recSpecialPrices[0] !== null) ||
(medSpecialPrices.length > 0 && medSpecialPrices[0] !== null);
return {
dutchieProductId,
dispensaryId,
platformDispensaryId,
externalProductId: raw._id || raw.id || '',
pricingType,
crawlMode,
status: raw.Status,
featured: normBool(raw.featured, false),
special: normBool(isOnSpecial, false),
medicalOnly: normBool(raw.medicalOnly, false),
recOnly: normBool(raw.recOnly, false),
// Product was present in feed
isPresentInFeed: true,
// Derived stock status
stockStatus: (0, types_1.deriveStockStatus)(raw),
// Price summary
recMinPriceCents: toCents(getMin([...recPrices, ...childRecPrices, ...childPrices])),
recMaxPriceCents: toCents(getMax([...recPrices, ...childRecPrices, ...childPrices])),
recMinSpecialPriceCents: toCents(getMin(recSpecialPrices)),
medMinPriceCents: toCents(getMin([...medPrices, ...childMedPrices])),
medMaxPriceCents: toCents(getMax([...medPrices, ...childMedPrices])),
medMinSpecialPriceCents: toCents(getMin(medSpecialPrices)),
wholesaleMinPriceCents: toCents(getMin(wholesalePrices)),
// Inventory summary - null = unknown, 0 = all OOS
totalQuantityAvailable: totalQty,
totalKioskQuantityAvailable: totalKioskQty,
manualInventory: normBool(raw.manualInventory, false),
isBelowThreshold: normBool(raw.isBelowThreshold, false),
isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false),
options,
rawPayload: raw,
crawledAt: new Date(),
};
}
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
* Upsert a DutchieProduct record
*/
async function upsertProduct(product) {
const result = await (0, connection_1.query)(`
INSERT INTO dutchie_products (
dispensary_id, platform, external_product_id, platform_dispensary_id,
c_name, name, brand_name, brand_id, brand_logo_url,
type, subcategory, strain_type, provider,
thc, thc_content, cbd, cbd_content, cannabinoids_v2, effects,
status, medical_only, rec_only, featured, coming_soon, certificate_of_analysis_enabled,
is_below_threshold, is_below_kiosk_threshold, options_below_threshold, options_below_kiosk_threshold,
stock_status, total_quantity_available,
primary_image_url, images, measurements, weight, past_c_names,
created_at_dutchie, updated_at_dutchie, latest_raw_payload, updated_at
) VALUES (
$1, $2, $3, $4,
$5, $6, $7, $8, $9,
$10, $11, $12, $13,
$14, $15, $16, $17, $18, $19,
$20, $21, $22, $23, $24, $25,
$26, $27, $28, $29,
$30, $31,
$32, $33, $34, $35, $36,
$37, $38, $39, NOW()
)
ON CONFLICT (dispensary_id, external_product_id) DO UPDATE SET
c_name = EXCLUDED.c_name,
name = EXCLUDED.name,
brand_name = EXCLUDED.brand_name,
brand_id = EXCLUDED.brand_id,
brand_logo_url = EXCLUDED.brand_logo_url,
type = EXCLUDED.type,
subcategory = EXCLUDED.subcategory,
strain_type = EXCLUDED.strain_type,
provider = EXCLUDED.provider,
thc = EXCLUDED.thc,
thc_content = EXCLUDED.thc_content,
cbd = EXCLUDED.cbd,
cbd_content = EXCLUDED.cbd_content,
cannabinoids_v2 = EXCLUDED.cannabinoids_v2,
effects = EXCLUDED.effects,
status = EXCLUDED.status,
medical_only = EXCLUDED.medical_only,
rec_only = EXCLUDED.rec_only,
featured = EXCLUDED.featured,
coming_soon = EXCLUDED.coming_soon,
certificate_of_analysis_enabled = EXCLUDED.certificate_of_analysis_enabled,
is_below_threshold = EXCLUDED.is_below_threshold,
is_below_kiosk_threshold = EXCLUDED.is_below_kiosk_threshold,
options_below_threshold = EXCLUDED.options_below_threshold,
options_below_kiosk_threshold = EXCLUDED.options_below_kiosk_threshold,
stock_status = EXCLUDED.stock_status,
total_quantity_available = EXCLUDED.total_quantity_available,
primary_image_url = EXCLUDED.primary_image_url,
images = EXCLUDED.images,
measurements = EXCLUDED.measurements,
weight = EXCLUDED.weight,
past_c_names = EXCLUDED.past_c_names,
created_at_dutchie = EXCLUDED.created_at_dutchie,
updated_at_dutchie = EXCLUDED.updated_at_dutchie,
latest_raw_payload = EXCLUDED.latest_raw_payload,
updated_at = NOW()
RETURNING id
`, [
product.dispensaryId,
product.platform,
product.externalProductId,
product.platformDispensaryId,
product.cName,
product.name,
product.brandName,
product.brandId,
product.brandLogoUrl,
product.type,
product.subcategory,
product.strainType,
product.provider,
product.thc,
product.thcContent,
product.cbd,
product.cbdContent,
product.cannabinoidsV2 ? JSON.stringify(product.cannabinoidsV2) : null,
product.effects ? JSON.stringify(product.effects) : null,
product.status,
product.medicalOnly,
product.recOnly,
product.featured,
product.comingSoon,
product.certificateOfAnalysisEnabled,
product.isBelowThreshold,
product.isBelowKioskThreshold,
product.optionsBelowThreshold,
product.optionsBelowKioskThreshold,
product.stockStatus,
product.totalQuantityAvailable,
product.primaryImageUrl,
product.images ? JSON.stringify(product.images) : null,
product.measurements ? JSON.stringify(product.measurements) : null,
product.weight,
product.pastCNames,
product.createdAtDutchie,
product.updatedAtDutchie,
product.latestRawPayload ? JSON.stringify(product.latestRawPayload) : null,
]);
return result.rows[0].id;
}
/**
* Download product image and update local image URLs
* Skips download if local image already exists for this product+URL combo
*/
async function downloadAndUpdateProductImage(productId, dispensaryId, externalProductId, primaryImageUrl) {
if (!primaryImageUrl) {
return { downloaded: false, error: 'No image URL' };
}
try {
// Check if we already have this image locally
const exists = await (0, image_storage_1.imageExists)(dispensaryId, externalProductId, primaryImageUrl);
if (exists) {
return { downloaded: false };
}
// Download and process the image
const result = await (0, image_storage_1.downloadProductImage)(primaryImageUrl, dispensaryId, externalProductId);
if (!result.success || !result.urls) {
return { downloaded: false, error: result.error };
}
// Update the product record with local image URLs
await (0, connection_1.query)(`
UPDATE dutchie_products
SET
local_image_url = $1,
local_image_thumb_url = $2,
local_image_medium_url = $3,
original_image_url = COALESCE(original_image_url, primary_image_url),
updated_at = NOW()
WHERE id = $4
`, [result.urls.full, result.urls.thumb, result.urls.medium, productId]);
return { downloaded: true };
}
catch (error) {
return { downloaded: false, error: error.message };
}
}
/**
* Insert a snapshot record
*/
async function insertSnapshot(snapshot) {
const result = await (0, connection_1.query)(`
INSERT INTO dutchie_product_snapshots (
dutchie_product_id, dispensary_id, platform_dispensary_id, external_product_id,
pricing_type, crawl_mode, status, featured, special, medical_only, rec_only,
is_present_in_feed, stock_status,
rec_min_price_cents, rec_max_price_cents, rec_min_special_price_cents,
med_min_price_cents, med_max_price_cents, med_min_special_price_cents,
wholesale_min_price_cents,
total_quantity_available, total_kiosk_quantity_available, manual_inventory,
is_below_threshold, is_below_kiosk_threshold,
options, raw_payload, crawled_at
) VALUES (
$1, $2, $3, $4,
$5, $6, $7, $8, $9, $10, $11,
$12, $13,
$14, $15, $16,
$17, $18, $19,
$20,
$21, $22, $23,
$24, $25,
$26, $27, $28
)
RETURNING id
`, [
snapshot.dutchieProductId,
snapshot.dispensaryId,
snapshot.platformDispensaryId,
snapshot.externalProductId,
snapshot.pricingType,
snapshot.crawlMode,
snapshot.status,
snapshot.featured,
snapshot.special,
snapshot.medicalOnly,
snapshot.recOnly,
snapshot.isPresentInFeed ?? true,
snapshot.stockStatus,
snapshot.recMinPriceCents,
snapshot.recMaxPriceCents,
snapshot.recMinSpecialPriceCents,
snapshot.medMinPriceCents,
snapshot.medMaxPriceCents,
snapshot.medMinSpecialPriceCents,
snapshot.wholesaleMinPriceCents,
snapshot.totalQuantityAvailable,
snapshot.totalKioskQuantityAvailable,
snapshot.manualInventory,
snapshot.isBelowThreshold,
snapshot.isBelowKioskThreshold,
JSON.stringify(snapshot.options || []),
JSON.stringify(snapshot.rawPayload || {}),
snapshot.crawledAt,
]);
return result.rows[0].id;
}
// ============================================================
// BATCH DATABASE OPERATIONS (per CLAUDE.md Rule #15)
// ============================================================
/**
* Helper to chunk an array into smaller arrays
*/
function chunkArray(array, size) {
const chunks = [];
for (let i = 0; i < array.length; i += size) {
chunks.push(array.slice(i, i + size));
}
return chunks;
}
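// Illustrative sketch (not part of the compiled output): chunkArray splits a
// work list into BATCH_CHUNK_SIZE pieces, e.g. 250 items at a chunk size of 100:
//
//   chunkArray([...Array(250).keys()], 100).map(c => c.length); // [100, 100, 50]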
/**
* Batch upsert products - processes in chunks to avoid OOM
* Returns a Map of externalProductId -> database id
*/
async function batchUpsertProducts(products) {
const productIdMap = new Map();
const chunks = chunkArray(products, BATCH_CHUNK_SIZE);
console.log(`[ProductCrawler] Batch upserting ${products.length} products in ${chunks.length} chunks of ${BATCH_CHUNK_SIZE}...`);
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
// Process each product in the chunk
for (const product of chunk) {
try {
const id = await upsertProduct(product);
if (product.externalProductId) {
productIdMap.set(product.externalProductId, id);
}
}
catch (error) {
console.error(`[ProductCrawler] Error upserting product ${product.externalProductId}:`, error.message);
}
}
// Log progress
if ((i + 1) % 5 === 0 || i === chunks.length - 1) {
console.log(`[ProductCrawler] Upserted chunk ${i + 1}/${chunks.length} (${productIdMap.size} products so far)`);
}
}
return productIdMap;
}
/**
* Batch insert snapshots - processes in chunks to avoid OOM
*/
async function batchInsertSnapshots(snapshots) {
const chunks = chunkArray(snapshots, BATCH_CHUNK_SIZE);
let inserted = 0;
console.log(`[ProductCrawler] Batch inserting ${snapshots.length} snapshots in ${chunks.length} chunks of ${BATCH_CHUNK_SIZE}...`);
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
// Process each snapshot in the chunk
for (const snapshot of chunk) {
try {
await insertSnapshot(snapshot);
inserted++;
}
catch (error) {
console.error(`[ProductCrawler] Error inserting snapshot for ${snapshot.externalProductId}:`, error.message);
}
}
// Log progress
if ((i + 1) % 5 === 0 || i === chunks.length - 1) {
console.log(`[ProductCrawler] Inserted snapshot chunk ${i + 1}/${chunks.length} (${inserted} snapshots so far)`);
}
}
return inserted;
}
/**
* Update dispensary last_crawled_at and product_count
*/
async function updateDispensaryCrawlStats(dispensaryId, productCount) {
    // Update last_crawl_at to track when we last crawled.
    // product_count is intentionally not written (the column may not exist),
    // so the productCount argument is currently unused.
await (0, connection_1.query)(`
UPDATE dispensaries
SET last_crawl_at = NOW(), updated_at = NOW()
WHERE id = $1
`, [dispensaryId]);
}
/**
* Mark products as missing from feed
* Creates a snapshot with isPresentInFeed=false and stockStatus='missing_from_feed'
* for products that were NOT in the UNION of Mode A and Mode B product lists
*
* IMPORTANT: Uses UNION of both modes to avoid false positives
* If the union is empty (possible outage), we skip marking to avoid data corruption
*/
async function markMissingProducts(dispensaryId, platformDispensaryId, modeAProductIds, modeBProductIds, pricingType) {
// Build UNION of Mode A + Mode B product IDs
const unionProductIds = new Set([...Array.from(modeAProductIds), ...Array.from(modeBProductIds)]);
// OUTAGE DETECTION: If union is empty, something went wrong - don't mark anything as missing
if (unionProductIds.size === 0) {
console.warn('[ProductCrawler] OUTAGE DETECTED: Both Mode A and Mode B returned 0 products. Skipping missing product marking.');
return 0;
}
// Get all existing products for this dispensary that were not in the UNION
const { rows: missingProducts } = await (0, connection_1.query)(`
SELECT id, external_product_id, name
FROM dutchie_products
WHERE dispensary_id = $1
AND external_product_id NOT IN (SELECT unnest($2::text[]))
`, [dispensaryId, Array.from(unionProductIds)]);
if (missingProducts.length === 0) {
return 0;
}
console.log(`[ProductCrawler] Marking ${missingProducts.length} products as missing from feed (union of ${modeAProductIds.size} Mode A + ${modeBProductIds.size} Mode B = ${unionProductIds.size} unique)...`);
const crawledAt = new Date();
// Build all missing snapshots first (per CLAUDE.md Rule #15 - batch writes)
const missingSnapshots = missingProducts.map(product => ({
dutchieProductId: product.id,
dispensaryId,
platformDispensaryId,
externalProductId: product.external_product_id,
pricingType,
crawlMode: 'mode_a', // Use mode_a for missing snapshots (convention)
status: undefined,
featured: false,
special: false,
medicalOnly: false,
recOnly: false,
isPresentInFeed: false,
stockStatus: 'missing_from_feed',
totalQuantityAvailable: undefined, // null = unknown, not 0
manualInventory: false,
isBelowThreshold: false,
isBelowKioskThreshold: false,
options: [],
rawPayload: { _missingFromFeed: true, lastKnownName: product.name },
crawledAt,
}));
// Batch insert missing snapshots
const snapshotsInserted = await batchInsertSnapshots(missingSnapshots);
// Batch update product stock status in chunks
const productIds = missingProducts.map(p => p.id);
const productChunks = chunkArray(productIds, BATCH_CHUNK_SIZE);
console.log(`[ProductCrawler] Updating ${productIds.length} product statuses in ${productChunks.length} chunks...`);
for (const chunk of productChunks) {
await (0, connection_1.query)(`
UPDATE dutchie_products
SET stock_status = 'missing_from_feed', total_quantity_available = NULL, updated_at = NOW()
WHERE id = ANY($1::int[])
`, [chunk]);
}
console.log(`[ProductCrawler] Marked ${snapshotsInserted} products as missing from feed`);
return snapshotsInserted;
}
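// Illustrative sketch (not part of the compiled output): how the union-based
// marking behaves for a hypothetical dispensary.
//
//   Mode A returns ids {a, b}, Mode B returns {b, c} -> union = {a, b, c};
//   any stored product whose external_product_id is not in {a, b, c} gets a
//   'missing_from_feed' snapshot and its stock_status set to 'missing_from_feed'.
//
//   Both modes return no products -> outage detected; nothing is marked and
//   the function returns 0.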
/**
* Process a batch of products from a single crawl mode
* IMPORTANT: Stores ALL products, never filters before DB
* Uses chunked batch processing per CLAUDE.md Rule #15 to avoid OOM
* Returns the set of external product IDs that were processed
*/
async function processProducts(products, dispensary, pricingType, crawlMode, options = {}) {
const { downloadImages = true } = options;
const productIds = new Set();
let imagesDownloaded = 0;
let imageErrors = 0;
console.log(`[ProductCrawler] Processing ${products.length} products using chunked batch processing...`);
// Step 1: Normalize all products and collect IDs
const normalizedProducts = [];
const rawByExternalId = new Map();
for (const raw of products) {
const externalId = raw._id || raw.id || '';
productIds.add(externalId);
rawByExternalId.set(externalId, raw);
const normalized = normalizeProduct(raw, dispensary.id, dispensary.platformDispensaryId);
normalizedProducts.push(normalized);
}
// Step 2: Batch upsert products (chunked)
const productIdMap = await batchUpsertProducts(normalizedProducts);
const upserted = productIdMap.size;
// Step 3: Create and batch insert snapshots (chunked)
// IMPORTANT: Do this BEFORE image downloads to ensure snapshots are created even if images fail
const snapshots = [];
for (const [externalId, productId] of Array.from(productIdMap.entries())) {
const raw = rawByExternalId.get(externalId);
if (raw) {
const snapshot = normalizeSnapshot(raw, productId, dispensary.id, dispensary.platformDispensaryId, pricingType, crawlMode);
snapshots.push(snapshot);
}
}
const snapshotsInserted = await batchInsertSnapshots(snapshots);
// Step 4: Download images in chunks (if enabled)
// This is done AFTER snapshots to ensure core data is saved even if image downloads fail
if (downloadImages) {
const imageChunks = chunkArray(Array.from(productIdMap.entries()), BATCH_CHUNK_SIZE);
console.log(`[ProductCrawler] Downloading images in ${imageChunks.length} chunks...`);
for (let i = 0; i < imageChunks.length; i++) {
const chunk = imageChunks[i];
for (const [externalId, productId] of chunk) {
const normalized = normalizedProducts.find(p => p.externalProductId === externalId);
if (normalized?.primaryImageUrl) {
try {
const imageResult = await downloadAndUpdateProductImage(productId, dispensary.id, externalId, normalized.primaryImageUrl);
if (imageResult.downloaded) {
imagesDownloaded++;
}
else if (imageResult.error && imageResult.error !== 'No image URL') {
imageErrors++;
}
}
catch (error) {
imageErrors++;
}
}
}
if ((i + 1) % 5 === 0 || i === imageChunks.length - 1) {
console.log(`[ProductCrawler] Image download chunk ${i + 1}/${imageChunks.length} (${imagesDownloaded} downloaded, ${imageErrors} errors)`);
}
}
}
// Clear references to help GC
normalizedProducts.length = 0;
rawByExternalId.clear();
return { upserted, snapshots: snapshotsInserted, productIds, imagesDownloaded, imageErrors };
}
async function crawlDispensaryProducts(dispensary, pricingType = 'rec', options = {}) {
const { useBothModes = true, downloadImages = true, onProgress } = options;
const startTime = Date.now();
if (!dispensary.platformDispensaryId) {
return {
success: false,
dispensaryId: dispensary.id,
productsFound: 0,
productsFetched: 0,
productsUpserted: 0,
snapshotsCreated: 0,
errorMessage: 'Missing platformDispensaryId',
durationMs: Date.now() - startTime,
};
}
try {
console.log(`[ProductCrawler] Crawling ${dispensary.name} (${dispensary.platformDispensaryId})...`);
let totalUpserted = 0;
let totalSnapshots = 0;
let totalImagesDownloaded = 0;
let totalImageErrors = 0;
let modeAProducts = 0;
let modeBProducts = 0;
let missingMarked = 0;
// Track product IDs separately for each mode (needed for missing product detection)
const modeAProductIds = new Set();
const modeBProductIds = new Set();
// Extract cName for this specific dispensary (used for Puppeteer session & headers)
const cName = extractCName(dispensary);
console.log(`[ProductCrawler] Using cName="${cName}" for dispensary ${dispensary.name}`);
if (useBothModes) {
// Run two-mode crawl for maximum coverage
const bothResults = await (0, graphql_client_1.fetchAllProductsBothModes)(dispensary.platformDispensaryId, pricingType, { cName });
modeAProducts = bothResults.modeA.products.length;
modeBProducts = bothResults.modeB.products.length;
console.log(`[ProductCrawler] Two-mode crawl: Mode A=${modeAProducts}, Mode B=${modeBProducts}, Merged=${bothResults.merged.products.length}`);
// Collect Mode A product IDs
for (const p of bothResults.modeA.products) {
modeAProductIds.add(p._id);
}
// Collect Mode B product IDs
for (const p of bothResults.modeB.products) {
modeBProductIds.add(p._id);
}
// Process MERGED products (includes options from both modes)
if (bothResults.merged.products.length > 0) {
const mergedResult = await processProducts(bothResults.merged.products, dispensary, pricingType, 'mode_a', // Use mode_a for merged products (convention)
{ downloadImages });
totalUpserted = mergedResult.upserted;
totalSnapshots = mergedResult.snapshots;
totalImagesDownloaded = mergedResult.imagesDownloaded;
totalImageErrors = mergedResult.imageErrors;
// Report progress
if (onProgress) {
await onProgress({
productsFound: bothResults.merged.products.length,
productsUpserted: totalUpserted,
snapshotsCreated: totalSnapshots,
currentPage: 1,
totalPages: 1,
});
}
}
}
else {
// Single mode crawl (Mode A only)
const { products, crawlMode } = await (0, graphql_client_1.fetchAllProducts)(dispensary.platformDispensaryId, pricingType, { crawlMode: 'mode_a', cName });
modeAProducts = products.length;
// Collect Mode A product IDs
for (const p of products) {
modeAProductIds.add(p._id);
}
const result = await processProducts(products, dispensary, pricingType, crawlMode, { downloadImages });
totalUpserted = result.upserted;
totalSnapshots = result.snapshots;
totalImagesDownloaded = result.imagesDownloaded;
totalImageErrors = result.imageErrors;
// Report progress
if (onProgress) {
await onProgress({
productsFound: products.length,
productsUpserted: totalUpserted,
snapshotsCreated: totalSnapshots,
currentPage: 1,
totalPages: 1,
});
}
}
// Mark products as missing using UNION of Mode A + Mode B
// The function handles outage detection (empty union = skip marking)
missingMarked = await markMissingProducts(dispensary.id, dispensary.platformDispensaryId, modeAProductIds, modeBProductIds, pricingType);
totalSnapshots += missingMarked;
// Update dispensary stats
await updateDispensaryCrawlStats(dispensary.id, totalUpserted);
console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing, ${totalImagesDownloaded} images downloaded`);
const totalProductsFound = modeAProducts + modeBProducts;
return {
success: true,
dispensaryId: dispensary.id,
productsFound: totalProductsFound,
productsFetched: totalProductsFound,
productsUpserted: totalUpserted,
snapshotsCreated: totalSnapshots,
modeAProducts,
modeBProducts,
missingProductsMarked: missingMarked,
imagesDownloaded: totalImagesDownloaded,
imageErrors: totalImageErrors,
durationMs: Date.now() - startTime,
};
}
catch (error) {
console.error(`[ProductCrawler] Failed to crawl ${dispensary.name}:`, error.message);
return {
success: false,
dispensaryId: dispensary.id,
productsFound: 0,
productsFetched: 0,
productsUpserted: 0,
snapshotsCreated: 0,
errorMessage: error.message,
durationMs: Date.now() - startTime,
};
}
}
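// Illustrative usage sketch (not part of the compiled output): crawling a single
// dispensary row (loaded elsewhere via mapDbRowToDispensary) with both modes and
// image downloads disabled:
//
//   const result = await crawlDispensaryProducts(dispensary, 'rec', {
//     useBothModes: true,
//     downloadImages: false,
//   });
//   if (!result.success) console.error(result.errorMessage);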
/**
* Crawl all Arizona dispensaries
*/
async function crawlAllArizonaDispensaries(pricingType = 'rec') {
const results = [];
// Get all AZ dispensaries with platform IDs
const { rows: rawRows } = await (0, connection_1.query)(`
SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
ORDER BY id
`);
const dispensaries = rawRows.map(discovery_1.mapDbRowToDispensary);
console.log(`[ProductCrawler] Starting crawl of ${dispensaries.length} dispensaries...`);
for (const dispensary of dispensaries) {
const result = await crawlDispensaryProducts(dispensary, pricingType);
results.push(result);
// Delay between dispensaries
await new Promise((r) => setTimeout(r, 2000));
}
const successful = results.filter((r) => r.success).length;
const totalProducts = results.reduce((sum, r) => sum + r.productsUpserted, 0);
const totalSnapshots = results.reduce((sum, r) => sum + r.snapshotsCreated, 0);
console.log(`[ProductCrawler] Completed: ${successful}/${dispensaries.length} stores, ${totalProducts} products, ${totalSnapshots} snapshots`);
return results;
}

View File

@@ -0,0 +1,595 @@
"use strict";
/**
* Dutchie AZ Scheduler Service
*
* Handles scheduled crawling with JITTER - no fixed intervals!
* Each job re-schedules itself with a NEW random offset after each run.
* This makes timing "wander" around the clock, avoiding detectable patterns.
*
* Jitter Logic:
* nextRunAt = lastRunAt + baseIntervalMinutes + random(-jitterMinutes, +jitterMinutes)
*
* Example: 4-hour base with ±30min jitter = runs anywhere from 3h30m to 4h30m apart
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlSingleDispensary = void 0;
exports.getAllSchedules = getAllSchedules;
exports.getScheduleById = getScheduleById;
exports.createSchedule = createSchedule;
exports.updateSchedule = updateSchedule;
exports.deleteSchedule = deleteSchedule;
exports.getRunLogs = getRunLogs;
exports.startScheduler = startScheduler;
exports.stopScheduler = stopScheduler;
exports.getSchedulerStatus = getSchedulerStatus;
exports.triggerScheduleNow = triggerScheduleNow;
exports.initializeDefaultSchedules = initializeDefaultSchedules;
exports.triggerImmediateCrawl = triggerImmediateCrawl;
const connection_1 = require("../db/connection");
const menu_detection_1 = require("./menu-detection");
const job_queue_1 = require("./job-queue");
// Scheduler poll interval (how often we check for due jobs)
const SCHEDULER_POLL_INTERVAL_MS = 60 * 1000; // 1 minute
// Track running state
let isSchedulerRunning = false;
let schedulerInterval = null;
// ============================================================
// JITTER CALCULATION
// ============================================================
/**
* Generate a random jitter value in minutes
* Returns a value between -jitterMinutes and +jitterMinutes
*/
function getRandomJitterMinutes(jitterMinutes) {
// random() returns [0, 1), we want [-jitter, +jitter]
return (Math.random() * 2 - 1) * jitterMinutes;
}
/**
* Calculate next run time with jitter
* nextRunAt = baseTime + baseIntervalMinutes + random(-jitter, +jitter)
*/
function calculateNextRunAt(baseTime, baseIntervalMinutes, jitterMinutes) {
const jitter = getRandomJitterMinutes(jitterMinutes);
const totalMinutes = baseIntervalMinutes + jitter;
const totalMs = totalMinutes * 60 * 1000;
return new Date(baseTime.getTime() + totalMs);
}
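// Illustrative sketch (not part of the original source): for the default product crawl
// schedule (240 min base, ±30 min jitter), calculateNextRunAt always lands roughly
// 3h30m-4h30m after the base time, and a fresh random offset is drawn on every run.
//
//   const base = new Date('2025-01-01T00:00:00Z');
//   const next = calculateNextRunAt(base, 240, 30);
//   const minutesOut = (next.getTime() - base.getTime()) / 60000;
//   // minutesOut falls uniformly in [210, 270)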
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
* Get all job schedules
*/
async function getAllSchedules() {
const { rows } = await (0, connection_1.query)(`
SELECT
id, job_name, description, enabled,
base_interval_minutes, jitter_minutes,
last_run_at, last_status, last_error_message, last_duration_ms,
next_run_at, job_config, created_at, updated_at
FROM job_schedules
ORDER BY job_name
`);
return rows.map(row => ({
id: row.id,
jobName: row.job_name,
description: row.description,
enabled: row.enabled,
baseIntervalMinutes: row.base_interval_minutes,
jitterMinutes: row.jitter_minutes,
lastRunAt: row.last_run_at,
lastStatus: row.last_status,
lastErrorMessage: row.last_error_message,
lastDurationMs: row.last_duration_ms,
nextRunAt: row.next_run_at,
jobConfig: row.job_config,
createdAt: row.created_at,
updatedAt: row.updated_at,
}));
}
/**
* Get a single schedule by ID
*/
async function getScheduleById(id) {
const { rows } = await (0, connection_1.query)(`SELECT * FROM job_schedules WHERE id = $1`, [id]);
if (rows.length === 0)
return null;
const row = rows[0];
return {
id: row.id,
jobName: row.job_name,
description: row.description,
enabled: row.enabled,
baseIntervalMinutes: row.base_interval_minutes,
jitterMinutes: row.jitter_minutes,
lastRunAt: row.last_run_at,
lastStatus: row.last_status,
lastErrorMessage: row.last_error_message,
lastDurationMs: row.last_duration_ms,
nextRunAt: row.next_run_at,
jobConfig: row.job_config,
createdAt: row.created_at,
updatedAt: row.updated_at,
};
}
/**
* Create a new schedule
*/
async function createSchedule(schedule) {
// Calculate initial nextRunAt
const nextRunAt = schedule.startImmediately
? new Date() // Start immediately
: calculateNextRunAt(new Date(), schedule.baseIntervalMinutes, schedule.jitterMinutes);
const { rows } = await (0, connection_1.query)(`
INSERT INTO job_schedules (
job_name, description, enabled,
base_interval_minutes, jitter_minutes,
next_run_at, job_config
) VALUES ($1, $2, $3, $4, $5, $6, $7)
RETURNING *
`, [
schedule.jobName,
schedule.description || null,
schedule.enabled ?? true,
schedule.baseIntervalMinutes,
schedule.jitterMinutes,
nextRunAt,
schedule.jobConfig ? JSON.stringify(schedule.jobConfig) : null,
]);
const row = rows[0];
console.log(`[Scheduler] Created schedule "${schedule.jobName}" - next run at ${nextRunAt.toISOString()}`);
return {
id: row.id,
jobName: row.job_name,
description: row.description,
enabled: row.enabled,
baseIntervalMinutes: row.base_interval_minutes,
jitterMinutes: row.jitter_minutes,
lastRunAt: row.last_run_at,
lastStatus: row.last_status,
lastErrorMessage: row.last_error_message,
lastDurationMs: row.last_duration_ms,
nextRunAt: row.next_run_at,
jobConfig: row.job_config,
createdAt: row.created_at,
updatedAt: row.updated_at,
};
}
/**
* Update a schedule
*/
async function updateSchedule(id, updates) {
const setClauses = [];
const params = [];
let paramIndex = 1;
if (updates.description !== undefined) {
setClauses.push(`description = $${paramIndex++}`);
params.push(updates.description);
}
if (updates.enabled !== undefined) {
setClauses.push(`enabled = $${paramIndex++}`);
params.push(updates.enabled);
}
if (updates.baseIntervalMinutes !== undefined) {
setClauses.push(`base_interval_minutes = $${paramIndex++}`);
params.push(updates.baseIntervalMinutes);
}
if (updates.jitterMinutes !== undefined) {
setClauses.push(`jitter_minutes = $${paramIndex++}`);
params.push(updates.jitterMinutes);
}
if (updates.jobConfig !== undefined) {
setClauses.push(`job_config = $${paramIndex++}`);
params.push(JSON.stringify(updates.jobConfig));
}
if (setClauses.length === 0) {
return getScheduleById(id);
}
setClauses.push(`updated_at = NOW()`);
params.push(id);
const { rows } = await (0, connection_1.query)(`UPDATE job_schedules SET ${setClauses.join(', ')} WHERE id = $${paramIndex} RETURNING *`, params);
if (rows.length === 0)
return null;
const row = rows[0];
return {
id: row.id,
jobName: row.job_name,
description: row.description,
enabled: row.enabled,
baseIntervalMinutes: row.base_interval_minutes,
jitterMinutes: row.jitter_minutes,
lastRunAt: row.last_run_at,
lastStatus: row.last_status,
lastErrorMessage: row.last_error_message,
lastDurationMs: row.last_duration_ms,
nextRunAt: row.next_run_at,
jobConfig: row.job_config,
createdAt: row.created_at,
updatedAt: row.updated_at,
};
}
/**
* Delete a schedule
*/
async function deleteSchedule(id) {
const result = await (0, connection_1.query)(`DELETE FROM job_schedules WHERE id = $1`, [id]);
return (result.rowCount || 0) > 0;
}
/**
* Mark a schedule as running
*/
async function markScheduleRunning(id) {
await (0, connection_1.query)(`UPDATE job_schedules SET last_status = 'running', updated_at = NOW() WHERE id = $1`, [id]);
}
/**
* Update schedule after job completion with NEW jittered next_run_at
*/
async function updateScheduleAfterRun(id, status, durationMs, errorMessage) {
// Get current schedule to calculate new nextRunAt
const schedule = await getScheduleById(id);
if (!schedule)
return;
const now = new Date();
const newNextRunAt = calculateNextRunAt(now, schedule.baseIntervalMinutes, schedule.jitterMinutes);
console.log(`[Scheduler] Schedule "${schedule.jobName}" completed (${status}). Next run: ${newNextRunAt.toISOString()}`);
await (0, connection_1.query)(`
UPDATE job_schedules SET
last_run_at = $2,
last_status = $3,
last_error_message = $4,
last_duration_ms = $5,
next_run_at = $6,
updated_at = NOW()
WHERE id = $1
`, [id, now, status, errorMessage || null, durationMs, newNextRunAt]);
}
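// Illustrative wander (hypothetical timestamps, 240 min base, ±30 min jitter): because
// next_run_at is re-drawn from each run's *completion* time, start times drift around
// the clock instead of repeating on a fixed grid:
//
//   run 1 finishes 10:00 -> next run ~13:47
//   run 2 finishes 13:52 -> next run ~18:20
//   run 3 finishes 18:26 -> next run ~22:11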
/**
* Create a job run log entry
*/
async function createRunLog(scheduleId, jobName, status) {
const { rows } = await (0, connection_1.query)(`
INSERT INTO job_run_logs (schedule_id, job_name, status, started_at)
VALUES ($1, $2, $3, NOW())
RETURNING id
`, [scheduleId, jobName, status]);
return rows[0].id;
}
/**
* Update a job run log entry
*/
async function updateRunLog(runLogId, status, results) {
await (0, connection_1.query)(`
UPDATE job_run_logs SET
status = $2,
completed_at = NOW(),
duration_ms = $3,
error_message = $4,
items_processed = $5,
items_succeeded = $6,
items_failed = $7,
metadata = $8
WHERE id = $1
`, [
runLogId,
status,
results.durationMs,
results.errorMessage || null,
results.itemsProcessed || 0,
results.itemsSucceeded || 0,
results.itemsFailed || 0,
results.metadata ? JSON.stringify(results.metadata) : null,
]);
}
/**
* Get job run logs
*/
async function getRunLogs(options) {
const { scheduleId, jobName, limit = 50, offset = 0 } = options;
let whereClause = 'WHERE 1=1';
const params = [];
let paramIndex = 1;
if (scheduleId) {
whereClause += ` AND schedule_id = $${paramIndex++}`;
params.push(scheduleId);
}
if (jobName) {
whereClause += ` AND job_name = $${paramIndex++}`;
params.push(jobName);
}
params.push(limit, offset);
const { rows } = await (0, connection_1.query)(`
SELECT * FROM job_run_logs
${whereClause}
ORDER BY created_at DESC
LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
`, params);
const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM job_run_logs ${whereClause}`, params.slice(0, -2));
return {
logs: rows,
total: parseInt(countRows[0]?.total || '0', 10),
};
}
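// Example usage (illustrative): fetch the 20 most recent runs of the product crawl job
// plus the total count for pagination. All filters and paging options are optional.
//
//   const { logs, total } = await getRunLogs({
//     jobName: 'dutchie_az_product_crawl',
//     limit: 20,
//     offset: 0,
//   });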
// ============================================================
// JOB EXECUTION
// ============================================================
/**
* Execute a job based on its name
*/
async function executeJob(schedule) {
const config = schedule.jobConfig || {};
switch (schedule.jobName) {
case 'dutchie_az_product_crawl':
return executeProductCrawl(config);
case 'dutchie_az_discovery':
return executeDiscovery(config);
case 'dutchie_az_menu_detection':
return (0, menu_detection_1.executeMenuDetectionJob)(config);
default:
throw new Error(`Unknown job type: ${schedule.jobName}`);
}
}
/**
* Execute the AZ Dutchie product crawl job
*
* NEW BEHAVIOR: Instead of running crawls directly, this now ENQUEUES jobs
* into the crawl_jobs queue. Workers (running as separate replicas) will
* pick up and process these jobs.
*
* This allows:
* - Multiple workers to process jobs in parallel
* - No double-crawls (DB-level locking per dispensary)
* - Better scalability (add more worker replicas)
* - Live monitoring of individual job progress
*/
async function executeProductCrawl(config) {
const pricingType = config.pricingType || 'rec';
const useBothModes = config.useBothModes !== false;
// Get all "ready" dispensaries (menu_type='dutchie' AND platform_dispensary_id IS NOT NULL AND not failed)
// Note: Menu detection is handled separately by the dutchie_az_menu_detection schedule
const { rows: rawRows } = await (0, connection_1.query)(`
SELECT id FROM dispensaries
WHERE state = 'AZ'
AND menu_type = 'dutchie'
AND platform_dispensary_id IS NOT NULL
AND failed_at IS NULL
ORDER BY last_crawl_at ASC NULLS FIRST
`);
const dispensaryIds = rawRows.map((r) => r.id);
if (dispensaryIds.length === 0) {
return {
status: 'success',
itemsProcessed: 0,
itemsSucceeded: 0,
itemsFailed: 0,
metadata: { message: 'No ready dispensaries to crawl. Run menu detection to discover more.' },
};
}
console.log(`[Scheduler] Enqueueing crawl jobs for ${dispensaryIds.length} dispensaries...`);
// Bulk enqueue jobs (skips dispensaries that already have pending/running jobs)
const { enqueued, skipped } = await (0, job_queue_1.bulkEnqueueJobs)('dutchie_product_crawl', dispensaryIds, {
priority: 0,
metadata: { pricingType, useBothModes },
});
console.log(`[Scheduler] Enqueued ${enqueued} jobs, skipped ${skipped} (already queued)`);
// Get current queue stats
const queueStats = await (0, job_queue_1.getQueueStats)();
return {
status: 'success',
itemsProcessed: dispensaryIds.length,
itemsSucceeded: enqueued,
itemsFailed: 0, // Enqueue itself doesn't fail
metadata: {
enqueued,
skipped,
queueStats,
pricingType,
useBothModes,
message: `Enqueued ${enqueued} jobs. Workers will process them. Check /scraper-monitor for progress.`,
},
};
}
/**
* Execute the AZ Dutchie discovery job (placeholder)
*/
async function executeDiscovery(_config) {
// Placeholder - implement discovery logic
return {
status: 'success',
itemsProcessed: 0,
itemsSucceeded: 0,
itemsFailed: 0,
metadata: { message: 'Discovery not yet implemented' },
};
}
// ============================================================
// SCHEDULER RUNNER
// ============================================================
/**
* Check for due jobs and run them
*/
async function checkAndRunDueJobs() {
try {
// Get enabled schedules where nextRunAt <= now
const { rows } = await (0, connection_1.query)(`
SELECT * FROM job_schedules
WHERE enabled = true
AND next_run_at IS NOT NULL
AND next_run_at <= NOW()
AND (last_status IS NULL OR last_status != 'running')
ORDER BY next_run_at ASC
`);
if (rows.length === 0)
return;
console.log(`[Scheduler] Found ${rows.length} due job(s)`);
for (const row of rows) {
const schedule = {
id: row.id,
jobName: row.job_name,
description: row.description,
enabled: row.enabled,
baseIntervalMinutes: row.base_interval_minutes,
jitterMinutes: row.jitter_minutes,
lastRunAt: row.last_run_at,
lastStatus: row.last_status,
lastErrorMessage: row.last_error_message,
lastDurationMs: row.last_duration_ms,
nextRunAt: row.next_run_at,
jobConfig: row.job_config,
createdAt: row.created_at,
updatedAt: row.updated_at,
};
await runScheduledJob(schedule);
}
}
catch (error) {
console.error('[Scheduler] Error checking for due jobs:', error);
}
}
/**
* Run a single scheduled job
*/
async function runScheduledJob(schedule) {
const startTime = Date.now();
console.log(`[Scheduler] Starting job "${schedule.jobName}"...`);
// Mark as running
await markScheduleRunning(schedule.id);
// Create run log entry
const runLogId = await createRunLog(schedule.id, schedule.jobName, 'running');
try {
// Execute the job
const result = await executeJob(schedule);
const durationMs = Date.now() - startTime;
// Determine final status (exclude 'running' and null)
const finalStatus = result.status === 'running' || result.status === null
? 'success'
: result.status;
// Update run log
await updateRunLog(runLogId, finalStatus, {
durationMs,
errorMessage: result.errorMessage,
itemsProcessed: result.itemsProcessed,
itemsSucceeded: result.itemsSucceeded,
itemsFailed: result.itemsFailed,
metadata: result.metadata,
});
// Update schedule with NEW jittered next_run_at
await updateScheduleAfterRun(schedule.id, result.status, durationMs, result.errorMessage);
console.log(`[Scheduler] Job "${schedule.jobName}" completed in ${Math.round(durationMs / 1000)}s (${result.status})`);
}
catch (error) {
const durationMs = Date.now() - startTime;
console.error(`[Scheduler] Job "${schedule.jobName}" failed:`, error.message);
// Update run log with error
await updateRunLog(runLogId, 'error', {
durationMs,
errorMessage: error.message,
itemsProcessed: 0,
itemsSucceeded: 0,
itemsFailed: 0,
});
// Update schedule with NEW jittered next_run_at
await updateScheduleAfterRun(schedule.id, 'error', durationMs, error.message);
}
}
// ============================================================
// PUBLIC API
// ============================================================
/**
* Start the scheduler
*/
function startScheduler() {
if (isSchedulerRunning) {
console.log('[Scheduler] Scheduler is already running');
return;
}
isSchedulerRunning = true;
console.log(`[Scheduler] Starting scheduler (polling every ${SCHEDULER_POLL_INTERVAL_MS / 1000}s)...`);
// Immediately check for due jobs
checkAndRunDueJobs();
// Set up interval to check for due jobs
schedulerInterval = setInterval(checkAndRunDueJobs, SCHEDULER_POLL_INTERVAL_MS);
}
/**
* Stop the scheduler
*/
function stopScheduler() {
if (!isSchedulerRunning) {
console.log('[Scheduler] Scheduler is not running');
return;
}
isSchedulerRunning = false;
if (schedulerInterval) {
clearInterval(schedulerInterval);
schedulerInterval = null;
}
console.log('[Scheduler] Scheduler stopped');
}
/**
* Get scheduler status
*/
function getSchedulerStatus() {
return {
running: isSchedulerRunning,
pollIntervalMs: SCHEDULER_POLL_INTERVAL_MS,
};
}
/**
* Trigger immediate execution of a schedule
*/
async function triggerScheduleNow(scheduleId) {
const schedule = await getScheduleById(scheduleId);
if (!schedule) {
return { success: false, message: 'Schedule not found' };
}
if (schedule.lastStatus === 'running') {
return { success: false, message: 'Job is already running' };
}
// Run the job
await runScheduledJob(schedule);
return { success: true, message: 'Job triggered successfully' };
}
/**
* Initialize default schedules if they don't exist
*/
async function initializeDefaultSchedules() {
const schedules = await getAllSchedules();
// Check if product crawl schedule exists
const productCrawlExists = schedules.some(s => s.jobName === 'dutchie_az_product_crawl');
if (!productCrawlExists) {
await createSchedule({
jobName: 'dutchie_az_product_crawl',
description: 'Crawl all AZ Dutchie dispensary products',
enabled: true,
baseIntervalMinutes: 240, // 4 hours
jitterMinutes: 30, // ±30 minutes
jobConfig: { pricingType: 'rec', useBothModes: true },
startImmediately: false,
});
console.log('[Scheduler] Created default product crawl schedule');
}
// Check if menu detection schedule exists
const menuDetectionExists = schedules.some(s => s.jobName === 'dutchie_az_menu_detection');
if (!menuDetectionExists) {
await createSchedule({
jobName: 'dutchie_az_menu_detection',
description: 'Detect menu providers and resolve platform IDs for AZ dispensaries',
enabled: true,
baseIntervalMinutes: 1440, // 24 hours
jitterMinutes: 60, // ±1 hour
jobConfig: { state: 'AZ', onlyUnknown: true },
startImmediately: false,
});
console.log('[Scheduler] Created default menu detection schedule');
}
}
// Re-export for backward compatibility
var product_crawler_1 = require("./product-crawler");
Object.defineProperty(exports, "crawlSingleDispensary", { enumerable: true, get: function () { return product_crawler_1.crawlDispensaryProducts; } });
async function triggerImmediateCrawl() {
const schedules = await getAllSchedules();
const productCrawl = schedules.find(s => s.jobName === 'dutchie_az_product_crawl');
if (productCrawl) {
return triggerScheduleNow(productCrawl.id);
}
return { success: false, message: 'Product crawl schedule not found' };
}


@@ -0,0 +1,440 @@
"use strict";
/**
* Worker Service
*
* Polls the job queue and processes crawl jobs.
* Each worker instance runs independently, claiming jobs atomically.
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.startWorker = startWorker;
exports.stopWorker = stopWorker;
exports.getWorkerStatus = getWorkerStatus;
const job_queue_1 = require("./job-queue");
const product_crawler_1 = require("./product-crawler");
const discovery_1 = require("./discovery");
const connection_1 = require("../db/connection");
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
// NOTE: failed_at is included for worker compatibility checks
const DISPENSARY_COLUMNS = `
id, name, slug, city, state, zip, address, latitude, longitude,
menu_type, menu_url, platform_dispensary_id, website,
provider_detection_data, created_at, updated_at, failed_at
`;
// ============================================================
// WORKER CONFIG
// ============================================================
const POLL_INTERVAL_MS = 5000; // Check for jobs every 5 seconds
const HEARTBEAT_INTERVAL_MS = 60000; // Send heartbeat every 60 seconds
const STALE_CHECK_INTERVAL_MS = 300000; // Check for stale jobs every 5 minutes
const SHUTDOWN_GRACE_PERIOD_MS = 30000; // Wait 30s for job to complete on shutdown
// ============================================================
// WORKER STATE
// ============================================================
let isRunning = false;
let currentJob = null;
let pollTimer = null;
let heartbeatTimer = null;
let staleCheckTimer = null;
let shutdownPromise = null;
// ============================================================
// WORKER LIFECYCLE
// ============================================================
/**
* Start the worker
*/
async function startWorker() {
if (isRunning) {
console.log('[Worker] Already running');
return;
}
const workerId = (0, job_queue_1.getWorkerId)();
const hostname = (0, job_queue_1.getWorkerHostname)();
console.log(`[Worker] Starting worker ${workerId} on ${hostname}`);
isRunning = true;
// Set up graceful shutdown
setupShutdownHandlers();
// Start polling for jobs
pollTimer = setInterval(pollForJobs, POLL_INTERVAL_MS);
// Start stale job recovery (only one worker should do this, but it's idempotent)
staleCheckTimer = setInterval(async () => {
try {
await (0, job_queue_1.recoverStaleJobs)(15);
}
catch (error) {
console.error('[Worker] Error recovering stale jobs:', error);
}
}, STALE_CHECK_INTERVAL_MS);
// Immediately poll for a job
await pollForJobs();
console.log(`[Worker] Worker ${workerId} started, polling every ${POLL_INTERVAL_MS}ms`);
}
/**
* Stop the worker gracefully
*/
async function stopWorker() {
if (!isRunning)
return;
console.log('[Worker] Stopping worker...');
isRunning = false;
// Clear timers
if (pollTimer) {
clearInterval(pollTimer);
pollTimer = null;
}
if (heartbeatTimer) {
clearInterval(heartbeatTimer);
heartbeatTimer = null;
}
if (staleCheckTimer) {
clearInterval(staleCheckTimer);
staleCheckTimer = null;
}
// Wait for current job to complete
if (currentJob) {
console.log(`[Worker] Waiting for job ${currentJob.id} to complete...`);
const startWait = Date.now();
while (currentJob && Date.now() - startWait < SHUTDOWN_GRACE_PERIOD_MS) {
await new Promise(r => setTimeout(r, 1000));
}
if (currentJob) {
console.log(`[Worker] Job ${currentJob.id} did not complete in time, marking for retry`);
await (0, job_queue_1.failJob)(currentJob.id, 'Worker shutdown');
}
}
console.log('[Worker] Worker stopped');
}
/**
* Get worker status
*/
function getWorkerStatus() {
return {
isRunning,
workerId: (0, job_queue_1.getWorkerId)(),
hostname: (0, job_queue_1.getWorkerHostname)(),
currentJob,
};
}
// ============================================================
// JOB PROCESSING
// ============================================================
/**
* Poll for and process the next available job
*/
async function pollForJobs() {
if (!isRunning || currentJob) {
return; // Already processing a job
}
try {
const workerId = (0, job_queue_1.getWorkerId)();
// Try to claim a job
const job = await (0, job_queue_1.claimNextJob)({
workerId,
jobTypes: ['dutchie_product_crawl', 'menu_detection', 'menu_detection_single'],
lockDurationMinutes: 30,
});
if (!job) {
return; // No jobs available
}
currentJob = job;
console.log(`[Worker] Processing job ${job.id} (type=${job.jobType}, dispensary=${job.dispensaryId})`);
// Start heartbeat for this job
heartbeatTimer = setInterval(async () => {
if (currentJob) {
try {
await (0, job_queue_1.heartbeat)(currentJob.id);
}
catch (error) {
console.error('[Worker] Heartbeat error:', error);
}
}
}, HEARTBEAT_INTERVAL_MS);
// Process the job
await processJob(job);
}
catch (error) {
console.error('[Worker] Error polling for jobs:', error);
if (currentJob) {
try {
await (0, job_queue_1.failJob)(currentJob.id, error.message);
}
catch (failError) {
console.error('[Worker] Error failing job:', failError);
}
}
}
finally {
// Clear heartbeat timer
if (heartbeatTimer) {
clearInterval(heartbeatTimer);
heartbeatTimer = null;
}
currentJob = null;
}
}
/**
* Process a single job
*/
async function processJob(job) {
try {
switch (job.jobType) {
case 'dutchie_product_crawl':
await processProductCrawlJob(job);
break;
case 'menu_detection':
await processMenuDetectionJob(job);
break;
case 'menu_detection_single':
await processSingleDetectionJob(job);
break;
default:
throw new Error(`Unknown job type: ${job.jobType}`);
}
}
catch (error) {
console.error(`[Worker] Job ${job.id} failed:`, error);
await (0, job_queue_1.failJob)(job.id, error.message);
}
}
// Maximum consecutive failures before flagging a dispensary
const MAX_CONSECUTIVE_FAILURES = 3;
/**
* Record a successful crawl - resets failure counter
*/
async function recordCrawlSuccess(dispensaryId) {
await (0, connection_1.query)(`UPDATE dispensaries
SET consecutive_failures = 0,
last_crawl_at = NOW(),
updated_at = NOW()
WHERE id = $1`, [dispensaryId]);
}
/**
* Record a crawl failure - increments counter and may flag dispensary
* Returns true if dispensary was flagged as failed
*/
async function recordCrawlFailure(dispensaryId, errorMessage) {
// Increment failure counter
const { rows } = await (0, connection_1.query)(`UPDATE dispensaries
SET consecutive_failures = consecutive_failures + 1,
last_failure_at = NOW(),
last_failure_reason = $2,
updated_at = NOW()
WHERE id = $1
RETURNING consecutive_failures`, [dispensaryId, errorMessage]);
const failures = rows[0]?.consecutive_failures || 0;
// If we've hit the threshold, flag the dispensary as failed
if (failures >= MAX_CONSECUTIVE_FAILURES) {
await (0, connection_1.query)(`UPDATE dispensaries
SET failed_at = NOW(),
menu_type = NULL,
platform_dispensary_id = NULL,
failure_notes = $2,
updated_at = NOW()
WHERE id = $1`, [dispensaryId, `Auto-flagged after ${failures} consecutive failures. Last error: ${errorMessage}`]);
console.log(`[Worker] Dispensary ${dispensaryId} flagged as FAILED after ${failures} consecutive failures`);
return true;
}
console.log(`[Worker] Dispensary ${dispensaryId} failure recorded (${failures}/${MAX_CONSECUTIVE_FAILURES})`);
return false;
}
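// Illustrative three-strike flow (hypothetical dispensary id 42, starting at 0 consecutive_failures):
//
//   await recordCrawlFailure(42, 'timeout');   // -> false (1/3)
//   await recordCrawlFailure(42, 'timeout');   // -> false (2/3)
//   await recordCrawlFailure(42, 'timeout');   // -> true  (flagged: failed_at set,
//                                              //    menu_type/platform_dispensary_id cleared)
//   // Any recordCrawlSuccess() in between resets consecutive_failures back to 0.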
/**
* Process a product crawl job for a single dispensary
*/
async function processProductCrawlJob(job) {
if (!job.dispensaryId) {
throw new Error('Product crawl job requires dispensary_id');
}
// Get dispensary details
const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [job.dispensaryId]);
if (rows.length === 0) {
throw new Error(`Dispensary ${job.dispensaryId} not found`);
}
const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]);
// Check if dispensary is already flagged as failed
if (rows[0].failed_at) {
console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already flagged as failed`);
await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
return;
}
if (!dispensary.platformDispensaryId) {
// Record failure and potentially flag
await recordCrawlFailure(job.dispensaryId, 'Missing platform_dispensary_id');
throw new Error(`Dispensary ${job.dispensaryId} has no platform_dispensary_id`);
}
// Get crawl options from job metadata
const pricingType = job.metadata?.pricingType || 'rec';
const useBothModes = job.metadata?.useBothModes !== false;
try {
// Crawl the dispensary
const result = await (0, product_crawler_1.crawlDispensaryProducts)(dispensary, pricingType, {
useBothModes,
onProgress: async (progress) => {
// Update progress for live monitoring
await (0, job_queue_1.updateJobProgress)(job.id, {
productsFound: progress.productsFound,
productsUpserted: progress.productsUpserted,
snapshotsCreated: progress.snapshotsCreated,
currentPage: progress.currentPage,
totalPages: progress.totalPages,
});
},
});
if (result.success) {
// Success! Reset failure counter
await recordCrawlSuccess(job.dispensaryId);
await (0, job_queue_1.completeJob)(job.id, {
productsFound: result.productsFetched,
productsUpserted: result.productsUpserted,
snapshotsCreated: result.snapshotsCreated,
});
}
else {
// Crawl returned failure - record it
const wasFlagged = await recordCrawlFailure(job.dispensaryId, result.errorMessage || 'Crawl failed');
if (wasFlagged) {
// Don't throw - the dispensary is now flagged, job is "complete"
await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
}
else {
throw new Error(result.errorMessage || 'Crawl failed');
}
}
}
catch (error) {
// Record the failure
const wasFlagged = await recordCrawlFailure(job.dispensaryId, error.message);
if (wasFlagged) {
// Dispensary is now flagged - complete the job rather than fail it
await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
}
else {
throw error;
}
}
}
/**
* Process a menu detection job (bulk)
*/
async function processMenuDetectionJob(job) {
const { executeMenuDetectionJob } = await Promise.resolve().then(() => __importStar(require('./menu-detection')));
const config = job.metadata || {};
const result = await executeMenuDetectionJob(config);
if (result.status === 'error') {
throw new Error(result.errorMessage || 'Menu detection failed');
}
await (0, job_queue_1.completeJob)(job.id, {
productsFound: result.itemsProcessed,
productsUpserted: result.itemsSucceeded,
});
}
/**
* Process a single dispensary menu detection job
* This is the parallelizable version - each worker can detect one dispensary at a time
*/
async function processSingleDetectionJob(job) {
if (!job.dispensaryId) {
throw new Error('Single detection job requires dispensary_id');
}
const { detectAndResolveDispensary } = await Promise.resolve().then(() => __importStar(require('./menu-detection')));
// Get dispensary details
const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [job.dispensaryId]);
if (rows.length === 0) {
throw new Error(`Dispensary ${job.dispensaryId} not found`);
}
const dispensary = rows[0];
// Skip if already detected or failed
if (dispensary.failed_at) {
console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already flagged as failed`);
await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
return;
}
if (dispensary.menu_type && dispensary.menu_type !== 'unknown') {
console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already detected as ${dispensary.menu_type}`);
await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 1 });
return;
}
console.log(`[Worker] Detecting menu for dispensary ${job.dispensaryId} (${dispensary.name})...`);
try {
const result = await detectAndResolveDispensary(job.dispensaryId);
if (result.success) {
console.log(`[Worker] Dispensary ${job.dispensaryId}: detected ${result.detectedProvider}, platformId=${result.platformDispensaryId || 'none'}`);
await (0, job_queue_1.completeJob)(job.id, {
productsFound: 1,
productsUpserted: result.platformDispensaryId ? 1 : 0,
});
}
else {
// Detection failed - record failure
await recordCrawlFailure(job.dispensaryId, result.error || 'Detection failed');
throw new Error(result.error || 'Detection failed');
}
}
catch (error) {
// Record the failure
const wasFlagged = await recordCrawlFailure(job.dispensaryId, error.message);
if (wasFlagged) {
// Dispensary is now flagged - complete the job rather than fail it
await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
}
else {
throw error;
}
}
}
// ============================================================
// SHUTDOWN HANDLING
// ============================================================
function setupShutdownHandlers() {
const shutdown = async (signal) => {
if (shutdownPromise)
return shutdownPromise;
console.log(`\n[Worker] Received ${signal}, shutting down...`);
shutdownPromise = stopWorker();
await shutdownPromise;
process.exit(0);
};
process.on('SIGTERM', () => shutdown('SIGTERM'));
process.on('SIGINT', () => shutdown('SIGINT'));
}
// ============================================================
// STANDALONE WORKER ENTRY POINT
// ============================================================
if (require.main === module) {
// Run as standalone worker
startWorker().catch((error) => {
console.error('[Worker] Fatal error:', error);
process.exit(1);
});
}

backend/dist/dutchie-az/types/index.js

@@ -0,0 +1,96 @@
"use strict";
/**
* Dutchie AZ Data Types
*
* Complete TypeScript interfaces for the isolated Dutchie Arizona data pipeline.
* These types map directly to Dutchie's GraphQL FilteredProducts response.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.getOptionQuantity = getOptionQuantity;
exports.deriveOptionStockStatus = deriveOptionStockStatus;
exports.deriveStockStatus = deriveStockStatus;
exports.calculateTotalQuantity = calculateTotalQuantity;
exports.calculateTotalKioskQuantity = calculateTotalKioskQuantity;
/**
* Get available quantity for a single option
* Priority: quantityAvailable > kioskQuantityAvailable > quantity
*/
function getOptionQuantity(child) {
if (typeof child.quantityAvailable === 'number')
return child.quantityAvailable;
if (typeof child.kioskQuantityAvailable === 'number')
return child.kioskQuantityAvailable;
if (typeof child.quantity === 'number')
return child.quantity;
return null; // No quantity data available
}
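// Illustrative examples of the priority order (quantityAvailable wins even when it is 0):
//
//   getOptionQuantity({ quantityAvailable: 0, kioskQuantityAvailable: 5 })  // -> 0
//   getOptionQuantity({ kioskQuantityAvailable: 5 })                        // -> 5
//   getOptionQuantity({ quantity: 3 })                                      // -> 3
//   getOptionQuantity({})                                                   // -> null (no data)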
/**
* Derive stock status for a single option
* Returns: 'in_stock' if qty > 0, 'out_of_stock' if qty === 0, 'unknown' if no data
*/
function deriveOptionStockStatus(child) {
const qty = getOptionQuantity(child);
if (qty === null)
return 'unknown';
return qty > 0 ? 'in_stock' : 'out_of_stock';
}
/**
* Derive product-level stock status from POSMetaData.children
*
* Logic per spec:
* - If ANY child is "in_stock" → product is "in_stock"
* - Else if ALL children are "out_of_stock" → product is "out_of_stock"
* - Else → product is "unknown"
*
* IMPORTANT: Threshold flags (isBelowThreshold, etc.) do NOT override stock status.
* They only indicate "low stock" - if qty > 0, status stays "in_stock".
*/
function deriveStockStatus(product) {
const children = product.POSMetaData?.children;
// No children data - unknown
if (!children || children.length === 0) {
return 'unknown';
}
// Get stock status for each option
const optionStatuses = children.map(deriveOptionStockStatus);
// If ANY option is in_stock → product is in_stock
if (optionStatuses.some(status => status === 'in_stock')) {
return 'in_stock';
}
// If ALL options are out_of_stock → product is out_of_stock
if (optionStatuses.every(status => status === 'out_of_stock')) {
return 'out_of_stock';
}
// Otherwise (mix of out_of_stock and unknown) → unknown
return 'unknown';
}
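// Illustrative examples (hypothetical POSMetaData shapes):
//
//   deriveStockStatus({ POSMetaData: { children: [{ quantityAvailable: 0 }, { quantityAvailable: 3 }] } })
//     // -> 'in_stock'     (at least one option has qty > 0)
//   deriveStockStatus({ POSMetaData: { children: [{ quantityAvailable: 0 }, { quantityAvailable: 0 }] } })
//     // -> 'out_of_stock' (every option is known-empty)
//   deriveStockStatus({ POSMetaData: { children: [{ quantityAvailable: 0 }, {}] } })
//     // -> 'unknown'      (mix of out_of_stock and no data)
//   deriveStockStatus({})
//     // -> 'unknown'      (no children at all)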
/**
* Calculate total quantity available across all options
* Returns null if no children data (unknown inventory), 0 if children exist but all have 0 qty
*/
function calculateTotalQuantity(product) {
const children = product.POSMetaData?.children;
// No children = unknown inventory, return null (NOT 0)
if (!children || children.length === 0)
return null;
// Check if any child has quantity data
const hasAnyQtyData = children.some(child => getOptionQuantity(child) !== null);
if (!hasAnyQtyData)
return null; // All children lack qty data = unknown
return children.reduce((sum, child) => {
const qty = getOptionQuantity(child);
return sum + (qty ?? 0);
}, 0);
}
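// Illustrative examples of the null-vs-0 distinction:
//
//   calculateTotalQuantity({ POSMetaData: { children: [{ quantityAvailable: 2 }, { quantityAvailable: 0 }] } })  // -> 2
//   calculateTotalQuantity({ POSMetaData: { children: [{ quantityAvailable: 0 }] } })                            // -> 0 (known, just empty)
//   calculateTotalQuantity({ POSMetaData: { children: [{}, {}] } })                                              // -> null (no qty data)
//   calculateTotalQuantity({})                                                                                   // -> null (no children)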
/**
* Calculate total kiosk quantity available across all options
*/
function calculateTotalKioskQuantity(product) {
const children = product.POSMetaData?.children;
if (!children || children.length === 0)
return null;
const hasAnyKioskQty = children.some(child => typeof child.kioskQuantityAvailable === 'number');
if (!hasAnyKioskQty)
return null;
return children.reduce((sum, child) => sum + (child.kioskQuantityAvailable ?? 0), 0);
}

backend/dist/index.js

@@ -7,18 +7,39 @@ const express_1 = __importDefault(require("express"));
const cors_1 = __importDefault(require("cors"));
const dotenv_1 = __importDefault(require("dotenv"));
const minio_1 = require("./utils/minio");
const image_storage_1 = require("./utils/image-storage");
const logger_1 = require("./services/logger");
const proxyTestQueue_1 = require("./services/proxyTestQueue");
dotenv_1.default.config();
const app = (0, express_1.default)();
const PORT = process.env.PORT || 3010;
app.use((0, cors_1.default)());
app.use(express_1.default.json());
// Serve static images when MinIO is not configured
const LOCAL_IMAGES_PATH = process.env.LOCAL_IMAGES_PATH || '/app/public/images';
app.use('/images', express_1.default.static(LOCAL_IMAGES_PATH));
// Serve static downloads (plugin files, etc.)
const LOCAL_DOWNLOADS_PATH = process.env.LOCAL_DOWNLOADS_PATH || '/app/public/downloads';
app.use('/downloads', express_1.default.static(LOCAL_DOWNLOADS_PATH));
app.get('/health', (req, res) => {
res.json({ status: 'ok', timestamp: new Date().toISOString() });
});
// Endpoint to check server's outbound IP (for proxy whitelist setup)
app.get('/outbound-ip', async (req, res) => {
try {
const axios = require('axios');
const response = await axios.get('https://api.ipify.org?format=json', { timeout: 10000 });
res.json({ outbound_ip: response.data.ip });
}
catch (error) {
res.status(500).json({ error: error.message });
}
});
const auth_1 = __importDefault(require("./routes/auth"));
const dashboard_1 = __importDefault(require("./routes/dashboard"));
const stores_1 = __importDefault(require("./routes/stores"));
const dispensaries_1 = __importDefault(require("./routes/dispensaries"));
const changes_1 = __importDefault(require("./routes/changes"));
const categories_1 = __importDefault(require("./routes/categories"));
const products_1 = __importDefault(require("./routes/products"));
const campaigns_1 = __importDefault(require("./routes/campaigns"));
@@ -27,9 +48,27 @@ const settings_1 = __importDefault(require("./routes/settings"));
const proxies_1 = __importDefault(require("./routes/proxies"));
const logs_1 = __importDefault(require("./routes/logs"));
const scraper_monitor_1 = __importDefault(require("./routes/scraper-monitor"));
const api_tokens_1 = __importDefault(require("./routes/api-tokens"));
const api_permissions_1 = __importDefault(require("./routes/api-permissions"));
const parallel_scrape_1 = __importDefault(require("./routes/parallel-scrape"));
const schedule_1 = __importDefault(require("./routes/schedule"));
const crawler_sandbox_1 = __importDefault(require("./routes/crawler-sandbox"));
const version_1 = __importDefault(require("./routes/version"));
const public_api_1 = __importDefault(require("./routes/public-api"));
const dutchie_az_1 = require("./dutchie-az");
const apiTokenTracker_1 = require("./middleware/apiTokenTracker");
const crawl_scheduler_1 = require("./services/crawl-scheduler");
const wordpressPermissions_1 = require("./middleware/wordpressPermissions");
// Apply WordPress permissions validation first (sets req.apiToken)
app.use(wordpressPermissions_1.validateWordPressPermissions);
// Apply API tracking middleware globally
app.use(apiTokenTracker_1.trackApiUsage);
app.use(apiTokenTracker_1.checkRateLimit);
app.use('/api/auth', auth_1.default);
app.use('/api/dashboard', dashboard_1.default);
app.use('/api/stores', stores_1.default);
app.use('/api/dispensaries', dispensaries_1.default);
app.use('/api/changes', changes_1.default);
app.use('/api/categories', categories_1.default);
app.use('/api/products', products_1.default);
app.use('/api/campaigns', campaigns_1.default);
@@ -38,11 +77,34 @@ app.use('/api/settings', settings_1.default);
app.use('/api/proxies', proxies_1.default);
app.use('/api/logs', logs_1.default);
app.use('/api/scraper-monitor', scraper_monitor_1.default);
app.use('/api/api-tokens', api_tokens_1.default);
app.use('/api/api-permissions', api_permissions_1.default);
app.use('/api/parallel-scrape', parallel_scrape_1.default);
app.use('/api/schedule', schedule_1.default);
app.use('/api/crawler-sandbox', crawler_sandbox_1.default);
app.use('/api/version', version_1.default);
// Vendor-agnostic AZ data pipeline routes (new public surface)
app.use('/api/az', dutchie_az_1.dutchieAZRouter);
// Legacy alias (kept temporarily for backward compatibility)
app.use('/api/dutchie-az', dutchie_az_1.dutchieAZRouter);
// Public API v1 - External consumer endpoints (WordPress, etc.)
// Uses dutchie_az data pipeline with per-dispensary API key auth
app.use('/api/v1', public_api_1.default);
async function startServer() {
try {
logger_1.logger.info('system', 'Starting server...');
await (0, minio_1.initializeMinio)();
logger_1.logger.info('system', 'Minio initialized');
await (0, image_storage_1.initializeImageStorage)();
logger_1.logger.info('system', (0, minio_1.isMinioEnabled)() ? 'MinIO storage initialized' : 'Local filesystem storage initialized');
// Clean up any orphaned proxy test jobs from previous server runs
await (0, proxyTestQueue_1.cleanupOrphanedJobs)();
// Start the crawl scheduler (checks every minute for jobs to run)
(0, crawl_scheduler_1.startCrawlScheduler)();
logger_1.logger.info('system', 'Crawl scheduler started');
// Start the Dutchie AZ scheduler (enqueues jobs for workers)
await (0, dutchie_az_1.initializeDefaultSchedules)();
(0, dutchie_az_1.startScheduler)();
logger_1.logger.info('system', 'Dutchie AZ scheduler started');
app.listen(PORT, () => {
logger_1.logger.info('system', `Server running on port ${PORT}`);
console.log(`🚀 Server running on port ${PORT}`);


@@ -0,0 +1,94 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.trackApiUsage = trackApiUsage;
exports.checkRateLimit = checkRateLimit;
const migrate_1 = require("../db/migrate");
async function trackApiUsage(req, res, next) {
// Only track if authenticated via API token
if (!req.apiToken) {
return next();
}
const startTime = Date.now();
req.startTime = startTime;
// Get request size
const requestSize = req.headers['content-length']
? parseInt(req.headers['content-length'])
: 0;
// Capture original res.json to measure response
const originalJson = res.json.bind(res);
let responseSize = 0;
res.json = function (body) {
responseSize = JSON.stringify(body).length;
return originalJson(body);
};
// Track after response is sent
res.on('finish', async () => {
const responseTime = Date.now() - startTime;
try {
await migrate_1.pool.query(`
INSERT INTO api_token_usage (
token_id,
endpoint,
method,
status_code,
response_time_ms,
request_size,
response_size,
ip_address,
user_agent
)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
`, [
req.apiToken.id,
req.path,
req.method,
res.statusCode,
responseTime,
requestSize,
responseSize,
req.ip,
req.headers['user-agent'] || null
]);
// Update last_used_at
await migrate_1.pool.query('UPDATE api_tokens SET last_used_at = CURRENT_TIMESTAMP WHERE id = $1', [req.apiToken.id]);
}
catch (error) {
console.error('Error tracking API usage:', error);
}
});
next();
}
// Rate limiting check
async function checkRateLimit(req, res, next) {
if (!req.apiToken) {
return next();
}
const { id, rate_limit } = req.apiToken;
try {
// Count requests in the last minute
const result = await migrate_1.pool.query(`
SELECT COUNT(*) as request_count
FROM api_token_usage
WHERE token_id = $1
AND created_at > NOW() - INTERVAL '1 minute'
`, [id]);
const requestCount = parseInt(result.rows[0].request_count);
if (requestCount >= rate_limit) {
return res.status(429).json({
error: 'Rate limit exceeded',
limit: rate_limit,
current: requestCount,
retry_after: 60
});
}
// Add rate limit headers
res.setHeader('X-RateLimit-Limit', rate_limit.toString());
res.setHeader('X-RateLimit-Remaining', (rate_limit - requestCount).toString());
res.setHeader('X-RateLimit-Reset', new Date(Date.now() + 60000).toISOString());
next();
}
catch (error) {
console.error('Error checking rate limit:', error);
next();
}
}
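// Illustrative behaviour (assuming a token with rate_limit = 100): while fewer than 100
// requests are recorded in api_token_usage for the last minute, the request passes through
// with X-RateLimit-Limit / X-RateLimit-Remaining / X-RateLimit-Reset headers set. Once 100
// or more are recorded, the request is rejected with
//   429 { error: 'Rate limit exceeded', limit: 100, current: <count>, retry_after: 60 }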


@@ -0,0 +1,163 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.validateWordPressPermissions = validateWordPressPermissions;
const migrate_1 = require("../db/migrate");
const ipaddr_js_1 = __importDefault(require("ipaddr.js"));
/**
* Validates if an IP address matches any of the allowed IP patterns
 * Supports exact IP addresses and CIDR notation (wildcards are not supported)
*/
function isIpAllowed(clientIp, allowedIps) {
try {
const clientAddr = ipaddr_js_1.default.process(clientIp);
for (const allowedIp of allowedIps) {
const trimmed = allowedIp.trim();
if (!trimmed)
continue;
// Check for CIDR notation
if (trimmed.includes('/')) {
try {
const range = ipaddr_js_1.default.parseCIDR(trimmed);
if (clientAddr.match(range)) {
return true;
}
}
catch (e) {
console.warn(`Invalid CIDR notation: ${trimmed}`);
continue;
}
}
else {
// Exact match
try {
const allowedAddr = ipaddr_js_1.default.process(trimmed);
if (clientAddr.toString() === allowedAddr.toString()) {
return true;
}
}
catch (e) {
console.warn(`Invalid IP address: ${trimmed}`);
continue;
}
}
}
return false;
}
catch (error) {
console.error('Error processing client IP:', error);
return false;
}
}
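// Illustrative examples (documentation-range IPs, not real deployments):
//
//   isIpAllowed('203.0.113.7', ['203.0.113.0/24'])            // -> true  (CIDR match)
//   isIpAllowed('198.51.100.4', ['198.51.100.4'])             // -> true  (exact match)
//   isIpAllowed('192.0.2.1', ['203.0.113.0/24', '10.0.0.1'])  // -> false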
/**
* Validates if a domain matches any of the allowed domain patterns
* Supports wildcard subdomains (*.example.com)
*/
function isDomainAllowed(origin, allowedDomains) {
try {
// Extract domain from origin URL
const url = new URL(origin);
const domain = url.hostname;
for (const allowedDomain of allowedDomains) {
const trimmed = allowedDomain.trim();
if (!trimmed)
continue;
// Wildcard subdomain support
if (trimmed.startsWith('*.')) {
const baseDomain = trimmed.substring(2);
if (domain === baseDomain || domain.endsWith('.' + baseDomain)) {
return true;
}
}
else {
// Exact match
if (domain === trimmed) {
return true;
}
}
}
return false;
}
catch (error) {
console.error('Error processing domain:', error);
return false;
}
}
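// Illustrative examples:
//
//   isDomainAllowed('https://shop.example.com', ['*.example.com'])    // -> true  (wildcard subdomain)
//   isDomainAllowed('https://example.com', ['*.example.com'])         // -> true  (base domain also matches)
//   isDomainAllowed('https://example.com.evil.io', ['*.example.com']) // -> false
//   isDomainAllowed('not-a-url', ['example.com'])                     // -> false (URL parse fails)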
/**
* WordPress API Permissions Middleware
* Validates API access based on WordPress permissions table
*/
async function validateWordPressPermissions(req, res, next) {
// Get API key from header
const apiKey = req.headers['x-api-key'];
// If no API key provided, skip WordPress validation
if (!apiKey) {
return next();
}
try {
// Query WordPress permissions table
const result = await migrate_1.pool.query(`
SELECT id, user_name, api_key, allowed_ips, allowed_domains, is_active
FROM wp_dutchie_api_permissions
WHERE api_key = $1 AND is_active = 1
`, [apiKey]);
if (result.rows.length === 0) {
return res.status(401).json({
error: 'Invalid API key'
});
}
const permission = result.rows[0];
// Get client IP
const clientIp = req.headers['x-forwarded-for']?.split(',')[0].trim() ||
req.headers['x-real-ip'] ||
req.ip ||
req.connection.remoteAddress ||
'';
// Validate IP if configured
if (permission.allowed_ips) {
const allowedIps = permission.allowed_ips.split('\n').filter((ip) => ip.trim());
if (allowedIps.length > 0 && !isIpAllowed(clientIp, allowedIps)) {
return res.status(403).json({
error: 'IP address not allowed',
client_ip: clientIp
});
}
}
// Validate domain if configured
const origin = req.get('origin') || req.get('referer') || '';
if (permission.allowed_domains && origin) {
const allowedDomains = permission.allowed_domains.split('\n').filter((d) => d.trim());
if (allowedDomains.length > 0 && !isDomainAllowed(origin, allowedDomains)) {
return res.status(403).json({
error: 'Domain not allowed',
origin: origin
});
}
}
// Update last_used_at timestamp (async, don't wait)
migrate_1.pool.query(`
UPDATE wp_dutchie_api_permissions
SET last_used_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [permission.id]).catch((err) => {
console.error('Error updating last_used_at:', err);
});
// Set apiToken on request for tracking middleware
// Default rate limit of 100 requests/minute for WordPress permissions
req.apiToken = {
id: permission.id,
name: permission.user_name,
rate_limit: 100
};
next();
}
catch (error) {
console.error('WordPress permissions validation error:', error);
return res.status(500).json({
error: 'Internal server error during API validation'
});
}
}


@@ -0,0 +1,32 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
(async () => {
try {
console.log('🔄 Running image sizes migration...');
// Add thumbnail and medium paths
await migrate_1.pool.query(`
ALTER TABLE products
ADD COLUMN IF NOT EXISTS thumbnail_path TEXT,
ADD COLUMN IF NOT EXISTS medium_path TEXT
`);
console.log('✅ Added thumbnail_path and medium_path columns');
        // Rename local_image_path to full_path (one-time rename; re-running after it has applied will throw here)
await migrate_1.pool.query(`
ALTER TABLE products
RENAME COLUMN local_image_path TO full_path
`);
console.log('✅ Renamed local_image_path to full_path');
// Add index
await migrate_1.pool.query(`
CREATE INDEX IF NOT EXISTS idx_products_images ON products(full_path, thumbnail_path, medium_path)
`);
console.log('✅ Created image index');
console.log('✅ Migration complete!');
process.exit(0);
}
catch (error) {
console.error('❌ Migration failed:', error);
process.exit(1);
}
})();

backend/dist/routes/api-permissions.js

@@ -0,0 +1,174 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const crypto_1 = __importDefault(require("crypto"));
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Generate secure random API key (64-character hex)
function generateApiKey() {
return crypto_1.default.randomBytes(32).toString('hex');
}
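// Illustrative output shape: crypto.randomBytes(32).toString('hex') yields a
// 64-character lowercase hex string (256 bits of entropy), e.g. 'f3a9...'.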
// Get all API permissions
router.get('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const result = await migrate_1.pool.query(`
SELECT *
FROM wp_dutchie_api_permissions
ORDER BY created_at DESC
`);
res.json({ permissions: result.rows });
}
catch (error) {
console.error('Error fetching API permissions:', error);
res.status(500).json({ error: 'Failed to fetch API permissions' });
}
});
// Get all dispensaries for dropdown (must be before /:id to avoid route conflict)
router.get('/dispensaries', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const result = await migrate_1.pool.query(`
SELECT id, name
FROM dispensaries
ORDER BY name
`);
res.json({ dispensaries: result.rows });
}
catch (error) {
console.error('Error fetching dispensaries:', error);
res.status(500).json({ error: 'Failed to fetch dispensaries' });
}
});
// Get single API permission
router.get('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query(`
SELECT *
FROM wp_dutchie_api_permissions
WHERE id = $1
`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Permission not found' });
}
res.json({ permission: result.rows[0] });
}
catch (error) {
console.error('Error fetching API permission:', error);
res.status(500).json({ error: 'Failed to fetch API permission' });
}
});
// Create new API permission
router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
// Support both store_id (existing) and dispensary_id (for compatibility)
const { user_name, allowed_ips, allowed_domains, store_id, dispensary_id } = req.body;
const storeIdToUse = store_id || dispensary_id;
if (!user_name) {
return res.status(400).json({ error: 'User name is required' });
}
if (!storeIdToUse) {
return res.status(400).json({ error: 'Store/Dispensary is required' });
}
// Get dispensary name for display
const dispensaryResult = await migrate_1.pool.query('SELECT name FROM dispensaries WHERE id = $1', [storeIdToUse]);
if (dispensaryResult.rows.length === 0) {
return res.status(400).json({ error: 'Invalid store/dispensary ID' });
}
const storeName = dispensaryResult.rows[0].name;
const apiKey = generateApiKey();
const result = await migrate_1.pool.query(`
INSERT INTO wp_dutchie_api_permissions (
user_name,
api_key,
allowed_ips,
allowed_domains,
is_active,
store_id,
store_name
)
VALUES ($1, $2, $3, $4, 1, $5, $6)
RETURNING *
`, [
user_name,
apiKey,
allowed_ips || null,
allowed_domains || null,
storeIdToUse,
storeName
]);
res.status(201).json({
permission: result.rows[0],
message: 'API permission created successfully. Save the API key securely - it cannot be retrieved later.'
});
}
catch (error) {
console.error('Error creating API permission:', error);
res.status(500).json({ error: 'Failed to create API permission' });
}
});
// Update API permission
router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const { user_name, allowed_ips, allowed_domains, is_active } = req.body;
const result = await migrate_1.pool.query(`
UPDATE wp_dutchie_api_permissions
SET
user_name = COALESCE($1, user_name),
allowed_ips = COALESCE($2, allowed_ips),
allowed_domains = COALESCE($3, allowed_domains),
is_active = COALESCE($4, is_active)
WHERE id = $5
RETURNING *
`, [user_name, allowed_ips, allowed_domains, is_active, id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Permission not found' });
}
res.json({ permission: result.rows[0] });
}
catch (error) {
console.error('Error updating API permission:', error);
res.status(500).json({ error: 'Failed to update API permission' });
}
});
// Toggle permission active status
router.patch('/:id/toggle', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query(`
UPDATE wp_dutchie_api_permissions
SET is_active = NOT is_active
WHERE id = $1
RETURNING *
`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Permission not found' });
}
res.json({ permission: result.rows[0] });
}
catch (error) {
console.error('Error toggling API permission:', error);
res.status(500).json({ error: 'Failed to toggle API permission' });
}
});
// Delete API permission
router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query('DELETE FROM wp_dutchie_api_permissions WHERE id = $1 RETURNING *', [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Permission not found' });
}
res.json({ message: 'API permission deleted successfully' });
}
catch (error) {
console.error('Error deleting API permission:', error);
res.status(500).json({ error: 'Failed to delete API permission' });
}
});
exports.default = router;

backend/dist/routes/api-tokens.js

@@ -0,0 +1,265 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const crypto_1 = __importDefault(require("crypto"));
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Generate secure random token
function generateToken() {
return crypto_1.default.randomBytes(32).toString('hex');
}
// Get all API tokens
router.get('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const result = await migrate_1.pool.query(`
SELECT
t.*,
u.email as created_by_email,
(
SELECT COUNT(*)
FROM api_token_usage
WHERE token_id = t.id
AND created_at > NOW() - INTERVAL '24 hours'
) as requests_24h,
(
SELECT COUNT(*)
FROM api_token_usage
WHERE token_id = t.id
AND created_at > NOW() - INTERVAL '7 days'
) as requests_7d,
(
SELECT COUNT(*)
FROM api_token_usage
WHERE token_id = t.id
) as total_requests
FROM api_tokens t
LEFT JOIN users u ON t.user_id = u.id
ORDER BY t.created_at DESC
`);
res.json({ tokens: result.rows });
}
catch (error) {
console.error('Error fetching API tokens:', error);
res.status(500).json({ error: 'Failed to fetch API tokens' });
}
});
// Get single API token
router.get('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query(`
SELECT
t.*,
u.email as created_by_email
FROM api_tokens t
LEFT JOIN users u ON t.user_id = u.id
WHERE t.id = $1
`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Token not found' });
}
res.json({ token: result.rows[0] });
}
catch (error) {
console.error('Error fetching API token:', error);
res.status(500).json({ error: 'Failed to fetch API token' });
}
});
// Create new API token
router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { name, description, rate_limit, allowed_endpoints, expires_at } = req.body;
const userId = req.user.userId;
if (!name) {
return res.status(400).json({ error: 'Name is required' });
}
const token = generateToken();
const result = await migrate_1.pool.query(`
INSERT INTO api_tokens (
name,
token,
description,
user_id,
rate_limit,
allowed_endpoints,
expires_at
)
VALUES ($1, $2, $3, $4, $5, $6, $7)
RETURNING *
`, [
name,
token,
description || null,
userId,
rate_limit || 100,
allowed_endpoints || null,
expires_at || null
]);
res.status(201).json({
token: result.rows[0],
message: 'API token created successfully. Save this token securely - it cannot be retrieved later.'
});
}
catch (error) {
console.error('Error creating API token:', error);
res.status(500).json({ error: 'Failed to create API token' });
}
});
// Update API token
router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const { name, description, active, rate_limit, allowed_endpoints, expires_at } = req.body;
const result = await migrate_1.pool.query(`
UPDATE api_tokens
SET
name = COALESCE($1, name),
description = COALESCE($2, description),
active = COALESCE($3, active),
rate_limit = COALESCE($4, rate_limit),
allowed_endpoints = COALESCE($5, allowed_endpoints),
expires_at = COALESCE($6, expires_at)
WHERE id = $7
RETURNING *
`, [name, description, active, rate_limit, allowed_endpoints, expires_at, id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Token not found' });
}
res.json({ token: result.rows[0] });
}
catch (error) {
console.error('Error updating API token:', error);
res.status(500).json({ error: 'Failed to update API token' });
}
});
// Delete API token
router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query('DELETE FROM api_tokens WHERE id = $1 RETURNING *', [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Token not found' });
}
res.json({ message: 'API token deleted successfully' });
}
catch (error) {
console.error('Error deleting API token:', error);
res.status(500).json({ error: 'Failed to delete API token' });
}
});
// Get token usage statistics
router.get('/:id/usage', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const { days = 7 } = req.query;
// Get hourly usage for the past N days
const hourlyUsage = await migrate_1.pool.query(`
SELECT
DATE_TRUNC('hour', created_at) as hour,
COUNT(*) as requests,
AVG(response_time_ms) as avg_response_time,
SUM(CASE WHEN status_code >= 200 AND status_code < 300 THEN 1 ELSE 0 END) as successful_requests,
SUM(CASE WHEN status_code >= 400 THEN 1 ELSE 0 END) as failed_requests
FROM api_token_usage
WHERE token_id = $1
AND created_at > NOW() - INTERVAL '${parseInt(days)} days'
GROUP BY hour
ORDER BY hour DESC
`, [id]);
// Get endpoint usage
const endpointUsage = await migrate_1.pool.query(`
SELECT
endpoint,
method,
COUNT(*) as requests,
AVG(response_time_ms) as avg_response_time
FROM api_token_usage
WHERE token_id = $1
AND created_at > NOW() - INTERVAL '${parseInt(days)} days'
GROUP BY endpoint, method
ORDER BY requests DESC
LIMIT 20
`, [id]);
// Get recent requests
const recentRequests = await migrate_1.pool.query(`
SELECT
endpoint,
method,
status_code,
response_time_ms,
ip_address,
created_at
FROM api_token_usage
WHERE token_id = $1
ORDER BY created_at DESC
LIMIT 100
`, [id]);
res.json({
hourly_usage: hourlyUsage.rows,
endpoint_usage: endpointUsage.rows,
recent_requests: recentRequests.rows
});
}
catch (error) {
console.error('Error fetching token usage:', error);
res.status(500).json({ error: 'Failed to fetch token usage' });
}
});
// Get overall API usage statistics
router.get('/stats/overview', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { days = 7 } = req.query;
const stats = await migrate_1.pool.query(`
SELECT
COUNT(DISTINCT token_id) as active_tokens,
COUNT(*) as total_requests,
AVG(response_time_ms) as avg_response_time,
SUM(CASE WHEN status_code >= 200 AND status_code < 300 THEN 1 ELSE 0 END) as successful_requests,
SUM(CASE WHEN status_code >= 400 THEN 1 ELSE 0 END) as failed_requests
FROM api_token_usage
WHERE created_at > NOW() - INTERVAL '${parseInt(days)} days'
`);
// Top tokens by usage
const topTokens = await migrate_1.pool.query(`
SELECT
t.id,
t.name,
COUNT(u.id) as requests,
AVG(u.response_time_ms) as avg_response_time
FROM api_tokens t
LEFT JOIN api_token_usage u ON t.id = u.token_id
WHERE u.created_at > NOW() - INTERVAL '${parseInt(days)} days'
GROUP BY t.id, t.name
ORDER BY requests DESC
LIMIT 10
`);
// Most used endpoints
const topEndpoints = await migrate_1.pool.query(`
SELECT
endpoint,
method,
COUNT(*) as requests,
AVG(response_time_ms) as avg_response_time
FROM api_token_usage
WHERE created_at > NOW() - INTERVAL '${parseInt(days)} days'
GROUP BY endpoint, method
ORDER BY requests DESC
LIMIT 10
`);
res.json({
overview: stats.rows[0],
top_tokens: topTokens.rows,
top_endpoints: topEndpoints.rows
});
}
catch (error) {
console.error('Error fetching API stats:', error);
res.status(500).json({ error: 'Failed to fetch API stats' });
}
});
exports.default = router;
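
For reference, a minimal sketch of how a client is expected to use these endpoints. The base URL and the /api/api-tokens and /api/products mount paths are assumptions for illustration (the mount points are not part of this file); the response shapes follow the handlers above.

// Sketch (Node 18+, global fetch). adminJwt is an existing admin session token.
const BASE = 'http://localhost:3000'; // assumed host

async function createAndUseApiToken(adminJwt) {
    // 1. Create a token (superadmin/admin only). The raw token value is in the
    //    returned row's `token` column; the handler warns to save it securely.
    const createRes = await fetch(`${BASE}/api/api-tokens`, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${adminJwt}`,
        },
        body: JSON.stringify({
            name: 'reporting-client',
            rate_limit: 100,
            allowed_endpoints: null, // or an array of endpoint patterns to restrict the token
            expires_at: null,
        }),
    });
    const { token } = await createRes.json();

    // 2. Use the token in the same Authorization header slot as a JWT.
    const res = await fetch(`${BASE}/api/products?limit=5`, {
        headers: { Authorization: `Bearer ${token.token}` },
    });
    return res.json();
}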

View File

@@ -58,11 +58,11 @@ router.get('/tree', async (req, res) => {
const categoryMap = new Map();
const tree = [];
// First pass: create map
categories.forEach(cat => {
categories.forEach((cat) => {
categoryMap.set(cat.id, { ...cat, children: [] });
});
// Second pass: build tree
categories.forEach(cat => {
categories.forEach((cat) => {
const node = categoryMap.get(cat.id);
if (cat.parent_id) {
const parent = categoryMap.get(cat.parent_id);

152
backend/dist/routes/changes.js vendored Normal file
View File

@@ -0,0 +1,152 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get all changes with optional status filter
router.get('/', async (req, res) => {
try {
const { status } = req.query;
let query = `
SELECT
dc.id,
dc.dispensary_id,
dc.field_name,
dc.old_value,
dc.new_value,
dc.source,
dc.confidence_score,
dc.change_notes,
dc.status,
dc.requires_recrawl,
dc.created_at,
dc.reviewed_at,
dc.reviewed_by,
dc.rejection_reason,
d.name as dispensary_name,
d.slug as dispensary_slug,
d.city,
d.state
FROM dispensary_changes dc
JOIN dispensaries d ON dc.dispensary_id = d.id
`;
const params = [];
if (status) {
query += ` WHERE dc.status = $1`;
params.push(status);
}
query += ` ORDER BY dc.created_at DESC`;
const result = await migrate_1.pool.query(query, params);
res.json({ changes: result.rows });
}
catch (error) {
console.error('Error fetching changes:', error);
res.status(500).json({ error: 'Failed to fetch changes' });
}
});
// Get changes statistics (for alert banner)
router.get('/stats', async (req, res) => {
try {
const result = await migrate_1.pool.query(`
SELECT
COUNT(*) FILTER (WHERE status = 'pending') as pending_count,
COUNT(*) FILTER (WHERE status = 'pending' AND requires_recrawl = TRUE) as pending_recrawl_count,
COUNT(*) FILTER (WHERE status = 'approved') as approved_count,
COUNT(*) FILTER (WHERE status = 'rejected') as rejected_count
FROM dispensary_changes
`);
res.json(result.rows[0]);
}
catch (error) {
console.error('Error fetching change stats:', error);
res.status(500).json({ error: 'Failed to fetch change stats' });
}
});
// Approve a change and apply it to the dispensary
router.post('/:id/approve', async (req, res) => {
const client = await migrate_1.pool.connect();
try {
await client.query('BEGIN');
const { id } = req.params;
const userId = req.user?.id; // From auth middleware
// Get the change record
const changeResult = await client.query(`
SELECT * FROM dispensary_changes WHERE id = $1 AND status = 'pending'
`, [id]);
if (changeResult.rows.length === 0) {
await client.query('ROLLBACK');
return res.status(404).json({ error: 'Pending change not found' });
}
const change = changeResult.rows[0];
// Apply the change to the dispensary table
const updateQuery = `
UPDATE dispensaries
SET ${change.field_name} = $1, updated_at = CURRENT_TIMESTAMP
WHERE id = $2
RETURNING *
`;
const dispensaryResult = await client.query(updateQuery, [
change.new_value,
change.dispensary_id
]);
if (dispensaryResult.rows.length === 0) {
await client.query('ROLLBACK');
return res.status(404).json({ error: 'Dispensary not found' });
}
// Mark the change as approved
await client.query(`
UPDATE dispensary_changes
SET
status = 'approved',
reviewed_at = CURRENT_TIMESTAMP,
reviewed_by = $1
WHERE id = $2
`, [userId, id]);
await client.query('COMMIT');
res.json({
message: 'Change approved and applied',
dispensary: dispensaryResult.rows[0],
requires_recrawl: change.requires_recrawl
});
}
catch (error) {
await client.query('ROLLBACK');
console.error('Error approving change:', error);
res.status(500).json({ error: 'Failed to approve change' });
}
finally {
client.release();
}
});
// Reject a change with optional reason
router.post('/:id/reject', async (req, res) => {
try {
const { id } = req.params;
const { reason } = req.body;
const userId = req.user?.id; // From auth middleware
const result = await migrate_1.pool.query(`
UPDATE dispensary_changes
SET
status = 'rejected',
reviewed_at = CURRENT_TIMESTAMP,
reviewed_by = $1,
rejection_reason = $2
WHERE id = $3 AND status = 'pending'
RETURNING *
`, [userId, reason, id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Pending change not found' });
}
res.json({
message: 'Change rejected',
change: result.rows[0]
});
}
catch (error) {
console.error('Error rejecting change:', error);
res.status(500).json({ error: 'Failed to reject change' });
}
});
exports.default = router;
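
A small sketch of the review flow these routes support, from a client's perspective. The /api/changes mount path, base URL, and JWT are assumptions for illustration; the response fields (requires_recrawl, change, dispensary) follow the handlers above.

// Sketch only (Node 18+, global fetch).
async function reviewChange(base, jwt, changeId, approve, reason) {
    const action = approve ? 'approve' : 'reject';
    const res = await fetch(`${base}/api/changes/${changeId}/${action}`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${jwt}` },
        body: JSON.stringify(approve ? {} : { reason }),
    });
    if (!res.ok) throw new Error(`Review failed with status ${res.status}`);
    const body = await res.json();
    // Approvals are applied to the dispensary inside a transaction; a truthy
    // requires_recrawl flag signals that the approved field affects crawling.
    if (approve && body.requires_recrawl) {
        console.log(`Change ${changeId} approved; schedule a re-crawl for the dispensary`);
    }
    return body;
}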

497
backend/dist/routes/crawler-sandbox.js vendored Normal file
View File

@@ -0,0 +1,497 @@
"use strict";
/**
* Crawler Sandbox API Routes
*
* Endpoints for managing sandbox crawls, templates, and provider detection
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = __importDefault(require("express"));
const migrate_1 = require("../db/migrate");
const middleware_1 = require("../auth/middleware");
const logger_1 = require("../services/logger");
const crawler_jobs_1 = require("../services/crawler-jobs");
const router = express_1.default.Router();
// Apply auth middleware to all routes
router.use(middleware_1.authMiddleware);
// ========================================
// Sandbox Entries
// ========================================
/**
* GET /api/crawler-sandbox
* List sandbox entries with optional filters
*/
router.get('/', async (req, res) => {
try {
const { status, dispensaryId, limit = 50, offset = 0 } = req.query;
let query = `
SELECT cs.*, d.name as dispensary_name, d.website, d.menu_provider, d.crawler_status
FROM crawler_sandboxes cs
JOIN dispensaries d ON d.id = cs.dispensary_id
WHERE 1=1
`;
const params = [];
let paramIndex = 1;
if (status) {
query += ` AND cs.status = $${paramIndex}`;
params.push(status);
paramIndex++;
}
if (dispensaryId) {
query += ` AND cs.dispensary_id = $${paramIndex}`;
params.push(Number(dispensaryId));
paramIndex++;
}
query += ` ORDER BY cs.created_at DESC LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`;
params.push(Number(limit), Number(offset));
const result = await migrate_1.pool.query(query, params);
// Get total count
const countResult = await migrate_1.pool.query(`SELECT COUNT(*) FROM crawler_sandboxes cs WHERE 1=1
${status ? 'AND cs.status = $1' : ''}
${dispensaryId ? `AND cs.dispensary_id = $${status ? 2 : 1}` : ''}`, status && dispensaryId ? [status, dispensaryId] : status ? [status] : dispensaryId ? [dispensaryId] : []);
res.json({
sandboxes: result.rows,
total: parseInt(countResult.rows[0].count),
limit: Number(limit),
offset: Number(offset),
});
}
catch (error) {
logger_1.logger.error('api', `Get sandboxes error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* GET /api/crawler-sandbox/:id
* Get a single sandbox entry with full details
*/
router.get('/:id', async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query(`SELECT cs.*, d.name as dispensary_name, d.website, d.menu_url,
d.menu_provider, d.menu_provider_confidence, d.crawler_mode, d.crawler_status
FROM crawler_sandboxes cs
JOIN dispensaries d ON d.id = cs.dispensary_id
WHERE cs.id = $1`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Sandbox entry not found' });
}
// Get related jobs
const jobs = await migrate_1.pool.query(`SELECT * FROM sandbox_crawl_jobs
WHERE sandbox_id = $1 OR dispensary_id = $2
ORDER BY created_at DESC
LIMIT 10`, [id, result.rows[0].dispensary_id]);
res.json({
sandbox: result.rows[0],
jobs: jobs.rows,
});
}
catch (error) {
logger_1.logger.error('api', `Get sandbox error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/crawler-sandbox/:id/analyze
* Trigger re-analysis of a sandbox entry
*/
router.post('/:id/analyze', (0, middleware_1.requireRole)('admin'), async (req, res) => {
try {
const { id } = req.params;
const sandbox = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [id]);
if (sandbox.rows.length === 0) {
return res.status(404).json({ error: 'Sandbox entry not found' });
}
// Queue a new sandbox job
const job = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
VALUES ($1, $2, 'deep_crawl', 'pending', 20)
RETURNING id`, [sandbox.rows[0].dispensary_id, id]);
// Update sandbox status
await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'pending', updated_at = NOW() WHERE id = $1`, [id]);
res.json({
message: 'Analysis job queued',
jobId: job.rows[0].id,
});
}
catch (error) {
logger_1.logger.error('api', `Analyze sandbox error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/crawler-sandbox/:id/move-to-production
* Move a sandbox entry to production (for Dutchie dispensaries)
*/
router.post('/:id/move-to-production', (0, middleware_1.requireRole)('admin'), async (req, res) => {
try {
const { id } = req.params;
const sandbox = await migrate_1.pool.query(`SELECT cs.*, d.menu_provider
FROM crawler_sandboxes cs
JOIN dispensaries d ON d.id = cs.dispensary_id
WHERE cs.id = $1`, [id]);
if (sandbox.rows.length === 0) {
return res.status(404).json({ error: 'Sandbox entry not found' });
}
// Can only move to production if provider is dutchie
if (sandbox.rows[0].menu_provider !== 'dutchie') {
return res.status(400).json({
error: 'Only Dutchie dispensaries can be moved to production currently',
});
}
// Update dispensary to production mode
await migrate_1.pool.query(`UPDATE dispensaries
SET crawler_mode = 'production', crawler_status = 'idle', updated_at = NOW()
WHERE id = $1`, [sandbox.rows[0].dispensary_id]);
// Mark sandbox as moved
await migrate_1.pool.query(`UPDATE crawler_sandboxes
SET status = 'moved_to_production', updated_at = NOW()
WHERE id = $1`, [id]);
res.json({ message: 'Dispensary moved to production' });
}
catch (error) {
logger_1.logger.error('api', `Move to production error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* PATCH /api/crawler-sandbox/:id
* Update sandbox entry (e.g., add human review notes)
*/
router.patch('/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
try {
const { id } = req.params;
const { human_review_notes, status, suspected_menu_provider } = req.body;
const updates = [];
const params = [];
let paramIndex = 1;
if (human_review_notes !== undefined) {
updates.push(`human_review_notes = $${paramIndex}`);
params.push(human_review_notes);
paramIndex++;
}
if (status) {
updates.push(`status = $${paramIndex}`);
params.push(status);
paramIndex++;
}
if (suspected_menu_provider !== undefined) {
updates.push(`suspected_menu_provider = $${paramIndex}`);
params.push(suspected_menu_provider);
paramIndex++;
}
if (updates.length === 0) {
return res.status(400).json({ error: 'No updates provided' });
}
updates.push('updated_at = NOW()');
if (human_review_notes !== undefined) {
updates.push('reviewed_at = NOW()');
}
params.push(id);
await migrate_1.pool.query(`UPDATE crawler_sandboxes SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params);
res.json({ message: 'Sandbox updated' });
}
catch (error) {
logger_1.logger.error('api', `Update sandbox error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
// ========================================
// Templates
// ========================================
/**
* GET /api/crawler-sandbox/templates
* List all crawler templates
*/
router.get('/templates/list', async (req, res) => {
try {
const result = await migrate_1.pool.query(`SELECT * FROM crawler_templates ORDER BY provider, is_default_for_provider DESC, name`);
res.json({ templates: result.rows });
}
catch (error) {
logger_1.logger.error('api', `Get templates error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* GET /api/crawler-sandbox/templates/:id
* Get a single template
*/
router.get('/templates/:id', async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Template not found' });
}
res.json({ template: result.rows[0] });
}
catch (error) {
logger_1.logger.error('api', `Get template error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/crawler-sandbox/templates
* Create a new template
*/
router.post('/templates', (0, middleware_1.requireRole)('admin'), async (req, res) => {
try {
const { provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, } = req.body;
if (!provider || !name) {
return res.status(400).json({ error: 'provider and name are required' });
}
const result = await migrate_1.pool.query(`INSERT INTO crawler_templates
(provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, created_by)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
RETURNING *`, [
provider,
name,
JSON.stringify(selector_config || {}),
JSON.stringify(navigation_config || {}),
JSON.stringify(transform_config || {}),
JSON.stringify(validation_rules || {}),
notes,
req.user?.email || 'system',
]);
res.status(201).json({ template: result.rows[0] });
}
catch (error) {
logger_1.logger.error('api', `Create template error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* PUT /api/crawler-sandbox/templates/:id
* Update a template
*/
router.put('/templates/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
try {
const { id } = req.params;
const { is_active, is_default_for_provider, selector_config, navigation_config, transform_config, validation_rules, notes, } = req.body;
const updates = [];
const params = [];
let paramIndex = 1;
if (is_active !== undefined) {
updates.push(`is_active = $${paramIndex}`);
params.push(is_active);
paramIndex++;
}
if (is_default_for_provider !== undefined) {
updates.push(`is_default_for_provider = $${paramIndex}`);
params.push(is_default_for_provider);
paramIndex++;
}
if (selector_config !== undefined) {
updates.push(`selector_config = $${paramIndex}`);
params.push(JSON.stringify(selector_config));
paramIndex++;
}
if (navigation_config !== undefined) {
updates.push(`navigation_config = $${paramIndex}`);
params.push(JSON.stringify(navigation_config));
paramIndex++;
}
if (transform_config !== undefined) {
updates.push(`transform_config = $${paramIndex}`);
params.push(JSON.stringify(transform_config));
paramIndex++;
}
if (validation_rules !== undefined) {
updates.push(`validation_rules = $${paramIndex}`);
params.push(JSON.stringify(validation_rules));
paramIndex++;
}
if (notes !== undefined) {
updates.push(`notes = $${paramIndex}`);
params.push(notes);
paramIndex++;
}
if (updates.length === 0) {
return res.status(400).json({ error: 'No updates provided' });
}
updates.push('updated_at = NOW()');
params.push(id);
await migrate_1.pool.query(`UPDATE crawler_templates SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params);
const result = await migrate_1.pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]);
res.json({ template: result.rows[0] });
}
catch (error) {
logger_1.logger.error('api', `Update template error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
// ========================================
// Jobs
// ========================================
/**
* GET /api/crawler-sandbox/jobs
* List sandbox crawl jobs
*/
router.get('/jobs/list', async (req, res) => {
try {
const { status, dispensaryId, limit = 50 } = req.query;
let query = `
SELECT sj.*, d.name as dispensary_name
FROM sandbox_crawl_jobs sj
JOIN dispensaries d ON d.id = sj.dispensary_id
WHERE 1=1
`;
const params = [];
let paramIndex = 1;
if (status) {
query += ` AND sj.status = $${paramIndex}`;
params.push(status);
paramIndex++;
}
if (dispensaryId) {
query += ` AND sj.dispensary_id = $${paramIndex}`;
params.push(Number(dispensaryId));
paramIndex++;
}
query += ` ORDER BY sj.created_at DESC LIMIT $${paramIndex}`;
params.push(Number(limit));
const result = await migrate_1.pool.query(query, params);
res.json({ jobs: result.rows });
}
catch (error) {
logger_1.logger.error('api', `Get jobs error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/crawler-sandbox/jobs/detect/:dispensaryId
* Trigger provider detection for a dispensary
*/
router.post('/jobs/detect/:dispensaryId', (0, middleware_1.requireRole)('admin'), async (req, res) => {
try {
const { dispensaryId } = req.params;
// Create detection job
const job = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
VALUES ($1, 'detection', 'pending', 30)
RETURNING id`, [dispensaryId]);
// Update dispensary status
await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`, [dispensaryId]);
res.json({
message: 'Detection job queued',
jobId: job.rows[0].id,
});
}
catch (error) {
logger_1.logger.error('api', `Queue detection error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
/**
* POST /api/crawler-sandbox/jobs/run/:id
* Immediately run a sandbox job
*/
router.post('/jobs/run/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
try {
const { id } = req.params;
const job = await migrate_1.pool.query('SELECT * FROM sandbox_crawl_jobs WHERE id = $1', [id]);
if (job.rows.length === 0) {
return res.status(404).json({ error: 'Job not found' });
}
const jobData = job.rows[0];
// Run the job immediately
let result;
if (jobData.job_type === 'detection') {
result = await (0, crawler_jobs_1.runDetectMenuProviderJob)(jobData.dispensary_id);
}
else {
result = await (0, crawler_jobs_1.runSandboxCrawlJob)(jobData.dispensary_id, jobData.sandbox_id);
}
// Update job status
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
WHERE id = $4`, [
result.success ? 'completed' : 'failed',
JSON.stringify(result.data || {}),
result.success ? null : result.message,
id,
]);
res.json(result);
}
catch (error) {
logger_1.logger.error('api', `Run job error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
// ========================================
// Stats
// ========================================
/**
* GET /api/crawler-sandbox/stats
* Get sandbox/crawler statistics
*/
router.get('/stats/overview', async (req, res) => {
try {
// Dispensary provider stats
const providerStats = await migrate_1.pool.query(`
SELECT
menu_provider,
COUNT(*) as count,
AVG(menu_provider_confidence)::integer as avg_confidence
FROM dispensaries
WHERE menu_provider IS NOT NULL
GROUP BY menu_provider
ORDER BY count DESC
`);
// Mode stats
const modeStats = await migrate_1.pool.query(`
SELECT
crawler_mode,
COUNT(*) as count
FROM dispensaries
GROUP BY crawler_mode
`);
// Status stats
const statusStats = await migrate_1.pool.query(`
SELECT
crawler_status,
COUNT(*) as count
FROM dispensaries
GROUP BY crawler_status
ORDER BY count DESC
`);
// Sandbox stats
const sandboxStats = await migrate_1.pool.query(`
SELECT
status,
COUNT(*) as count
FROM crawler_sandboxes
GROUP BY status
`);
// Job stats
const jobStats = await migrate_1.pool.query(`
SELECT
status,
job_type,
COUNT(*) as count
FROM sandbox_crawl_jobs
GROUP BY status, job_type
`);
// Recent activity
const recentActivity = await migrate_1.pool.query(`
SELECT 'sandbox' as type, id, dispensary_id, status, created_at
FROM crawler_sandboxes
ORDER BY created_at DESC
LIMIT 5
`);
res.json({
providers: providerStats.rows,
modes: modeStats.rows,
statuses: statusStats.rows,
sandbox: sandboxStats.rows,
jobs: jobStats.rows,
recentActivity: recentActivity.rows,
});
}
catch (error) {
logger_1.logger.error('api', `Get stats error: ${error.message}`);
res.status(500).json({ error: error.message });
}
});
exports.default = router;
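
The PATCH /:id and PUT /templates/:id handlers above share one pattern: build a SET clause only from the fields that were actually supplied, binding each to a numbered parameter and appending the row id last. A condensed, hypothetical helper capturing the same pattern (column names must come from trusted code, never from user input, since they are interpolated into the SQL):

// Hypothetical helper illustrating the dynamic-update pattern used above.
function buildUpdate(table, id, fields) {
    const updates = [];
    const params = [];
    let paramIndex = 1;
    for (const [column, value] of Object.entries(fields)) {
        if (value === undefined) continue; // skip fields the caller did not supply
        updates.push(`${column} = $${paramIndex}`);
        params.push(value);
        paramIndex++;
    }
    if (updates.length === 0) return null; // nothing to update
    updates.push('updated_at = NOW()');
    params.push(id);
    return {
        text: `UPDATE ${table} SET ${updates.join(', ')} WHERE id = $${paramIndex}`,
        values: params,
    };
}
// buildUpdate('crawler_sandboxes', 42, { status: 'reviewed' })
//   => { text: 'UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2',
//        values: ['reviewed', 42] }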

View File

@@ -2,63 +2,70 @@
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const connection_1 = require("../dutchie-az/db/connection");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get dashboard stats
// Get dashboard stats - uses consolidated dutchie-az DB
router.get('/stats', async (req, res) => {
try {
// Store stats
const storesResult = await migrate_1.pool.query(`
SELECT
// Store stats from dispensaries table in consolidated DB
const dispensariesResult = await (0, connection_1.query)(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE active = true) as active,
MIN(last_scraped_at) as oldest_scrape,
MAX(last_scraped_at) as latest_scrape
FROM stores
COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != 'unknown') as active,
COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
COUNT(*) FILTER (WHERE menu_url IS NOT NULL) as with_menu_url,
MIN(last_crawled_at) as oldest_crawl,
MAX(last_crawled_at) as latest_crawl
FROM dispensaries
`);
// Product stats
const productsResult = await migrate_1.pool.query(`
SELECT
// Product stats from dutchie_products table
const productsResult = await (0, connection_1.query)(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE in_stock = true) as in_stock,
COUNT(*) FILTER (WHERE local_image_path IS NOT NULL) as with_images
FROM products
COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock,
COUNT(*) FILTER (WHERE primary_image_url IS NOT NULL) as with_images,
COUNT(DISTINCT brand_name) FILTER (WHERE brand_name IS NOT NULL AND brand_name != '') as unique_brands,
COUNT(DISTINCT dispensary_id) as dispensaries_with_products
FROM dutchie_products
`);
// Campaign stats
const campaignsResult = await migrate_1.pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE active = true) as active
FROM campaigns
`);
// Recent clicks (last 24 hours)
const clicksResult = await migrate_1.pool.query(`
SELECT COUNT(*) as clicks_24h
FROM clicks
WHERE clicked_at >= NOW() - INTERVAL '24 hours'
// Brand stats from dutchie_products
const brandResult = await (0, connection_1.query)(`
SELECT COUNT(DISTINCT brand_name) as total
FROM dutchie_products
WHERE brand_name IS NOT NULL AND brand_name != ''
`);
// Recent products added (last 24 hours)
const recentProductsResult = await migrate_1.pool.query(`
const recentProductsResult = await (0, connection_1.query)(`
SELECT COUNT(*) as new_products_24h
FROM products
WHERE first_seen_at >= NOW() - INTERVAL '24 hours'
`);
// Proxy stats
const proxiesResult = await migrate_1.pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE active = true) as active,
COUNT(*) FILTER (WHERE is_anonymous = true) as anonymous
FROM proxies
FROM dutchie_products
WHERE created_at >= NOW() - INTERVAL '24 hours'
`);
// Combine results
const storeStats = dispensariesResult.rows[0];
const productStats = productsResult.rows[0];
res.json({
stores: storesResult.rows[0],
products: productsResult.rows[0],
campaigns: campaignsResult.rows[0],
clicks: clicksResult.rows[0],
recent: recentProductsResult.rows[0],
proxies: proxiesResult.rows[0]
stores: {
total: parseInt(storeStats.total) || 0,
active: parseInt(storeStats.active) || 0,
with_menu_url: parseInt(storeStats.with_menu_url) || 0,
with_platform_id: parseInt(storeStats.with_platform_id) || 0,
oldest_crawl: storeStats.oldest_crawl,
latest_crawl: storeStats.latest_crawl
},
products: {
total: parseInt(productStats.total) || 0,
in_stock: parseInt(productStats.in_stock) || 0,
with_images: parseInt(productStats.with_images) || 0,
unique_brands: parseInt(productStats.unique_brands) || 0,
dispensaries_with_products: parseInt(productStats.dispensaries_with_products) || 0
},
brands: {
total: parseInt(brandResult.rows[0].total) || 0
},
campaigns: { total: 0, active: 0 }, // Legacy - no longer used
clicks: { clicks_24h: 0 }, // Legacy - no longer used
recent: recentProductsResult.rows[0]
});
}
catch (error) {
@@ -66,27 +73,34 @@ router.get('/stats', async (req, res) => {
res.status(500).json({ error: 'Failed to fetch dashboard stats' });
}
});
// Get recent activity
// Get recent activity - from consolidated dutchie-az DB
router.get('/activity', async (req, res) => {
try {
const { limit = 20 } = req.query;
// Recent scrapes
const scrapesResult = await migrate_1.pool.query(`
SELECT s.name, s.last_scraped_at,
COUNT(p.id) as product_count
FROM stores s
LEFT JOIN products p ON s.id = p.store_id AND p.last_seen_at = s.last_scraped_at
WHERE s.last_scraped_at IS NOT NULL
GROUP BY s.id, s.name, s.last_scraped_at
ORDER BY s.last_scraped_at DESC
        // Recent crawls from dispensaries (product counts read from the dispensaries.product_count column)
const scrapesResult = await (0, connection_1.query)(`
SELECT
d.name,
d.last_crawled_at as last_scraped_at,
d.product_count
FROM dispensaries d
WHERE d.last_crawled_at IS NOT NULL
ORDER BY d.last_crawled_at DESC
LIMIT $1
`, [limit]);
// Recent products
const productsResult = await migrate_1.pool.query(`
SELECT p.name, p.price, s.name as store_name, p.first_seen_at
FROM products p
JOIN stores s ON p.store_id = s.id
ORDER BY p.first_seen_at DESC
// Recent products from dutchie_products
const productsResult = await (0, connection_1.query)(`
SELECT
p.name,
0 as price,
p.brand_name as brand,
p.thc as thc_percentage,
p.cbd as cbd_percentage,
d.name as store_name,
p.created_at as first_seen_at
FROM dutchie_products p
JOIN dispensaries d ON p.dispensary_id = d.id
ORDER BY p.created_at DESC
LIMIT $1
`, [limit]);
res.json({

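The stats queries above rely on PostgreSQL's COUNT(*) FILTER (WHERE ...) aggregate to gather several conditional counts in a single pass, and on parseInt() because node-postgres returns COUNT() values as strings. A standalone sketch of that pattern, assuming the same query helper used above:

// Sketch: one round trip returns the total plus conditional counts.
async function productStockStats(query) {
    const { rows } = await query(`
        SELECT
            COUNT(*) as total,
            COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock,
            COUNT(*) FILTER (WHERE primary_image_url IS NOT NULL) as with_images
        FROM dutchie_products
    `);
    const r = rows[0];
    return {
        total: parseInt(r.total) || 0,
        in_stock: parseInt(r.in_stock) || 0,
        with_images: parseInt(r.with_images) || 0,
    };
}
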
437
backend/dist/routes/dispensaries.js vendored Normal file
View File

@@ -0,0 +1,437 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Valid menu_type values
const VALID_MENU_TYPES = ['dutchie', 'treez', 'jane', 'weedmaps', 'leafly', 'meadow', 'blaze', 'flowhub', 'dispense', 'cova', 'other', 'unknown'];
// Get all dispensaries
router.get('/', async (req, res) => {
try {
const { menu_type } = req.query;
let query = `
SELECT
id,
azdhs_id,
name,
company_name,
slug,
address,
city,
state,
zip,
phone,
email,
website,
dba_name,
google_rating,
google_review_count,
status_line,
azdhs_url,
latitude,
longitude,
menu_url,
menu_type,
menu_provider,
menu_provider_confidence,
scraper_template,
last_menu_scrape,
menu_scrape_status,
platform_dispensary_id,
created_at,
updated_at
FROM dispensaries
`;
const params = [];
// Filter by menu_type if provided
if (menu_type) {
query += ` WHERE menu_type = $1`;
params.push(menu_type);
}
query += ` ORDER BY name`;
const result = await migrate_1.pool.query(query, params);
res.json({ dispensaries: result.rows });
}
catch (error) {
console.error('Error fetching dispensaries:', error);
res.status(500).json({ error: 'Failed to fetch dispensaries' });
}
});
// Get menu type stats
router.get('/stats/menu-types', async (req, res) => {
try {
const result = await migrate_1.pool.query(`
SELECT menu_type, COUNT(*) as count
FROM dispensaries
GROUP BY menu_type
ORDER BY count DESC
`);
res.json({ menu_types: result.rows, valid_types: VALID_MENU_TYPES });
}
catch (error) {
console.error('Error fetching menu type stats:', error);
res.status(500).json({ error: 'Failed to fetch menu type stats' });
}
});
// Get single dispensary by slug
router.get('/:slug', async (req, res) => {
try {
const { slug } = req.params;
const result = await migrate_1.pool.query(`
SELECT
id,
azdhs_id,
name,
company_name,
slug,
address,
city,
state,
zip,
phone,
email,
website,
dba_name,
google_rating,
google_review_count,
status_line,
azdhs_url,
latitude,
longitude,
menu_url,
menu_type,
menu_provider,
menu_provider_confidence,
scraper_template,
scraper_config,
last_menu_scrape,
menu_scrape_status,
platform_dispensary_id,
created_at,
updated_at
FROM dispensaries
WHERE slug = $1
`, [slug]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
res.json(result.rows[0]);
}
catch (error) {
console.error('Error fetching dispensary:', error);
res.status(500).json({ error: 'Failed to fetch dispensary' });
}
});
// Update dispensary
router.put('/:id', async (req, res) => {
try {
const { id } = req.params;
const { dba_name, website, phone, email, google_rating, google_review_count, menu_url, menu_type, scraper_template, scraper_config, menu_scrape_status } = req.body;
// Validate menu_type if provided
if (menu_type !== undefined && menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
return res.status(400).json({
error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')}`
});
}
const result = await migrate_1.pool.query(`
UPDATE dispensaries
SET
dba_name = COALESCE($1, dba_name),
website = COALESCE($2, website),
phone = COALESCE($3, phone),
email = COALESCE($4, email),
google_rating = COALESCE($5, google_rating),
google_review_count = COALESCE($6, google_review_count),
menu_url = COALESCE($7, menu_url),
menu_type = COALESCE($8, menu_type),
scraper_template = COALESCE($9, scraper_template),
scraper_config = COALESCE($10, scraper_config),
menu_scrape_status = COALESCE($11, menu_scrape_status),
updated_at = CURRENT_TIMESTAMP
WHERE id = $12
RETURNING *
`, [
dba_name,
website,
phone,
email,
google_rating,
google_review_count,
menu_url,
menu_type,
scraper_template,
scraper_config,
menu_scrape_status,
id
]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
res.json(result.rows[0]);
}
catch (error) {
console.error('Error updating dispensary:', error);
res.status(500).json({ error: 'Failed to update dispensary' });
}
});
// Get products for a dispensary by slug
router.get('/:slug/products', async (req, res) => {
try {
const { slug } = req.params;
const { category } = req.query;
// First get the dispensary ID from slug
const dispensaryResult = await migrate_1.pool.query(`
SELECT id FROM dispensaries WHERE slug = $1
`, [slug]);
if (dispensaryResult.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
const dispensaryId = dispensaryResult.rows[0].id;
// Build query for products
let query = `
SELECT
p.id,
p.name,
p.brand,
p.variant,
p.slug,
p.description,
p.regular_price,
p.sale_price,
p.thc_percentage,
p.cbd_percentage,
p.strain_type,
p.terpenes,
p.effects,
p.flavors,
p.image_url,
p.dutchie_url,
p.in_stock,
p.created_at,
p.updated_at
FROM products p
WHERE p.dispensary_id = $1
`;
const params = [dispensaryId];
if (category) {
query += ` AND p.category = $2`;
params.push(category);
}
query += ` ORDER BY p.created_at DESC`;
const result = await migrate_1.pool.query(query, params);
res.json({ products: result.rows });
}
catch (error) {
console.error('Error fetching dispensary products:', error);
res.status(500).json({ error: 'Failed to fetch products' });
}
});
// Get unique brands for a dispensary by slug
router.get('/:slug/brands', async (req, res) => {
try {
const { slug } = req.params;
const { search } = req.query;
// First get the dispensary ID from slug
const dispensaryResult = await migrate_1.pool.query(`
SELECT id FROM dispensaries WHERE slug = $1
`, [slug]);
if (dispensaryResult.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
const dispensaryId = dispensaryResult.rows[0].id;
// Build query with optional search filter
let query = `
SELECT DISTINCT
brand,
COUNT(*) as product_count
FROM products
WHERE dispensary_id = $1 AND brand IS NOT NULL
`;
const params = [dispensaryId];
// Add search filter if provided
if (search) {
query += ` AND brand ILIKE $2`;
params.push(`%${search}%`);
}
query += ` GROUP BY brand ORDER BY product_count DESC, brand ASC`;
const result = await migrate_1.pool.query(query, params);
res.json({ brands: result.rows });
}
catch (error) {
console.error('Error fetching dispensary brands:', error);
res.status(500).json({ error: 'Failed to fetch brands' });
}
});
// Get products with discounts/specials for a dispensary by slug
router.get('/:slug/specials', async (req, res) => {
try {
const { slug } = req.params;
const { search } = req.query;
// First get the dispensary ID from slug
const dispensaryResult = await migrate_1.pool.query(`
SELECT id FROM dispensaries WHERE slug = $1
`, [slug]);
if (dispensaryResult.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
const dispensaryId = dispensaryResult.rows[0].id;
// Build query to get products with discounts
let query = `
SELECT
p.id,
p.name,
p.brand,
p.variant,
p.slug,
p.description,
p.regular_price,
p.sale_price,
p.discount_type,
p.discount_value,
p.thc_percentage,
p.cbd_percentage,
p.strain_type,
p.terpenes,
p.effects,
p.flavors,
p.image_url,
p.dutchie_url,
p.in_stock,
p.created_at,
p.updated_at
FROM products p
WHERE p.dispensary_id = $1
AND p.discount_type IS NOT NULL
AND p.discount_value IS NOT NULL
`;
const params = [dispensaryId];
// Add search filter if provided
if (search) {
query += ` AND (p.name ILIKE $2 OR p.brand ILIKE $2 OR p.description ILIKE $2)`;
params.push(`%${search}%`);
}
query += ` ORDER BY p.created_at DESC`;
const result = await migrate_1.pool.query(query, params);
res.json({ specials: result.rows });
}
catch (error) {
console.error('Error fetching dispensary specials:', error);
res.status(500).json({ error: 'Failed to fetch specials' });
}
});
// Trigger scraping for a dispensary
router.post('/:slug/scrape', async (req, res) => {
try {
const { slug } = req.params;
const { type } = req.body; // 'products' | 'brands' | 'specials' | 'all'
if (!['products', 'brands', 'specials', 'all'].includes(type)) {
return res.status(400).json({ error: 'Invalid type. Must be: products, brands, specials, or all' });
}
// Get the dispensary
const dispensaryResult = await migrate_1.pool.query(`
SELECT id, name, slug, website, menu_url, scraper_template, scraper_config
FROM dispensaries
WHERE slug = $1
`, [slug]);
if (dispensaryResult.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
const dispensary = dispensaryResult.rows[0];
if (!dispensary.menu_url && !dispensary.website) {
return res.status(400).json({ error: 'Dispensary has no menu URL or website configured' });
}
// Update last_menu_scrape time and status
await migrate_1.pool.query(`
UPDATE dispensaries
SET
last_menu_scrape = CURRENT_TIMESTAMP,
menu_scrape_status = 'pending',
updated_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [dispensary.id]);
// Log the scrape request
console.log(`[SCRAPE REQUEST] Dispensary: ${dispensary.name} (${slug}), Type: ${type}`);
console.log(` Menu URL: ${dispensary.menu_url || dispensary.website}`);
console.log(` Template: ${dispensary.scraper_template || 'N/A'}`);
// TODO: Actually trigger the scraper here
// For now, this is a placeholder that updates the status
// You can integrate with your existing scraper infrastructure
res.json({
success: true,
message: `Scraping queued for ${dispensary.name}`,
type,
dispensary: {
id: dispensary.id,
name: dispensary.name,
slug: dispensary.slug
}
});
}
catch (error) {
console.error('Error triggering scrape:', error);
res.status(500).json({ error: 'Failed to trigger scraping' });
}
});
// Update menu_type for a dispensary (dedicated endpoint)
router.patch('/:id/menu-type', async (req, res) => {
try {
const { id } = req.params;
const { menu_type } = req.body;
// Validate menu_type
if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
return res.status(400).json({
error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)`
});
}
const result = await migrate_1.pool.query(`
UPDATE dispensaries
SET menu_type = $1, updated_at = CURRENT_TIMESTAMP
WHERE id = $2
RETURNING id, name, slug, menu_type, menu_provider, menu_url
`, [menu_type || null, id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
res.json({
success: true,
dispensary: result.rows[0]
});
}
catch (error) {
console.error('Error updating menu_type:', error);
res.status(500).json({ error: 'Failed to update menu_type' });
}
});
// Bulk update menu_type for multiple dispensaries
router.post('/bulk/menu-type', async (req, res) => {
try {
const { dispensary_ids, menu_type } = req.body;
if (!Array.isArray(dispensary_ids) || dispensary_ids.length === 0) {
return res.status(400).json({ error: 'dispensary_ids must be a non-empty array' });
}
// Validate menu_type
if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
return res.status(400).json({
error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)`
});
}
const result = await migrate_1.pool.query(`
UPDATE dispensaries
SET menu_type = $1, updated_at = CURRENT_TIMESTAMP
WHERE id = ANY($2::int[])
RETURNING id, name, slug, menu_type
`, [menu_type || null, dispensary_ids]);
res.json({
success: true,
updated_count: result.rowCount,
dispensaries: result.rows
});
}
catch (error) {
console.error('Error bulk updating menu_type:', error);
res.status(500).json({ error: 'Failed to bulk update menu_type' });
}
});
exports.default = router;
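
Two query idioms above deserve a brief note: the PUT handler's COALESCE($n, column) pattern means an omitted or null field keeps its current value (so that endpoint updates but never clears a field), and the bulk handler matches rows with id = ANY($2::int[]). A hedged sketch of the bulk call from a client; the /api/dispensaries mount path and base URL are assumptions:

// Sketch only (Node 18+, global fetch).
async function bulkSetMenuType(base, jwt, dispensaryIds, menuType) {
    const res = await fetch(`${base}/api/dispensaries/bulk/menu-type`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${jwt}` },
        // menu_type must be one of VALID_MENU_TYPES, or null to clear it.
        body: JSON.stringify({ dispensary_ids: dispensaryIds, menu_type: menuType }),
    });
    if (!res.ok) throw new Error(`Bulk update failed with status ${res.status}`);
    const body = await res.json();
    return body.updated_count; // number of rows matched by id = ANY($2::int[])
}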

182
backend/dist/routes/parallel-scrape.js vendored Normal file
View File

@@ -0,0 +1,182 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
const middleware_1 = require("../auth/middleware");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
// In-memory job tracking
const activeJobs = new Map();
// Get job status
router.get('/status/:jobId', (req, res) => {
const job = activeJobs.get(req.params.jobId);
if (!job) {
return res.status(404).json({ error: 'Job not found' });
}
res.json(job);
});
// List active jobs
router.get('/jobs', (req, res) => {
const jobs = Array.from(activeJobs.values());
res.json({ jobs });
});
// Start parallel scrape
router.post('/start', async (req, res) => {
const { storeName = 'Deeply Rooted', workers = 15, useProxies = true } = req.body;
try {
// Find the store
const storeResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${storeName}%`]);
if (storeResult.rows.length === 0) {
return res.status(404).json({ error: `Store not found: ${storeName}` });
}
const store = storeResult.rows[0];
// Get categories
const categoriesResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [store.id]);
if (categoriesResult.rows.length === 0) {
return res.status(404).json({ error: 'No categories found for this store' });
}
const categories = categoriesResult.rows;
// Create job
const jobId = `scrape-${Date.now()}`;
const job = {
id: jobId,
storeName: store.name,
status: 'running',
workers,
startedAt: new Date(),
results: []
};
activeJobs.set(jobId, job);
// Start scraping in background
runParallelScrape(job, store, categories, workers, useProxies).catch(err => {
console.error('Parallel scrape error:', err);
job.status = 'failed';
});
res.json({
message: 'Parallel scrape started',
jobId,
store: store.name,
categories: categories.length,
workers
});
}
catch (error) {
console.error('Failed to start parallel scrape:', error);
res.status(500).json({ error: error.message });
}
});
async function runParallelScrape(job, store, categories, numWorkers, useProxies) {
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// Expand categories for multiple passes
const expandedCategories = [];
const passes = Math.ceil(numWorkers / Math.max(categories.length, 1));
for (let i = 0; i < passes; i++) {
expandedCategories.push(...categories);
}
const categoryIndex = { current: 0 };
const worker = async (workerId) => {
while (categoryIndex.current < expandedCategories.length) {
const idx = categoryIndex.current++;
const category = expandedCategories[idx];
if (!category)
break;
const result = await scrapeCategory(puppeteer, workerId, category, useProxies);
job.results.push({
category: category.name,
success: result.success,
products: result.products,
error: result.error
});
// Delay between requests
await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
}
};
// Start workers with staggered starts
const workers = [];
for (let i = 0; i < numWorkers; i++) {
workers.push(worker(i + 1));
await new Promise(resolve => setTimeout(resolve, 500));
}
await Promise.all(workers);
job.status = 'completed';
job.completedAt = new Date();
// Clean up job after 1 hour
setTimeout(() => activeJobs.delete(job.id), 60 * 60 * 1000);
}
async function scrapeCategory(puppeteer, workerId, category, useProxies) {
let browser = null;
let proxyId = null;
try {
let proxy = null;
if (useProxies) {
proxy = await (0, proxy_1.getActiveProxy)();
}
const args = [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920,1080',
];
if (proxy) {
proxyId = proxy.id;
if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
}
else {
args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
}
}
browser = await puppeteer.launch({
headless: 'new',
args,
executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium',
});
const page = await browser.newPage();
await page.setUserAgent(FIREFOX_USER_AGENT);
await page.setViewport({ width: 1920, height: 1080 });
if (proxy?.username && proxy?.password) {
await page.authenticate({
username: proxy.username,
password: proxy.password,
});
}
console.log(`[Worker ${workerId}] Scraping: ${category.name} (${category.url})`);
const response = await page.goto(category.url, {
waitUntil: 'networkidle2',
timeout: 60000,
});
if (!response || !response.ok()) {
throw new Error(`Failed to load page: ${response?.status()}`);
}
await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
timeout: 30000,
}).catch(() => { });
const products = await page.evaluate(() => {
// Try data-testid first, then fall back to product links
const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
if (listItems.length > 0)
return listItems.length;
return document.querySelectorAll('a[href*="/product/"]').length;
});
console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
await browser.close();
return { success: true, products };
}
catch (error) {
console.error(`[Worker ${workerId}] Error:`, error.message);
if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
(0, proxy_1.putProxyInTimeout)(proxyId, error.message);
}
if (browser) {
await browser.close().catch(() => { });
}
return { success: false, products: 0, error: error.message };
}
}
exports.default = router;
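
runParallelScrape above implements a simple shared-cursor worker pool: the category list is expanded to cover the requested number of passes, each worker repeatedly claims the next index from a shared counter until the list is exhausted, and worker startup is staggered by 500 ms. Because JavaScript is single-threaded, the synchronous read-and-increment of the shared counter cannot race. A stripped-down sketch of the same pattern with a generic task function (illustrative, not the scraper itself):

// Minimal shared-cursor worker pool (illustrative).
async function runPool(items, numWorkers, handler, staggerMs = 500) {
    const cursor = { current: 0 };
    const worker = async (workerId) => {
        while (cursor.current < items.length) {
            const idx = cursor.current++; // synchronous claim; no race between workers
            const item = items[idx];
            if (!item)
                break;
            await handler(item, workerId);
        }
    };
    const workers = [];
    for (let i = 0; i < numWorkers; i++) {
        workers.push(worker(i + 1));
        await new Promise((resolve) => setTimeout(resolve, staggerMs)); // staggered starts
    }
    await Promise.all(workers);
}
// runPool(categories, 4, async (cat, id) => console.log(`worker ${id}: ${cat.name}`));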

View File

@@ -6,10 +6,69 @@ const migrate_1 = require("../db/migrate");
const minio_1 = require("../utils/minio");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get all products with filters
// Freshness threshold: data older than this is considered stale
const STALE_THRESHOLD_HOURS = 4;
function calculateFreshness(lastCrawlAt) {
if (!lastCrawlAt) {
return {
last_crawl_at: null,
is_stale: true,
freshness: 'Never crawled',
hours_since_crawl: null
};
}
const now = new Date();
const diffMs = now.getTime() - lastCrawlAt.getTime();
const diffHours = diffMs / (1000 * 60 * 60);
const isStale = diffHours > STALE_THRESHOLD_HOURS;
let freshnessText;
if (diffHours < 1) {
const mins = Math.round(diffHours * 60);
freshnessText = `Last crawled ${mins} minute${mins !== 1 ? 's' : ''} ago`;
}
else if (diffHours < 24) {
const hrs = Math.round(diffHours);
freshnessText = `Last crawled ${hrs} hour${hrs !== 1 ? 's' : ''} ago`;
}
else {
const days = Math.round(diffHours / 24);
freshnessText = `Last crawled ${days} day${days !== 1 ? 's' : ''} ago`;
}
if (isStale) {
freshnessText += ' (STALE)';
}
return {
last_crawl_at: lastCrawlAt.toISOString(),
is_stale: isStale,
freshness: freshnessText,
hours_since_crawl: Math.round(diffHours * 10) / 10
};
}
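// Example: for a timestamp t exactly 2 hours old, calculateFreshness(t) returns
// { last_crawl_at: t.toISOString(), is_stale: false,
//   freshness: 'Last crawled 2 hours ago', hours_since_crawl: 2 },
// since 2 is below STALE_THRESHOLD_HOURS.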
// Helper function to filter fields from object
function selectFields(obj, fields) {
if (!fields || fields.length === 0)
return obj;
const result = {};
fields.forEach(field => {
if (obj.hasOwnProperty(field)) {
result[field] = obj[field];
}
});
return result;
}
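// Example: selectFields({ id: 1, name: 'OG Kush', price: 30 }, ['id', 'name'])
//   => { id: 1, name: 'OG Kush' }; an empty or missing fields list returns the
//   object unchanged, and unknown field names are silently ignored.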
// Get all products with filters, sorting, and field selection
router.get('/', async (req, res) => {
try {
const { store_id, category_id, in_stock, search, limit = 50, offset = 0 } = req.query;
const { store_id, category_id, in_stock, search, brand, min_price, max_price, min_thc, max_thc, strain_type, sort_by = 'last_seen_at', sort_order = 'desc', limit = 50, offset = 0, fields } = req.query;
// Validate sort field to prevent SQL injection
const allowedSortFields = [
'id', 'name', 'brand', 'price', 'thc_percentage',
'cbd_percentage', 'last_seen_at', 'created_at'
];
const sortField = allowedSortFields.includes(sort_by)
? sort_by
: 'last_seen_at';
const sortDirection = sort_order.toLowerCase() === 'asc' ? 'ASC' : 'DESC';
let query = `
SELECT p.*, s.name as store_name, c.name as category_name
FROM products p
@@ -19,35 +78,81 @@ router.get('/', async (req, res) => {
`;
const params = [];
let paramCount = 1;
// Store filter
if (store_id) {
query += ` AND p.store_id = $${paramCount}`;
params.push(store_id);
paramCount++;
}
// Category filter
if (category_id) {
query += ` AND p.category_id = $${paramCount}`;
params.push(category_id);
paramCount++;
}
// Stock filter
if (in_stock !== undefined) {
query += ` AND p.in_stock = $${paramCount}`;
params.push(in_stock === 'true');
paramCount++;
}
// Search filter
if (search) {
query += ` AND (p.name ILIKE $${paramCount} OR p.brand ILIKE $${paramCount})`;
query += ` AND (p.name ILIKE $${paramCount} OR p.brand ILIKE $${paramCount} OR p.description ILIKE $${paramCount})`;
params.push(`%${search}%`);
paramCount++;
}
query += ` ORDER BY p.last_seen_at DESC LIMIT $${paramCount} OFFSET $${paramCount + 1}`;
// Brand filter
if (brand) {
query += ` AND p.brand ILIKE $${paramCount}`;
params.push(`%${brand}%`);
paramCount++;
}
// Price range filter
if (min_price) {
query += ` AND p.price >= $${paramCount}`;
params.push(parseFloat(min_price));
paramCount++;
}
if (max_price) {
query += ` AND p.price <= $${paramCount}`;
params.push(parseFloat(max_price));
paramCount++;
}
// THC range filter
if (min_thc) {
query += ` AND p.thc_percentage >= $${paramCount}`;
params.push(parseFloat(min_thc));
paramCount++;
}
if (max_thc) {
query += ` AND p.thc_percentage <= $${paramCount}`;
params.push(parseFloat(max_thc));
paramCount++;
}
// Strain type filter
if (strain_type) {
query += ` AND p.strain_type = $${paramCount}`;
params.push(strain_type);
paramCount++;
}
// Sorting
query += ` ORDER BY p.${sortField} ${sortDirection} LIMIT $${paramCount} OFFSET $${paramCount + 1}`;
params.push(limit, offset);
const result = await migrate_1.pool.query(query, params);
// Add image URLs
const products = result.rows.map(p => ({
let products = result.rows.map((p) => ({
...p,
image_url_full: p.local_image_path ? (0, minio_1.getImageUrl)(p.local_image_path) : p.image_url
image_url_full: p.local_image_path ? (0, minio_1.getImageUrl)(p.local_image_path) : p.image_url,
thumbnail_url: p.thumbnail_path ? (0, minio_1.getImageUrl)(p.thumbnail_path) : null,
medium_url: p.medium_path ? (0, minio_1.getImageUrl)(p.medium_path) : null,
}));
// Get total count
// Field selection
if (fields) {
const selectedFields = fields.split(',').map(f => f.trim());
products = products.map((p) => selectFields(p, selectedFields));
}
// Get total count (reuse same filters)
let countQuery = `SELECT COUNT(*) FROM products p WHERE 1=1`;
const countParams = [];
let countParamCount = 1;
@@ -67,16 +172,79 @@ router.get('/', async (req, res) => {
countParamCount++;
}
if (search) {
countQuery += ` AND (p.name ILIKE $${countParamCount} OR p.brand ILIKE $${countParamCount})`;
countQuery += ` AND (p.name ILIKE $${countParamCount} OR p.brand ILIKE $${countParamCount} OR p.description ILIKE $${countParamCount})`;
countParams.push(`%${search}%`);
countParamCount++;
}
if (brand) {
countQuery += ` AND p.brand ILIKE $${countParamCount}`;
countParams.push(`%${brand}%`);
countParamCount++;
}
if (min_price) {
countQuery += ` AND p.price >= $${countParamCount}`;
countParams.push(parseFloat(min_price));
countParamCount++;
}
if (max_price) {
countQuery += ` AND p.price <= $${countParamCount}`;
countParams.push(parseFloat(max_price));
countParamCount++;
}
if (min_thc) {
countQuery += ` AND p.thc_percentage >= $${countParamCount}`;
countParams.push(parseFloat(min_thc));
countParamCount++;
}
if (max_thc) {
countQuery += ` AND p.thc_percentage <= $${countParamCount}`;
countParams.push(parseFloat(max_thc));
countParamCount++;
}
if (strain_type) {
countQuery += ` AND p.strain_type = $${countParamCount}`;
countParams.push(strain_type);
countParamCount++;
}
const countResult = await migrate_1.pool.query(countQuery, countParams);
// Get freshness info if store_id is specified
let freshnessInfo = null;
let storeInfo = null;
if (store_id) {
const storeResult = await migrate_1.pool.query('SELECT id, name, last_scraped_at FROM stores WHERE id = $1', [store_id]);
if (storeResult.rows.length > 0) {
const store = storeResult.rows[0];
storeInfo = { id: store.id, name: store.name };
freshnessInfo = calculateFreshness(store.last_scraped_at);
}
}
res.json({
products,
total: parseInt(countResult.rows[0].count),
limit: parseInt(limit),
offset: parseInt(offset)
offset: parseInt(offset),
// Add freshness metadata when store_id is provided
...(freshnessInfo && {
store: storeInfo,
last_crawl_at: freshnessInfo.last_crawl_at,
is_stale: freshnessInfo.is_stale,
freshness: freshnessInfo.freshness,
hours_since_crawl: freshnessInfo.hours_since_crawl
}),
filters: {
store_id,
category_id,
in_stock,
search,
brand,
min_price,
max_price,
min_thc,
max_thc,
strain_type,
sort_by: sortField,
sort_order: sortDirection
}
});
}
catch (error) {
@@ -84,10 +252,11 @@ router.get('/', async (req, res) => {
res.status(500).json({ error: 'Failed to fetch products' });
}
});
// Get single product
// Get single product with optional field selection
router.get('/:id', async (req, res) => {
try {
const { id } = req.params;
const { fields } = req.query;
const result = await migrate_1.pool.query(`
SELECT p.*, s.name as store_name, c.name as category_name
FROM products p
@@ -98,10 +267,17 @@ router.get('/:id', async (req, res) => {
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Product not found' });
}
const product = result.rows[0];
let product = result.rows[0];
product.image_url_full = product.local_image_path
? (0, minio_1.getImageUrl)(product.local_image_path)
: product.image_url;
product.thumbnail_url = product.thumbnail_path ? (0, minio_1.getImageUrl)(product.thumbnail_path) : null;
product.medium_url = product.medium_path ? (0, minio_1.getImageUrl)(product.medium_path) : null;
// Field selection
if (fields) {
const selectedFields = fields.split(',').map(f => f.trim());
product = selectFields(product, selectedFields);
}
res.json({ product });
}
catch (error) {
@@ -109,4 +285,57 @@ router.get('/:id', async (req, res) => {
res.status(500).json({ error: 'Failed to fetch product' });
}
});
// Get available brands (for filter dropdowns)
router.get('/meta/brands', async (req, res) => {
try {
const { store_id } = req.query;
let query = `
SELECT DISTINCT brand
FROM products
WHERE brand IS NOT NULL AND brand != ''
`;
const params = [];
if (store_id) {
query += ' AND store_id = $1';
params.push(store_id);
}
query += ' ORDER BY brand';
const result = await migrate_1.pool.query(query, params);
const brands = result.rows.map((row) => row.brand);
res.json({ brands });
}
catch (error) {
console.error('Error fetching brands:', error);
res.status(500).json({ error: 'Failed to fetch brands' });
}
});
// Get price range (for filter sliders)
router.get('/meta/price-range', async (req, res) => {
try {
const { store_id } = req.query;
let query = `
SELECT
MIN(price) as min_price,
MAX(price) as max_price,
AVG(price) as avg_price
FROM products
WHERE price IS NOT NULL
`;
const params = [];
if (store_id) {
query += ' AND store_id = $1';
params.push(store_id);
}
const result = await migrate_1.pool.query(query, params);
res.json({
min_price: parseFloat(result.rows[0].min_price) || 0,
max_price: parseFloat(result.rows[0].max_price) || 0,
avg_price: parseFloat(result.rows[0].avg_price) || 0
});
}
catch (error) {
console.error('Error fetching price range:', error);
res.status(500).json({ error: 'Failed to fetch price range' });
}
});
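// Example (illustrative): typical filter-UI calls to the two metadata endpoints
// above, assuming this router is mounted at /api/products. Values are made up.
//
//   GET /api/products/meta/brands?store_id=3
//   -> { "brands": ["Alien Labs", "Wyld"] }
//
//   GET /api/products/meta/price-range?store_id=3
//   -> { "min_price": 5, "max_price": 220, "avg_price": 38.42 }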
exports.default = router;

View File

@@ -1,17 +1,52 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
const proxyTestQueue_1 = require("../services/proxyTestQueue");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get all proxies
router.get('/', async (req, res) => {
try {
const result = await migrate_1.pool.query(`
SELECT id, host, port, protocol, active, is_anonymous,
last_tested_at, test_result, response_time_ms, created_at
SELECT id, host, port, protocol, active, is_anonymous,
last_tested_at, test_result, response_time_ms, created_at,
city, state, country, country_code, location_updated_at
FROM proxies
ORDER BY created_at DESC
`);
@@ -22,6 +57,32 @@ router.get('/', async (req, res) => {
res.status(500).json({ error: 'Failed to fetch proxies' });
}
});
// Get active proxy test job (must be before /:id route)
router.get('/test-job', async (req, res) => {
try {
const job = await (0, proxyTestQueue_1.getActiveProxyTestJob)();
res.json({ job });
}
catch (error) {
console.error('Error fetching active job:', error);
res.status(500).json({ error: 'Failed to fetch active job' });
}
});
// Get proxy test job status (must be before /:id route)
router.get('/test-job/:jobId', async (req, res) => {
try {
const { jobId } = req.params;
const job = await (0, proxyTestQueue_1.getProxyTestJob)(parseInt(jobId));
if (!job) {
return res.status(404).json({ error: 'Job not found' });
}
res.json({ job });
}
catch (error) {
console.error('Error fetching job status:', error);
res.status(500).json({ error: 'Failed to fetch job status' });
}
});
// Get single proxy
router.get('/:id', async (req, res) => {
try {
@@ -113,18 +174,30 @@ router.post('/:id/test', (0, middleware_1.requireRole)('superadmin', 'admin'), a
res.status(500).json({ error: 'Failed to test proxy' });
}
});
// Test all proxies
// Start proxy test job
router.post('/test-all', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
// Run in background
(0, proxy_1.testAllProxies)().catch(err => {
console.error('Background proxy testing error:', err);
});
res.json({ message: 'Proxy testing started in background' });
const jobId = await (0, proxyTestQueue_1.createProxyTestJob)();
res.json({ jobId, message: 'Proxy test job started' });
}
catch (error) {
console.error('Error starting proxy tests:', error);
res.status(500).json({ error: 'Failed to start proxy tests' });
console.error('Error starting proxy test job:', error);
res.status(500).json({ error: 'Failed to start proxy test job' });
}
});
// Cancel proxy test job
router.post('/test-job/:jobId/cancel', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { jobId } = req.params;
const cancelled = await (0, proxyTestQueue_1.cancelProxyTestJob)(parseInt(jobId));
if (!cancelled) {
return res.status(404).json({ error: 'Job not found or already completed' });
}
res.json({ message: 'Job cancelled successfully' });
}
catch (error) {
console.error('Error cancelling job:', error);
res.status(500).json({ error: 'Failed to cancel job' });
}
});
// Update proxy
@@ -171,4 +244,19 @@ router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, r
res.status(500).json({ error: 'Failed to delete proxy' });
}
});
// Update all proxy locations
router.post('/update-locations', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { updateAllProxyLocations } = await Promise.resolve().then(() => __importStar(require('../services/geolocation')));
// Run in background
updateAllProxyLocations().catch(err => {
console.error('❌ Location update failed:', err);
});
res.json({ message: 'Location update job started' });
}
catch (error) {
console.error('Error starting location update:', error);
res.status(500).json({ error: 'Failed to start location update' });
}
});
exports.default = router;

668
backend/dist/routes/public-api.js vendored Normal file
View File

@@ -0,0 +1,668 @@
"use strict";
/**
* Public API Routes for External Consumers (WordPress, etc.)
*
* These routes use the dutchie_az data pipeline and are protected by API key auth.
* Designed for Deeply Rooted and other WordPress sites consuming menu data.
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const migrate_1 = require("../db/migrate");
const connection_1 = require("../dutchie-az/db/connection");
const ipaddr_js_1 = __importDefault(require("ipaddr.js"));
const router = (0, express_1.Router)();
// ============================================================
// MIDDLEWARE
// ============================================================
/**
* Validates if an IP address matches any of the allowed IP patterns
*/
function isIpAllowed(clientIp, allowedIps) {
try {
const clientAddr = ipaddr_js_1.default.process(clientIp);
for (const allowedIp of allowedIps) {
const trimmed = allowedIp.trim();
if (!trimmed)
continue;
if (trimmed.includes('/')) {
try {
const range = ipaddr_js_1.default.parseCIDR(trimmed);
if (clientAddr.match(range)) {
return true;
}
}
catch (e) {
console.warn(`Invalid CIDR notation: ${trimmed}`);
continue;
}
}
else {
try {
const allowedAddr = ipaddr_js_1.default.process(trimmed);
if (clientAddr.toString() === allowedAddr.toString()) {
return true;
}
}
catch (e) {
console.warn(`Invalid IP address: ${trimmed}`);
continue;
}
}
}
return false;
}
catch (error) {
console.error('Error processing client IP:', error);
return false;
}
}
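// Example (illustrative) of the matching behavior above, relying on ipaddr.js
// normalizing IPv4-mapped IPv6 addresses via process():
//
//   isIpAllowed('203.0.113.7', ['203.0.113.0/24'])       // true  - CIDR match
//   isIpAllowed('203.0.113.7', ['198.51.100.1'])         // false - no exact match
//   isIpAllowed('::ffff:203.0.113.7', ['203.0.113.7'])   // true  - normalized to IPv4 before comparison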
/**
* Validates if a domain matches any of the allowed domain patterns
*/
function isDomainAllowed(origin, allowedDomains) {
try {
const url = new URL(origin);
const domain = url.hostname;
for (const allowedDomain of allowedDomains) {
const trimmed = allowedDomain.trim();
if (!trimmed)
continue;
if (trimmed.startsWith('*.')) {
const baseDomain = trimmed.substring(2);
if (domain === baseDomain || domain.endsWith('.' + baseDomain)) {
return true;
}
}
else {
if (domain === trimmed) {
return true;
}
}
}
return false;
}
catch (error) {
console.error('Error processing domain:', error);
return false;
}
}
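// Example (illustrative) of the wildcard handling above:
//
//   isDomainAllowed('https://shop.example.com/menu', ['*.example.com'])  // true  - subdomain wildcard
//   isDomainAllowed('https://example.com', ['*.example.com'])            // true  - bare base domain also allowed
//   isDomainAllowed('https://evil-example.com', ['*.example.com'])       // false - must match at a dot boundary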
/**
* Middleware to validate API key and resolve dispensary -> dutchie_az store mapping
*/
async function validatePublicApiKey(req, res, next) {
const apiKey = req.headers['x-api-key'];
if (!apiKey) {
return res.status(401).json({
error: 'Missing API key',
message: 'Provide your API key in the X-API-Key header'
});
}
try {
// Query WordPress permissions table with store info
const result = await migrate_1.pool.query(`
SELECT
p.id,
p.user_name,
p.api_key,
p.allowed_ips,
p.allowed_domains,
p.is_active,
p.store_id,
p.store_name
FROM wp_dutchie_api_permissions p
WHERE p.api_key = $1 AND p.is_active = 1
`, [apiKey]);
if (result.rows.length === 0) {
return res.status(401).json({
error: 'Invalid API key'
});
}
const permission = result.rows[0];
// Validate IP if configured
const clientIp = req.headers['x-forwarded-for']?.split(',')[0].trim() ||
req.headers['x-real-ip'] ||
req.ip ||
req.connection.remoteAddress ||
'';
if (permission.allowed_ips) {
const allowedIps = permission.allowed_ips.split('\n').filter((ip) => ip.trim());
if (allowedIps.length > 0 && !isIpAllowed(clientIp, allowedIps)) {
return res.status(403).json({
error: 'IP address not allowed',
client_ip: clientIp
});
}
}
// Validate domain if configured
const origin = req.get('origin') || req.get('referer') || '';
if (permission.allowed_domains && origin) {
const allowedDomains = permission.allowed_domains.split('\n').filter((d) => d.trim());
if (allowedDomains.length > 0 && !isDomainAllowed(origin, allowedDomains)) {
return res.status(403).json({
error: 'Domain not allowed',
origin: origin
});
}
}
        // Resolve the dutchie_az dispensary record that backs this permission's store
// Match by store name (from main DB) to dutchie_az.dispensaries.name
const storeResult = await (0, connection_1.query)(`
SELECT id FROM dispensaries
WHERE LOWER(TRIM(name)) = LOWER(TRIM($1))
OR LOWER(TRIM(name)) LIKE LOWER(TRIM($1)) || '%'
OR LOWER(TRIM($1)) LIKE LOWER(TRIM(name)) || '%'
ORDER BY
CASE WHEN LOWER(TRIM(name)) = LOWER(TRIM($1)) THEN 0 ELSE 1 END,
id
LIMIT 1
`, [permission.store_name]);
if (storeResult.rows.length > 0) {
permission.dutchie_az_store_id = storeResult.rows[0].id;
}
// Update last_used_at timestamp (async, don't wait)
migrate_1.pool.query(`
UPDATE wp_dutchie_api_permissions
SET last_used_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [permission.id]).catch((err) => {
console.error('Error updating last_used_at:', err);
});
req.apiPermission = permission;
next();
}
catch (error) {
console.error('Public API validation error:', error);
return res.status(500).json({
error: 'Internal server error during API validation'
});
}
}
// Apply middleware to all routes
router.use(validatePublicApiKey);
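// Example consumer call (illustrative): the API key is supplied in the X-API-Key
// header; 401 means a missing, invalid, or inactive key, and 403 means the caller's
// IP or Origin is outside the configured allow-lists. The base URL and key value
// below are placeholders.
//
//   const res = await fetch('https://crawlsy.example.com/api/v1/menu', {
//     headers: { 'X-API-Key': 'drk_live_xxxxxxxx' }
//   });
//   if (!res.ok) throw new Error(`Public API error ${res.status}`);
//   const { menu } = await res.json();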
// ============================================================
// PRODUCT ENDPOINTS
// ============================================================
/**
* GET /api/v1/products
* Get products for the authenticated dispensary
*
* Query params:
* - category: Filter by product type (e.g., 'flower', 'edible')
* - brand: Filter by brand name
* - in_stock_only: Only return in-stock products (default: false)
* - limit: Max products to return (default: 100, max: 500)
* - offset: Pagination offset (default: 0)
*/
router.get('/products', async (req, res) => {
try {
const permission = req.apiPermission;
// Check if we have a dutchie_az store mapping
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available. The dispensary may not be set up in the new data pipeline.`,
dispensary_name: permission.store_name
});
}
const { category, brand, in_stock_only = 'false', limit = '100', offset = '0' } = req.query;
// Build query
let whereClause = 'WHERE p.dispensary_id = $1';
const params = [permission.dutchie_az_store_id];
let paramIndex = 2;
// Filter by stock status if requested
if (in_stock_only === 'true' || in_stock_only === '1') {
whereClause += ` AND p.stock_status = 'in_stock'`;
}
// Filter by category (maps to 'type' in dutchie_az)
if (category) {
whereClause += ` AND LOWER(p.type) = LOWER($${paramIndex})`;
params.push(category);
paramIndex++;
}
// Filter by brand
if (brand) {
whereClause += ` AND LOWER(p.brand_name) LIKE LOWER($${paramIndex})`;
params.push(`%${brand}%`);
paramIndex++;
}
// Enforce limits
const limitNum = Math.min(parseInt(limit, 10) || 100, 500);
const offsetNum = parseInt(offset, 10) || 0;
params.push(limitNum, offsetNum);
// Query products with latest snapshot data
const { rows: products } = await (0, connection_1.query)(`
SELECT
p.id,
p.external_product_id as dutchie_id,
p.name,
p.brand_name as brand,
p.type as category,
p.subcategory,
p.strain_type,
p.stock_status,
p.thc,
p.cbd,
p.primary_image_url as image_url,
p.images,
p.effects,
p.created_at,
p.updated_at,
-- Latest snapshot data for pricing
s.rec_min_price_cents,
s.rec_max_price_cents,
s.rec_min_special_price_cents,
s.med_min_price_cents,
s.med_max_price_cents,
s.med_min_special_price_cents,
s.total_quantity_available,
s.options,
s.special,
s.crawled_at as snapshot_at
FROM dutchie_products p
LEFT JOIN LATERAL (
SELECT * FROM dutchie_product_snapshots
WHERE dutchie_product_id = p.id
ORDER BY crawled_at DESC
LIMIT 1
) s ON true
${whereClause}
ORDER BY p.name ASC
LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
`, params);
// Get total count for pagination
const { rows: countRows } = await (0, connection_1.query)(`
SELECT COUNT(*) as total FROM dutchie_products p ${whereClause}
`, params.slice(0, -2));
// Transform products to backward-compatible format
const transformedProducts = products.map((p) => {
// Extract first image URL from images array
let imageUrl = p.image_url;
if (!imageUrl && p.images && Array.isArray(p.images) && p.images.length > 0) {
const firstImage = p.images[0];
imageUrl = typeof firstImage === 'string' ? firstImage : firstImage?.url;
}
// Convert prices from cents to dollars
const regularPrice = p.rec_min_price_cents
? (p.rec_min_price_cents / 100).toFixed(2)
: null;
const salePrice = p.rec_min_special_price_cents
? (p.rec_min_special_price_cents / 100).toFixed(2)
: null;
return {
id: p.id,
dutchie_id: p.dutchie_id,
name: p.name,
brand: p.brand || null,
category: p.category || null,
subcategory: p.subcategory || null,
strain_type: p.strain_type || null,
description: null, // Not stored in dutchie_products, would need snapshot
regular_price: regularPrice,
sale_price: salePrice,
thc_percentage: p.thc ? parseFloat(p.thc) : null,
cbd_percentage: p.cbd ? parseFloat(p.cbd) : null,
image_url: imageUrl || null,
in_stock: p.stock_status === 'in_stock',
on_special: p.special || false,
effects: p.effects || [],
options: p.options || [],
quantity_available: p.total_quantity_available || 0,
created_at: p.created_at,
updated_at: p.updated_at,
snapshot_at: p.snapshot_at
};
});
res.json({
success: true,
dispensary: permission.store_name,
products: transformedProducts,
pagination: {
total: parseInt(countRows[0]?.total || '0', 10),
limit: limitNum,
offset: offsetNum,
has_more: offsetNum + products.length < parseInt(countRows[0]?.total || '0', 10)
}
});
}
catch (error) {
console.error('Public API products error:', error);
res.status(500).json({
error: 'Failed to fetch products',
message: error.message
});
}
});
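// Example (illustrative): walking the full catalog with the pagination metadata
// returned above. baseUrl and apiKey are placeholders; the server caps limit at 500.
//
//   async function fetchAllProducts(baseUrl, apiKey) {
//     const all = [];
//     const limit = 500;
//     for (let offset = 0; ; offset += limit) {
//       const res = await fetch(`${baseUrl}/api/v1/products?limit=${limit}&offset=${offset}`, {
//         headers: { 'X-API-Key': apiKey }
//       });
//       const body = await res.json();
//       all.push(...body.products);
//       if (!body.pagination.has_more) break;
//     }
//     return all;
//   }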
/**
* GET /api/v1/products/:id
* Get a single product by ID
*/
router.get('/products/:id', async (req, res) => {
try {
const permission = req.apiPermission;
const { id } = req.params;
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available.`
});
}
// Get product with latest snapshot
const { rows: products } = await (0, connection_1.query)(`
SELECT
p.*,
s.rec_min_price_cents,
s.rec_max_price_cents,
s.rec_min_special_price_cents,
s.med_min_price_cents,
s.med_max_price_cents,
s.total_quantity_available,
s.options,
s.special,
s.crawled_at as snapshot_at
FROM dutchie_products p
LEFT JOIN LATERAL (
SELECT * FROM dutchie_product_snapshots
WHERE dutchie_product_id = p.id
ORDER BY crawled_at DESC
LIMIT 1
) s ON true
WHERE p.id = $1 AND p.dispensary_id = $2
`, [id, permission.dutchie_az_store_id]);
if (products.length === 0) {
return res.status(404).json({
error: 'Product not found'
});
}
const p = products[0];
// Extract first image URL
let imageUrl = p.primary_image_url;
if (!imageUrl && p.images && Array.isArray(p.images) && p.images.length > 0) {
const firstImage = p.images[0];
imageUrl = typeof firstImage === 'string' ? firstImage : firstImage?.url;
}
res.json({
success: true,
product: {
id: p.id,
dutchie_id: p.external_product_id,
name: p.name,
brand: p.brand_name || null,
category: p.type || null,
subcategory: p.subcategory || null,
strain_type: p.strain_type || null,
regular_price: p.rec_min_price_cents ? (p.rec_min_price_cents / 100).toFixed(2) : null,
sale_price: p.rec_min_special_price_cents ? (p.rec_min_special_price_cents / 100).toFixed(2) : null,
thc_percentage: p.thc ? parseFloat(p.thc) : null,
cbd_percentage: p.cbd ? parseFloat(p.cbd) : null,
image_url: imageUrl || null,
images: p.images || [],
in_stock: p.stock_status === 'in_stock',
on_special: p.special || false,
effects: p.effects || [],
options: p.options || [],
quantity_available: p.total_quantity_available || 0,
created_at: p.created_at,
updated_at: p.updated_at,
snapshot_at: p.snapshot_at
}
});
}
catch (error) {
console.error('Public API product detail error:', error);
res.status(500).json({
error: 'Failed to fetch product',
message: error.message
});
}
});
/**
* GET /api/v1/categories
* Get all categories for the authenticated dispensary
*/
router.get('/categories', async (req, res) => {
try {
const permission = req.apiPermission;
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available.`
});
}
const { rows: categories } = await (0, connection_1.query)(`
SELECT
type as category,
subcategory,
COUNT(*) as product_count,
COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count
FROM dutchie_products
WHERE dispensary_id = $1 AND type IS NOT NULL
GROUP BY type, subcategory
ORDER BY type, subcategory
`, [permission.dutchie_az_store_id]);
res.json({
success: true,
dispensary: permission.store_name,
categories
});
}
catch (error) {
console.error('Public API categories error:', error);
res.status(500).json({
error: 'Failed to fetch categories',
message: error.message
});
}
});
/**
* GET /api/v1/brands
* Get all brands for the authenticated dispensary
*/
router.get('/brands', async (req, res) => {
try {
const permission = req.apiPermission;
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available.`
});
}
const { rows: brands } = await (0, connection_1.query)(`
SELECT
brand_name as brand,
COUNT(*) as product_count,
COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count
FROM dutchie_products
WHERE dispensary_id = $1 AND brand_name IS NOT NULL
GROUP BY brand_name
ORDER BY product_count DESC
`, [permission.dutchie_az_store_id]);
res.json({
success: true,
dispensary: permission.store_name,
brands
});
}
catch (error) {
console.error('Public API brands error:', error);
res.status(500).json({
error: 'Failed to fetch brands',
message: error.message
});
}
});
/**
* GET /api/v1/specials
* Get products on special/sale for the authenticated dispensary
*/
router.get('/specials', async (req, res) => {
try {
const permission = req.apiPermission;
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available.`
});
}
const { limit = '100', offset = '0' } = req.query;
const limitNum = Math.min(parseInt(limit, 10) || 100, 500);
const offsetNum = parseInt(offset, 10) || 0;
// Get products with special pricing from latest snapshot
const { rows: products } = await (0, connection_1.query)(`
SELECT
p.id,
p.external_product_id as dutchie_id,
p.name,
p.brand_name as brand,
p.type as category,
p.subcategory,
p.strain_type,
p.stock_status,
p.primary_image_url as image_url,
s.rec_min_price_cents,
s.rec_min_special_price_cents,
s.special,
s.options,
p.updated_at,
s.crawled_at as snapshot_at
FROM dutchie_products p
INNER JOIN LATERAL (
SELECT * FROM dutchie_product_snapshots
WHERE dutchie_product_id = p.id
ORDER BY crawled_at DESC
LIMIT 1
) s ON true
WHERE p.dispensary_id = $1
AND s.special = true
AND p.stock_status = 'in_stock'
ORDER BY p.name ASC
LIMIT $2 OFFSET $3
`, [permission.dutchie_az_store_id, limitNum, offsetNum]);
// Get total count
const { rows: countRows } = await (0, connection_1.query)(`
SELECT COUNT(*) as total
FROM dutchie_products p
INNER JOIN LATERAL (
SELECT special FROM dutchie_product_snapshots
WHERE dutchie_product_id = p.id
ORDER BY crawled_at DESC
LIMIT 1
) s ON true
WHERE p.dispensary_id = $1
AND s.special = true
AND p.stock_status = 'in_stock'
`, [permission.dutchie_az_store_id]);
const transformedProducts = products.map((p) => ({
id: p.id,
dutchie_id: p.dutchie_id,
name: p.name,
brand: p.brand || null,
category: p.category || null,
strain_type: p.strain_type || null,
regular_price: p.rec_min_price_cents ? (p.rec_min_price_cents / 100).toFixed(2) : null,
sale_price: p.rec_min_special_price_cents ? (p.rec_min_special_price_cents / 100).toFixed(2) : null,
image_url: p.image_url || null,
in_stock: p.stock_status === 'in_stock',
options: p.options || [],
updated_at: p.updated_at,
snapshot_at: p.snapshot_at
}));
res.json({
success: true,
dispensary: permission.store_name,
specials: transformedProducts,
pagination: {
total: parseInt(countRows[0]?.total || '0', 10),
limit: limitNum,
offset: offsetNum,
has_more: offsetNum + products.length < parseInt(countRows[0]?.total || '0', 10)
}
});
}
catch (error) {
console.error('Public API specials error:', error);
res.status(500).json({
error: 'Failed to fetch specials',
message: error.message
});
}
});
/**
* GET /api/v1/menu
* Get complete menu summary for the authenticated dispensary
*/
router.get('/menu', async (req, res) => {
try {
const permission = req.apiPermission;
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available.`
});
}
// Get counts by category
const { rows: categoryCounts } = await (0, connection_1.query)(`
SELECT
type as category,
COUNT(*) as total,
COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock
FROM dutchie_products
WHERE dispensary_id = $1 AND type IS NOT NULL
GROUP BY type
ORDER BY total DESC
`, [permission.dutchie_az_store_id]);
// Get overall stats
const { rows: stats } = await (0, connection_1.query)(`
SELECT
COUNT(*) as total_products,
COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count,
COUNT(DISTINCT brand_name) as brand_count,
COUNT(DISTINCT type) as category_count,
MAX(updated_at) as last_updated
FROM dutchie_products
WHERE dispensary_id = $1
`, [permission.dutchie_az_store_id]);
// Get specials count
const { rows: specialsCount } = await (0, connection_1.query)(`
SELECT COUNT(*) as count
FROM dutchie_products p
INNER JOIN LATERAL (
SELECT special FROM dutchie_product_snapshots
WHERE dutchie_product_id = p.id
ORDER BY crawled_at DESC
LIMIT 1
) s ON true
WHERE p.dispensary_id = $1
AND s.special = true
AND p.stock_status = 'in_stock'
`, [permission.dutchie_az_store_id]);
const summary = stats[0] || {};
res.json({
success: true,
dispensary: permission.store_name,
menu: {
total_products: parseInt(summary.total_products || '0', 10),
in_stock_count: parseInt(summary.in_stock_count || '0', 10),
brand_count: parseInt(summary.brand_count || '0', 10),
category_count: parseInt(summary.category_count || '0', 10),
specials_count: parseInt(specialsCount[0]?.count || '0', 10),
last_updated: summary.last_updated,
categories: categoryCounts.map((c) => ({
name: c.category,
total: parseInt(c.total, 10),
in_stock: parseInt(c.in_stock, 10)
}))
}
});
}
catch (error) {
console.error('Public API menu error:', error);
res.status(500).json({
error: 'Failed to fetch menu summary',
message: error.message
});
}
});
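// Example response shape for GET /api/v1/menu (illustrative values):
//
//   {
//     "success": true,
//     "dispensary": "Deeply Rooted",
//     "menu": {
//       "total_products": 412,
//       "in_stock_count": 388,
//       "brand_count": 57,
//       "category_count": 9,
//       "specials_count": 23,
//       "last_updated": "2025-12-03T18:10:00.000Z",
//       "categories": [{ "name": "Flower", "total": 120, "in_stock": 114 }]
//     }
//   }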
exports.default = router;

887
backend/dist/routes/schedule.js vendored Normal file
View File

@@ -0,0 +1,887 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const crawl_scheduler_1 = require("../services/crawl-scheduler");
const store_crawl_orchestrator_1 = require("../services/store-crawl-orchestrator");
const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
const migrate_1 = require("../db/migrate");
const graphql_client_1 = require("../dutchie-az/services/graphql-client");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// ============================================
// Global Schedule Endpoints
// ============================================
/**
* GET /api/schedule/global
* Get global schedule settings
*/
router.get('/global', async (req, res) => {
try {
const schedules = await (0, crawl_scheduler_1.getGlobalSchedule)();
res.json({ schedules });
}
catch (error) {
console.error('Error fetching global schedule:', error);
res.status(500).json({ error: 'Failed to fetch global schedule' });
}
});
/**
* PUT /api/schedule/global/:type
* Update global schedule setting
*/
router.put('/global/:type', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { type } = req.params;
const { enabled, interval_hours, run_time } = req.body;
if (type !== 'global_interval' && type !== 'daily_special') {
return res.status(400).json({ error: 'Invalid schedule type' });
}
const schedule = await (0, crawl_scheduler_1.updateGlobalSchedule)(type, {
enabled,
interval_hours,
run_time
});
// Restart scheduler to apply changes
await (0, crawl_scheduler_1.restartCrawlScheduler)();
res.json({ schedule, message: 'Schedule updated and scheduler restarted' });
}
catch (error) {
console.error('Error updating global schedule:', error);
res.status(500).json({ error: 'Failed to update global schedule' });
}
});
// ============================================
// Store Schedule Endpoints
// ============================================
/**
* GET /api/schedule/stores
* Get all store schedule statuses
*/
router.get('/stores', async (req, res) => {
try {
const stores = await (0, crawl_scheduler_1.getStoreScheduleStatuses)();
res.json({ stores });
}
catch (error) {
console.error('Error fetching store schedules:', error);
res.status(500).json({ error: 'Failed to fetch store schedules' });
}
});
/**
* GET /api/schedule/stores/:storeId
* Get schedule for a specific store
*/
router.get('/stores/:storeId', async (req, res) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const schedule = await (0, crawl_scheduler_1.getStoreSchedule)(storeId);
res.json({ schedule });
}
catch (error) {
console.error('Error fetching store schedule:', error);
res.status(500).json({ error: 'Failed to fetch store schedule' });
}
});
/**
* PUT /api/schedule/stores/:storeId
* Update schedule for a specific store
*/
router.put('/stores/:storeId', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const { enabled, interval_hours, daily_special_enabled, daily_special_time, priority } = req.body;
const schedule = await (0, crawl_scheduler_1.updateStoreSchedule)(storeId, {
enabled,
interval_hours,
daily_special_enabled,
daily_special_time,
priority
});
res.json({ schedule });
}
catch (error) {
console.error('Error updating store schedule:', error);
res.status(500).json({ error: 'Failed to update store schedule' });
}
});
// ============================================
// Job Queue Endpoints
// ============================================
/**
* GET /api/schedule/jobs
* Get recent jobs
*/
router.get('/jobs', async (req, res) => {
try {
const limit = parseInt(req.query.limit) || 50;
const jobs = await (0, crawl_scheduler_1.getAllRecentJobs)(Math.min(limit, 200));
res.json({ jobs });
}
catch (error) {
console.error('Error fetching jobs:', error);
res.status(500).json({ error: 'Failed to fetch jobs' });
}
});
/**
* GET /api/schedule/jobs/store/:storeId
* Get recent jobs for a specific store
*/
router.get('/jobs/store/:storeId', async (req, res) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const limit = parseInt(req.query.limit) || 10;
const jobs = await (0, crawl_scheduler_1.getRecentJobs)(storeId, Math.min(limit, 100));
res.json({ jobs });
}
catch (error) {
console.error('Error fetching store jobs:', error);
res.status(500).json({ error: 'Failed to fetch store jobs' });
}
});
/**
* POST /api/schedule/jobs/:jobId/cancel
* Cancel a pending job
*/
router.post('/jobs/:jobId/cancel', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const jobId = parseInt(req.params.jobId);
if (isNaN(jobId)) {
return res.status(400).json({ error: 'Invalid job ID' });
}
const cancelled = await (0, crawl_scheduler_1.cancelJob)(jobId);
if (cancelled) {
res.json({ success: true, message: 'Job cancelled' });
}
else {
res.status(400).json({ error: 'Job could not be cancelled (may not be pending)' });
}
}
catch (error) {
console.error('Error cancelling job:', error);
res.status(500).json({ error: 'Failed to cancel job' });
}
});
// ============================================
// Manual Trigger Endpoints
// ============================================
/**
* POST /api/schedule/trigger/store/:storeId
* Manually trigger orchestrated crawl for a specific store
* Uses the intelligent orchestrator which:
* - Checks provider detection status
* - Runs detection if needed
* - Queues appropriate crawl type (production/sandbox)
*/
router.post('/trigger/store/:storeId', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
        // Use the orchestrator instead of the simple triggerManualCrawl path
const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId);
res.json({
result,
message: result.summary,
success: result.status === 'success' || result.status === 'sandbox_only',
});
}
catch (error) {
console.error('Error triggering orchestrated crawl:', error);
res.status(500).json({ error: 'Failed to trigger crawl' });
}
});
/**
* POST /api/schedule/trigger/store/:storeId/legacy
* Legacy: Simple job queue trigger (no orchestration)
*/
router.post('/trigger/store/:storeId/legacy', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const job = await (0, crawl_scheduler_1.triggerManualCrawl)(storeId);
res.json({ job, message: 'Crawl job created' });
}
catch (error) {
console.error('Error triggering manual crawl:', error);
res.status(500).json({ error: 'Failed to trigger crawl' });
}
});
/**
* POST /api/schedule/trigger/all
* Manually trigger crawls for all stores
*/
router.post('/trigger/all', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const jobsCreated = await (0, crawl_scheduler_1.triggerAllStoresCrawl)();
res.json({ jobs_created: jobsCreated, message: `Created ${jobsCreated} crawl jobs` });
}
catch (error) {
console.error('Error triggering all crawls:', error);
res.status(500).json({ error: 'Failed to trigger crawls' });
}
});
/**
* POST /api/schedule/restart
* Restart the scheduler
*/
router.post('/restart', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
await (0, crawl_scheduler_1.restartCrawlScheduler)();
res.json({ message: 'Scheduler restarted', mode: (0, crawl_scheduler_1.getSchedulerMode)() });
}
catch (error) {
console.error('Error restarting scheduler:', error);
res.status(500).json({ error: 'Failed to restart scheduler' });
}
});
// ============================================
// Scheduler Mode Endpoints
// ============================================
/**
* GET /api/schedule/mode
* Get current scheduler mode
*/
router.get('/mode', async (req, res) => {
try {
const mode = (0, crawl_scheduler_1.getSchedulerMode)();
res.json({ mode });
}
catch (error) {
console.error('Error getting scheduler mode:', error);
res.status(500).json({ error: 'Failed to get scheduler mode' });
}
});
/**
* PUT /api/schedule/mode
* Set scheduler mode (legacy or orchestrator)
*/
router.put('/mode', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { mode } = req.body;
if (mode !== 'legacy' && mode !== 'orchestrator') {
return res.status(400).json({ error: 'Invalid mode. Must be "legacy" or "orchestrator"' });
}
(0, crawl_scheduler_1.setSchedulerMode)(mode);
// Restart scheduler with new mode
await (0, crawl_scheduler_1.restartCrawlScheduler)();
res.json({ mode, message: `Scheduler mode set to ${mode} and restarted` });
}
catch (error) {
console.error('Error setting scheduler mode:', error);
res.status(500).json({ error: 'Failed to set scheduler mode' });
}
});
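// Example (illustrative): switching the scheduler to orchestrator mode from an
// admin client. Requires a superadmin/admin JWT; `token` is a placeholder.
//
//   await fetch('/api/schedule/mode', {
//     method: 'PUT',
//     headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` },
//     body: JSON.stringify({ mode: 'orchestrator' })
//   });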
/**
* GET /api/schedule/due
* Get stores that are due for orchestration
*/
router.get('/due', async (req, res) => {
try {
const limit = parseInt(req.query.limit) || 10;
const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(Math.min(limit, 50));
res.json({ stores_due: storeIds, count: storeIds.length });
}
catch (error) {
console.error('Error getting stores due for orchestration:', error);
res.status(500).json({ error: 'Failed to get stores due' });
}
});
// ============================================
// Dispensary Schedule Endpoints (NEW - dispensary-centric)
// ============================================
/**
* GET /api/schedule/dispensaries
* Get all dispensary schedule statuses with optional filters
* Query params:
* - state: filter by state (e.g., 'AZ')
* - search: search by name or slug
*/
router.get('/dispensaries', async (req, res) => {
try {
const { state, search } = req.query;
// Build dynamic query with optional filters
const conditions = [];
const params = [];
let paramIndex = 1;
if (state) {
conditions.push(`d.state = $${paramIndex}`);
params.push(state);
paramIndex++;
}
if (search) {
conditions.push(`(d.name ILIKE $${paramIndex} OR d.slug ILIKE $${paramIndex} OR d.dba_name ILIKE $${paramIndex})`);
params.push(`%${search}%`);
paramIndex++;
}
const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
const query = `
SELECT
d.id AS dispensary_id,
COALESCE(d.dba_name, d.name) AS dispensary_name,
d.slug AS dispensary_slug,
d.city,
d.state,
d.menu_url,
d.menu_type,
d.platform_dispensary_id,
d.scrape_enabled,
d.last_crawl_at,
d.crawl_status,
d.product_crawler_mode,
d.product_provider,
cs.interval_minutes,
cs.is_active,
cs.priority,
cs.last_run_at,
cs.next_run_at,
cs.last_status AS schedule_last_status,
cs.last_error AS schedule_last_error,
cs.consecutive_failures,
j.id AS latest_job_id,
j.status AS latest_job_status,
j.job_type AS latest_job_type,
j.started_at AS latest_job_started,
j.completed_at AS latest_job_completed,
j.products_found AS latest_products_found,
j.products_new AS latest_products_created,
j.products_updated AS latest_products_updated,
j.error_message AS latest_job_error,
CASE
WHEN d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL THEN true
ELSE false
END AS can_crawl,
CASE
WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'menu_type not detected'
WHEN d.menu_type != 'dutchie' THEN 'not dutchie platform'
WHEN d.platform_dispensary_id IS NULL THEN 'platform ID not resolved'
WHEN d.scrape_enabled = false THEN 'scraping disabled'
ELSE 'ready'
END AS schedule_status_reason
FROM public.dispensaries d
LEFT JOIN public.dispensary_crawl_schedule cs ON cs.dispensary_id = d.id
LEFT JOIN LATERAL (
SELECT *
FROM public.dispensary_crawl_jobs dj
WHERE dj.dispensary_id = d.id
ORDER BY dj.created_at DESC
LIMIT 1
) j ON true
${whereClause}
ORDER BY cs.priority DESC NULLS LAST, COALESCE(d.dba_name, d.name)
`;
const result = await migrate_1.pool.query(query, params);
res.json({ dispensaries: result.rows });
}
catch (error) {
console.error('Error fetching dispensary schedules:', error);
res.status(500).json({ error: 'Failed to fetch dispensary schedules' });
}
});
/**
* GET /api/schedule/dispensaries/:id
* Get schedule for a specific dispensary
*/
router.get('/dispensaries/:id', async (req, res) => {
try {
const dispensaryId = parseInt(req.params.id);
if (isNaN(dispensaryId)) {
return res.status(400).json({ error: 'Invalid dispensary ID' });
}
const result = await migrate_1.pool.query(`
SELECT * FROM dispensary_crawl_status
WHERE dispensary_id = $1
`, [dispensaryId]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
res.json({ schedule: result.rows[0] });
}
catch (error) {
console.error('Error fetching dispensary schedule:', error);
res.status(500).json({ error: 'Failed to fetch dispensary schedule' });
}
});
/**
* PUT /api/schedule/dispensaries/:id
* Update schedule for a specific dispensary
*/
router.put('/dispensaries/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const dispensaryId = parseInt(req.params.id);
if (isNaN(dispensaryId)) {
return res.status(400).json({ error: 'Invalid dispensary ID' });
}
const { is_active, interval_minutes, priority } = req.body;
// Upsert schedule
const result = await migrate_1.pool.query(`
INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
VALUES ($1, COALESCE($2, TRUE), COALESCE($3, 240), COALESCE($4, 0))
ON CONFLICT (dispensary_id) DO UPDATE SET
is_active = COALESCE($2, dispensary_crawl_schedule.is_active),
interval_minutes = COALESCE($3, dispensary_crawl_schedule.interval_minutes),
priority = COALESCE($4, dispensary_crawl_schedule.priority),
updated_at = NOW()
RETURNING *
`, [dispensaryId, is_active, interval_minutes, priority]);
res.json({ schedule: result.rows[0] });
}
catch (error) {
console.error('Error updating dispensary schedule:', error);
res.status(500).json({ error: 'Failed to update dispensary schedule' });
}
});
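// Example (illustrative): because every field in the upsert above is wrapped in
// COALESCE, a partial body only changes the fields it actually contains. Here the
// interval changes while is_active and priority keep their stored values
// (the dispensary id and token are placeholders).
//
//   await fetch('/api/schedule/dispensaries/42', {
//     method: 'PUT',
//     headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` },
//     body: JSON.stringify({ interval_minutes: 120 })
//   });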
/**
* GET /api/schedule/dispensary-jobs
* Get recent dispensary crawl jobs
*/
router.get('/dispensary-jobs', async (req, res) => {
try {
const limit = parseInt(req.query.limit) || 50;
const result = await migrate_1.pool.query(`
SELECT dcj.*, d.name as dispensary_name
FROM dispensary_crawl_jobs dcj
JOIN dispensaries d ON d.id = dcj.dispensary_id
ORDER BY dcj.created_at DESC
LIMIT $1
`, [Math.min(limit, 200)]);
res.json({ jobs: result.rows });
}
catch (error) {
console.error('Error fetching dispensary jobs:', error);
res.status(500).json({ error: 'Failed to fetch dispensary jobs' });
}
});
/**
* GET /api/schedule/dispensary-jobs/:dispensaryId
* Get recent jobs for a specific dispensary
*/
router.get('/dispensary-jobs/:dispensaryId', async (req, res) => {
try {
const dispensaryId = parseInt(req.params.dispensaryId);
if (isNaN(dispensaryId)) {
return res.status(400).json({ error: 'Invalid dispensary ID' });
}
const limit = parseInt(req.query.limit) || 10;
const result = await migrate_1.pool.query(`
SELECT dcj.*, d.name as dispensary_name
FROM dispensary_crawl_jobs dcj
JOIN dispensaries d ON d.id = dcj.dispensary_id
WHERE dcj.dispensary_id = $1
ORDER BY dcj.created_at DESC
LIMIT $2
`, [dispensaryId, Math.min(limit, 100)]);
res.json({ jobs: result.rows });
}
catch (error) {
console.error('Error fetching dispensary jobs:', error);
res.status(500).json({ error: 'Failed to fetch dispensary jobs' });
}
});
/**
* POST /api/schedule/trigger/dispensary/:id
* Trigger orchestrator for a specific dispensary (Run Now button)
*/
router.post('/trigger/dispensary/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const dispensaryId = parseInt(req.params.id);
if (isNaN(dispensaryId)) {
return res.status(400).json({ error: 'Invalid dispensary ID' });
}
// Run the dispensary orchestrator
const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(dispensaryId);
res.json({
result,
message: result.summary,
success: result.status === 'success' || result.status === 'sandbox_only' || result.status === 'detection_only',
});
}
catch (error) {
console.error('Error triggering dispensary orchestrator:', error);
res.status(500).json({ error: 'Failed to trigger orchestrator' });
}
});
/**
* POST /api/schedule/trigger/dispensaries/batch
* Trigger orchestrator for multiple dispensaries
*/
router.post('/trigger/dispensaries/batch', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { dispensary_ids, concurrency } = req.body;
if (!Array.isArray(dispensary_ids) || dispensary_ids.length === 0) {
return res.status(400).json({ error: 'dispensary_ids must be a non-empty array' });
}
const results = await (0, dispensary_orchestrator_1.runBatchDispensaryOrchestrator)(dispensary_ids, concurrency || 3);
const summary = {
total: results.length,
success: results.filter(r => r.status === 'success').length,
sandbox_only: results.filter(r => r.status === 'sandbox_only').length,
detection_only: results.filter(r => r.status === 'detection_only').length,
error: results.filter(r => r.status === 'error').length,
};
res.json({ results, summary });
}
catch (error) {
console.error('Error triggering batch orchestrator:', error);
res.status(500).json({ error: 'Failed to trigger batch orchestrator' });
}
});
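// Example (illustrative): orchestrating a hand-picked set of dispensaries with a
// concurrency of 2 (ids and token are placeholders).
//
//   await fetch('/api/schedule/trigger/dispensaries/batch', {
//     method: 'POST',
//     headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` },
//     body: JSON.stringify({ dispensary_ids: [12, 17, 23], concurrency: 2 })
//   });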
/**
* GET /api/schedule/dispensary-due
* Get dispensaries that are due for orchestration
*/
router.get('/dispensary-due', async (req, res) => {
try {
const limit = parseInt(req.query.limit) || 10;
const dispensaryIds = await (0, dispensary_orchestrator_1.getDispensariesDueForOrchestration)(Math.min(limit, 50));
// Get details for the due dispensaries
if (dispensaryIds.length > 0) {
const details = await migrate_1.pool.query(`
SELECT d.id, d.name, d.product_provider, d.product_crawler_mode,
dcs.next_run_at, dcs.last_status, dcs.priority
FROM dispensaries d
LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
WHERE d.id = ANY($1)
ORDER BY COALESCE(dcs.priority, 0) DESC, dcs.last_run_at ASC NULLS FIRST
`, [dispensaryIds]);
res.json({ dispensaries_due: details.rows, count: dispensaryIds.length });
}
else {
res.json({ dispensaries_due: [], count: 0 });
}
}
catch (error) {
console.error('Error getting dispensaries due for orchestration:', error);
res.status(500).json({ error: 'Failed to get dispensaries due' });
}
});
/**
* POST /api/schedule/dispensaries/bootstrap
* Ensure all dispensaries have schedule entries
*/
router.post('/dispensaries/bootstrap', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { interval_minutes } = req.body;
const result = await (0, dispensary_orchestrator_1.ensureAllDispensariesHaveSchedules)(interval_minutes || 240);
res.json({
message: `Created ${result.created} new schedules, ${result.existing} already existed`,
created: result.created,
existing: result.existing,
});
}
catch (error) {
console.error('Error bootstrapping dispensary schedules:', error);
res.status(500).json({ error: 'Failed to bootstrap schedules' });
}
});
// ============================================
// Platform ID & Menu Type Detection Endpoints
// ============================================
/**
* POST /api/schedule/dispensaries/:id/resolve-platform-id
* Resolve the Dutchie platform_dispensary_id from menu_url slug
*/
router.post('/dispensaries/:id/resolve-platform-id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const dispensaryId = parseInt(req.params.id);
if (isNaN(dispensaryId)) {
return res.status(400).json({ error: 'Invalid dispensary ID' });
}
// Get dispensary info
const dispensaryResult = await migrate_1.pool.query(`
SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id
FROM dispensaries WHERE id = $1
`, [dispensaryId]);
if (dispensaryResult.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
const dispensary = dispensaryResult.rows[0];
// Check if already resolved
if (dispensary.platform_dispensary_id) {
return res.json({
success: true,
message: 'Platform ID already resolved',
platform_dispensary_id: dispensary.platform_dispensary_id,
already_resolved: true
});
}
// Extract slug from menu_url for Dutchie URLs
let slugToResolve = dispensary.slug;
if (dispensary.menu_url) {
// Match embedded-menu or dispensary URLs
const match = dispensary.menu_url.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i);
if (match) {
slugToResolve = match[1];
}
}
if (!slugToResolve) {
return res.status(400).json({
error: 'No slug available to resolve platform ID',
menu_url: dispensary.menu_url
});
}
console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`);
// Resolve platform ID using GraphQL client
const platformId = await (0, graphql_client_1.resolveDispensaryId)(slugToResolve);
if (!platformId) {
return res.status(404).json({
error: 'Could not resolve platform ID',
slug_tried: slugToResolve,
message: 'The dispensary might not be on Dutchie or the slug is incorrect'
});
}
// Update the dispensary with resolved platform ID
await migrate_1.pool.query(`
UPDATE dispensaries
SET platform_dispensary_id = $1,
menu_type = COALESCE(menu_type, 'dutchie'),
updated_at = NOW()
WHERE id = $2
`, [platformId, dispensaryId]);
res.json({
success: true,
platform_dispensary_id: platformId,
slug_resolved: slugToResolve,
message: `Platform ID resolved: ${platformId}`
});
}
catch (error) {
console.error('Error resolving platform ID:', error);
res.status(500).json({ error: 'Failed to resolve platform ID', details: error.message });
}
});
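// Example (illustrative) of what the slug-extraction regex above matches; the
// slugs shown are placeholders:
//
//   'https://dutchie.com/embedded-menu/deeply-rooted'        -> 'deeply-rooted'
//   'https://dutchie.com/dispensary/deeply-rooted?menu=rec'  -> 'deeply-rooted'
//   'https://example.com/order-online'                       -> no match; falls back to the stored slug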
/**
* POST /api/schedule/dispensaries/:id/detect-menu-type
* Detect menu type from menu_url
*/
router.post('/dispensaries/:id/detect-menu-type', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const dispensaryId = parseInt(req.params.id);
if (isNaN(dispensaryId)) {
return res.status(400).json({ error: 'Invalid dispensary ID' });
}
// Get dispensary info
const dispensaryResult = await migrate_1.pool.query(`
SELECT id, name, menu_url, website FROM dispensaries WHERE id = $1
`, [dispensaryId]);
if (dispensaryResult.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
const dispensary = dispensaryResult.rows[0];
const urlToCheck = dispensary.menu_url || dispensary.website;
if (!urlToCheck) {
return res.status(400).json({ error: 'No menu_url or website to detect from' });
}
// Detect menu type from URL patterns
let detectedType = 'unknown';
if (urlToCheck.includes('dutchie.com') || urlToCheck.includes('embedded-menu')) {
detectedType = 'dutchie';
}
else if (urlToCheck.includes('iheartjane.com') || urlToCheck.includes('jane.co')) {
detectedType = 'jane';
}
else if (urlToCheck.includes('weedmaps.com')) {
detectedType = 'weedmaps';
}
else if (urlToCheck.includes('leafly.com')) {
detectedType = 'leafly';
}
else if (urlToCheck.includes('treez.io') || urlToCheck.includes('treez.co')) {
detectedType = 'treez';
}
else if (urlToCheck.includes('meadow.com')) {
detectedType = 'meadow';
}
else if (urlToCheck.includes('blaze.me') || urlToCheck.includes('blazepay')) {
detectedType = 'blaze';
}
else if (urlToCheck.includes('flowhub.com')) {
detectedType = 'flowhub';
}
else if (urlToCheck.includes('dispense.app')) {
detectedType = 'dispense';
}
else if (urlToCheck.includes('covasoft.com')) {
detectedType = 'cova';
}
// Update menu_type
await migrate_1.pool.query(`
UPDATE dispensaries
SET menu_type = $1, updated_at = NOW()
WHERE id = $2
`, [detectedType, dispensaryId]);
res.json({
success: true,
menu_type: detectedType,
url_checked: urlToCheck,
message: `Menu type detected: ${detectedType}`
});
}
catch (error) {
console.error('Error detecting menu type:', error);
res.status(500).json({ error: 'Failed to detect menu type' });
}
});
/**
* POST /api/schedule/dispensaries/:id/refresh-detection
* Combined: detect menu_type AND resolve platform_dispensary_id if dutchie
*/
router.post('/dispensaries/:id/refresh-detection', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const dispensaryId = parseInt(req.params.id);
if (isNaN(dispensaryId)) {
return res.status(400).json({ error: 'Invalid dispensary ID' });
}
// Get dispensary info
const dispensaryResult = await migrate_1.pool.query(`
SELECT id, name, slug, menu_url, website FROM dispensaries WHERE id = $1
`, [dispensaryId]);
if (dispensaryResult.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
const dispensary = dispensaryResult.rows[0];
const urlToCheck = dispensary.menu_url || dispensary.website;
if (!urlToCheck) {
return res.status(400).json({ error: 'No menu_url or website to detect from' });
}
// Detect menu type from URL patterns
let detectedType = 'unknown';
if (urlToCheck.includes('dutchie.com') || urlToCheck.includes('embedded-menu')) {
detectedType = 'dutchie';
}
else if (urlToCheck.includes('iheartjane.com') || urlToCheck.includes('jane.co')) {
detectedType = 'jane';
}
else if (urlToCheck.includes('weedmaps.com')) {
detectedType = 'weedmaps';
}
else if (urlToCheck.includes('leafly.com')) {
detectedType = 'leafly';
}
else if (urlToCheck.includes('treez.io') || urlToCheck.includes('treez.co')) {
detectedType = 'treez';
}
else if (urlToCheck.includes('meadow.com')) {
detectedType = 'meadow';
}
else if (urlToCheck.includes('blaze.me') || urlToCheck.includes('blazepay')) {
detectedType = 'blaze';
}
else if (urlToCheck.includes('flowhub.com')) {
detectedType = 'flowhub';
}
else if (urlToCheck.includes('dispense.app')) {
detectedType = 'dispense';
}
else if (urlToCheck.includes('covasoft.com')) {
detectedType = 'cova';
}
// Update menu_type first
await migrate_1.pool.query(`
UPDATE dispensaries SET menu_type = $1, updated_at = NOW() WHERE id = $2
`, [detectedType, dispensaryId]);
let platformId = null;
// If dutchie, also try to resolve platform ID
if (detectedType === 'dutchie') {
let slugToResolve = dispensary.slug;
const match = urlToCheck.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i);
if (match) {
slugToResolve = match[1];
}
if (slugToResolve) {
try {
console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`);
platformId = await (0, graphql_client_1.resolveDispensaryId)(slugToResolve);
if (platformId) {
await migrate_1.pool.query(`
UPDATE dispensaries SET platform_dispensary_id = $1, updated_at = NOW() WHERE id = $2
`, [platformId, dispensaryId]);
}
}
catch (err) {
console.warn(`[Schedule] Failed to resolve platform ID: ${err.message}`);
}
}
}
res.json({
success: true,
menu_type: detectedType,
platform_dispensary_id: platformId,
url_checked: urlToCheck,
can_crawl: detectedType === 'dutchie' && !!platformId
});
}
catch (error) {
console.error('Error refreshing detection:', error);
res.status(500).json({ error: 'Failed to refresh detection' });
}
});
/**
* PUT /api/schedule/dispensaries/:id/toggle-active
* Enable or disable schedule for a dispensary
*/
router.put('/dispensaries/:id/toggle-active', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const dispensaryId = parseInt(req.params.id);
if (isNaN(dispensaryId)) {
return res.status(400).json({ error: 'Invalid dispensary ID' });
}
const { is_active } = req.body;
// Upsert schedule with new is_active value
const result = await migrate_1.pool.query(`
INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
VALUES ($1, $2, 240, 0)
ON CONFLICT (dispensary_id) DO UPDATE SET
is_active = $2,
updated_at = NOW()
RETURNING *
`, [dispensaryId, is_active]);
res.json({
success: true,
schedule: result.rows[0],
message: is_active ? 'Schedule enabled' : 'Schedule disabled'
});
}
catch (error) {
console.error('Error toggling schedule active status:', error);
res.status(500).json({ error: 'Failed to toggle schedule' });
}
});
/**
* DELETE /api/schedule/dispensaries/:id/schedule
* Delete schedule for a dispensary
*/
router.delete('/dispensaries/:id/schedule', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const dispensaryId = parseInt(req.params.id);
if (isNaN(dispensaryId)) {
return res.status(400).json({ error: 'Invalid dispensary ID' });
}
const result = await migrate_1.pool.query(`
DELETE FROM dispensary_crawl_schedule WHERE dispensary_id = $1 RETURNING id
`, [dispensaryId]);
const deleted = (result.rowCount ?? 0) > 0;
res.json({
success: true,
deleted,
message: deleted ? 'Schedule deleted' : 'No schedule to delete'
});
}
catch (error) {
console.error('Error deleting schedule:', error);
res.status(500).json({ error: 'Failed to delete schedule' });
}
});
exports.default = router;

View File

@@ -1,4 +1,37 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.activeScrapers = void 0;
exports.registerScraper = registerScraper;
@@ -49,32 +82,42 @@ router.get('/active/:id', async (req, res) => {
// Get scraper history (last 50 completed scrapes)
router.get('/history', async (req, res) => {
try {
const { limit = 50, store_id } = req.query;
const { limit = 50, dispensary_id } = req.query;
let query = `
SELECT
s.id as store_id,
s.name as store_name,
c.id as category_id,
c.name as category_name,
c.last_scraped_at,
d.id as dispensary_id,
COALESCE(d.dba_name, d.name) as dispensary_name,
d.city,
d.state,
dcj.id as job_id,
dcj.job_type,
dcj.status,
dcj.products_found,
dcj.products_new,
dcj.products_updated,
dcj.in_stock_count,
dcj.out_of_stock_count,
dcj.duration_ms,
dcj.completed_at as last_scraped_at,
dcj.error_message,
(
SELECT COUNT(*)
FROM products p
WHERE p.store_id = s.id
AND p.category_id = c.id
WHERE p.dispensary_id = d.id
AND p.last_seen_at >= NOW() - INTERVAL '7 days'
) as product_count
FROM stores s
LEFT JOIN categories c ON c.store_id = s.id
WHERE c.last_scraped_at IS NOT NULL
FROM dispensary_crawl_jobs dcj
JOIN dispensaries d ON d.id = dcj.dispensary_id
WHERE dcj.completed_at IS NOT NULL
`;
const params = [];
let paramCount = 1;
if (store_id) {
query += ` AND s.id = $${paramCount}`;
params.push(store_id);
if (dispensary_id) {
query += ` AND d.id = $${paramCount}`;
params.push(dispensary_id);
paramCount++;
}
query += ` ORDER BY c.last_scraped_at DESC LIMIT $${paramCount}`;
query += ` ORDER BY dcj.completed_at DESC LIMIT $${paramCount}`;
params.push(limit);
const result = await migrate_1.pool.query(query, params);
res.json({ history: result.rows });
@@ -127,4 +170,180 @@ function completeScraper(id, error) {
}, 5 * 60 * 1000);
}
}
// Dispensary crawl jobs endpoints
router.get('/jobs/stats', async (req, res) => {
try {
const { dispensary_id } = req.query;
let whereClause = '';
const params = [];
if (dispensary_id) {
whereClause = 'WHERE dispensary_id = $1';
params.push(dispensary_id);
}
const result = await migrate_1.pool.query(`
SELECT
status,
COUNT(*) as count,
SUM(products_found) as total_products_found,
SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved
FROM dispensary_crawl_jobs
${whereClause}
GROUP BY status
`, params);
const stats = {
pending: 0,
in_progress: 0,
completed: 0,
failed: 0,
total_products_found: 0,
total_products_saved: 0
};
result.rows.forEach((row) => {
stats[row.status] = parseInt(row.count);
if (row.status === 'completed') {
stats.total_products_found += parseInt(row.total_products_found || '0');
stats.total_products_saved += parseInt(row.total_products_saved || '0');
}
});
res.json(stats);
}
catch (error) {
console.error('Error fetching job stats:', error);
res.status(500).json({ error: 'Failed to fetch job stats' });
}
});
router.get('/jobs/active', async (req, res) => {
try {
const { dispensary_id } = req.query;
let whereClause = "WHERE dcj.status = 'in_progress'";
const params = [];
let paramCount = 1;
if (dispensary_id) {
whereClause += ` AND dcj.dispensary_id = $${paramCount}`;
params.push(dispensary_id);
paramCount++;
}
const result = await migrate_1.pool.query(`
SELECT
dcj.id,
dcj.dispensary_id,
COALESCE(d.dba_name, d.name) as dispensary_name,
dcj.job_type,
dcj.status,
dcj.worker_id,
dcj.started_at,
dcj.products_found,
COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
EXTRACT(EPOCH FROM (NOW() - dcj.started_at)) as duration_seconds
FROM dispensary_crawl_jobs dcj
JOIN dispensaries d ON d.id = dcj.dispensary_id
${whereClause}
ORDER BY dcj.started_at DESC
`, params);
res.json({ jobs: result.rows });
}
catch (error) {
console.error('Error fetching active jobs:', error);
res.status(500).json({ error: 'Failed to fetch active jobs' });
}
});
router.get('/jobs/recent', async (req, res) => {
try {
const { limit = 50, dispensary_id, status } = req.query;
let whereClause = '';
const params = [];
let paramCount = 1;
const conditions = [];
if (dispensary_id) {
conditions.push(`dcj.dispensary_id = $${paramCount}`);
params.push(dispensary_id);
paramCount++;
}
if (status) {
conditions.push(`dcj.status = $${paramCount}`);
params.push(status);
paramCount++;
}
if (conditions.length > 0) {
whereClause = 'WHERE ' + conditions.join(' AND ');
}
params.push(limit);
const result = await migrate_1.pool.query(`
SELECT
dcj.id,
dcj.dispensary_id,
COALESCE(d.dba_name, d.name) as dispensary_name,
dcj.job_type,
dcj.status,
dcj.worker_id,
dcj.started_at,
dcj.completed_at,
dcj.products_found,
COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
dcj.error_message,
EXTRACT(EPOCH FROM (COALESCE(dcj.completed_at, NOW()) - dcj.started_at)) as duration_seconds
FROM dispensary_crawl_jobs dcj
JOIN dispensaries d ON d.id = dcj.dispensary_id
${whereClause}
ORDER BY dcj.created_at DESC
LIMIT $${paramCount}
`, params);
res.json({ jobs: result.rows });
}
catch (error) {
console.error('Error fetching recent jobs:', error);
res.status(500).json({ error: 'Failed to fetch recent jobs' });
}
});
router.get('/jobs/workers', async (req, res) => {
try {
const { dispensary_id } = req.query;
let whereClause = "WHERE status = 'in_progress' AND worker_id IS NOT NULL";
const params = [];
if (dispensary_id) {
whereClause += ` AND dispensary_id = $1`;
params.push(dispensary_id);
}
const result = await migrate_1.pool.query(`
SELECT
worker_id,
COUNT(*) as active_jobs,
SUM(products_found) as total_products_found,
SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved,
MIN(started_at) as earliest_start,
MAX(started_at) as latest_start
FROM dispensary_crawl_jobs
${whereClause}
GROUP BY worker_id
ORDER BY worker_id
`, params);
res.json({ workers: result.rows });
}
catch (error) {
console.error('Error fetching worker stats:', error);
res.status(500).json({ error: 'Failed to fetch worker stats' });
}
});
router.get('/jobs/worker-logs/:workerId', async (req, res) => {
try {
const { workerId } = req.params;
const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
const path = await Promise.resolve().then(() => __importStar(require('path')));
const logPath = path.join('/tmp', `worker-${workerId}.log`);
try {
const logs = await fs.readFile(logPath, 'utf-8');
const lines = logs.split('\n');
// Return last 100 lines
const recentLogs = lines.slice(-100).join('\n');
res.json({ logs: recentLogs });
}
catch (fileError) {
res.json({ logs: 'No logs available for this worker yet.' });
}
}
catch (error) {
console.error('Failed to get worker logs:', error);
res.status(500).json({ error: 'Failed to get worker logs' });
}
});
exports.default = router;
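// Usage sketch (illustrative, not part of the build): polling the monitoring
// endpoints above from a Node 18+ client. The base URL, mount prefix, and
// token are placeholders; the response shapes mirror the res.json() payloads
// in the handlers.
async function pollCrawlJobs(baseUrl, token) {
    const headers = { Authorization: `Bearer ${token}` };
    const stats = await (await fetch(`${baseUrl}/jobs/stats`, { headers })).json();
    // e.g. { pending: 3, in_progress: 2, completed: 140, failed: 1,
    //        total_products_found: 52000, total_products_saved: 48700 }
    const active = await (await fetch(`${baseUrl}/jobs/active`, { headers })).json();
    // e.g. { jobs: [{ id, dispensary_id, dispensary_name, worker_id, duration_seconds, ... }] }
    return { stats, activeJobs: active.jobs };
}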

View File

@@ -60,31 +60,185 @@ router.get('/', async (req, res) => {
res.status(500).json({ error: 'Failed to fetch stores' });
}
});
// Get single store
// Freshness threshold in hours
const STALE_THRESHOLD_HOURS = 4;
function calculateFreshness(lastScrapedAt) {
if (!lastScrapedAt) {
return {
last_scraped_at: null,
is_stale: true,
freshness: 'Never scraped',
hours_since_scrape: null
};
}
const now = new Date();
const diffMs = now.getTime() - lastScrapedAt.getTime();
const diffHours = diffMs / (1000 * 60 * 60);
const isStale = diffHours > STALE_THRESHOLD_HOURS;
let freshnessText;
if (diffHours < 1) {
const mins = Math.round(diffHours * 60);
freshnessText = `${mins} minute${mins !== 1 ? 's' : ''} ago`;
}
else if (diffHours < 24) {
const hrs = Math.round(diffHours);
freshnessText = `${hrs} hour${hrs !== 1 ? 's' : ''} ago`;
}
else {
const days = Math.round(diffHours / 24);
freshnessText = `${days} day${days !== 1 ? 's' : ''} ago`;
}
return {
last_scraped_at: lastScrapedAt.toISOString(),
is_stale: isStale,
freshness: freshnessText,
hours_since_scrape: Math.round(diffHours * 10) / 10
};
}
function detectProvider(dutchieUrl) {
if (!dutchieUrl)
return 'unknown';
if (dutchieUrl.includes('dutchie.com'))
return 'Dutchie';
if (dutchieUrl.includes('iheartjane.com') || dutchieUrl.includes('jane.co'))
return 'Jane';
if (dutchieUrl.includes('treez.io'))
return 'Treez';
if (dutchieUrl.includes('weedmaps.com'))
return 'Weedmaps';
if (dutchieUrl.includes('leafly.com'))
return 'Leafly';
return 'Custom';
}
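/*
 * Illustrative examples of the two helpers above (assuming STALE_THRESHOLD_HOURS = 4):
 *
 *   calculateFreshness(new Date(Date.now() - 30 * 60 * 1000))
 *     // => { is_stale: false, freshness: '30 minutes ago', hours_since_scrape: 0.5, ... }
 *   calculateFreshness(new Date(Date.now() - 6 * 60 * 60 * 1000))
 *     // => { is_stale: true, freshness: '6 hours ago', hours_since_scrape: 6, ... }
 *   calculateFreshness(null)
 *     // => { is_stale: true, freshness: 'Never scraped', hours_since_scrape: null, ... }
 *
 *   detectProvider('https://dutchie.com/dispensary/example')   // => 'Dutchie'
 *   detectProvider('https://www.iheartjane.com/stores/1')      // => 'Jane'
 *   detectProvider('https://example.com/menu')                 // => 'Custom'
 *   detectProvider(null)                                       // => 'unknown'
 */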
// Get single store with full details
router.get('/:id', async (req, res) => {
try {
const { id } = req.params;
// Get store with counts and linked dispensary
const result = await migrate_1.pool.query(`
SELECT
SELECT
s.*,
d.id as dispensary_id,
d.name as dispensary_name,
d.slug as dispensary_slug,
d.state as dispensary_state,
d.city as dispensary_city,
d.address as dispensary_address,
d.menu_provider as dispensary_menu_provider,
COUNT(DISTINCT p.id) as product_count,
COUNT(DISTINCT c.id) as category_count
COUNT(DISTINCT c.id) as category_count,
COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = true) as in_stock_count,
COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = false) as out_of_stock_count
FROM stores s
LEFT JOIN dispensaries d ON s.dispensary_id = d.id
LEFT JOIN products p ON s.id = p.store_id
LEFT JOIN categories c ON s.id = c.store_id
WHERE s.id = $1
GROUP BY s.id
GROUP BY s.id, d.id, d.name, d.slug, d.state, d.city, d.address, d.menu_provider
`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
res.json(result.rows[0]);
const store = result.rows[0];
// Get recent crawl jobs for this store
const jobsResult = await migrate_1.pool.query(`
SELECT
id, status, job_type, trigger_type,
started_at, completed_at,
products_found, products_new, products_updated,
in_stock_count, out_of_stock_count,
error_message
FROM crawl_jobs
WHERE store_id = $1
ORDER BY created_at DESC
LIMIT 10
`, [id]);
// Get schedule info if exists
const scheduleResult = await migrate_1.pool.query(`
SELECT
enabled, interval_hours, next_run_at, last_run_at
FROM store_crawl_schedule
WHERE store_id = $1
`, [id]);
// Calculate freshness
const freshness = calculateFreshness(store.last_scraped_at);
// Detect provider from URL
const provider = detectProvider(store.dutchie_url);
// Build response
const response = {
...store,
provider,
freshness: freshness.freshness,
is_stale: freshness.is_stale,
hours_since_scrape: freshness.hours_since_scrape,
linked_dispensary: store.dispensary_id ? {
id: store.dispensary_id,
name: store.dispensary_name,
slug: store.dispensary_slug,
state: store.dispensary_state,
city: store.dispensary_city,
address: store.dispensary_address,
menu_provider: store.dispensary_menu_provider
} : null,
schedule: scheduleResult.rows[0] || null,
recent_jobs: jobsResult.rows
};
// Remove redundant dispensary fields from root
delete response.dispensary_name;
delete response.dispensary_slug;
delete response.dispensary_state;
delete response.dispensary_city;
delete response.dispensary_address;
delete response.dispensary_menu_provider;
res.json(response);
}
catch (error) {
console.error('Error fetching store:', error);
res.status(500).json({ error: 'Failed to fetch store' });
}
});
// Get store brands
router.get('/:id/brands', async (req, res) => {
try {
const { id } = req.params;
const result = await migrate_1.pool.query(`
SELECT name
FROM brands
WHERE store_id = $1
ORDER BY name
`, [id]);
const brands = result.rows.map((row) => row.name);
res.json({ brands });
}
catch (error) {
console.error('Error fetching store brands:', error);
res.status(500).json({ error: 'Failed to fetch store brands' });
}
});
// Get store specials
router.get('/:id/specials', async (req, res) => {
try {
const { id } = req.params;
const { date } = req.query;
// Use provided date or today's date
const queryDate = date || new Date().toISOString().split('T')[0];
const result = await migrate_1.pool.query(`
SELECT
s.*,
p.name as product_name,
p.image_url as product_image
FROM specials s
LEFT JOIN products p ON s.product_id = p.id
WHERE s.store_id = $1 AND s.valid_date = $2
ORDER BY s.name
`, [id, queryDate]);
res.json({ specials: result.rows, date: queryDate });
}
catch (error) {
console.error('Error fetching store specials:', error);
res.status(500).json({ error: 'Failed to fetch store specials' });
}
});
// Create store
router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
@@ -146,17 +300,18 @@ router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, r
router.post('/:id/scrape', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { id } = req.params;
const { parallel = 3 } = req.body; // Default to 3 parallel scrapers
const { parallel = 3, userAgent } = req.body; // Default to 3 parallel scrapers
const storeResult = await migrate_1.pool.query('SELECT id FROM stores WHERE id = $1', [id]);
if (storeResult.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
(0, scraper_v2_1.scrapeStore)(parseInt(id), parseInt(parallel)).catch(err => {
(0, scraper_v2_1.scrapeStore)(parseInt(id), parseInt(parallel), userAgent).catch(err => {
console.error('Background scrape error:', err);
});
res.json({
message: 'Scrape started',
parallel: parseInt(parallel)
parallel: parseInt(parallel),
userAgent: userAgent || 'random'
});
}
catch (error) {

24
backend/dist/routes/version.js vendored Normal file
View File

@@ -0,0 +1,24 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const router = (0, express_1.Router)();
/**
* GET /api/version
* Returns build version information for display in admin UI
*/
router.get('/', async (req, res) => {
try {
const versionInfo = {
build_version: process.env.APP_BUILD_VERSION || 'dev',
git_sha: process.env.APP_GIT_SHA || 'local',
build_time: process.env.APP_BUILD_TIME || new Date().toISOString(),
image_tag: process.env.CONTAINER_IMAGE_TAG || 'local',
};
res.json(versionInfo);
}
catch (error) {
console.error('Error fetching version info:', error);
res.status(500).json({ error: 'Failed to fetch version info' });
}
});
exports.default = router;
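// Example (illustrative): assuming the router is mounted at /api/version, a
// request returns the build metadata, falling back to the defaults above when
// the env vars are unset (build_time then reflects the request time):
//
//   curl -s http://localhost:3000/api/version
//   => { "build_version": "dev", "git_sha": "local",
//        "build_time": "2025-12-03T18:45:05.000Z", "image_tag": "local" }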

View File

@@ -8,15 +8,87 @@ const puppeteer_1 = __importDefault(require("puppeteer"));
const axios_1 = __importDefault(require("axios"));
const types_1 = require("./types");
const logger_1 = require("../services/logger");
// Fingerprint profiles for randomization
const SCREEN_RESOLUTIONS = [
{ width: 1920, height: 1080 },
{ width: 1366, height: 768 },
{ width: 1536, height: 864 },
{ width: 1440, height: 900 },
{ width: 1280, height: 720 },
{ width: 2560, height: 1440 },
{ width: 1680, height: 1050 },
{ width: 1600, height: 900 },
];
const TIMEZONES = [
'America/New_York',
'America/Chicago',
'America/Denver',
'America/Los_Angeles',
'America/Phoenix',
];
const LANGUAGES = [
['en-US', 'en'],
['en-US', 'en', 'es'],
['en-US'],
];
const PLATFORMS = [
'Win32',
'MacIntel',
'Linux x86_64',
];
const WEBGL_VENDORS = [
'Google Inc. (NVIDIA)',
'Google Inc. (Intel)',
'Google Inc. (AMD)',
'Intel Inc.',
'NVIDIA Corporation',
];
const WEBGL_RENDERERS = [
'ANGLE (NVIDIA GeForce GTX 1080 Direct3D11 vs_5_0 ps_5_0)',
'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)',
'ANGLE (AMD Radeon RX 580 Series Direct3D11 vs_5_0 ps_5_0)',
'Intel Iris OpenGL Engine',
'NVIDIA GeForce RTX 3070/PCIe/SSE2',
'AMD Radeon Pro 5500M OpenGL Engine',
];
function generateRandomFingerprint() {
return {
screen: SCREEN_RESOLUTIONS[Math.floor(Math.random() * SCREEN_RESOLUTIONS.length)],
timezone: TIMEZONES[Math.floor(Math.random() * TIMEZONES.length)],
languages: LANGUAGES[Math.floor(Math.random() * LANGUAGES.length)],
platform: PLATFORMS[Math.floor(Math.random() * PLATFORMS.length)],
hardwareConcurrency: [4, 8, 12, 16][Math.floor(Math.random() * 4)],
deviceMemory: [4, 8, 16, 32][Math.floor(Math.random() * 4)],
webglVendor: WEBGL_VENDORS[Math.floor(Math.random() * WEBGL_VENDORS.length)],
webglRenderer: WEBGL_RENDERERS[Math.floor(Math.random() * WEBGL_RENDERERS.length)],
};
}
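/*
 * Example of one generated fingerprint (values are drawn uniformly at random
 * from the pools above, so any combination of the listed options can occur):
 *
 *   {
 *     screen: { width: 1536, height: 864 },
 *     timezone: 'America/Chicago',
 *     languages: ['en-US', 'en'],
 *     platform: 'Win32',
 *     hardwareConcurrency: 8,
 *     deviceMemory: 16,
 *     webglVendor: 'Google Inc. (Intel)',
 *     webglRenderer: 'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)'
 *   }
 */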
class Downloader {
browser = null;
page = null;
pageInUse = false;
currentFingerprint = generateRandomFingerprint();
needsNewFingerprint = false;
/**
* Initialize browser instance (lazy initialization)
* Force new fingerprint on next browser creation
*/
async getBrowser() {
rotateFingerprint() {
this.needsNewFingerprint = true;
logger_1.logger.info('scraper', '🔄 Fingerprint rotation scheduled');
}
/**
* Initialize browser instance with fingerprint
*/
async getBrowser(forceNew = false) {
// Create new browser if needed for fingerprint rotation
if (forceNew || this.needsNewFingerprint) {
await this.close();
this.currentFingerprint = generateRandomFingerprint();
this.needsNewFingerprint = false;
logger_1.logger.info('scraper', `🎭 New fingerprint: ${this.currentFingerprint.screen.width}x${this.currentFingerprint.screen.height}, ${this.currentFingerprint.timezone}, ${this.currentFingerprint.platform}`);
}
if (!this.browser || !this.browser.isConnected()) {
const { screen } = this.currentFingerprint;
const launchOptions = {
headless: 'new',
args: [
@@ -24,9 +96,11 @@ class Downloader {
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'--window-size=1920,1080',
`--window-size=${screen.width},${screen.height}`,
'--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process'
'--disable-features=IsolateOrigins,site-per-process',
'--disable-infobars',
'--disable-extensions',
]
};
this.browser = await puppeteer_1.default.launch(launchOptions);
@@ -35,45 +109,137 @@ class Downloader {
return this.browser;
}
/**
* Get or create a page instance
* Get or create a page instance with current fingerprint
*/
async getPage() {
if (!this.page || this.page.isClosed()) {
const browser = await this.getBrowser();
async getPage(forceNew = false) {
if (!this.page || this.page.isClosed() || forceNew) {
const browser = await this.getBrowser(forceNew);
this.page = await browser.newPage();
await this.page.setViewport({ width: 1920, height: 1080 });
logger_1.logger.debug('scraper', 'New page created');
const { screen } = this.currentFingerprint;
await this.page.setViewport({
width: screen.width,
height: screen.height,
deviceScaleFactor: 1,
});
// Apply fingerprint
await this.applyFingerprint(this.page);
logger_1.logger.debug('scraper', 'New page created with fingerprint');
}
return this.page;
}
/**
* Apply stealth mode to page
* Apply full fingerprint to page
*/
async makePageStealthy(page) {
await page.evaluateOnNewDocument(() => {
// @ts-ignore - runs in browser context
async applyFingerprint(page) {
const fp = this.currentFingerprint;
await page.evaluateOnNewDocument((fingerprint) => {
// Hide webdriver
Object.defineProperty(navigator, 'webdriver', {
get: () => false,
});
// @ts-ignore - runs in browser context
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
// Spoof platform
Object.defineProperty(navigator, 'platform', {
get: () => fingerprint.platform,
});
// @ts-ignore - runs in browser context
// Spoof languages
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en'],
get: () => fingerprint.languages,
});
// @ts-ignore - runs in browser context
// Spoof hardware concurrency
Object.defineProperty(navigator, 'hardwareConcurrency', {
get: () => fingerprint.hardwareConcurrency,
});
// Spoof device memory
Object.defineProperty(navigator, 'deviceMemory', {
get: () => fingerprint.deviceMemory,
});
// Spoof plugins (realistic count)
Object.defineProperty(navigator, 'plugins', {
get: () => {
const plugins = [];
for (let i = 0; i < 5; i++) {
plugins.push({
name: `Plugin ${i}`,
filename: `plugin${i}.dll`,
description: `Description ${i}`,
});
}
plugins.length = 5;
return plugins;
},
});
// Chrome object
window.chrome = {
runtime: {},
loadTimes: () => ({}),
csi: () => ({}),
app: {},
};
// @ts-ignore - runs in browser context
// Permissions
            // Bind keeps the native receiver so the passthrough call does not throw "Illegal invocation"
            const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
            // @ts-ignore - runs in browser context
            window.navigator.permissions.query = (parameters) => parameters.name === 'notifications'
                ? Promise.resolve({ state: 'denied' })
                : originalQuery(parameters);
});
// WebGL fingerprint spoofing
const getParameterProxyHandler = {
apply: function (target, thisArg, argumentsList) {
const param = argumentsList[0];
// UNMASKED_VENDOR_WEBGL
if (param === 37445) {
return fingerprint.webglVendor;
}
// UNMASKED_RENDERER_WEBGL
if (param === 37446) {
return fingerprint.webglRenderer;
}
return Reflect.apply(target, thisArg, argumentsList);
}
};
// Override WebGL
const originalGetContext = HTMLCanvasElement.prototype.getContext;
HTMLCanvasElement.prototype.getContext = function (type, ...args) {
const context = originalGetContext.call(this, type, ...args);
if (context && (type === 'webgl' || type === 'webgl2' || type === 'experimental-webgl')) {
const glContext = context;
const originalGetParameter = glContext.getParameter.bind(glContext);
glContext.getParameter = new Proxy(originalGetParameter, getParameterProxyHandler);
}
return context;
};
// Canvas fingerprint noise
const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
            HTMLCanvasElement.prototype.toDataURL = function (...args) {
                const context = this.getContext('2d');
                if (context) {
                    const imageData = context.getImageData(0, 0, this.width, this.height);
                    for (let i = 0; i < imageData.data.length; i += 4) {
                        // Flip the low bit of the red channel on roughly half the pixels
                        imageData.data[i] = imageData.data[i] ^ (Math.random() > 0.5 ? 1 : 0);
                    }
                    context.putImageData(imageData, 0, 0);
                }
                return originalToDataURL.apply(this, args);
};
// Screen dimensions
Object.defineProperty(window.screen, 'width', { get: () => fingerprint.screen.width });
Object.defineProperty(window.screen, 'height', { get: () => fingerprint.screen.height });
Object.defineProperty(window.screen, 'availWidth', { get: () => fingerprint.screen.width });
Object.defineProperty(window.screen, 'availHeight', { get: () => fingerprint.screen.height - 40 });
Object.defineProperty(window, 'innerWidth', { get: () => fingerprint.screen.width });
Object.defineProperty(window, 'innerHeight', { get: () => fingerprint.screen.height - 140 });
Object.defineProperty(window, 'outerWidth', { get: () => fingerprint.screen.width });
Object.defineProperty(window, 'outerHeight', { get: () => fingerprint.screen.height });
}, fp);
// Set timezone via CDP
const client = await page.target().createCDPSession();
await client.send('Emulation.setTimezoneOverride', { timezoneId: fp.timezone });
}
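    /*
     * Rotation flow (sketch; the downloader instance name is illustrative):
     * callers schedule a rotation, and the next getBrowser()/getPage() call
     * relaunches Chromium with a fresh fingerprint.
     *
     *   downloader.rotateFingerprint();              // sets needsNewFingerprint
     *   const page = await downloader.getPage(true); // closes the old browser and
     *                                                // applies the new fingerprint
     */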
/**
* Apply stealth mode to page (legacy - now uses applyFingerprint)
*/
async makePageStealthy(page) {
// Now handled by applyFingerprint
await this.applyFingerprint(page);
}
/**
* Configure proxy for browser
@@ -162,17 +328,29 @@ class Downloader {
if (request.metadata.userAgent) {
await page.setUserAgent(request.metadata.userAgent);
}
// Navigate to page
// Navigate to page - use networkidle2 for SPAs like Dutchie
// Increased timeout to 90s - Dutchie pages can take 30-40s to fully load
const navigationPromise = page.goto(request.url, {
waitUntil: 'domcontentloaded',
timeout: 60000
waitUntil: 'networkidle2',
timeout: 90000
});
const response = await navigationPromise;
if (!response) {
throw new Error('Navigation failed - no response');
}
// Wait for initial render
await page.waitForTimeout(3000);
// Wait for React to render product content
// Try to wait for products, but don't fail if they don't appear (empty category)
try {
await page.waitForSelector('[data-testid="product-list-item"], [data-testid="empty-state"]', {
timeout: 10000
});
}
catch {
// Products might not exist in this category - continue anyway
logger_1.logger.debug('scraper', 'No products found within timeout - continuing');
}
// Additional wait for any lazy-loaded content
await page.waitForTimeout(2000);
// Check for lazy-loaded content
await this.autoScroll(page);
// Get page content

View File

@@ -346,7 +346,7 @@ class DutchieSpider {
catch (error) {
logger_1.logger.error('scraper', `Category scrape failed: ${error}`);
if (completeScraper) {
completeScraper(scraperId, error.toString());
completeScraper(scraperId, String(error));
}
throw error;
}
@@ -397,7 +397,28 @@ class DutchieSpider {
// @ts-ignore - runs in browser context
href = window.location.origin + href;
}
items.push({ name, price, originalPrice, href });
// Extract image URL from product card
let imageUrl = null;
const imgSelectors = [
'img[src*="images.dutchie.com"]',
'img[src*="dutchie"]',
'img[data-testid*="product"]',
'img[class*="product"]',
'img[class*="Product"]',
'picture img',
'img'
];
for (const sel of imgSelectors) {
const img = card.querySelector(sel);
if (img) {
const src = img.getAttribute('src') || img.getAttribute('data-src') || '';
if (src && (src.includes('dutchie.com') || src.includes('images.'))) {
imageUrl = src;
break;
}
}
}
items.push({ name, price, originalPrice, href, imageUrl });
}
catch (err) {
console.error('Error parsing product card:', err);
@@ -416,6 +437,7 @@ class DutchieSpider {
productName: card.name,
productPrice: card.price,
productOriginalPrice: card.originalPrice,
productImageUrl: card.imageUrl, // Pass image from category page
requiresBrowser: true
},
callback: this.parseProductPage.bind(this)
@@ -436,20 +458,26 @@ class DutchieSpider {
const details = await page.evaluate(() => {
// @ts-ignore - runs in browser context
const allText = document.body.textContent || '';
// Extract image
// Extract image - expanded selectors for better coverage
let fullSizeImage = null;
const mainImageSelectors = [
'img[src*="images.dutchie.com"]',
'img[src*="dutchie"]',
'img[class*="ProductImage"]',
'img[class*="product-image"]',
'img[class*="Product"]',
'[class*="ImageGallery"] img',
'main img',
'img[src*="images.dutchie.com"]'
'[data-testid*="product"] img',
'[data-testid*="image"] img',
'picture img',
'main img'
];
for (const sel of mainImageSelectors) {
// @ts-ignore - runs in browser context
const img = document.querySelector(sel);
if (img?.src && img.src.includes('dutchie.com')) {
fullSizeImage = img.src;
const src = img?.src || img?.getAttribute('data-src') || '';
if (src && (src.includes('dutchie.com') || src.includes('images.'))) {
fullSizeImage = src;
break;
}
}
@@ -546,6 +574,8 @@ class DutchieSpider {
};
});
// Create product item
// Use image from product page, fallback to category page image
const imageUrl = details.fullSizeImage || response.request.metadata.productImageUrl || undefined;
const product = {
dutchieProductId: `${response.request.metadata.storeSlug}-${response.request.metadata.categorySlug}-${Date.now()}-${Math.random()}`,
name: productName || 'Unknown Product',
@@ -556,7 +586,7 @@ class DutchieSpider {
cbdPercentage: details.cbd || undefined,
strainType: details.strainType || undefined,
brand: details.brand || undefined,
imageUrl: details.fullSizeImage || undefined,
imageUrl: imageUrl,
dutchieUrl: response.url,
metadata: {
terpenes: details.terpenes,
@@ -573,6 +603,17 @@ class DutchieSpider {
async scrapeStore(storeId, parallel = 3) {
logger_1.logger.info('scraper', `🏪 Starting store scrape: ${storeId} (${parallel} parallel scrapers)`);
try {
// Check if categories exist, if not, discover them first
const categoryCountResult = await migrate_1.pool.query(`
SELECT COUNT(*) as count
FROM categories
WHERE store_id = $1
`, [storeId]);
if (parseInt(categoryCountResult.rows[0].count) === 0) {
logger_1.logger.info('scraper', 'No categories found - running discovery first');
const { discoverCategories } = await Promise.resolve().then(() => __importStar(require('./index')));
await discoverCategories(storeId);
}
// Get all leaf categories (no children)
const categoriesResult = await migrate_1.pool.query(`
SELECT c.id, c.name

View File

@@ -2,6 +2,13 @@
/**
* Scraper V2 - Scrapy-inspired web scraping framework
*
* IMPORTANT: For Dutchie stores, DO NOT USE scrapeStore() from this module.
* Dutchie crawling must go through the dutchie-az GraphQL pipeline:
* src/dutchie-az/services/product-crawler.ts
*
* This scraper-v2 module uses DOM-based extraction which is unreliable
* for Dutchie. The new dutchie-az pipeline uses GraphQL directly.
*
* Architecture:
* - Engine: Main orchestrator
* - Scheduler: Priority queue with deduplication
@@ -77,7 +84,7 @@ async function scrapeCategory(storeId, categoryId) {
/**
* Scrape an entire store
*/
async function scrapeStore(storeId, parallel = 3) {
async function scrapeStore(storeId, parallel = 3, _userAgent) {
const engine = new engine_2.ScraperEngine(1);
const spider = new engine_2.DutchieSpider(engine);
try {

View File

@@ -3,13 +3,31 @@ Object.defineProperty(exports, "__esModule", { value: true });
exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
const types_1 = require("./types");
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
// Diverse, realistic user agents - updated for 2024/2025
const USER_AGENTS = [
// Chrome on Windows (most common)
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
// Chrome on Mac
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
// Chrome on Linux
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
// Firefox
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',
// Safari
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
// Edge
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
];
function getRandomUserAgent() {
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
@@ -18,55 +36,100 @@ function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
/**
* User Agent Rotation Middleware
* User Agent Rotation Middleware - rotates UA on each request for better evasion
*/
class UserAgentMiddleware {
name = 'UserAgentMiddleware';
priority = 100;
lastUserAgent = null;
async processRequest(request) {
if (!request.metadata.userAgent) {
request.metadata.userAgent = getRandomUserAgent();
// Always rotate UA on retries or bot detection
const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
if (!request.metadata.userAgent || forceRotation) {
// Get a different UA than the last one used
let newUA = getRandomUserAgent();
let attempts = 0;
while (newUA === this.lastUserAgent && attempts < 5) {
newUA = getRandomUserAgent();
attempts++;
}
request.metadata.userAgent = newUA;
this.lastUserAgent = newUA;
if (forceRotation) {
logger_1.logger.debug('scraper', `🔄 Rotated User-Agent: ${newUA.substring(0, 50)}...`);
}
}
return request;
}
}
exports.UserAgentMiddleware = UserAgentMiddleware;
// Domains that should skip proxy (datacenter IPs are blocked)
const PROXY_SKIP_DOMAINS = [
'dutchie.com',
];
function shouldSkipProxy(url) {
try {
const urlObj = new URL(url);
return PROXY_SKIP_DOMAINS.some(domain => urlObj.hostname.includes(domain));
}
catch {
return false;
}
}
/**
* Proxy Rotation Middleware
* Proxy Rotation Middleware - uses the central proxy service with timeout handling
*/
class ProxyMiddleware {
name = 'ProxyMiddleware';
priority = 90;
async getActiveProxy() {
try {
const result = await migrate_1.pool.query(`
SELECT host, port, protocol, username, password
FROM proxies
WHERE active = true AND is_anonymous = true
ORDER BY RANDOM()
LIMIT 1
`);
if (result.rows.length === 0) {
return null;
}
return result.rows[0];
}
catch (error) {
logger_1.logger.error('scraper', `Failed to get proxy: ${error}`);
return null;
}
}
currentProxyId = null;
async processRequest(request) {
// Only add proxy if not already set
if (!request.metadata.proxy && request.retryCount > 0) {
// Use proxy on retries
request.metadata.proxy = await this.getActiveProxy();
if (request.metadata.proxy) {
logger_1.logger.debug('scraper', `Using proxy for retry: ${request.metadata.proxy.host}:${request.metadata.proxy.port}`);
// Skip proxy for domains that block datacenter IPs
if (shouldSkipProxy(request.url)) {
logger_1.logger.info('scraper', `⏭️ Skipping proxy for ${new URL(request.url).hostname} (datacenter IPs blocked)`);
return request;
}
// Always try to use a proxy from the central proxy service
// The service handles bot detection timeouts automatically
const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
if (!request.metadata.proxy || forceRotation) {
// Get proxy from central service - it handles timeouts automatically
const proxy = await (0, proxy_1.getActiveProxy)();
if (proxy) {
request.metadata.proxy = {
host: proxy.host,
port: proxy.port,
protocol: proxy.protocol,
username: proxy.username,
password: proxy.password,
};
request.metadata.proxyId = proxy.id;
this.currentProxyId = proxy.id;
const reason = forceRotation ? 'rotation' : 'initial';
logger_1.logger.info('scraper', `🔄 Using proxy (${reason}): ${proxy.protocol}://${proxy.host}:${proxy.port}`);
}
else {
logger_1.logger.warn('scraper', '⚠️ No proxy available - running without proxy');
}
}
return request;
}
async processResponse(response) {
// If bot detection was triggered, put the proxy in timeout
if (response.request.metadata.botDetected && response.request.metadata.proxyId) {
(0, proxy_1.putProxyInTimeout)(response.request.metadata.proxyId, 'Bot detection triggered');
logger_1.logger.info('scraper', `🚫 Proxy ${response.request.metadata.proxyId} put in timeout due to bot detection`);
}
return response;
}
async processError(error, request) {
// If bot detection error, put proxy in timeout
if ((0, proxy_1.isBotDetectionError)(error.message) && request.metadata.proxyId) {
(0, proxy_1.putProxyInTimeout)(request.metadata.proxyId, error.message);
logger_1.logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`);
}
return error;
}
}
exports.ProxyMiddleware = ProxyMiddleware;
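/*
 * Minimal sketch of exercising these middlewares directly (the request object
 * shape is an assumption based on the fields read above):
 *
 *   const ua = new UserAgentMiddleware();
 *   const req = { url: 'https://example.com/menu', retryCount: 0, metadata: {} };
 *   await ua.processRequest(req);
 *   // req.metadata.userAgent is now one of USER_AGENTS; a retry (retryCount > 0)
 *   // or metadata.botDetected = true forces a different UA on the next pass.
 *
 *   const proxy = new ProxyMiddleware();
 *   await proxy.processRequest(req);
 *   // Attaches { proxy, proxyId } from getActiveProxy(), unless the URL matches
 *   // PROXY_SKIP_DOMAINS (e.g. dutchie.com), in which case req is returned unchanged.
 */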
/**
@@ -165,13 +228,15 @@ class RetryMiddleware {
}
exports.RetryMiddleware = RetryMiddleware;
/**
* Bot Detection Middleware
* Bot Detection Middleware - detects bot blocking and triggers fingerprint rotation
*/
class BotDetectionMiddleware {
name = 'BotDetectionMiddleware';
priority = 60;
detectedCount = 0;
DETECTION_THRESHOLD = 3;
// Export for use by other middlewares
static shouldRotateFingerprint = false;
async processResponse(response) {
const content = typeof response.content === 'string'
? response.content
@@ -183,14 +248,24 @@ class BotDetectionMiddleware {
/access denied/i,
/you have been blocked/i,
/unusual traffic/i,
/robot/i
/robot/i,
/verify.*human/i,
/security check/i,
/please wait/i,
/checking your browser/i,
/ray id/i
];
const detected = botIndicators.some(pattern => pattern.test(content));
if (detected) {
this.detectedCount++;
BotDetectionMiddleware.shouldRotateFingerprint = true;
logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
logger_1.logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request');
// Mark the request for rotation on retry
response.request.metadata.botDetected = true;
response.request.metadata.needsNewBrowser = true;
if (this.detectedCount >= this.DETECTION_THRESHOLD) {
const error = new Error('Bot detection threshold reached');
const error = new Error('Bot detection threshold reached - rotating fingerprint');
error.type = types_1.ErrorType.BOT_DETECTION;
error.retryable = true;
error.request = response.request;
@@ -200,9 +275,22 @@ class BotDetectionMiddleware {
else {
// Gradually decrease detection count on successful requests
this.detectedCount = Math.max(0, this.detectedCount - 0.5);
BotDetectionMiddleware.shouldRotateFingerprint = false;
}
return response;
}
async processError(error, request) {
// If bot detection error, flag for rotation and allow retry
if ('type' in error && error.type === types_1.ErrorType.BOT_DETECTION) {
request.metadata.botDetected = true;
request.metadata.needsNewBrowser = true;
logger_1.logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry');
// Add delay before retry to avoid rate limiting
await sleep(5000 + Math.random() * 5000);
return null; // Return null to trigger retry
}
return error;
}
}
exports.BotDetectionMiddleware = BotDetectionMiddleware;
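/*
 * Detection sketch: a Cloudflare-style interstitial matches several of the
 * patterns above and flags the request for rotation.
 *
 *   response.content = '<html>Checking your browser before accessing ... Ray ID: 7d2a</html>'
 *   await botDetection.processResponse(response);
 *   // detectedCount += 1
 *   // response.request.metadata.botDetected = true
 *   // response.request.metadata.needsNewBrowser = true
 *
 * After 3 such hits (DETECTION_THRESHOLD) a retryable BOT_DETECTION error is
 * raised; processError() then waits 5-10 s and returns null so the engine
 * retries the request with a rotated proxy, user agent, and fingerprint.
 */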
/**

View File

@@ -4,6 +4,7 @@ exports.PipelineEngine = exports.StatsPipeline = exports.DatabasePipeline = expo
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
const minio_1 = require("../utils/minio");
const product_normalizer_1 = require("../utils/product-normalizer");
/**
* Validation Pipeline - ensures data quality
*/
@@ -138,82 +139,182 @@ class ImagePipeline {
}
exports.ImagePipeline = ImagePipeline;
/**
* Database Pipeline - saves items to database
* Generate a URL-safe slug from a product name
*/
function generateSlug(name) {
return name
.toLowerCase()
.replace(/[^a-z0-9]+/g, '-')
.replace(/^-+|-+$/g, '')
.substring(0, 400);
}
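/*
 * Examples (product names are placeholders):
 *   generateSlug('Blue Dream 3.5g | Indoor')  // => 'blue-dream-3-5g-indoor'
 *   generateSlug('  OG Kush #4!!! ')          // => 'og-kush-4'
 */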
/**
* Database Pipeline - saves items to database with improved matching
*
* MATCHING PRIORITY:
* 1. external_id (dutchie_product_id) - exact match
* 2. normalized name + brand + category - strong match
* 3. normalized name + category - weak match (same product, different/missing brand)
*
* ALWAYS creates a snapshot after upsert for historical tracking.
*/
class DatabasePipeline {
name = 'DatabasePipeline';
priority = 10; // Low priority - runs last
crawlId = null;
setCrawlId(id) {
this.crawlId = id;
}
async process(item, spider) {
const client = await migrate_1.pool.connect();
try {
// Extract store and category from metadata (set by spider)
const storeId = item.storeId;
const categoryId = item.categoryId;
const dispensaryId = item.dispensaryId;
const categoryName = item.categoryName;
// Generate normalized values for matching
const nameNormalized = (0, product_normalizer_1.normalizeProductName)(item.name);
const brandNormalized = (0, product_normalizer_1.normalizeBrandName)(item.brand);
const slug = generateSlug(item.name);
const externalId = item.dutchieProductId || null;
if (!storeId || !categoryId) {
logger_1.logger.error('pipeline', `Missing storeId or categoryId for ${item.name}`);
return null;
}
// Check if product exists
const existingResult = await client.query(`
SELECT id, image_url, local_image_path
FROM products
WHERE store_id = $1 AND name = $2 AND category_id = $3
`, [storeId, item.name, categoryId]);
let productId = null;
let localImagePath = null;
let productId;
if (existingResult.rows.length > 0) {
let isNewProduct = false;
// STEP 1: Try to match by external_id (most reliable)
if (externalId) {
const extMatch = await client.query(`
SELECT id, image_url, local_image_path
FROM products
WHERE store_id = $1 AND (external_id = $2 OR dutchie_product_id = $2)
`, [storeId, externalId]);
if (extMatch.rows.length > 0) {
productId = extMatch.rows[0].id;
localImagePath = extMatch.rows[0].local_image_path;
logger_1.logger.debug('pipeline', `Matched by external_id: ${item.name}`);
}
}
// STEP 2: Try to match by normalized name + brand + category
if (!productId) {
const normMatch = await client.query(`
SELECT id, image_url, local_image_path
FROM products
WHERE store_id = $1
AND name_normalized = $2
AND brand_normalized = $3
AND category_id = $4
`, [storeId, nameNormalized, brandNormalized, categoryId]);
if (normMatch.rows.length > 0) {
productId = normMatch.rows[0].id;
localImagePath = normMatch.rows[0].local_image_path;
logger_1.logger.debug('pipeline', `Matched by normalized name+brand+category: ${item.name}`);
}
}
// STEP 3: Fallback to normalized name + category only (weaker match)
if (!productId) {
const weakMatch = await client.query(`
SELECT id, image_url, local_image_path
FROM products
WHERE store_id = $1
AND name_normalized = $2
AND category_id = $3
LIMIT 1
`, [storeId, nameNormalized, categoryId]);
if (weakMatch.rows.length === 1) {
productId = weakMatch.rows[0].id;
localImagePath = weakMatch.rows[0].local_image_path;
logger_1.logger.debug('pipeline', `Matched by normalized name+category: ${item.name}`);
}
}
// STEP 4: Final fallback - exact name match (legacy compatibility)
if (!productId) {
const exactMatch = await client.query(`
SELECT id, image_url, local_image_path
FROM products
WHERE store_id = $1 AND name = $2 AND category_id = $3
`, [storeId, item.name, categoryId]);
if (exactMatch.rows.length > 0) {
productId = exactMatch.rows[0].id;
localImagePath = exactMatch.rows[0].local_image_path;
logger_1.logger.debug('pipeline', `Matched by exact name: ${item.name}`);
}
}
// UPDATE or INSERT
if (productId) {
// Update existing product
productId = existingResult.rows[0].id;
localImagePath = existingResult.rows[0].local_image_path;
await client.query(`
UPDATE products
SET name = $1, description = $2, price = $3,
strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
brand = $7, weight = $8, image_url = $9, dutchie_url = $10,
brand = $7, weight = $8, image_url = COALESCE($9, image_url), dutchie_url = $10,
in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
updated_at = CURRENT_TIMESTAMP, dispensary_id = $13, slug = COALESCE(slug, $14),
name_normalized = $15, brand_normalized = $16,
external_id = COALESCE(external_id, $17), source_platform = COALESCE(source_platform, 'dutchie')
WHERE id = $12
`, [
item.name, item.description, item.price,
item.strainType, item.thcPercentage, item.cbdPercentage,
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
JSON.stringify(item.metadata || {}), productId
JSON.stringify(item.metadata || {}), productId, dispensaryId, slug,
nameNormalized, brandNormalized, externalId
]);
logger_1.logger.debug('pipeline', `Updated product: ${item.name}`);
}
else {
// Insert new product
isNewProduct = true;
const insertResult = await client.query(`
INSERT INTO products (
store_id, category_id, dutchie_product_id, name, description,
store_id, category_id, dispensary_id, dutchie_product_id, external_id,
slug, name, name_normalized, description,
price, strain_type, thc_percentage, cbd_percentage,
brand, weight, image_url, dutchie_url, in_stock, metadata
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, true, $14)
brand, brand_normalized, weight, image_url, dutchie_url, in_stock, metadata,
source_platform
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, true, $19, 'dutchie')
RETURNING id
`, [
storeId, categoryId, item.dutchieProductId, item.name, item.description,
storeId, categoryId, dispensaryId, externalId, externalId,
slug, item.name, nameNormalized, item.description,
item.price, item.strainType, item.thcPercentage, item.cbdPercentage,
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
item.brand, brandNormalized, item.weight, item.imageUrl, item.dutchieUrl,
JSON.stringify(item.metadata || {})
]);
productId = insertResult.rows[0].id;
logger_1.logger.debug('pipeline', `Inserted new product: ${item.name}`);
logger_1.logger.debug('pipeline', `Inserted NEW product: ${item.name}`);
}
// Download image if needed
if (item.imageUrl && !localImagePath) {
// ALWAYS create a snapshot for historical tracking
await this.createSnapshot(client, {
productId: productId,
dispensaryId,
externalId,
slug,
item,
categoryName
});
// Download image if needed (only for new products or missing local image)
if (item.imageUrl && !localImagePath && productId) {
try {
localImagePath = await (0, minio_1.uploadImageFromUrl)(item.imageUrl, productId);
const storeResult = await client.query('SELECT slug FROM stores WHERE id = $1', [storeId]);
const storeSlug = storeResult.rows[0]?.slug || undefined;
const imageSizes = await (0, minio_1.uploadImageFromUrl)(item.imageUrl, productId, storeSlug);
localImagePath = imageSizes.thumbnail;
await client.query(`
UPDATE products
SET local_image_path = $1
WHERE id = $2
`, [localImagePath, productId]);
UPDATE products SET local_image_path = $1 WHERE id = $2
`, [imageSizes.thumbnail, productId]);
logger_1.logger.debug('pipeline', `Downloaded image for: ${item.name}`);
}
catch (error) {
logger_1.logger.error('pipeline', `Failed to download image for ${item.name}: ${error}`);
}
}
// Attach metadata for stats tracking
item.isNewProduct = isNewProduct;
item.productId = productId;
return item;
}
catch (error) {
@@ -224,6 +325,64 @@ class DatabasePipeline {
client.release();
}
}
/**
* Create a snapshot record for historical tracking
*/
async createSnapshot(client, params) {
try {
// Only create snapshots if the table exists (graceful degradation)
const tableExists = await client.query(`
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'product_snapshots'
)
`);
if (!tableExists.rows[0].exists) {
return; // Snapshot table not yet created
}
            // Require Node's crypto module explicitly; the global webcrypto is not available on older Node versions
            const crawlId = this.crawlId || require('crypto').randomUUID();
const { productId, dispensaryId, externalId, slug, item, categoryName } = params;
await client.query(`
INSERT INTO product_snapshots (
crawl_id, dispensary_id, external_product_id, product_slug,
name, brand, category, price, original_price, sale_price,
discount_type, discount_value, availability_status, stock_quantity,
thc_percentage, cbd_percentage, strain_type, weight, variant,
description, image_url, effects, terpenes, captured_at
) VALUES (
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, NOW()
)
`, [
crawlId,
dispensaryId,
externalId,
slug,
item.name,
item.brand || null,
categoryName || null,
item.price || null,
item.originalPrice || null,
item.metadata?.salePrice || null,
item.metadata?.discountType || null,
item.metadata?.discountValue || null,
'in_stock', // availability_status - if we scraped it, it's in stock
item.metadata?.stockQuantity || null,
item.thcPercentage || null,
item.cbdPercentage || null,
item.strainType || null,
item.weight || null,
item.metadata?.variant || null,
item.description || null,
item.imageUrl || null,
item.metadata?.effects || null,
item.metadata?.terpenes || null
]);
}
catch (error) {
// Don't fail the whole pipeline if snapshot creation fails
logger_1.logger.warn('pipeline', `Failed to create snapshot for ${params.item.name}: ${error}`);
}
}
}
exports.DatabasePipeline = DatabasePipeline;
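/*
 * Wiring sketch (illustrative; the item fields mirror what process() reads
 * above, and all values are placeholders):
 *
 *   const pipeline = new DatabasePipeline();
 *   pipeline.setCrawlId('<crawl-uuid>');   // one crawl_id per run groups its snapshots
 *   await pipeline.process({
 *       storeId: 12, categoryId: 3, dispensaryId: 45, categoryName: 'Flower',
 *       dutchieProductId: 'abc123', name: 'Blue Dream 3.5g', brand: 'Example Farms',
 *       price: 25, thcPercentage: 22.4, imageUrl: 'https://images.dutchie.com/example.png',
 *       metadata: {},
 *   }, spider);
 *   // Matching order: external_id, then normalized name+brand+category, then
 *   // normalized name+category, then exact name. The row is upserted and a
 *   // snapshot is always written (when product_snapshots exists).
 */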
/**

View File

@@ -0,0 +1,360 @@
"use strict";
// ============================================================================
// DEPRECATED: This scraper writes to the LEGACY products table.
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
//
// New pipeline location: src/dutchie-az/services/product-crawler.ts
// - Uses fetch-based GraphQL (no Puppeteer needed)
// - Writes to isolated dutchie_az_* tables with snapshot model
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
// ============================================================================
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fetchAllDutchieProducts = fetchAllDutchieProducts;
exports.upsertProductsDirect = upsertProductsDirect;
exports.scrapeAllDutchieProducts = scrapeAllDutchieProducts;
/**
* @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
* This scraper writes to the legacy products table, not the new dutchie_az tables.
*
* Makes direct GraphQL requests from within the browser context to:
* 1. Bypass Cloudflare (using browser session)
* 2. Fetch ALL products including out-of-stock (Status: null)
* 3. Paginate through complete menu
*/
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const dutchie_graphql_1 = require("./dutchie-graphql");
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
// GraphQL persisted query hashes
const GRAPHQL_HASHES = {
FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
};
/**
* Fetch all products via in-page GraphQL requests
* This includes both in-stock and out-of-stock items
*/
async function fetchAllDutchieProducts(menuUrl, options = {}) {
const { headless = 'new', timeout = 90000, perPage = 100, includeOutOfStock = true, } = options;
let browser;
try {
browser = await puppeteer_extra_1.default.launch({
headless,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
],
});
const page = await browser.newPage();
// Stealth configuration
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
await page.setViewport({ width: 1920, height: 1080 });
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
window.chrome = { runtime: {} };
});
// Navigate to menu page to establish session
console.log('[DutchieGraphQL] Loading menu page to establish session...');
await page.goto(menuUrl, {
waitUntil: 'networkidle2',
timeout,
});
// Get dispensary ID from page
const dispensaryId = await page.evaluate(() => {
const env = window.reactEnv;
return env?.dispensaryId || env?.retailerId || '';
});
if (!dispensaryId) {
throw new Error('Could not determine dispensaryId from page');
}
console.log(`[DutchieGraphQL] Dispensary ID: ${dispensaryId}`);
// Fetch all products via in-page GraphQL requests
const allProducts = [];
let page_num = 0;
let hasMore = true;
while (hasMore) {
console.log(`[DutchieGraphQL] Fetching page ${page_num} (perPage=${perPage})...`);
const result = await page.evaluate(async (dispensaryId, page_num, perPage, includeOutOfStock, hash) => {
const variables = {
includeEnterpriseSpecials: false,
productsFilter: {
dispensaryId,
pricingType: 'rec',
Status: includeOutOfStock ? null : 'Active', // null = include out-of-stock
types: [],
useCache: false, // Don't cache to get fresh data
isDefaultSort: true,
sortBy: 'popularSortIdx',
sortDirection: 1,
bypassOnlineThresholds: true,
isKioskMenu: false,
removeProductsBelowOptionThresholds: false,
},
page: page_num,
perPage,
};
const qs = new URLSearchParams({
operationName: 'FilteredProducts',
variables: JSON.stringify(variables),
extensions: JSON.stringify({
persistedQuery: { version: 1, sha256Hash: hash },
}),
});
const response = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
method: 'GET',
headers: {
'content-type': 'application/json',
'apollographql-client-name': 'Marketplace (production)',
},
credentials: 'include', // Include cookies/session
});
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
return response.json();
}, dispensaryId, page_num, perPage, includeOutOfStock, GRAPHQL_HASHES.FilteredProducts);
if (result.errors) {
console.error('[DutchieGraphQL] GraphQL errors:', result.errors);
break;
}
const products = result?.data?.filteredProducts?.products || [];
console.log(`[DutchieGraphQL] Page ${page_num}: ${products.length} products`);
if (products.length === 0) {
hasMore = false;
}
else {
allProducts.push(...products);
page_num++;
// Safety limit
if (page_num > 50) {
console.log('[DutchieGraphQL] Reached page limit, stopping');
hasMore = false;
}
}
}
// Count active vs inactive
const activeCount = allProducts.filter((p) => p.Status === 'Active').length;
const inactiveCount = allProducts.filter((p) => p.Status !== 'Active').length;
console.log(`[DutchieGraphQL] Total: ${allProducts.length} products (${activeCount} active, ${inactiveCount} inactive)`);
return {
products: allProducts,
dispensaryId,
totalProducts: allProducts.length,
activeCount,
inactiveCount,
};
}
finally {
if (browser) {
await browser.close();
}
}
}
/**
* Upsert products to database
*/
async function upsertProductsDirect(pool, storeId, products) {
const client = await pool.connect();
let inserted = 0;
let updated = 0;
try {
await client.query('BEGIN');
for (const product of products) {
const result = await client.query(`
INSERT INTO products (
store_id, external_id, slug, name, enterprise_product_id,
brand, brand_external_id, brand_logo_url,
subcategory, strain_type, canonical_category,
price, rec_price, med_price, rec_special_price, med_special_price,
is_on_special, special_name, discount_percent, special_data,
sku, inventory_quantity, inventory_available, is_below_threshold, status,
thc_percentage, cbd_percentage, cannabinoids,
weight_mg, net_weight_value, net_weight_unit, options, raw_options,
image_url, additional_images,
is_featured, medical_only, rec_only,
source_created_at, source_updated_at,
description, raw_data,
dutchie_url, last_seen_at, updated_at
)
VALUES (
$1, $2, $3, $4, $5,
$6, $7, $8,
$9, $10, $11,
$12, $13, $14, $15, $16,
$17, $18, $19, $20,
$21, $22, $23, $24, $25,
$26, $27, $28,
$29, $30, $31, $32, $33,
$34, $35,
$36, $37, $38,
$39, $40,
$41, $42,
'', NOW(), NOW()
)
ON CONFLICT (store_id, slug) DO UPDATE SET
name = EXCLUDED.name,
enterprise_product_id = EXCLUDED.enterprise_product_id,
brand = EXCLUDED.brand,
brand_external_id = EXCLUDED.brand_external_id,
brand_logo_url = EXCLUDED.brand_logo_url,
subcategory = EXCLUDED.subcategory,
strain_type = EXCLUDED.strain_type,
canonical_category = EXCLUDED.canonical_category,
price = EXCLUDED.price,
rec_price = EXCLUDED.rec_price,
med_price = EXCLUDED.med_price,
rec_special_price = EXCLUDED.rec_special_price,
med_special_price = EXCLUDED.med_special_price,
is_on_special = EXCLUDED.is_on_special,
special_name = EXCLUDED.special_name,
discount_percent = EXCLUDED.discount_percent,
special_data = EXCLUDED.special_data,
sku = EXCLUDED.sku,
inventory_quantity = EXCLUDED.inventory_quantity,
inventory_available = EXCLUDED.inventory_available,
is_below_threshold = EXCLUDED.is_below_threshold,
status = EXCLUDED.status,
thc_percentage = EXCLUDED.thc_percentage,
cbd_percentage = EXCLUDED.cbd_percentage,
cannabinoids = EXCLUDED.cannabinoids,
weight_mg = EXCLUDED.weight_mg,
net_weight_value = EXCLUDED.net_weight_value,
net_weight_unit = EXCLUDED.net_weight_unit,
options = EXCLUDED.options,
raw_options = EXCLUDED.raw_options,
image_url = EXCLUDED.image_url,
additional_images = EXCLUDED.additional_images,
is_featured = EXCLUDED.is_featured,
medical_only = EXCLUDED.medical_only,
rec_only = EXCLUDED.rec_only,
source_created_at = EXCLUDED.source_created_at,
source_updated_at = EXCLUDED.source_updated_at,
description = EXCLUDED.description,
raw_data = EXCLUDED.raw_data,
last_seen_at = NOW(),
updated_at = NOW()
RETURNING (xmax = 0) AS was_inserted
`, [
storeId,
product.external_id,
product.slug,
product.name,
product.enterprise_product_id,
product.brand,
product.brand_external_id,
product.brand_logo_url,
product.subcategory,
product.strain_type,
product.canonical_category,
product.price,
product.rec_price,
product.med_price,
product.rec_special_price,
product.med_special_price,
product.is_on_special,
product.special_name,
product.discount_percent,
product.special_data ? JSON.stringify(product.special_data) : null,
product.sku,
product.inventory_quantity,
product.inventory_available,
product.is_below_threshold,
product.status,
product.thc_percentage,
product.cbd_percentage,
product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
product.weight_mg,
product.net_weight_value,
product.net_weight_unit,
product.options,
product.raw_options,
product.image_url,
product.additional_images,
product.is_featured,
product.medical_only,
product.rec_only,
product.source_created_at,
product.source_updated_at,
product.description,
product.raw_data ? JSON.stringify(product.raw_data) : null,
]);
if (result.rows[0]?.was_inserted) {
inserted++;
}
else {
updated++;
}
}
await client.query('COMMIT');
return { inserted, updated };
}
catch (error) {
await client.query('ROLLBACK');
throw error;
}
finally {
client.release();
}
}
/**
* @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
* This function is disabled and will throw an error if called.
* Main entry point - scrape all products including out-of-stock
*/
async function scrapeAllDutchieProducts(pool, storeId, menuUrl) {
// DEPRECATED: Throw error to prevent accidental use
throw new Error('DEPRECATED: scrapeAllDutchieProducts() is deprecated. ' +
'Use src/dutchie-az/services/product-crawler.ts instead. ' +
'This scraper writes to the legacy products table.');
// Original code below is unreachable but kept for reference
try {
console.log(`[DutchieGraphQL] Scraping ALL products (including out-of-stock): ${menuUrl}`);
// Fetch all products via direct GraphQL
const { products, totalProducts, activeCount, inactiveCount } = await fetchAllDutchieProducts(menuUrl, {
includeOutOfStock: true,
perPage: 100,
});
if (products.length === 0) {
return {
success: false,
totalProducts: 0,
activeCount: 0,
inactiveCount: 0,
inserted: 0,
updated: 0,
error: 'No products returned from GraphQL',
};
}
// Normalize products
const normalized = products.map(dutchie_graphql_1.normalizeDutchieProduct);
// Upsert to database
const { inserted, updated } = await upsertProductsDirect(pool, storeId, normalized);
console.log(`[DutchieGraphQL] Complete: ${totalProducts} products (${activeCount} active, ${inactiveCount} inactive)`);
console.log(`[DutchieGraphQL] Database: ${inserted} inserted, ${updated} updated`);
return {
success: true,
totalProducts,
activeCount,
inactiveCount,
inserted,
updated,
};
}
catch (error) {
console.error(`[DutchieGraphQL] Error:`, error.message);
return {
success: false,
totalProducts: 0,
activeCount: 0,
inactiveCount: 0,
inserted: 0,
updated: 0,
error: error.message,
};
}
}

backend/dist/scrapers/dutchie-graphql.js vendored Normal file
View File

@@ -0,0 +1,446 @@
"use strict";
// ============================================================================
// DEPRECATED: This scraper writes to the LEGACY products table.
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
//
// New pipeline location: src/dutchie-az/services/product-crawler.ts
// - Uses fetch-based GraphQL (no Puppeteer needed)
// - Writes to isolated dutchie_az_* tables with snapshot model
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
//
// The normalizer functions in this file (normalizeDutchieProduct) may still
// be imported for reference, but do NOT call scrapeDutchieMenu() or upsertProducts().
// ============================================================================
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeDutchieProduct = normalizeDutchieProduct;
exports.fetchDutchieMenuViaPuppeteer = fetchDutchieMenuViaPuppeteer;
exports.upsertProducts = upsertProducts;
exports.scrapeDutchieMenu = scrapeDutchieMenu;
/**
* @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
* This scraper writes to the legacy products table, not the new dutchie_az tables.
*
* Fetches product data via Puppeteer interception of Dutchie's GraphQL API.
* This bypasses Cloudflare by using a real browser to load the menu page.
*
* GraphQL Operations:
* - FilteredProducts: Returns paginated product list with full details
* - GetAddressBasedDispensaryData: Resolves dispensary cName to dispensaryId
*/
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
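// Reference-only usage sketch (this module is deprecated; the call chain below is
// the same one scrapeDutchieMenu() wires together further down — `pool`, `storeId`,
// and the menu URL are hypothetical placeholders):
//   const { products } = await fetchDutchieMenuViaPuppeteer('https://dutchie.com/dispensary/example-dispensary');
//   const normalized = products.map(normalizeDutchieProduct);
//   const { inserted, updated } = await upsertProducts(pool, storeId, normalized);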
// =====================================================
// NORMALIZER: Dutchie GraphQL → DB Schema
// =====================================================
function normalizeDutchieProduct(product) {
// Extract first special if exists
const saleSpecial = product.specialData?.saleSpecials?.[0];
// Calculate inventory from POSMetaData children
const children = product.POSMetaData?.children || [];
const totalQuantity = children.reduce((sum, c) => sum + (c.quantity || 0), 0);
const availableQuantity = children.reduce((sum, c) => sum + (c.quantityAvailable || 0), 0);
// Parse timestamps
let sourceCreatedAt;
if (product.createdAt) {
// createdAt is a timestamp string like "1729044510543"
const ts = parseInt(product.createdAt, 10);
if (!isNaN(ts)) {
sourceCreatedAt = new Date(ts);
}
}
let sourceUpdatedAt;
if (product.updatedAt) {
sourceUpdatedAt = new Date(product.updatedAt);
}
return {
// Identity
external_id: product._id || product.id,
slug: product.cName,
name: product.Name,
enterprise_product_id: product.enterpriseProductId,
// Brand
brand: product.brandName || product.brand?.name,
brand_external_id: product.brandId || product.brand?.id,
brand_logo_url: product.brandLogo || product.brand?.imageUrl,
// Category
subcategory: product.subcategory,
strain_type: product.strainType,
canonical_category: product.POSMetaData?.canonicalCategory,
// Pricing
price: product.Prices?.[0],
rec_price: product.recPrices?.[0],
med_price: product.medicalPrices?.[0],
rec_special_price: product.recSpecialPrices?.[0],
med_special_price: product.medicalSpecialPrices?.[0],
// Specials
is_on_special: product.special === true,
special_name: saleSpecial?.specialName,
discount_percent: saleSpecial?.percentDiscount ? saleSpecial.discount : undefined,
special_data: product.specialData,
// Inventory
sku: product.POSMetaData?.canonicalSKU,
inventory_quantity: totalQuantity || undefined,
inventory_available: availableQuantity || undefined,
is_below_threshold: product.isBelowThreshold === true,
status: product.Status,
// Potency
thc_percentage: product.THCContent?.range?.[0],
cbd_percentage: product.CBDContent?.range?.[0],
cannabinoids: product.cannabinoidsV2,
// Weight/Options
weight_mg: product.weight,
net_weight_value: product.measurements?.netWeight?.values?.[0],
net_weight_unit: product.measurements?.netWeight?.unit,
options: product.Options,
raw_options: product.rawOptions,
// Images
image_url: product.Image,
additional_images: product.images?.length ? product.images : undefined,
// Flags
is_featured: product.featured === true,
medical_only: product.medicalOnly === true,
rec_only: product.recOnly === true,
// Timestamps
source_created_at: sourceCreatedAt,
source_updated_at: sourceUpdatedAt,
// Description
description: typeof product.description === 'string' ? product.description : undefined,
// Raw
raw_data: product,
};
}
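// Illustrative mapping (hypothetical minimal GraphQL product; only a few of the
// normalized fields are shown):
//   normalizeDutchieProduct({ _id: 'abc123', cName: 'og-kush-1g', Name: 'OG Kush 1g', Prices: [45] })
//   → { external_id: 'abc123', slug: 'og-kush-1g', name: 'OG Kush 1g', price: 45, is_on_special: false, ... }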
async function fetchDutchieMenuViaPuppeteer(menuUrl, options = {}) {
const { headless = 'new', timeout = 90000, maxScrolls = 30 } = options; // maxScrolls increased for full menu capture
let browser;
const capturedProducts = [];
let dispensaryId = '';
try {
browser = await puppeteer_extra_1.default.launch({
headless,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
],
});
const page = await browser.newPage();
// Stealth configuration
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
await page.setViewport({ width: 1920, height: 1080 });
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
window.chrome = { runtime: {} };
});
// Track seen product IDs to avoid duplicates
const seenIds = new Set();
// Intercept GraphQL responses
page.on('response', async (response) => {
const url = response.url();
if (!url.includes('graphql'))
return;
try {
const contentType = response.headers()['content-type'] || '';
if (!contentType.includes('application/json'))
return;
const data = await response.json();
// Capture dispensary ID
if (data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId) {
dispensaryId = data.data.getAddressBasedDispensaryData.dispensaryData.dispensaryId;
}
// Capture products from FilteredProducts
if (data?.data?.filteredProducts?.products) {
const products = data.data.filteredProducts.products;
for (const product of products) {
if (!seenIds.has(product._id)) {
seenIds.add(product._id);
capturedProducts.push(product);
}
}
}
}
catch {
// Ignore parse errors
}
});
// Navigate to menu
console.log('[DutchieGraphQL] Loading menu page...');
await page.goto(menuUrl, {
waitUntil: 'networkidle2',
timeout,
});
// Get dispensary ID from window.reactEnv if not captured
if (!dispensaryId) {
dispensaryId = await page.evaluate(() => {
const env = window.reactEnv;
return env?.dispensaryId || env?.retailerId || '';
});
}
// Helper function to scroll through a page until no more products load
async function scrollToLoadAll(maxScrollAttempts = maxScrolls) {
let scrollCount = 0;
let previousCount = 0;
let noNewProductsCount = 0;
while (scrollCount < maxScrollAttempts && noNewProductsCount < 3) {
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await new Promise((r) => setTimeout(r, 1500));
const currentCount = seenIds.size;
if (currentCount === previousCount) {
noNewProductsCount++;
}
else {
noNewProductsCount = 0;
}
previousCount = currentCount;
scrollCount++;
}
}
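// Scrolling stops after maxScrollAttempts passes, or once three consecutive
// scrolls add no new product IDs to seenIds (each pass waits 1.5s for the
// intercepted GraphQL responses to land).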
// First, scroll through the main page (all products)
console.log('[DutchieGraphQL] Scrolling main page...');
await scrollToLoadAll();
console.log(`[DutchieGraphQL] After main page: ${seenIds.size} products`);
// Get category links from the navigation
const categoryLinks = await page.evaluate(() => {
const links = [];
// Look for category navigation links
const navLinks = document.querySelectorAll('a[href*="/products/"]');
navLinks.forEach((link) => {
const href = link.href;
if (href && !links.includes(href)) {
links.push(href);
}
});
return links;
});
console.log(`[DutchieGraphQL] Found ${categoryLinks.length} category links`);
// Visit each category page to capture all products
for (const categoryUrl of categoryLinks) {
try {
console.log(`[DutchieGraphQL] Visiting category: ${categoryUrl.split('/').pop()}`);
await page.goto(categoryUrl, {
waitUntil: 'networkidle2',
timeout: 30000,
});
await scrollToLoadAll(15); // Fewer scrolls per category
console.log(`[DutchieGraphQL] Total products: ${seenIds.size}`);
}
catch (e) {
console.log(`[DutchieGraphQL] Category error: ${e.message}`);
}
}
// Wait for any final responses
await new Promise((r) => setTimeout(r, 2000));
return {
products: capturedProducts,
dispensaryId,
menuUrl,
};
}
finally {
if (browser) {
await browser.close();
}
}
}
// =====================================================
// DATABASE OPERATIONS
// =====================================================
async function upsertProducts(pool, storeId, products) {
const client = await pool.connect();
let inserted = 0;
let updated = 0;
try {
await client.query('BEGIN');
for (const product of products) {
// Upsert product
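// Note: `RETURNING (xmax = 0) AS was_inserted` relies on a PostgreSQL detail —
// xmax is 0 for a freshly inserted row and non-zero when the ON CONFLICT update
// path rewrote an existing row, which is how inserted/updated are tallied below.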
const result = await client.query(`
INSERT INTO products (
store_id, external_id, slug, name, enterprise_product_id,
brand, brand_external_id, brand_logo_url,
subcategory, strain_type, canonical_category,
price, rec_price, med_price, rec_special_price, med_special_price,
is_on_special, special_name, discount_percent, special_data,
sku, inventory_quantity, inventory_available, is_below_threshold, status,
thc_percentage, cbd_percentage, cannabinoids,
weight_mg, net_weight_value, net_weight_unit, options, raw_options,
image_url, additional_images,
is_featured, medical_only, rec_only,
source_created_at, source_updated_at,
description, raw_data,
dutchie_url, last_seen_at, updated_at
)
VALUES (
$1, $2, $3, $4, $5,
$6, $7, $8,
$9, $10, $11,
$12, $13, $14, $15, $16,
$17, $18, $19, $20,
$21, $22, $23, $24, $25,
$26, $27, $28,
$29, $30, $31, $32, $33,
$34, $35,
$36, $37, $38,
$39, $40,
$41, $42,
'', NOW(), NOW()
)
ON CONFLICT (store_id, slug) DO UPDATE SET
name = EXCLUDED.name,
enterprise_product_id = EXCLUDED.enterprise_product_id,
brand = EXCLUDED.brand,
brand_external_id = EXCLUDED.brand_external_id,
brand_logo_url = EXCLUDED.brand_logo_url,
subcategory = EXCLUDED.subcategory,
strain_type = EXCLUDED.strain_type,
canonical_category = EXCLUDED.canonical_category,
price = EXCLUDED.price,
rec_price = EXCLUDED.rec_price,
med_price = EXCLUDED.med_price,
rec_special_price = EXCLUDED.rec_special_price,
med_special_price = EXCLUDED.med_special_price,
is_on_special = EXCLUDED.is_on_special,
special_name = EXCLUDED.special_name,
discount_percent = EXCLUDED.discount_percent,
special_data = EXCLUDED.special_data,
sku = EXCLUDED.sku,
inventory_quantity = EXCLUDED.inventory_quantity,
inventory_available = EXCLUDED.inventory_available,
is_below_threshold = EXCLUDED.is_below_threshold,
status = EXCLUDED.status,
thc_percentage = EXCLUDED.thc_percentage,
cbd_percentage = EXCLUDED.cbd_percentage,
cannabinoids = EXCLUDED.cannabinoids,
weight_mg = EXCLUDED.weight_mg,
net_weight_value = EXCLUDED.net_weight_value,
net_weight_unit = EXCLUDED.net_weight_unit,
options = EXCLUDED.options,
raw_options = EXCLUDED.raw_options,
image_url = EXCLUDED.image_url,
additional_images = EXCLUDED.additional_images,
is_featured = EXCLUDED.is_featured,
medical_only = EXCLUDED.medical_only,
rec_only = EXCLUDED.rec_only,
source_created_at = EXCLUDED.source_created_at,
source_updated_at = EXCLUDED.source_updated_at,
description = EXCLUDED.description,
raw_data = EXCLUDED.raw_data,
last_seen_at = NOW(),
updated_at = NOW()
RETURNING (xmax = 0) AS was_inserted
`, [
storeId,
product.external_id,
product.slug,
product.name,
product.enterprise_product_id,
product.brand,
product.brand_external_id,
product.brand_logo_url,
product.subcategory,
product.strain_type,
product.canonical_category,
product.price,
product.rec_price,
product.med_price,
product.rec_special_price,
product.med_special_price,
product.is_on_special,
product.special_name,
product.discount_percent,
product.special_data ? JSON.stringify(product.special_data) : null,
product.sku,
product.inventory_quantity,
product.inventory_available,
product.is_below_threshold,
product.status,
product.thc_percentage,
product.cbd_percentage,
product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
product.weight_mg,
product.net_weight_value,
product.net_weight_unit,
product.options,
product.raw_options,
product.image_url,
product.additional_images,
product.is_featured,
product.medical_only,
product.rec_only,
product.source_created_at,
product.source_updated_at,
product.description,
product.raw_data ? JSON.stringify(product.raw_data) : null,
]);
if (result.rows[0]?.was_inserted) {
inserted++;
}
else {
updated++;
}
}
await client.query('COMMIT');
return { inserted, updated };
}
catch (error) {
await client.query('ROLLBACK');
throw error;
}
finally {
client.release();
}
}
// =====================================================
// MAIN ENTRY POINT
// =====================================================
/**
* @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
* This function is disabled and will throw an error if called.
*/
async function scrapeDutchieMenu(pool, storeId, menuUrl) {
// DEPRECATED: Throw error to prevent accidental use
throw new Error('DEPRECATED: scrapeDutchieMenu() is deprecated. ' +
'Use src/dutchie-az/services/product-crawler.ts instead. ' +
'This scraper writes to the legacy products table.');
// Original code below is unreachable but kept for reference
try {
console.log(`[DutchieGraphQL] Scraping: ${menuUrl}`);
// Fetch products via Puppeteer
const { products, dispensaryId } = await fetchDutchieMenuViaPuppeteer(menuUrl);
console.log(`[DutchieGraphQL] Captured ${products.length} products, dispensaryId: ${dispensaryId}`);
if (products.length === 0) {
return {
success: false,
productsFound: 0,
inserted: 0,
updated: 0,
error: 'No products captured from GraphQL responses',
};
}
// Normalize products
const normalized = products.map(normalizeDutchieProduct);
// Upsert to database
const { inserted, updated } = await upsertProducts(pool, storeId, normalized);
console.log(`[DutchieGraphQL] Upsert complete: ${inserted} inserted, ${updated} updated`);
return {
success: true,
productsFound: products.length,
inserted,
updated,
};
}
catch (error) {
console.error(`[DutchieGraphQL] Error:`, error.message);
return {
success: false,
productsFound: 0,
inserted: 0,
updated: 0,
error: error.message,
};
}
}

View File

@@ -0,0 +1,85 @@
"use strict";
// ============================================================================
// DEPRECATED: Dutchie now crawled via GraphQL only (see dutchie-az pipeline)
// DO NOT USE - This HTML scraper is unreliable and targets the legacy products table.
// All Dutchie crawling must go through: src/dutchie-az/services/product-crawler.ts
// ============================================================================
Object.defineProperty(exports, "__esModule", { value: true });
exports.dutchieTemplate = void 0;
exports.getTemplateForUrl = getTemplateForUrl;
const logger_1 = require("../../services/logger");
/**
* @deprecated DEPRECATED - Dutchie HTML scraping is no longer supported.
* Use the dutchie-az GraphQL pipeline instead: src/dutchie-az/services/product-crawler.ts
* This template relied on unstable DOM selectors and wrote to legacy tables.
*/
exports.dutchieTemplate = {
name: 'Dutchie Marketplace',
urlPattern: /dutchie\.com\/dispensary\//,
buildCategoryUrl: (baseUrl, category) => {
// Remove trailing slash
const base = baseUrl.replace(/\/$/, '');
// Convert category name to URL-friendly slug
const categorySlug = category.toLowerCase().replace(/\s+/g, '-');
return `${base}/products/${categorySlug}`;
},
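// e.g. buildCategoryUrl('https://dutchie.com/dispensary/example-store/', 'Pre Rolls')
//      → 'https://dutchie.com/dispensary/example-store/products/pre-rolls'
//      (URL is a hypothetical placeholder)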
extractProducts: async (page) => {
const products = [];
try {
// Wait for product cards to load
await page.waitForSelector('a[data-testid="card-link"]', { timeout: 10000 }).catch(() => {
logger_1.logger.warn('scraper', 'No product cards found with data-testid="card-link"');
});
// Get all product card links
const productCards = await page.locator('a[href*="/product/"][data-testid="card-link"]').all();
logger_1.logger.info('scraper', `Found ${productCards.length} Dutchie product cards`);
for (const card of productCards) {
try {
// Extract all data at once using evaluate for speed
const cardData = await card.evaluate((el) => {
const href = el.getAttribute('href') || '';
const img = el.querySelector('img');
const imageUrl = img ? img.getAttribute('src') || '' : '';
// Get text from leaf elements (no child elements), in document order
const textElements = Array.from(el.querySelectorAll('*'))
.filter(el => el.textContent && el.children.length === 0)
.map(el => (el.textContent || '').trim())
.filter(text => text.length > 0);
const name = textElements[0] || '';
const brand = textElements[1] || '';
// Look for price
const priceMatch = el.textContent?.match(/\$(\d+(?:\.\d{2})?)/);
const price = priceMatch ? parseFloat(priceMatch[1]) : undefined;
return { href, imageUrl, name, brand, price };
});
if (cardData.name && cardData.href) {
products.push({
name: cardData.name,
brand: cardData.brand || undefined,
product_url: cardData.href.startsWith('http') ? cardData.href : `https://dutchie.com${cardData.href}`,
image_url: cardData.imageUrl || undefined,
price: cardData.price,
in_stock: true,
});
}
}
catch (err) {
logger_1.logger.warn('scraper', `Error extracting Dutchie product card: ${err}`);
}
}
}
catch (err) {
logger_1.logger.error('scraper', `Error in Dutchie product extraction: ${err}`);
}
return products;
},
};
/**
* Get the appropriate scraper template based on URL
*/
function getTemplateForUrl(url) {
if (exports.dutchieTemplate.urlPattern.test(url)) {
return exports.dutchieTemplate;
}
return null;
}

View File

@@ -0,0 +1,287 @@
#!/usr/bin/env npx tsx
"use strict";
/**
* Backfill Store-Dispensary Mapping
*
* Links existing stores (scheduler) to dispensaries (master AZDHS directory)
 * by matching on name, company name, or slug similarity.
*
* Usage:
* npx tsx src/scripts/backfill-store-dispensary.ts # Preview matches
* npx tsx src/scripts/backfill-store-dispensary.ts --apply # Apply matches
* npx tsx src/scripts/backfill-store-dispensary.ts --verbose # Show all match details
*/
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const logger_1 = require("../services/logger");
const args = process.argv.slice(2);
const flags = {
apply: args.includes('--apply'),
verbose: args.includes('--verbose'),
help: args.includes('--help') || args.includes('-h'),
};
/**
* Normalize a store/dispensary name for comparison
* Removes common suffixes, punctuation, and extra whitespace
*/
function normalizeName(name) {
return name
.toLowerCase()
.replace(/\s*[-–—]\s*/g, ' ') // Normalize dashes to spaces
.replace(/\s*(dispensary|cannabis|marijuana|weed|shop|store|llc|inc)\s*/gi, ' ')
.replace(/['']/g, "'") // Normalize apostrophes
.replace(/[^\w\s']/g, '') // Remove other punctuation
.replace(/\s+/g, ' ') // Collapse whitespace
.trim();
}
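// e.g. normalizeName('Deeply Rooted Dispensary, LLC') → 'deeply rooted'
//      (hypothetical input; suffixes, punctuation, and extra whitespace are stripped)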
/**
* Simple Levenshtein distance for fuzzy matching
*/
function levenshteinDistance(a, b) {
const matrix = [];
for (let i = 0; i <= b.length; i++) {
matrix[i] = [i];
}
for (let j = 0; j <= a.length; j++) {
matrix[0][j] = j;
}
for (let i = 1; i <= b.length; i++) {
for (let j = 1; j <= a.length; j++) {
if (b.charAt(i - 1) === a.charAt(j - 1)) {
matrix[i][j] = matrix[i - 1][j - 1];
}
else {
matrix[i][j] = Math.min(matrix[i - 1][j - 1] + 1, // substitution
matrix[i][j - 1] + 1, // insertion
matrix[i - 1][j] + 1 // deletion
);
}
}
}
return matrix[b.length][a.length];
}
/**
* Calculate similarity score (0-100)
*/
function similarityScore(a, b) {
const maxLen = Math.max(a.length, b.length);
if (maxLen === 0)
return 100;
const distance = levenshteinDistance(a, b);
return Math.round((1 - distance / maxLen) * 100);
}
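// e.g. levenshteinDistance('kitten', 'sitting') === 3, so
//      similarityScore('kitten', 'sitting') === Math.round((1 - 3 / 7) * 100) === 57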
/**
* Find the best dispensary match for a store
*/
function findBestMatch(store, dispensaries) {
const normalizedStoreName = normalizeName(store.name);
const storeSlug = store.slug.toLowerCase();
let bestMatch = {
store,
dispensary: null,
matchType: 'none',
score: 0,
};
for (const disp of dispensaries) {
const normalizedDispName = normalizeName(disp.name);
const normalizedCompanyName = disp.company_name ? normalizeName(disp.company_name) : '';
const dispSlug = disp.slug.toLowerCase();
// 1. Exact name match (case-insensitive)
if (store.name.toLowerCase() === disp.name.toLowerCase()) {
return {
store,
dispensary: disp,
matchType: 'exact_name',
score: 100,
};
}
// 2. Normalized name match
if (normalizedStoreName === normalizedDispName) {
return {
store,
dispensary: disp,
matchType: 'normalized_name',
score: 95,
};
}
// 3. Store name matches company name
if (normalizedCompanyName && normalizedStoreName === normalizedCompanyName) {
return {
store,
dispensary: disp,
matchType: 'company_name',
score: 90,
};
}
// 4. Slug match
if (storeSlug === dispSlug) {
return {
store,
dispensary: disp,
matchType: 'slug',
score: 85,
};
}
// 5. Fuzzy matching (only if score > 70)
const nameScore = similarityScore(normalizedStoreName, normalizedDispName);
const companyScore = normalizedCompanyName
? similarityScore(normalizedStoreName, normalizedCompanyName)
: 0;
const fuzzyScore = Math.max(nameScore, companyScore);
if (fuzzyScore > bestMatch.score && fuzzyScore >= 70) {
bestMatch = {
store,
dispensary: disp,
matchType: 'fuzzy',
score: fuzzyScore,
};
}
}
return bestMatch;
}
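// The first dispensary that hits an exact tier (exact name 100, normalized name 95,
// company name 90, slug 85) returns immediately; otherwise the highest fuzzy
// similarity of at least 70 across all dispensaries wins, else matchType stays 'none'.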
async function main() {
if (flags.help) {
console.log(`
Backfill Store-Dispensary Mapping
Links existing stores (scheduler) to dispensaries (master AZDHS directory)
by matching on name, company name, or slug similarity.
USAGE:
npx tsx src/scripts/backfill-store-dispensary.ts [OPTIONS]
OPTIONS:
--apply Apply the mappings to the database (default: preview only)
--verbose Show detailed match information for all stores
--help, -h Show this help message
EXAMPLES:
# Preview what would be matched
npx tsx src/scripts/backfill-store-dispensary.ts
# Apply the mappings
npx tsx src/scripts/backfill-store-dispensary.ts --apply
# Show verbose output
npx tsx src/scripts/backfill-store-dispensary.ts --verbose
`);
process.exit(0);
}
console.log('\n📦 Backfill Store-Dispensary Mapping');
console.log('=====================================\n');
try {
// Fetch all stores without a dispensary_id
const storesResult = await migrate_1.pool.query(`
SELECT id, name, slug, dispensary_id
FROM stores
WHERE dispensary_id IS NULL
ORDER BY name
`);
const unmappedStores = storesResult.rows;
// Fetch all already-mapped stores for context
const mappedResult = await migrate_1.pool.query(`
SELECT id, name, slug, dispensary_id
FROM stores
WHERE dispensary_id IS NOT NULL
ORDER BY name
`);
const mappedStores = mappedResult.rows;
// Fetch all dispensaries
const dispResult = await migrate_1.pool.query(`
SELECT id, name, company_name, city, address, slug
FROM dispensaries
ORDER BY name
`);
const dispensaries = dispResult.rows;
console.log(`📊 Current Status:`);
console.log(` Stores without dispensary_id: ${unmappedStores.length}`);
console.log(` Stores already mapped: ${mappedStores.length}`);
console.log(` Total dispensaries: ${dispensaries.length}\n`);
if (unmappedStores.length === 0) {
console.log('✅ All stores are already mapped to dispensaries!\n');
await migrate_1.pool.end();
process.exit(0);
}
// Find matches for each unmapped store
const matches = [];
const noMatches = [];
for (const store of unmappedStores) {
const match = findBestMatch(store, dispensaries);
if (match.dispensary) {
matches.push(match);
}
else {
noMatches.push(store);
}
}
// Sort matches by score (highest first)
matches.sort((a, b) => b.score - a.score);
// Display results
console.log(`\n🔗 Matches Found: ${matches.length}`);
console.log('----------------------------------\n');
if (matches.length > 0) {
// Group by match type
const byType = {};
for (const m of matches) {
if (!byType[m.matchType])
byType[m.matchType] = [];
byType[m.matchType].push(m);
}
const typeLabels = {
exact_name: '✅ Exact Name Match',
normalized_name: '✅ Normalized Name Match',
company_name: '🏢 Company Name Match',
slug: '🔗 Slug Match',
fuzzy: '🔍 Fuzzy Match',
};
for (const [type, results] of Object.entries(byType)) {
console.log(`${typeLabels[type]} (${results.length}):`);
for (const r of results) {
const dispInfo = r.dispensary;
console.log(` • "${r.store.name}" → "${dispInfo.name}" (${dispInfo.city}) [${r.score}%]`);
}
console.log('');
}
}
if (noMatches.length > 0) {
console.log(`\n❌ No Match Found: ${noMatches.length}`);
console.log('----------------------------------\n');
for (const store of noMatches) {
console.log(` • "${store.name}" (slug: ${store.slug})`);
}
console.log('');
}
// Apply if requested
if (flags.apply && matches.length > 0) {
console.log('\n🔧 Applying mappings...\n');
let updated = 0;
for (const match of matches) {
if (!match.dispensary)
continue;
await migrate_1.pool.query('UPDATE stores SET dispensary_id = $1 WHERE id = $2', [match.dispensary.id, match.store.id]);
updated++;
if (flags.verbose) {
console.log(` ✓ Linked store ${match.store.id} to dispensary ${match.dispensary.id}`);
}
}
console.log(`\n✅ Updated ${updated} stores with dispensary mappings\n`);
logger_1.logger.info('system', `Backfill complete: linked ${updated} stores to dispensaries`);
}
else if (matches.length > 0 && !flags.apply) {
console.log('\n💡 Run with --apply to update the database\n');
}
// Summary
console.log('📈 Summary:');
console.log(` Would match: ${matches.length} stores`);
console.log(` No match: ${noMatches.length} stores`);
console.log(` Match rate: ${Math.round((matches.length / unmappedStores.length) * 100)}%\n`);
}
catch (error) {
console.error('Error:', error);
process.exit(1);
}
finally {
await migrate_1.pool.end();
}
}
main().catch(console.error);

View File

@@ -0,0 +1,332 @@
#!/usr/bin/env npx tsx
"use strict";
/**
* Bootstrap Discovery Script
*
* One-time (but reusable) bootstrap command that:
* 1. Ensures every Dispensary has a dispensary_crawl_schedule entry (4h default)
* 2. Optionally runs RunDispensaryOrchestrator for each dispensary
*
* Usage:
* npx tsx src/scripts/bootstrap-discovery.ts # Create schedules only
* npx tsx src/scripts/bootstrap-discovery.ts --run # Create schedules + run orchestrator
* npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10 # Run for first 10 dispensaries
* npx tsx src/scripts/bootstrap-discovery.ts --dry-run # Preview what would happen
* npx tsx src/scripts/bootstrap-discovery.ts --status # Show current status only
*/
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
// Parse command line args
const args = process.argv.slice(2);
const flags = {
run: args.includes('--run'),
dryRun: args.includes('--dry-run'),
status: args.includes('--status'),
help: args.includes('--help') || args.includes('-h'),
limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '0'),
concurrency: parseInt(args.find(a => a.startsWith('--concurrency='))?.split('=')[1] || '3'),
interval: parseInt(args.find(a => a.startsWith('--interval='))?.split('=')[1] || '240'),
detectionOnly: args.includes('--detection-only'),
productionOnly: args.includes('--production-only'),
sandboxOnly: args.includes('--sandbox-only'),
};
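// e.g. `npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10 --concurrency=5`
// parses to { run: true, limit: 10, concurrency: 5, interval: 240, ... } — numeric
// flags that are not supplied fall back to the defaults above.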
async function showHelp() {
console.log(`
Bootstrap Discovery - Initialize Dispensary Crawl System
USAGE:
npx tsx src/scripts/bootstrap-discovery.ts [OPTIONS]
OPTIONS:
--run After creating schedules, run the orchestrator for each dispensary
--dry-run Show what would happen without making changes
--status Show current status and exit
--limit=N Limit how many dispensaries to process (0 = all, default: 0)
--concurrency=N How many dispensaries to process in parallel (default: 3)
--interval=M Default interval in minutes for new schedules (default: 240 = 4 hours)
--detection-only Only run detection, don't crawl
--production-only Only run dispensaries in production mode
--sandbox-only Only run dispensaries in sandbox mode
--help, -h Show this help message
EXAMPLES:
# Create schedule entries for all dispensaries (no crawling)
npx tsx src/scripts/bootstrap-discovery.ts
# Create schedules and run orchestrator for all dispensaries
npx tsx src/scripts/bootstrap-discovery.ts --run
# Run orchestrator for first 10 dispensaries
npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10
# Run with higher concurrency
npx tsx src/scripts/bootstrap-discovery.ts --run --concurrency=5
# Show current status
npx tsx src/scripts/bootstrap-discovery.ts --status
WHAT IT DOES:
1. Creates dispensary_crawl_schedule entries for all dispensaries that don't have one
2. If --run: For each dispensary, runs the orchestrator which:
a. Checks if provider detection is needed (null/unknown/stale/low confidence)
b. Runs detection if needed
c. If Dutchie + production mode: runs production crawl
d. Otherwise: runs sandbox crawl
3. Updates schedule status and job records
`);
}
async function showStatus() {
console.log('\n📊 Current Dispensary Crawl Status\n');
console.log('═'.repeat(70));
// Get dispensary counts by provider
const providerStats = await migrate_1.pool.query(`
SELECT
COALESCE(product_provider, 'undetected') as provider,
COUNT(*) as count,
COUNT(*) FILTER (WHERE product_crawler_mode = 'production') as production,
COUNT(*) FILTER (WHERE product_crawler_mode = 'sandbox') as sandbox,
COUNT(*) FILTER (WHERE product_crawler_mode IS NULL) as no_mode
FROM dispensaries
GROUP BY COALESCE(product_provider, 'undetected')
ORDER BY count DESC
`);
console.log('\nProvider Distribution:');
console.log('-'.repeat(60));
console.log('Provider'.padEnd(20) +
'Total'.padStart(8) +
'Production'.padStart(12) +
'Sandbox'.padStart(10) +
'No Mode'.padStart(10));
console.log('-'.repeat(60));
for (const row of providerStats.rows) {
console.log(row.provider.padEnd(20) +
row.count.toString().padStart(8) +
row.production.toString().padStart(12) +
row.sandbox.toString().padStart(10) +
row.no_mode.toString().padStart(10));
}
// Get schedule stats
const scheduleStats = await migrate_1.pool.query(`
SELECT
COUNT(DISTINCT d.id) as total_dispensaries,
COUNT(DISTINCT dcs.id) as with_schedule,
COUNT(DISTINCT d.id) - COUNT(DISTINCT dcs.id) as without_schedule,
COUNT(*) FILTER (WHERE dcs.is_active = TRUE) as active_schedules,
COUNT(*) FILTER (WHERE dcs.last_status = 'success') as last_success,
COUNT(*) FILTER (WHERE dcs.last_status = 'error') as last_error,
COUNT(*) FILTER (WHERE dcs.last_status = 'sandbox_only') as last_sandbox,
COUNT(*) FILTER (WHERE dcs.last_status = 'detection_only') as last_detection,
COUNT(*) FILTER (WHERE dcs.next_run_at <= NOW()) as due_now,
AVG(dcs.interval_minutes)::INTEGER as avg_interval
FROM dispensaries d
LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
`);
const s = scheduleStats.rows[0];
console.log('\n\nSchedule Status:');
console.log('-'.repeat(60));
console.log(` Total Dispensaries: ${s.total_dispensaries}`);
console.log(` With Schedule: ${s.with_schedule}`);
console.log(` Without Schedule: ${s.without_schedule}`);
console.log(` Active Schedules: ${s.active_schedules || 0}`);
console.log(` Average Interval: ${s.avg_interval || 240} minutes`);
console.log('\n Last Run Status:');
console.log(` - Success: ${s.last_success || 0}`);
console.log(` - Error: ${s.last_error || 0}`);
console.log(` - Sandbox Only: ${s.last_sandbox || 0}`);
console.log(` - Detection Only: ${s.last_detection || 0}`);
console.log(` - Due Now: ${s.due_now || 0}`);
// Get recent job stats
const jobStats = await migrate_1.pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE detection_ran = TRUE) as with_detection,
COUNT(*) FILTER (WHERE crawl_ran = TRUE) as with_crawl,
COUNT(*) FILTER (WHERE crawl_type = 'production') as production_crawls,
COUNT(*) FILTER (WHERE crawl_type = 'sandbox') as sandbox_crawls,
SUM(products_found) as total_products_found
FROM dispensary_crawl_jobs
WHERE created_at > NOW() - INTERVAL '24 hours'
`);
const j = jobStats.rows[0];
console.log('\n\nJobs (Last 24 Hours):');
console.log('-'.repeat(60));
console.log(` Total Jobs: ${j.total || 0}`);
console.log(` Completed: ${j.completed || 0}`);
console.log(` Failed: ${j.failed || 0}`);
console.log(` Running: ${j.running || 0}`);
console.log(` Pending: ${j.pending || 0}`);
console.log(` With Detection: ${j.with_detection || 0}`);
console.log(` With Crawl: ${j.with_crawl || 0}`);
console.log(` - Production: ${j.production_crawls || 0}`);
console.log(` - Sandbox: ${j.sandbox_crawls || 0}`);
console.log(` Products Found: ${j.total_products_found || 0}`);
console.log('\n' + '═'.repeat(70) + '\n');
}
async function createSchedules() {
console.log('\n📅 Creating Dispensary Schedules...\n');
if (flags.dryRun) {
// Count how many would be created
const result = await migrate_1.pool.query(`
SELECT COUNT(*) as count
FROM dispensaries d
WHERE NOT EXISTS (
SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
)
`);
const wouldCreate = parseInt(result.rows[0].count);
console.log(` Would create ${wouldCreate} new schedule entries (${flags.interval} minute interval)`);
return { created: wouldCreate, existing: 0 };
}
const result = await (0, dispensary_orchestrator_1.ensureAllDispensariesHaveSchedules)(flags.interval);
console.log(` ✓ Created ${result.created} new schedule entries`);
console.log(`${result.existing} dispensaries already had schedules`);
return result;
}
async function getDispensariesToProcess() {
// Build query based on filters
let whereClause = 'TRUE';
if (flags.productionOnly) {
whereClause += ` AND d.product_crawler_mode = 'production'`;
}
else if (flags.sandboxOnly) {
whereClause += ` AND d.product_crawler_mode = 'sandbox'`;
}
if (flags.detectionOnly) {
whereClause += ` AND (d.product_provider IS NULL OR d.product_provider = 'unknown' OR d.product_confidence < 50)`;
}
const limitClause = flags.limit > 0 ? `LIMIT ${flags.limit}` : '';
const query = `
SELECT d.id, d.name, d.product_provider, d.product_crawler_mode
FROM dispensaries d
LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
WHERE ${whereClause}
ORDER BY
COALESCE(dcs.priority, 0) DESC,
dcs.last_run_at ASC NULLS FIRST,
d.id ASC
${limitClause}
`;
const result = await migrate_1.pool.query(query);
return result.rows.map(row => row.id);
}
async function runOrchestrator() {
console.log('\n🚀 Running Dispensary Orchestrator...\n');
const dispensaryIds = await getDispensariesToProcess();
if (dispensaryIds.length === 0) {
console.log(' No dispensaries to process.');
return;
}
console.log(` Found ${dispensaryIds.length} dispensaries to process`);
console.log(` Concurrency: ${flags.concurrency}`);
if (flags.dryRun) {
console.log('\n Would process these dispensaries:');
const details = await migrate_1.pool.query(`SELECT id, name, product_provider, product_crawler_mode
FROM dispensaries WHERE id = ANY($1) ORDER BY id`, [dispensaryIds]);
for (const row of details.rows.slice(0, 20)) {
console.log(` - [${row.id}] ${row.name} (${row.product_provider || 'undetected'}, ${row.product_crawler_mode || 'no mode'})`);
}
if (details.rows.length > 20) {
console.log(` ... and ${details.rows.length - 20} more`);
}
return;
}
console.log('\n Starting batch processing...\n');
const results = await (0, dispensary_orchestrator_1.runBatchDispensaryOrchestrator)(dispensaryIds, flags.concurrency);
// Summarize results
const summary = {
total: results.length,
success: results.filter(r => r.status === 'success').length,
sandboxOnly: results.filter(r => r.status === 'sandbox_only').length,
detectionOnly: results.filter(r => r.status === 'detection_only').length,
error: results.filter(r => r.status === 'error').length,
detectionsRan: results.filter(r => r.detectionRan).length,
crawlsRan: results.filter(r => r.crawlRan).length,
productionCrawls: results.filter(r => r.crawlType === 'production').length,
sandboxCrawls: results.filter(r => r.crawlType === 'sandbox').length,
totalProducts: results.reduce((sum, r) => sum + (r.productsFound || 0), 0),
totalDuration: results.reduce((sum, r) => sum + r.durationMs, 0),
};
console.log('\n' + '═'.repeat(70));
console.log(' Orchestrator Results');
console.log('═'.repeat(70));
console.log(`
Total Processed: ${summary.total}
Status:
- Success: ${summary.success}
- Sandbox Only: ${summary.sandboxOnly}
- Detection Only: ${summary.detectionOnly}
- Error: ${summary.error}
Operations:
- Detections Ran: ${summary.detectionsRan}
- Crawls Ran: ${summary.crawlsRan}
- Production: ${summary.productionCrawls}
- Sandbox: ${summary.sandboxCrawls}
Results:
- Products Found: ${summary.totalProducts}
- Total Duration: ${(summary.totalDuration / 1000).toFixed(1)}s
- Avg per Dispensary: ${(summary.totalDuration / summary.total / 1000).toFixed(1)}s
`);
console.log('═'.repeat(70) + '\n');
// Show errors if any
const errors = results.filter(r => r.status === 'error');
if (errors.length > 0) {
console.log('\n⚠ Errors encountered:');
for (const err of errors.slice(0, 10)) {
console.log(` - [${err.dispensaryId}] ${err.dispensaryName}: ${err.error}`);
}
if (errors.length > 10) {
console.log(` ... and ${errors.length - 10} more errors`);
}
}
}
async function main() {
if (flags.help) {
await showHelp();
process.exit(0);
}
console.log('\n' + '═'.repeat(70));
console.log(' Dispensary Crawl Bootstrap Discovery');
console.log('═'.repeat(70));
if (flags.dryRun) {
console.log('\n🔍 DRY RUN MODE - No changes will be made');
}
try {
// Always show status first
await showStatus();
if (flags.status) {
// Status-only mode, we're done
await migrate_1.pool.end();
process.exit(0);
}
// Step 1: Create schedule entries
await createSchedules();
// Step 2: Optionally run orchestrator
if (flags.run) {
await runOrchestrator();
}
else {
console.log('\n💡 Tip: Use --run to also run the orchestrator for each dispensary');
}
// Show final status
if (!flags.dryRun) {
await showStatus();
}
}
catch (error) {
console.error('\n❌ Fatal error:', error.message);
console.error(error.stack);
process.exit(1);
}
finally {
await migrate_1.pool.end();
}
}
main();

View File

@@ -0,0 +1,236 @@
"use strict";
/**
* Capture Dutchie GraphQL response structure via Puppeteer interception
* This script navigates to a Dutchie menu page and captures the GraphQL responses
* to understand the exact product data structure
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const fs = __importStar(require("fs"));
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
async function captureSchema(menuUrl) {
let browser;
const capturedResponses = [];
try {
console.log('='.repeat(80));
console.log('DUTCHIE GRAPHQL SCHEMA CAPTURE');
console.log('='.repeat(80));
console.log(`\nTarget URL: ${menuUrl}\n`);
browser = await puppeteer_extra_1.default.launch({
headless: 'new',
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
]
});
const page = await browser.newPage();
// Use a realistic user agent
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
// Set viewport to desktop size
await page.setViewport({ width: 1920, height: 1080 });
// Hide webdriver flag
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
window.chrome = { runtime: {} };
});
// Intercept all GraphQL responses
page.on('response', async (response) => {
const url = response.url();
// Only capture GraphQL responses
if (!url.includes('graphql'))
return;
try {
const contentType = response.headers()['content-type'] || '';
if (!contentType.includes('application/json'))
return;
const data = await response.json();
// Extract operation name from URL if possible
const urlParams = new URLSearchParams(url.split('?')[1] || '');
const operationName = urlParams.get('operationName') || 'Unknown';
capturedResponses.push({
operationName,
url: url.substring(0, 200),
data,
timestamp: new Date()
});
console.log(`📡 Captured: ${operationName}`);
// Check for product data
if (data?.data?.filteredProducts?.products) {
const products = data.data.filteredProducts.products;
console.log(` Found ${products.length} products`);
}
}
catch (e) {
// Ignore parse errors
}
});
console.log('Navigating to page...');
await page.goto(menuUrl, {
waitUntil: 'networkidle2',
timeout: 90000
});
// Check if it's a Dutchie menu
const isDutchie = await page.evaluate(() => {
return typeof window.reactEnv !== 'undefined';
});
if (isDutchie) {
console.log('✅ Dutchie menu detected\n');
// Get environment info
const reactEnv = await page.evaluate(() => window.reactEnv);
console.log('Dutchie Environment:');
console.log(` dispensaryId: ${reactEnv?.dispensaryId}`);
console.log(` retailerId: ${reactEnv?.retailerId}`);
console.log(` chainId: ${reactEnv?.chainId}`);
}
// Scroll to trigger lazy loading
console.log('\nScrolling to load more products...');
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
await new Promise(r => setTimeout(r, 3000));
// Click on a category to trigger more loads
const categoryLinks = await page.$$('a[href*="/products/"]');
if (categoryLinks.length > 0) {
console.log(`Found ${categoryLinks.length} category links, clicking first one...`);
try {
await categoryLinks[0].click();
await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 });
}
catch (e) {
console.log('Category navigation failed, continuing...');
}
}
// Wait a bit more for any final responses
await new Promise(r => setTimeout(r, 2000));
console.log(`\n${'='.repeat(80)}`);
console.log(`CAPTURED ${capturedResponses.length} GRAPHQL RESPONSES`);
console.log('='.repeat(80));
// Find product data
let productSchema = null;
let sampleProduct = null;
for (const resp of capturedResponses) {
console.log(`\n${resp.operationName}:`);
console.log(` URL: ${resp.url.substring(0, 100)}...`);
if (resp.data?.data?.filteredProducts?.products) {
const products = resp.data.data.filteredProducts.products;
console.log(` ✅ Contains ${products.length} products`);
if (products.length > 0 && !sampleProduct) {
sampleProduct = products[0];
productSchema = extractSchema(products[0]);
}
}
// Show top-level data keys
if (resp.data?.data) {
console.log(` Data keys: ${Object.keys(resp.data.data).join(', ')}`);
}
}
// Output the product schema
if (productSchema) {
console.log('\n' + '='.repeat(80));
console.log('PRODUCT SCHEMA (from first product):');
console.log('='.repeat(80));
console.log(JSON.stringify(productSchema, null, 2));
console.log('\n' + '='.repeat(80));
console.log('SAMPLE PRODUCT:');
console.log('='.repeat(80));
console.log(JSON.stringify(sampleProduct, null, 2));
// Save to file
const outputData = {
capturedAt: new Date().toISOString(),
menuUrl,
schema: productSchema,
sampleProduct,
allResponses: capturedResponses.map(r => ({
operationName: r.operationName,
dataKeys: r.data?.data ? Object.keys(r.data.data) : [],
productCount: r.data?.data?.filteredProducts?.products?.length || 0
}))
};
const outputPath = '/tmp/dutchie-schema-capture.json';
fs.writeFileSync(outputPath, JSON.stringify(outputData, null, 2));
console.log(`\nSaved capture to: ${outputPath}`);
}
else {
console.log('\n❌ No product data captured');
// Debug: show all responses
console.log('\nAll captured responses:');
for (const resp of capturedResponses) {
console.log(`\n${resp.operationName}:`);
console.log(JSON.stringify(resp.data, null, 2).substring(0, 500));
}
}
}
catch (error) {
console.error('Error:', error.message);
}
finally {
if (browser) {
await browser.close();
}
}
}
/**
* Extract schema from an object (field names + types)
*/
function extractSchema(obj, prefix = '') {
if (obj === null)
return { type: 'null' };
if (obj === undefined)
return { type: 'undefined' };
if (Array.isArray(obj)) {
if (obj.length === 0)
return { type: 'array', items: 'unknown' };
return {
type: 'array',
items: extractSchema(obj[0], prefix + '[]')
};
}
if (typeof obj === 'object') {
const schema = { type: 'object', properties: {} };
for (const [key, value] of Object.entries(obj)) {
schema.properties[key] = extractSchema(value, prefix ? `${prefix}.${key}` : key);
}
return schema;
}
return { type: typeof obj, example: String(obj).substring(0, 100) };
}
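// Illustrative output for a tiny hypothetical product object:
//   extractSchema({ Name: 'OG Kush', Prices: [45] })
//   → { type: 'object', properties: {
//        Name:   { type: 'string', example: 'OG Kush' },
//        Prices: { type: 'array', items: { type: 'number', example: '45' } } } }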
// Run
const url = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
captureSchema(url).catch(console.error);

View File

@@ -0,0 +1,56 @@
"use strict";
/**
* Seed crawl: trigger dutchie crawls for all dispensaries with menu_type='dutchie'
* and a resolved platform_dispensary_id. This uses the AZ orchestrator endpoint logic.
*
* Usage (local):
* node dist/scripts/crawl-all-dutchie.js
*
* Requires:
* - DATABASE_URL/CRAWLSY_DATABASE_URL pointing to the consolidated DB
* - Dispensaries table populated with menu_type and platform_dispensary_id
*/
Object.defineProperty(exports, "__esModule", { value: true });
const connection_1 = require("../dutchie-az/db/connection");
const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
async function main() {
const { rows } = await (0, connection_1.query)(`
SELECT id, name, slug, platform_dispensary_id
FROM dispensaries
WHERE menu_type = 'dutchie'
AND platform_dispensary_id IS NOT NULL
ORDER BY id
`);
if (!rows.length) {
console.log('No dutchie dispensaries with resolved platform_dispensary_id found.');
process.exit(0);
}
console.log(`Found ${rows.length} dutchie dispensaries with resolved IDs. Triggering crawls...`);
let success = 0;
let failed = 0;
for (const row of rows) {
try {
console.log(`Crawling ${row.id} (${row.name})...`);
const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(row.id);
const ok = result.status === 'success' ||
result.status === 'sandbox_only' ||
result.status === 'detection_only';
if (ok) {
success++;
}
else {
failed++;
console.warn(`Crawl returned status ${result.status} for ${row.id} (${row.name})`);
}
}
catch (err) {
failed++;
console.error(`Failed crawl for ${row.id} (${row.name}): ${err.message}`);
}
}
console.log(`Completed. Success: ${success}, Failed: ${failed}`);
}
main().catch((err) => {
console.error('Fatal:', err);
process.exit(1);
});

View File

@@ -0,0 +1,24 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
// Run 5 crawlers sequentially to avoid OOM
const dispensaryIds = [112, 81, 115, 140, 177];
async function run() {
console.log('Starting 5 crawlers SEQUENTIALLY...');
for (const id of dispensaryIds) {
console.log(`\n=== Starting crawler for dispensary ${id} ===`);
try {
const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(id);
console.log(` Status: ${result.status}`);
console.log(` Summary: ${result.summary}`);
if (result.productsFound) {
console.log(` Products: ${result.productsFound} found, ${result.productsNew} new, ${result.productsUpdated} updated`);
}
}
catch (e) {
console.log(` ERROR: ${e.message}`);
}
}
console.log('\n=== All 5 crawlers complete ===');
}
run().catch(e => console.log('Fatal:', e.message));

backend/dist/scripts/parallel-scrape.js vendored Normal file
View File

@@ -0,0 +1,181 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
const NUM_WORKERS = parseInt(process.argv[2] || '15');
const DISPENSARY_NAME = process.argv[3] || 'Deeply Rooted';
const USE_PROXIES = process.argv[4] !== 'no-proxy';
async function getStore(name) {
const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${name}%`]);
return result.rows[0] || null;
}
async function getCategories(storeId) {
const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [storeId]);
return result.rows;
}
async function scrapeWithProxy(workerId, store, category) {
let browser = null;
let proxyId = null;
try {
// Get a proxy (if enabled)
let proxy = null;
if (USE_PROXIES) {
proxy = await (0, proxy_1.getActiveProxy)();
if (proxy) {
proxyId = proxy.id;
console.log(`[Worker ${workerId}] Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
}
else {
console.log(`[Worker ${workerId}] No proxy available, using direct connection`);
}
}
else {
console.log(`[Worker ${workerId}] Direct connection (proxies disabled)`);
}
// Build browser args
const args = [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920,1080',
];
if (proxy) {
if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
}
else {
args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
}
}
browser = await puppeteer_extra_1.default.launch({
headless: true,
args,
executablePath: process.env.PUPPETEER_EXECUTABLE_PATH,
});
const page = await browser.newPage();
await page.setUserAgent(FIREFOX_USER_AGENT);
await page.setViewport({ width: 1920, height: 1080 });
// Handle proxy auth if needed
if (proxy?.username && proxy?.password) {
await page.authenticate({
username: proxy.username,
password: proxy.password,
});
}
console.log(`[Worker ${workerId}] Scraping category: ${category.name} (${category.url})`);
// Navigate to the category page
const response = await page.goto(category.url, {
waitUntil: 'networkidle2',
timeout: 60000,
});
if (!response || !response.ok()) {
throw new Error(`Failed to load page: ${response?.status()}`);
}
// Wait for products to load
await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
timeout: 30000,
}).catch(() => {
console.log(`[Worker ${workerId}] No products found on page`);
});
// Extract products
const products = await page.evaluate(() => {
// Try data-testid first, then fall back to product links
const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
if (listItems.length > 0)
return listItems.length;
return document.querySelectorAll('a[href*="/product/"]').length;
});
console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
await browser.close();
return { success: true, products };
}
catch (error) {
console.error(`[Worker ${workerId}] Error:`, error.message);
// Check for bot detection
if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
(0, proxy_1.putProxyInTimeout)(proxyId, error.message);
}
if (browser) {
await browser.close().catch(() => { });
}
return { success: false, products: 0, error: error.message };
}
}
async function worker(workerId, store, categories, categoryIndex) {
while (categoryIndex.current < categories.length) {
const idx = categoryIndex.current++;
const category = categories[idx];
if (!category)
break;
console.log(`[Worker ${workerId}] Starting category ${idx + 1}/${categories.length}: ${category.name}`);
const result = await scrapeWithProxy(workerId, store, category);
if (result.success) {
console.log(`[Worker ${workerId}] Completed ${category.name}: ${result.products} products`);
}
else {
console.log(`[Worker ${workerId}] Failed ${category.name}: ${result.error}`);
}
// Small delay between requests
await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
}
console.log(`[Worker ${workerId}] Finished all assigned work`);
}
async function main() {
console.log(`\n${'='.repeat(60)}`);
console.log(`Parallel Scraper - ${NUM_WORKERS} workers`);
console.log(`Target: ${DISPENSARY_NAME}`);
console.log(`User Agent: Firefox`);
console.log(`Proxies: ${USE_PROXIES ? 'Enabled' : 'Disabled'}`);
console.log(`${'='.repeat(60)}\n`);
// Find the store
const store = await getStore(DISPENSARY_NAME);
if (!store) {
console.error(`Store not found: ${DISPENSARY_NAME}`);
process.exit(1);
}
console.log(`Found store: ${store.name} (ID: ${store.id})`);
// Get categories
const categories = await getCategories(store.id);
if (categories.length === 0) {
console.error('No categories found for this store');
process.exit(1);
}
console.log(`Found ${categories.length} categories to scrape`);
console.log(`Categories: ${categories.map(c => c.name).join(', ')}\n`);
// Check proxies
const proxyResult = await migrate_1.pool.query('SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE active = true) as active FROM proxies');
console.log(`Proxies: ${proxyResult.rows[0].active} active / ${proxyResult.rows[0].total} total\n`);
// Shared index for work distribution
const categoryIndex = { current: 0 };
// For a store with few categories, we'll run multiple passes
// Expand the work by duplicating categories for parallel workers
const expandedCategories = [];
const passes = Math.ceil(NUM_WORKERS / Math.max(categories.length, 1));
for (let i = 0; i < passes; i++) {
expandedCategories.push(...categories);
}
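// e.g. 15 workers over 3 categories → passes = Math.ceil(15 / 3) = 5, so
// expandedCategories holds 15 entries (each category gets scraped 5 times).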
console.log(`Running ${NUM_WORKERS} workers across ${expandedCategories.length} category scrapes\n`);
// Start workers
const workers = [];
for (let i = 0; i < NUM_WORKERS; i++) {
workers.push(worker(i + 1, store, expandedCategories, categoryIndex));
// Stagger worker starts
await new Promise(resolve => setTimeout(resolve, 500));
}
// Wait for all workers
await Promise.all(workers);
console.log(`\n${'='.repeat(60)}`);
console.log('All workers completed!');
console.log(`${'='.repeat(60)}\n`);
await migrate_1.pool.end();
}
main().catch(console.error);

View File

@@ -0,0 +1,344 @@
#!/usr/bin/env npx tsx
"use strict";
/**
* Queue Dispensaries Script
*
* Orchestrates the multi-provider crawler system:
* 1. Queue dispensaries that need provider detection
* 2. Queue Dutchie dispensaries for production crawl
* 3. Queue sandbox dispensaries for learning crawls
*
* Usage:
* npx tsx src/scripts/queue-dispensaries.ts [--detection] [--production] [--sandbox] [--all]
* npx tsx src/scripts/queue-dispensaries.ts --dry-run
* npx tsx src/scripts/queue-dispensaries.ts --process # Process queued jobs
*/
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const crawler_jobs_1 = require("../services/crawler-jobs");
// Parse command line args
const args = process.argv.slice(2);
const flags = {
detection: args.includes('--detection') || args.includes('--all'),
production: args.includes('--production') || args.includes('--all'),
sandbox: args.includes('--sandbox') || args.includes('--all'),
dryRun: args.includes('--dry-run'),
process: args.includes('--process'),
help: args.includes('--help') || args.includes('-h'),
limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
};
// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
flags.detection = true;
flags.production = true;
flags.sandbox = true;
}
async function showHelp() {
console.log(`
Queue Dispensaries - Multi-Provider Crawler Orchestration
USAGE:
npx tsx src/scripts/queue-dispensaries.ts [OPTIONS]
OPTIONS:
--detection Queue dispensaries that need provider detection
--production Queue Dutchie production crawls
--sandbox Queue sandbox/learning crawls
--all Queue all job types (default if no specific flag)
--process Process queued jobs instead of just queuing
--dry-run Show what would be queued without making changes
--limit=N Maximum dispensaries to queue per type (default: 10)
--help, -h Show this help message
EXAMPLES:
# Queue all dispensaries for appropriate jobs
npx tsx src/scripts/queue-dispensaries.ts
# Only queue detection jobs
npx tsx src/scripts/queue-dispensaries.ts --detection --limit=20
# Dry run to see what would be queued
npx tsx src/scripts/queue-dispensaries.ts --dry-run
# Process sandbox jobs
npx tsx src/scripts/queue-dispensaries.ts --process
`);
}
async function queueDetectionJobs() {
console.log('\n📡 Queueing Detection Jobs...');
// Find dispensaries that need provider detection:
// - (menu_provider is null OR menu_provider_confidence < 70), AND
// - crawler_status is idle (not already queued/running), AND
// - has a website or menu URL
const query = `
SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence
FROM dispensaries
WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
AND crawler_status = 'idle'
AND (menu_provider IS NULL OR menu_provider_confidence < 70)
ORDER BY
CASE WHEN menu_provider IS NULL THEN 0 ELSE 1 END,
menu_provider_confidence ASC
LIMIT $1
`;
const result = await migrate_1.pool.query(query, [flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} dispensaries for detection:`);
for (const row of result.rows) {
console.log(` - [${row.id}] ${row.name} (current: ${row.menu_provider || 'unknown'}, confidence: ${row.menu_provider_confidence}%)`);
}
return result.rows.length;
}
let queued = 0;
for (const dispensary of result.rows) {
try {
// Update status to queued
await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`, [dispensary.id]);
// Create sandbox job for detection
await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
VALUES ($1, 'detection', 'pending', 10)`, [dispensary.id]);
console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
queued++;
}
catch (error) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
return queued;
}
async function queueProductionCrawls() {
console.log('\n🏭 Queueing Production Dutchie Crawls...');
// Find Dutchie dispensaries ready for production crawl:
// - menu_provider = 'dutchie'
// - crawler_mode = 'production'
// - crawler_status is idle
// - last_menu_scrape is old or null
const query = `
SELECT d.id, d.name, d.last_menu_scrape, d.menu_url
FROM dispensaries d
WHERE d.menu_provider = 'dutchie'
AND d.crawler_mode = 'production'
AND d.crawler_status = 'idle'
AND (d.last_menu_scrape IS NULL OR d.last_menu_scrape < NOW() - INTERVAL '4 hours')
ORDER BY
CASE WHEN d.last_menu_scrape IS NULL THEN 0 ELSE 1 END,
d.last_menu_scrape ASC
LIMIT $1
`;
const result = await migrate_1.pool.query(query, [flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} Dutchie dispensaries for production crawl:`);
for (const row of result.rows) {
const lastScrape = row.last_menu_scrape ? new Date(row.last_menu_scrape).toISOString() : 'never';
console.log(` - [${row.id}] ${row.name} (last scrape: ${lastScrape})`);
}
return result.rows.length;
}
let queued = 0;
for (const dispensary of result.rows) {
try {
// Update status to queued
await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`, [dispensary.id]);
// Create crawl job in the main crawl_jobs table (production queue)
await migrate_1.pool.query(`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries')
FROM stores s
JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
WHERE d.id = $1
LIMIT 1`, [dispensary.id]);
console.log(` ✓ Queued production crawl: [${dispensary.id}] ${dispensary.name}`);
queued++;
}
catch (error) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
return queued;
}
async function queueSandboxCrawls() {
console.log('\n🧪 Queueing Sandbox Crawls...');
// Find sandbox dispensaries needing crawls:
// - crawler_mode = 'sandbox'
// - crawler_status in (idle, error_needs_review)
// - No recent sandbox job
const query = `
SELECT d.id, d.name, d.menu_provider, d.crawler_status, d.website
FROM dispensaries d
WHERE d.crawler_mode = 'sandbox'
AND d.crawler_status IN ('idle', 'error_needs_review')
AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
AND NOT EXISTS (
SELECT 1 FROM sandbox_crawl_jobs sj
WHERE sj.dispensary_id = d.id
AND sj.status IN ('pending', 'running')
)
ORDER BY d.updated_at ASC
LIMIT $1
`;
const result = await migrate_1.pool.query(query, [flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} dispensaries for sandbox crawl:`);
for (const row of result.rows) {
console.log(` - [${row.id}] ${row.name} (provider: ${row.menu_provider || 'unknown'}, status: ${row.crawler_status})`);
}
return result.rows.length;
}
let queued = 0;
for (const dispensary of result.rows) {
try {
// Update status
await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`, [dispensary.id]);
// Create sandbox job
await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
VALUES ($1, 'deep_crawl', 'pending', 5)`, [dispensary.id]);
console.log(` ✓ Queued sandbox crawl: [${dispensary.id}] ${dispensary.name}`);
queued++;
}
catch (error) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
return queued;
}
async function processJobs() {
console.log('\n⚙ Processing Queued Jobs...\n');
// Process sandbox jobs (detection + sandbox crawls)
const sandboxJobs = await migrate_1.pool.query(`SELECT * FROM sandbox_crawl_jobs
WHERE status = 'pending'
ORDER BY priority DESC, scheduled_at ASC
LIMIT $1`, [flags.limit]);
console.log(`Found ${sandboxJobs.rows.length} pending sandbox jobs\n`);
for (const job of sandboxJobs.rows) {
console.log(`Processing job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}...`);
try {
// Mark as running
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW() WHERE id = $1`, [job.id]);
let result;
if (job.job_type === 'detection') {
result = await (0, crawler_jobs_1.runDetectMenuProviderJob)(job.dispensary_id);
}
else {
result = await (0, crawler_jobs_1.runSandboxCrawlJob)(job.dispensary_id, job.sandbox_id);
}
// Update job status
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
WHERE id = $4`, [
result.success ? 'completed' : 'failed',
JSON.stringify(result.data || {}),
result.success ? null : result.message,
job.id,
]);
console.log(` ${result.success ? '✓' : '✗'} ${result.message}\n`);
}
catch (error) {
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]);
console.log(` ✗ Error: ${error.message}\n`);
}
}
}
async function showStats() {
console.log('\n📊 Current Stats:');
// Dispensary stats
const stats = await migrate_1.pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE menu_provider IS NULL) as no_provider,
COUNT(*) FILTER (WHERE menu_provider = 'dutchie') as dutchie,
COUNT(*) FILTER (WHERE menu_provider NOT IN ('dutchie', 'unknown') AND menu_provider IS NOT NULL) as other_providers,
COUNT(*) FILTER (WHERE menu_provider = 'unknown') as unknown,
COUNT(*) FILTER (WHERE crawler_mode = 'production') as production_mode,
COUNT(*) FILTER (WHERE crawler_mode = 'sandbox') as sandbox_mode,
COUNT(*) FILTER (WHERE crawler_status = 'idle') as idle,
COUNT(*) FILTER (WHERE crawler_status LIKE 'queued%') as queued,
COUNT(*) FILTER (WHERE crawler_status = 'running') as running,
COUNT(*) FILTER (WHERE crawler_status = 'ok') as ok,
COUNT(*) FILTER (WHERE crawler_status = 'error_needs_review') as needs_review
FROM dispensaries
`);
const s = stats.rows[0];
console.log(`
Dispensaries: ${s.total}
- No provider detected: ${s.no_provider}
- Dutchie: ${s.dutchie}
- Other providers: ${s.other_providers}
- Unknown: ${s.unknown}
Crawler Mode:
- Production: ${s.production_mode}
- Sandbox: ${s.sandbox_mode}
Status:
- Idle: ${s.idle}
- Queued: ${s.queued}
- Running: ${s.running}
- OK: ${s.ok}
- Needs Review: ${s.needs_review}
`);
// Job stats
const jobStats = await migrate_1.pool.query(`
SELECT
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed
FROM sandbox_crawl_jobs
`);
const j = jobStats.rows[0];
console.log(` Sandbox Jobs:
- Pending: ${j.pending}
- Running: ${j.running}
- Completed: ${j.completed}
- Failed: ${j.failed}
`);
}
async function main() {
if (flags.help) {
await showHelp();
process.exit(0);
}
console.log('═══════════════════════════════════════════════════════');
console.log(' Multi-Provider Crawler Queue Manager');
console.log('═══════════════════════════════════════════════════════');
if (flags.dryRun) {
console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
}
try {
// Show current stats first
await showStats();
if (flags.process) {
// Process mode - run jobs instead of queuing
await processJobs();
}
else {
// Queuing mode
let totalQueued = 0;
if (flags.detection) {
totalQueued += await queueDetectionJobs();
}
if (flags.production) {
totalQueued += await queueProductionCrawls();
}
if (flags.sandbox) {
totalQueued += await queueSandboxCrawls();
}
console.log('\n═══════════════════════════════════════════════════════');
console.log(` Total dispensaries queued: ${totalQueued}`);
console.log('═══════════════════════════════════════════════════════\n');
}
// Show updated stats
if (!flags.dryRun) {
await showStats();
}
}
catch (error) {
console.error('Fatal error:', error);
process.exit(1);
}
finally {
await migrate_1.pool.end();
}
}
main();

View File

@@ -0,0 +1,473 @@
#!/usr/bin/env npx tsx
"use strict";
/**
* Queue Intelligence Script
*
* Orchestrates the multi-category intelligence crawler system:
* 1. Queue dispensaries that need provider detection (all 4 categories)
* 2. Queue per-category production crawls (Dutchie products only for now)
* 3. Queue per-category sandbox crawls (all providers)
*
* Each category (product, specials, brand, metadata) is handled independently.
* A failure in one category does NOT affect other categories.
*
* Usage:
* npx tsx src/scripts/queue-intelligence.ts [--detection] [--production] [--sandbox] [--all]
* npx tsx src/scripts/queue-intelligence.ts --category=product --sandbox
* npx tsx src/scripts/queue-intelligence.ts --process --category=product
* npx tsx src/scripts/queue-intelligence.ts --dry-run
*/
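// Per-category state is read from columns on the dispensaries table. Based on the queries below,
// each category gets its own column family (sketch only, not authoritative DDL):
//
//   <category>_provider       TEXT        -- e.g. 'dutchie', 'treez', 'unknown'
//   <category>_confidence     INTEGER     -- 0-100 detection confidence
//   <category>_crawler_mode   TEXT        -- 'production' | 'sandbox'
//   last_<category>_scan_at   TIMESTAMP
//
// for <category> in: product, specials, brand, metadata.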
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const intelligence_detector_1 = require("../services/intelligence-detector");
const category_crawler_jobs_1 = require("../services/category-crawler-jobs");
// Parse command line args
const args = process.argv.slice(2);
const flags = {
detection: args.includes('--detection') || args.includes('--all'),
production: args.includes('--production') || args.includes('--all'),
sandbox: args.includes('--sandbox') || args.includes('--all'),
dryRun: args.includes('--dry-run'),
process: args.includes('--process'),
help: args.includes('--help') || args.includes('-h'),
limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
category: args.find(a => a.startsWith('--category='))?.split('=')[1],
dispensary: parseInt(args.find(a => a.startsWith('--dispensary='))?.split('=')[1] || '0'),
};
// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
flags.detection = true;
flags.production = true;
flags.sandbox = true;
}
const CATEGORIES = ['product', 'specials', 'brand', 'metadata'];
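// Note: these category names are later interpolated into SQL identifiers (e.g. `${cat}_provider`),
// so a user-supplied --category value should be one of the entries above; anything else would
// produce an invalid or unintended column reference.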
async function showHelp() {
console.log(`
Queue Intelligence - Multi-Category Crawler Orchestration
USAGE:
npx tsx src/scripts/queue-intelligence.ts [OPTIONS]
OPTIONS:
--detection Queue dispensaries that need multi-category detection
--production Queue per-category production crawls
--sandbox Queue per-category sandbox crawls
--all Queue all job types (default if no specific flag)
--process Process queued jobs instead of just queuing
--category=CATEGORY Filter to specific category (product|specials|brand|metadata)
--dispensary=ID Process only a specific dispensary
--dry-run Show what would be queued without making changes
--limit=N Maximum dispensaries to queue per type (default: 10)
--help, -h Show this help message
CATEGORIES:
product - Product/menu data (Dutchie=production, others=sandbox)
specials - Deals and specials (all sandbox for now)
brand - Brand intelligence (all sandbox for now)
metadata - Categories/taxonomy (all sandbox for now)
EXAMPLES:
# Queue all dispensaries for appropriate jobs
npx tsx src/scripts/queue-intelligence.ts
# Only queue product detection jobs
npx tsx src/scripts/queue-intelligence.ts --detection --category=product
# Process sandbox jobs for specials category
npx tsx src/scripts/queue-intelligence.ts --process --category=specials --limit=5
# Run full detection for a specific dispensary
npx tsx src/scripts/queue-intelligence.ts --process --detection --dispensary=123
# Dry run to see what would be queued
npx tsx src/scripts/queue-intelligence.ts --dry-run
`);
}
async function queueMultiCategoryDetection() {
console.log('\n📡 Queueing Multi-Category Detection Jobs...');
// Find dispensaries that need provider detection for any category:
// - any category's *_provider is null OR its *_confidence < 70, AND
// - has a website or menu URL
const query = `
SELECT id, name, website, menu_url,
product_provider, product_confidence, product_crawler_mode,
specials_provider, specials_confidence, specials_crawler_mode,
brand_provider, brand_confidence, brand_crawler_mode,
metadata_provider, metadata_confidence, metadata_crawler_mode
FROM dispensaries
WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
AND (
product_provider IS NULL OR product_confidence < 70 OR
specials_provider IS NULL OR specials_confidence < 70 OR
brand_provider IS NULL OR brand_confidence < 70 OR
metadata_provider IS NULL OR metadata_confidence < 70
)
ORDER BY
CASE WHEN product_provider IS NULL THEN 0 ELSE 1 END,
product_confidence ASC
LIMIT $1
`;
const result = await migrate_1.pool.query(query, [flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} dispensaries for multi-category detection:`);
for (const row of result.rows) {
const needsDetection = [];
if (!row.product_provider || row.product_confidence < 70)
needsDetection.push('product');
if (!row.specials_provider || row.specials_confidence < 70)
needsDetection.push('specials');
if (!row.brand_provider || row.brand_confidence < 70)
needsDetection.push('brand');
if (!row.metadata_provider || row.metadata_confidence < 70)
needsDetection.push('metadata');
console.log(` - [${row.id}] ${row.name} (needs: ${needsDetection.join(', ')})`);
}
return result.rows.length;
}
let queued = 0;
for (const dispensary of result.rows) {
try {
// Create detection jobs for each category that needs it
for (const category of CATEGORIES) {
const provider = dispensary[`${category}_provider`];
const confidence = dispensary[`${category}_confidence`];
if (!provider || confidence < 70) {
await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, category, job_type, status, priority)
VALUES ($1, $2, 'detection', 'pending', 10)
ON CONFLICT DO NOTHING`, [dispensary.id, category]);
}
}
console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
queued++;
}
catch (error) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
return queued;
}
async function queueCategoryProductionCrawls(category) {
const categories = category ? [category] : CATEGORIES;
let totalQueued = 0;
for (const cat of categories) {
console.log(`\n🏭 Queueing Production ${cat.toUpperCase()} Crawls...`);
// For now, only products have production-ready crawlers (Dutchie only)
if (cat !== 'product') {
console.log(` ⏭️ No production crawler for ${cat} yet - skipping`);
continue;
}
// Find dispensaries ready for production crawl
const query = `
SELECT id, name, ${cat}_provider as provider, last_${cat}_scan_at as last_scan
FROM dispensaries
WHERE ${cat}_provider = 'dutchie'
AND ${cat}_crawler_mode = 'production'
AND ${cat}_confidence >= 70
AND (last_${cat}_scan_at IS NULL OR last_${cat}_scan_at < NOW() - INTERVAL '4 hours')
ORDER BY
CASE WHEN last_${cat}_scan_at IS NULL THEN 0 ELSE 1 END,
last_${cat}_scan_at ASC
LIMIT $1
`;
const result = await migrate_1.pool.query(query, [flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} dispensaries for ${cat} production crawl:`);
for (const row of result.rows) {
const lastScan = row.last_scan ? new Date(row.last_scan).toISOString() : 'never';
console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, last: ${lastScan})`);
}
totalQueued += result.rows.length;
continue;
}
for (const dispensary of result.rows) {
try {
// For products, use the existing crawl_jobs table for production
await migrate_1.pool.query(`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
jsonb_build_object('dispensary_id', $1, 'category', $2, 'source', 'queue-intelligence')
FROM stores s
JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
WHERE d.id = $1
LIMIT 1`, [dispensary.id, cat]);
console.log(` ✓ Queued ${cat} production: [${dispensary.id}] ${dispensary.name}`);
totalQueued++;
}
catch (error) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
}
return totalQueued;
}
async function queueCategorySandboxCrawls(category) {
const categories = category ? [category] : CATEGORIES;
let totalQueued = 0;
for (const cat of categories) {
console.log(`\n🧪 Queueing Sandbox ${cat.toUpperCase()} Crawls...`);
// Find dispensaries in sandbox mode for this category
const query = `
SELECT d.id, d.name, d.${cat}_provider as provider, d.${cat}_confidence as confidence,
d.website, d.menu_url
FROM dispensaries d
WHERE d.${cat}_crawler_mode = 'sandbox'
AND d.${cat}_provider IS NOT NULL
AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
AND NOT EXISTS (
SELECT 1 FROM sandbox_crawl_jobs sj
WHERE sj.dispensary_id = d.id
AND sj.category = $1
AND sj.status IN ('pending', 'running')
)
ORDER BY d.${cat}_confidence DESC, d.updated_at ASC
LIMIT $2
`;
const result = await migrate_1.pool.query(query, [cat, flags.limit]);
if (flags.dryRun) {
console.log(` Would queue ${result.rows.length} dispensaries for ${cat} sandbox crawl:`);
for (const row of result.rows) {
console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, confidence: ${row.confidence}%)`);
}
totalQueued += result.rows.length;
continue;
}
for (const dispensary of result.rows) {
try {
// Create sandbox entry if needed
const sandboxResult = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, mode, status)
VALUES ($1, $2, $3, 'template_learning', 'pending')
ON CONFLICT (dispensary_id, category) WHERE status NOT IN ('moved_to_production', 'failed')
DO UPDATE SET updated_at = NOW()
RETURNING id`, [dispensary.id, cat, dispensary.provider]);
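// The partial conflict target above ("ON CONFLICT (dispensary_id, category) WHERE status NOT IN (...)")
// only resolves if a matching partial unique index exists on crawler_sandboxes. Hypothetical DDL for
// illustration — the real index, if any, is defined in the migrations:
//   CREATE UNIQUE INDEX crawler_sandboxes_active_uniq
//     ON crawler_sandboxes (dispensary_id, category)
//     WHERE status NOT IN ('moved_to_production', 'failed');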
const sandboxId = sandboxResult.rows[0]?.id;
// Create sandbox job
await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, job_type, status, priority)
VALUES ($1, $2, $3, 'crawl', 'pending', 5)`, [dispensary.id, sandboxId, cat]);
console.log(` ✓ Queued ${cat} sandbox: [${dispensary.id}] ${dispensary.name} (${dispensary.provider})`);
totalQueued++;
}
catch (error) {
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
}
}
}
return totalQueued;
}
async function processDetectionJobs() {
console.log('\n🔍 Processing Detection Jobs...');
// Get pending detection jobs
const jobs = await migrate_1.pool.query(`SELECT DISTINCT dispensary_id
FROM sandbox_crawl_jobs
WHERE job_type = 'detection' AND status = 'pending'
${flags.category ? `AND category = $2` : ''}
${flags.dispensary ? `AND dispensary_id = $${flags.category ? '3' : '2'}` : ''}
LIMIT $1`, flags.category
? (flags.dispensary ? [flags.limit, flags.category, flags.dispensary] : [flags.limit, flags.category])
: (flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit]));
for (const job of jobs.rows) {
console.log(`\nProcessing detection for dispensary ${job.dispensary_id}...`);
try {
// Get dispensary info
const dispResult = await migrate_1.pool.query('SELECT id, name, website, menu_url FROM dispensaries WHERE id = $1', [job.dispensary_id]);
const dispensary = dispResult.rows[0];
if (!dispensary) {
console.log(` ✗ Dispensary not found`);
continue;
}
const websiteUrl = dispensary.website || dispensary.menu_url;
if (!websiteUrl) {
console.log(` ✗ No website URL`);
continue;
}
// Mark jobs as running
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW()
WHERE dispensary_id = $1 AND job_type = 'detection' AND status = 'pending'`, [job.dispensary_id]);
// Run multi-category detection
console.log(` Detecting providers for ${dispensary.name}...`);
const detection = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl, { timeout: 45000 });
// Update all categories
await (0, intelligence_detector_1.updateAllCategoryProviders)(job.dispensary_id, detection);
// Mark jobs as completed
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'completed', completed_at = NOW(),
result_summary = $1
WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`, [JSON.stringify({
product: { provider: detection.product.provider, confidence: detection.product.confidence },
specials: { provider: detection.specials.provider, confidence: detection.specials.confidence },
brand: { provider: detection.brand.provider, confidence: detection.brand.confidence },
metadata: { provider: detection.metadata.provider, confidence: detection.metadata.confidence },
}), job.dispensary_id]);
console.log(` ✓ Detection complete:`);
console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
}
catch (error) {
console.log(` ✗ Error: ${error.message}`);
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1
WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`, [error.message, job.dispensary_id]);
}
}
}
async function processCrawlJobs() {
const categories = flags.category ? [flags.category] : CATEGORIES;
for (const cat of categories) {
console.log(`\n⚙️ Processing ${cat.toUpperCase()} Crawl Jobs...\n`);
// Process sandbox jobs for this category (runs unless the caller asked for production only)
if (flags.sandbox || !flags.production) {
await (0, category_crawler_jobs_1.processCategorySandboxJobs)(cat, flags.limit);
}
// Process production jobs for this category
if (flags.production && cat === 'product') {
// Get pending production crawls
const prodJobs = await migrate_1.pool.query(`SELECT d.id
FROM dispensaries d
WHERE d.product_provider = 'dutchie'
AND d.product_crawler_mode = 'production'
AND d.product_confidence >= 70
${flags.dispensary ? 'AND d.id = $2' : ''}
LIMIT $1`, flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit]);
for (const job of prodJobs.rows) {
console.log(`Processing production ${cat} crawl for dispensary ${job.id}...`);
const result = await (0, category_crawler_jobs_1.runCrawlProductsJob)(job.id);
console.log(` ${result.success ? '✓' : '✗'} ${result.message}`);
}
}
}
}
async function processSpecificDispensary() {
if (!flags.dispensary)
return;
console.log(`\n🎯 Processing Dispensary ${flags.dispensary}...\n`);
const dispResult = await migrate_1.pool.query('SELECT * FROM dispensaries WHERE id = $1', [flags.dispensary]);
if (dispResult.rows.length === 0) {
console.log('Dispensary not found');
return;
}
const dispensary = dispResult.rows[0];
console.log(`Name: ${dispensary.name}`);
console.log(`Website: ${dispensary.website || dispensary.menu_url || 'none'}`);
console.log('');
if (flags.detection) {
console.log('Running multi-category detection...');
const websiteUrl = dispensary.website || dispensary.menu_url;
if (websiteUrl) {
const detection = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
await (0, intelligence_detector_1.updateAllCategoryProviders)(flags.dispensary, detection);
console.log('Detection results:');
console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
}
}
if (flags.production) {
console.log('\nRunning production crawls...');
const results = await (0, category_crawler_jobs_1.runAllCategoryProductionCrawls)(flags.dispensary);
console.log(` ${results.summary}`);
}
if (flags.sandbox) {
console.log('\nRunning sandbox crawls...');
const results = await (0, category_crawler_jobs_1.runAllCategorySandboxCrawls)(flags.dispensary);
console.log(` ${results.summary}`);
}
}
async function showStats() {
console.log('\n📊 Multi-Category Intelligence Stats:');
// Per-category stats
for (const cat of CATEGORIES) {
const stats = await migrate_1.pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE ${cat}_provider IS NULL) as no_provider,
COUNT(*) FILTER (WHERE ${cat}_provider = 'dutchie') as dutchie,
COUNT(*) FILTER (WHERE ${cat}_provider = 'treez') as treez,
COUNT(*) FILTER (WHERE ${cat}_provider NOT IN ('dutchie', 'treez', 'unknown') AND ${cat}_provider IS NOT NULL) as other,
COUNT(*) FILTER (WHERE ${cat}_provider = 'unknown') as unknown,
COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'production') as production,
COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'sandbox') as sandbox,
AVG(${cat}_confidence) as avg_confidence
FROM dispensaries
`);
const s = stats.rows[0];
console.log(`
${cat.toUpperCase()}:
Providers: Dutchie=${s.dutchie}, Treez=${s.treez}, Other=${s.other}, Unknown=${s.unknown}, None=${s.no_provider}
Modes: Production=${s.production}, Sandbox=${s.sandbox}
Avg Confidence: ${Math.round(s.avg_confidence || 0)}%`);
}
// Job stats per category
console.log('\n Sandbox Jobs by Category:');
const jobStats = await migrate_1.pool.query(`
SELECT
category,
COUNT(*) FILTER (WHERE status = 'pending') as pending,
COUNT(*) FILTER (WHERE status = 'running') as running,
COUNT(*) FILTER (WHERE status = 'completed') as completed,
COUNT(*) FILTER (WHERE status = 'failed') as failed
FROM sandbox_crawl_jobs
GROUP BY category
ORDER BY category
`);
for (const row of jobStats.rows) {
console.log(` ${row.category}: pending=${row.pending}, running=${row.running}, completed=${row.completed}, failed=${row.failed}`);
}
}
async function main() {
if (flags.help) {
await showHelp();
process.exit(0);
}
console.log('═══════════════════════════════════════════════════════');
console.log(' Multi-Category Intelligence Queue Manager');
console.log('═══════════════════════════════════════════════════════');
if (flags.dryRun) {
console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
}
if (flags.category) {
console.log(`\n📌 Filtering to category: ${flags.category}\n`);
}
try {
// Show current stats first
await showStats();
// If specific dispensary specified, process it directly
if (flags.dispensary && flags.process) {
await processSpecificDispensary();
}
else if (flags.process) {
// Process mode - run jobs
if (flags.detection) {
await processDetectionJobs();
}
await processCrawlJobs();
}
else {
// Queuing mode
let totalQueued = 0;
if (flags.detection) {
totalQueued += await queueMultiCategoryDetection();
}
if (flags.production) {
totalQueued += await queueCategoryProductionCrawls(flags.category);
}
if (flags.sandbox) {
totalQueued += await queueCategorySandboxCrawls(flags.category);
}
console.log('\n═══════════════════════════════════════════════════════');
console.log(` Total queued: ${totalQueued}`);
console.log('═══════════════════════════════════════════════════════\n');
}
// Show updated stats
if (!flags.dryRun) {
await showStats();
}
}
catch (error) {
console.error('Fatal error:', error);
process.exit(1);
}
finally {
await migrate_1.pool.end();
}
}
main();

View File

@@ -0,0 +1,125 @@
"use strict";
/**
* Run Dutchie GraphQL Scrape
*
* This script demonstrates the full pipeline:
* 1. Puppeteer navigates to Dutchie menu
* 2. GraphQL responses are intercepted
* 3. Products are normalized to our schema
* 4. Products are upserted to database
* 5. Derived views (brands, categories, specials) are automatically updated
*/
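// For reference, GraphQL response interception in Puppeteer is typically wired up along these lines.
// This is only a sketch of the general pattern — scrapeDutchieMenu's actual implementation may differ,
// and capturedProducts here is a hypothetical accumulator:
//
//   page.on('response', async (response) => {
//     if (!response.url().includes('/graphql')) return;
//     try {
//       const body = await response.json();
//       const products = body?.data?.filteredProducts?.products;
//       if (products) capturedProducts.push(...products);
//     } catch { /* non-JSON or already-consumed responses are ignored */ }
//   });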
Object.defineProperty(exports, "__esModule", { value: true });
const pg_1 = require("pg");
const dutchie_graphql_1 = require("../scrapers/dutchie-graphql");
const DATABASE_URL = process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
async function main() {
const pool = new pg_1.Pool({ connectionString: DATABASE_URL });
try {
console.log('='.repeat(80));
console.log('DUTCHIE GRAPHQL SCRAPER - FULL PIPELINE TEST');
console.log('='.repeat(80));
console.log(`Database: ${DATABASE_URL.replace(/:[^:@]+@/, ':***@')}`);
// Configuration
const storeId = 1; // Deeply Rooted
const menuUrl = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
console.log(`\nStore ID: ${storeId}`);
console.log(`Menu URL: ${menuUrl}`);
console.log('\n' + '-'.repeat(80));
// Run the scrape
console.log('\n🚀 Starting scrape...\n');
const result = await (0, dutchie_graphql_1.scrapeDutchieMenu)(pool, storeId, menuUrl);
console.log('\n' + '-'.repeat(80));
console.log('📊 SCRAPE RESULTS:');
console.log('-'.repeat(80));
console.log(` Success: ${result.success}`);
console.log(` Products Found: ${result.productsFound}`);
console.log(` Inserted: ${result.inserted}`);
console.log(` Updated: ${result.updated}`);
if (result.error) {
console.log(` Error: ${result.error}`);
}
// Query derived views to show the result
if (result.success) {
console.log('\n' + '-'.repeat(80));
console.log('📈 DERIVED DATA (from products table):');
console.log('-'.repeat(80));
// Brands
const brandsResult = await pool.query(`
SELECT brand_name, product_count, min_price, max_price
FROM derived_brands
WHERE store_id = $1
ORDER BY product_count DESC
LIMIT 5
`, [storeId]);
console.log('\nTop 5 Brands:');
brandsResult.rows.forEach(row => {
console.log(` - ${row.brand_name}: ${row.product_count} products ($${row.min_price} - $${row.max_price})`);
});
// Specials
const specialsResult = await pool.query(`
SELECT name, brand, rec_price, rec_special_price, discount_percent
FROM current_specials
WHERE store_id = $1
LIMIT 5
`, [storeId]);
console.log('\nTop 5 Specials:');
if (specialsResult.rows.length === 0) {
console.log(' (No specials found - is_on_special may not be populated yet)');
}
else {
specialsResult.rows.forEach(row => {
console.log(`  - ${row.name} (${row.brand}): $${row.rec_price} → $${row.rec_special_price} (${row.discount_percent}% off)`);
});
}
// Categories
const categoriesResult = await pool.query(`
SELECT category_name, product_count
FROM derived_categories
WHERE store_id = $1
ORDER BY product_count DESC
LIMIT 5
`, [storeId]);
console.log('\nTop 5 Categories:');
if (categoriesResult.rows.length === 0) {
console.log(' (No categories found - subcategory may not be populated yet)');
}
else {
categoriesResult.rows.forEach(row => {
console.log(` - ${row.category_name}: ${row.product_count} products`);
});
}
// Sample product
const sampleResult = await pool.query(`
SELECT name, brand, subcategory, rec_price, rec_special_price, is_on_special, thc_percentage, status
FROM products
WHERE store_id = $1 AND subcategory IS NOT NULL
ORDER BY updated_at DESC
LIMIT 1
`, [storeId]);
if (sampleResult.rows.length > 0) {
const sample = sampleResult.rows[0];
console.log('\nSample Product (with new fields):');
console.log(` Name: ${sample.name}`);
console.log(` Brand: ${sample.brand}`);
console.log(` Category: ${sample.subcategory}`);
console.log(` Price: $${sample.rec_price}`);
console.log(` Sale Price: ${sample.rec_special_price ? `$${sample.rec_special_price}` : 'N/A'}`);
console.log(` On Special: ${sample.is_on_special}`);
console.log(` THC: ${sample.thc_percentage}%`);
console.log(` Status: ${sample.status}`);
}
}
console.log('\n' + '='.repeat(80));
console.log('✅ SCRAPE COMPLETE');
console.log('='.repeat(80));
}
catch (error) {
console.error('\n❌ Error:', error.message);
throw error;
}
finally {
await pool.end();
}
}
main().catch(console.error);

View File

@@ -0,0 +1,279 @@
"use strict";
/**
* Scrape ALL active products via direct GraphQL pagination
* This is more reliable than category navigation
*/
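// The pagination loop below relies on the FilteredProducts response having (at least) this shape —
// inferred from how the JSON is read further down, not from any published schema:
//
//   {
//     "data": {
//       "filteredProducts": {
//         "products": [ ... ],
//         "queryInfo": { "totalCount": 123 }
//       }
//     }
//   }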
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const pg_1 = require("pg");
const dutchie_graphql_1 = require("../scrapers/dutchie-graphql");
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
const DATABASE_URL = process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
async function scrapeAllProducts(menuUrl, storeId) {
const pool = new pg_1.Pool({ connectionString: DATABASE_URL });
const browser = await puppeteer_extra_1.default.launch({
headless: 'new',
args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
try {
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
console.log('Loading menu to establish session...');
await page.goto(menuUrl, {
waitUntil: 'networkidle2',
timeout: 60000,
});
await new Promise((r) => setTimeout(r, 3000));
const dispensaryId = await page.evaluate(() => window.reactEnv?.dispensaryId);
console.log('Dispensary ID:', dispensaryId);
// Paginate through all products
const allProducts = [];
let pageNum = 0;
const perPage = 100;
console.log('\nFetching all products via paginated GraphQL...');
while (true) {
const result = await page.evaluate(async (dispId, hash, page, perPage) => {
const variables = {
includeEnterpriseSpecials: false,
productsFilter: {
dispensaryId: dispId,
pricingType: 'rec',
Status: 'Active',
types: [],
useCache: false,
isDefaultSort: true,
sortBy: 'popularSortIdx',
sortDirection: 1,
bypassOnlineThresholds: true,
isKioskMenu: false,
removeProductsBelowOptionThresholds: false,
},
page,
perPage,
};
const qs = new URLSearchParams({
operationName: 'FilteredProducts',
variables: JSON.stringify(variables),
extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
});
const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
method: 'GET',
headers: {
'content-type': 'application/json',
'apollographql-client-name': 'Marketplace (production)',
},
credentials: 'include',
});
const json = await resp.json();
return {
products: json?.data?.filteredProducts?.products || [],
totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount,
};
}, dispensaryId, GRAPHQL_HASH, pageNum, perPage);
if (result.products.length === 0) {
break;
}
allProducts.push(...result.products);
console.log(`Page ${pageNum}: ${result.products.length} products (total so far: ${allProducts.length}/${result.totalCount})`);
pageNum++;
// Safety limit: stop after ~50 pages (≈5,000 products at 100 per page)
if (pageNum > 50) {
console.log('Reached page limit');
break;
}
}
console.log(`\nTotal products fetched: ${allProducts.length}`);
// Normalize and upsert
console.log('\nNormalizing and upserting to database...');
const normalized = allProducts.map(dutchie_graphql_1.normalizeDutchieProduct);
const client = await pool.connect();
let inserted = 0;
let updated = 0;
try {
await client.query('BEGIN');
for (const product of normalized) {
const result = await client.query(`
INSERT INTO products (
store_id, external_id, slug, name, enterprise_product_id,
brand, brand_external_id, brand_logo_url,
subcategory, strain_type, canonical_category,
price, rec_price, med_price, rec_special_price, med_special_price,
is_on_special, special_name, discount_percent, special_data,
sku, inventory_quantity, inventory_available, is_below_threshold, status,
thc_percentage, cbd_percentage, cannabinoids,
weight_mg, net_weight_value, net_weight_unit, options, raw_options,
image_url, additional_images,
is_featured, medical_only, rec_only,
source_created_at, source_updated_at,
description, raw_data,
dutchie_url, last_seen_at, updated_at
)
VALUES (
$1, $2, $3, $4, $5,
$6, $7, $8,
$9, $10, $11,
$12, $13, $14, $15, $16,
$17, $18, $19, $20,
$21, $22, $23, $24, $25,
$26, $27, $28,
$29, $30, $31, $32, $33,
$34, $35,
$36, $37, $38,
$39, $40,
$41, $42,
'', NOW(), NOW()
)
ON CONFLICT (store_id, slug) DO UPDATE SET
name = EXCLUDED.name,
enterprise_product_id = EXCLUDED.enterprise_product_id,
brand = EXCLUDED.brand,
brand_external_id = EXCLUDED.brand_external_id,
brand_logo_url = EXCLUDED.brand_logo_url,
subcategory = EXCLUDED.subcategory,
strain_type = EXCLUDED.strain_type,
canonical_category = EXCLUDED.canonical_category,
price = EXCLUDED.price,
rec_price = EXCLUDED.rec_price,
med_price = EXCLUDED.med_price,
rec_special_price = EXCLUDED.rec_special_price,
med_special_price = EXCLUDED.med_special_price,
is_on_special = EXCLUDED.is_on_special,
special_name = EXCLUDED.special_name,
discount_percent = EXCLUDED.discount_percent,
special_data = EXCLUDED.special_data,
sku = EXCLUDED.sku,
inventory_quantity = EXCLUDED.inventory_quantity,
inventory_available = EXCLUDED.inventory_available,
is_below_threshold = EXCLUDED.is_below_threshold,
status = EXCLUDED.status,
thc_percentage = EXCLUDED.thc_percentage,
cbd_percentage = EXCLUDED.cbd_percentage,
cannabinoids = EXCLUDED.cannabinoids,
weight_mg = EXCLUDED.weight_mg,
net_weight_value = EXCLUDED.net_weight_value,
net_weight_unit = EXCLUDED.net_weight_unit,
options = EXCLUDED.options,
raw_options = EXCLUDED.raw_options,
image_url = EXCLUDED.image_url,
additional_images = EXCLUDED.additional_images,
is_featured = EXCLUDED.is_featured,
medical_only = EXCLUDED.medical_only,
rec_only = EXCLUDED.rec_only,
source_created_at = EXCLUDED.source_created_at,
source_updated_at = EXCLUDED.source_updated_at,
description = EXCLUDED.description,
raw_data = EXCLUDED.raw_data,
last_seen_at = NOW(),
updated_at = NOW()
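          -- Commonly used trick: (xmax = 0) is true only for a freshly inserted row version,
          -- while the ON CONFLICT update path returns a nonzero xmax, distinguishing inserts from updates.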
RETURNING (xmax = 0) AS was_inserted
`, [
storeId,
product.external_id,
product.slug,
product.name,
product.enterprise_product_id,
product.brand,
product.brand_external_id,
product.brand_logo_url,
product.subcategory,
product.strain_type,
product.canonical_category,
product.price,
product.rec_price,
product.med_price,
product.rec_special_price,
product.med_special_price,
product.is_on_special,
product.special_name,
product.discount_percent,
product.special_data ? JSON.stringify(product.special_data) : null,
product.sku,
product.inventory_quantity,
product.inventory_available,
product.is_below_threshold,
product.status,
product.thc_percentage,
product.cbd_percentage,
product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
product.weight_mg,
product.net_weight_value,
product.net_weight_unit,
product.options,
product.raw_options,
product.image_url,
product.additional_images,
product.is_featured,
product.medical_only,
product.rec_only,
product.source_created_at,
product.source_updated_at,
product.description,
product.raw_data ? JSON.stringify(product.raw_data) : null,
]);
if (result.rows[0]?.was_inserted) {
inserted++;
}
else {
updated++;
}
}
await client.query('COMMIT');
}
catch (error) {
await client.query('ROLLBACK');
throw error;
}
finally {
client.release();
}
console.log(`\nDatabase: ${inserted} inserted, ${updated} updated`);
// Show summary stats
const stats = await pool.query(`
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE is_on_special) as specials,
COUNT(DISTINCT brand) as brands,
COUNT(DISTINCT subcategory) as categories
FROM products WHERE store_id = $1
`, [storeId]);
console.log('\nStore summary:');
console.log(` Total products: ${stats.rows[0].total}`);
console.log(` On special: ${stats.rows[0].specials}`);
console.log(` Unique brands: ${stats.rows[0].brands}`);
console.log(` Categories: ${stats.rows[0].categories}`);
return {
success: true,
totalProducts: allProducts.length,
inserted,
updated,
};
}
finally {
await browser.close();
await pool.end();
}
}
// Run
const menuUrl = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
const storeId = parseInt(process.argv[3] || '1', 10);
console.log('='.repeat(60));
console.log('DUTCHIE GRAPHQL FULL SCRAPE');
console.log('='.repeat(60));
console.log(`Menu URL: ${menuUrl}`);
console.log(`Store ID: ${storeId}`);
console.log('');
scrapeAllProducts(menuUrl, storeId)
.then((result) => {
console.log('\n' + '='.repeat(60));
console.log('COMPLETE');
console.log(JSON.stringify(result, null, 2));
})
.catch((error) => {
console.error('Error:', error.message);
process.exit(1);
});

backend/dist/scripts/test-dutchie-e2e.js
View File

@@ -0,0 +1,169 @@
"use strict";
/**
* Test script: End-to-end Dutchie GraphQL → DB → Dashboard flow
*
* This demonstrates the complete data pipeline:
* 1. Fetch one product from Dutchie GraphQL via Puppeteer
* 2. Normalize it to our schema
* 3. Show the mapping
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
const dutchie_graphql_1 = require("../scrapers/dutchie-graphql");
const fs = __importStar(require("fs"));
// Load the captured sample product from schema capture
const capturedData = JSON.parse(fs.readFileSync('/tmp/dutchie-schema-capture.json', 'utf-8'));
const sampleProduct = capturedData.sampleProduct;
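// The capture file is expected to look roughly like { "sampleProduct": { ...one raw FilteredProducts
// product... } }. The exact contents depend on the earlier schema-capture run; only the sampleProduct
// key is assumed here.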
console.log('='.repeat(80));
console.log('DUTCHIE GRAPHQL → DATABASE MAPPING DEMONSTRATION');
console.log('='.repeat(80));
console.log('\n📥 RAW DUTCHIE GRAPHQL PRODUCT:');
console.log('-'.repeat(80));
// Show key fields from raw product
const keyRawFields = {
'_id': sampleProduct._id,
'Name': sampleProduct.Name,
'cName': sampleProduct.cName,
'brandName': sampleProduct.brandName,
'brand.id': sampleProduct.brand?.id,
'type': sampleProduct.type,
'subcategory': sampleProduct.subcategory,
'strainType': sampleProduct.strainType,
'Prices': sampleProduct.Prices,
'recPrices': sampleProduct.recPrices,
'recSpecialPrices': sampleProduct.recSpecialPrices,
'special': sampleProduct.special,
'specialData.saleSpecials[0].specialName': sampleProduct.specialData?.saleSpecials?.[0]?.specialName,
'specialData.saleSpecials[0].discount': sampleProduct.specialData?.saleSpecials?.[0]?.discount,
'THCContent.range[0]': sampleProduct.THCContent?.range?.[0],
'CBDContent.range[0]': sampleProduct.CBDContent?.range?.[0],
'Status': sampleProduct.Status,
'Image': sampleProduct.Image,
'POSMetaData.canonicalSKU': sampleProduct.POSMetaData?.canonicalSKU,
'POSMetaData.children[0].quantity': sampleProduct.POSMetaData?.children?.[0]?.quantity,
'POSMetaData.children[0].quantityAvailable': sampleProduct.POSMetaData?.children?.[0]?.quantityAvailable,
};
Object.entries(keyRawFields).forEach(([key, value]) => {
console.log(` ${key}: ${JSON.stringify(value)}`);
});
console.log('\n📤 NORMALIZED DATABASE ROW:');
console.log('-'.repeat(80));
// Normalize the product
const normalized = (0, dutchie_graphql_1.normalizeDutchieProduct)(sampleProduct);
// Show the normalized result (excluding raw_data for readability)
const { raw_data, cannabinoids, special_data, ...displayFields } = normalized;
Object.entries(displayFields).forEach(([key, value]) => {
if (value !== undefined && value !== null) {
console.log(` ${key}: ${JSON.stringify(value)}`);
}
});
console.log('\n🔗 FIELD MAPPING:');
console.log('-'.repeat(80));
const fieldMappings = [
['_id / id', 'external_id', sampleProduct._id, normalized.external_id],
['Name', 'name', sampleProduct.Name, normalized.name],
['cName', 'slug', sampleProduct.cName, normalized.slug],
['brandName', 'brand', sampleProduct.brandName, normalized.brand],
['brand.id', 'brand_external_id', sampleProduct.brand?.id, normalized.brand_external_id],
['subcategory', 'subcategory', sampleProduct.subcategory, normalized.subcategory],
['strainType', 'strain_type', sampleProduct.strainType, normalized.strain_type],
['recPrices[0]', 'rec_price', sampleProduct.recPrices?.[0], normalized.rec_price],
['recSpecialPrices[0]', 'rec_special_price', sampleProduct.recSpecialPrices?.[0], normalized.rec_special_price],
['special', 'is_on_special', sampleProduct.special, normalized.is_on_special],
['specialData...specialName', 'special_name', sampleProduct.specialData?.saleSpecials?.[0]?.specialName?.substring(0, 40) + '...', normalized.special_name?.substring(0, 40) + '...'],
['THCContent.range[0]', 'thc_percentage', sampleProduct.THCContent?.range?.[0], normalized.thc_percentage],
['CBDContent.range[0]', 'cbd_percentage', sampleProduct.CBDContent?.range?.[0], normalized.cbd_percentage],
['Status', 'status', sampleProduct.Status, normalized.status],
['Image', 'image_url', sampleProduct.Image?.substring(0, 50) + '...', normalized.image_url?.substring(0, 50) + '...'],
['POSMetaData.canonicalSKU', 'sku', sampleProduct.POSMetaData?.canonicalSKU, normalized.sku],
];
console.log(' GraphQL Field → DB Column | Value');
console.log(' ' + '-'.repeat(75));
fieldMappings.forEach(([gqlField, dbCol, gqlVal, dbVal]) => {
const gqlStr = String(gqlField).padEnd(30);
const dbStr = String(dbCol).padEnd(20);
console.log(` ${gqlStr}${dbStr} | ${JSON.stringify(dbVal)}`);
});
console.log('\n📊 SQL INSERT STATEMENT:');
console.log('-'.repeat(80));
// Generate example SQL
const sqlExample = `
INSERT INTO products (
store_id, external_id, slug, name,
brand, brand_external_id,
subcategory, strain_type,
rec_price, rec_special_price,
is_on_special, special_name, discount_percent,
thc_percentage, cbd_percentage,
status, image_url, sku
) VALUES (
1, -- store_id (Deeply Rooted)
'${normalized.external_id}', -- external_id
'${normalized.slug}', -- slug
'${normalized.name}', -- name
'${normalized.brand}', -- brand
'${normalized.brand_external_id}', -- brand_external_id
'${normalized.subcategory}', -- subcategory
'${normalized.strain_type}', -- strain_type
${normalized.rec_price}, -- rec_price
${normalized.rec_special_price}, -- rec_special_price
${normalized.is_on_special}, -- is_on_special
'${normalized.special_name?.substring(0, 50)}...', -- special_name
${normalized.discount_percent || 'NULL'}, -- discount_percent
${normalized.thc_percentage}, -- thc_percentage
${normalized.cbd_percentage}, -- cbd_percentage
'${normalized.status}', -- status
'${normalized.image_url}', -- image_url
'${normalized.sku}' -- sku
)
ON CONFLICT (store_id, slug) DO UPDATE SET ...;
`;
console.log(sqlExample);
console.log('\n✅ SUMMARY:');
console.log('-'.repeat(80));
console.log(` Product: ${normalized.name}`);
console.log(` Brand: ${normalized.brand}`);
console.log(` Category: ${normalized.subcategory}`);
console.log(`  Price: $${normalized.rec_price} → $${normalized.rec_special_price} (${normalized.discount_percent}% off)`);
console.log(` THC: ${normalized.thc_percentage}%`);
console.log(` Status: ${normalized.status}`);
console.log(` On Special: ${normalized.is_on_special}`);
console.log(` SKU: ${normalized.sku}`);
console.log('\n🎯 DERIVED VIEWS (computed from products table):');
console.log('-'.repeat(80));
console.log(' - current_specials: Products where is_on_special = true');
console.log(' - derived_brands: Aggregated by brand name with counts/prices');
console.log(' - derived_categories: Aggregated by subcategory');
console.log('\nAll views are computed from the single products table - no separate tables needed!');
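// Illustrative only: one way a derived view like derived_brands could be expressed over the products
// table (column names taken from the queries in run-dutchie-graphql-scrape.js; the real definitions
// live in the migrations):
//
//   CREATE VIEW derived_brands AS
//   SELECT store_id,
//          brand            AS brand_name,
//          COUNT(*)         AS product_count,
//          MIN(rec_price)   AS min_price,
//          MAX(rec_price)   AS max_price
//   FROM products
//   WHERE brand IS NOT NULL
//   GROUP BY store_id, brand;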

View File

@@ -0,0 +1,179 @@
"use strict";
/**
* Test script to validate Dutchie GraphQL API access and capture response structure
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
// @ts-ignore - node-fetch type declaration not installed
const node_fetch_1 = __importDefault(require("node-fetch"));
const GRAPHQL_HASHES = {
ConsumerDispensaries: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
MenuFiltersV2: '2f0b3233b8a2426b391649ca3f0f7a5d43b9aefd683f6286d7261a2517e3568e',
FilteredSpecials: '0dfb85a4fc138c55a076d4d11bf6d1a25f7cbd511428e1cf5a5b863b3eb23f25',
};
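// These are sha256 hashes of Apollo "persisted queries": instead of sending the full GraphQL document,
// the client sends operationName + variables plus an extensions blob such as
//   {"persistedQuery":{"version":1,"sha256Hash":"<hash>"}}
// as URL query parameters (see fetchProducts below). The hash values are opaque and appear tied to the
// deployed Dutchie frontend, so they may change over time.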
async function fetchProducts(dispensaryId, page = 0, perPage = 25) {
const session = 'crawlsy-session-' + Date.now();
const variables = {
includeEnterpriseSpecials: false,
productsFilter: {
dispensaryId,
pricingType: 'rec',
Status: null, // null to include all (in-stock and out-of-stock)
types: [],
useCache: true,
isDefaultSort: true,
sortBy: 'popularSortIdx',
sortDirection: 1,
bypassOnlineThresholds: true,
isKioskMenu: false,
removeProductsBelowOptionThresholds: false
},
page,
perPage
};
const qs = new URLSearchParams({
operationName: 'FilteredProducts',
variables: JSON.stringify(variables),
extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.FilteredProducts } })
});
const res = await (0, node_fetch_1.default)(`https://dutchie.com/api-3/graphql?${qs.toString()}`, {
headers: {
'x-dutchie-session': session,
'apollographql-client-name': 'Marketplace (production)',
'content-type': 'application/json',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
});
if (!res.ok) {
const text = await res.text();
console.error('HTTP Status:', res.status);
console.error('Response:', text.substring(0, 500));
throw new Error(`HTTP ${res.status}: ${text.substring(0, 200)}`);
}
return res.json();
}
async function resolveDispensaryId(cName) {
const session = 'crawlsy-session-' + Date.now();
const variables = { input: { dispensaryId: cName } };
const qs = new URLSearchParams({
operationName: 'GetAddressBasedDispensaryData',
variables: JSON.stringify(variables),
extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.GetAddressBasedDispensaryData } })
});
const res = await (0, node_fetch_1.default)(`https://dutchie.com/graphql?${qs.toString()}`, {
headers: {
'x-dutchie-session': session,
'apollographql-client-name': 'Marketplace (production)',
'content-type': 'application/json',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
});
if (!res.ok) {
console.error('Failed to resolve dispensary ID:', res.status);
return null;
}
const data = await res.json();
return data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId || null;
}
function enumerateFields(obj, prefix = '') {
const fields = [];
for (const [key, value] of Object.entries(obj)) {
const path = prefix ? `${prefix}.${key}` : key;
if (value === null) {
fields.push(`${path}: null`);
}
else if (Array.isArray(value)) {
fields.push(`${path}: Array[${value.length}]`);
if (value.length > 0 && typeof value[0] === 'object') {
const subFields = enumerateFields(value[0], `${path}[0]`);
fields.push(...subFields);
}
}
else if (typeof value === 'object') {
fields.push(`${path}: Object`);
const subFields = enumerateFields(value, path);
fields.push(...subFields);
}
else {
const typeStr = typeof value;
const preview = String(value).substring(0, 50);
fields.push(`${path}: ${typeStr} = "${preview}"`);
}
}
return fields;
}
async function main() {
console.log('='.repeat(80));
console.log('DUTCHIE GRAPHQL API TEST');
console.log('='.repeat(80));
const cName = 'AZ-Deeply-Rooted';
// Step 1: Resolve dispensary ID
console.log(`\n1. Resolving dispensary ID for "${cName}"...`);
const dispensaryId = await resolveDispensaryId(cName);
const finalDispensaryId = dispensaryId || '6405ef617056e8014d79101b'; // Fallback to known ID
if (!dispensaryId) {
console.log(' Failed to resolve via API, using hardcoded ID: 6405ef617056e8014d79101b');
}
console.log(` Final ID: ${finalDispensaryId}`);
// Step 2: Fetch first page of products
console.log('\n2. Fetching products (page 0, perPage 5)...');
const result = await fetchProducts(finalDispensaryId, 0, 5);
if (result.errors) {
console.error('\nGraphQL Errors:');
console.error(JSON.stringify(result.errors, null, 2));
return;
}
const products = result?.data?.filteredProducts?.products || [];
console.log(` Found ${products.length} products in this page`);
if (products.length === 0) {
console.log('No products returned. Full response:');
console.log(JSON.stringify(result, null, 2));
return;
}
// Step 3: Enumerate all fields from first product
console.log('\n3. PRODUCT FIELD STRUCTURE (from first product):');
console.log('-'.repeat(80));
const product = products[0];
const fields = enumerateFields(product);
fields.forEach(f => console.log(` ${f}`));
// Step 4: Show full sample product JSON
console.log('\n4. FULL SAMPLE PRODUCT JSON:');
console.log('-'.repeat(80));
console.log(JSON.stringify(product, null, 2));
// Step 5: Summary of key fields for schema design
console.log('\n5. KEY FIELDS FOR SCHEMA DESIGN:');
console.log('-'.repeat(80));
const keyFields = [
{ field: 'id', value: product.id },
{ field: 'name', value: product.name },
{ field: 'slug', value: product.slug },
{ field: 'brand', value: product.brand },
{ field: 'brandId', value: product.brandId },
{ field: 'type', value: product.type },
{ field: 'category', value: product.category },
{ field: 'subcategory', value: product.subcategory },
{ field: 'strainType', value: product.strainType },
{ field: 'THCContent', value: product.THCContent },
{ field: 'CBDContent', value: product.CBDContent },
{ field: 'description', value: product.description?.substring(0, 100) + '...' },
{ field: 'image', value: product.image },
{ field: 'options.length', value: product.options?.length },
{ field: 'pricing', value: product.pricing },
{ field: 'terpenes.length', value: product.terpenes?.length },
{ field: 'effects.length', value: product.effects?.length },
];
keyFields.forEach(({ field, value }) => {
console.log(` ${field}: ${JSON.stringify(value)}`);
});
// Step 6: Show an option (variant) if available
if (product.options && product.options.length > 0) {
console.log('\n6. SAMPLE OPTION/VARIANT:');
console.log('-'.repeat(80));
console.log(JSON.stringify(product.options[0], null, 2));
}
}
main().catch(console.error);

View File

@@ -0,0 +1,84 @@
"use strict";
/**
* Test different Status filter values in Dutchie GraphQL
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
async function main() {
const browser = await puppeteer_extra_1.default.launch({
headless: 'new',
args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
console.log('Loading menu...');
await page.goto('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted', {
waitUntil: 'networkidle2',
timeout: 60000,
});
await new Promise((r) => setTimeout(r, 3000));
const dispensaryId = await page.evaluate(() => window.reactEnv?.dispensaryId);
console.log('Dispensary ID:', dispensaryId);
// Test different status values
const testCases = [
{ label: 'Active', status: 'Active', includeStatus: true },
{ label: 'Inactive', status: 'Inactive', includeStatus: true },
{ label: 'null', status: null, includeStatus: true },
{ label: 'omitted', status: null, includeStatus: false },
];
for (const testCase of testCases) {
const result = await page.evaluate(async (dispId, hash, status, includeStatus) => {
const filter = {
dispensaryId: dispId,
pricingType: 'rec',
types: [],
useCache: false,
isDefaultSort: true,
sortBy: 'popularSortIdx',
sortDirection: 1,
bypassOnlineThresholds: true,
isKioskMenu: false,
removeProductsBelowOptionThresholds: false,
};
if (includeStatus) {
filter.Status = status;
}
const variables = {
includeEnterpriseSpecials: false,
productsFilter: filter,
page: 0,
perPage: 100,
};
const qs = new URLSearchParams({
operationName: 'FilteredProducts',
variables: JSON.stringify(variables),
extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
});
const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
method: 'GET',
headers: {
'content-type': 'application/json',
'apollographql-client-name': 'Marketplace (production)',
},
credentials: 'include',
});
const json = await resp.json();
const products = json?.data?.filteredProducts?.products || [];
return {
count: products.length,
totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount,
sampleStatus: products[0]?.Status,
statuses: [...new Set(products.map((p) => p.Status))],
};
}, dispensaryId, GRAPHQL_HASH, testCase.status, testCase.includeStatus);
console.log(`Status ${testCase.label}: Products=${result.count}, Total=${result.totalCount}, Statuses=${JSON.stringify(result.statuses)}`);
}
await browser.close();
}
main().catch(console.error);

201
backend/dist/services/availability.js vendored Normal file
View File

@@ -0,0 +1,201 @@
"use strict";
/**
* Availability Service
*
* Normalizes product availability from various menu providers and tracks
* state transitions for inventory analytics.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeAvailability = normalizeAvailability;
exports.extractAvailabilityHints = extractAvailabilityHints;
exports.hintsToAvailability = hintsToAvailability;
exports.aggregateAvailability = aggregateAvailability;
// Threshold for considering stock as "limited"
const LIMITED_THRESHOLD = 5;
/**
* Normalize availability from a Dutchie product
*
* Dutchie products can have various availability indicators:
* - potencyAmount.quantity: explicit stock count
* - status: sometimes includes stock status
* - variants[].quantity: stock per variant
* - isInStock / inStock: boolean flags
*/
function normalizeAvailability(dutchieProduct) {
const raw = {};
// Collect raw availability data for debugging
if (dutchieProduct.potencyAmount?.quantity !== undefined) {
raw.potencyQuantity = dutchieProduct.potencyAmount.quantity;
}
if (dutchieProduct.status !== undefined) {
raw.status = dutchieProduct.status;
}
if (dutchieProduct.isInStock !== undefined) {
raw.isInStock = dutchieProduct.isInStock;
}
if (dutchieProduct.inStock !== undefined) {
raw.inStock = dutchieProduct.inStock;
}
if (dutchieProduct.variants?.length) {
const variantQuantities = dutchieProduct.variants
.filter((v) => v.quantity !== undefined)
.map((v) => ({ option: v.option, quantity: v.quantity }));
if (variantQuantities.length) {
raw.variantQuantities = variantQuantities;
}
}
// Try to extract quantity
let quantity = null;
// Check potencyAmount.quantity first (most reliable for Dutchie)
if (typeof dutchieProduct.potencyAmount?.quantity === 'number') {
quantity = dutchieProduct.potencyAmount.quantity;
}
// Sum variant quantities if available
else if (dutchieProduct.variants?.length) {
const totalVariantQty = dutchieProduct.variants.reduce((sum, v) => {
return sum + (typeof v.quantity === 'number' ? v.quantity : 0);
}, 0);
if (totalVariantQty > 0) {
quantity = totalVariantQty;
}
}
// Determine status
let status = 'unknown';
// Explicit boolean flags take precedence
if (dutchieProduct.isInStock === false || dutchieProduct.inStock === false) {
status = 'out_of_stock';
}
else if (dutchieProduct.isInStock === true || dutchieProduct.inStock === true) {
status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
}
// Check status string
else if (typeof dutchieProduct.status === 'string') {
const statusLower = dutchieProduct.status.toLowerCase();
if (statusLower.includes('out') || statusLower.includes('unavailable')) {
status = 'out_of_stock';
}
else if (statusLower.includes('limited') || statusLower.includes('low')) {
status = 'limited';
}
else if (statusLower.includes('in') || statusLower.includes('available')) {
status = 'in_stock';
}
}
// Infer from quantity
else if (quantity !== null) {
if (quantity === 0) {
status = 'out_of_stock';
}
else if (quantity <= LIMITED_THRESHOLD) {
status = 'limited';
}
else {
status = 'in_stock';
}
}
return { status, quantity, raw };
}
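// Illustrative only (not part of the original commit): a worked example of the
// normalization above on a hand-written product shape. The field names mirror
// the indicators listed in the docstring; the values are made up.
function exampleNormalizeAvailability() {
    const sample = {
        isInStock: true,
        variants: [
            { option: '1g', quantity: 2 },
            { option: '3.5g', quantity: 1 },
        ],
    };
    // Returns { status: 'limited', quantity: 3, raw: { isInStock: true, variantQuantities: [...] } }
    // because the variant quantities sum to 3, which is at or below LIMITED_THRESHOLD.
    return normalizeAvailability(sample);
}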
/**
* Extract availability hints from page content or product card HTML
*
* Used for sandbox provider scraping where we don't have structured data
*/
function extractAvailabilityHints(pageContent, productElement) {
const hints = {};
const content = (productElement || pageContent).toLowerCase();
// Check for out-of-stock indicators
const oosPatterns = [
'out of stock',
'out-of-stock',
'sold out',
'soldout',
'unavailable',
'not available',
'coming soon',
'notify me'
];
hints.hasOutOfStockBadge = oosPatterns.some(p => content.includes(p));
// Check for limited stock indicators
const limitedPatterns = [
'limited stock',
'limited quantity',
'low stock',
'only \\d+ left',
'few remaining',
'almost gone',
'selling fast'
];
hints.hasLimitedBadge = limitedPatterns.some(p => {
if (p.includes('\\d')) {
return new RegExp(p, 'i').test(content);
}
return content.includes(p);
});
// Check for in-stock indicators
const inStockPatterns = [
'in stock',
'in-stock',
'add to cart',
'add to bag',
'buy now',
'available'
];
hints.hasInStockBadge = inStockPatterns.some(p => content.includes(p));
// Try to extract quantity text
const qtyMatch = content.match(/(\d+)\s*(left|remaining|in stock|available)/i);
if (qtyMatch) {
hints.quantityText = qtyMatch[0];
}
// Look for explicit stock text
const stockTextMatch = content.match(/(out of stock|in stock|low stock|limited|sold out)[^<]*/i);
if (stockTextMatch) {
hints.stockText = stockTextMatch[0].trim();
}
return hints;
}
/**
* Convert availability hints to normalized availability
*/
function hintsToAvailability(hints) {
let status = 'unknown';
let quantity = null;
// Extract quantity if present
if (hints.quantityText) {
const match = hints.quantityText.match(/(\d+)/);
if (match) {
quantity = parseInt(match[1], 10);
}
}
// Determine status from hints
if (hints.hasOutOfStockBadge) {
status = 'out_of_stock';
}
else if (hints.hasLimitedBadge) {
status = 'limited';
}
else if (hints.hasInStockBadge) {
status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
}
return {
status,
quantity,
raw: hints
};
}
function aggregateAvailability(products) {
const counts = {
in_stock: 0,
out_of_stock: 0,
limited: 0,
unknown: 0,
changed: 0
};
for (const product of products) {
const status = product.availability_status || 'unknown';
counts[status]++;
if (product.previous_status && product.previous_status !== status) {
counts.changed++;
}
}
return counts;
}
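// Illustrative only (not part of the original commit): the hint-based path used
// for sandbox providers, end to end. The HTML snippet is made up.
function exampleHintBasedAvailability() {
    const html = '<div class="card">Only 3 left - Add to cart</div>';
    const hints = extractAvailabilityHints(html);
    // hints.hasLimitedBadge === true ("only 3 left"), hints.hasInStockBadge === true ("add to cart"),
    // hints.quantityText === '3 left'
    const availability = hintsToAvailability(hints);
    // availability => { status: 'limited', quantity: 3, raw: hints }
    // Aggregated across this (single-product) crawl:
    // => { in_stock: 0, out_of_stock: 0, limited: 1, unknown: 0, changed: 0 }
    return aggregateAvailability([{ availability_status: availability.status }]);
}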

File diff suppressed because it is too large

View File

@@ -4,9 +4,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.discoverCategories = discoverCategories;
const puppeteer_1 = __importDefault(require("puppeteer"));
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const migrate_1 = require("../db/migrate");
const logger_1 = require("./logger");
const age_gate_1 = require("../utils/age-gate");
const dutchie_1 = require("../scrapers/templates/dutchie");
// Apply stealth plugin
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
const DUTCHIE_CATEGORIES = [
{ name: 'Shop', slug: 'shop' },
{ name: 'Flower', slug: 'flower', parentSlug: 'shop' },
@@ -19,6 +24,18 @@ const DUTCHIE_CATEGORIES = [
{ name: 'Brands', slug: 'brands' },
{ name: 'Specials', slug: 'specials' }
];
const CURALEAF_CATEGORIES = [
{ name: 'Shop', slug: 'shop' },
{ name: 'Flower', slug: 'flower', parentSlug: 'shop' },
{ name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
{ name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
{ name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
{ name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
{ name: 'Tinctures', slug: 'tinctures', parentSlug: 'shop' },
{ name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
{ name: 'Capsules', slug: 'capsules', parentSlug: 'shop' },
{ name: 'Accessories', slug: 'accessories', parentSlug: 'shop' }
];
async function makePageStealthy(page) {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => false });
@@ -72,7 +89,7 @@ async function discoverCategories(storeId) {
const store = storeResult.rows[0];
const baseUrl = store.dutchie_url;
// Launch browser to check page source
browser = await puppeteer_1.default.launch({
browser = await puppeteer_extra_1.default.launch({
headless: 'new',
args: [
'--no-sandbox',
@@ -85,9 +102,14 @@ async function discoverCategories(storeId) {
await makePageStealthy(page);
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
// Set age gate bypass cookies BEFORE navigation (common across cannabis menu sites)
const state = (0, age_gate_1.detectStateFromUrl)(baseUrl);
await (0, age_gate_1.setAgeGateCookies)(page, baseUrl, state);
logger_1.logger.info('categories', `Loading page to detect menu type: ${baseUrl}`);
await page.goto(baseUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
await page.waitForTimeout(3000);
// If age gate still appears, try to bypass it
await (0, age_gate_1.bypassAgeGate)(page, state);
// Detect if it's a Dutchie menu by inspecting page source
const isDutchie = await isDutchieMenu(page);
await browser.close();
@@ -97,8 +119,9 @@ async function discoverCategories(storeId) {
await createDutchieCategories(storeId, store);
}
else {
logger_1.logger.info('categories', `⚠️ Non-Dutchie menu detected, would need custom scraping logic`);
throw new Error('Non-Dutchie menus not yet supported. Please contact support.');
// Fallback: Use standard cannabis categories for non-Dutchie sites
logger_1.logger.info('categories', `Non-Dutchie menu detected, using standard cannabis categories for ${store.name}`);
await createCuraleafCategories(storeId, store);
}
}
catch (error) {
@@ -116,24 +139,24 @@ async function createDutchieCategories(storeId, store) {
const baseUrl = store.dutchie_url;
for (const category of DUTCHIE_CATEGORIES) {
let categoryUrl;
// Use Dutchie template to build correct category URLs
if (category.parentSlug) {
// Subcategory: /embedded-menu/{slug}/shop/flower
categoryUrl = `${baseUrl}/${category.parentSlug}/${category.slug}`;
// Subcategory: Use template's buildCategoryUrl (e.g., /products/flower)
categoryUrl = dutchie_1.dutchieTemplate.buildCategoryUrl(baseUrl, category.name);
}
else {
// Top-level: /embedded-menu/{slug}/shop
// Top-level: Use base URL with slug
categoryUrl = `${baseUrl}/${category.slug}`;
}
const path = category.parentSlug ? `${category.parentSlug}/${category.slug}` : category.slug;
if (!category.parentSlug) {
// Create parent category
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
VALUES ($1, $2, $3, $4, $5, true, NULL)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4, path = $5
INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
VALUES ($1, $2, $3, $4, true)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4
RETURNING id
`, [storeId, category.name, category.slug, categoryUrl, path]);
`, [storeId, category.name, category.slug, categoryUrl]);
logger_1.logger.info('categories', `📁 ${category.name}`);
}
else {
@@ -143,13 +166,12 @@ async function createDutchieCategories(storeId, store) {
WHERE store_id = $1 AND slug = $2
`, [storeId, category.parentSlug]);
if (parentResult.rows.length > 0) {
const parentId = parentResult.rows[0].id;
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
VALUES ($1, $2, $3, $4, $5, true, $6)
INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
VALUES ($1, $2, $3, $4, true)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6
`, [storeId, category.name, category.slug, categoryUrl, path, parentId]);
DO UPDATE SET name = $2, dutchie_url = $4
`, [storeId, category.name, category.slug, categoryUrl]);
logger_1.logger.info('categories', ` └── ${category.name}`);
}
}
@@ -166,3 +188,59 @@ async function createDutchieCategories(storeId, store) {
client.release();
}
}
async function createCuraleafCategories(storeId, store) {
const client = await migrate_1.pool.connect();
try {
await client.query('BEGIN');
logger_1.logger.info('categories', `Creating predefined Curaleaf category structure`);
const baseUrl = store.dutchie_url;
for (const category of CURALEAF_CATEGORIES) {
let categoryUrl;
if (category.parentSlug) {
// Subcategory URL - append the category as a query parameter on the base menu URL
categoryUrl = `${baseUrl}?category=${category.slug}`;
}
else {
// Top-level category
categoryUrl = baseUrl;
}
if (!category.parentSlug) {
// Create parent category
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
VALUES ($1, $2, $3, $4, true)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4
RETURNING id
`, [storeId, category.name, category.slug, categoryUrl]);
logger_1.logger.info('categories', `📁 ${category.name}`);
}
else {
// Create subcategory
const parentResult = await client.query(`
SELECT id FROM categories
WHERE store_id = $1 AND slug = $2
`, [storeId, category.parentSlug]);
if (parentResult.rows.length > 0) {
await client.query(`
INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
VALUES ($1, $2, $3, $4, true)
ON CONFLICT (store_id, slug)
DO UPDATE SET name = $2, dutchie_url = $4
`, [storeId, category.name, category.slug, categoryUrl]);
logger_1.logger.info('categories', ` └── ${category.name}`);
}
}
}
await client.query('COMMIT');
logger_1.logger.info('categories', `✅ Created ${CURALEAF_CATEGORIES.length} Curaleaf categories successfully`);
}
catch (error) {
await client.query('ROLLBACK');
logger_1.logger.error('categories', `Failed to create Curaleaf categories: ${error}`);
throw error;
}
finally {
client.release();
}
}

536
backend/dist/services/crawl-scheduler.js vendored Normal file
View File

@@ -0,0 +1,536 @@
"use strict";
/**
* Crawl Scheduler Service
*
* This service manages crawl scheduling using a job queue approach.
* It does NOT modify the crawler - it only TRIGGERS the existing crawler.
*
* Features:
* - Global schedule: crawl all stores every N hours
* - Daily special run: 12:01 AM local store time
* - Per-store schedule overrides
* - Job queue for tracking pending/running crawls
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getGlobalSchedule = getGlobalSchedule;
exports.updateGlobalSchedule = updateGlobalSchedule;
exports.getStoreScheduleStatuses = getStoreScheduleStatuses;
exports.getStoreSchedule = getStoreSchedule;
exports.updateStoreSchedule = updateStoreSchedule;
exports.createCrawlJob = createCrawlJob;
exports.getPendingJobs = getPendingJobs;
exports.claimJob = claimJob;
exports.completeJob = completeJob;
exports.getRecentJobs = getRecentJobs;
exports.getAllRecentJobs = getAllRecentJobs;
exports.checkAndCreateScheduledJobs = checkAndCreateScheduledJobs;
exports.checkAndCreateDailySpecialJobs = checkAndCreateDailySpecialJobs;
exports.processJobs = processJobs;
exports.processOrchestrator = processOrchestrator;
exports.setSchedulerMode = setSchedulerMode;
exports.getSchedulerMode = getSchedulerMode;
exports.startCrawlScheduler = startCrawlScheduler;
exports.stopCrawlScheduler = stopCrawlScheduler;
exports.restartCrawlScheduler = restartCrawlScheduler;
exports.triggerManualCrawl = triggerManualCrawl;
exports.triggerAllStoresCrawl = triggerAllStoresCrawl;
exports.cancelJob = cancelJob;
const node_cron_1 = __importDefault(require("node-cron"));
const migrate_1 = require("../db/migrate");
const scraper_v2_1 = require("../scraper-v2");
const store_crawl_orchestrator_1 = require("./store-crawl-orchestrator");
// Worker identification
const WORKER_ID = `worker-${process.pid}-${Date.now()}`;
let schedulerCronJob = null;
let jobProcessorRunning = false;
let orchestratorProcessorRunning = false;
// Scheduler mode: 'legacy' uses job queue, 'orchestrator' uses intelligent orchestration
let schedulerMode = 'orchestrator';
// ============================================
// Schedule Management
// ============================================
/**
* Get global schedule settings
*/
async function getGlobalSchedule() {
const result = await migrate_1.pool.query(`
SELECT * FROM crawler_schedule ORDER BY id
`);
return result.rows;
}
/**
* Update global schedule setting
*/
async function updateGlobalSchedule(scheduleType, updates) {
const setClauses = [];
const values = [];
let paramIndex = 1;
if (updates.enabled !== undefined) {
setClauses.push(`enabled = $${paramIndex++}`);
values.push(updates.enabled);
}
if (updates.interval_hours !== undefined) {
setClauses.push(`interval_hours = $${paramIndex++}`);
values.push(updates.interval_hours);
}
if (updates.run_time !== undefined) {
setClauses.push(`run_time = $${paramIndex++}`);
values.push(updates.run_time);
}
values.push(scheduleType);
const result = await migrate_1.pool.query(`
UPDATE crawler_schedule
SET ${setClauses.join(', ')}
WHERE schedule_type = $${paramIndex}
RETURNING *
`, values);
return result.rows[0];
}
/**
* Get all store schedule statuses
*/
async function getStoreScheduleStatuses() {
const result = await migrate_1.pool.query(`SELECT * FROM crawl_schedule_status ORDER BY priority DESC, store_name`);
return result.rows;
}
/**
* Get or create per-store schedule override
*/
async function getStoreSchedule(storeId) {
const result = await migrate_1.pool.query(`
SELECT * FROM store_crawl_schedule WHERE store_id = $1
`, [storeId]);
if (result.rows.length > 0) {
return result.rows[0];
}
// Return default (use global)
return {
store_id: storeId,
enabled: true,
interval_hours: null,
daily_special_enabled: true,
daily_special_time: null,
priority: 0
};
}
/**
* Update per-store schedule override
*/
async function updateStoreSchedule(storeId, updates) {
const result = await migrate_1.pool.query(`
INSERT INTO store_crawl_schedule (store_id, enabled, interval_hours, daily_special_enabled, daily_special_time, priority)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (store_id) DO UPDATE SET
enabled = COALESCE(EXCLUDED.enabled, store_crawl_schedule.enabled),
interval_hours = EXCLUDED.interval_hours,
daily_special_enabled = COALESCE(EXCLUDED.daily_special_enabled, store_crawl_schedule.daily_special_enabled),
daily_special_time = EXCLUDED.daily_special_time,
priority = COALESCE(EXCLUDED.priority, store_crawl_schedule.priority),
updated_at = NOW()
RETURNING *
`, [
storeId,
updates.enabled ?? true,
updates.interval_hours ?? null,
updates.daily_special_enabled ?? true,
updates.daily_special_time ?? null,
updates.priority ?? 0
]);
return result.rows[0];
}
// ============================================
// Job Queue Management
// ============================================
/**
* Create a new crawl job
*/
async function createCrawlJob(storeId, jobType = 'full_crawl', triggerType = 'scheduled', scheduledAt = new Date(), priority = 0) {
// Check if there's already a pending or running job for this store
const existing = await migrate_1.pool.query(`
SELECT id FROM crawl_jobs
WHERE store_id = $1 AND status IN ('pending', 'running')
LIMIT 1
`, [storeId]);
if (existing.rows.length > 0) {
console.log(`Skipping job creation for store ${storeId} - already has pending/running job`);
return existing.rows[0];
}
const result = await migrate_1.pool.query(`
INSERT INTO crawl_jobs (store_id, job_type, trigger_type, scheduled_at, priority, status)
VALUES ($1, $2, $3, $4, $5, 'pending')
RETURNING *
`, [storeId, jobType, triggerType, scheduledAt, priority]);
console.log(`Created crawl job ${result.rows[0].id} for store ${storeId} (${triggerType})`);
return result.rows[0];
}
/**
* Get pending jobs ready to run
*/
async function getPendingJobs(limit = 5) {
const result = await migrate_1.pool.query(`
SELECT cj.*, s.name as store_name
FROM crawl_jobs cj
JOIN stores s ON s.id = cj.store_id
WHERE cj.status = 'pending'
AND cj.scheduled_at <= NOW()
ORDER BY cj.priority DESC, cj.scheduled_at ASC
LIMIT $1
`, [limit]);
return result.rows;
}
/**
* Claim a job for processing
*/
async function claimJob(jobId) {
const result = await migrate_1.pool.query(`
UPDATE crawl_jobs
SET status = 'running', started_at = NOW(), worker_id = $2
WHERE id = $1 AND status = 'pending'
RETURNING id
`, [jobId, WORKER_ID]);
return result.rows.length > 0;
}
/**
* Complete a job
*/
async function completeJob(jobId, success, results) {
await migrate_1.pool.query(`
UPDATE crawl_jobs
SET
status = $2,
completed_at = NOW(),
products_found = $3,
error_message = $4
WHERE id = $1
`, [
jobId,
success ? 'completed' : 'failed',
results?.products_found ?? null,
results?.error_message ?? null
]);
}
/**
* Get recent jobs for a store
*/
async function getRecentJobs(storeId, limit = 10) {
const result = await migrate_1.pool.query(`
SELECT * FROM crawl_jobs
WHERE store_id = $1
ORDER BY created_at DESC
LIMIT $2
`, [storeId, limit]);
return result.rows;
}
/**
* Get all recent jobs
*/
async function getAllRecentJobs(limit = 50) {
const result = await migrate_1.pool.query(`
SELECT cj.*, s.name as store_name, s.slug as store_slug
FROM crawl_jobs cj
JOIN stores s ON s.id = cj.store_id
ORDER BY cj.created_at DESC
LIMIT $1
`, [limit]);
return result.rows;
}
// ============================================
// Scheduler Logic
// ============================================
/**
* Check which stores are due for a crawl and create jobs
*/
async function checkAndCreateScheduledJobs() {
console.log('Checking for stores due for crawl...');
// Get global schedule settings
const globalSchedule = await migrate_1.pool.query(`
SELECT * FROM crawler_schedule WHERE schedule_type = 'global_interval'
`);
if (globalSchedule.rows.length === 0 || !globalSchedule.rows[0].enabled) {
console.log('Global scheduler is disabled');
return 0;
}
const intervalHours = globalSchedule.rows[0].interval_hours || 4;
// Find stores due for crawl
const result = await migrate_1.pool.query(`
SELECT
s.id,
s.name,
s.timezone,
s.last_scraped_at,
COALESCE(scs.enabled, TRUE) as schedule_enabled,
COALESCE(scs.interval_hours, $1) as interval_hours,
COALESCE(scs.priority, 0) as priority
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.enabled, TRUE) = TRUE
AND (
s.last_scraped_at IS NULL
OR s.last_scraped_at < NOW() - (COALESCE(scs.interval_hours, $1) || ' hours')::INTERVAL
)
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
)
ORDER BY COALESCE(scs.priority, 0) DESC, s.last_scraped_at ASC NULLS FIRST
`, [intervalHours]);
let jobsCreated = 0;
for (const store of result.rows) {
try {
await createCrawlJob(store.id, 'full_crawl', 'scheduled', new Date(), store.priority);
jobsCreated++;
console.log(`Scheduled crawl job for: ${store.name}`);
}
catch (error) {
console.error(`Failed to create job for store ${store.name}:`, error);
}
}
console.log(`Created ${jobsCreated} scheduled crawl jobs`);
return jobsCreated;
}
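// Illustrative only (not part of the original commit): the "due for crawl" rule
// encoded in the SQL above, expressed as a plain helper. A store is due when it
// has never been scraped, or when its last scrape is older than the effective
// interval (per-store override if set, otherwise the global interval).
function exampleIsDueForCrawl(lastScrapedAt, storeIntervalHours, globalIntervalHours) {
    const intervalHours = storeIntervalHours ?? globalIntervalHours;
    if (!lastScrapedAt) {
        return true;
    }
    const ageMs = Date.now() - new Date(lastScrapedAt).getTime();
    return ageMs > intervalHours * 60 * 60 * 1000;
}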
/**
* Check for daily special runs (12:01 AM local time)
*/
async function checkAndCreateDailySpecialJobs() {
console.log('Checking for daily special runs...');
// Get daily special schedule
const dailySchedule = await migrate_1.pool.query(`
SELECT * FROM crawler_schedule WHERE schedule_type = 'daily_special'
`);
if (dailySchedule.rows.length === 0 || !dailySchedule.rows[0].enabled) {
console.log('Daily special scheduler is disabled');
return 0;
}
const targetTime = dailySchedule.rows[0].run_time || '00:01';
// Find stores where it's currently the target time in their local timezone
// and they haven't had a daily special run today
const result = await migrate_1.pool.query(`
SELECT
s.id,
s.name,
s.timezone,
COALESCE(scs.daily_special_enabled, TRUE) as daily_special_enabled,
COALESCE(scs.daily_special_time, $1::TIME) as daily_special_time,
COALESCE(scs.priority, 0) as priority
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.daily_special_enabled, TRUE) = TRUE
-- Check if current time in store timezone matches the target time (within 2 minutes)
AND ABS(
EXTRACT(EPOCH FROM (
(NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::TIME
- COALESCE(scs.daily_special_time, $1::TIME)
))
) < 120 -- within 2 minutes
-- Ensure we haven't already created a daily_special job today for this store
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id
AND cj.trigger_type = 'daily_special'
AND cj.created_at > (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::DATE
)
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
)
ORDER BY COALESCE(scs.priority, 0) DESC
`, [targetTime]);
let jobsCreated = 0;
for (const store of result.rows) {
try {
await createCrawlJob(store.id, 'full_crawl', 'daily_special', new Date(), store.priority + 10);
jobsCreated++;
console.log(`Created daily special job for: ${store.name} (${store.timezone})`);
}
catch (error) {
console.error(`Failed to create daily special job for store ${store.name}:`, error);
}
}
if (jobsCreated > 0) {
console.log(`Created ${jobsCreated} daily special crawl jobs`);
}
return jobsCreated;
}
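// Illustrative only (not part of the original commit): the two-minute window
// check the SQL above performs, expressed in plain JS. Like the SQL, it compares
// wall-clock time in the store's timezone against the target time and does not
// wrap around midnight. The timezone and time values are examples.
function exampleIsWithinDailySpecialWindow(now, timezone, targetTime) {
    const parts = new Intl.DateTimeFormat('en-US', {
        timeZone: timezone || 'America/Phoenix',
        hour: '2-digit',
        minute: '2-digit',
        hour12: false,
    }).formatToParts(now);
    const hour = Number(parts.find(p => p.type === 'hour').value) % 24; // some engines report midnight as "24"
    const minute = Number(parts.find(p => p.type === 'minute').value);
    const [targetHour, targetMinute] = targetTime.split(':').map(Number);
    const diffSeconds = Math.abs((hour * 60 + minute) - (targetHour * 60 + targetMinute)) * 60;
    return diffSeconds < 120; // within 2 minutes, matching the SQL threshold
}
// Example: exampleIsWithinDailySpecialWindow(new Date(), 'America/Phoenix', '00:01')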
/**
* Process pending jobs
*/
async function processJobs() {
if (jobProcessorRunning) {
console.log('Job processor already running, skipping...');
return;
}
jobProcessorRunning = true;
try {
const jobs = await getPendingJobs(1); // Process one at a time for safety
for (const job of jobs) {
console.log(`Processing job ${job.id} for store: ${job.store_name}`);
const claimed = await claimJob(job.id);
if (!claimed) {
console.log(`Job ${job.id} already claimed by another worker`);
continue;
}
try {
// Call the existing scraper - DO NOT MODIFY SCRAPER LOGIC
await (0, scraper_v2_1.scrapeStore)(job.store_id);
// Update store's last_scraped_at
await migrate_1.pool.query(`
UPDATE stores SET last_scraped_at = NOW() WHERE id = $1
`, [job.store_id]);
await completeJob(job.id, true, {});
console.log(`Job ${job.id} completed successfully`);
}
catch (error) {
console.error(`Job ${job.id} failed:`, error);
await completeJob(job.id, false, { error_message: error.message });
}
}
}
finally {
jobProcessorRunning = false;
}
}
/**
* Process stores using the intelligent orchestrator
* This replaces the simple job queue approach with intelligent provider detection
*/
async function processOrchestrator() {
if (orchestratorProcessorRunning) {
console.log('Orchestrator processor already running, skipping...');
return;
}
orchestratorProcessorRunning = true;
try {
// Get stores due for orchestration (respects schedule, intervals, etc.)
const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(3); // Process up to 3 at a time
if (storeIds.length === 0) {
return;
}
console.log(`Orchestrator: Processing ${storeIds.length} stores due for crawl`);
// Process each store through the orchestrator
for (const storeId of storeIds) {
try {
console.log(`Orchestrator: Starting crawl for store ${storeId}`);
const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId);
console.log(`Orchestrator: Store ${storeId} completed - ${result.summary}`);
}
catch (error) {
console.error(`Orchestrator: Store ${storeId} failed - ${error.message}`);
}
}
console.log(`Orchestrator: Finished processing ${storeIds.length} stores`);
}
finally {
orchestratorProcessorRunning = false;
}
}
// ============================================
// Scheduler Control
// ============================================
/**
* Set scheduler mode
*/
function setSchedulerMode(mode) {
schedulerMode = mode;
console.log(`Scheduler mode set to: ${mode}`);
}
/**
* Get current scheduler mode
*/
function getSchedulerMode() {
return schedulerMode;
}
/**
* Start the scheduler (runs every minute to check for due jobs)
*/
async function startCrawlScheduler() {
stopCrawlScheduler();
console.log(`Starting crawl scheduler in ${schedulerMode} mode...`);
// Run every minute
schedulerCronJob = node_cron_1.default.schedule('* * * * *', async () => {
try {
if (schedulerMode === 'orchestrator') {
// Use intelligent orchestrator (handles detection + crawl)
await processOrchestrator();
}
else {
// Legacy mode: job queue approach
// Check for interval-based scheduled jobs
await checkAndCreateScheduledJobs();
// Check for daily special runs
await checkAndCreateDailySpecialJobs();
// Process any pending jobs
await processJobs();
}
}
catch (error) {
console.error('Scheduler tick error:', error);
}
});
console.log(`Crawl scheduler started in ${schedulerMode} mode (checking every minute)`);
}
/**
* Stop the scheduler
*/
function stopCrawlScheduler() {
if (schedulerCronJob) {
schedulerCronJob.stop();
schedulerCronJob = null;
console.log('Crawl scheduler stopped');
}
}
/**
* Restart the scheduler
*/
async function restartCrawlScheduler() {
await startCrawlScheduler();
}
// ============================================
// Manual Triggers
// ============================================
/**
* Manually trigger a crawl for a specific store (creates a job immediately)
*/
async function triggerManualCrawl(storeId) {
console.log(`Manual crawl triggered for store ID: ${storeId}`);
return await createCrawlJob(storeId, 'full_crawl', 'manual', new Date(), 100); // High priority
}
/**
* Manually trigger crawls for all stores
*/
async function triggerAllStoresCrawl() {
console.log('Manual crawl triggered for all stores');
const result = await migrate_1.pool.query(`
SELECT id, name FROM stores
WHERE active = TRUE AND scrape_enabled = TRUE
AND NOT EXISTS (
SELECT 1 FROM crawl_jobs cj
WHERE cj.store_id = stores.id AND cj.status IN ('pending', 'running')
)
`);
let jobsCreated = 0;
for (const store of result.rows) {
await createCrawlJob(store.id, 'full_crawl', 'manual', new Date(), 50);
jobsCreated++;
}
console.log(`Created ${jobsCreated} manual crawl jobs`);
return jobsCreated;
}
/**
* Cancel a pending job
*/
async function cancelJob(jobId) {
const result = await migrate_1.pool.query(`
UPDATE crawl_jobs
SET status = 'cancelled'
WHERE id = $1 AND status = 'pending'
RETURNING id
`, [jobId]);
return result.rows.length > 0;
}
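// Illustrative only (not part of the original commit): typical wiring of the
// exports above. The scheduler is started once at application boot, and the
// manual triggers are what an admin API route would call. The store id is a
// placeholder.
async function exampleSchedulerWiring() {
    await startCrawlScheduler(); // begins the per-minute cron tick in the current mode
    await triggerManualCrawl(42); // queue a high-priority job for a single store (hypothetical id 42)
    await triggerAllStoresCrawl(); // queue jobs for every active, scrape-enabled store
}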

476
backend/dist/services/crawler-jobs.js vendored Normal file
View File

@@ -0,0 +1,476 @@
"use strict";
/**
* Crawler Jobs Service
*
* Handles three types of jobs:
* 1. DetectMenuProviderJob - Detect menu provider for a dispensary
* 2. DutchieMenuCrawlJob - Production Dutchie crawl
* 3. SandboxCrawlJob - Learning/testing crawl for unknown providers
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.runDetectMenuProviderJob = runDetectMenuProviderJob;
exports.runDutchieMenuCrawlJob = runDutchieMenuCrawlJob;
exports.runSandboxCrawlJob = runSandboxCrawlJob;
exports.processSandboxJobs = processSandboxJobs;
const migrate_1 = require("../db/migrate");
const logger_1 = require("./logger");
const menu_provider_detector_1 = require("./menu-provider-detector");
const scraper_v2_1 = require("../scraper-v2");
const puppeteer_1 = __importDefault(require("puppeteer"));
const fs_1 = require("fs");
const path_1 = __importDefault(require("path"));
const availability_1 = require("./availability");
const WORKER_ID = `crawler-${process.pid}-${Date.now()}`;
// ========================================
// Helper Functions
// ========================================
async function getDispensary(dispensaryId) {
const result = await migrate_1.pool.query(`SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence,
crawler_mode, crawler_status, scraper_template
FROM dispensaries WHERE id = $1`, [dispensaryId]);
return result.rows[0] || null;
}
async function updateDispensary(dispensaryId, updates) {
const setClauses = [];
const values = [];
let paramIndex = 1;
for (const [key, value] of Object.entries(updates)) {
setClauses.push(`${key} = $${paramIndex}`);
values.push(value);
paramIndex++;
}
setClauses.push(`updated_at = NOW()`);
values.push(dispensaryId);
await migrate_1.pool.query(`UPDATE dispensaries SET ${setClauses.join(', ')} WHERE id = $${paramIndex}`, values);
}
async function createSandboxEntry(dispensaryId, suspectedProvider, mode, detectionSignals) {
// First, check if there's an existing active sandbox
const existing = await migrate_1.pool.query(`SELECT id FROM crawler_sandboxes
WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')`, [dispensaryId]);
if (existing.rows.length > 0) {
// Update existing
await migrate_1.pool.query(`UPDATE crawler_sandboxes
SET suspected_menu_provider = $2, mode = $3, detection_signals = COALESCE($4, detection_signals), updated_at = NOW()
WHERE id = $1`, [existing.rows[0].id, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : null]);
return existing.rows[0].id;
}
// Create new
const result = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, suspected_menu_provider, mode, detection_signals, status)
VALUES ($1, $2, $3, $4, 'pending')
RETURNING id`, [dispensaryId, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : '{}']);
return result.rows[0].id;
}
async function createSandboxJob(dispensaryId, sandboxId, jobType, priority = 0) {
const result = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
VALUES ($1, $2, $3, 'pending', $4)
RETURNING id`, [dispensaryId, sandboxId, jobType, priority]);
return result.rows[0].id;
}
// Get the store ID linked to a dispensary (so the existing scraper can be reused)
async function getStoreIdForDispensary(dispensaryId) {
// Check if there's a stores entry linked to this dispensary
const result = await migrate_1.pool.query(`SELECT s.id FROM stores s
JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%'
WHERE d.id = $1
LIMIT 1`, [dispensaryId]);
if (result.rows.length > 0) {
return result.rows[0].id;
}
// Try to find by website
const result2 = await migrate_1.pool.query(`SELECT s.id FROM stores s
JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%'
WHERE d.id = $1
LIMIT 1`, [dispensaryId]);
return result2.rows[0]?.id || null;
}
// ========================================
// Job 1: Detect Menu Provider
// ========================================
async function runDetectMenuProviderJob(dispensaryId) {
logger_1.logger.info('crawler-jobs', `Starting menu provider detection for dispensary ${dispensaryId}`);
const dispensary = await getDispensary(dispensaryId);
if (!dispensary) {
return { success: false, message: `Dispensary ${dispensaryId} not found` };
}
// Check for website URL
const websiteUrl = dispensary.website || dispensary.menu_url;
if (!websiteUrl) {
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review',
last_menu_error_at: new Date(),
last_error_message: 'No website URL available for detection',
});
return { success: false, message: 'No website URL available' };
}
try {
// Run detection
const detection = await (0, menu_provider_detector_1.detectMenuProvider)(websiteUrl, {
checkMenuPaths: true,
timeout: 30000,
});
// Update dispensary with results
const updates = {
menu_provider: detection.provider,
menu_provider_confidence: detection.confidence,
provider_detection_data: JSON.stringify({
signals: detection.signals,
urlsTested: detection.urlsTested,
menuEntryPoints: detection.menuEntryPoints,
rawSignals: detection.rawSignals,
detectedAt: new Date().toISOString(),
}),
crawler_status: 'idle',
};
// Decide crawler mode based on provider
if (detection.provider === 'dutchie' && detection.confidence >= 70) {
// Dutchie with high confidence -> production
updates.crawler_mode = 'production';
logger_1.logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as Dutchie (${detection.confidence}%), setting to production`);
}
else {
// Unknown or non-Dutchie -> sandbox
updates.crawler_mode = 'sandbox';
// Create sandbox entry for further analysis
const sandboxId = await createSandboxEntry(dispensaryId, detection.provider, 'detection', {
signals: detection.signals,
rawSignals: detection.rawSignals,
});
// Queue sandbox crawl job
await createSandboxJob(dispensaryId, sandboxId, 'detection');
logger_1.logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as ${detection.provider} (${detection.confidence}%), setting to sandbox`);
}
// Update menu entry points if found
if (detection.menuEntryPoints.length > 0 && !dispensary.menu_url) {
updates.menu_url = detection.menuEntryPoints[0];
}
await updateDispensary(dispensaryId, updates);
return {
success: true,
message: `Detected provider: ${detection.provider} (${detection.confidence}%)`,
data: {
provider: detection.provider,
confidence: detection.confidence,
mode: updates.crawler_mode,
menuEntryPoints: detection.menuEntryPoints,
},
};
}
catch (error) {
logger_1.logger.error('crawler-jobs', `Detection failed for dispensary ${dispensaryId}: ${error.message}`);
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review',
last_menu_error_at: new Date(),
last_error_message: `Detection failed: ${error.message}`,
});
return { success: false, message: error.message };
}
}
// ========================================
// Job 2: Dutchie Menu Crawl (Production)
// ========================================
async function runDutchieMenuCrawlJob(dispensaryId) {
logger_1.logger.info('crawler-jobs', `Starting Dutchie production crawl for dispensary ${dispensaryId}`);
const dispensary = await getDispensary(dispensaryId);
if (!dispensary) {
return { success: false, message: `Dispensary ${dispensaryId} not found` };
}
// Verify it's a Dutchie production dispensary
if (dispensary.menu_provider !== 'dutchie') {
logger_1.logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not Dutchie, skipping production crawl`);
return { success: false, message: 'Not a Dutchie dispensary' };
}
if (dispensary.crawler_mode !== 'production') {
logger_1.logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not in production mode, skipping`);
return { success: false, message: 'Not in production mode' };
}
// Find linked store ID
const storeId = await getStoreIdForDispensary(dispensaryId);
if (!storeId) {
// Need to create a store entry or handle differently
logger_1.logger.warn('crawler-jobs', `No linked store found for dispensary ${dispensaryId}`);
return { success: false, message: 'No linked store found - needs setup' };
}
try {
// Update status to running
await updateDispensary(dispensaryId, { crawler_status: 'running' });
// Run the existing Dutchie scraper
await (0, scraper_v2_1.scrapeStore)(storeId, 3); // 3 parallel workers
// Update success status
await updateDispensary(dispensaryId, {
crawler_status: 'ok',
last_menu_scrape: new Date(),
menu_scrape_status: 'active',
});
logger_1.logger.info('crawler-jobs', `Dutchie crawl completed for dispensary ${dispensaryId}`);
return {
success: true,
message: 'Dutchie crawl completed successfully',
data: { storeId },
};
}
catch (error) {
logger_1.logger.error('crawler-jobs', `Dutchie crawl failed for dispensary ${dispensaryId}: ${error.message}`);
// Check if this might be a provider change
let providerChanged = false;
try {
const browser = await puppeteer_1.default.launch({ headless: true, args: ['--no-sandbox'] });
const page = await browser.newPage();
const url = dispensary.menu_url || dispensary.website;
if (url) {
await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
const changeResult = await (0, menu_provider_detector_1.detectProviderChange)(page, 'dutchie');
providerChanged = changeResult.changed;
if (providerChanged) {
// Provider changed - move to sandbox
await updateDispensary(dispensaryId, {
crawler_mode: 'sandbox',
crawler_status: 'error_needs_review',
last_menu_error_at: new Date(),
last_error_message: `Provider appears to have changed from Dutchie to ${changeResult.newProvider}`,
});
const sandboxId = await createSandboxEntry(dispensaryId, changeResult.newProvider || 'unknown', 'detection', { providerChangeDetected: true, previousProvider: 'dutchie' });
await createSandboxJob(dispensaryId, sandboxId, 'detection');
logger_1.logger.warn('crawler-jobs', `Provider change detected for dispensary ${dispensaryId}: Dutchie -> ${changeResult.newProvider}`);
}
}
await browser.close();
}
catch {
// Ignore detection errors during failure handling
}
if (!providerChanged) {
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review',
last_menu_error_at: new Date(),
last_error_message: error.message,
});
}
return { success: false, message: error.message };
}
}
// ========================================
// Job 3: Sandbox Crawl (Learning Mode)
// ========================================
async function runSandboxCrawlJob(dispensaryId, sandboxId) {
logger_1.logger.info('crawler-jobs', `Starting sandbox crawl for dispensary ${dispensaryId}`);
const dispensary = await getDispensary(dispensaryId);
if (!dispensary) {
return { success: false, message: `Dispensary ${dispensaryId} not found` };
}
// Get or create sandbox entry
let sandbox;
if (sandboxId) {
const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
sandbox = result.rows[0];
}
else {
const result = await migrate_1.pool.query(`SELECT * FROM crawler_sandboxes
WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')
ORDER BY created_at DESC LIMIT 1`, [dispensaryId]);
sandbox = result.rows[0];
if (!sandbox) {
const newSandboxId = await createSandboxEntry(dispensaryId, dispensary.menu_provider, 'template_learning');
const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
sandbox = result.rows[0];
}
}
const websiteUrl = dispensary.menu_url || dispensary.website;
if (!websiteUrl) {
await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = 'No website URL' WHERE id = $1`, [sandbox.id]);
return { success: false, message: 'No website URL available' };
}
let browser = null;
try {
// Update status
await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
await updateDispensary(dispensaryId, { crawler_status: 'running' });
// Launch browser
browser = await puppeteer_1.default.launch({
headless: true,
args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
// URLs to crawl (limited depth for sandbox)
const urlsToVisit = [websiteUrl];
const menuPaths = ['/menu', '/shop', '/products', '/order'];
for (const path of menuPaths) {
const baseUrl = new URL(websiteUrl).origin;
urlsToVisit.push(`${baseUrl}${path}`);
}
const urlsTested = [];
const menuEntryPoints = [];
const capturedHtml = [];
const analysisData = {
provider_signals: {},
selector_candidates: [],
page_structures: [],
};
// Crawl each URL
for (const url of urlsToVisit) {
try {
urlsTested.push(url);
await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
await new Promise(r => setTimeout(r, 2000)); // Wait for dynamic content
// Get page HTML
const html = await page.content();
// Check if this looks like a menu page
const hasMenuContent = await page.evaluate(() => {
const text = document.body.innerText.toLowerCase();
return (text.includes('add to cart') ||
text.includes('thc') ||
text.includes('indica') ||
text.includes('sativa'));
});
if (hasMenuContent) {
menuEntryPoints.push(url);
capturedHtml.push({ url, html });
// Analyze page structure for selector candidates
const structure = await page.evaluate(() => {
const candidates = [];
// Look for product-like containers
const productSelectors = [
'.product', '.product-card', '.menu-item', '.item-card',
'[data-product]', '[data-item]', '.strain', '.listing',
];
for (const selector of productSelectors) {
const els = document.querySelectorAll(selector);
if (els.length > 3) { // Likely a list
candidates.push({
selector,
count: els.length,
type: 'product_container',
});
}
}
// Look for price patterns
const pricePattern = /\$\d+(\.\d{2})?/;
const textNodes = document.body.innerText;
const priceMatches = textNodes.match(/\$\d+(\.\d{2})?/g);
return {
candidates,
priceCount: priceMatches?.length || 0,
hasAddToCart: textNodes.toLowerCase().includes('add to cart'),
};
});
// Extract availability hints from page content
const availabilityHints = (0, availability_1.extractAvailabilityHints)(html);
analysisData.page_structures.push({
url,
...structure,
availabilityHints,
});
}
}
catch (pageError) {
if (!pageError.message.includes('404')) {
logger_1.logger.warn('crawler-jobs', `Sandbox crawl error for ${url}: ${pageError.message}`);
}
}
}
// Save HTML to storage (local for now, S3 later)
let rawHtmlLocation = null;
if (capturedHtml.length > 0) {
const htmlDir = path_1.default.join(process.cwd(), 'sandbox-data', `dispensary-${dispensaryId}`);
await fs_1.promises.mkdir(htmlDir, { recursive: true });
for (const { url, html } of capturedHtml) {
const filename = `${Date.now()}-${url.replace(/[^a-z0-9]/gi, '_')}.html`;
await fs_1.promises.writeFile(path_1.default.join(htmlDir, filename), html);
}
rawHtmlLocation = htmlDir;
}
// Update sandbox with results
await migrate_1.pool.query(`UPDATE crawler_sandboxes SET
status = $1,
urls_tested = $2,
menu_entry_points = $3,
raw_html_location = $4,
analysis_json = $5,
confidence_score = $6,
analyzed_at = NOW(),
updated_at = NOW()
WHERE id = $7`, [
menuEntryPoints.length > 0 ? 'needs_human_review' : 'pending',
JSON.stringify(urlsTested),
JSON.stringify(menuEntryPoints),
rawHtmlLocation,
JSON.stringify(analysisData),
menuEntryPoints.length > 0 ? 50 : 20,
sandbox.id,
]);
// Update dispensary status
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review', // Sandbox results need review
});
logger_1.logger.info('crawler-jobs', `Sandbox crawl completed for dispensary ${dispensaryId}: ${menuEntryPoints.length} menu pages found`);
return {
success: true,
message: `Sandbox crawl completed. Found ${menuEntryPoints.length} menu entry points.`,
data: {
sandboxId: sandbox.id,
urlsTested: urlsTested.length,
menuEntryPoints,
analysisData,
},
};
}
catch (error) {
logger_1.logger.error('crawler-jobs', `Sandbox crawl failed for dispensary ${dispensaryId}: ${error.message}`);
await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]);
await updateDispensary(dispensaryId, {
crawler_status: 'error_needs_review',
last_menu_error_at: new Date(),
last_error_message: `Sandbox crawl failed: ${error.message}`,
});
return { success: false, message: error.message };
}
finally {
if (browser) {
await browser.close();
}
}
}
// ========================================
// Queue Processing Functions
// ========================================
/**
* Process pending sandbox jobs
*/
async function processSandboxJobs(limit = 5) {
// Claim pending jobs
const jobs = await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
SET status = 'running', worker_id = $1, started_at = NOW()
WHERE id IN (
SELECT id FROM sandbox_crawl_jobs
WHERE status = 'pending' AND scheduled_at <= NOW()
ORDER BY priority DESC, scheduled_at ASC
LIMIT $2
FOR UPDATE SKIP LOCKED
)
RETURNING *`, [WORKER_ID, limit]);
for (const job of jobs.rows) {
try {
let result;
if (job.job_type === 'detection') {
result = await runDetectMenuProviderJob(job.dispensary_id);
}
else {
result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
}
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
WHERE id = $4`, [
result.success ? 'completed' : 'failed',
JSON.stringify(result.data || {}),
result.success ? null : result.message,
job.id,
]);
}
catch (error) {
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]);
}
}
}
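// Illustrative only (not part of the original commit): a minimal worker loop
// that drains the sandbox job queue on an interval. The 60s period and batch
// size are placeholders; the real scheduling lives in the crawl scheduler.
function exampleSandboxWorkerLoop() {
    setInterval(() => {
        processSandboxJobs(5).catch((err) => {
            logger_1.logger.error('crawler-jobs', `Sandbox queue tick failed: ${err.message}`);
        });
    }, 60000);
}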

202
backend/dist/services/crawler-logger.js vendored Normal file
View File

@@ -0,0 +1,202 @@
"use strict";
/**
* CrawlerLogger - Structured logging for crawler operations
*
* High-signal, low-noise logging with JSON output for:
* - Job lifecycle (one summary per job)
* - Provider/mode changes
* - Sandbox events
* - Queue failures
*
* NO per-product logging - that's too noisy.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlerLogger = void 0;
class CrawlerLoggerService {
formatLog(payload) {
return JSON.stringify(payload);
}
log(payload) {
const formatted = this.formatLog(payload);
switch (payload.level) {
case 'error':
console.error(`[CRAWLER] ${formatted}`);
break;
case 'warn':
console.warn(`[CRAWLER] ${formatted}`);
break;
case 'debug':
console.debug(`[CRAWLER] ${formatted}`);
break;
default:
console.log(`[CRAWLER] ${formatted}`);
}
}
/**
* Log when a crawl job starts
*/
jobStarted(params) {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'job_started',
job_id: params.job_id,
store_id: params.store_id,
store_name: params.store_name,
job_type: params.job_type,
trigger_type: params.trigger_type,
provider: params.provider,
});
}
/**
* Log when a crawl job completes successfully
*/
jobCompleted(params) {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'job_completed',
job_id: params.job_id,
store_id: params.store_id,
store_name: params.store_name,
duration_ms: params.duration_ms,
products_found: params.products_found,
products_new: params.products_new,
products_updated: params.products_updated,
products_marked_oos: params.products_marked_oos,
provider: params.provider,
});
}
/**
* Log when a crawl job fails
*/
jobFailed(params) {
this.log({
timestamp: new Date().toISOString(),
level: 'error',
event: 'job_failed',
job_id: params.job_id,
store_id: params.store_id,
store_name: params.store_name,
duration_ms: params.duration_ms,
error_message: params.error_message,
error_code: params.error_code,
provider: params.provider,
});
}
/**
* Log when a provider is detected for a dispensary
*/
providerDetected(params) {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'provider_detected',
dispensary_id: params.dispensary_id,
dispensary_name: params.dispensary_name,
detected_provider: params.detected_provider,
confidence: params.confidence,
detection_method: params.detection_method,
menu_url: params.menu_url,
category: params.category,
});
}
/**
* Log when a dispensary's provider changes
*/
providerChanged(params) {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'provider_changed',
dispensary_id: params.dispensary_id,
dispensary_name: params.dispensary_name,
old_provider: params.old_provider,
new_provider: params.new_provider,
old_confidence: params.old_confidence,
new_confidence: params.new_confidence,
category: params.category,
});
}
/**
* Log when a dispensary's crawler mode changes (sandbox -> production, etc.)
*/
modeChanged(params) {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'mode_changed',
dispensary_id: params.dispensary_id,
dispensary_name: params.dispensary_name,
old_mode: params.old_mode,
new_mode: params.new_mode,
reason: params.reason,
category: params.category,
provider: params.provider,
});
}
/**
* Log sandbox crawl events
*/
sandboxEvent(params) {
const level = params.event === 'sandbox_failed' ? 'error' : 'info';
this.log({
timestamp: new Date().toISOString(),
level,
event: params.event,
dispensary_id: params.dispensary_id,
dispensary_name: params.dispensary_name,
template_name: params.template_name,
category: params.category,
quality_score: params.quality_score,
products_extracted: params.products_extracted,
fields_missing: params.fields_missing,
error_message: params.error_message,
provider: params.provider,
});
}
/**
* Log queue processing failures
*/
queueFailure(params) {
this.log({
timestamp: new Date().toISOString(),
level: 'error',
event: 'queue_failure',
queue_type: params.queue_type,
error_message: params.error_message,
affected_items: params.affected_items,
});
}
/**
* Log detection scan summary
*/
detectionScan(params) {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'detection_scan',
total_scanned: params.total_scanned,
detected: params.detected,
failed: params.failed,
skipped: params.skipped,
duration_ms: params.duration_ms,
});
}
/**
* Log intelligence run summary
*/
intelligenceRun(params) {
this.log({
timestamp: new Date().toISOString(),
level: 'info',
event: 'intelligence_run',
run_type: params.run_type,
dispensaries_processed: params.dispensaries_processed,
jobs_queued: params.jobs_queued,
duration_ms: params.duration_ms,
});
}
}
// Export singleton instance
exports.crawlerLogger = new CrawlerLoggerService();
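// Illustrative only (not part of the original commit): how a crawl runner might
// emit the one-summary-per-job events described above. All ids, names, and
// counts are placeholders.
function exampleJobLifecycleLogging() {
    const startedAt = Date.now();
    exports.crawlerLogger.jobStarted({
        job_id: 123,
        store_id: 42,
        store_name: 'Example Dispensary',
        job_type: 'full_crawl',
        trigger_type: 'scheduled',
        provider: 'dutchie',
    });
    exports.crawlerLogger.jobCompleted({
        job_id: 123,
        store_id: 42,
        store_name: 'Example Dispensary',
        duration_ms: Date.now() - startedAt,
        products_found: 250,
        products_new: 10,
        products_updated: 230,
        products_marked_oos: 10,
        provider: 'dutchie',
    });
}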

View File

@@ -0,0 +1,383 @@
"use strict";
/**
* Dispensary Crawl Orchestrator
*
* Orchestrates the complete crawl workflow for a dispensary:
* 1. Load dispensary data
* 2. Check if provider detection is needed
* 3. Run provider detection if needed
* 4. Queue appropriate crawl jobs based on provider/mode
* 5. Update dispensary_crawl_schedule with meaningful status
*
* This works DIRECTLY with dispensaries (not through stores table).
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.runDispensaryOrchestrator = runDispensaryOrchestrator;
exports.runBatchDispensaryOrchestrator = runBatchDispensaryOrchestrator;
exports.getDispensariesDueForOrchestration = getDispensariesDueForOrchestration;
exports.ensureAllDispensariesHaveSchedules = ensureAllDispensariesHaveSchedules;
exports.processDispensaryScheduler = processDispensaryScheduler;
const uuid_1 = require("uuid");
const migrate_1 = require("../db/migrate");
const crawler_logger_1 = require("./crawler-logger");
const intelligence_detector_1 = require("./intelligence-detector");
const category_crawler_jobs_1 = require("./category-crawler-jobs");
// ========================================
// Main Orchestrator Function
// ========================================
/**
* Run the complete crawl orchestration for a dispensary
*
* Behavior:
* 1. Load the dispensary info
* 2. If product_provider is missing or stale (>7 days), run detection
* 3. After detection:
* - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
* - Otherwise: Run sandbox crawl
* 4. Update dispensary_crawl_schedule with status/summary
*/
async function runDispensaryOrchestrator(dispensaryId, scheduleId) {
const startTime = Date.now();
const runId = (0, uuid_1.v4)();
let result = {
status: 'pending',
summary: '',
runId,
dispensaryId,
dispensaryName: '',
detectionRan: false,
crawlRan: false,
durationMs: 0,
};
try {
// Mark schedule as running
await updateScheduleStatus(dispensaryId, 'running', 'Starting orchestrator...', null, runId);
// 1. Load dispensary info
const dispensary = await getDispensaryInfo(dispensaryId);
if (!dispensary) {
throw new Error(`Dispensary ${dispensaryId} not found`);
}
result.dispensaryName = dispensary.name;
// 2. Check if provider detection is needed
const needsDetection = await checkNeedsDetection(dispensary);
if (needsDetection) {
// Run provider detection
const websiteUrl = dispensary.menu_url || dispensary.website;
if (!websiteUrl) {
result.status = 'error';
result.summary = 'No website URL available for detection';
result.error = 'Dispensary has no menu_url or website configured';
await updateScheduleStatus(dispensaryId, 'error', result.summary, result.error, runId);
result.durationMs = Date.now() - startTime;
await createJobRecord(dispensaryId, scheduleId, result);
return result;
}
await updateScheduleStatus(dispensaryId, 'running', 'Running provider detection...', null, runId);
const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
result.detectionRan = true;
result.detectionResult = detectionResult;
// Save detection results to dispensary
await (0, intelligence_detector_1.updateAllCategoryProviders)(dispensaryId, detectionResult);
crawler_logger_1.crawlerLogger.providerDetected({
dispensary_id: dispensaryId,
dispensary_name: dispensary.name,
detected_provider: detectionResult.product.provider,
confidence: detectionResult.product.confidence,
detection_method: 'dispensary_orchestrator',
menu_url: websiteUrl,
category: 'product',
});
// Refresh dispensary info after detection
const updatedDispensary = await getDispensaryInfo(dispensaryId);
if (updatedDispensary) {
Object.assign(dispensary, updatedDispensary);
}
}
// 3. Determine crawl type and run
const provider = dispensary.product_provider;
const mode = dispensary.product_crawler_mode;
if (provider === 'dutchie' && mode === 'production') {
// Production Dutchie crawl
await updateScheduleStatus(dispensaryId, 'running', 'Running Dutchie production crawl...', null, runId);
try {
// Run the category-specific crawl job
const crawlResult = await (0, category_crawler_jobs_1.runCrawlProductsJob)(dispensaryId);
result.crawlRan = true;
result.crawlType = 'production';
if (crawlResult.success) {
result.productsFound = crawlResult.data?.productsFound || 0;
const detectionPart = result.detectionRan ? 'Detection + ' : '';
result.summary = `${detectionPart}Dutchie products crawl completed`;
result.status = 'success';
crawler_logger_1.crawlerLogger.jobCompleted({
job_id: 0,
store_id: 0,
store_name: dispensary.name,
duration_ms: Date.now() - startTime,
products_found: result.productsFound || 0,
products_new: 0,
products_updated: 0,
provider: 'dutchie',
});
}
else {
result.status = 'error';
result.error = crawlResult.message;
result.summary = `Dutchie crawl failed: ${crawlResult.message.slice(0, 100)}`;
}
}
catch (crawlError) {
result.status = 'error';
result.error = crawlError.message;
result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
result.crawlRan = true;
result.crawlType = 'production';
crawler_logger_1.crawlerLogger.jobFailed({
job_id: 0,
store_id: 0,
store_name: dispensary.name,
duration_ms: Date.now() - startTime,
error_message: crawlError.message,
provider: 'dutchie',
});
}
}
else if (provider && provider !== 'unknown') {
// Sandbox crawl for non-Dutchie or sandbox mode
await updateScheduleStatus(dispensaryId, 'running', `Running ${provider} sandbox crawl...`, null, runId);
try {
const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(dispensaryId);
result.crawlRan = true;
result.crawlType = 'sandbox';
result.productsFound = sandboxResult.data?.productsExtracted || 0;
const detectionPart = result.detectionRan ? 'Detection + ' : '';
if (sandboxResult.success) {
result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
result.status = 'sandbox_only';
}
else {
result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
result.status = 'error';
result.error = sandboxResult.message;
}
}
catch (sandboxError) {
result.status = 'error';
result.error = sandboxError.message;
result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
result.crawlRan = true;
result.crawlType = 'sandbox';
}
}
else {
// No provider detected - detection only
if (result.detectionRan) {
result.summary = `Detection complete: provider=${dispensary.product_provider || 'unknown'}, confidence=${dispensary.product_confidence || 0}%`;
result.status = 'detection_only';
}
else {
result.summary = 'No provider detected and no crawl possible';
result.status = 'error';
result.error = 'Could not determine menu provider';
}
}
}
catch (error) {
result.status = 'error';
result.error = error.message;
result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
crawler_logger_1.crawlerLogger.queueFailure({
queue_type: 'dispensary_orchestrator',
error_message: error.message,
});
}
result.durationMs = Date.now() - startTime;
// Update final schedule status
await updateScheduleStatus(dispensaryId, result.status, result.summary, result.error || null, runId);
// Create job record
await createJobRecord(dispensaryId, scheduleId, result);
return result;
}
// ========================================
// Helper Functions
// ========================================
async function getDispensaryInfo(dispensaryId) {
const result = await migrate_1.pool.query(`SELECT id, name, city, website, menu_url,
product_provider, product_confidence, product_crawler_mode, last_product_scan_at
FROM dispensaries
WHERE id = $1`, [dispensaryId]);
return result.rows[0] || null;
}
async function checkNeedsDetection(dispensary) {
// No provider = definitely needs detection
if (!dispensary.product_provider)
return true;
// Unknown provider = needs detection
if (dispensary.product_provider === 'unknown')
return true;
// Low confidence = needs re-detection
if (dispensary.product_confidence !== null && dispensary.product_confidence < 50)
return true;
// Stale detection (> 7 days) = needs refresh
if (dispensary.last_product_scan_at) {
const daysSince = (Date.now() - new Date(dispensary.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24);
if (daysSince > 7)
return true;
}
return false;
}
async function updateScheduleStatus(dispensaryId, status, summary, error, runId) {
await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, last_status, last_summary, last_error, last_run_at, updated_at)
VALUES ($1, $2, $3, $4, NOW(), NOW())
ON CONFLICT (dispensary_id) DO UPDATE SET
last_status = $2,
last_summary = $3,
last_error = $4,
last_run_at = NOW(),
updated_at = NOW()`, [dispensaryId, status, summary, error]);
}
async function createJobRecord(dispensaryId, scheduleId, result) {
await migrate_1.pool.query(`INSERT INTO dispensary_crawl_jobs (
dispensary_id, schedule_id, job_type, trigger_type, status, priority,
scheduled_at, started_at, completed_at, duration_ms,
detection_ran, crawl_ran, crawl_type,
products_found, products_new, products_updated,
detected_provider, detected_confidence, detected_mode,
error_message, run_id
) VALUES (
$1, $2, 'orchestrator', 'manual', $3, 100,
NOW(), NOW(), NOW(), $4,
$5, $6, $7,
$8, $9, $10,
$11, $12, $13,
$14, $15
)`, [
dispensaryId,
scheduleId || null,
result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed',
result.durationMs,
result.detectionRan,
result.crawlRan,
result.crawlType || null,
result.productsFound || null,
result.productsNew || null,
result.productsUpdated || null,
result.detectionResult?.product.provider || null,
result.detectionResult?.product.confidence || null,
result.detectionResult?.product.mode || null,
result.error || null,
result.runId,
]);
// Update schedule stats
if (result.status === 'success' || result.status === 'sandbox_only' || result.status === 'detection_only') {
await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET
total_runs = COALESCE(total_runs, 0) + 1,
successful_runs = COALESCE(successful_runs, 0) + 1,
consecutive_failures = 0,
next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL,
last_duration_ms = $2
WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]);
}
else if (result.status === 'error') {
await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET
total_runs = COALESCE(total_runs, 0) + 1,
consecutive_failures = COALESCE(consecutive_failures, 0) + 1,
next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL,
last_duration_ms = $2
WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]);
}
}
// ========================================
// Batch Processing
// ========================================
/**
* Run orchestrator for multiple dispensaries
*/
async function runBatchDispensaryOrchestrator(dispensaryIds, concurrency = 3) {
const results = [];
// Process in batches
for (let i = 0; i < dispensaryIds.length; i += concurrency) {
const batch = dispensaryIds.slice(i, i + concurrency);
console.log(`Processing batch ${Math.floor(i / concurrency) + 1}: dispensaries ${batch.join(', ')}`);
const batchResults = await Promise.all(batch.map(id => runDispensaryOrchestrator(id)));
results.push(...batchResults);
// Small delay between batches to avoid overwhelming the system
if (i + concurrency < dispensaryIds.length) {
await new Promise(r => setTimeout(r, 1000));
}
}
return results;
}
/**
* Get dispensaries that are due for orchestration
*/
async function getDispensariesDueForOrchestration(limit = 10) {
const result = await migrate_1.pool.query(`SELECT d.id
FROM dispensaries d
LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
WHERE COALESCE(dcs.is_active, TRUE) = TRUE
AND (
dcs.next_run_at IS NULL
OR dcs.next_run_at <= NOW()
)
AND (dcs.last_status IS NULL OR dcs.last_status NOT IN ('running', 'pending'))
ORDER BY COALESCE(dcs.priority, 0) DESC, dcs.last_run_at ASC NULLS FIRST
LIMIT $1`, [limit]);
return result.rows.map(row => row.id);
}
/**
* Ensure all dispensaries have schedule entries
*/
async function ensureAllDispensariesHaveSchedules(intervalMinutes = 240) {
// Get all dispensary IDs that don't have a schedule
const result = await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
SELECT d.id, TRUE, $1, 0
FROM dispensaries d
WHERE NOT EXISTS (
SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
)
RETURNING id`, [intervalMinutes]);
const existingCount = await migrate_1.pool.query('SELECT COUNT(*) FROM dispensary_crawl_schedule');
return {
created: result.rowCount || 0,
existing: parseInt(existingCount.rows[0].count) - (result.rowCount || 0),
};
}
// ========================================
// Scheduler Integration
// ========================================
let dispensarySchedulerRunning = false;
/**
* Process dispensaries using the intelligent orchestrator
* Called periodically by the scheduler
*/
async function processDispensaryScheduler() {
if (dispensarySchedulerRunning) {
console.log('Dispensary scheduler already running, skipping...');
return;
}
dispensarySchedulerRunning = true;
try {
// Get dispensaries due for orchestration
const dispensaryIds = await getDispensariesDueForOrchestration(3);
if (dispensaryIds.length === 0) {
return;
}
console.log(`Dispensary Scheduler: Processing ${dispensaryIds.length} dispensaries due for crawl`);
// Process each dispensary through the orchestrator
for (const dispensaryId of dispensaryIds) {
try {
console.log(`Dispensary Scheduler: Starting crawl for dispensary ${dispensaryId}`);
const result = await runDispensaryOrchestrator(dispensaryId);
console.log(`Dispensary Scheduler: Dispensary ${dispensaryId} completed - ${result.summary}`);
}
catch (error) {
console.error(`Dispensary Scheduler: Dispensary ${dispensaryId} failed - ${error.message}`);
}
}
console.log(`Dispensary Scheduler: Finished processing ${dispensaryIds.length} dispensaries`);
}
finally {
dispensarySchedulerRunning = false;
}
}
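// Illustrative wiring sketch (editor's note, not part of this file). A scheduler process could
// drive the exports above as in the commented lines below; the module path and the 60s polling
// interval are assumptions, not taken from this commit.
// const { ensureAllDispensariesHaveSchedules, processDispensaryScheduler } = require('./dispensary-orchestrator');
// ensureAllDispensariesHaveSchedules(240).then(r => console.log(`schedules created: ${r.created}`));
// setInterval(() => { processDispensaryScheduler().catch(console.error); }, 60000);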

backend/dist/services/geolocation.js

@@ -0,0 +1,125 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.lookupProxyLocation = lookupProxyLocation;
exports.updateProxyLocation = updateProxyLocation;
exports.updateAllProxyLocations = updateAllProxyLocations;
exports.queueProxyLocationUpdate = queueProxyLocationUpdate;
const axios_1 = __importDefault(require("axios"));
const migrate_1 = require("../db/migrate");
// Free API - 45 requests/minute limit
const GEOLOCATION_API = 'http://ip-api.com/json/';
async function lookupProxyLocation(host) {
try {
const response = await axios_1.default.get(`${GEOLOCATION_API}${host}?fields=status,message,country,countryCode,regionName,city,query`);
const data = response.data;
if (data.status === 'fail') {
console.log(`❌ Geolocation lookup failed for ${host}: ${data.message}`);
return null;
}
return data;
}
catch (error) {
console.error(`❌ Error looking up location for ${host}:`, error.message);
return null;
}
}
async function updateProxyLocation(proxyId, location) {
await migrate_1.pool.query(`
UPDATE proxies
SET city = $1,
state = $2,
country = $3,
country_code = $4,
location_updated_at = CURRENT_TIMESTAMP
WHERE id = $5
`, [
location.city,
location.regionName,
location.country,
location.countryCode,
proxyId
]);
}
async function updateAllProxyLocations(batchSize = 45) {
console.log('🌍 Starting proxy location update job...');
// Get all proxies without location data
const result = await migrate_1.pool.query(`
SELECT id, host
FROM proxies
WHERE location_updated_at IS NULL
OR location_updated_at < CURRENT_TIMESTAMP - INTERVAL '30 days'
ORDER BY id
`);
const proxies = result.rows;
console.log(`📊 Found ${proxies.length} proxies to update`);
let updated = 0;
let failed = 0;
// Process in batches to respect rate limit (45 req/min)
for (let i = 0; i < proxies.length; i += batchSize) {
const batch = proxies.slice(i, i + batchSize);
console.log(`🔄 Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(proxies.length / batchSize)} (${batch.length} proxies)`);
// Process batch
for (const proxy of batch) {
const location = await lookupProxyLocation(proxy.host);
if (location) {
await updateProxyLocation(proxy.id, location);
console.log(`✅ Updated ${proxy.id}: ${location.city}, ${location.regionName} - ${location.country}`);
updated++;
}
else {
console.log(`⚠️ Failed to get location for proxy ${proxy.id} (${proxy.host})`);
failed++;
}
// Small delay between requests
await new Promise(resolve => setTimeout(resolve, 100));
}
// Wait 60 seconds before next batch to respect rate limit
if (i + batchSize < proxies.length) {
console.log(`⏳ Waiting 60s before next batch (rate limit: 45 req/min)...`);
await new Promise(resolve => setTimeout(resolve, 60000));
}
}
console.log(`✅ Proxy location update complete!`);
console.log(` Updated: ${updated}`);
console.log(` Failed: ${failed}`);
}
// Queue for background processing
const locationUpdateQueue = new Set();
let isProcessing = false;
function queueProxyLocationUpdate(proxyId) {
locationUpdateQueue.add(proxyId);
processLocationQueue();
}
async function processLocationQueue() {
if (isProcessing || locationUpdateQueue.size === 0)
return;
isProcessing = true;
try {
const proxyIds = Array.from(locationUpdateQueue);
locationUpdateQueue.clear();
console.log(`🌍 Processing ${proxyIds.length} proxy location updates from queue`);
for (const proxyId of proxyIds) {
const result = await migrate_1.pool.query('SELECT host FROM proxies WHERE id = $1', [proxyId]);
if (result.rows.length === 0)
continue;
const host = result.rows[0].host;
const location = await lookupProxyLocation(host);
if (location) {
await updateProxyLocation(proxyId, location);
console.log(`✅ Queue: Updated ${proxyId}: ${location.city}, ${location.regionName} - ${location.country}`);
}
// Respect rate limit
await new Promise(resolve => setTimeout(resolve, 1500)); // ~40 req/min
}
}
finally {
isProcessing = false;
// Process any new items that were added while we were processing
if (locationUpdateQueue.size > 0) {
processLocationQueue();
}
}
}
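// Illustrative usage sketch (editor's note, not part of this file). The batch updater is meant
// to run as a periodic job; the queue helper covers single proxies as they are added. The call
// sites are assumptions.
// const { updateAllProxyLocations, queueProxyLocationUpdate } = require('./geolocation');
// updateAllProxyLocations().catch(console.error); // batches of 45 to respect ip-api.com's 45 req/min limit
// queueProxyLocationUpdate(123);                  // fire-and-forget update for a single proxy id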


@@ -0,0 +1,493 @@
"use strict";
/**
* Multi-Category Intelligence Detector
*
* Detects providers for each intelligence category independently:
* - Products: Which provider serves product data
* - Specials: Which provider serves deals/specials
* - Brand: Which provider serves brand information
* - Metadata: Which provider serves taxonomy/category data
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.detectMultiCategoryProviders = detectMultiCategoryProviders;
exports.detectCategoryProviderChange = detectCategoryProviderChange;
exports.updateDispensaryCategoryProvider = updateDispensaryCategoryProvider;
exports.updateAllCategoryProviders = updateAllCategoryProviders;
exports.moveCategoryToSandbox = moveCategoryToSandbox;
const migrate_1 = require("../db/migrate");
const logger_1 = require("./logger");
const puppeteer_1 = __importDefault(require("puppeteer"));
// Production-ready providers per category
// Only these combinations can be set to production mode
const PRODUCTION_READY = {
product: ['dutchie'], // Only Dutchie products are production-ready
specials: [], // None yet
brand: [], // None yet
metadata: [], // None yet
};
// Provider detection patterns
const PROVIDER_PATTERNS = {
dutchie: {
scripts: [
/dutchie\.com/i,
/dutchie-plus/i,
/dutchie\.js/i,
/__DUTCHIE__/i,
/dutchie-embed/i,
],
iframes: [
/dutchie\.com/i,
/dutchie-plus\.com/i,
/embed\.dutchie/i,
],
html: [
/class="dutchie/i,
/id="dutchie/i,
/data-dutchie/i,
/"menuType":\s*"dutchie"/i,
],
apiEndpoints: [
/dutchie\.com\/graphql/i,
/plus\.dutchie\.com/i,
],
metaTags: [
/dutchie/i,
],
},
treez: {
scripts: [
/treez\.io/i,
/treez-ecommerce/i,
/treez\.js/i,
],
iframes: [
/treez\.io/i,
/shop\.treez/i,
],
html: [
/class="treez/i,
/data-treez/i,
/treez-menu/i,
],
apiEndpoints: [
/api\.treez\.io/i,
/treez\.io\/api/i,
],
metaTags: [],
},
jane: {
scripts: [
/jane\.co/i,
/iheartjane\.com/i,
/jane-frame/i,
/jane\.js/i,
],
iframes: [
/jane\.co/i,
/iheartjane\.com/i,
/embed\.iheartjane/i,
],
html: [
/class="jane/i,
/data-jane/i,
/jane-embed/i,
],
apiEndpoints: [
/api\.iheartjane/i,
/jane\.co\/api/i,
],
metaTags: [],
},
weedmaps: {
scripts: [
/weedmaps\.com/i,
/wm-menu/i,
],
iframes: [
/weedmaps\.com/i,
/menu\.weedmaps/i,
],
html: [
/data-weedmaps/i,
/wm-menu/i,
],
apiEndpoints: [
/api-g\.weedmaps/i,
/weedmaps\.com\/api/i,
],
metaTags: [],
},
leafly: {
scripts: [
/leafly\.com/i,
/leafly-menu/i,
],
iframes: [
/leafly\.com/i,
/order\.leafly/i,
],
html: [
/data-leafly/i,
/leafly-embed/i,
],
apiEndpoints: [
/api\.leafly/i,
],
metaTags: [],
},
};
// Category-specific detection signals
const CATEGORY_SIGNALS = {
product: {
urlPatterns: [/\/menu/i, /\/products/i, /\/shop/i, /\/order/i],
htmlPatterns: [/product-card/i, /menu-item/i, /product-list/i, /product-grid/i],
jsonKeys: ['products', 'menuItems', 'items', 'inventory'],
},
specials: {
urlPatterns: [/\/specials/i, /\/deals/i, /\/promotions/i, /\/offers/i],
htmlPatterns: [/special/i, /deal/i, /promotion/i, /discount/i, /sale/i],
jsonKeys: ['specials', 'deals', 'promotions', 'offers'],
},
brand: {
urlPatterns: [/\/brands/i, /\/vendors/i, /\/producers/i],
htmlPatterns: [/brand-list/i, /vendor/i, /producer/i, /manufacturer/i],
jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'],
},
metadata: {
urlPatterns: [/\/categories/i, /\/taxonomy/i],
htmlPatterns: [/category-nav/i, /menu-categories/i, /filter-category/i],
jsonKeys: ['categories', 'taxonomy', 'filters', 'types'],
},
};
// ========================================
// Main Detection Function
// ========================================
async function detectMultiCategoryProviders(websiteUrl, options = {}) {
const { timeout = 30000, headless = true, existingBrowser } = options;
let browser = null;
let page = null;
const urlsTested = [];
const rawSignals = {};
try {
browser = existingBrowser || await puppeteer_1.default.launch({
headless,
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
});
page = await browser.newPage();
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
// Navigate to main site
const baseUrl = normalizeUrl(websiteUrl);
urlsTested.push(baseUrl);
await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout });
// Collect signals from main page
const mainPageSignals = await collectPageSignals(page);
rawSignals.mainPage = mainPageSignals;
// Try common menu URLs
const menuUrls = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands'];
for (const path of menuUrls) {
try {
const fullUrl = new URL(path, baseUrl).toString();
urlsTested.push(fullUrl);
await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 15000 });
const signals = await collectPageSignals(page);
rawSignals[path] = signals;
}
catch {
// URL doesn't exist or timed out
}
}
// Analyze signals for each category
const result = {
product: analyzeCategorySignals('product', rawSignals),
specials: analyzeCategorySignals('specials', rawSignals),
brand: analyzeCategorySignals('brand', rawSignals),
metadata: analyzeCategorySignals('metadata', rawSignals),
urlsTested,
rawSignals,
};
logger_1.logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`);
return result;
}
catch (error) {
logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
// Return unknown results for all categories
return {
product: createUnknownResult(),
specials: createUnknownResult(),
brand: createUnknownResult(),
metadata: createUnknownResult(),
urlsTested,
rawSignals: { error: error.message },
};
}
finally {
if (page)
await page.close().catch(() => { });
if (browser && !existingBrowser)
await browser.close().catch(() => { });
}
}
// ========================================
// Helper Functions
// ========================================
function normalizeUrl(url) {
if (!url.startsWith('http')) {
url = 'https://' + url;
}
return url.replace(/\/$/, '');
}
async function collectPageSignals(page) {
return page.evaluate(() => {
const signals = {
scripts: [],
iframes: [],
links: [],
metaTags: [],
bodyClasses: document.body?.className || '',
bodyId: document.body?.id || '',
htmlSnippet: document.documentElement.outerHTML.slice(0, 10000),
};
// Collect script sources
document.querySelectorAll('script[src]').forEach((el) => {
signals.scripts.push(el.src);
});
// Collect inline scripts
document.querySelectorAll('script:not([src])').forEach((el) => {
const content = el.textContent || '';
if (content.length < 5000) {
signals.scripts.push(`inline:${content.slice(0, 500)}`);
}
});
// Collect iframes
document.querySelectorAll('iframe').forEach((el) => {
signals.iframes.push(el.src);
});
// Collect links
document.querySelectorAll('a[href]').forEach((el) => {
signals.links.push(el.href);
});
// Collect meta tags
document.querySelectorAll('meta').forEach((el) => {
const content = el.getAttribute('content') || '';
const name = el.getAttribute('name') || el.getAttribute('property') || '';
if (content || name) {
signals.metaTags.push(`${name}:${content}`);
}
});
// Look for JSON data
const jsonBlocks = [];
document.querySelectorAll('script[type="application/json"]').forEach((el) => {
jsonBlocks.push(el.textContent?.slice(0, 2000) || '');
});
signals.jsonBlocks = jsonBlocks;
return signals;
});
}
function analyzeCategorySignals(category, allSignals) {
const providerScores = {};
const detectedSignals = {};
// Initialize scores
for (const provider of Object.keys(PROVIDER_PATTERNS)) {
providerScores[provider] = 0;
}
// Analyze each page's signals
for (const [pagePath, signals] of Object.entries(allSignals)) {
if (!signals || typeof signals !== 'object')
continue;
// Check for provider-specific patterns
for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
let score = 0;
// Check scripts
if (signals.scripts) {
for (const script of signals.scripts) {
for (const pattern of patterns.scripts) {
if (pattern.test(script)) {
score += 20;
detectedSignals[`${provider}_script_${pagePath}`] = script;
}
}
}
}
// Check iframes
if (signals.iframes) {
for (const iframe of signals.iframes) {
for (const pattern of patterns.iframes) {
if (pattern.test(iframe)) {
score += 25;
detectedSignals[`${provider}_iframe_${pagePath}`] = iframe;
}
}
}
}
// Check HTML content
if (signals.htmlSnippet) {
for (const pattern of patterns.html) {
if (pattern.test(signals.htmlSnippet)) {
score += 15;
detectedSignals[`${provider}_html_${pagePath}`] = true;
}
}
}
providerScores[provider] += score;
}
// Check for category-specific signals on relevant pages
const categorySignals = CATEGORY_SIGNALS[category];
const isRelevantPage = categorySignals.urlPatterns.some((p) => p.test(pagePath));
if (isRelevantPage && signals.htmlSnippet) {
for (const pattern of categorySignals.htmlPatterns) {
if (pattern.test(signals.htmlSnippet)) {
detectedSignals[`${category}_html_pattern`] = true;
}
}
}
// Check JSON blocks for category data
if (signals.jsonBlocks) {
for (const json of signals.jsonBlocks) {
for (const key of categorySignals.jsonKeys) {
if (json.toLowerCase().includes(`"${key}"`)) {
detectedSignals[`${category}_json_key_${key}`] = true;
}
}
}
}
}
// Determine winning provider
let bestProvider = 'unknown';
let bestScore = 0;
for (const [provider, score] of Object.entries(providerScores)) {
if (score > bestScore) {
bestScore = score;
bestProvider = provider;
}
}
// Calculate confidence (0-100)
const confidence = Math.min(100, bestScore);
// Determine mode based on provider and confidence
const isProductionReady = PRODUCTION_READY[category].includes(bestProvider);
const mode = isProductionReady && confidence >= 70
? 'production'
: 'sandbox';
// Get template name if available
let templateName;
if (bestProvider === 'dutchie' && category === 'product') {
templateName = 'dutchie_standard';
}
else if (bestProvider === 'treez') {
templateName = 'treez_products_v0';
}
return {
provider: bestProvider,
confidence,
mode,
signals: detectedSignals,
templateName,
};
}
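// Worked example of the scoring above (illustrative numbers, not from a real site): a Dutchie
// iframe on /menu (+25), a dutchie.com script on the main page (+20) and one HTML marker (+15)
// sum to 60, so confidence = min(100, 60) = 60. Dutchie is production-ready for the 'product'
// category, but 60 < 70, so the mode stays 'sandbox'; one more iframe hit on /shop (+25) would
// raise the score to 85 and flip the mode to 'production'.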
function createUnknownResult() {
return {
provider: 'unknown',
confidence: 0,
mode: 'sandbox',
signals: {},
};
}
// ========================================
// Lightweight Per-Category Change Detection
// ========================================
async function detectCategoryProviderChange(page, category, expectedProvider) {
try {
const signals = await collectPageSignals(page);
const result = analyzeCategorySignals(category, { currentPage: signals });
if (result.provider !== expectedProvider && result.confidence > 50) {
logger_1.logger.warn('provider-detection', `Provider change detected for ${category}: ${expectedProvider} -> ${result.provider}`);
return {
changed: true,
newProvider: result.provider,
confidence: result.confidence,
};
}
return { changed: false };
}
catch (error) {
logger_1.logger.error('provider-detection', `Change detection failed: ${error.message}`);
return { changed: false };
}
}
// ========================================
// Database Operations
// ========================================
async function updateDispensaryCategoryProvider(dispensaryId, category, result) {
const columnPrefix = category === 'product' ? 'product' :
category === 'specials' ? 'specials' :
category === 'brand' ? 'brand' : 'metadata';
await migrate_1.pool.query(`UPDATE dispensaries SET
${columnPrefix}_provider = $1,
${columnPrefix}_confidence = $2,
${columnPrefix}_crawler_mode = $3,
${columnPrefix}_detection_data = $4,
updated_at = NOW()
WHERE id = $5`, [
result.provider,
result.confidence,
result.mode,
JSON.stringify(result.signals),
dispensaryId,
]);
}
async function updateAllCategoryProviders(dispensaryId, result) {
await migrate_1.pool.query(`UPDATE dispensaries SET
product_provider = $1,
product_confidence = $2,
product_crawler_mode = $3,
product_detection_data = $4,
specials_provider = $5,
specials_confidence = $6,
specials_crawler_mode = $7,
specials_detection_data = $8,
brand_provider = $9,
brand_confidence = $10,
brand_crawler_mode = $11,
brand_detection_data = $12,
metadata_provider = $13,
metadata_confidence = $14,
metadata_crawler_mode = $15,
metadata_detection_data = $16,
updated_at = NOW()
WHERE id = $17`, [
result.product.provider,
result.product.confidence,
result.product.mode,
JSON.stringify(result.product.signals),
result.specials.provider,
result.specials.confidence,
result.specials.mode,
JSON.stringify(result.specials.signals),
result.brand.provider,
result.brand.confidence,
result.brand.mode,
JSON.stringify(result.brand.signals),
result.metadata.provider,
result.metadata.confidence,
result.metadata.mode,
JSON.stringify(result.metadata.signals),
dispensaryId,
]);
}
async function moveCategoryToSandbox(dispensaryId, category, reason) {
const columnPrefix = category === 'product' ? 'product' :
category === 'specials' ? 'specials' :
category === 'brand' ? 'brand' : 'metadata';
await migrate_1.pool.query(`UPDATE dispensaries SET
${columnPrefix}_crawler_mode = 'sandbox',
${columnPrefix}_detection_data = ${columnPrefix}_detection_data || $1::jsonb,
updated_at = NOW()
WHERE id = $2`, [
JSON.stringify({ sandbox_reason: reason, sandbox_at: new Date().toISOString() }),
dispensaryId,
]);
logger_1.logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`);
}
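// Illustrative usage sketch (editor's note, not part of this file), inside an async function:
// detection followed by persistence, roughly as the dispensary orchestrator in this commit does it.
// The URL and dispensary id are placeholders.
// const { detectMultiCategoryProviders, updateAllCategoryProviders } = require('./intelligence-detector');
// const detection = await detectMultiCategoryProviders('https://example-dispensary.com');
// await updateAllCategoryProviders(42, detection);
// console.log(detection.product.provider, detection.product.confidence, detection.product.mode);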


@@ -0,0 +1,612 @@
"use strict";
/**
* Menu Provider Detection Service
*
* Detects which menu platform a dispensary is using by analyzing:
* - HTML content patterns (scripts, iframes, classes)
* - URL patterns (embedded menu paths)
* - API endpoint signatures
* - Meta tags and headers
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.detectMenuProvider = detectMenuProvider;
exports.quickDutchieCheck = quickDutchieCheck;
exports.detectProviderChange = detectProviderChange;
const puppeteer_1 = __importDefault(require("puppeteer"));
const logger_1 = require("./logger");
// Provider detection patterns
const PROVIDER_PATTERNS = {
dutchie: {
scripts: [
/dutchie/i,
/dutchie-plus/i,
/dutchie\.com/i,
/dutchie-embed/i,
],
iframes: [
/dutchie\.com/i,
/embed\.dutchie/i,
/iframe\.dutchie/i,
],
classes: [
/dutchie-/i,
/DutchieEmbed/i,
],
urls: [
/dutchie\.com/i,
/\.dutchie\./i,
],
meta: [
/dutchie/i,
],
apiEndpoints: [
/graphql.*dutchie/i,
/api\.dutchie/i,
],
htmlPatterns: [
/data-dutchie/i,
/__DUTCHIE__/i,
/dutchie-plus-iframe/i,
],
},
treez: {
scripts: [
/treez/i,
/treez\.io/i,
/treezpay/i,
],
iframes: [
/treez\.io/i,
/menu\.treez/i,
],
classes: [
/treez-/i,
],
urls: [
/treez\.io/i,
/\.treez\./i,
],
meta: [
/treez/i,
],
apiEndpoints: [
/api\.treez/i,
],
htmlPatterns: [
/data-treez/i,
/treez-embed/i,
],
},
jane: {
scripts: [
/jane\.co/i,
/iheartjane/i,
/jane-embed/i,
/janetechnologies/i,
],
iframes: [
/jane\.co/i,
/iheartjane\.com/i,
/menu\.jane/i,
],
classes: [
/jane-/i,
/iheartjane/i,
],
urls: [
/jane\.co/i,
/iheartjane\.com/i,
],
meta: [
/jane/i,
/iheartjane/i,
],
apiEndpoints: [
/api\.iheartjane/i,
/api\.jane\.co/i,
],
htmlPatterns: [
/data-jane/i,
/jane-root/i,
/jane-embed/i,
],
},
weedmaps: {
scripts: [
/weedmaps/i,
/wm\.com/i,
],
iframes: [
/weedmaps\.com/i,
/menu\.weedmaps/i,
],
classes: [
/weedmaps-/i,
/wm-/i,
],
urls: [
/weedmaps\.com/i,
],
meta: [
/weedmaps/i,
],
apiEndpoints: [
/api.*weedmaps/i,
],
htmlPatterns: [
/data-weedmaps/i,
],
},
leafly: {
scripts: [
/leafly/i,
/leafly\.com/i,
],
iframes: [
/leafly\.com/i,
/menu\.leafly/i,
],
classes: [
/leafly-/i,
],
urls: [
/leafly\.com/i,
],
meta: [
/leafly/i,
],
apiEndpoints: [
/api\.leafly/i,
],
htmlPatterns: [
/data-leafly/i,
],
},
meadow: {
scripts: [
/meadow/i,
/getmeadow/i,
],
iframes: [
/getmeadow\.com/i,
],
classes: [
/meadow-/i,
],
urls: [
/getmeadow\.com/i,
],
meta: [],
apiEndpoints: [
/api\.getmeadow/i,
],
htmlPatterns: [],
},
greenlight: {
scripts: [
/greenlight/i,
/greenlightmenu/i,
],
iframes: [
/greenlight/i,
],
classes: [
/greenlight-/i,
],
urls: [
/greenlight/i,
],
meta: [],
apiEndpoints: [],
htmlPatterns: [],
},
blaze: {
scripts: [
/blaze\.me/i,
/blazepos/i,
],
iframes: [
/blaze\.me/i,
],
classes: [
/blaze-/i,
],
urls: [
/blaze\.me/i,
],
meta: [],
apiEndpoints: [
/api\.blaze/i,
],
htmlPatterns: [],
},
flowhub: {
scripts: [
/flowhub/i,
],
iframes: [
/flowhub\.com/i,
],
classes: [
/flowhub-/i,
],
urls: [
/flowhub\.com/i,
],
meta: [],
apiEndpoints: [],
htmlPatterns: [],
},
dispense: {
scripts: [
/dispenseapp/i,
],
iframes: [
/dispenseapp\.com/i,
],
classes: [
/dispense-/i,
],
urls: [
/dispenseapp\.com/i,
],
meta: [],
apiEndpoints: [],
htmlPatterns: [],
},
cova: {
scripts: [
/covasoftware/i,
/cova\.software/i,
],
iframes: [
/cova/i,
],
classes: [
/cova-/i,
],
urls: [
/cova/i,
],
meta: [],
apiEndpoints: [],
htmlPatterns: [],
},
};
// Common menu URL paths to check
const MENU_PATHS = [
'/menu',
'/shop',
'/products',
'/order',
'/store',
'/dispensary-menu',
'/online-menu',
'/shop-all',
'/browse',
'/catalog',
];
/**
* Analyze a single page for provider signals
*/
async function analyzePageForProviders(page, url) {
const signals = [];
try {
// Get page HTML
const html = await page.content();
const lowerHtml = html.toLowerCase();
// Check each provider's patterns
for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
// Check script sources
const scripts = await page.$$eval('script[src]', els => els.map(el => el.getAttribute('src') || ''));
for (const script of scripts) {
for (const pattern of patterns.scripts) {
if (pattern.test(script)) {
signals.push({
provider: provider,
confidence: 90,
source: 'script_src',
details: script,
});
}
}
}
// Check inline scripts
const inlineScripts = await page.$$eval('script:not([src])', els => els.map(el => el.textContent || ''));
for (const scriptContent of inlineScripts) {
for (const pattern of patterns.scripts) {
if (pattern.test(scriptContent)) {
signals.push({
provider: provider,
confidence: 70,
source: 'inline_script',
details: `Pattern: ${pattern}`,
});
}
}
}
// Check iframes
const iframes = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
for (const iframe of iframes) {
for (const pattern of patterns.iframes) {
if (pattern.test(iframe)) {
signals.push({
provider: provider,
confidence: 95,
source: 'iframe_src',
details: iframe,
});
}
}
}
// Check HTML patterns
for (const pattern of patterns.htmlPatterns) {
if (pattern.test(html)) {
signals.push({
provider: provider,
confidence: 85,
source: 'html_pattern',
details: `Pattern: ${pattern}`,
});
}
}
// Check CSS classes
for (const pattern of patterns.classes) {
if (pattern.test(html)) {
signals.push({
provider: provider,
confidence: 60,
source: 'css_class',
details: `Pattern: ${pattern}`,
});
}
}
// Check meta tags
const metaTags = await page.$$eval('meta', els => els.map(el => `${el.getAttribute('name')} ${el.getAttribute('content')}`));
for (const meta of metaTags) {
for (const pattern of patterns.meta) {
if (pattern.test(meta)) {
signals.push({
provider: provider,
confidence: 80,
source: 'meta_tag',
details: meta,
});
}
}
}
}
// Check for network requests (if we intercepted them)
// This would be enhanced with request interception
}
catch (error) {
logger_1.logger.error('provider-detection', `Error analyzing page ${url}: ${error}`);
}
return signals;
}
/**
* Aggregate signals into a final detection result
*/
function aggregateSignals(signals) {
if (signals.length === 0) {
return { provider: 'unknown', confidence: 0 };
}
// Group signals by provider
const providerScores = {};
for (const signal of signals) {
if (!providerScores[signal.provider]) {
providerScores[signal.provider] = [];
}
providerScores[signal.provider].push(signal.confidence);
}
// Calculate weighted score for each provider
const scores = [];
for (const [provider, confidences] of Object.entries(providerScores)) {
// Use max confidence + bonus for multiple signals
const maxConf = Math.max(...confidences);
const multiSignalBonus = Math.min(10, (confidences.length - 1) * 3);
const score = Math.min(100, maxConf + multiSignalBonus);
scores.push({ provider: provider, score });
}
// Sort by score descending
scores.sort((a, b) => b.score - a.score);
const best = scores[0];
// If there's a clear winner (20+ point lead), use it
if (scores.length === 1 || best.score - scores[1].score >= 20) {
return { provider: best.provider, confidence: best.score };
}
// Multiple contenders - reduce confidence
return { provider: best.provider, confidence: Math.max(50, best.score - 20) };
}
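// Worked example of the aggregation above (illustrative numbers): confidences [95, 90, 70] for
// jane give max 95 plus a multi-signal bonus of min(10, (3 - 1) * 3) = 6, capped at 100. With a
// runner-up at 60 the 40-point lead is >= 20, so the result is { provider: 'jane', confidence: 100 };
// with a runner-up at 85 the lead is under 20 and confidence drops to max(50, 100 - 20) = 80.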
/**
* Detect the menu provider for a dispensary
*/
async function detectMenuProvider(websiteUrl, options = {}) {
const { checkMenuPaths = true, timeout = 30000 } = options;
const result = {
provider: 'unknown',
confidence: 0,
signals: [],
urlsTested: [],
menuEntryPoints: [],
rawSignals: {},
};
let browser = null;
try {
// Normalize URL
let baseUrl = websiteUrl.trim();
if (!baseUrl.startsWith('http')) {
baseUrl = `https://${baseUrl}`;
}
baseUrl = baseUrl.replace(/\/$/, ''); // Remove trailing slash
// Launch browser
browser = await puppeteer_1.default.launch({
headless: true,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
],
});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
// Track network requests for API detection
const apiRequests = [];
await page.setRequestInterception(true);
page.on('request', (request) => {
const url = request.url();
if (url.includes('api') || url.includes('graphql')) {
apiRequests.push(url);
}
request.continue();
});
// URLs to check
const urlsToCheck = [baseUrl];
if (checkMenuPaths) {
for (const path of MENU_PATHS) {
urlsToCheck.push(`${baseUrl}${path}`);
}
}
// Check each URL
for (const url of urlsToCheck) {
try {
result.urlsTested.push(url);
await page.goto(url, {
waitUntil: 'networkidle2',
timeout,
});
// Wait a bit for dynamic content
await new Promise(r => setTimeout(r, 2000));
// Analyze page
const pageSignals = await analyzePageForProviders(page, url);
result.signals.push(...pageSignals);
// Track if this URL has menu content
const hasMenuContent = await page.evaluate(() => {
const text = document.body.innerText.toLowerCase();
return (text.includes('add to cart') ||
text.includes('add to bag') ||
text.includes('product') ||
text.includes('indica') ||
text.includes('sativa') ||
text.includes('hybrid') ||
text.includes('thc') ||
text.includes('cbd'));
});
if (hasMenuContent && url !== baseUrl) {
result.menuEntryPoints.push(url);
}
}
catch (pageError) {
// 404s are fine, just skip
if (!pageError.message?.includes('404')) {
logger_1.logger.warn('provider-detection', `Could not load ${url}: ${pageError.message}`);
}
}
}
// Check API requests for provider hints
for (const apiUrl of apiRequests) {
for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
for (const pattern of patterns.apiEndpoints) {
if (pattern.test(apiUrl)) {
result.signals.push({
provider: provider,
confidence: 95,
source: 'api_request',
details: apiUrl,
});
}
}
}
}
// Record raw signals
result.rawSignals = {
apiRequestsFound: apiRequests.length,
menuEntryPointsFound: result.menuEntryPoints.length,
totalSignals: result.signals.length,
uniqueProviders: [...new Set(result.signals.map(s => s.provider))].length,
};
// Aggregate signals into final result
const aggregated = aggregateSignals(result.signals);
result.provider = aggregated.provider;
result.confidence = aggregated.confidence;
}
catch (error) {
result.error = error.message;
logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
}
finally {
if (browser) {
await browser.close();
}
}
return result;
}
/**
* Quick check if a site has Dutchie - used during production crawls
*/
async function quickDutchieCheck(page) {
try {
const html = await page.content();
// Check for Dutchie-specific patterns
const dutchiePatterns = [
/dutchie/i,
/dutchie-plus/i,
/__DUTCHIE__/i,
/data-dutchie/i,
/embed\.dutchie/i,
];
for (const pattern of dutchiePatterns) {
if (pattern.test(html)) {
return true;
}
}
// Check iframes
const iframes = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
for (const iframe of iframes) {
if (/dutchie/i.test(iframe)) {
return true;
}
}
return false;
}
catch {
return false;
}
}
/**
* Check if provider has changed from expected
*/
async function detectProviderChange(page, expectedProvider) {
try {
const signals = await analyzePageForProviders(page, page.url());
const aggregated = aggregateSignals(signals);
// If we expected Dutchie but found something else with high confidence
if (expectedProvider === 'dutchie' && aggregated.provider !== 'dutchie' && aggregated.confidence >= 70) {
return {
changed: true,
newProvider: aggregated.provider,
confidence: aggregated.confidence,
};
}
// If we expected Dutchie and found nothing/low confidence, might have switched
if (expectedProvider === 'dutchie' && aggregated.confidence < 30) {
// Check if Dutchie is definitely NOT present
const hasDutchie = await quickDutchieCheck(page);
if (!hasDutchie) {
return {
changed: true,
newProvider: aggregated.provider !== 'unknown' ? aggregated.provider : 'other',
confidence: Math.max(30, aggregated.confidence),
};
}
}
return { changed: false };
}
catch {
return { changed: false };
}
}
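// Illustrative usage sketch (editor's note, not part of this file), inside an async function.
// The module path and URL are placeholders.
// const { detectMenuProvider } = require('./provider-detection');
// const res = await detectMenuProvider('example-dispensary.com', { checkMenuPaths: true });
// console.log(res.provider, res.confidence, res.menuEntryPoints);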


@@ -3,22 +3,92 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.isBotDetectionError = isBotDetectionError;
exports.putProxyInTimeout = putProxyInTimeout;
exports.isProxyInTimeout = isProxyInTimeout;
exports.getActiveProxy = getActiveProxy;
exports.testProxy = testProxy;
exports.saveProxyTestResult = saveProxyTestResult;
exports.testAllProxies = testAllProxies;
exports.addProxy = addProxy;
exports.addProxiesFromList = addProxiesFromList;
exports.moveProxyToFailed = moveProxyToFailed;
exports.incrementProxyFailure = incrementProxyFailure;
const axios_1 = __importDefault(require("axios"));
const socks_proxy_agent_1 = require("socks-proxy-agent");
const https_proxy_agent_1 = require("https-proxy-agent");
const migrate_1 = require("../db/migrate");
// In-memory proxy timeout tracking
// Maps proxy ID to timestamp when timeout expires
const proxyTimeouts = new Map();
const PROXY_TIMEOUT_MS = 35000; // 35 seconds timeout for bot-detected proxies
// Check if error message indicates bot detection
function isBotDetectionError(errorMsg) {
const botPatterns = [
/bot detection/i,
/captcha/i,
/challenge/i,
/cloudflare/i,
/access denied/i,
/rate limit/i,
/too many requests/i,
/temporarily blocked/i,
/suspicious activity/i,
];
return botPatterns.some(pattern => pattern.test(errorMsg));
}
// Put proxy in timeout (bot detection cooldown)
function putProxyInTimeout(proxyId, reason) {
const timeoutUntil = Date.now() + PROXY_TIMEOUT_MS;
proxyTimeouts.set(proxyId, timeoutUntil);
console.log(`🚫 Proxy ${proxyId} in timeout for ${PROXY_TIMEOUT_MS / 1000}s: ${reason}`);
}
// Check if proxy is currently in timeout
function isProxyInTimeout(proxyId) {
const timeoutUntil = proxyTimeouts.get(proxyId);
if (!timeoutUntil)
return false;
if (Date.now() >= timeoutUntil) {
// Timeout expired, remove it
proxyTimeouts.delete(proxyId);
console.log(`✅ Proxy ${proxyId} timeout expired, back in rotation`);
return false;
}
return true;
}
// Get active proxy that's not in timeout
async function getActiveProxy() {
const result = await migrate_1.pool.query(`
SELECT id, host, port, protocol, username, password
FROM proxies
WHERE active = true
ORDER BY RANDOM()
`);
// Filter out proxies in timeout
for (const proxy of result.rows) {
if (!isProxyInTimeout(proxy.id)) {
return proxy;
}
}
// All proxies are in timeout, wait for first one to expire
if (proxyTimeouts.size > 0) {
const nextAvailable = Math.min(...Array.from(proxyTimeouts.values()));
const waitTime = Math.max(0, nextAvailable - Date.now());
console.log(`⏳ All proxies in timeout, waiting ${Math.ceil(waitTime / 1000)}s for next available...`);
await new Promise(resolve => setTimeout(resolve, waitTime));
// Try again after waiting
return getActiveProxy();
}
console.log('⚠️ No active proxies available');
return null;
}
async function getSettings() {
const result = await migrate_1.pool.query(`
SELECT key, value FROM settings
WHERE key IN ('proxy_timeout_ms', 'proxy_test_url')
`);
const settings = {};
result.rows.forEach(row => {
result.rows.forEach((row) => {
settings[row.key] = row.value;
});
return {
@@ -146,12 +216,44 @@ async function addProxy(host, port, protocol, username, password) {
async function addProxiesFromList(proxies) {
let added = 0;
let failed = 0;
let duplicates = 0;
const errors = [];
console.log(`📥 Importing ${proxies.length} proxies without testing...`);
for (const proxy of proxies) {
try {
await addProxy(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
added++;
console.log(`✅ Added proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
// Insert without testing first; RETURNING id tells us whether a new row was actually created
const insertResult = await migrate_1.pool.query(`
INSERT INTO proxies (host, port, protocol, username, password, active)
VALUES ($1, $2, $3, $4, $5, false)
ON CONFLICT (host, port, protocol) DO NOTHING
RETURNING id
`, [
proxy.host,
proxy.port,
proxy.protocol,
proxy.username,
proxy.password
]);
if (insertResult.rows.length > 0) {
// New row inserted
added++;
if (added % 100 === 0) {
console.log(`📥 Imported ${added} proxies...`);
}
}
else {
// Conflict on (host, port, protocol): proxy already exists
duplicates++;
}
}
catch (error) {
failed++;
@@ -159,8 +261,63 @@ async function addProxiesFromList(proxies) {
errors.push(errorMsg);
console.log(`❌ Failed to add proxy: ${errorMsg}`);
}
// Small delay between adds
await new Promise(resolve => setTimeout(resolve, 500));
}
return { added, failed, errors };
console.log(`✅ Import complete: ${added} added, ${duplicates} duplicates, ${failed} failed`);
return { added, failed, duplicates, errors };
}
async function moveProxyToFailed(proxyId, errorMsg) {
// Get proxy details
const proxyResult = await migrate_1.pool.query(`
SELECT host, port, protocol, username, password, failure_count
FROM proxies
WHERE id = $1
`, [proxyId]);
if (proxyResult.rows.length === 0) {
return;
}
const proxy = proxyResult.rows[0];
// Insert into failed_proxies table
await migrate_1.pool.query(`
INSERT INTO failed_proxies (host, port, protocol, username, password, failure_count, last_error)
VALUES ($1, $2, $3, $4, $5, $6, $7)
ON CONFLICT (host, port, protocol)
DO UPDATE SET
failure_count = $6,
last_error = $7,
failed_at = CURRENT_TIMESTAMP
`, [
proxy.host,
proxy.port,
proxy.protocol,
proxy.username,
proxy.password,
proxy.failure_count,
errorMsg
]);
// Delete from active proxies
await migrate_1.pool.query(`DELETE FROM proxies WHERE id = $1`, [proxyId]);
console.log(`🔴 Moved proxy to failed: ${proxy.protocol}://${proxy.host}:${proxy.port} (${proxy.failure_count} failures)`);
}
async function incrementProxyFailure(proxyId, errorMsg) {
// Increment failure count
const result = await migrate_1.pool.query(`
UPDATE proxies
SET failure_count = failure_count + 1,
active = false,
updated_at = CURRENT_TIMESTAMP
WHERE id = $1
RETURNING failure_count, host, port, protocol
`, [proxyId]);
if (result.rows.length === 0) {
return false;
}
const proxy = result.rows[0];
const failureCount = proxy.failure_count;
console.log(`⚠️ Proxy failure #${failureCount}: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
// If failed 3 times, move to failed table
if (failureCount >= 3) {
await moveProxyToFailed(proxyId, errorMsg);
return true; // Moved to failed
}
return false; // Still in active proxies
}
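// Illustrative usage sketch (editor's note, not part of this diff), inside an async function:
// how a crawl loop might use the rotation and timeout helpers above. The request itself is
// elided and the overall flow is an assumption.
// const { getActiveProxy, isBotDetectionError, putProxyInTimeout, incrementProxyFailure } = require('./proxy');
// const proxy = await getActiveProxy();                        // skips proxies in the 35s timeout window
// try { /* ...make the request through proxy... */ }
// catch (err) {
//   if (isBotDetectionError(err.message)) putProxyInTimeout(proxy.id, err.message);
//   else await incrementProxyFailure(proxy.id, err.message);   // third failure moves it to failed_proxies
// }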

backend/dist/services/proxyTestQueue.js

@@ -0,0 +1,174 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.cleanupOrphanedJobs = cleanupOrphanedJobs;
exports.createProxyTestJob = createProxyTestJob;
exports.getProxyTestJob = getProxyTestJob;
exports.getActiveProxyTestJob = getActiveProxyTestJob;
exports.cancelProxyTestJob = cancelProxyTestJob;
const migrate_1 = require("../db/migrate");
const proxy_1 = require("./proxy");
// Simple in-memory queue - could be replaced with Bull/Bee-Queue for production
const activeJobs = new Map();
// Clean up orphaned jobs on server startup
async function cleanupOrphanedJobs() {
try {
const result = await migrate_1.pool.query(`
UPDATE proxy_test_jobs
SET status = 'cancelled',
completed_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
WHERE status IN ('pending', 'running')
RETURNING id
`);
if (result.rows.length > 0) {
console.log(`🧹 Cleaned up ${result.rows.length} orphaned proxy test jobs`);
}
}
catch (error) {
console.error('Error cleaning up orphaned jobs:', error);
}
}
async function createProxyTestJob() {
// Check for existing running jobs first
const existingJob = await getActiveProxyTestJob();
if (existingJob) {
throw new Error('A proxy test job is already running. Please cancel it first.');
}
const result = await migrate_1.pool.query(`
SELECT COUNT(*) as count FROM proxies
`);
const totalProxies = parseInt(result.rows[0].count);
const jobResult = await migrate_1.pool.query(`
INSERT INTO proxy_test_jobs (status, total_proxies)
VALUES ('pending', $1)
RETURNING id
`, [totalProxies]);
const jobId = jobResult.rows[0].id;
// Start job in background
runProxyTestJob(jobId).catch(err => {
console.error(`❌ Proxy test job ${jobId} failed:`, err);
});
return jobId;
}
async function getProxyTestJob(jobId) {
const result = await migrate_1.pool.query(`
SELECT id, status, total_proxies, tested_proxies, passed_proxies, failed_proxies
FROM proxy_test_jobs
WHERE id = $1
`, [jobId]);
if (result.rows.length === 0) {
return null;
}
return result.rows[0];
}
async function getActiveProxyTestJob() {
const result = await migrate_1.pool.query(`
SELECT id, status, total_proxies, tested_proxies, passed_proxies, failed_proxies
FROM proxy_test_jobs
WHERE status IN ('pending', 'running')
ORDER BY created_at DESC
LIMIT 1
`);
if (result.rows.length === 0) {
return null;
}
return result.rows[0];
}
async function cancelProxyTestJob(jobId) {
// Try to cancel in-memory job first
const jobControl = activeJobs.get(jobId);
if (jobControl) {
jobControl.cancelled = true;
}
// Always update database to handle orphaned jobs
const result = await migrate_1.pool.query(`
UPDATE proxy_test_jobs
SET status = 'cancelled',
completed_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
WHERE id = $1 AND status IN ('pending', 'running')
RETURNING id
`, [jobId]);
return result.rows.length > 0;
}
async function runProxyTestJob(jobId) {
// Register job as active
activeJobs.set(jobId, { cancelled: false });
try {
// Update status to running
await migrate_1.pool.query(`
UPDATE proxy_test_jobs
SET status = 'running',
started_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [jobId]);
console.log(`🔍 Starting proxy test job ${jobId}...`);
// Get all proxies
const result = await migrate_1.pool.query(`
SELECT id, host, port, protocol, username, password
FROM proxies
ORDER BY id
`);
let tested = 0;
let passed = 0;
let failed = 0;
for (const proxy of result.rows) {
// Check if job was cancelled
const jobControl = activeJobs.get(jobId);
if (jobControl?.cancelled) {
console.log(`⏸️ Proxy test job ${jobId} cancelled`);
break;
}
// Test the proxy
const testResult = await (0, proxy_1.testProxy)(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
// Save result
await (0, proxy_1.saveProxyTestResult)(proxy.id, testResult);
tested++;
if (testResult.success) {
passed++;
}
else {
failed++;
}
// Update job progress
await migrate_1.pool.query(`
UPDATE proxy_test_jobs
SET tested_proxies = $1,
passed_proxies = $2,
failed_proxies = $3,
updated_at = CURRENT_TIMESTAMP
WHERE id = $4
`, [tested, passed, failed, jobId]);
// Log progress every 10 proxies
if (tested % 10 === 0) {
console.log(`📊 Job ${jobId}: ${tested}/${result.rows.length} proxies tested (${passed} passed, ${failed} failed)`);
}
}
// Mark job as completed
const jobControl = activeJobs.get(jobId);
const finalStatus = jobControl?.cancelled ? 'cancelled' : 'completed';
await migrate_1.pool.query(`
UPDATE proxy_test_jobs
SET status = $1,
completed_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
WHERE id = $2
`, [finalStatus, jobId]);
console.log(`✅ Proxy test job ${jobId} ${finalStatus}: ${tested} tested, ${passed} passed, ${failed} failed`);
}
catch (error) {
console.error(`❌ Proxy test job ${jobId} error:`, error);
await migrate_1.pool.query(`
UPDATE proxy_test_jobs
SET status = 'failed',
completed_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [jobId]);
}
finally {
// Remove from active jobs
activeJobs.delete(jobId);
}
}
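// Illustrative usage sketch (editor's note, not part of this file), inside an async function:
// an API route could start a run and poll its progress. The route shape is an assumption; the
// functions are the exports above.
// const { createProxyTestJob, getProxyTestJob, cancelProxyTestJob } = require('./proxyTestQueue');
// const jobId = await createProxyTestJob();      // throws if a job is already pending/running
// const status = await getProxyTestJob(jobId);   // { status, total_proxies, tested_proxies, ... }
// await cancelProxyTestJob(jobId);               // works even if the in-memory job handle was lost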


@@ -18,7 +18,7 @@ async function getSettings() {
WHERE key IN ('scrape_interval_hours', 'scrape_specials_time')
`);
const settings = {};
result.rows.forEach(row => {
result.rows.forEach((row) => {
settings[row.key] = row.value;
});
return {


@@ -4,10 +4,13 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.debugDutchiePage = debugDutchiePage;
const puppeteer_1 = __importDefault(require("puppeteer"));
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const logger_1 = require("./logger");
// Apply stealth plugin
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
async function debugDutchiePage(url) {
const browser = await puppeteer_1.default.launch({
const browser = await puppeteer_extra_1.default.launch({
headless: 'new',
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
});


@@ -0,0 +1,236 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.scrapeCategoryPlaywright = scrapeCategoryPlaywright;
exports.testScrapeCategoryPlaywright = testScrapeCategoryPlaywright;
const age_gate_playwright_1 = require("../utils/age-gate-playwright");
const logger_1 = require("./logger");
const stealthBrowser_1 = require("../utils/stealthBrowser");
const dutchie_1 = require("../scrapers/templates/dutchie");
/**
* Scrapes a category page using Playwright with stealth mode to extract product information
*/
async function scrapeCategoryPlaywright(categoryUrl, categoryName, state = 'Arizona', proxy) {
logger_1.logger.info('scraper', `Scraping category: ${categoryName}`);
logger_1.logger.info('scraper', `URL: ${categoryUrl}`);
// Create stealth browser with optional proxy
const browser = await (0, stealthBrowser_1.createStealthBrowser)({ proxy, headless: true });
try {
// Create stealth context with age gate cookies
const context = await (0, stealthBrowser_1.createStealthContext)(browser, { state });
// Try to load saved session cookies
const cookiesPath = `/tmp/dutchie-session-${state.toLowerCase()}.json`;
await (0, stealthBrowser_1.loadCookies)(context, cookiesPath);
const page = await context.newPage();
// Navigate to category page
logger_1.logger.info('scraper', `Loading page: ${categoryUrl}`);
await page.goto(categoryUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
// Random delay to appear more human
await (0, stealthBrowser_1.randomDelay)(1000, 2000);
// Check for Cloudflare challenge
if (await (0, stealthBrowser_1.isCloudflareChallenge)(page)) {
logger_1.logger.info('scraper', '🛡️ Cloudflare challenge detected, waiting...');
const passed = await (0, stealthBrowser_1.waitForCloudflareChallenge)(page, 30000);
if (!passed) {
logger_1.logger.error('scraper', '❌ Failed to pass Cloudflare challenge');
await browser.close();
return [];
}
// Save successful session cookies
await (0, stealthBrowser_1.saveCookies)(context, cookiesPath);
}
// Wait for page to be fully loaded
await (0, stealthBrowser_1.waitForPageLoad)(page);
// Simulate human behavior
await (0, stealthBrowser_1.simulateHumanBehavior)(page);
// Check for and bypass age gate
const bypassed = await (0, age_gate_playwright_1.bypassAgeGatePlaywright)(page, state);
if (!bypassed) {
logger_1.logger.error('scraper', 'Failed to bypass age gate');
await browser.close();
return [];
}
// Wait for products to load with random delay
logger_1.logger.info('scraper', 'Waiting for products to load...');
await (0, stealthBrowser_1.randomDelay)(2000, 4000);
// Scroll to load all products with human-like behavior
logger_1.logger.info('scraper', 'Scrolling to load all products...');
await scrollToBottomHuman(page);
// Extract products
logger_1.logger.info('scraper', 'Extracting products from page...');
const products = await extractProducts(page, categoryUrl, categoryName);
logger_1.logger.info('scraper', `Found ${products.length} products`);
await browser.close();
return products;
}
catch (error) {
logger_1.logger.error('scraper', `Error scraping category: ${error}`);
await browser.close();
return [];
}
}
/**
* Scrolls to the bottom of the page with human-like behavior
*/
async function scrollToBottomHuman(page) {
let previousHeight = 0;
let currentHeight = await page.evaluate(() => document.body.scrollHeight);
let attempts = 0;
const maxAttempts = 20;
while (previousHeight < currentHeight && attempts < maxAttempts) {
previousHeight = currentHeight;
// Scroll down in chunks with randomized delays
const scrollAmount = Math.floor(Math.random() * 200) + 300; // 300-500px
await (0, stealthBrowser_1.humanScroll)(page, scrollAmount);
// Random pause like a human reading
await (0, stealthBrowser_1.randomDelay)(500, 1500);
// Check new height
currentHeight = await page.evaluate(() => document.body.scrollHeight);
attempts++;
}
// Final wait for any lazy-loaded content
await (0, stealthBrowser_1.randomDelay)(1000, 2000);
}
/**
* Extracts product information from the page
*/
async function extractProducts(page, categoryUrl, categoryName) {
let products = [];
// Check if we have a template for this URL
const template = (0, dutchie_1.getTemplateForUrl)(categoryUrl);
if (template) {
logger_1.logger.info('scraper', `Using ${template.name} template for extraction`);
try {
const templateProducts = await template.extractProducts(page);
// Add category to products from template
products = templateProducts.map(p => ({
...p,
category: categoryName,
}));
logger_1.logger.info('scraper', `Template extracted ${products.length} products`);
return products;
}
catch (err) {
logger_1.logger.error('scraper', `Template extraction failed: ${err}`);
// Fall through to fallback methods
}
}
// Fallback Method 1: Dutchie products (for Sol Flower, etc.)
try {
const dutchieProducts = await page.locator('[data-testid^="product-"], .product-card, [class*="ProductCard"]').all();
if (dutchieProducts.length > 0) {
logger_1.logger.info('scraper', `Found ${dutchieProducts.length} Dutchie-style products`);
for (const productEl of dutchieProducts) {
try {
const name = await productEl.locator('[data-testid="product-name"], .product-name, h3, h4').first().textContent() || '';
const brand = await productEl.locator('[data-testid="product-brand"], .product-brand, .brand').first().textContent().catch(() => '');
const priceText = await productEl.locator('[data-testid="product-price"], .product-price, .price').first().textContent().catch(() => '');
const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => '');
const productLink = await productEl.locator('a').first().getAttribute('href').catch(() => '');
// Parse price
const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined;
if (name) {
products.push({
name: name.trim(),
brand: brand ? brand.trim() : undefined,
category: categoryName,
price,
image_url: imageUrl || undefined,
product_url: productLink ? new URL(productLink, categoryUrl).toString() : categoryUrl,
in_stock: true
});
}
}
catch (err) {
logger_1.logger.warn('scraper', `Error extracting Dutchie product: ${err}`);
}
}
}
}
catch (err) {
logger_1.logger.warn('scraper', `Dutchie product extraction failed: ${err}`);
}
// Method 2: Curaleaf products
if (products.length === 0) {
try {
const curaleafProducts = await page.locator('.product, [class*="Product"], [class*="item"]').all();
if (curaleafProducts.length > 0) {
logger_1.logger.info('scraper', `Found ${curaleafProducts.length} Curaleaf-style products`);
for (const productEl of curaleafProducts) {
try {
const name = await productEl.locator('h1, h2, h3, h4, .title, .name').first().textContent() || '';
const priceText = await productEl.locator('.price, [class*="price"]').first().textContent().catch(() => '');
const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => '');
const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined;
if (name && name.length > 3) {
products.push({
name: name.trim(),
category: categoryName,
price,
image_url: imageUrl || undefined,
product_url: categoryUrl,
in_stock: true
});
}
}
catch (err) {
logger_1.logger.warn('scraper', `Error extracting Curaleaf product: ${err}`);
}
}
}
}
catch (err) {
logger_1.logger.warn('scraper', `Curaleaf product extraction failed: ${err}`);
}
}
// Method 3: Generic product cards
if (products.length === 0) {
try {
const genericProducts = await page.locator('article, [role="article"], .card, [class*="card"]').all();
logger_1.logger.info('scraper', `Trying generic selectors, found ${genericProducts.length} elements`);
for (const productEl of genericProducts) {
try {
const text = await productEl.textContent() || '';
// Only consider elements that look like products
if (text.includes('$') || text.toLowerCase().includes('price') || text.toLowerCase().includes('thc')) {
const name = await productEl.locator('h1, h2, h3, h4').first().textContent() || '';
if (name && name.length > 3) {
products.push({
name: name.trim(),
category: categoryName,
product_url: categoryUrl,
in_stock: true
});
}
}
}
catch (err) {
// Skip this element
}
}
}
catch (err) {
logger_1.logger.warn('scraper', `Generic product extraction failed: ${err}`);
}
}
return products;
}
/**
* Test function to scrape a single category
*/
async function testScrapeCategoryPlaywright(url, categoryName, state = 'Arizona') {
console.log(`\n🎭 Testing Playwright Category Scraper\n`);
console.log(`Category: ${categoryName}`);
console.log(`URL: ${url}\n`);
const products = await scrapeCategoryPlaywright(url, categoryName, state);
console.log(`\n✅ Found ${products.length} products\n`);
products.slice(0, 5).forEach((p, i) => {
console.log(`${i + 1}. ${p.name}`);
if (p.brand)
console.log(` Brand: ${p.brand}`);
if (p.price)
console.log(` Price: $${p.price}`);
console.log(` URL: ${p.product_url}`);
console.log('');
});
return products;
}
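For reference, a one-off run of the test helper above might look like this; the require path and URL are illustrative guesses, since the compiled module's filename is not shown in this hunk:
// Hypothetical manual invocation of testScrapeCategoryPlaywright (require path is a guess).
const { testScrapeCategoryPlaywright } = require('./dist/services/scraper-playwright');
testScrapeCategoryPlaywright(
    'https://example-dispensary.com/stores/example-az/products/flower', // illustrative URL
    'Flower',
    'Arizona'
).then((products) => {
    console.log(`Scraped ${products.length} products`);
    process.exit(0);
}).catch((err) => {
    console.error('Test scrape failed:', err);
    process.exit(1);
});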

View File

@@ -3,20 +3,52 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.USER_AGENT_GROUPS = exports.USER_AGENTS = void 0;
exports.getUserAgent = getUserAgent;
exports.scrapeCategory = scrapeCategory;
exports.saveProducts = saveProducts;
exports.scrapeStore = scrapeStore;
const puppeteer_1 = __importDefault(require("puppeteer"));
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const migrate_1 = require("../db/migrate");
const minio_1 = require("../utils/minio");
const logger_1 = require("./logger");
const USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
];
function getRandomUserAgent() {
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
const scraper_monitor_1 = require("../routes/scraper-monitor");
const proxy_1 = require("./proxy");
const age_gate_1 = require("../utils/age-gate");
const availability_1 = require("./availability");
// Apply stealth plugin for anti-detection and anti-fingerprinting

puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
exports.USER_AGENTS = {
'chrome-windows': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'chrome-mac': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'chrome-linux': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'mobile-ios': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
'mobile-android': 'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
'googlebot': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
'bingbot': 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'
};
exports.USER_AGENT_GROUPS = {
desktop: ['chrome-windows', 'chrome-mac', 'chrome-linux'],
mobile: ['mobile-ios', 'mobile-android'],
serp: ['googlebot', 'bingbot']
};
function getRandomUserAgentFromGroup(group) {
const randomKey = group[Math.floor(Math.random() * group.length)];
return exports.USER_AGENTS[randomKey];
}
function getUserAgent(key) {
if (!key)
return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
// Check if it's a group
if (key === 'rotate-desktop')
return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
if (key === 'rotate-mobile')
return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.mobile);
if (key === 'rotate-serp')
return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.serp);
// Otherwise treat as specific UA
return exports.USER_AGENTS[key] || getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
}
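getUserAgent accepts either a specific key from USER_AGENTS or one of the rotate-* aliases; anything unrecognized falls back to a random desktop UA. A short sketch of the expected results, in the same module scope as the function above:
// Expected behavior of getUserAgent for the different key styles.
const fixedUA = getUserAgent('chrome-mac'); // exact entry from USER_AGENTS
const mobileUA = getUserAgent('rotate-mobile'); // random pick from mobile-ios / mobile-android
const serpUA = getUserAgent('rotate-serp'); // random pick from googlebot / bingbot
const fallbackUA = getUserAgent('no-such-key'); // unknown key -> random desktop UA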
function extractImageIdFromUrl(url) {
try {
@@ -44,19 +76,6 @@ function sanitizeProductData(product) {
cbd: product.cbd && product.cbd < 100 ? product.cbd : null
};
}
async function getActiveProxy() {
const result = await migrate_1.pool.query(`
SELECT host, port, protocol, username, password
FROM proxies
WHERE active = true AND is_anonymous = true
ORDER BY RANDOM()
LIMIT 1
`);
if (result.rows.length === 0) {
return null;
}
return result.rows[0];
}
async function makePageStealthy(page) {
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', {
@@ -86,12 +105,11 @@ async function makePageStealthy(page) {
});
}
async function scrapeProductDetails(page, productUrl, productName) {
const maxRetries = 2;
const maxRetries = 3;
let lastError = null;
for (let attempt = 1; attempt <= maxRetries; attempt++) {
try {
await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 20000 });
await page.waitForTimeout(3000);
await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
const details = await page.evaluate(() => {
const allText = document.body.textContent || '';
let fullSizeImage = null;
@@ -233,9 +251,7 @@ async function scrapeProductDetails(page, productUrl, productName) {
catch (error) {
lastError = error;
logger_1.logger.warn('scraper', ` Attempt ${attempt}/${maxRetries} failed for ${productName}: ${error}`);
if (attempt < maxRetries) {
await page.waitForTimeout(2000);
}
// No delays - just retry immediately
}
}
logger_1.logger.error('scraper', ` ✗ All attempts failed for ${productName}`);
@@ -253,8 +269,10 @@ async function scrapeProductDetails(page, productUrl, productName) {
weights: []
};
}
async function scrapeCategory(storeId, categoryId) {
async function scrapeCategory(storeId, categoryId, userAgent) {
let browser = null;
const scraperId = `cat-${categoryId}-${Date.now()}`;
let proxyId = null;
try {
const categoryResult = await migrate_1.pool.query(`
SELECT c.*, s.slug as store_slug, s.name as store_name
@@ -267,7 +285,12 @@ async function scrapeCategory(storeId, categoryId) {
}
const category = categoryResult.rows[0];
logger_1.logger.info('scraper', `Scraping category: ${category.name} for ${category.store_name}`);
const proxy = await getActiveProxy();
// Register scraper with monitoring system
(0, scraper_monitor_1.registerScraper)(scraperId, storeId, category.store_name, categoryId, category.name);
const proxy = await (0, proxy_1.getActiveProxy)();
if (proxy) {
proxyId = proxy.id;
}
const launchOptions = {
headless: 'new',
args: [
@@ -287,24 +310,51 @@ async function scrapeCategory(storeId, categoryId) {
}
logger_1.logger.info('scraper', `Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
}
browser = await puppeteer_1.default.launch(launchOptions);
browser = await puppeteer_extra_1.default.launch(launchOptions);
const page = await browser.newPage();
await makePageStealthy(page);
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent(getRandomUserAgent());
// Use provided userAgent or random if not specified
const ua = getUserAgent(userAgent);
await page.setUserAgent(ua);
// Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites)
const state = (0, age_gate_1.detectStateFromUrl)(category.dutchie_url);
await (0, age_gate_1.setAgeGateCookies)(page, category.dutchie_url, state);
logger_1.logger.info('scraper', `Loading page: ${category.dutchie_url}`);
try {
await page.goto(category.dutchie_url, {
waitUntil: 'domcontentloaded',
waitUntil: 'networkidle2',
timeout: 60000
});
await page.waitForTimeout(5000);
// If age gate still appears, try to bypass it
await (0, age_gate_1.bypassAgeGate)(page, state);
// Wait for products to load
await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
timeout: 30000,
}).catch(() => {
logger_1.logger.warn('scraper', 'No product selectors found, trying anyway...');
});
logger_1.logger.info('scraper', 'Scrolling to load all products...');
await autoScroll(page);
await page.waitForTimeout(3000);
}
catch (navError) {
logger_1.logger.error('scraper', `Navigation error: ${navError}`);
// Check if this is bot detection - put proxy in timeout instead of hard failure
if (proxyId) {
const errorMsg = String(navError);
if ((0, proxy_1.isBotDetectionError)(errorMsg)) {
// Bot detection! Put this proxy in timeout and get a new one
logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
(0, proxy_1.putProxyInTimeout)(proxyId, errorMsg);
throw new Error(`Bot detection: ${errorMsg}`);
}
else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
errorMsg.includes('ERR_') || errorMsg.includes('Navigation')) {
// Regular proxy failure - increment failure count
logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg);
}
}
throw navError;
}
logger_1.logger.info('scraper', 'Extracting product list from page...');
@@ -336,6 +386,21 @@ async function scrapeCategory(storeId, categoryId) {
originalPrice = parseFloat(priceMatches[1].replace('$', ''));
}
}
// Extract variant (weight/size) - look for common patterns
let variant = null;
const variantPatterns = [
/(\d+\.?\d*\s*(?:g|oz|mg|ml|gram|ounce))/i, // Weight units
/(\d+\s*pack)/i, // Pack sizes
/(\d+\s*ct)/i, // Count
/(\d+\s*x\s*\d+\.?\d*\s*(?:g|mg|ml))/i // Multi-pack (e.g., 5x0.5g)
];
for (const pattern of variantPatterns) {
const match = allText.match(pattern);
if (match) {
variant = match[1].trim();
break;
}
}
const linkEl = card.querySelector('a[href*="/product/"]');
let href = linkEl?.href || linkEl?.getAttribute('href') || '';
if (href && href.startsWith('/')) {
@@ -343,6 +408,7 @@ async function scrapeCategory(storeId, categoryId) {
}
items.push({
name,
variant,
price,
originalPrice,
href: href || window.location.href
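The variant patterns above take the first match in the card text, so a multi-pack string resolves to its unit weight because the plain-weight pattern is listed first. A self-contained illustration with a made-up card text:
// Standalone check of the variant patterns against a sample card text.
const variantPatterns = [
    /(\d+\.?\d*\s*(?:g|oz|mg|ml|gram|ounce))/i,
    /(\d+\s*pack)/i,
    /(\d+\s*ct)/i,
    /(\d+\s*x\s*\d+\.?\d*\s*(?:g|mg|ml))/i
];
const sampleText = 'Blue Dream Pre-Rolls 5 x 0.5g - $25.00'; // illustrative product card text
let sampleVariant = null;
for (const pattern of variantPatterns) {
    const match = sampleText.match(pattern);
    if (match) {
        sampleVariant = match[1].trim();
        break;
    }
}
// sampleVariant === '0.5g': the unit-weight pattern matches before the multi-pack pattern gets a chance.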
@@ -358,10 +424,19 @@ async function scrapeCategory(storeId, categoryId) {
logger_1.logger.info('scraper', `Now visiting each product page for complete details...`);
let successCount = 0;
let failCount = 0;
// Update initial stats
(0, scraper_monitor_1.updateScraperStats)(scraperId, {
productsProcessed: 0,
productsTotal: products.length
});
for (let i = 0; i < products.length; i++) {
const product = products[i];
try {
logger_1.logger.info('scraper', ` [${i + 1}/${products.length}] ${product.name}`);
(0, scraper_monitor_1.updateScraperStats)(scraperId, {
productsProcessed: i + 1,
productsTotal: products.length
}, `Processing: ${product.name}`);
if (!product.href) {
logger_1.logger.warn('scraper', ` ⚠ No product URL, skipping details`);
product.metadata = {};
@@ -391,7 +466,7 @@ async function scrapeCategory(storeId, categoryId) {
logger_1.logger.warn('scraper', ` ⚠ Limited data extracted`);
failCount++;
}
await page.waitForTimeout(1500);
// No delays - scrape fast!
}
catch (error) {
logger_1.logger.error('scraper', ` ✗ Unexpected error: ${error}`);
@@ -411,11 +486,16 @@ async function scrapeCategory(storeId, categoryId) {
SET last_scraped_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [categoryId]);
// Mark scraper as complete
(0, scraper_monitor_1.completeScraper)(scraperId);
const formattedProducts = products.map((p, index) => {
const sanitized = sanitizeProductData(p);
// Normalize availability from Dutchie product data
const availability = (0, availability_1.normalizeAvailability)(p);
return {
dutchieProductId: `${category.store_slug}-${category.slug}-${Date.now()}-${index}`,
name: sanitized.name,
variant: p.variant || null,
description: sanitized.description,
price: p.price,
originalPrice: p.originalPrice,
@@ -426,13 +506,34 @@ async function scrapeCategory(storeId, categoryId) {
weight: sanitized.weight,
imageUrl: p.imageUrl,
dutchieUrl: p.href,
metadata: p.metadata || {}
metadata: p.metadata || {},
availabilityStatus: availability.status,
availabilityRaw: availability.raw,
stockQuantity: availability.quantity
};
});
return formattedProducts;
}
catch (error) {
logger_1.logger.error('scraper', `❌ Category scraping error: ${error}`);
// Smart proxy error handling
if (proxyId) {
const errorMsg = String(error);
if ((0, proxy_1.isBotDetectionError)(errorMsg)) {
// Bot detection! Put this proxy in timeout
logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
(0, proxy_1.putProxyInTimeout)(proxyId, errorMsg);
}
else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
errorMsg.includes('ERR_') || errorMsg.includes('Navigation') ||
errorMsg.includes('Protocol error') || errorMsg.includes('Target closed')) {
// Regular proxy failure - increment failure count
logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg);
}
}
// Mark scraper as failed
(0, scraper_monitor_1.completeScraper)(scraperId, String(error));
if (browser) {
try {
await browser.close();
@@ -466,51 +567,84 @@ async function saveProducts(storeId, categoryId, products) {
try {
await client.query('BEGIN');
logger_1.logger.info('scraper', `Saving ${products.length} products to database...`);
// Mark all products as out-of-stock before processing (they'll be re-marked if found)
// Also update availability_status and last_seen_out_of_stock_at for state transition tracking
await client.query(`
UPDATE products
SET in_stock = false
WHERE store_id = $1 AND category_id = $2
SET in_stock = false,
availability_status = 'out_of_stock',
last_seen_out_of_stock_at = CASE
WHEN availability_status != 'out_of_stock' THEN CURRENT_TIMESTAMP
ELSE last_seen_out_of_stock_at
END
WHERE store_id = $1 AND category_id = $2 AND in_stock = true
`, [storeId, categoryId]);
for (const product of products) {
try {
// Get availability from product (defaults to in_stock if product exists in scraped data)
const availStatus = product.availabilityStatus || 'in_stock';
const availRaw = product.availabilityRaw ? JSON.stringify(product.availabilityRaw) : null;
const stockQty = product.stockQuantity ?? null;
const existingResult = await client.query(`
SELECT id, image_url, local_image_path
SELECT id, image_url, local_image_path, availability_status
FROM products
WHERE store_id = $1 AND name = $2 AND category_id = $3
`, [storeId, product.name, categoryId]);
AND (variant = $4 OR (variant IS NULL AND $4 IS NULL))
`, [storeId, product.name, categoryId, product.variant || null]);
let localImagePath = null;
let productId;
if (existingResult.rows.length > 0) {
productId = existingResult.rows[0].id;
localImagePath = existingResult.rows[0].local_image_path;
const prevStatus = existingResult.rows[0].availability_status;
// Determine if we need to update last_seen_in_stock_at
const isNowInStock = availStatus === 'in_stock' || availStatus === 'limited';
const wasOutOfStock = prevStatus === 'out_of_stock' || prevStatus === 'unknown';
await client.query(`
UPDATE products
SET name = $1, description = $2, price = $3,
strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
brand = $7, weight = $8, image_url = $9, dutchie_url = $10,
in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
WHERE id = $12
SET name = $1, variant = $2, description = $3, price = $4,
strain_type = $5, thc_percentage = $6, cbd_percentage = $7,
brand = $8, weight = $9, image_url = $10, dutchie_url = $11,
in_stock = true, metadata = $12, last_seen_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP,
availability_status = $14,
availability_raw = $15,
stock_quantity = $16,
last_seen_in_stock_at = CASE
WHEN $17 THEN CURRENT_TIMESTAMP
ELSE last_seen_in_stock_at
END
WHERE id = $13
`, [
product.name, product.description, product.price,
product.name, product.variant, product.description, product.price,
product.strainType, product.thcPercentage, product.cbdPercentage,
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
JSON.stringify(product.metadata), productId
JSON.stringify(product.metadata), productId, availStatus, availRaw, stockQty,
isNowInStock && wasOutOfStock
]);
}
else {
// Generate unique slug from product name + timestamp + random suffix
const baseSlug = product.name
.toLowerCase()
.replace(/[^a-z0-9]+/g, '-')
.replace(/^-|-$/g, '')
.substring(0, 150);
const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).substr(2, 6)}`;
const slug = `${baseSlug}-${uniqueSuffix}`;
const insertResult = await client.query(`
INSERT INTO products (
store_id, category_id, dutchie_product_id, name, description,
store_id, category_id, dutchie_product_id, name, slug, variant, description,
price, strain_type, thc_percentage, cbd_percentage,
brand, weight, image_url, dutchie_url, in_stock, metadata
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, true, $14)
brand, weight, image_url, dutchie_url, in_stock, metadata,
availability_status, availability_raw, stock_quantity, last_seen_in_stock_at
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16, $17, $18, $19, CURRENT_TIMESTAMP)
RETURNING id
`, [
storeId, categoryId, product.dutchieProductId, product.name, product.description,
storeId, categoryId, product.dutchieProductId, product.name, slug, product.variant, product.description,
product.price, product.strainType, product.thcPercentage, product.cbdPercentage,
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
JSON.stringify(product.metadata)
JSON.stringify(product.metadata), availStatus, availRaw, stockQty
]);
productId = insertResult.rows[0].id;
}
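New rows get a slug built from the product name plus a timestamp/random suffix, so two products with the same name still receive distinct slugs. The same transformation applied to a sample name (the suffix shown is illustrative):
// Slug construction from the INSERT branch above, applied to a sample product name.
const sampleName = 'Wedding Cake | 3.5g (Indica)';
const sampleBaseSlug = sampleName
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '')
    .substring(0, 150); // 'wedding-cake-3-5g-indica'
const sampleSuffix = `${Date.now()}-${Math.random().toString(36).substr(2, 6)}`;
const sampleSlug = `${sampleBaseSlug}-${sampleSuffix}`; // e.g. 'wedding-cake-3-5g-indica-1733280000000-k3f9qa'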
@@ -544,19 +678,15 @@ async function saveProducts(storeId, categoryId, products) {
client.release();
}
}
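The UPDATE above only stamps last_seen_in_stock_at when a product that was previously out_of_stock or unknown shows up in stock again, mirroring the out-of-stock stamp applied by the bulk UPDATE at the start of the transaction. The transition test passed as the boolean parameter, written out for reference:
// The same back-in-stock transition test the UPDATE receives as its boolean parameter.
function shouldStampBackInStock(prevStatus, availStatus) {
    const isNowInStock = availStatus === 'in_stock' || availStatus === 'limited';
    const wasOutOfStock = prevStatus === 'out_of_stock' || prevStatus === 'unknown';
    return isNowInStock && wasOutOfStock;
}
// shouldStampBackInStock('out_of_stock', 'in_stock') -> true  (timestamp is refreshed)
// shouldStampBackInStock('in_stock', 'in_stock')     -> false (existing timestamp is kept)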
async function scrapeStore(storeId) {
async function scrapeStore(storeId, parallel = 3, userAgent) {
try {
logger_1.logger.info('scraper', `🏪 Starting scrape for store ID: ${storeId}`);
logger_1.logger.info('scraper', `🏪 Starting scrape for store ID: ${storeId} (${parallel} parallel, UA: ${userAgent || 'random'})`);
const categoriesResult = await migrate_1.pool.query(`
SELECT c.id, c.name, c.slug, c.dutchie_url
FROM categories c
WHERE c.store_id = $1
AND c.scrape_enabled = true
AND NOT EXISTS (
SELECT 1 FROM categories child
WHERE child.parent_id = c.id
)
ORDER BY c.display_order, c.name
WHERE c.store_id = $1
AND c.scrape_enabled = true
ORDER BY c.name
`, [storeId]);
logger_1.logger.info('scraper', `Found ${categoriesResult.rows.length} categories to scrape`);
for (const category of categoriesResult.rows) {
@@ -564,14 +694,14 @@ async function scrapeStore(storeId) {
logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
logger_1.logger.info('scraper', `📂 Scraping: ${category.name}`);
logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
const products = await scrapeCategory(storeId, category.id);
const products = await scrapeCategory(storeId, category.id, userAgent);
await saveProducts(storeId, category.id, products);
logger_1.logger.info('scraper', `✅ Completed ${category.name} - ${products.length} products saved`);
}
catch (error) {
logger_1.logger.error('scraper', `❌ Failed to scrape ${category.name}: ${error}`);
}
await new Promise(resolve => setTimeout(resolve, 5000));
// No delays - scrape fast!
}
await migrate_1.pool.query(`
UPDATE stores

View File

@@ -0,0 +1,351 @@
"use strict";
/**
* Store Crawl Orchestrator
*
* Orchestrates the complete crawl workflow for a store:
* 1. Load store and its linked dispensary
* 2. Check if provider detection is needed
* 3. Run provider detection if needed
* 4. Queue appropriate crawl jobs based on provider/mode
* 5. Update store_crawl_schedule with meaningful status
*
* This replaces the simple "triggerManualCrawl" with intelligent orchestration.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.runStoreCrawlOrchestrator = runStoreCrawlOrchestrator;
exports.runBatchOrchestrator = runBatchOrchestrator;
exports.getStoresDueForOrchestration = getStoresDueForOrchestration;
const uuid_1 = require("uuid");
const migrate_1 = require("../db/migrate");
const crawler_logger_1 = require("./crawler-logger");
const intelligence_detector_1 = require("./intelligence-detector");
const category_crawler_jobs_1 = require("./category-crawler-jobs");
// DEPRECATED: scrapeStore writes to legacy products table
// import { scrapeStore } from '../scraper-v2';
// Import the new dutchie-az pipeline for Dutchie crawling
const product_crawler_1 = require("../dutchie-az/services/product-crawler");
const connection_1 = require("../dutchie-az/db/connection");
// ========================================
// Main Orchestrator Function
// ========================================
/**
* Run the complete crawl orchestration for a store
*
* Behavior:
* 1. Load the store and its linked dispensary
* 2. If no dispensary is linked, report error
* 3. If product_provider is missing or stale (>7 days), run detection
* 4. After detection:
* - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
* - Otherwise: Run sandbox crawl
* 5. Update store_crawl_schedule with status/summary
*/
async function runStoreCrawlOrchestrator(storeId) {
const startTime = Date.now();
const runId = (0, uuid_1.v4)();
let result = {
status: 'pending',
summary: '',
runId,
storeId,
dispensaryId: null,
detectionRan: false,
crawlRan: false,
durationMs: 0,
};
try {
// Mark schedule as running
await updateScheduleStatus(storeId, 'running', 'Starting orchestrator...', runId);
// 1. Load store with dispensary info
const store = await getStoreWithDispensary(storeId);
if (!store) {
throw new Error(`Store ${storeId} not found`);
}
result.dispensaryId = store.dispensary_id;
// 2. Check if dispensary is linked
if (!store.dispensary_id) {
result.status = 'error';
result.summary = 'No dispensary linked - cannot determine provider';
result.error = 'Store is not linked to a dispensary. Link it in the Dispensaries page.';
await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
result.durationMs = Date.now() - startTime;
return result;
}
// 3. Check if provider detection is needed
const needsDetection = await checkNeedsDetection(store);
if (needsDetection) {
// Run provider detection
const websiteUrl = store.dispensary_menu_url || store.dispensary_website;
if (!websiteUrl) {
result.status = 'error';
result.summary = 'No website URL available for detection';
result.error = 'Dispensary has no menu_url or website configured';
await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
result.durationMs = Date.now() - startTime;
return result;
}
await updateScheduleStatus(storeId, 'running', 'Running provider detection...', runId);
const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
result.detectionRan = true;
result.detectionResult = detectionResult;
// Save detection results to dispensary
await (0, intelligence_detector_1.updateAllCategoryProviders)(store.dispensary_id, detectionResult);
crawler_logger_1.crawlerLogger.providerDetected({
dispensary_id: store.dispensary_id,
dispensary_name: store.dispensary_name || store.name,
detected_provider: detectionResult.product.provider,
confidence: detectionResult.product.confidence,
detection_method: 'orchestrator_run',
menu_url: websiteUrl,
category: 'product',
});
// Refresh store info after detection
const updatedStore = await getStoreWithDispensary(storeId);
if (updatedStore) {
Object.assign(store, updatedStore);
}
}
// 4. Determine crawl type and run
const provider = store.product_provider;
const mode = store.product_crawler_mode;
if (provider === 'dutchie' && mode === 'production') {
// Production Dutchie crawl - now uses the new dutchie-az GraphQL pipeline
await updateScheduleStatus(storeId, 'running', 'Running Dutchie GraphQL crawl (dutchie-az)...', runId);
try {
// Look up the dispensary in the dutchie-az database
// The dutchie-az pipeline has its own dispensaries table
// We match by dispensary name or slug (case-insensitive ILIKE)
const dispensaryResult = await (0, connection_1.query)(`SELECT * FROM dispensaries
WHERE name ILIKE $1
OR slug ILIKE $2
LIMIT 1`, [store.dispensary_name, store.slug]);
if (dispensaryResult.rows.length === 0) {
throw new Error(`Dispensary not found in dutchie-az database. ` +
`You must add this dispensary to the dutchie-az pipeline first. ` +
`Store: ${store.name} (${store.dispensary_name})`);
}
const dutchieDispensary = dispensaryResult.rows[0];
// Run the new dutchie-az GraphQL crawler
const crawlResult = await (0, product_crawler_1.crawlDispensaryProducts)(dutchieDispensary, 'rec', { useBothModes: true });
result.crawlRan = true;
result.crawlType = 'production';
result.productsFound = crawlResult.productsFound ?? undefined;
result.productsNew = crawlResult.productsUpserted ?? undefined;
result.productsUpdated = crawlResult.snapshotsCreated ?? undefined;
if (crawlResult.success) {
const detectionPart = result.detectionRan ? 'Detection + ' : '';
result.summary = `${detectionPart}Dutchie GraphQL crawl (${crawlResult.productsFound || 0} items, ${crawlResult.productsUpserted || 0} upserted, ${crawlResult.snapshotsCreated || 0} snapshots)`;
result.status = 'success';
// Update store's last_scraped_at
await migrate_1.pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]);
crawler_logger_1.crawlerLogger.jobCompleted({
job_id: 0, // Orchestrator doesn't create traditional jobs
store_id: storeId,
store_name: store.name,
duration_ms: crawlResult.durationMs,
products_found: crawlResult.productsFound || 0,
products_new: crawlResult.productsUpserted || 0,
products_updated: crawlResult.snapshotsCreated || 0,
provider: 'dutchie',
});
}
else {
throw new Error(crawlResult.errorMessage || 'Crawl failed');
}
}
catch (crawlError) {
result.status = 'error';
result.error = crawlError.message;
result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
result.crawlRan = true;
result.crawlType = 'production';
crawler_logger_1.crawlerLogger.jobFailed({
job_id: 0,
store_id: storeId,
store_name: store.name,
duration_ms: Date.now() - startTime,
error_message: crawlError.message,
provider: 'dutchie',
});
}
}
else if (provider && provider !== 'unknown') {
// Sandbox crawl for non-Dutchie or sandbox mode
await updateScheduleStatus(storeId, 'running', `Running ${provider} sandbox crawl...`, runId);
try {
const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(store.dispensary_id);
result.crawlRan = true;
result.crawlType = 'sandbox';
result.productsFound = sandboxResult.data?.productsExtracted || 0;
const detectionPart = result.detectionRan ? 'Detection + ' : '';
if (sandboxResult.success) {
result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
result.status = 'sandbox_only';
}
else {
result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
result.status = 'error';
result.error = sandboxResult.message;
}
}
catch (sandboxError) {
result.status = 'error';
result.error = sandboxError.message;
result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
result.crawlRan = true;
result.crawlType = 'sandbox';
}
}
else {
// No provider detected - detection only
if (result.detectionRan) {
result.summary = `Detection complete: provider=${store.product_provider || 'unknown'}, confidence=${store.product_confidence || 0}%`;
result.status = 'detection_only';
}
else {
result.summary = 'No provider detected and no crawl possible';
result.status = 'error';
result.error = 'Could not determine menu provider';
}
}
}
catch (error) {
result.status = 'error';
result.error = error.message;
result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
crawler_logger_1.crawlerLogger.queueFailure({
queue_type: 'orchestrator',
error_message: error.message,
});
}
result.durationMs = Date.now() - startTime;
// Update final schedule status
await updateScheduleStatus(storeId, result.status, result.summary, runId, result.error);
// Create a crawl_job record for tracking
await createOrchestratorJobRecord(storeId, result);
return result;
}
// ========================================
// Helper Functions
// ========================================
async function getStoreWithDispensary(storeId) {
const result = await migrate_1.pool.query(`SELECT
s.id, s.name, s.slug, s.timezone, s.dispensary_id,
d.name as dispensary_name,
d.menu_url as dispensary_menu_url,
d.website as dispensary_website,
d.product_provider,
d.product_confidence,
d.product_crawler_mode,
d.last_product_scan_at
FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
WHERE s.id = $1`, [storeId]);
return result.rows[0] || null;
}
async function checkNeedsDetection(store) {
// No dispensary = can't detect
if (!store.dispensary_id)
return false;
// No provider = definitely needs detection
if (!store.product_provider)
return true;
// Unknown provider = needs detection
if (store.product_provider === 'unknown')
return true;
// Low confidence = needs re-detection
if (store.product_confidence !== null && store.product_confidence < 50)
return true;
// Stale detection (> 7 days) = needs refresh
if (store.last_product_scan_at) {
const daysSince = (Date.now() - new Date(store.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24);
if (daysSince > 7)
return true;
}
return false;
}
async function updateScheduleStatus(storeId, status, summary, runId, error) {
await migrate_1.pool.query(`INSERT INTO store_crawl_schedule (store_id, last_status, last_summary, last_run_at, last_error)
VALUES ($1, $2, $3, NOW(), $4)
ON CONFLICT (store_id) DO UPDATE SET
last_status = $2,
last_summary = $3,
last_run_at = NOW(),
last_error = $4,
updated_at = NOW()`, [storeId, status, summary, error || null]);
}
async function getLatestCrawlStats(storeId) {
// Get count of products for this store
const result = await migrate_1.pool.query(`SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '1 hour') as recent_new,
COUNT(*) FILTER (WHERE updated_at > NOW() - INTERVAL '1 hour' AND created_at < NOW() - INTERVAL '1 hour') as recent_updated
FROM products
WHERE store_id = $1`, [storeId]);
return {
products_found: parseInt(result.rows[0]?.total || '0'),
products_new: parseInt(result.rows[0]?.recent_new || '0'),
products_updated: parseInt(result.rows[0]?.recent_updated || '0'),
};
}
async function createOrchestratorJobRecord(storeId, result) {
await migrate_1.pool.query(`INSERT INTO crawl_jobs (
store_id, job_type, trigger_type, status, priority,
scheduled_at, started_at, completed_at,
products_found, products_new, products_updated,
error_message, orchestrator_run_id, detection_result
) VALUES (
$1, 'orchestrator', 'manual', $2, 100,
NOW(), NOW(), NOW(),
$3, $4, $5,
$6, $7, $8
)`, [
storeId,
result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed',
result.productsFound || null,
result.productsNew || null,
result.productsUpdated || null,
result.error || null,
result.runId,
result.detectionResult ? JSON.stringify({
product_provider: result.detectionResult.product.provider,
product_confidence: result.detectionResult.product.confidence,
product_mode: result.detectionResult.product.mode,
}) : null,
]);
}
// ========================================
// Batch Orchestration
// ========================================
/**
* Run orchestrator for multiple stores
*/
async function runBatchOrchestrator(storeIds, concurrency = 3) {
const results = [];
// Process in batches
for (let i = 0; i < storeIds.length; i += concurrency) {
const batch = storeIds.slice(i, i + concurrency);
const batchResults = await Promise.all(batch.map(storeId => runStoreCrawlOrchestrator(storeId)));
results.push(...batchResults);
}
return results;
}
/**
* Get stores that are due for orchestration
*/
async function getStoresDueForOrchestration(limit = 10) {
const result = await migrate_1.pool.query(`SELECT s.id
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.enabled, TRUE) = TRUE
AND (
scs.last_run_at IS NULL
OR scs.last_run_at < NOW() - (COALESCE(scs.interval_hours, 4) || ' hours')::INTERVAL
)
AND (scs.last_status IS NULL OR scs.last_status NOT IN ('running', 'pending'))
ORDER BY COALESCE(scs.priority, 0) DESC, scs.last_run_at ASC NULLS FIRST
LIMIT $1`, [limit]);
return result.rows.map(row => row.id);
}
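A hypothetical polling loop tying the two batch helpers together; the 15-minute interval and the batch size of 10 are illustrative values, not taken from the codebase:
// Hypothetical scheduler built on getStoresDueForOrchestration + runBatchOrchestrator.
async function orchestratorTick() {
    const storeIds = await getStoresDueForOrchestration(10); // stores past their interval
    if (storeIds.length === 0) {
        return;
    }
    const results = await runBatchOrchestrator(storeIds, 3); // 3 stores at a time
    for (const r of results) {
        console.log(`[orchestrator] store ${r.storeId}: ${r.status} - ${r.summary}`);
    }
}
setInterval(() => {
    orchestratorTick().catch((err) => console.error('[orchestrator] tick failed:', err));
}, 15 * 60 * 1000);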

View File

@@ -0,0 +1,175 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.hasAgeGatePlaywright = hasAgeGatePlaywright;
exports.bypassAgeGatePlaywright = bypassAgeGatePlaywright;
exports.detectStateFromUrlPlaywright = detectStateFromUrlPlaywright;
const logger_1 = require("../services/logger");
/**
* Detects if a Playwright page has an age verification gate
*/
async function hasAgeGatePlaywright(page) {
try {
const url = page.url();
const bodyText = await page.textContent('body') || '';
const hasAgeVerification = url.includes('/age-gate') ||
bodyText.includes('age verification') ||
bodyText.includes('Please select your state') ||
bodyText.includes('are you 21') ||
bodyText.includes('are you 18') ||
bodyText.includes('Enter your date of birth') ||
bodyText.toLowerCase().includes('verify your age');
return hasAgeVerification;
}
catch (err) {
logger_1.logger.warn('age-gate', `Error detecting age gate: ${err}`);
return false;
}
}
/**
* Attempts to bypass an age gate using Playwright
* Handles multiple age gate patterns including Curaleaf's complex React-based gate
*
* @param page - Playwright page object
* @param state - State to select (e.g., 'Arizona', 'California')
* @returns Promise<boolean> - true if bypass succeeded, false otherwise
*/
async function bypassAgeGatePlaywright(page, state = 'Arizona') {
try {
const hasGate = await hasAgeGatePlaywright(page);
if (!hasGate) {
logger_1.logger.info('age-gate', 'No age gate detected');
return true;
}
logger_1.logger.info('age-gate', `Age gate detected - attempting to bypass with state: ${state}...`);
// Wait for age gate to fully render
await page.waitForTimeout(2000);
// Method 1: Curaleaf-style (state dropdown + "I'm over 21" button)
try {
const stateButton = page.locator('button#state, button[id="state"]').first();
const stateButtonExists = await stateButton.count() > 0;
if (stateButtonExists) {
logger_1.logger.info('age-gate', 'Found Curaleaf-style state dropdown...');
await stateButton.click();
await page.waitForTimeout(1000);
// Select state
const stateOption = page.locator('[role="option"]').filter({ hasText: new RegExp(`^${state}$`, 'i') });
const stateExists = await stateOption.count() > 0;
if (stateExists) {
logger_1.logger.info('age-gate', `Clicking ${state} option...`);
await stateOption.first().click();
await page.waitForTimeout(2000);
// Look for "I'm over 21" button
const ageButton = page.locator('button').filter({ hasText: /I'm over 21|I am 21|I'm 21|over 21/i });
const ageButtonExists = await ageButton.count() > 0;
if (ageButtonExists) {
logger_1.logger.info('age-gate', 'Clicking age verification button...');
await ageButton.first().click();
await page.waitForLoadState('domcontentloaded', { timeout: 15000 });
await page.waitForTimeout(3000);
// Check if we successfully bypassed
const finalUrl = page.url();
if (!finalUrl.includes('/age-gate')) {
logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
return true;
}
}
}
}
}
catch (e) {
logger_1.logger.warn('age-gate', `Curaleaf method failed: ${e}`);
}
// Method 2: Simple "Yes" or "I'm 21" button (for simpler age gates)
try {
const simpleButton = page.locator('button, a, [role="button"]').filter({
hasText: /yes|i am 21|i'm 21|enter the site|continue|confirm/i
});
const simpleExists = await simpleButton.count() > 0;
if (simpleExists) {
logger_1.logger.info('age-gate', 'Found simple age gate button...');
await simpleButton.first().click();
await page.waitForLoadState('domcontentloaded', { timeout: 10000 });
await page.waitForTimeout(2000);
const finalUrl = page.url();
if (!finalUrl.includes('/age-gate')) {
logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
return true;
}
}
}
catch (e) {
logger_1.logger.warn('age-gate', `Simple button method failed: ${e}`);
}
// Method 3: Standard select dropdown
try {
const selectExists = await page.locator('select').count() > 0;
if (selectExists) {
logger_1.logger.info('age-gate', 'Found select dropdown...');
const select = page.locator('select').first();
await select.selectOption({ label: state });
await page.waitForTimeout(1000);
// Look for submit button
const submitButton = page.locator('button[type="submit"], input[type="submit"]');
const submitExists = await submitButton.count() > 0;
if (submitExists) {
await submitButton.first().click();
await page.waitForLoadState('domcontentloaded', { timeout: 10000 });
await page.waitForTimeout(2000);
const finalUrl = page.url();
if (!finalUrl.includes('/age-gate')) {
logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
return true;
}
}
}
}
catch (e) {
logger_1.logger.warn('age-gate', `Select dropdown method failed: ${e}`);
}
// Verify final state
const finalUrl = page.url();
if (finalUrl.includes('/age-gate')) {
logger_1.logger.error('age-gate', `❌ Age gate bypass failed - still at: ${finalUrl}`);
return false;
}
logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
return true;
}
catch (err) {
logger_1.logger.error('age-gate', `Error bypassing age gate: ${err}`);
return false;
}
}
/**
* Helper to detect the state from a store URL
*/
function detectStateFromUrlPlaywright(url) {
const stateMap = {
'-az-': 'Arizona',
'arizona': 'Arizona',
'-ca-': 'California',
'california': 'California',
'-co-': 'Colorado',
'colorado': 'Colorado',
'-fl-': 'Florida',
'florida': 'Florida',
'-il-': 'Illinois',
'illinois': 'Illinois',
'-ma-': 'Massachusetts',
'-mi-': 'Michigan',
'-nv-': 'Nevada',
'-nj-': 'New Jersey',
'-ny-': 'New York',
'-or-': 'Oregon',
'-pa-': 'Pennsylvania',
'-wa-': 'Washington',
};
const lowerUrl = url.toLowerCase();
for (const [pattern, stateName] of Object.entries(stateMap)) {
if (lowerUrl.includes(pattern)) {
return stateName;
}
}
// Default to Arizona
return 'Arizona';
}
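A minimal sketch of the intended call order for these Playwright helpers, assuming the module is required the same way the category scraper does (../utils/age-gate-playwright); the URL handling is illustrative:
// Hypothetical usage of the Playwright age-gate helpers with a freshly launched browser.
const { chromium } = require('playwright');
const { bypassAgeGatePlaywright, detectStateFromUrlPlaywright } = require('../utils/age-gate-playwright');
async function openMenuPage(url) {
    const browser = await chromium.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
    const state = detectStateFromUrlPlaywright(url); // e.g. 'Arizona' for '-az-' URLs
    const ok = await bypassAgeGatePlaywright(page, state);
    if (!ok) {
        await browser.close();
        throw new Error('Age gate could not be bypassed');
    }
    return { browser, page };
}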

backend/dist/utils/age-gate.js vendored Normal file
View File

@@ -0,0 +1,263 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.setAgeGateCookies = setAgeGateCookies;
exports.hasAgeGate = hasAgeGate;
exports.bypassAgeGate = bypassAgeGate;
exports.detectStateFromUrl = detectStateFromUrl;
const logger_1 = require("../services/logger");
/**
* Sets age gate bypass cookies before navigating to a page
* This should be called BEFORE page.goto() to prevent the age gate from showing
*
* @param page - Puppeteer page object
* @param url - URL to extract domain from
* @param state - State to set in cookie
*/
async function setAgeGateCookies(page, url, state = 'Arizona') {
try {
const urlObj = new URL(url);
const domain = urlObj.hostname.replace('www.', '');
// Set cookies that bypass age gates
await page.setCookie({
name: 'age_gate_passed',
value: 'true',
domain: `.${domain}`,
path: '/',
expires: Date.now() / 1000 + 365 * 24 * 60 * 60, // 1 year
httpOnly: false,
secure: false,
sameSite: 'Lax'
}, {
name: 'selected_state',
value: state,
domain: `.${domain}`,
path: '/',
expires: Date.now() / 1000 + 365 * 24 * 60 * 60, // 1 year
httpOnly: false,
secure: false,
sameSite: 'Lax'
}, {
name: 'age_verified',
value: 'true',
domain: `.${domain}`,
path: '/',
expires: Date.now() / 1000 + 365 * 24 * 60 * 60,
httpOnly: false,
secure: false,
sameSite: 'Lax'
});
logger_1.logger.info('age-gate', `Set age gate bypass cookies for ${domain} (state: ${state})`);
}
catch (err) {
logger_1.logger.warn('age-gate', `Failed to set age gate cookies: ${err}`);
}
}
/**
* Detects if a page has an age verification gate
*/
async function hasAgeGate(page) {
return await page.evaluate(() => {
const bodyText = document.body.textContent || '';
const hasAgeVerification = bodyText.includes('age verification') ||
bodyText.includes('Please select your state') ||
bodyText.includes('are you 21') ||
bodyText.includes('are you 18') ||
bodyText.includes('Enter your date of birth') ||
bodyText.toLowerCase().includes('verify');
return hasAgeVerification;
});
}
/**
* Attempts to bypass an age gate by selecting the appropriate state
* Works with multiple age gate patterns used by cannabis dispensaries
*
* @param page - Puppeteer page object
* @param state - State to select (e.g., 'Arizona', 'California'). Defaults to 'Arizona'
 * @returns Promise<boolean> - true if the gate was bypassed, false if no gate was found or the bypass failed
*/
async function bypassAgeGate(page, state = 'Arizona', useSavedCookies = true) {
try {
const hasGate = await hasAgeGate(page);
if (!hasGate) {
logger_1.logger.info('age-gate', 'No age gate detected');
return false;
}
logger_1.logger.info('age-gate', `Age gate detected - attempting to bypass with state: ${state}...`);
// Wait a bit for React components to fully render
await page.waitForTimeout(2000);
// Try Method 0: Custom dropdown button (shadcn/radix style - Curaleaf)
let customDropdownWorked = false;
try {
// Click button to open dropdown
const dropdownButton = await page.$('button#state, button[id="state"]');
if (dropdownButton) {
logger_1.logger.info('age-gate', 'Found state dropdown button, clicking...');
await dropdownButton.click();
await page.waitForTimeout(800);
// Click the state option and trigger React events
const stateClicked = await page.evaluate((selectedState) => {
const options = Array.from(document.querySelectorAll('[role="option"]'));
const stateOption = options.find(el => el.textContent?.toLowerCase() === selectedState.toLowerCase());
if (stateOption instanceof HTMLElement) {
// Trigger multiple events that React might be listening for
stateOption.dispatchEvent(new MouseEvent('mousedown', { bubbles: true }));
stateOption.dispatchEvent(new MouseEvent('mouseup', { bubbles: true }));
stateOption.click();
stateOption.dispatchEvent(new MouseEvent('click', { bubbles: true }));
stateOption.dispatchEvent(new Event('change', { bubbles: true }));
stateOption.dispatchEvent(new Event('input', { bubbles: true }));
return true;
}
return false;
}, state);
if (stateClicked) {
logger_1.logger.info('age-gate', `Clicked ${state} option with React events`);
await page.waitForTimeout(1000);
// Look for and click any submit/continue button that appeared
const submitClicked = await page.evaluate(() => {
const buttons = Array.from(document.querySelectorAll('button, [role="button"], a'));
const submitBtn = buttons.find(el => {
const text = el.textContent?.toLowerCase() || '';
const ariaLabel = el.getAttribute('aria-label')?.toLowerCase() || '';
return text.includes('continue') || text.includes('submit') ||
text.includes('enter') || text.includes('confirm') ||
ariaLabel.includes('continue') || ariaLabel.includes('submit');
});
if (submitBtn instanceof HTMLElement && submitBtn.offsetParent !== null) {
submitBtn.click();
return true;
}
return false;
});
if (submitClicked) {
logger_1.logger.info('age-gate', `Found and clicked submit button`);
}
customDropdownWorked = true;
}
}
}
catch (e) {
logger_1.logger.warn('age-gate', `Dropdown method failed: ${e}`);
}
// Try Method 1: Dropdown select
const selectFound = await page.evaluate((selectedState) => {
const selects = Array.from(document.querySelectorAll('select'));
for (const select of selects) {
const options = Array.from(select.options);
const stateOption = options.find(opt => opt.text.toLowerCase().includes(selectedState.toLowerCase()) ||
opt.value.toLowerCase().includes(selectedState.toLowerCase()));
if (stateOption) {
select.value = stateOption.value;
select.dispatchEvent(new Event('change', { bubbles: true }));
select.dispatchEvent(new Event('input', { bubbles: true }));
return true;
}
}
return false;
}, state);
// Try Method 2: State button/card (click state, then click confirm)
let stateClicked = false;
if (!selectFound) {
stateClicked = await page.evaluate((selectedState) => {
const allElements = Array.from(document.querySelectorAll('button, a, div, span, [role="button"], [class*="state"], [class*="State"], [class*="card"], [class*="option"]'));
const stateButton = allElements.find(el => el.textContent?.toLowerCase().includes(selectedState.toLowerCase()));
if (stateButton instanceof HTMLElement) {
stateButton.click();
return true;
}
return false;
}, state);
if (stateClicked) {
// Wait for confirm button to appear and click it
await page.waitForTimeout(1000);
await page.evaluate(() => {
const confirmBtns = Array.from(document.querySelectorAll('button, a, [role="button"]'));
const confirmBtn = confirmBtns.find(el => {
const text = el.textContent?.toLowerCase() || '';
return text.includes('enter') || text.includes('continue') || text.includes('yes') || text.includes('confirm');
});
if (confirmBtn instanceof HTMLElement) {
confirmBtn.click();
}
});
}
}
// Try Method 3: Direct "Yes" or age confirmation button
const yesClicked = await page.evaluate(() => {
const confirmButtons = Array.from(document.querySelectorAll('button, a, [role="button"]'));
const yesButton = confirmButtons.find(el => {
const text = el.textContent?.toLowerCase() || '';
return text.includes('yes') ||
text.includes('i am 21') ||
text.includes('i am 18') ||
text.includes('enter the site') ||
text.includes('enter') ||
text.includes('continue');
});
if (yesButton instanceof HTMLElement) {
yesButton.click();
return true;
}
return false;
});
const bypassed = customDropdownWorked || selectFound || stateClicked || yesClicked;
if (bypassed) {
// Wait for navigation to complete after clicking age gate button
logger_1.logger.info('age-gate', `Waiting for navigation after age gate bypass...`);
try {
await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 15000 });
}
catch (e) {
// Navigation might not trigger, that's ok - wait a bit anyway
await page.waitForTimeout(3000);
}
// Give the page extra time to load content
await page.waitForTimeout(3000);
// Verify we actually bypassed by checking the URL
const finalUrl = page.url();
if (finalUrl.includes('/age-gate')) {
logger_1.logger.error('age-gate', `❌ Age gate bypass failed - still at age gate URL: ${finalUrl}`);
return false;
}
logger_1.logger.info('age-gate', `✅ Age gate bypass completed - now at: ${finalUrl}`);
return true;
}
else {
logger_1.logger.warn('age-gate', `Could not find ${state} option or confirmation button in age gate`);
return false;
}
}
catch (err) {
logger_1.logger.error('age-gate', `Error bypassing age gate: ${err}`);
return false;
}
}
/**
* Helper to detect the state from a store URL
* @param url - Store URL
* @returns State name (e.g., 'Arizona', 'California')
*/
function detectStateFromUrl(url) {
const stateMap = {
'-az-': 'Arizona',
'-ca-': 'California',
'-co-': 'Colorado',
'-fl-': 'Florida',
'-il-': 'Illinois',
'-ma-': 'Massachusetts',
'-mi-': 'Michigan',
'-nv-': 'Nevada',
'-nj-': 'New Jersey',
'-ny-': 'New York',
'-or-': 'Oregon',
'-pa-': 'Pennsylvania',
'-wa-': 'Washington',
};
for (const [pattern, stateName] of Object.entries(stateMap)) {
if (url.toLowerCase().includes(pattern)) {
return stateName;
}
}
// Default to Arizona if state not detected
return 'Arizona';
}
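The helpers above are designed to be layered: cookies are set before navigation so the gate usually never renders, and the DOM-level bypass is the fallback when it still does. A sketch of that order with Puppeteer (the require path and URL handling are illustrative):
// Hypothetical Puppeteer flow: cookies first, then navigate, then DOM bypass as a fallback.
const puppeteer = require('puppeteer');
const { setAgeGateCookies, hasAgeGate, bypassAgeGate, detectStateFromUrl } = require('./dist/utils/age-gate');
async function openWithAgeGateHandling(url) {
    const browser = await puppeteer.launch({ headless: 'new' });
    const page = await browser.newPage();
    const state = detectStateFromUrl(url); // defaults to 'Arizona' when no state marker is found
    await setAgeGateCookies(page, url, state); // BEFORE page.goto(), per the helper's contract
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
    if (await hasAgeGate(page)) {
        await bypassAgeGate(page, state); // fallback if the cookies were ignored
    }
    return { browser, page };
}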

backend/dist/utils/image-storage.js vendored Normal file
View File

@@ -0,0 +1,296 @@
"use strict";
/**
* Local Image Storage Utility
*
* Downloads and stores product images to local filesystem.
* Replaces MinIO-based storage with simple local file storage.
*
* Directory structure:
* /images/products/<dispensary_id>/<product_id>.webp
* /images/products/<dispensary_id>/<product_id>-thumb.webp
* /images/products/<dispensary_id>/<product_id>-medium.webp
* /images/brands/<brand_slug>.webp
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.downloadProductImage = downloadProductImage;
exports.downloadBrandLogo = downloadBrandLogo;
exports.imageExists = imageExists;
exports.deleteProductImages = deleteProductImages;
exports.initializeImageStorage = initializeImageStorage;
exports.getStorageStats = getStorageStats;
const axios_1 = __importDefault(require("axios"));
const sharp_1 = __importDefault(require("sharp"));
const fs = __importStar(require("fs/promises"));
const path = __importStar(require("path"));
const crypto_1 = require("crypto");
// Base path for image storage - configurable via env
const IMAGES_BASE_PATH = process.env.IMAGES_PATH || '/app/public/images';
// Public URL base for serving images
const IMAGES_PUBLIC_URL = process.env.IMAGES_PUBLIC_URL || '/images';
/**
* Ensure a directory exists
*/
async function ensureDir(dirPath) {
try {
await fs.mkdir(dirPath, { recursive: true });
}
catch (error) {
if (error.code !== 'EEXIST')
throw error;
}
}
/**
* Generate a short hash from a URL for deduplication
*/
function hashUrl(url) {
return (0, crypto_1.createHash)('md5').update(url).digest('hex').substring(0, 8);
}
/**
* Download an image from a URL and return the buffer
*/
async function downloadImage(imageUrl) {
const response = await axios_1.default.get(imageUrl, {
responseType: 'arraybuffer',
timeout: 30000,
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
},
});
return Buffer.from(response.data);
}
/**
* Process and save image in multiple sizes
* Returns the file paths relative to IMAGES_BASE_PATH
*/
async function processAndSaveImage(buffer, outputDir, baseFilename) {
await ensureDir(outputDir);
const fullPath = path.join(outputDir, `${baseFilename}.webp`);
const mediumPath = path.join(outputDir, `${baseFilename}-medium.webp`);
const thumbPath = path.join(outputDir, `${baseFilename}-thumb.webp`);
// Process images in parallel
const [fullBuffer, mediumBuffer, thumbBuffer] = await Promise.all([
// Full: max 1200x1200, high quality
(0, sharp_1.default)(buffer)
.resize(1200, 1200, { fit: 'inside', withoutEnlargement: true })
.webp({ quality: 85 })
.toBuffer(),
// Medium: 600x600
(0, sharp_1.default)(buffer)
.resize(600, 600, { fit: 'inside', withoutEnlargement: true })
.webp({ quality: 80 })
.toBuffer(),
// Thumb: 200x200
(0, sharp_1.default)(buffer)
.resize(200, 200, { fit: 'inside', withoutEnlargement: true })
.webp({ quality: 75 })
.toBuffer(),
]);
// Save all sizes
await Promise.all([
fs.writeFile(fullPath, fullBuffer),
fs.writeFile(mediumPath, mediumBuffer),
fs.writeFile(thumbPath, thumbBuffer),
]);
const totalBytes = fullBuffer.length + mediumBuffer.length + thumbBuffer.length;
return {
full: fullPath,
medium: mediumPath,
thumb: thumbPath,
totalBytes,
};
}
/**
* Convert a file path to a public URL
*/
function pathToUrl(filePath) {
const relativePath = filePath.replace(IMAGES_BASE_PATH, '');
return `${IMAGES_PUBLIC_URL}${relativePath}`;
}
/**
* Download and store a product image locally
*
* @param imageUrl - The third-party image URL to download
* @param dispensaryId - The dispensary ID (for directory organization)
* @param productId - The product ID or external ID (for filename)
* @returns Download result with local URLs
*/
async function downloadProductImage(imageUrl, dispensaryId, productId) {
try {
if (!imageUrl) {
return { success: false, error: 'No image URL provided' };
}
// Download the image
const buffer = await downloadImage(imageUrl);
// Organize by dispensary ID
const outputDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId));
// Use product ID + URL hash for uniqueness
const urlHash = hashUrl(imageUrl);
const baseFilename = `${productId}-${urlHash}`;
// Process and save
const result = await processAndSaveImage(buffer, outputDir, baseFilename);
return {
success: true,
urls: {
full: pathToUrl(result.full),
medium: pathToUrl(result.medium),
thumb: pathToUrl(result.thumb),
},
bytesDownloaded: result.totalBytes,
};
}
catch (error) {
return {
success: false,
error: error.message || 'Failed to download image',
};
}
}
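// Usage sketch (illustrative only - the URL and IDs below are hypothetical):
//   const url = 'https://images.example.com/flower.jpg';
//   if (!(await imageExists(42, 'prod-123', url))) {
//       const res = await downloadProductImage(url, 42, 'prod-123');
//       if (res.success) console.log(res.urls.medium); // e.g. /images/products/42/prod-123-<hash>-medium.webp
//   }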
/**
* Download and store a brand logo locally
*
* @param logoUrl - The brand logo URL
* @param brandId - The brand ID or slug
* @returns Download result with local URL
*/
async function downloadBrandLogo(logoUrl, brandId) {
try {
if (!logoUrl) {
return { success: false, error: 'No logo URL provided' };
}
// Download the image
const buffer = await downloadImage(logoUrl);
// Brand logos go in /images/brands/
const outputDir = path.join(IMAGES_BASE_PATH, 'brands');
// Sanitize brand ID for filename
const safeBrandId = brandId.replace(/[^a-zA-Z0-9-_]/g, '_');
const urlHash = hashUrl(logoUrl);
const baseFilename = `${safeBrandId}-${urlHash}`;
// Process and save (single size for logos)
await ensureDir(outputDir);
const logoPath = path.join(outputDir, `${baseFilename}.webp`);
const logoBuffer = await (0, sharp_1.default)(buffer)
.resize(400, 400, { fit: 'inside', withoutEnlargement: true })
.webp({ quality: 85 })
.toBuffer();
await fs.writeFile(logoPath, logoBuffer);
return {
success: true,
urls: {
full: pathToUrl(logoPath),
medium: pathToUrl(logoPath),
thumb: pathToUrl(logoPath),
},
bytesDownloaded: logoBuffer.length,
};
}
catch (error) {
return {
success: false,
error: error.message || 'Failed to download brand logo',
};
}
}
/**
* Check if a local image already exists
*/
async function imageExists(dispensaryId, productId, imageUrl) {
const urlHash = hashUrl(imageUrl);
const imagePath = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId), `${productId}-${urlHash}.webp`);
try {
await fs.access(imagePath);
return true;
}
catch {
return false;
}
}
/**
* Delete a product's local images
*/
async function deleteProductImages(dispensaryId, productId, imageUrl) {
const productDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId));
const prefix = imageUrl
? `${productId}-${hashUrl(imageUrl)}`
: String(productId);
try {
const files = await fs.readdir(productDir);
const toDelete = files.filter(f => f.startsWith(prefix));
await Promise.all(toDelete.map(f => fs.unlink(path.join(productDir, f))));
}
catch {
// Directory might not exist, that's fine
}
}
/**
* Initialize the image storage directories
*/
async function initializeImageStorage() {
await ensureDir(path.join(IMAGES_BASE_PATH, 'products'));
await ensureDir(path.join(IMAGES_BASE_PATH, 'brands'));
console.log(`✅ Image storage initialized at ${IMAGES_BASE_PATH}`);
}
/**
* Get storage stats
*/
async function getStorageStats() {
const productsDir = path.join(IMAGES_BASE_PATH, 'products');
const brandsDir = path.join(IMAGES_BASE_PATH, 'brands');
let productCount = 0;
let brandCount = 0;
try {
const productDirs = await fs.readdir(productsDir);
for (const dir of productDirs) {
const files = await fs.readdir(path.join(productsDir, dir));
            // Count only full-size images (exclude the -thumb and -medium variants)
            productCount += files.filter(f => f.endsWith('.webp') && !f.includes('-thumb') && !f.includes('-medium')).length;
}
}
catch { /* ignore */ }
try {
const brandFiles = await fs.readdir(brandsDir);
brandCount = brandFiles.filter(f => f.endsWith('.webp')).length;
}
catch { /* ignore */ }
return {
productsDir,
brandsDir,
productCount,
brandCount,
};
}
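// Minimal smoke test (illustrative, not part of the original commit): running this file directly
// creates the storage directories and prints counts. IMAGES_PATH may need to point at a writable
// location when run outside the container.
if (require.main === module) {
    initializeImageStorage()
        .then(() => getStorageStats())
        .then((stats) => console.log('Image storage stats:', stats))
        .catch((err) => console.error('Image storage check failed:', err));
}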

View File

@@ -36,30 +36,61 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.minioClient = void 0;
exports.isMinioEnabled = isMinioEnabled;
exports.initializeMinio = initializeMinio;
exports.uploadImageFromUrl = uploadImageFromUrl;
exports.getImageUrl = getImageUrl;
exports.deleteImage = deleteImage;
exports.minioClient = getMinioClient;
const Minio = __importStar(require("minio"));
const axios_1 = __importDefault(require("axios"));
const uuid_1 = require("uuid");
const minioClient = new Minio.Client({
endPoint: process.env.MINIO_ENDPOINT || 'minio',
port: parseInt(process.env.MINIO_PORT || '9000'),
useSSL: process.env.MINIO_USE_SSL === 'true',
accessKey: process.env.MINIO_ACCESS_KEY || 'minioadmin',
secretKey: process.env.MINIO_SECRET_KEY || 'minioadmin',
});
exports.minioClient = minioClient;
const sharp_1 = __importDefault(require("sharp"));
const fs = __importStar(require("fs/promises"));
const path = __importStar(require("path"));
let minioClient = null;
// Check if MinIO is configured
function isMinioEnabled() {
return !!process.env.MINIO_ENDPOINT;
}
// Local storage path for images when MinIO is not configured
const LOCAL_IMAGES_PATH = process.env.LOCAL_IMAGES_PATH || '/app/public/images';
function getMinioClient() {
if (!minioClient) {
minioClient = new Minio.Client({
endPoint: process.env.MINIO_ENDPOINT || 'minio',
port: parseInt(process.env.MINIO_PORT || '9000'),
useSSL: process.env.MINIO_USE_SSL === 'true',
accessKey: process.env.MINIO_ACCESS_KEY || 'minioadmin',
secretKey: process.env.MINIO_SECRET_KEY || 'minioadmin',
});
}
return minioClient;
}
const BUCKET_NAME = process.env.MINIO_BUCKET || 'dutchie';
async function initializeMinio() {
// Skip MinIO initialization if not configured
if (!isMinioEnabled()) {
console.log(' MinIO not configured (MINIO_ENDPOINT not set), using local filesystem storage');
// Ensure local images directory exists
try {
await fs.mkdir(LOCAL_IMAGES_PATH, { recursive: true });
await fs.mkdir(path.join(LOCAL_IMAGES_PATH, 'products'), { recursive: true });
console.log(`✅ Local images directory ready: ${LOCAL_IMAGES_PATH}`);
}
catch (error) {
console.error('❌ Failed to create local images directory:', error);
throw error;
}
return;
}
try {
const client = getMinioClient();
// Check if bucket exists
const exists = await minioClient.bucketExists(BUCKET_NAME);
const exists = await client.bucketExists(BUCKET_NAME);
if (!exists) {
// Create bucket
await minioClient.makeBucket(BUCKET_NAME, 'us-east-1');
await client.makeBucket(BUCKET_NAME, 'us-east-1');
console.log(`✅ Minio bucket created: ${BUCKET_NAME}`);
// Set public read policy
const policy = {
@@ -73,7 +104,7 @@ async function initializeMinio() {
},
],
};
await minioClient.setBucketPolicy(BUCKET_NAME, JSON.stringify(policy));
await client.setBucketPolicy(BUCKET_NAME, JSON.stringify(policy));
console.log(`✅ Bucket policy set to public read`);
}
else {
@@ -85,36 +116,145 @@ async function initializeMinio() {
throw error;
}
}
async function uploadImageFromUrl(imageUrl, productId) {
async function removeBackground(buffer) {
try {
// Get image metadata to check if it has an alpha channel
const metadata = await (0, sharp_1.default)(buffer).metadata();
// If image already has transparency, trim and optimize it
if (metadata.hasAlpha) {
return await (0, sharp_1.default)(buffer)
.trim() // Remove transparent borders
.toBuffer();
}
// For images without alpha (like JPEGs with solid backgrounds),
// we'll use a threshold-based approach to detect and remove solid backgrounds
// This works well for product images on solid color backgrounds
// Convert to PNG with alpha channel, then flatten with transparency
const withAlpha = await (0, sharp_1.default)(buffer)
.ensureAlpha() // Add alpha channel
.toBuffer();
        // Note: this is not true background removal - it only adds an alpha channel and trims;
        // for real removal of solid backgrounds, use the remove.bg API or an ML segmentation model
return await (0, sharp_1.default)(withAlpha)
.flatten({ background: { r: 0, g: 0, b: 0, alpha: 0 } })
.trim()
.toBuffer();
}
catch (error) {
console.warn('Background removal failed, using original image:', error);
return buffer;
}
}
async function uploadToLocalFilesystem(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename) {
const thumbnailPath = `${baseFilename}-thumb.png`;
const mediumPath = `${baseFilename}-medium.png`;
const fullPath = `${baseFilename}-full.png`;
// Ensure the target directory exists (in case initializeMinio wasn't called)
// Extract directory from baseFilename (e.g., 'products/store-slug' or just 'products')
const targetDir = path.join(LOCAL_IMAGES_PATH, path.dirname(baseFilename));
await fs.mkdir(targetDir, { recursive: true });
await Promise.all([
fs.writeFile(path.join(LOCAL_IMAGES_PATH, thumbnailPath), thumbnailBuffer),
fs.writeFile(path.join(LOCAL_IMAGES_PATH, mediumPath), mediumBuffer),
fs.writeFile(path.join(LOCAL_IMAGES_PATH, fullPath), fullBuffer),
]);
return {
thumbnail: thumbnailPath,
medium: mediumPath,
full: fullPath,
};
}
async function uploadToMinio(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename) {
const client = getMinioClient();
const thumbnailPath = `${baseFilename}-thumb.png`;
const mediumPath = `${baseFilename}-medium.png`;
const fullPath = `${baseFilename}-full.png`;
await Promise.all([
client.putObject(BUCKET_NAME, thumbnailPath, thumbnailBuffer, thumbnailBuffer.length, {
'Content-Type': 'image/png',
}),
client.putObject(BUCKET_NAME, mediumPath, mediumBuffer, mediumBuffer.length, {
'Content-Type': 'image/png',
}),
client.putObject(BUCKET_NAME, fullPath, fullBuffer, fullBuffer.length, {
'Content-Type': 'image/png',
}),
]);
return {
thumbnail: thumbnailPath,
medium: mediumPath,
full: fullPath,
};
}
async function uploadImageFromUrl(imageUrl, productId, storeSlug, removeBackgrounds = true) {
try {
// Download image
const response = await axios_1.default.get(imageUrl, { responseType: 'arraybuffer' });
const buffer = Buffer.from(response.data);
// Generate unique filename
const ext = imageUrl.split('.').pop()?.split('?')[0] || 'jpg';
const filename = `products/${productId}-${(0, uuid_1.v4)()}.${ext}`;
// Get content type
const contentType = response.headers['content-type'] || 'image/jpeg';
// Upload to Minio
await minioClient.putObject(BUCKET_NAME, filename, buffer, buffer.length, {
'Content-Type': contentType,
});
// Return the path (URL will be constructed when serving)
return filename;
let buffer = Buffer.from(response.data);
// Remove background if enabled
if (removeBackgrounds) {
buffer = await removeBackground(buffer);
}
// Generate unique base filename - organize by store if slug provided
const storeDir = storeSlug ? `products/${storeSlug}` : 'products';
const baseFilename = `${storeDir}/${productId}-${(0, uuid_1.v4)()}`;
        // Create multiple sizes with Sharp; output as PNG to preserve transparency
const [thumbnailBuffer, mediumBuffer, fullBuffer] = await Promise.all([
// Thumbnail: 300x300
(0, sharp_1.default)(buffer)
.resize(300, 300, { fit: 'inside', background: { r: 0, g: 0, b: 0, alpha: 0 } })
.png({ quality: 80, compressionLevel: 9 })
.toBuffer(),
// Medium: 800x800
(0, sharp_1.default)(buffer)
.resize(800, 800, { fit: 'inside', background: { r: 0, g: 0, b: 0, alpha: 0 } })
.png({ quality: 85, compressionLevel: 9 })
.toBuffer(),
// Full: 2000x2000 (optimized)
(0, sharp_1.default)(buffer)
.resize(2000, 2000, { fit: 'inside', withoutEnlargement: true, background: { r: 0, g: 0, b: 0, alpha: 0 } })
.png({ quality: 90, compressionLevel: 9 })
.toBuffer(),
]);
// Upload to appropriate storage backend
let result;
if (isMinioEnabled()) {
result = await uploadToMinio(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename);
}
else {
result = await uploadToLocalFilesystem(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename);
}
console.log(`✅ Uploaded 3 sizes for product ${productId}: ${thumbnailBuffer.length + mediumBuffer.length + fullBuffer.length} bytes total`);
return result;
}
catch (error) {
console.error('Error uploading image:', error);
throw error;
}
}
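// Usage sketch (illustrative only - the URL, product ID, and store slug are hypothetical):
//   const paths = await uploadImageFromUrl('https://images.example.com/flower.jpg', 'prod-123', 'some-store-az');
//   // -> { thumbnail: 'products/some-store-az/prod-123-<uuid>-thumb.png', medium: '...-medium.png', full: '...-full.png' }
//   const publicUrl = getImageUrl(paths.medium); // MinIO URL or /images/... path depending on MINIO_ENDPOINT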
function getImageUrl(path) {
// Use localhost:9020 for browser access since Minio is exposed on host port 9020
const endpoint = process.env.MINIO_PUBLIC_ENDPOINT || 'http://localhost:9020';
return `${endpoint}/${BUCKET_NAME}/${path}`;
function getImageUrl(imagePath) {
if (isMinioEnabled()) {
// Use MinIO endpoint for browser access
const endpoint = process.env.MINIO_PUBLIC_ENDPOINT || 'http://localhost:9020';
return `${endpoint}/${BUCKET_NAME}/${imagePath}`;
}
else {
// Use local path - served via Express static middleware
const publicUrl = process.env.PUBLIC_URL || '';
return `${publicUrl}/images/${imagePath}`;
}
}
async function deleteImage(path) {
async function deleteImage(imagePath) {
try {
await minioClient.removeObject(BUCKET_NAME, path);
if (isMinioEnabled()) {
const client = getMinioClient();
await client.removeObject(BUCKET_NAME, imagePath);
}
else {
const fullPath = path.join(LOCAL_IMAGES_PATH, imagePath);
await fs.unlink(fullPath);
}
}
catch (error) {
console.error('Error deleting image:', error);

181 backend/dist/utils/product-normalizer.js vendored Normal file
View File

@@ -0,0 +1,181 @@
"use strict";
/**
* Product Normalizer Utility
*
* Functions for normalizing product data to enable consistent matching
* and prevent duplicate product entries.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeProductName = normalizeProductName;
exports.normalizeBrandName = normalizeBrandName;
exports.normalizeWeight = normalizeWeight;
exports.generateProductFingerprint = generateProductFingerprint;
exports.stringSimilarity = stringSimilarity;
exports.areProductsSimilar = areProductsSimilar;
/**
* Normalize product name for matching
* - Lowercase
* - Remove punctuation
* - Remove THC/CBD percentages often appended to names
* - Remove weight suffixes
* - Remove emoji
* - Normalize whitespace
*/
function normalizeProductName(name) {
if (!name)
return '';
return name
.toLowerCase()
.trim()
// Remove special characters except alphanumeric and spaces
.replace(/[^\w\s]/g, ' ')
// Remove common suffixes like THC/CBD percentages appended to names
.replace(/\s*(thc|cbd|cbg|cbn|tac)\s*[:=]?\s*[\d.]+\s*%?/gi, '')
// Remove weight/size suffixes often appended
.replace(/\s*\d+(\.\d+)?\s*(mg|g|oz|ml|gram|grams|ounce|ounces)\b/gi, '')
// Remove emoji
.replace(/[\u{1F300}-\u{1F9FF}]/gu, '')
// Remove "special offer" type suffixes
.replace(/\s*special\s*offer\s*/gi, '')
// Normalize multiple spaces to single space
.replace(/\s+/g, ' ')
.trim();
}
/**
* Normalize brand name for matching
*/
function normalizeBrandName(brand) {
if (!brand)
return '';
return brand
.toLowerCase()
.trim()
// Remove special characters
.replace(/[^\w\s]/g, ' ')
// Normalize whitespace
.replace(/\s+/g, ' ')
.trim();
}
/**
* Normalize weight string to standard format
* e.g., "3.5 grams" -> "3.5g", "1/8 oz" -> "3.5g"
*/
function normalizeWeight(weight) {
if (!weight)
return '';
const w = weight.toLowerCase().trim();
// Handle fractional ounces
if (w.includes('1/8') || w.includes('eighth')) {
return '3.5g';
}
if (w.includes('1/4') || w.includes('quarter')) {
return '7g';
}
if (w.includes('1/2') || w.includes('half')) {
return '14g';
}
if (w.includes('1 oz') || w === 'oz' || w === '1oz') {
return '28g';
}
// Extract numeric value and unit
const match = w.match(/([\d.]+)\s*(mg|g|oz|ml|gram|grams?|ounce|ounces?)?/i);
if (!match)
return w;
const value = parseFloat(match[1]);
let unit = (match[2] || 'g').toLowerCase();
// Normalize unit names
unit = unit.replace(/gram(s)?/, 'g').replace(/ounce(s)?/, 'oz');
// Convert oz to grams for consistency
if (unit === 'oz') {
return `${(value * 28).toFixed(1)}g`;
}
return `${value}${unit}`;
}
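// Worked examples (illustrative):
//   normalizeWeight('1/8 oz')    // -> '3.5g'
//   normalizeWeight('3.5 grams') // -> '3.5g'
//   normalizeWeight('1 oz')      // -> '28g'
//   normalizeWeight('100mg')     // -> '100mg'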
/**
* Generate a matching fingerprint for a product
* Used for deduplication
*/
function generateProductFingerprint(name, brand, weight, categoryId) {
const parts = [
normalizeProductName(name),
normalizeBrandName(brand),
normalizeWeight(weight),
categoryId?.toString() || ''
];
return parts.filter(Boolean).join('|');
}
/**
* Calculate similarity between two strings (0-100)
* Uses Levenshtein distance
*/
function stringSimilarity(str1, str2) {
if (str1 === str2)
return 100;
if (!str1 || !str2)
return 0;
const s1 = str1.toLowerCase();
const s2 = str2.toLowerCase();
if (s1 === s2)
return 100;
const longer = s1.length > s2.length ? s1 : s2;
const shorter = s1.length > s2.length ? s2 : s1;
const longerLength = longer.length;
if (longerLength === 0)
return 100;
const distance = levenshteinDistance(longer, shorter);
return Math.round(((longerLength - distance) / longerLength) * 100);
}
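// Worked example (illustrative): 'blue dream' vs 'blue dreams'
//   Levenshtein distance = 1, longer length = 11 -> Math.round((11 - 1) / 11 * 100) = 91
//   stringSimilarity('blue dream', 'blue dreams') // -> 91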
/**
* Levenshtein distance between two strings
*/
function levenshteinDistance(str1, str2) {
const m = str1.length;
const n = str2.length;
// Create distance matrix
const dp = Array(m + 1).fill(null).map(() => Array(n + 1).fill(0));
// Initialize first row and column
for (let i = 0; i <= m; i++)
dp[i][0] = i;
for (let j = 0; j <= n; j++)
dp[0][j] = j;
// Fill in the rest
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
dp[i][j] = Math.min(dp[i - 1][j] + 1, // deletion
dp[i][j - 1] + 1, // insertion
dp[i - 1][j - 1] + cost // substitution
);
}
}
return dp[m][n];
}
/**
* Check if two products are likely the same
* Returns confidence score (0-100)
*/
function areProductsSimilar(product1, product2, threshold = 92) {
const name1 = normalizeProductName(product1.name);
const name2 = normalizeProductName(product2.name);
const nameSimilarity = stringSimilarity(name1, name2);
// If names are very similar, likely same product
if (nameSimilarity >= threshold) {
return { isSimilar: true, confidence: nameSimilarity };
}
// Check brand match for additional confidence
const brand1 = normalizeBrandName(product1.brand);
const brand2 = normalizeBrandName(product2.brand);
if (brand1 && brand2 && brand1 === brand2) {
// Same brand, lower threshold for name match
if (nameSimilarity >= threshold - 10) {
return { isSimilar: true, confidence: nameSimilarity + 5 };
}
}
// Check weight match
const weight1 = normalizeWeight(product1.weight);
const weight2 = normalizeWeight(product2.weight);
if (weight1 && weight2 && weight1 === weight2 && nameSimilarity >= threshold - 15) {
return { isSimilar: true, confidence: nameSimilarity + 3 };
}
return { isSimilar: false, confidence: nameSimilarity };
}
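// Minimal check (illustrative, not part of the original commit): run this file directly to see
// fingerprinting and fuzzy matching on two hypothetical listings of the same product.
if (require.main === module) {
    const a = { name: 'Blue Dream THC: 24%', brand: 'Grow Co.', weight: '1/8 oz' };
    const b = { name: 'Blue Dreams', brand: 'Grow Co', weight: '3.5 grams' };
    console.log(generateProductFingerprint(a.name, a.brand, a.weight)); // 'blue dream|grow co|3.5g'
    console.log(areProductsSimilar(a, b)); // { isSimilar: true, confidence: 96 } via the same-brand lowered threshold
}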

112 backend/dist/utils/proxyManager.js vendored Normal file
View File

@@ -0,0 +1,112 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.getProxy = getProxy;
exports.getPhoenixProxy = getPhoenixProxy;
exports.getStateProxy = getStateProxy;
exports.getCityProxy = getCityProxy;
exports.getRandomProxy = getRandomProxy;
exports.getProxyLocationStats = getProxyLocationStats;
const migrate_1 = require("../db/migrate");
const logger_1 = require("../services/logger");
/**
* Get an active proxy from the database, optionally filtered by location
*/
async function getProxy(locationFilter) {
try {
let query = `
SELECT protocol, host, port, username, password
FROM proxies
WHERE active = true
`;
const params = [];
let paramIndex = 1;
if (locationFilter) {
if (locationFilter.city) {
query += ` AND LOWER(city) = LOWER($${paramIndex})`;
params.push(locationFilter.city);
paramIndex++;
}
if (locationFilter.state) {
query += ` AND LOWER(state) = LOWER($${paramIndex})`;
params.push(locationFilter.state);
paramIndex++;
}
if (locationFilter.country) {
query += ` AND LOWER(country) = LOWER($${paramIndex})`;
params.push(locationFilter.country);
paramIndex++;
}
if (locationFilter.countryCode) {
query += ` AND LOWER(country_code) = LOWER($${paramIndex})`;
params.push(locationFilter.countryCode);
paramIndex++;
}
}
// Use RANDOM() for true randomization instead of least recently used
query += ` ORDER BY RANDOM() LIMIT 1`;
const result = await migrate_1.pool.query(query, params);
if (result.rows.length === 0) {
logger_1.logger.warn('proxy', `No active proxies found with filter: ${JSON.stringify(locationFilter)}`);
return null;
}
const proxy = result.rows[0];
return {
server: `${proxy.protocol}://${proxy.host}:${proxy.port}`,
username: proxy.username || undefined,
password: proxy.password || undefined,
};
}
catch (error) {
logger_1.logger.error('proxy', `Error fetching proxy: ${error}`);
return null;
}
}
/**
* Get a proxy from Phoenix, AZ, USA (ideal for Arizona dispensaries)
*/
async function getPhoenixProxy() {
return getProxy({ city: 'Phoenix', state: 'Arizona', country: 'United States' });
}
/**
* Get a proxy from a specific US state
*/
async function getStateProxy(state) {
return getProxy({ state, country: 'United States' });
}
/**
* Get a proxy from a specific city
*/
async function getCityProxy(city, state) {
return getProxy({ city, state });
}
/**
* Get a random active proxy (no location filter)
*/
async function getRandomProxy() {
return getProxy();
}
/**
* Get proxy location statistics
*/
async function getProxyLocationStats() {
try {
const result = await migrate_1.pool.query(`
SELECT
country,
state,
city,
COUNT(*) as count,
SUM(CASE WHEN active THEN 1 ELSE 0 END) as active_count
FROM proxies
WHERE country IS NOT NULL
GROUP BY country, state, city
ORDER BY count DESC
LIMIT 50
`);
return result.rows;
}
catch (error) {
logger_1.logger.error('proxy', `Error fetching proxy stats: ${error}`);
return [];
}
}
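// Minimal check (illustrative, not part of the original commit): run this file directly to print
// proxy coverage and pick an Arizona proxy. Requires a reachable database (CRAWLSY_DATABASE_URL or
// DATABASE_URL) with the proxies table populated.
if (require.main === module) {
    (async () => {
        const stats = await getProxyLocationStats();
        console.log(`Proxy locations tracked: ${stats.length}`);
        const proxy = await getStateProxy('Arizona');
        console.log(proxy ? `Selected proxy ${proxy.server}` : 'No active Arizona proxy found');
        await migrate_1.pool.end(); // close the pg pool so the script can exit
    })();
}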

264 backend/dist/utils/stealthBrowser.js vendored Normal file
View File

@@ -0,0 +1,264 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.createStealthBrowser = createStealthBrowser;
exports.createStealthContext = createStealthContext;
exports.randomDelay = randomDelay;
exports.humanMouseMove = humanMouseMove;
exports.humanScroll = humanScroll;
exports.humanType = humanType;
exports.simulateHumanBehavior = simulateHumanBehavior;
exports.waitForPageLoad = waitForPageLoad;
exports.isCloudflareChallenge = isCloudflareChallenge;
exports.waitForCloudflareChallenge = waitForCloudflareChallenge;
exports.saveCookies = saveCookies;
exports.loadCookies = loadCookies;
const playwright_extra_1 = require("playwright-extra");
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
// Add stealth plugin
playwright_extra_1.chromium.use((0, puppeteer_extra_plugin_stealth_1.default)());
/**
* Create a stealth browser instance with anti-detection measures
*/
async function createStealthBrowser(options = {}) {
const launchOptions = {
headless: options.headless !== false,
args: [
'--disable-blink-features=AutomationControlled',
            // Chromium only honors the last --disable-features switch, so all disabled features go in one value
            '--disable-features=IsolateOrigins,site-per-process,VizDisplayCompositor',
            '--disable-web-security',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--no-first-run',
'--no-zygote',
'--disable-gpu',
],
};
if (options.proxy) {
launchOptions.proxy = options.proxy;
}
const browser = await playwright_extra_1.chromium.launch(launchOptions);
return browser;
}
/**
* Create a stealth context with realistic browser fingerprint
*/
async function createStealthContext(browser, options = {}) {
const userAgent = options.userAgent ||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
const context = await browser.newContext({
userAgent,
viewport: { width: 1920, height: 1080 },
locale: 'en-US',
timezoneId: 'America/Phoenix',
permissions: ['geolocation'],
geolocation: { latitude: 33.4484, longitude: -112.074 }, // Phoenix, AZ
colorScheme: 'light',
deviceScaleFactor: 1,
hasTouch: false,
isMobile: false,
javaScriptEnabled: true,
extraHTTPHeaders: {
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Upgrade-Insecure-Requests': '1',
},
});
// Set age verification cookies for Dutchie
await context.addCookies([
{
name: 'age_verified',
value: 'true',
domain: '.dutchie.com',
path: '/',
expires: Math.floor(Date.now() / 1000) + 86400 * 30, // 30 days
},
{
name: 'initial_location',
value: JSON.stringify({ state: options.state || 'Arizona' }),
domain: '.dutchie.com',
path: '/',
expires: Math.floor(Date.now() / 1000) + 86400 * 30,
},
]);
return context;
}
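// Typical call pattern (illustrative only - the proxy value is hypothetical; a fuller runnable
// sketch follows at the end of this file):
//   const browser = await createStealthBrowser({ headless: true, proxy: { server: 'http://proxy-host:8080' } });
//   const context = await createStealthContext(browser, { state: 'Arizona' });
//   const page = await context.newPage();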
/**
* Random delay between min and max milliseconds
*/
function randomDelay(min, max) {
const delay = Math.floor(Math.random() * (max - min + 1)) + min;
return new Promise((resolve) => setTimeout(resolve, delay));
}
/**
* Simulate human-like mouse movement
*/
async function humanMouseMove(page, x, y) {
const steps = 20;
    // Playwright does not expose the current mouse position, so start the eased path from the origin
    const currentPos = { x: 0, y: 0 };
for (let i = 0; i <= steps; i++) {
const progress = i / steps;
const easeProgress = easeInOutQuad(progress);
const nextX = currentPos.x + (x - currentPos.x) * easeProgress;
const nextY = currentPos.y + (y - currentPos.y) * easeProgress;
await page.mouse.move(nextX, nextY);
await randomDelay(5, 15);
}
}
/**
* Easing function for smooth mouse movement
*/
function easeInOutQuad(t) {
return t < 0.5 ? 2 * t * t : -1 + (4 - 2 * t) * t;
}
/**
* Simulate human-like scrolling
*/
async function humanScroll(page, scrollAmount = 500) {
const scrollSteps = 10;
const stepSize = scrollAmount / scrollSteps;
for (let i = 0; i < scrollSteps; i++) {
await page.mouse.wheel(0, stepSize);
await randomDelay(50, 150);
}
}
/**
* Simulate human-like typing
*/
async function humanType(page, selector, text) {
await page.click(selector);
await randomDelay(100, 300);
for (const char of text) {
await page.keyboard.type(char);
await randomDelay(50, 150);
}
}
/**
* Random realistic behavior before interacting with page
*/
async function simulateHumanBehavior(page) {
// Random small mouse movements
for (let i = 0; i < 3; i++) {
const x = Math.random() * 500 + 100;
const y = Math.random() * 300 + 100;
await humanMouseMove(page, x, y);
await randomDelay(200, 500);
}
// Small scroll
await humanScroll(page, 100);
await randomDelay(300, 700);
}
/**
* Wait for page to be fully loaded with human-like delay
*/
async function waitForPageLoad(page, timeout = 60000) {
try {
await page.waitForLoadState('networkidle', { timeout });
await randomDelay(500, 1500); // Random delay after load
}
catch (error) {
// If networkidle times out, try domcontentloaded as fallback
console.log('⚠️ networkidle timeout, waiting for domcontentloaded...');
await page.waitForLoadState('domcontentloaded', { timeout: 30000 });
await randomDelay(1000, 2000);
}
}
/**
* Check if we're on a Cloudflare challenge page
*/
async function isCloudflareChallenge(page) {
const title = await page.title();
const content = await page.content();
return (title.includes('Cloudflare') ||
title.includes('Just a moment') ||
title.includes('Attention Required') ||
content.includes('challenge-platform') ||
content.includes('cf-challenge') ||
content.includes('Checking your browser'));
}
/**
* Wait for Cloudflare challenge to complete
*/
async function waitForCloudflareChallenge(page, maxWaitMs = 60000) {
const startTime = Date.now();
let attempts = 0;
while (Date.now() - startTime < maxWaitMs) {
attempts++;
if (!(await isCloudflareChallenge(page))) {
console.log(`✅ Cloudflare challenge passed after ${attempts} attempts (${Math.floor((Date.now() - startTime) / 1000)}s)`);
return true;
}
const remaining = Math.floor((maxWaitMs - (Date.now() - startTime)) / 1000);
console.log(`⏳ Waiting for Cloudflare challenge... (attempt ${attempts}, ${remaining}s remaining)`);
// Random delay between checks
await randomDelay(2000, 3000);
}
console.log('❌ Cloudflare challenge timeout - may need residential proxy or manual intervention');
return false;
}
/**
* Save session cookies to file
*/
async function saveCookies(context, filepath) {
const cookies = await context.cookies();
const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
await fs.writeFile(filepath, JSON.stringify(cookies, null, 2));
}
/**
* Load session cookies from file
*/
async function loadCookies(context, filepath) {
try {
const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
const cookiesString = await fs.readFile(filepath, 'utf-8');
const cookies = JSON.parse(cookiesString);
await context.addCookies(cookies);
return true;
}
catch (error) {
return false;
}
}
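// End-to-end sketch (illustrative, not part of the original commit): run this file directly to
// open a page with the stealth setup, wait out a possible Cloudflare challenge, and persist
// cookies. The target URL and cookie path are hypothetical.
if (require.main === module) {
    (async () => {
        const browser = await createStealthBrowser({ headless: true });
        const context = await createStealthContext(browser, { state: 'Arizona' });
        const page = await context.newPage();
        await page.goto('https://dutchie.com', { waitUntil: 'domcontentloaded' });
        await waitForPageLoad(page);
        if (await isCloudflareChallenge(page)) {
            await waitForCloudflareChallenge(page);
        }
        await simulateHumanBehavior(page);
        await saveCookies(context, '/tmp/dutchie-cookies.json');
        await browser.close();
    })().catch((err) => {
        console.error('Stealth browser sketch failed:', err);
        process.exit(1);
    });
}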