fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
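For context, a sketch of the kind of query this commit fixes. The actual monitor query lives in the routes diff, which is suppressed below, so this is an assumption based on the job_run_logs schema defined in schema.js later in this diff; `pool` and `limit` are placeholders:

    // Hypothetical corrected monitor query: select only orchestration
    // columns that actually exist on job_run_logs (no worker_id/worker_hostname).
    const { rows } = await pool.query(`
        SELECT id, schedule_id, job_name, status,
               started_at, completed_at, duration_ms, error_message
        FROM job_run_logs
        ORDER BY created_at DESC
        LIMIT $1
    `, [limit]);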

backend/dist/auth/middleware.js (vendored, 60 changed lines)
@@ -39,18 +39,66 @@ async function authenticateUser(email, password) {
         role: user.role
     };
 }
-function authMiddleware(req, res, next) {
+async function authMiddleware(req, res, next) {
     const authHeader = req.headers.authorization;
     if (!authHeader || !authHeader.startsWith('Bearer ')) {
         return res.status(401).json({ error: 'No token provided' });
     }
     const token = authHeader.substring(7);
-    const user = verifyToken(token);
-    if (!user) {
-        return res.status(401).json({ error: 'Invalid token' });
+    // Try JWT first
+    const jwtUser = verifyToken(token);
+    if (jwtUser) {
+        req.user = jwtUser;
+        return next();
     }
+    // If JWT fails, try API token
+    try {
+        const result = await migrate_1.pool.query(`
+            SELECT id, name, rate_limit, active, expires_at, allowed_endpoints
+            FROM api_tokens
+            WHERE token = $1
+        `, [token]);
+        if (result.rows.length === 0) {
+            return res.status(401).json({ error: 'Invalid token' });
+        }
+        const apiToken = result.rows[0];
+        // Check if token is active
+        if (!apiToken.active) {
+            return res.status(401).json({ error: 'Token is disabled' });
+        }
+        // Check if token is expired
+        if (apiToken.expires_at && new Date(apiToken.expires_at) < new Date()) {
+            return res.status(401).json({ error: 'Token has expired' });
+        }
+        // Check allowed endpoints
+        if (apiToken.allowed_endpoints && apiToken.allowed_endpoints.length > 0) {
+            const isAllowed = apiToken.allowed_endpoints.some((pattern) => {
+                // Simple wildcard matching
+                const regex = new RegExp('^' + pattern.replace('*', '.*') + '$');
+                return regex.test(req.path);
+            });
+            if (!isAllowed) {
+                return res.status(403).json({ error: 'Endpoint not allowed for this token' });
+            }
+        }
+        // Set API token on request for tracking
+        req.apiToken = {
+            id: apiToken.id,
+            name: apiToken.name,
+            rate_limit: apiToken.rate_limit
+        };
+        // Set a generic user for compatibility with existing code
+        req.user = {
+            id: apiToken.id,
+            email: `api-token-${apiToken.id}@system`,
+            role: 'api'
+        };
+        next();
+    }
+    catch (error) {
+        console.error('Error verifying API token:', error);
+        return res.status(500).json({ error: 'Authentication failed' });
+    }
-    req.user = user;
-    next();
 }
 function requireRole(...roles) {
     return (req, res, next) => {
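Note on the wildcard check above: `new RegExp('^' + pattern.replace('*', '.*') + '$')` anchors the pattern and replaces only the first '*' (String.prototype.replace with a string argument replaces a single occurrence). Illustrative behavior, assuming a token with allowed_endpoints = ['/api/products/*'] (a hypothetical pattern, not from this diff):

    // pattern '/api/products/*' becomes new RegExp('^/api/products/.*$')
    // req.path '/api/products/123' -> allowed
    // req.path '/api/orders'       -> 403 Endpoint not allowed for this token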

backend/dist/db/migrate.js (vendored, 141 changed lines)
@@ -3,8 +3,14 @@ Object.defineProperty(exports, "__esModule", { value: true });
 exports.pool = void 0;
 exports.runMigrations = runMigrations;
 const pg_1 = require("pg");
+// Consolidated DB connection:
+// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
+// - Then DATABASE_URL (default)
+const DATABASE_URL = process.env.CRAWLSY_DATABASE_URL ||
+    process.env.DATABASE_URL ||
+    'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
 const pool = new pg_1.Pool({
-    connectionString: process.env.DATABASE_URL,
+    connectionString: DATABASE_URL,
 });
 exports.pool = pool;
 async function runMigrations() {
@@ -94,6 +100,99 @@ async function runMigrations() {
         created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
         updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
       );
     `);
+    // Add variant column to products table (for different sizes/options of same product)
+    await client.query(`
+      ALTER TABLE products ADD COLUMN IF NOT EXISTS variant VARCHAR(255);
+    `);
+    // Add special tracking columns (DEPRECATED - not used with new approach)
+    await client.query(`
+      ALTER TABLE products ADD COLUMN IF NOT EXISTS special_ends_at TIMESTAMP;
+      ALTER TABLE products ADD COLUMN IF NOT EXISTS special_text TEXT;
+      ALTER TABLE products ADD COLUMN IF NOT EXISTS special_type VARCHAR(100);
+    `);
+    // ====== NEW SCHEMA ADDITIONS ======
+    // Add array columns for product attributes
+    await client.query(`
+      ALTER TABLE products ADD COLUMN IF NOT EXISTS terpenes TEXT[];
+      ALTER TABLE products ADD COLUMN IF NOT EXISTS effects TEXT[];
+      ALTER TABLE products ADD COLUMN IF NOT EXISTS flavors TEXT[];
+    `);
+    // Add new price columns (regular_price = market price, sale_price = discount price)
+    await client.query(`
+      ALTER TABLE products ADD COLUMN IF NOT EXISTS regular_price DECIMAL(10, 2);
+      ALTER TABLE products ADD COLUMN IF NOT EXISTS sale_price DECIMAL(10, 2);
+    `);
+    // Migrate existing price data
+    await client.query(`
+      UPDATE products
+      SET regular_price = original_price
+      WHERE regular_price IS NULL AND original_price IS NOT NULL;
+    `);
+    await client.query(`
+      UPDATE products
+      SET sale_price = price
+      WHERE sale_price IS NULL AND price IS NOT NULL AND original_price IS NOT NULL AND price < original_price;
+    `);
+    // Make slug NOT NULL and add unique constraint
+    await client.query(`
+      UPDATE products SET slug = dutchie_product_id WHERE slug IS NULL;
+      ALTER TABLE products ALTER COLUMN slug SET NOT NULL;
+    `);
+    // Drop old unique constraint and add new one on slug
+    await client.query(`
+      ALTER TABLE products DROP CONSTRAINT IF EXISTS products_store_id_dutchie_product_id_key;
+      DO $$
+      BEGIN
+        IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'products_store_id_slug_unique') THEN
+          ALTER TABLE products ADD CONSTRAINT products_store_id_slug_unique UNIQUE (store_id, slug);
+        END IF;
+      END$$;
+    `);
+    // Product Categories (many-to-many) - products can appear in multiple categories
+    await client.query(`
+      CREATE TABLE IF NOT EXISTS product_categories (
+        id SERIAL PRIMARY KEY,
+        product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
+        category_slug VARCHAR(255) NOT NULL,
+        first_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        last_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        UNIQUE(product_id, category_slug)
+      );
+    `);
+    await client.query(`
+      CREATE INDEX IF NOT EXISTS idx_product_categories_slug ON product_categories(category_slug, last_seen_at DESC);
+      CREATE INDEX IF NOT EXISTS idx_product_categories_product ON product_categories(product_id);
+    `);
+    // Price History - track regular and sale price changes over time
+    await client.query(`
+      CREATE TABLE IF NOT EXISTS price_history (
+        id SERIAL PRIMARY KEY,
+        product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
+        regular_price DECIMAL(10, 2),
+        sale_price DECIMAL(10, 2),
+        recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+      );
+    `);
+    await client.query(`
+      CREATE INDEX IF NOT EXISTS idx_price_history_product ON price_history(product_id, recorded_at DESC);
+      CREATE INDEX IF NOT EXISTS idx_price_history_recorded ON price_history(recorded_at DESC);
+    `);
+    // Batch History - track cannabinoid/terpene changes (different batches)
+    await client.query(`
+      CREATE TABLE IF NOT EXISTS batch_history (
+        id SERIAL PRIMARY KEY,
+        product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
+        thc_percentage DECIMAL(5, 2),
+        cbd_percentage DECIMAL(5, 2),
+        terpenes TEXT[],
+        strain_type VARCHAR(100),
+        recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+      );
+    `);
+    await client.query(`
+      CREATE INDEX IF NOT EXISTS idx_batch_history_product ON batch_history(product_id, recorded_at DESC);
+      CREATE INDEX IF NOT EXISTS idx_batch_history_recorded ON batch_history(recorded_at DESC);
+    `);
     // Campaign products (many-to-many with ordering)
     await client.query(`
@@ -138,10 +237,50 @@ async function runMigrations() {
         last_tested_at TIMESTAMP,
         test_result VARCHAR(50),
         response_time_ms INTEGER,
+        failure_count INTEGER DEFAULT 0,
         created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
         updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
         UNIQUE(host, port, protocol)
       );
     `);
+    // Add failure_count column if it doesn't exist
+    await client.query(`
+      ALTER TABLE proxies ADD COLUMN IF NOT EXISTS failure_count INTEGER DEFAULT 0;
+    `);
+    // Failed proxies table
+    await client.query(`
+      CREATE TABLE IF NOT EXISTS failed_proxies (
+        id SERIAL PRIMARY KEY,
+        host VARCHAR(255) NOT NULL,
+        port INTEGER NOT NULL,
+        protocol VARCHAR(10) NOT NULL,
+        username VARCHAR(255),
+        password VARCHAR(255),
+        failure_count INTEGER NOT NULL,
+        last_error TEXT,
+        failed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        UNIQUE(host, port, protocol)
+      );
+    `);
+    // Proxy test jobs table
+    await client.query(`
+      CREATE TABLE IF NOT EXISTS proxy_test_jobs (
+        id SERIAL PRIMARY KEY,
+        status VARCHAR(20) NOT NULL DEFAULT 'pending',
+        total_proxies INTEGER NOT NULL DEFAULT 0,
+        tested_proxies INTEGER NOT NULL DEFAULT 0,
+        passed_proxies INTEGER NOT NULL DEFAULT 0,
+        failed_proxies INTEGER NOT NULL DEFAULT 0,
+        started_at TIMESTAMP,
+        completed_at TIMESTAMP,
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+      );
+    `);
+    await client.query(`
+      CREATE INDEX IF NOT EXISTS idx_proxy_test_jobs_status ON proxy_test_jobs(status);
+      CREATE INDEX IF NOT EXISTS idx_proxy_test_jobs_created_at ON proxy_test_jobs(created_at DESC);
+    `);
     // Settings table
     await client.query(`

backend/dist/db/run-notifications-migration.js (vendored, new file, 56 lines)
@@ -0,0 +1,56 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("./migrate");
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
async function runNotificationsMigration() {
    const client = await migrate_1.pool.connect();
    try {
        console.log('Running notifications migration...');
        const migrationSQL = fs.readFileSync(path.join(__dirname, '../../migrations/005_notifications.sql'), 'utf-8');
        await client.query(migrationSQL);
        console.log('✅ Notifications migration completed successfully');
        process.exit(0);
    }
    catch (error) {
        console.error('❌ Migration failed:', error);
        process.exit(1);
    }
    finally {
        client.release();
    }
}
runNotificationsMigration();
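After a build, this script runs standalone. It resolves the SQL file relative to the compiled output (`path.join(__dirname, '../../migrations/005_notifications.sql')`), so migrations/005_notifications.sql must exist two directory levels above dist/db:

    node dist/db/run-notifications-migration.js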

backend/dist/dutchie-az/config/dutchie.js (vendored, new file, 106 lines)
@@ -0,0 +1,106 @@
"use strict";
/**
 * Dutchie Configuration
 *
 * Centralized configuration for Dutchie GraphQL API interaction.
 * Update hashes here when Dutchie changes their persisted query system.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = exports.dutchieConfig = void 0;
exports.dutchieConfig = {
    // ============================================================
    // GRAPHQL ENDPOINT
    // ============================================================
    /** GraphQL endpoint - must be the api-3 graphql endpoint (NOT api-gw.dutchie.com which no longer exists) */
    graphqlEndpoint: 'https://dutchie.com/api-3/graphql',
    // ============================================================
    // GRAPHQL PERSISTED QUERY HASHES
    // ============================================================
    //
    // These hashes identify specific GraphQL operations.
    // If Dutchie changes their schema, you may need to capture
    // new hashes from live browser traffic (Network tab → graphql requests).
    /** FilteredProducts - main product listing query */
    filteredProductsHash: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
    /** GetAddressBasedDispensaryData - resolve slug to internal ID */
    getDispensaryDataHash: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
    /**
     * ConsumerDispensaries - geo-based discovery
     * NOTE: This is a placeholder guess. If discovery fails, either:
     * 1. Capture the real hash from live traffic
     * 2. Rely on known AZDHS slugs instead (set useDiscovery: false)
     */
    consumerDispensariesHash: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
    // ============================================================
    // BEHAVIOR FLAGS
    // ============================================================
    /** Enable geo-based discovery (false = use known AZDHS slugs only) */
    useDiscovery: true,
    /** Prefer GET requests (true) or POST (false). GET is default. */
    preferGet: true,
    /**
     * Enable POST fallback when GET fails with 405 or blocked.
     * If true, will retry failed GETs as POSTs.
     */
    enablePostFallback: true,
    // ============================================================
    // PAGINATION & RETRY
    // ============================================================
    /** Products per page for pagination */
    perPage: 100,
    /** Maximum pages to fetch (safety limit) */
    maxPages: 200,
    /** Number of retries for failed page fetches */
    maxRetries: 1,
    /** Delay between pages in ms */
    pageDelayMs: 500,
    /** Delay between modes in ms */
    modeDelayMs: 2000,
    // ============================================================
    // HTTP HEADERS
    // ============================================================
    /** Default headers to mimic browser requests */
    defaultHeaders: {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en-US,en;q=0.9',
        'apollographql-client-name': 'Marketplace (production)',
    },
    /** User agent string */
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    // ============================================================
    // BROWSER LAUNCH OPTIONS
    // ============================================================
    browserArgs: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
    ],
    /** Navigation timeout in ms */
    navigationTimeout: 60000,
    /** Initial page load delay in ms */
    pageLoadDelay: 2000,
};
/**
 * Get GraphQL hashes object for backward compatibility
 */
exports.GRAPHQL_HASHES = {
    FilteredProducts: exports.dutchieConfig.filteredProductsHash,
    GetAddressBasedDispensaryData: exports.dutchieConfig.getDispensaryDataHash,
    ConsumerDispensaries: exports.dutchieConfig.consumerDispensariesHash,
};
/**
 * Arizona geo centerpoints for discovery scans
 */
exports.ARIZONA_CENTERPOINTS = [
    { name: 'Phoenix', lat: 33.4484, lng: -112.074 },
    { name: 'Tucson', lat: 32.2226, lng: -110.9747 },
    { name: 'Flagstaff', lat: 35.1983, lng: -111.6513 },
    { name: 'Mesa', lat: 33.4152, lng: -111.8315 },
    { name: 'Scottsdale', lat: 33.4942, lng: -111.9261 },
    { name: 'Tempe', lat: 33.4255, lng: -111.94 },
    { name: 'Yuma', lat: 32.6927, lng: -114.6277 },
    { name: 'Prescott', lat: 34.54, lng: -112.4685 },
    { name: 'Lake Havasu', lat: 34.4839, lng: -114.3224 },
    { name: 'Sierra Vista', lat: 31.5455, lng: -110.2773 },
];
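For reference, persisted-query hashes like these are conventionally sent in the `extensions` parameter of a GraphQL GET request (the standard Apollo persisted-query shape). The crawler's actual request construction lives in graphql-client.js, which is not in this section, so treat this as a sketch; `variables` is a placeholder:

    // Sketch: building a persisted-query GET URL from dutchieConfig (assumed request shape).
    const { dutchieConfig } = require('./config/dutchie');
    const variables = { /* FilteredProducts query variables (placeholder) */ };
    const extensions = {
        persistedQuery: { version: 1, sha256Hash: dutchieConfig.filteredProductsHash },
    };
    const url = dutchieConfig.graphqlEndpoint +
        '?operationName=FilteredProducts' +
        '&variables=' + encodeURIComponent(JSON.stringify(variables)) +
        '&extensions=' + encodeURIComponent(JSON.stringify(extensions));
    // fetch(url, { headers: dutchieConfig.defaultHeaders })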

backend/dist/dutchie-az/db/connection.js (vendored, new file, 79 lines)
@@ -0,0 +1,79 @@
"use strict";
/**
 * Dutchie AZ Database Connection
 *
 * Isolated database connection for Dutchie Arizona data.
 * Uses a separate database/schema to prevent cross-contamination with main app data.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.getDutchieAZPool = getDutchieAZPool;
exports.query = query;
exports.getClient = getClient;
exports.closePool = closePool;
exports.healthCheck = healthCheck;
const pg_1 = require("pg");
// Consolidated DB naming:
// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
// - Then DUTCHIE_AZ_DATABASE_URL (legacy)
// - Finally DATABASE_URL (legacy main DB)
const DUTCHIE_AZ_DATABASE_URL = process.env.CRAWLSY_DATABASE_URL ||
    process.env.DUTCHIE_AZ_DATABASE_URL ||
    process.env.DATABASE_URL ||
    'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
let pool = null;
/**
 * Get the Dutchie AZ database pool (singleton)
 */
function getDutchieAZPool() {
    if (!pool) {
        pool = new pg_1.Pool({
            connectionString: DUTCHIE_AZ_DATABASE_URL,
            max: 10,
            idleTimeoutMillis: 30000,
            connectionTimeoutMillis: 5000,
        });
        pool.on('error', (err) => {
            console.error('[DutchieAZ DB] Unexpected error on idle client:', err);
        });
        console.log('[DutchieAZ DB] Pool initialized');
    }
    return pool;
}
/**
 * Execute a query on the Dutchie AZ database
 */
async function query(text, params) {
    const p = getDutchieAZPool();
    const result = await p.query(text, params);
    return { rows: result.rows, rowCount: result.rowCount || 0 };
}
/**
 * Get a client from the pool for transaction use
 */
async function getClient() {
    const p = getDutchieAZPool();
    return p.connect();
}
/**
 * Close the pool connection
 */
async function closePool() {
    if (pool) {
        await pool.end();
        pool = null;
        console.log('[DutchieAZ DB] Pool closed');
    }
}
/**
 * Check if the database is accessible
 */
async function healthCheck() {
    try {
        const result = await query('SELECT 1 as ok');
        return result.rows.length > 0 && result.rows[0].ok === 1;
    }
    catch (error) {
        console.error('[DutchieAZ DB] Health check failed:', error);
        return false;
    }
}
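getClient() exists so callers can run multi-statement transactions on a single pooled connection. A typical usage sketch (standard node-postgres transaction pattern, not part of this commit):

    const { getClient } = require('./connection');
    // Sketch: run a callback inside BEGIN/COMMIT, rolling back on error.
    async function withTransaction(fn) {
        const client = await getClient();
        try {
            await client.query('BEGIN');
            const result = await fn(client);
            await client.query('COMMIT');
            return result;
        }
        catch (err) {
            await client.query('ROLLBACK');
            throw err;
        }
        finally {
            client.release();
        }
    }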

backend/dist/dutchie-az/db/migrate.js (vendored, new file, 30 lines)
@@ -0,0 +1,30 @@
"use strict";
/**
 * Dutchie AZ Schema Bootstrap
 *
 * Run this to create/update the dutchie_az tables (dutchie_products, dutchie_product_snapshots, etc.)
 * in the AZ pipeline database. This is separate from the legacy schema.
 *
 * Usage:
 *   TS_NODE_TRANSPILE_ONLY=1 npx ts-node src/dutchie-az/db/migrate.ts
 *   or (after build)
 *   node dist/dutchie-az/db/migrate.js
 */
Object.defineProperty(exports, "__esModule", { value: true });
const schema_1 = require("./schema");
const connection_1 = require("./connection");
async function main() {
    try {
        console.log('[DutchieAZ] Running schema migration...');
        await (0, schema_1.createSchema)();
        console.log('[DutchieAZ] Schema migration complete.');
    }
    catch (err) {
        console.error('[DutchieAZ] Schema migration failed:', err.message);
        process.exitCode = 1;
    }
    finally {
        await (0, connection_1.closePool)();
    }
}
main();

backend/dist/dutchie-az/db/schema.js (vendored, new file, 405 lines)
@@ -0,0 +1,405 @@
"use strict";
/**
 * Dutchie AZ Database Schema
 *
 * Creates all tables for the isolated Dutchie Arizona data pipeline.
 * Run this to initialize the dutchie_az database.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.createSchema = createSchema;
exports.dropSchema = dropSchema;
exports.schemaExists = schemaExists;
exports.ensureSchema = ensureSchema;
const connection_1 = require("./connection");
/**
 * SQL statements to create all tables
 */
const SCHEMA_SQL = `
-- ============================================================
-- DISPENSARIES TABLE
-- Stores discovered Dutchie dispensaries in Arizona
-- ============================================================
CREATE TABLE IF NOT EXISTS dispensaries (
  id SERIAL PRIMARY KEY,
  platform VARCHAR(20) NOT NULL DEFAULT 'dutchie',
  name VARCHAR(255) NOT NULL,
  slug VARCHAR(255) NOT NULL,
  city VARCHAR(100) NOT NULL,
  state VARCHAR(10) NOT NULL DEFAULT 'AZ',
  postal_code VARCHAR(20),
  address TEXT,
  latitude DECIMAL(10, 7),
  longitude DECIMAL(10, 7),
  platform_dispensary_id VARCHAR(100),
  is_delivery BOOLEAN DEFAULT false,
  is_pickup BOOLEAN DEFAULT true,
  raw_metadata JSONB,
  last_crawled_at TIMESTAMPTZ,
  product_count INTEGER DEFAULT 0,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW(),

  CONSTRAINT uk_dispensaries_platform_slug UNIQUE (platform, slug, city, state)
);

CREATE INDEX IF NOT EXISTS idx_dispensaries_platform ON dispensaries(platform);
CREATE INDEX IF NOT EXISTS idx_dispensaries_platform_id ON dispensaries(platform_dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dispensaries_state ON dispensaries(state);
CREATE INDEX IF NOT EXISTS idx_dispensaries_city ON dispensaries(city);

-- ============================================================
-- DUTCHIE_PRODUCTS TABLE
-- Canonical product identity per store
-- ============================================================
CREATE TABLE IF NOT EXISTS dutchie_products (
  id SERIAL PRIMARY KEY,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  platform VARCHAR(20) NOT NULL DEFAULT 'dutchie',

  external_product_id VARCHAR(100) NOT NULL,
  platform_dispensary_id VARCHAR(100) NOT NULL,
  c_name VARCHAR(500),
  name VARCHAR(500) NOT NULL,

  -- Brand
  brand_name VARCHAR(255),
  brand_id VARCHAR(100),
  brand_logo_url TEXT,

  -- Classification
  type VARCHAR(100),
  subcategory VARCHAR(100),
  strain_type VARCHAR(50),
  provider VARCHAR(100),

  -- Potency
  thc DECIMAL(10, 4),
  thc_content DECIMAL(10, 4),
  cbd DECIMAL(10, 4),
  cbd_content DECIMAL(10, 4),
  cannabinoids_v2 JSONB,
  effects JSONB,

  -- Status / flags
  status VARCHAR(50),
  medical_only BOOLEAN DEFAULT false,
  rec_only BOOLEAN DEFAULT false,
  featured BOOLEAN DEFAULT false,
  coming_soon BOOLEAN DEFAULT false,
  certificate_of_analysis_enabled BOOLEAN DEFAULT false,

  is_below_threshold BOOLEAN DEFAULT false,
  is_below_kiosk_threshold BOOLEAN DEFAULT false,
  options_below_threshold BOOLEAN DEFAULT false,
  options_below_kiosk_threshold BOOLEAN DEFAULT false,

  -- Derived stock status: 'in_stock', 'out_of_stock', 'unknown'
  stock_status VARCHAR(20) DEFAULT 'unknown',
  total_quantity_available INTEGER DEFAULT 0,

  -- Images
  primary_image_url TEXT,
  images JSONB,

  -- Misc
  measurements JSONB,
  weight VARCHAR(50),
  past_c_names TEXT[],

  created_at_dutchie TIMESTAMPTZ,
  updated_at_dutchie TIMESTAMPTZ,

  latest_raw_payload JSONB,

  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW(),

  CONSTRAINT uk_dutchie_products UNIQUE (dispensary_id, external_product_id)
);

CREATE INDEX IF NOT EXISTS idx_dutchie_products_dispensary ON dutchie_products(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_external_id ON dutchie_products(external_product_id);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_platform_disp ON dutchie_products(platform_dispensary_id);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_brand ON dutchie_products(brand_name);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_type ON dutchie_products(type);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_subcategory ON dutchie_products(subcategory);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_status ON dutchie_products(status);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_strain ON dutchie_products(strain_type);
CREATE INDEX IF NOT EXISTS idx_dutchie_products_stock_status ON dutchie_products(stock_status);

-- ============================================================
-- DUTCHIE_PRODUCT_SNAPSHOTS TABLE
-- Historical state per crawl, includes options[]
-- ============================================================
CREATE TABLE IF NOT EXISTS dutchie_product_snapshots (
  id SERIAL PRIMARY KEY,
  dutchie_product_id INTEGER NOT NULL REFERENCES dutchie_products(id) ON DELETE CASCADE,
  dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
  platform_dispensary_id VARCHAR(100) NOT NULL,
  external_product_id VARCHAR(100) NOT NULL,
  pricing_type VARCHAR(20) DEFAULT 'unknown',
  crawl_mode VARCHAR(20) DEFAULT 'mode_a', -- 'mode_a' (UI parity) or 'mode_b' (max coverage)

  status VARCHAR(50),
  featured BOOLEAN DEFAULT false,
  special BOOLEAN DEFAULT false,
  medical_only BOOLEAN DEFAULT false,
  rec_only BOOLEAN DEFAULT false,

  -- Flag indicating if product was present in feed (false = missing_from_feed snapshot)
  is_present_in_feed BOOLEAN DEFAULT true,

  -- Derived stock status
  stock_status VARCHAR(20) DEFAULT 'unknown',

  -- Price summary (in cents)
  rec_min_price_cents INTEGER,
  rec_max_price_cents INTEGER,
  rec_min_special_price_cents INTEGER,
  med_min_price_cents INTEGER,
  med_max_price_cents INTEGER,
  med_min_special_price_cents INTEGER,
  wholesale_min_price_cents INTEGER,

  -- Inventory summary
  total_quantity_available INTEGER,
  total_kiosk_quantity_available INTEGER,
  manual_inventory BOOLEAN DEFAULT false,
  is_below_threshold BOOLEAN DEFAULT false,
  is_below_kiosk_threshold BOOLEAN DEFAULT false,

  -- Option-level data (from POSMetaData.children)
  options JSONB,

  -- Full raw product node
  raw_payload JSONB NOT NULL,

  crawled_at TIMESTAMPTZ NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_snapshots_product ON dutchie_product_snapshots(dutchie_product_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary ON dutchie_product_snapshots(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_crawled_at ON dutchie_product_snapshots(crawled_at);
CREATE INDEX IF NOT EXISTS idx_snapshots_platform_disp ON dutchie_product_snapshots(platform_dispensary_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON dutchie_product_snapshots(external_product_id);
CREATE INDEX IF NOT EXISTS idx_snapshots_special ON dutchie_product_snapshots(special) WHERE special = true;
CREATE INDEX IF NOT EXISTS idx_snapshots_stock_status ON dutchie_product_snapshots(stock_status);
CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_mode ON dutchie_product_snapshots(crawl_mode);

-- ============================================================
-- CRAWL_JOBS TABLE
-- Tracks crawl execution status
-- ============================================================
CREATE TABLE IF NOT EXISTS crawl_jobs (
  id SERIAL PRIMARY KEY,
  job_type VARCHAR(50) NOT NULL,
  dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL,
  status VARCHAR(20) NOT NULL DEFAULT 'pending',
  started_at TIMESTAMPTZ,
  completed_at TIMESTAMPTZ,
  error_message TEXT,
  products_found INTEGER,
  snapshots_created INTEGER,
  metadata JSONB,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_crawl_jobs_type ON crawl_jobs(job_type);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_dispensary ON crawl_jobs(dispensary_id);
CREATE INDEX IF NOT EXISTS idx_crawl_jobs_created ON crawl_jobs(created_at);

-- ============================================================
-- JOB_SCHEDULES TABLE
-- Stores schedule configuration for recurring jobs with jitter support
-- Each job has independent timing that "wanders" over time
-- ============================================================
CREATE TABLE IF NOT EXISTS job_schedules (
  id SERIAL PRIMARY KEY,
  job_name VARCHAR(100) NOT NULL UNIQUE,
  description TEXT,
  enabled BOOLEAN DEFAULT true,

  -- Timing configuration (jitter makes times "wander")
  base_interval_minutes INTEGER NOT NULL DEFAULT 240, -- e.g., 4 hours
  jitter_minutes INTEGER NOT NULL DEFAULT 30, -- e.g., ±30 min

  -- Last run tracking
  last_run_at TIMESTAMPTZ,
  last_status VARCHAR(20), -- 'success', 'error', 'partial', 'running'
  last_error_message TEXT,
  last_duration_ms INTEGER,

  -- Next run (calculated with jitter after each run)
  next_run_at TIMESTAMPTZ,

  -- Additional config
  job_config JSONB, -- e.g., { pricingType: 'rec', useBothModes: true }

  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_job_schedules_enabled ON job_schedules(enabled);
CREATE INDEX IF NOT EXISTS idx_job_schedules_next_run ON job_schedules(next_run_at);

-- ============================================================
-- JOB_RUN_LOGS TABLE
-- Stores history of job runs for monitoring
-- ============================================================
CREATE TABLE IF NOT EXISTS job_run_logs (
  id SERIAL PRIMARY KEY,
  schedule_id INTEGER NOT NULL REFERENCES job_schedules(id) ON DELETE CASCADE,
  job_name VARCHAR(100) NOT NULL,
  status VARCHAR(20) NOT NULL, -- 'pending', 'running', 'success', 'error', 'partial'
  started_at TIMESTAMPTZ,
  completed_at TIMESTAMPTZ,
  duration_ms INTEGER,
  error_message TEXT,

  -- Results summary
  items_processed INTEGER,
  items_succeeded INTEGER,
  items_failed INTEGER,

  metadata JSONB, -- Additional run details

  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_job_run_logs_schedule ON job_run_logs(schedule_id);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_job_name ON job_run_logs(job_name);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_status ON job_run_logs(status);
CREATE INDEX IF NOT EXISTS idx_job_run_logs_created ON job_run_logs(created_at);

-- ============================================================
-- VIEWS FOR EASY QUERYING
-- ============================================================

-- Categories derived from products
CREATE OR REPLACE VIEW v_categories AS
SELECT
  type,
  subcategory,
  COUNT(DISTINCT id) as product_count,
  COUNT(DISTINCT dispensary_id) as dispensary_count,
  AVG(thc) as avg_thc,
  MIN(thc) as min_thc,
  MAX(thc) as max_thc
FROM dutchie_products
WHERE type IS NOT NULL
GROUP BY type, subcategory
ORDER BY type, subcategory;

-- Brands derived from products
CREATE OR REPLACE VIEW v_brands AS
SELECT
  brand_name,
  brand_id,
  MAX(brand_logo_url) as brand_logo_url,
  COUNT(DISTINCT id) as product_count,
  COUNT(DISTINCT dispensary_id) as dispensary_count,
  ARRAY_AGG(DISTINCT type) FILTER (WHERE type IS NOT NULL) as product_types
FROM dutchie_products
WHERE brand_name IS NOT NULL
GROUP BY brand_name, brand_id
ORDER BY product_count DESC;

-- Latest snapshot per product (most recent crawl data)
CREATE OR REPLACE VIEW v_latest_snapshots AS
SELECT DISTINCT ON (dutchie_product_id)
  s.*
FROM dutchie_product_snapshots s
ORDER BY dutchie_product_id, crawled_at DESC;

-- Dashboard stats
CREATE OR REPLACE VIEW v_dashboard_stats AS
SELECT
  (SELECT COUNT(*) FROM dispensaries WHERE state = 'AZ') as dispensary_count,
  (SELECT COUNT(*) FROM dutchie_products) as product_count,
  (SELECT COUNT(*) FROM dutchie_product_snapshots WHERE crawled_at > NOW() - INTERVAL '24 hours') as snapshots_24h,
  (SELECT MAX(crawled_at) FROM dutchie_product_snapshots) as last_crawl_time,
  (SELECT COUNT(*) FROM crawl_jobs WHERE status = 'failed' AND created_at > NOW() - INTERVAL '24 hours') as failed_jobs_24h,
  (SELECT COUNT(DISTINCT brand_name) FROM dutchie_products WHERE brand_name IS NOT NULL) as brand_count,
  (SELECT COUNT(DISTINCT (type, subcategory)) FROM dutchie_products WHERE type IS NOT NULL) as category_count;
`;
/**
 * Run the schema migration
 */
async function createSchema() {
    console.log('[DutchieAZ Schema] Creating database schema...');
    const client = await (0, connection_1.getClient)();
    try {
        await client.query('BEGIN');
        // Split into individual statements and execute
        const statements = SCHEMA_SQL
            .split(';')
            .map(s => s.trim())
            .filter(s => s.length > 0 && !s.startsWith('--'));
        for (const statement of statements) {
            if (statement.trim()) {
                await client.query(statement + ';');
            }
        }
        await client.query('COMMIT');
        console.log('[DutchieAZ Schema] Schema created successfully');
    }
    catch (error) {
        await client.query('ROLLBACK');
        console.error('[DutchieAZ Schema] Failed to create schema:', error);
        throw error;
    }
    finally {
        client.release();
    }
}
/**
 * Drop all tables (for development/testing)
 */
async function dropSchema() {
    console.log('[DutchieAZ Schema] Dropping all tables...');
    await (0, connection_1.query)(`
        DROP VIEW IF EXISTS v_dashboard_stats CASCADE;
        DROP VIEW IF EXISTS v_latest_snapshots CASCADE;
        DROP VIEW IF EXISTS v_brands CASCADE;
        DROP VIEW IF EXISTS v_categories CASCADE;
        DROP TABLE IF EXISTS crawl_schedule CASCADE;
        DROP TABLE IF EXISTS crawl_jobs CASCADE;
        DROP TABLE IF EXISTS dutchie_product_snapshots CASCADE;
        DROP TABLE IF EXISTS dutchie_products CASCADE;
        DROP TABLE IF EXISTS dispensaries CASCADE;
    `);
    console.log('[DutchieAZ Schema] All tables dropped');
}
/**
 * Check if schema exists
 */
async function schemaExists() {
    try {
        const result = await (0, connection_1.query)(`
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_name = 'dispensaries'
            ) as exists
        `);
        return result.rows[0]?.exists === true;
    }
    catch (error) {
        return false;
    }
}
/**
 * Initialize schema if it doesn't exist
 */
async function ensureSchema() {
    const exists = await schemaExists();
    if (!exists) {
        await createSchema();
    }
    else {
        console.log('[DutchieAZ Schema] Schema already exists');
    }
}
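The job_schedules timing columns imply a next-run calculation along these lines. This is a sketch: the real logic lives in services/scheduler.js, which is not part of this section, and the symmetric ±jitter is an assumption based on the column comments:

    // Sketch: combine base_interval_minutes with a random ±jitter_minutes offset
    // so each schedule's next_run_at "wanders" rather than firing at fixed times.
    function computeNextRunAt(baseIntervalMinutes, jitterMinutes, from = new Date()) {
        const jitter = (Math.random() * 2 - 1) * jitterMinutes; // uniform in [-jitter, +jitter]
        return new Date(from.getTime() + (baseIntervalMinutes + jitter) * 60 * 1000);
    }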

backend/dist/dutchie-az/index.js (vendored, new file, 95 lines)
@@ -0,0 +1,95 @@
"use strict";
/**
 * Dutchie AZ Data Pipeline
 *
 * Isolated data pipeline for crawling and storing Dutchie Arizona dispensary data.
 * This module is completely separate from the main application database.
 *
 * Features:
 * - Two-mode crawling (Mode A: UI parity, Mode B: MAX COVERAGE)
 * - Derived stockStatus field (in_stock, out_of_stock, unknown)
 * - Full raw payload storage for 100% data preservation
 * - AZDHS dispensary list as canonical source
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __exportStar = (this && this.__exportStar) || function(m, exports) {
    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
};
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.dutchieAZRouter = exports.getImportStats = exports.importFromJSON = exports.importAZDHSDispensaries = exports.getRunLogs = exports.initializeDefaultSchedules = exports.triggerScheduleNow = exports.deleteSchedule = exports.updateSchedule = exports.createSchedule = exports.getScheduleById = exports.getAllSchedules = exports.crawlSingleDispensary = exports.getSchedulerStatus = exports.triggerImmediateCrawl = exports.stopScheduler = exports.startScheduler = exports.crawlAllArizonaDispensaries = exports.crawlDispensaryProducts = exports.normalizeSnapshot = exports.normalizeProduct = exports.getDispensariesWithPlatformIds = exports.getDispensaryById = exports.getAllDispensaries = exports.resolvePlatformDispensaryIds = exports.discoverAndSaveDispensaries = exports.importFromExistingDispensaries = exports.discoverDispensaries = exports.discoverArizonaDispensaries = exports.fetchAllProductsBothModes = exports.fetchAllProducts = exports.resolveDispensaryId = exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = exports.ensureSchema = exports.schemaExists = exports.dropSchema = exports.createSchema = exports.healthCheck = exports.closePool = exports.getClient = exports.query = exports.getDutchieAZPool = void 0;
// Types
__exportStar(require("./types"), exports);
// Database
var connection_1 = require("./db/connection");
Object.defineProperty(exports, "getDutchieAZPool", { enumerable: true, get: function () { return connection_1.getDutchieAZPool; } });
Object.defineProperty(exports, "query", { enumerable: true, get: function () { return connection_1.query; } });
Object.defineProperty(exports, "getClient", { enumerable: true, get: function () { return connection_1.getClient; } });
Object.defineProperty(exports, "closePool", { enumerable: true, get: function () { return connection_1.closePool; } });
Object.defineProperty(exports, "healthCheck", { enumerable: true, get: function () { return connection_1.healthCheck; } });
var schema_1 = require("./db/schema");
Object.defineProperty(exports, "createSchema", { enumerable: true, get: function () { return schema_1.createSchema; } });
Object.defineProperty(exports, "dropSchema", { enumerable: true, get: function () { return schema_1.dropSchema; } });
Object.defineProperty(exports, "schemaExists", { enumerable: true, get: function () { return schema_1.schemaExists; } });
Object.defineProperty(exports, "ensureSchema", { enumerable: true, get: function () { return schema_1.ensureSchema; } });
// Services - GraphQL Client
var graphql_client_1 = require("./services/graphql-client");
Object.defineProperty(exports, "GRAPHQL_HASHES", { enumerable: true, get: function () { return graphql_client_1.GRAPHQL_HASHES; } });
Object.defineProperty(exports, "ARIZONA_CENTERPOINTS", { enumerable: true, get: function () { return graphql_client_1.ARIZONA_CENTERPOINTS; } });
Object.defineProperty(exports, "resolveDispensaryId", { enumerable: true, get: function () { return graphql_client_1.resolveDispensaryId; } });
Object.defineProperty(exports, "fetchAllProducts", { enumerable: true, get: function () { return graphql_client_1.fetchAllProducts; } });
Object.defineProperty(exports, "fetchAllProductsBothModes", { enumerable: true, get: function () { return graphql_client_1.fetchAllProductsBothModes; } });
Object.defineProperty(exports, "discoverArizonaDispensaries", { enumerable: true, get: function () { return graphql_client_1.discoverArizonaDispensaries; } });
// Alias for backward compatibility
Object.defineProperty(exports, "discoverDispensaries", { enumerable: true, get: function () { return graphql_client_1.discoverArizonaDispensaries; } });
// Services - Discovery
var discovery_1 = require("./services/discovery");
Object.defineProperty(exports, "importFromExistingDispensaries", { enumerable: true, get: function () { return discovery_1.importFromExistingDispensaries; } });
Object.defineProperty(exports, "discoverAndSaveDispensaries", { enumerable: true, get: function () { return discovery_1.discoverDispensaries; } });
Object.defineProperty(exports, "resolvePlatformDispensaryIds", { enumerable: true, get: function () { return discovery_1.resolvePlatformDispensaryIds; } });
Object.defineProperty(exports, "getAllDispensaries", { enumerable: true, get: function () { return discovery_1.getAllDispensaries; } });
Object.defineProperty(exports, "getDispensaryById", { enumerable: true, get: function () { return discovery_1.getDispensaryById; } });
Object.defineProperty(exports, "getDispensariesWithPlatformIds", { enumerable: true, get: function () { return discovery_1.getDispensariesWithPlatformIds; } });
// Services - Product Crawler
var product_crawler_1 = require("./services/product-crawler");
Object.defineProperty(exports, "normalizeProduct", { enumerable: true, get: function () { return product_crawler_1.normalizeProduct; } });
Object.defineProperty(exports, "normalizeSnapshot", { enumerable: true, get: function () { return product_crawler_1.normalizeSnapshot; } });
Object.defineProperty(exports, "crawlDispensaryProducts", { enumerable: true, get: function () { return product_crawler_1.crawlDispensaryProducts; } });
Object.defineProperty(exports, "crawlAllArizonaDispensaries", { enumerable: true, get: function () { return product_crawler_1.crawlAllArizonaDispensaries; } });
// Services - Scheduler
var scheduler_1 = require("./services/scheduler");
Object.defineProperty(exports, "startScheduler", { enumerable: true, get: function () { return scheduler_1.startScheduler; } });
Object.defineProperty(exports, "stopScheduler", { enumerable: true, get: function () { return scheduler_1.stopScheduler; } });
Object.defineProperty(exports, "triggerImmediateCrawl", { enumerable: true, get: function () { return scheduler_1.triggerImmediateCrawl; } });
Object.defineProperty(exports, "getSchedulerStatus", { enumerable: true, get: function () { return scheduler_1.getSchedulerStatus; } });
Object.defineProperty(exports, "crawlSingleDispensary", { enumerable: true, get: function () { return scheduler_1.crawlSingleDispensary; } });
// Schedule config CRUD
Object.defineProperty(exports, "getAllSchedules", { enumerable: true, get: function () { return scheduler_1.getAllSchedules; } });
Object.defineProperty(exports, "getScheduleById", { enumerable: true, get: function () { return scheduler_1.getScheduleById; } });
Object.defineProperty(exports, "createSchedule", { enumerable: true, get: function () { return scheduler_1.createSchedule; } });
Object.defineProperty(exports, "updateSchedule", { enumerable: true, get: function () { return scheduler_1.updateSchedule; } });
Object.defineProperty(exports, "deleteSchedule", { enumerable: true, get: function () { return scheduler_1.deleteSchedule; } });
Object.defineProperty(exports, "triggerScheduleNow", { enumerable: true, get: function () { return scheduler_1.triggerScheduleNow; } });
Object.defineProperty(exports, "initializeDefaultSchedules", { enumerable: true, get: function () { return scheduler_1.initializeDefaultSchedules; } });
// Run logs
Object.defineProperty(exports, "getRunLogs", { enumerable: true, get: function () { return scheduler_1.getRunLogs; } });
// Services - AZDHS Import
var azdhs_import_1 = require("./services/azdhs-import");
Object.defineProperty(exports, "importAZDHSDispensaries", { enumerable: true, get: function () { return azdhs_import_1.importAZDHSDispensaries; } });
Object.defineProperty(exports, "importFromJSON", { enumerable: true, get: function () { return azdhs_import_1.importFromJSON; } });
Object.defineProperty(exports, "getImportStats", { enumerable: true, get: function () { return azdhs_import_1.getImportStats; } });
// Routes
var routes_1 = require("./routes");
Object.defineProperty(exports, "dutchieAZRouter", { enumerable: true, get: function () { return __importDefault(routes_1).default; } });

backend/dist/dutchie-az/routes/index.js (vendored, new file, 1610 lines)

File diff suppressed because it is too large.

backend/dist/dutchie-az/services/azdhs-import.js (vendored, new file, 229 lines)
@@ -0,0 +1,229 @@
"use strict";
/**
 * AZDHS Import Service
 *
 * Imports Arizona dispensaries from the main database's dispensaries table
 * (which was populated from AZDHS data) into the isolated Dutchie AZ database.
 *
 * This establishes the canonical list of AZ dispensaries to match against Dutchie.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.importAZDHSDispensaries = importAZDHSDispensaries;
exports.importFromJSON = importFromJSON;
exports.getImportStats = getImportStats;
const pg_1 = require("pg");
const connection_1 = require("../db/connection");
// Main database connection (source of AZDHS data)
const MAIN_DATABASE_URL = process.env.DATABASE_URL ||
    'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
/**
 * Create a temporary connection to the main database
 */
function getMainDBPool() {
    return new pg_1.Pool({
        connectionString: MAIN_DATABASE_URL,
        max: 5,
        idleTimeoutMillis: 30000,
        connectionTimeoutMillis: 5000,
    });
}
/**
 * Fetch all AZ dispensaries from the main database
 */
async function fetchAZDHSDispensaries() {
    const pool = getMainDBPool();
    try {
        const result = await pool.query(`
            SELECT
                id, azdhs_id, name, company_name, address, city, state, zip,
                latitude, longitude, dba_name, phone, email, website,
                google_rating, google_review_count, slug,
                menu_provider, product_provider,
                created_at, updated_at
            FROM dispensaries
            WHERE state = 'AZ'
            ORDER BY id
        `);
        return result.rows;
    }
    finally {
        await pool.end();
    }
}
/**
 * Import a single dispensary into the Dutchie AZ database
 */
async function importDispensary(disp) {
    const result = await (0, connection_1.query)(`
        INSERT INTO dispensaries (
            platform, name, slug, city, state, postal_code, address,
            latitude, longitude, is_delivery, is_pickup, raw_metadata, updated_at
        ) VALUES (
            $1, $2, $3, $4, $5, $6, $7,
            $8, $9, $10, $11, $12, NOW()
        )
        ON CONFLICT (platform, slug, city, state) DO UPDATE SET
            name = EXCLUDED.name,
            postal_code = EXCLUDED.postal_code,
            address = EXCLUDED.address,
            latitude = EXCLUDED.latitude,
            longitude = EXCLUDED.longitude,
            raw_metadata = EXCLUDED.raw_metadata,
            updated_at = NOW()
        RETURNING id
    `, [
        'dutchie', // Will be updated when Dutchie match is found
        disp.dba_name || disp.name,
        disp.slug,
        disp.city,
        disp.state,
        disp.zip,
        disp.address,
        disp.latitude,
        disp.longitude,
        false, // is_delivery - unknown
        true, // is_pickup - assume true
        JSON.stringify({
            azdhs_id: disp.azdhs_id,
            main_db_id: disp.id,
            company_name: disp.company_name,
            phone: disp.phone,
            email: disp.email,
            website: disp.website,
            google_rating: disp.google_rating,
            google_review_count: disp.google_review_count,
            menu_provider: disp.menu_provider,
            product_provider: disp.product_provider,
        }),
    ]);
    return result.rows[0].id;
}
/**
 * Import all AZDHS dispensaries into the Dutchie AZ database
 */
async function importAZDHSDispensaries() {
    console.log('[AZDHS Import] Starting import from main database...');
    const result = {
        total: 0,
        imported: 0,
        skipped: 0,
        errors: [],
    };
    try {
        const dispensaries = await fetchAZDHSDispensaries();
        result.total = dispensaries.length;
        console.log(`[AZDHS Import] Found ${dispensaries.length} AZ dispensaries in main DB`);
        for (const disp of dispensaries) {
            try {
                const id = await importDispensary(disp);
                result.imported++;
                console.log(`[AZDHS Import] Imported: ${disp.name} (${disp.city}) -> id=${id}`);
            }
            catch (error) {
                if (error.message.includes('duplicate')) {
                    result.skipped++;
                }
                else {
                    result.errors.push(`${disp.name}: ${error.message}`);
                }
            }
        }
    }
    catch (error) {
        result.errors.push(`Failed to fetch from main DB: ${error.message}`);
    }
    console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped, ${result.errors.length} errors`);
    return result;
}
/**
 * Import dispensaries from JSON file (backup export)
 */
async function importFromJSON(jsonPath) {
    console.log(`[AZDHS Import] Importing from JSON: ${jsonPath}`);
    const result = {
        total: 0,
        imported: 0,
        skipped: 0,
        errors: [],
    };
    try {
        const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
        const data = await fs.readFile(jsonPath, 'utf-8');
        const dispensaries = JSON.parse(data);
        result.total = dispensaries.length;
        console.log(`[AZDHS Import] Found ${dispensaries.length} dispensaries in JSON file`);
        for (const disp of dispensaries) {
            try {
                const id = await importDispensary(disp);
                result.imported++;
            }
            catch (error) {
                if (error.message.includes('duplicate')) {
                    result.skipped++;
                }
                else {
                    result.errors.push(`${disp.name}: ${error.message}`);
                }
            }
        }
    }
    catch (error) {
        result.errors.push(`Failed to read JSON file: ${error.message}`);
    }
    console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped`);
    return result;
}
/**
 * Get import statistics
 */
async function getImportStats() {
    const { rows } = await (0, connection_1.query)(`
        SELECT
            COUNT(*) as total,
            COUNT(platform_dispensary_id) as with_platform_id,
            COUNT(*) - COUNT(platform_dispensary_id) as without_platform_id,
            MAX(updated_at) as last_updated
        FROM dispensaries
        WHERE state = 'AZ'
    `);
    const stats = rows[0];
    return {
        totalDispensaries: parseInt(stats.total, 10),
        withPlatformIds: parseInt(stats.with_platform_id, 10),
        withoutPlatformIds: parseInt(stats.without_platform_id, 10),
        lastImportedAt: stats.last_updated,
    };
}
380
backend/dist/dutchie-az/services/directory-matcher.js
vendored
Normal file
380
backend/dist/dutchie-az/services/directory-matcher.js
vendored
Normal file
@@ -0,0 +1,380 @@
"use strict";
/**
 * Directory-Based Store Matcher
 *
 * Scrapes provider directory pages (Curaleaf, Sol, etc.) to get store lists,
 * then matches them to existing dispensaries by fuzzy name/city/address matching.
 *
 * This allows us to:
 * 1. Find specific store URLs for directory-style websites
 * 2. Match stores confidently by name+city
 * 3. Mark non-Dutchie providers as not_crawlable until we build crawlers
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.scrapeSolDirectory = scrapeSolDirectory;
exports.scrapeCuraleafDirectory = scrapeCuraleafDirectory;
exports.matchDirectoryToDispensaries = matchDirectoryToDispensaries;
exports.previewDirectoryMatches = previewDirectoryMatches;
exports.applyHighConfidenceMatches = applyHighConfidenceMatches;
const connection_1 = require("../db/connection");
// ============================================================
// NORMALIZATION FUNCTIONS
// ============================================================
/**
 * Normalize a string for comparison:
 * - Lowercase
 * - Remove common suffixes (dispensary, cannabis, etc.)
 * - Remove punctuation
 * - Collapse whitespace
 */
function normalizeForComparison(str) {
    if (!str)
        return '';
    return str
        .toLowerCase()
        .replace(/\s+(dispensary|cannabis|marijuana|medical|recreational|shop|store|flower|wellness)(\s|$)/gi, ' ')
        .replace(/[^\w\s]/g, ' ') // Remove punctuation
        .replace(/\s+/g, ' ') // Collapse whitespace
        .trim();
}
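// For reference, two hypothetical inputs traced through the regexes above
// (not part of the original source):
//   normalizeForComparison('Sol Flower Dispensary - Tempe') -> 'sol flower tempe'
//   normalizeForComparison("Nature's Medicines")            -> 'nature s medicines' (apostrophe becomes a space)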
/**
 * Normalize city name for comparison
 */
function normalizeCity(city) {
    if (!city)
        return '';
    return city
        .toLowerCase()
        .replace(/[^\w\s]/g, '')
        .trim();
}
/**
 * Calculate similarity between two strings (0-1)
 * Uses Levenshtein distance normalized by max length
 */
function stringSimilarity(a, b) {
    if (!a || !b)
        return 0;
    if (a === b)
        return 1;
    const longer = a.length > b.length ? a : b;
    const shorter = a.length > b.length ? b : a;
    if (longer.length === 0)
        return 1;
    const distance = levenshteinDistance(longer, shorter);
    return (longer.length - distance) / longer.length;
}
/**
 * Levenshtein distance between two strings
 */
function levenshteinDistance(a, b) {
    const matrix = [];
    for (let i = 0; i <= b.length; i++) {
        matrix[i] = [i];
    }
    for (let j = 0; j <= a.length; j++) {
        matrix[0][j] = j;
    }
    for (let i = 1; i <= b.length; i++) {
        for (let j = 1; j <= a.length; j++) {
            if (b.charAt(i - 1) === a.charAt(j - 1)) {
                matrix[i][j] = matrix[i - 1][j - 1];
            }
            else {
                matrix[i][j] = Math.min(matrix[i - 1][j - 1] + 1, // substitution
                matrix[i][j - 1] + 1, // insertion
                matrix[i - 1][j] + 1 // deletion
                );
            }
        }
    }
    return matrix[b.length][a.length];
}
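// A quick sketch of the resulting score, with assumed arguments:
//   stringSimilarity('curaleaf', 'curaleaf') -> 1      (exact match short-circuits)
//   stringSimilarity('curaleaf', 'curaleap') -> 0.875  ((8 - 1) / 8, one substitution)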
/**
 * Check if string contains another (with normalization)
 */
function containsNormalized(haystack, needle) {
    return normalizeForComparison(haystack).includes(normalizeForComparison(needle));
}
// ============================================================
// PROVIDER DIRECTORY SCRAPERS
// ============================================================
/**
 * Sol Flower (livewithsol.com) - Static HTML, easy to scrape
 */
async function scrapeSolDirectory() {
    console.log('[DirectoryMatcher] Scraping Sol Flower directory...');
    try {
        const response = await fetch('https://www.livewithsol.com/locations/', {
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                Accept: 'text/html',
            },
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        const html = await response.text();
        // Extract store entries from HTML
        // Sol's structure: Each location has name, address in specific divs
        const stores = [];
        // Pattern to find location cards
        // Format: <a href="/locations/slug/">NAME</a> with address nearby
        const locationRegex = /<a[^>]+href="(\/locations\/[^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?(\d+[^<]+(?:Ave|St|Blvd|Dr|Rd|Way)[^<]*)/gi;
        let match;
        while ((match = locationRegex.exec(html)) !== null) {
            const [, path, name, address] = match;
            // Extract city from common Arizona cities
            let city = 'Unknown';
            const cityPatterns = [
                { pattern: /phoenix/i, city: 'Phoenix' },
                { pattern: /scottsdale/i, city: 'Scottsdale' },
                { pattern: /tempe/i, city: 'Tempe' },
                { pattern: /tucson/i, city: 'Tucson' },
                { pattern: /mesa/i, city: 'Mesa' },
                { pattern: /sun city/i, city: 'Sun City' },
                { pattern: /glendale/i, city: 'Glendale' },
            ];
            for (const { pattern, city: cityName } of cityPatterns) {
                if (pattern.test(name) || pattern.test(address)) {
                    city = cityName;
                    break;
                }
            }
            stores.push({
                name: name.trim(),
                city,
                state: 'AZ',
                address: address.trim(),
                storeUrl: `https://www.livewithsol.com${path}`,
            });
        }
        // If regex didn't work, use known hardcoded values (fallback)
        if (stores.length === 0) {
            console.log('[DirectoryMatcher] Using hardcoded Sol locations');
            return [
                { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
                { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
                { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
                { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
                { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
                { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
                { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
                { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
                { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
            ];
        }
        console.log(`[DirectoryMatcher] Found ${stores.length} Sol Flower locations`);
        return stores;
    }
    catch (error) {
        console.error('[DirectoryMatcher] Error scraping Sol directory:', error.message);
        // Return hardcoded fallback
        return [
            { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
            { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
            { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
            { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
            { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
            { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
            { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
            { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
            { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
        ];
    }
}
/**
 * Curaleaf - Has age-gate, so we need hardcoded AZ locations
 * In production, this would use Playwright to bypass age-gate
 */
async function scrapeCuraleafDirectory() {
    console.log('[DirectoryMatcher] Using hardcoded Curaleaf AZ locations (age-gate blocks simple fetch)...');
    // Hardcoded Arizona Curaleaf locations from public knowledge
    // These would be scraped via Playwright in production
    return [
        { name: 'Curaleaf Phoenix Camelback', city: 'Phoenix', state: 'AZ', address: '4811 E Camelback Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-camelback' },
        { name: 'Curaleaf Phoenix Midtown', city: 'Phoenix', state: 'AZ', address: '1928 E Highland Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-midtown' },
        { name: 'Curaleaf Glendale East', city: 'Glendale', state: 'AZ', address: '5150 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-east' },
        { name: 'Curaleaf Glendale West', city: 'Glendale', state: 'AZ', address: '6501 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-west' },
        { name: 'Curaleaf Gilbert', city: 'Gilbert', state: 'AZ', address: '1736 E Williams Field Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-gilbert' },
        { name: 'Curaleaf Mesa', city: 'Mesa', state: 'AZ', address: '1540 S Power Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-mesa' },
        { name: 'Curaleaf Tempe', city: 'Tempe', state: 'AZ', address: '1815 E Broadway Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tempe' },
        { name: 'Curaleaf Scottsdale', city: 'Scottsdale', state: 'AZ', address: '8904 E Indian Bend Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-scottsdale' },
        { name: 'Curaleaf Tucson Prince', city: 'Tucson', state: 'AZ', address: '3955 W Prince Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-prince' },
        { name: 'Curaleaf Tucson Midvale', city: 'Tucson', state: 'AZ', address: '2936 N Midvale Park Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-midvale' },
        { name: 'Curaleaf Sedona', city: 'Sedona', state: 'AZ', address: '525 AZ-179', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-sedona' },
        { name: 'Curaleaf Youngtown', city: 'Youngtown', state: 'AZ', address: '11125 W Grand Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-youngtown' },
    ];
}
/**
 * Match a directory store to an existing dispensary
 */
function matchStoreToDispensary(store, dispensaries) {
    const normalizedStoreName = normalizeForComparison(store.name);
    const normalizedStoreCity = normalizeCity(store.city);
    let bestMatch = null;
    let bestScore = 0;
    let matchReason = '';
    for (const disp of dispensaries) {
        const normalizedDispName = normalizeForComparison(disp.name);
        const normalizedDispCity = normalizeCity(disp.city || '');
        let score = 0;
        const reasons = [];
        // 1. Name similarity (max 50 points)
        const nameSimilarity = stringSimilarity(normalizedStoreName, normalizedDispName);
        score += nameSimilarity * 50;
        if (nameSimilarity > 0.8)
            reasons.push(`name_match(${(nameSimilarity * 100).toFixed(0)}%)`);
        // 2. City match (25 points for exact, 15 for partial)
        if (normalizedStoreCity && normalizedDispCity) {
            if (normalizedStoreCity === normalizedDispCity) {
                score += 25;
                reasons.push('city_exact');
            }
            else if (normalizedStoreCity.includes(normalizedDispCity) ||
                normalizedDispCity.includes(normalizedStoreCity)) {
                score += 15;
                reasons.push('city_partial');
            }
        }
        // 3. Address contains street name (15 points)
        if (store.address && disp.address) {
            const storeStreet = store.address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
            const dispStreet = disp.address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
            if (storeStreet && dispStreet && stringSimilarity(storeStreet, dispStreet) > 0.7) {
                score += 15;
                reasons.push('address_match');
            }
        }
        // 4. Brand name in dispensary name (10 points)
        const brandName = store.name.split(' ')[0].toLowerCase(); // e.g., "Curaleaf", "Sol"
        if (disp.name.toLowerCase().includes(brandName)) {
            score += 10;
            reasons.push('brand_match');
        }
        if (score > bestScore) {
            bestScore = score;
            bestMatch = disp;
            matchReason = reasons.join(', ');
        }
    }
    // Determine confidence level
    let confidence;
    if (bestScore >= 70) {
        confidence = 'high';
    }
    else if (bestScore >= 50) {
        confidence = 'medium';
    }
    else if (bestScore >= 30) {
        confidence = 'low';
    }
    else {
        confidence = 'none';
    }
    return {
        directoryStore: store,
        dispensaryId: bestMatch?.id || null,
        dispensaryName: bestMatch?.name || null,
        confidence,
        matchReason: matchReason || 'no_match',
    };
}
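// Worked example (assumed rows, arithmetic per the weights above):
//   store "Curaleaf Gilbert" vs dispensary "Curaleaf Dispensary Gilbert"
//     name:  both normalize to "curaleaf gilbert" -> 1.0 * 50 = 50
//     city:  "gilbert" === "gilbert"              -> +25
//     brand: "curaleaf" appears in disp.name      -> +10
//   total 85 >= 70 -> confidence "high"; a name+brand hit with no city lands at 60 ("medium")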
// ============================================================
// MAIN FUNCTIONS
// ============================================================
/**
 * Run directory matching for a provider and update database
 * Only applies high-confidence matches automatically
 */
async function matchDirectoryToDispensaries(provider, dryRun = true) {
    console.log(`[DirectoryMatcher] Running ${provider} directory matching (dryRun=${dryRun})...`);
    // Get directory stores
    let directoryStores;
    if (provider === 'curaleaf') {
        directoryStores = await scrapeCuraleafDirectory();
    }
    else if (provider === 'sol') {
        directoryStores = await scrapeSolDirectory();
    }
    else {
        throw new Error(`Unknown provider: ${provider}`);
    }
    // Get all AZ dispensaries from database
    const { rows: dispensaries } = await (0, connection_1.query)(`SELECT id, name, city, state, address, menu_type, menu_url, website
     FROM dispensaries
     WHERE state = 'AZ'`);
    console.log(`[DirectoryMatcher] Matching ${directoryStores.length} directory stores against ${dispensaries.length} dispensaries`);
    // Match each directory store
    const results = [];
    for (const store of directoryStores) {
        const match = matchStoreToDispensary(store, dispensaries);
        results.push(match);
        // Only apply high-confidence matches if not dry run
        if (!dryRun && match.confidence === 'high' && match.dispensaryId) {
            await applyDirectoryMatch(match.dispensaryId, provider, store);
        }
    }
    // Count results
    const report = {
        provider,
        totalDirectoryStores: directoryStores.length,
        highConfidenceMatches: results.filter((r) => r.confidence === 'high').length,
        mediumConfidenceMatches: results.filter((r) => r.confidence === 'medium').length,
        lowConfidenceMatches: results.filter((r) => r.confidence === 'low').length,
        unmatched: results.filter((r) => r.confidence === 'none').length,
        results,
    };
    console.log(`[DirectoryMatcher] ${provider} matching complete:`);
    console.log(`  - High confidence: ${report.highConfidenceMatches}`);
    console.log(`  - Medium confidence: ${report.mediumConfidenceMatches}`);
    console.log(`  - Low confidence: ${report.lowConfidenceMatches}`);
    console.log(`  - Unmatched: ${report.unmatched}`);
    return report;
}
/**
 * Apply a directory match to a dispensary
 */
async function applyDirectoryMatch(dispensaryId, provider, store) {
    console.log(`[DirectoryMatcher] Applying match: dispensary ${dispensaryId} -> ${store.storeUrl}`);
    await (0, connection_1.query)(`
    UPDATE dispensaries SET
      menu_type = $1,
      menu_url = $2,
      platform_dispensary_id = NULL,
      provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
        jsonb_build_object(
          'detected_provider', $1::text,
          'detection_method', 'directory_match'::text,
          'detected_at', NOW(),
          'directory_store_name', $3::text,
          'directory_store_url', $2::text,
          'directory_store_city', $4::text,
          'directory_store_address', $5::text,
          'not_crawlable', true,
          'not_crawlable_reason', $6::text
        ),
      updated_at = NOW()
    WHERE id = $7
  `, [
        provider,
        store.storeUrl,
        store.name,
        store.city,
        store.address,
        `${provider} proprietary menu - no crawler available`,
        dispensaryId,
    ]);
}
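// Note on the jsonb "||" concatenation used above: the right-hand object is merged
// into provider_detection_data and wins on duplicate keys. A minimal sketch with
// assumed literal values (not part of the original source):
async function demoJsonbMerge() {
    const { rows } = await (0, connection_1.query)(`SELECT '{"a":1,"b":2}'::jsonb || '{"b":3}'::jsonb AS merged`);
    console.log(rows[0].merged); // { a: 1, b: 3 }
}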
/**
 * Preview matches without applying them
 */
async function previewDirectoryMatches(provider) {
    return matchDirectoryToDispensaries(provider, true);
}
/**
 * Apply high-confidence matches
 */
async function applyHighConfidenceMatches(provider) {
    return matchDirectoryToDispensaries(provider, false);
}
487
backend/dist/dutchie-az/services/discovery.js
vendored
Normal file
487
backend/dist/dutchie-az/services/discovery.js
vendored
Normal file
@@ -0,0 +1,487 @@
"use strict";
/**
 * Dutchie AZ Discovery Service
 *
 * Discovers and manages dispensaries from Dutchie for Arizona.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.importFromExistingDispensaries = importFromExistingDispensaries;
exports.discoverDispensaries = discoverDispensaries;
exports.extractCNameFromMenuUrl = extractCNameFromMenuUrl;
exports.resolvePlatformDispensaryIds = resolvePlatformDispensaryIds;
exports.getAllDispensaries = getAllDispensaries;
exports.mapDbRowToDispensary = mapDbRowToDispensary;
exports.getDispensaryById = getDispensaryById;
exports.getDispensariesWithPlatformIds = getDispensariesWithPlatformIds;
exports.reResolveDispensaryPlatformId = reResolveDispensaryPlatformId;
exports.updateMenuUrlAndResolve = updateMenuUrlAndResolve;
exports.markDispensaryNotCrawlable = markDispensaryNotCrawlable;
exports.getDispensaryCName = getDispensaryCName;
const connection_1 = require("../db/connection");
const graphql_client_1 = require("./graphql-client");
/**
 * Upsert a dispensary record
 */
async function upsertDispensary(dispensary) {
    const result = await (0, connection_1.query)(`
    INSERT INTO dispensaries (
      platform, name, slug, city, state, postal_code, address,
      latitude, longitude, platform_dispensary_id,
      is_delivery, is_pickup, raw_metadata, updated_at
    ) VALUES (
      $1, $2, $3, $4, $5, $6, $7,
      $8, $9, $10,
      $11, $12, $13, NOW()
    )
    ON CONFLICT (platform, slug, city, state) DO UPDATE SET
      name = EXCLUDED.name,
      postal_code = EXCLUDED.postal_code,
      address = EXCLUDED.address,
      latitude = EXCLUDED.latitude,
      longitude = EXCLUDED.longitude,
      platform_dispensary_id = COALESCE(EXCLUDED.platform_dispensary_id, dispensaries.platform_dispensary_id),
      is_delivery = EXCLUDED.is_delivery,
      is_pickup = EXCLUDED.is_pickup,
      raw_metadata = EXCLUDED.raw_metadata,
      updated_at = NOW()
    RETURNING id
  `, [
        dispensary.platform || 'dutchie',
        dispensary.name,
        dispensary.slug,
        dispensary.city,
        dispensary.state || 'AZ',
        dispensary.postalCode,
        dispensary.address,
        dispensary.latitude,
        dispensary.longitude,
        dispensary.platformDispensaryId,
        dispensary.isDelivery || false,
        dispensary.isPickup ?? true, // default to pickup; was `|| true`, which always evaluated true and clobbered an explicit false
        dispensary.rawMetadata ? JSON.stringify(dispensary.rawMetadata) : null,
    ]);
    return result.rows[0].id;
}
/**
 * Normalize a raw discovery result to Dispensary
 */
function normalizeDispensary(raw) {
    return {
        platform: 'dutchie',
        name: raw.name || raw.Name || '',
        slug: raw.slug || raw.cName || raw.id || '',
        city: raw.city || raw.address?.city || '',
        state: 'AZ',
        postalCode: raw.postalCode || raw.address?.postalCode || raw.address?.zip,
        address: raw.streetAddress || raw.address?.streetAddress,
        latitude: raw.latitude || raw.location?.lat,
        longitude: raw.longitude || raw.location?.lng,
        platformDispensaryId: raw.dispensaryId || raw.id || null,
        isDelivery: raw.isDelivery || raw.delivery || false,
        isPickup: raw.isPickup ?? raw.pickup ?? true, // was `|| true`, which always evaluated true and discarded an explicit false
        rawMetadata: raw,
    };
}
/**
 * Import dispensaries from the existing dispensaries table (from AZDHS data)
 * This creates records in the dutchie_az database for AZ dispensaries
 */
async function importFromExistingDispensaries() {
    console.log('[Discovery] Importing from existing dispensaries table...');
    // This is a workaround - we'll use the dispensaries we already know about
    // and try to resolve their Dutchie IDs
    const knownDispensaries = [
        { name: 'Deeply Rooted', slug: 'AZ-Deeply-Rooted', city: 'Phoenix', state: 'AZ' },
        { name: 'Curaleaf Gilbert', slug: 'curaleaf-gilbert', city: 'Gilbert', state: 'AZ' },
        { name: 'Zen Leaf Prescott', slug: 'AZ-zen-leaf-prescott', city: 'Prescott', state: 'AZ' },
        // Add more known Dutchie stores here
    ];
    let imported = 0;
    for (const disp of knownDispensaries) {
        try {
            const id = await upsertDispensary({
                platform: 'dutchie',
                name: disp.name,
                slug: disp.slug,
                city: disp.city,
                state: disp.state,
            });
            imported++;
            console.log(`[Discovery] Imported: ${disp.name} (id=${id})`);
        }
        catch (error) {
            console.error(`[Discovery] Failed to import ${disp.name}:`, error.message);
        }
    }
    return { imported };
}
/**
 * Discover all Arizona Dutchie dispensaries via GraphQL
 */
async function discoverDispensaries() {
    console.log('[Discovery] Starting Arizona dispensary discovery...');
    const errors = [];
    let discovered = 0;
    try {
        const rawDispensaries = await (0, graphql_client_1.discoverArizonaDispensaries)();
        console.log(`[Discovery] Found ${rawDispensaries.length} dispensaries from GraphQL`);
        for (const raw of rawDispensaries) {
            try {
                const normalized = normalizeDispensary(raw);
                if (normalized.name && normalized.slug && normalized.city) {
                    await upsertDispensary(normalized);
                    discovered++;
                }
            }
            catch (error) {
                errors.push(`${raw.name || raw.slug}: ${error.message}`);
            }
        }
    }
    catch (error) {
        errors.push(`Discovery failed: ${error.message}`);
    }
    console.log(`[Discovery] Completed: ${discovered} dispensaries, ${errors.length} errors`);
    return { discovered, errors };
}
/**
 * Extract cName (slug) from a Dutchie menu_url
 * Supports formats:
 * - https://dutchie.com/embedded-menu/<cName>
 * - https://dutchie.com/dispensary/<cName>
 */
function extractCNameFromMenuUrl(menuUrl) {
    if (!menuUrl)
        return null;
    try {
        const url = new URL(menuUrl);
        const pathname = url.pathname;
        // Match /embedded-menu/<cName> or /dispensary/<cName>
        const embeddedMatch = pathname.match(/^\/embedded-menu\/([^/?]+)/);
        if (embeddedMatch)
            return embeddedMatch[1];
        const dispensaryMatch = pathname.match(/^\/dispensary\/([^/?]+)/);
        if (dispensaryMatch)
            return dispensaryMatch[1];
        return null;
    }
    catch {
        return null;
    }
}
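// Expected behavior for a few illustrative URLs (slug values are assumptions):
//   extractCNameFromMenuUrl('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted')           -> 'AZ-Deeply-Rooted'
//   extractCNameFromMenuUrl('https://dutchie.com/dispensary/curaleaf-gilbert?menuType=rec') -> 'curaleaf-gilbert'
//   extractCNameFromMenuUrl('https://example.com/menu')                                     -> null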
/**
 * Resolve platform dispensary IDs for all dispensaries that don't have one
 * CRITICAL: Uses cName extracted from menu_url, NOT the slug column!
 *
 * Uses the new resolveDispensaryIdWithDetails which:
 * 1. Extracts dispensaryId from window.reactEnv in the embedded menu page (preferred)
 * 2. Falls back to GraphQL if reactEnv extraction fails
 * 3. Returns HTTP status so we can mark 403/404 stores as not_crawlable
 */
async function resolvePlatformDispensaryIds() {
    console.log('[Discovery] Resolving platform dispensary IDs...');
    const { rows: dispensaries } = await (0, connection_1.query)(`
    SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id, crawl_status
    FROM dispensaries
    WHERE menu_type = 'dutchie'
      AND platform_dispensary_id IS NULL
      AND menu_url IS NOT NULL
      AND (crawl_status IS NULL OR crawl_status != 'not_crawlable')
    ORDER BY id
  `);
    let resolved = 0;
    let failed = 0;
    let skipped = 0;
    let notCrawlable = 0;
    for (const dispensary of dispensaries) {
        try {
            // Extract cName from menu_url - this is the CORRECT way to get the Dutchie slug
            const cName = extractCNameFromMenuUrl(dispensary.menu_url);
            if (!cName) {
                console.log(`[Discovery] Skipping ${dispensary.name}: Could not extract cName from menu_url: ${dispensary.menu_url}`);
                skipped++;
                continue;
            }
            console.log(`[Discovery] Resolving ID for: ${dispensary.name} (cName=${cName}, menu_url=${dispensary.menu_url})`);
            // Use the new detailed resolver that extracts from reactEnv first
            const result = await (0, graphql_client_1.resolveDispensaryIdWithDetails)(cName);
            if (result.dispensaryId) {
                // SUCCESS: Store resolved
                await (0, connection_1.query)(`
          UPDATE dispensaries
          SET platform_dispensary_id = $1,
              platform_dispensary_id_resolved_at = NOW(),
              crawl_status = 'ready',
              crawl_status_reason = $2,
              crawl_status_updated_at = NOW(),
              last_tested_menu_url = $3,
              last_http_status = $4,
              updated_at = NOW()
          WHERE id = $5
        `, [
                    result.dispensaryId,
                    `Resolved from ${result.source || 'page'}`,
                    dispensary.menu_url,
                    result.httpStatus,
                    dispensary.id,
                ]);
                resolved++;
                console.log(`[Discovery] Resolved: ${cName} -> ${result.dispensaryId} (source: ${result.source})`);
            }
            else if (result.httpStatus === 403 || result.httpStatus === 404) {
                // NOT CRAWLABLE: Store removed or not accessible
                await (0, connection_1.query)(`
          UPDATE dispensaries
          SET platform_dispensary_id = NULL,
              crawl_status = 'not_crawlable',
              crawl_status_reason = $1,
              crawl_status_updated_at = NOW(),
              last_tested_menu_url = $2,
              last_http_status = $3,
              updated_at = NOW()
          WHERE id = $4
        `, [
                    result.error || `HTTP ${result.httpStatus}: Removed from Dutchie`,
                    dispensary.menu_url,
                    result.httpStatus,
                    dispensary.id,
                ]);
                notCrawlable++;
                console.log(`[Discovery] Marked not crawlable: ${cName} (HTTP ${result.httpStatus})`);
            }
            else {
                // FAILED: Could not resolve but page loaded
                await (0, connection_1.query)(`
          UPDATE dispensaries
          SET crawl_status = 'not_ready',
              crawl_status_reason = $1,
              crawl_status_updated_at = NOW(),
              last_tested_menu_url = $2,
              last_http_status = $3,
              updated_at = NOW()
          WHERE id = $4
        `, [
                    result.error || 'Could not extract dispensaryId from page',
                    dispensary.menu_url,
                    result.httpStatus,
                    dispensary.id,
                ]);
                failed++;
                console.log(`[Discovery] Could not resolve: ${cName} - ${result.error}`);
            }
            // Delay between requests
            await new Promise((r) => setTimeout(r, 2000));
        }
        catch (error) {
            failed++;
            console.error(`[Discovery] Error resolving ${dispensary.name}:`, error.message);
        }
    }
    console.log(`[Discovery] Completed: ${resolved} resolved, ${failed} failed, ${skipped} skipped, ${notCrawlable} not crawlable`);
    return { resolved, failed, skipped, notCrawlable };
}
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
const DISPENSARY_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  provider_detection_data, created_at, updated_at
`;
/**
 * Get all dispensaries
 */
async function getAllDispensaries() {
    const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE menu_type = 'dutchie' ORDER BY name`);
    return rows.map(mapDbRowToDispensary);
}
/**
 * Map snake_case DB row to camelCase Dispensary object
 * CRITICAL: DB returns snake_case (platform_dispensary_id) but TypeScript expects camelCase (platformDispensaryId)
 * This function is exported for use in other modules that query dispensaries directly.
 *
 * NOTE: The consolidated dispensaries table column mappings:
 * - zip → postalCode
 * - menu_type → menuType (keep platform as 'dutchie')
 * - last_crawl_at → lastCrawledAt
 * - platform_dispensary_id → platformDispensaryId
 */
function mapDbRowToDispensary(row) {
    // Extract website from raw_metadata if available (field may not exist in all environments)
    let rawMetadata = undefined;
    if (row.raw_metadata !== undefined) {
        rawMetadata = typeof row.raw_metadata === 'string'
            ? JSON.parse(row.raw_metadata)
            : row.raw_metadata;
    }
    const website = row.website || rawMetadata?.website || undefined;
    return {
        id: row.id,
        platform: row.platform || 'dutchie', // keep platform as-is, default to 'dutchie'
        name: row.name,
        slug: row.slug,
        city: row.city,
        state: row.state,
        postalCode: row.postalCode || row.zip || row.postal_code,
        latitude: row.latitude ? parseFloat(row.latitude) : undefined,
        longitude: row.longitude ? parseFloat(row.longitude) : undefined,
        address: row.address,
        platformDispensaryId: row.platformDispensaryId || row.platform_dispensary_id, // CRITICAL mapping!
        isDelivery: row.is_delivery,
        isPickup: row.is_pickup,
        rawMetadata: rawMetadata,
        lastCrawledAt: row.lastCrawledAt || row.last_crawl_at, // use last_crawl_at
        productCount: row.product_count,
        createdAt: row.created_at,
        updatedAt: row.updated_at,
        menuType: row.menuType || row.menu_type,
        menuUrl: row.menuUrl || row.menu_url,
        scrapeEnabled: row.scrapeEnabled ?? row.scrape_enabled,
        providerDetectionData: row.provider_detection_data,
        platformDispensaryIdResolvedAt: row.platform_dispensary_id_resolved_at,
        website,
    };
}
/**
 * Get dispensary by ID
 * NOTE: Uses SQL aliases to map snake_case → camelCase directly
 */
async function getDispensaryById(id) {
    const { rows } = await (0, connection_1.query)(`
    SELECT
      id,
      name,
      slug,
      city,
      state,
      zip AS "postalCode",
      address,
      latitude,
      longitude,
      menu_type AS "menuType",
      menu_url AS "menuUrl",
      platform_dispensary_id AS "platformDispensaryId",
      website,
      provider_detection_data AS "providerDetectionData",
      created_at,
      updated_at
    FROM dispensaries
    WHERE id = $1
  `, [id]);
    if (!rows[0])
        return null;
    return mapDbRowToDispensary(rows[0]);
}
/**
 * Get dispensaries with platform IDs (ready for crawling)
 */
async function getDispensariesWithPlatformIds() {
    const { rows } = await (0, connection_1.query)(`
    SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
    WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
    ORDER BY name
  `);
    return rows.map(mapDbRowToDispensary);
}
/**
 * Re-resolve a single dispensary's platform ID
 * Clears the existing ID and re-resolves from the menu_url cName
 */
async function reResolveDispensaryPlatformId(dispensaryId) {
    console.log(`[Discovery] Re-resolving platform ID for dispensary ${dispensaryId}...`);
    const dispensary = await getDispensaryById(dispensaryId);
    if (!dispensary) {
        return { success: false, platformId: null, cName: null, error: 'Dispensary not found' };
    }
    const cName = extractCNameFromMenuUrl(dispensary.menuUrl);
    if (!cName) {
        console.log(`[Discovery] Could not extract cName from menu_url: ${dispensary.menuUrl}`);
        return {
            success: false,
            platformId: null,
            cName: null,
            error: `Could not extract cName from menu_url: ${dispensary.menuUrl}`,
        };
    }
    console.log(`[Discovery] Extracted cName: ${cName} from menu_url: ${dispensary.menuUrl}`);
    try {
        const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName);
        if (platformId) {
            await (0, connection_1.query)(`
        UPDATE dispensaries
        SET platform_dispensary_id = $1,
            platform_dispensary_id_resolved_at = NOW(),
            updated_at = NOW()
        WHERE id = $2
      `, [platformId, dispensaryId]);
            console.log(`[Discovery] Resolved: ${cName} -> ${platformId}`);
            return { success: true, platformId, cName };
        }
        else {
            // Clear the invalid platform ID and mark as not crawlable
            await (0, connection_1.query)(`
        UPDATE dispensaries
        SET platform_dispensary_id = NULL,
            provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
              '{"resolution_error": "cName no longer exists on Dutchie", "not_crawlable": true}'::jsonb,
            updated_at = NOW()
        WHERE id = $1
      `, [dispensaryId]);
            console.log(`[Discovery] Could not resolve: ${cName} - marked as not crawlable`);
            return {
                success: false,
                platformId: null,
                cName,
                error: `cName "${cName}" no longer exists on Dutchie`,
            };
        }
    }
    catch (error) {
        console.error(`[Discovery] Error resolving ${cName}:`, error.message);
        return { success: false, platformId: null, cName, error: error.message };
    }
}
/**
 * Update menu_url for a dispensary and re-resolve platform ID
 */
async function updateMenuUrlAndResolve(dispensaryId, newMenuUrl) {
    console.log(`[Discovery] Updating menu_url for dispensary ${dispensaryId} to: ${newMenuUrl}`);
    const cName = extractCNameFromMenuUrl(newMenuUrl);
    if (!cName) {
        return {
            success: false,
            platformId: null,
            cName: null,
            error: `Could not extract cName from new menu_url: ${newMenuUrl}`,
        };
    }
    // Update the menu_url first
    await (0, connection_1.query)(`
    UPDATE dispensaries
    SET menu_url = $1,
        menu_type = 'dutchie',
        platform_dispensary_id = NULL,
        updated_at = NOW()
    WHERE id = $2
  `, [newMenuUrl, dispensaryId]);
    // Now resolve the platform ID with the new cName
    return await reResolveDispensaryPlatformId(dispensaryId);
}
/**
 * Mark a dispensary as not crawlable (when resolution fails permanently)
 */
async function markDispensaryNotCrawlable(dispensaryId, reason) {
    await (0, connection_1.query)(`
    UPDATE dispensaries
    SET platform_dispensary_id = NULL,
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object('not_crawlable', true, 'not_crawlable_reason', $1::text, 'not_crawlable_at', NOW()::text),
        updated_at = NOW()
    WHERE id = $2
  `, [reason, dispensaryId]);
    console.log(`[Discovery] Marked dispensary ${dispensaryId} as not crawlable: ${reason}`);
}
/**
 * Get the cName for a dispensary (extracted from menu_url)
 */
function getDispensaryCName(dispensary) {
    return extractCNameFromMenuUrl(dispensary.menuUrl);
}
538
backend/dist/dutchie-az/services/graphql-client.js
vendored
Normal file
538
backend/dist/dutchie-az/services/graphql-client.js
vendored
Normal file
@@ -0,0 +1,538 @@
"use strict";
/**
 * Dutchie GraphQL Client
 *
 * Uses Puppeteer to establish a session (get CF cookies), then makes
 * SERVER-SIDE fetch calls to api-gw.dutchie.com with those cookies.
 *
 * DUTCHIE FETCH RULES:
 * 1. Server-side only - use axios (never browser fetch with CORS)
 * 2. For dispensary lookups use dispensaryFilter.cNameOrID; FilteredProducts
 *    takes productsFilter.dispensaryId directly (see buildFilterVariables below)
 * 3. Headers must mimic Chrome: User-Agent, Origin, Referer
 * 4. If 403, extract CF cookies from Puppeteer session and include them
 * 5. Log status codes, error bodies, and product counts
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = void 0;
exports.resolveDispensaryId = resolveDispensaryId;
exports.resolveDispensaryIdWithDetails = resolveDispensaryIdWithDetails;
exports.discoverArizonaDispensaries = discoverArizonaDispensaries;
exports.fetchAllProducts = fetchAllProducts;
exports.fetchAllProductsBothModes = fetchAllProductsBothModes;
const axios_1 = __importDefault(require("axios"));
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const dutchie_1 = require("../config/dutchie");
Object.defineProperty(exports, "GRAPHQL_HASHES", { enumerable: true, get: function () { return dutchie_1.GRAPHQL_HASHES; } });
Object.defineProperty(exports, "ARIZONA_CENTERPOINTS", { enumerable: true, get: function () { return dutchie_1.ARIZONA_CENTERPOINTS; } });
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
/**
 * Create a session by navigating to the embedded menu page
 * and extracting CF clearance cookies for server-side requests.
 * Also extracts dispensaryId from window.reactEnv if available.
 */
async function createSession(cName) {
    const browser = await puppeteer_extra_1.default.launch({
        headless: 'new',
        args: dutchie_1.dutchieConfig.browserArgs,
    });
    const page = await browser.newPage();
    const userAgent = dutchie_1.dutchieConfig.userAgent;
    await page.setUserAgent(userAgent);
    await page.setViewport({ width: 1920, height: 1080 });
    await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        window.chrome = { runtime: {} };
    });
    // Navigate to the embedded menu page for this dispensary
    const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;
    console.log(`[GraphQL Client] Loading ${embeddedMenuUrl} to get CF cookies...`);
    let httpStatus;
    let dispensaryId;
    try {
        const response = await page.goto(embeddedMenuUrl, {
            waitUntil: 'networkidle2',
            timeout: dutchie_1.dutchieConfig.navigationTimeout,
        });
        httpStatus = response?.status();
        await new Promise((r) => setTimeout(r, dutchie_1.dutchieConfig.pageLoadDelay));
        // Try to extract dispensaryId from window.reactEnv
        try {
            dispensaryId = await page.evaluate(() => {
                return window.reactEnv?.dispensaryId || null;
            });
            if (dispensaryId) {
                console.log(`[GraphQL Client] Extracted dispensaryId from reactEnv: ${dispensaryId}`);
            }
        }
        catch (evalError) {
            console.log(`[GraphQL Client] Could not extract dispensaryId from reactEnv: ${evalError.message}`);
        }
    }
    catch (error) {
        console.warn(`[GraphQL Client] Navigation warning: ${error.message}`);
        // Continue anyway - we may have gotten cookies
    }
    // Extract cookies
    const cookies = await page.cookies();
    const cookieString = cookies.map((c) => `${c.name}=${c.value}`).join('; ');
    console.log(`[GraphQL Client] Got ${cookies.length} cookies, HTTP status: ${httpStatus}`);
    if (cookies.length > 0) {
        console.log(`[GraphQL Client] Cookie names: ${cookies.map(c => c.name).join(', ')}`);
    }
    return { cookies: cookieString, userAgent, browser, page, dispensaryId, httpStatus };
}
/**
 * Close session (browser)
 */
async function closeSession(session) {
    await session.browser.close();
}
// ============================================================
// SERVER-SIDE GRAPHQL FETCH USING AXIOS
// ============================================================
/**
 * Build headers that mimic a real browser request
 */
function buildHeaders(session, cName) {
    const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;
    return {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en-US,en;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'content-type': 'application/json',
        'origin': 'https://dutchie.com',
        'referer': embeddedMenuUrl,
        'user-agent': session.userAgent,
        'apollographql-client-name': 'Marketplace (production)',
        'sec-ch-ua': '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        ...(session.cookies ? { 'cookie': session.cookies } : {}),
    };
}
/**
 * Execute GraphQL query server-side using axios
 * Uses cookies from the browser session to bypass CF
 */
async function executeGraphQL(session, operationName, variables, hash, cName) {
    const endpoint = dutchie_1.dutchieConfig.graphqlEndpoint;
    const headers = buildHeaders(session, cName);
    // Build request body for POST
    const body = {
        operationName,
        variables,
        extensions: {
            persistedQuery: { version: 1, sha256Hash: hash },
        },
    };
    console.log(`[GraphQL Client] POST: ${operationName} -> ${endpoint}`);
    console.log(`[GraphQL Client] Variables: ${JSON.stringify(variables).slice(0, 300)}...`);
    try {
        const response = await axios_1.default.post(endpoint, body, {
            headers,
            timeout: 30000,
            validateStatus: () => true, // Don't throw on non-2xx
        });
        // Log response details
        console.log(`[GraphQL Client] Response status: ${response.status}`);
        if (response.status !== 200) {
            const bodyPreview = typeof response.data === 'string'
                ? response.data.slice(0, 500)
                : JSON.stringify(response.data).slice(0, 500);
            console.error(`[GraphQL Client] HTTP ${response.status}: ${bodyPreview}`);
            throw new Error(`HTTP ${response.status}`);
        }
        // Check for GraphQL errors
        if (response.data?.errors && response.data.errors.length > 0) {
            console.error(`[GraphQL Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`);
        }
        return response.data;
    }
    catch (error) {
        if (axios_1.default.isAxiosError(error)) {
            const axiosError = error;
            console.error(`[GraphQL Client] Axios error: ${axiosError.message}`);
            if (axiosError.response) {
                console.error(`[GraphQL Client] Response status: ${axiosError.response.status}`);
                console.error(`[GraphQL Client] Response data: ${JSON.stringify(axiosError.response.data).slice(0, 500)}`);
            }
            if (axiosError.code) {
                console.error(`[GraphQL Client] Error code: ${axiosError.code}`);
            }
        }
        else {
            console.error(`[GraphQL Client] Error: ${error.message}`);
        }
        throw error;
    }
}
/**
 * Resolve a dispensary slug to its internal platform ID.
 *
 * STRATEGY:
 * 1. Navigate to embedded menu page and extract window.reactEnv.dispensaryId (preferred)
 * 2. Fall back to GraphQL GetAddressBasedDispensaryData query if reactEnv fails
 *
 * Returns the dispensaryId (platform_dispensary_id) or null if not found.
 * The detailed variant below reports the HTTP status (403/404) so the caller
 * can mark the store as not_crawlable.
 */
async function resolveDispensaryId(slug) {
    const result = await resolveDispensaryIdWithDetails(slug);
    return result.dispensaryId;
}
/**
 * Resolve a dispensary slug with full details (HTTP status, source, error).
 * Use this when you need to know WHY resolution failed.
 */
async function resolveDispensaryIdWithDetails(slug) {
    console.log(`[GraphQL Client] Resolving dispensary ID for slug: ${slug}`);
    const session = await createSession(slug);
    try {
        // Check HTTP status first - if 403/404, the store is not crawlable
        if (session.httpStatus && (session.httpStatus === 403 || session.httpStatus === 404)) {
            console.log(`[GraphQL Client] Page returned HTTP ${session.httpStatus} for ${slug} - not crawlable`);
            return {
                dispensaryId: null,
                httpStatus: session.httpStatus,
                error: `HTTP ${session.httpStatus}: Store removed or not accessible`,
                source: 'reactEnv',
            };
        }
        // PREFERRED: Use dispensaryId from window.reactEnv (extracted during createSession)
        if (session.dispensaryId) {
            console.log(`[GraphQL Client] Resolved ${slug} -> ${session.dispensaryId} (from reactEnv)`);
            return {
                dispensaryId: session.dispensaryId,
                httpStatus: session.httpStatus,
                source: 'reactEnv',
            };
        }
        // FALLBACK: Try GraphQL query
        console.log(`[GraphQL Client] reactEnv.dispensaryId not found for ${slug}, trying GraphQL...`);
        const variables = {
            dispensaryFilter: {
                cNameOrID: slug,
            },
        };
        const result = await executeGraphQL(session, 'GetAddressBasedDispensaryData', variables, dutchie_1.GRAPHQL_HASHES.GetAddressBasedDispensaryData, slug);
        const dispensaryId = result?.data?.dispensaryBySlug?.id ||
            result?.data?.dispensary?.id ||
            result?.data?.getAddressBasedDispensaryData?.dispensary?.id;
        if (dispensaryId) {
            console.log(`[GraphQL Client] Resolved ${slug} -> ${dispensaryId} (from GraphQL)`);
            return {
                dispensaryId,
                httpStatus: session.httpStatus,
                source: 'graphql',
            };
        }
        console.log(`[GraphQL Client] Could not resolve ${slug}, GraphQL response:`, JSON.stringify(result).slice(0, 300));
        return {
            dispensaryId: null,
            httpStatus: session.httpStatus,
            error: 'Could not extract dispensaryId from reactEnv or GraphQL',
        };
    }
    finally {
        await closeSession(session);
    }
}
/**
 * Discover Arizona dispensaries via geo-based query
 */
async function discoverArizonaDispensaries() {
    console.log('[GraphQL Client] Discovering Arizona dispensaries...');
    // Establish the session via a known AZ store's embedded menu page
    const session = await createSession('AZ-Deeply-Rooted');
    const allDispensaries = [];
    const seenIds = new Set();
    try {
        for (const centerpoint of dutchie_1.ARIZONA_CENTERPOINTS) {
            console.log(`[GraphQL Client] Scanning ${centerpoint.name}...`);
            const variables = {
                dispensariesFilter: {
                    latitude: centerpoint.lat,
                    longitude: centerpoint.lng,
                    distance: 100,
                    state: 'AZ',
                },
            };
            try {
                const result = await executeGraphQL(session, 'ConsumerDispensaries', variables, dutchie_1.GRAPHQL_HASHES.ConsumerDispensaries, 'AZ-Deeply-Rooted');
                const dispensaries = result?.data?.consumerDispensaries || [];
                for (const d of dispensaries) {
                    const id = d.id || d.dispensaryId;
                    if (id && !seenIds.has(id)) {
                        seenIds.add(id);
                        allDispensaries.push(d);
                    }
                }
                console.log(`[GraphQL Client] Found ${dispensaries.length} in ${centerpoint.name} (${allDispensaries.length} total unique)`);
            }
            catch (error) {
                console.warn(`[GraphQL Client] Error scanning ${centerpoint.name}: ${error.message}`);
            }
            // Delay between requests
            await new Promise((r) => setTimeout(r, 1000));
        }
    }
    finally {
        await closeSession(session);
    }
    console.log(`[GraphQL Client] Discovery complete: ${allDispensaries.length} dispensaries`);
    return allDispensaries;
}
// ============================================================
// PRODUCT FILTERING VARIABLES
// ============================================================
/**
 * Build filter variables for FilteredProducts query
 *
 * CRITICAL: Uses dispensaryId directly (the MongoDB ObjectId, e.g. "6405ef617056e8014d79101b")
 * NOT dispensaryFilter.cNameOrID!
 *
 * The actual browser request structure is:
 * {
 *   "productsFilter": {
 *     "dispensaryId": "6405ef617056e8014d79101b",
 *     "pricingType": "rec",
 *     "Status": "Active",   // Mode A only
 *     "strainTypes": [],
 *     "subcategories": [],
 *     "types": [],
 *     "useCache": true,
 *     ...
 *   },
 *   "page": 0,
 *   "perPage": 100
 * }
 *
 * Mode A = UI parity (Status: "Active")
 * Mode B = MAX COVERAGE (no Status filter)
 */
function buildFilterVariables(platformDispensaryId, pricingType, crawlMode, page, perPage) {
    const isModeA = crawlMode === 'mode_a';
    // Per CLAUDE.md Rule #11: Use simple productsFilter with dispensaryId directly
    // Do NOT use dispensaryFilter.cNameOrID - that's outdated
    const productsFilter = {
        dispensaryId: platformDispensaryId,
        pricingType: pricingType,
    };
    // Mode A: Only active products (UI parity) - Status: "Active"
    // Mode B: MAX COVERAGE (OOS/inactive) - omit Status or set to null
    if (isModeA) {
        productsFilter.Status = 'Active';
    }
    // Mode B: No Status filter = returns all products including OOS/inactive
    return {
        productsFilter,
        page,
        perPage,
    };
}
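// Sketch of the variables each mode produces (platform ID reuses the example from the doc comment):
//   buildFilterVariables('6405ef617056e8014d79101b', 'rec', 'mode_a', 0, 100)
//     -> { productsFilter: { dispensaryId: '6405...', pricingType: 'rec', Status: 'Active' }, page: 0, perPage: 100 }
//   buildFilterVariables('6405ef617056e8014d79101b', 'rec', 'mode_b', 0, 100)
//     -> same shape but with no Status key, so OOS/inactive products are included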
// ============================================================
// PRODUCT FETCHING WITH PAGINATION
// ============================================================
/**
 * Fetch products for a single mode with pagination
 */
async function fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode) {
    const perPage = dutchie_1.dutchieConfig.perPage;
    const maxPages = dutchie_1.dutchieConfig.maxPages;
    const maxRetries = dutchie_1.dutchieConfig.maxRetries;
    const pageDelayMs = dutchie_1.dutchieConfig.pageDelayMs;
    const allProducts = [];
    let pageNum = 0;
    let totalCount = 0;
    let consecutiveEmptyPages = 0;
    console.log(`[GraphQL Client] Fetching products for ${cName} (platformId: ${platformDispensaryId}, ${pricingType}, ${crawlMode})...`);
    while (pageNum < maxPages) {
        const variables = buildFilterVariables(platformDispensaryId, pricingType, crawlMode, pageNum, perPage);
        let result = null;
        let lastError = null;
        // Retry logic
        for (let attempt = 0; attempt <= maxRetries; attempt++) {
            try {
                result = await executeGraphQL(session, 'FilteredProducts', variables, dutchie_1.GRAPHQL_HASHES.FilteredProducts, cName);
                lastError = null;
                break;
            }
            catch (error) {
                lastError = error;
                console.warn(`[GraphQL Client] Page ${pageNum} attempt ${attempt + 1} failed: ${error.message}`);
                if (attempt < maxRetries) {
                    await new Promise((r) => setTimeout(r, 1000 * (attempt + 1)));
                }
            }
        }
        if (lastError) {
            console.error(`[GraphQL Client] Page ${pageNum} failed after ${maxRetries + 1} attempts`);
            break;
        }
        if (result?.errors) {
            console.error('[GraphQL Client] GraphQL errors:', JSON.stringify(result.errors));
            break;
        }
        // Log response shape on first page
        if (pageNum === 0) {
            console.log(`[GraphQL Client] Response keys: ${Object.keys(result || {}).join(', ')}`);
            if (result?.data) {
                console.log(`[GraphQL Client] data keys: ${Object.keys(result.data || {}).join(', ')}`);
            }
            if (!result?.data?.filteredProducts) {
                console.log(`[GraphQL Client] WARNING: No filteredProducts in response!`);
                console.log(`[GraphQL Client] Full response: ${JSON.stringify(result).slice(0, 1000)}`);
            }
        }
        const products = result?.data?.filteredProducts?.products || [];
        const queryInfo = result?.data?.filteredProducts?.queryInfo;
        if (queryInfo?.totalCount) {
            totalCount = queryInfo.totalCount;
        }
        console.log(`[GraphQL Client] Page ${pageNum}: ${products.length} products (total so far: ${allProducts.length + products.length}/${totalCount})`);
        if (products.length === 0) {
            consecutiveEmptyPages++;
            if (consecutiveEmptyPages >= 2) {
                console.log('[GraphQL Client] Multiple empty pages, stopping pagination');
                break;
            }
        }
        else {
            consecutiveEmptyPages = 0;
            allProducts.push(...products);
        }
        // Stop if incomplete page (last page)
        if (products.length < perPage) {
            console.log(`[GraphQL Client] Incomplete page (${products.length} < ${perPage}), stopping`);
            break;
        }
        pageNum++;
        await new Promise((r) => setTimeout(r, pageDelayMs));
    }
    console.log(`[GraphQL Client] Fetched ${allProducts.length} total products (${crawlMode})`);
    return { products: allProducts, totalCount: totalCount || allProducts.length, crawlMode };
}
// ============================================================
// LEGACY SINGLE-MODE INTERFACE
// ============================================================
/**
 * Fetch all products for a dispensary (single mode)
 */
async function fetchAllProducts(platformDispensaryId, pricingType = 'rec', options = {}) {
    const { crawlMode = 'mode_a' } = options;
    // cName is now REQUIRED - no default fallback to avoid using wrong store's session
    const cName = options.cName;
    if (!cName) {
        throw new Error('[GraphQL Client] cName is required for fetchAllProducts - cannot use another store\'s session');
    }
    const session = await createSession(cName);
    try {
        return await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode);
    }
    finally {
        await closeSession(session);
    }
}
// ============================================================
|
||||
// MODE A+B MERGING
|
||||
// ============================================================
|
||||
/**
|
||||
* Merge POSMetaData.children arrays from Mode A and Mode B products
|
||||
*/
|
||||
function mergeProductOptions(modeAProduct, modeBProduct) {
|
||||
const modeAChildren = modeAProduct.POSMetaData?.children || [];
|
||||
const modeBChildren = modeBProduct.POSMetaData?.children || [];
|
||||
const getOptionKey = (child) => {
|
||||
return child.canonicalID || child.canonicalSKU || child.canonicalPackageId || child.option || '';
|
||||
};
|
||||
const mergedMap = new Map();
|
||||
for (const child of modeAChildren) {
|
||||
const key = getOptionKey(child);
|
||||
if (key)
|
||||
mergedMap.set(key, child);
|
||||
}
|
||||
for (const child of modeBChildren) {
|
||||
const key = getOptionKey(child);
|
||||
if (key && !mergedMap.has(key)) {
|
||||
mergedMap.set(key, child);
|
||||
}
|
||||
}
|
||||
return Array.from(mergedMap.values());
|
||||
}
|
||||
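// Illustrative sketch (not part of the source): how the key-based merge behaves.
// Mode A children win on key conflicts because they are inserted first and
// Mode B entries are only added when the key is absent.
//
//   const a = { POSMetaData: { children: [{ canonicalSKU: 'SKU-1', price: 10 }] } };
//   const b = { POSMetaData: { children: [{ canonicalSKU: 'SKU-1', price: 12 },
//                                          { canonicalSKU: 'SKU-2', price: 25 }] } };
//   mergeProductOptions(a, b);
//   // => [{ canonicalSKU: 'SKU-1', price: 10 }, { canonicalSKU: 'SKU-2', price: 25 }]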
/**
 * Merge a Mode A product with a Mode B product
 */
function mergeProducts(modeAProduct, modeBProduct) {
    if (!modeBProduct) {
        return modeAProduct;
    }
    const mergedChildren = mergeProductOptions(modeAProduct, modeBProduct);
    return {
        ...modeAProduct,
        POSMetaData: {
            ...modeAProduct.POSMetaData,
            children: mergedChildren,
        },
    };
}
// ============================================================
// MAIN EXPORT: TWO-MODE CRAWL
// ============================================================
/**
 * Fetch products using BOTH crawl modes with SINGLE session
 * Runs Mode A then Mode B, merges results
 */
async function fetchAllProductsBothModes(platformDispensaryId, pricingType = 'rec', options = {}) {
    // cName is now REQUIRED - no default fallback to avoid using wrong store's session
    const cName = options.cName;
    if (!cName) {
        throw new Error('[GraphQL Client] cName is required for fetchAllProductsBothModes - cannot use another store\'s session');
    }
    console.log(`[GraphQL Client] Running two-mode crawl for ${cName} (${pricingType})...`);
    console.log(`[GraphQL Client] Platform ID: ${platformDispensaryId}, cName: ${cName}`);
    const session = await createSession(cName);
    try {
        // Mode A (UI parity)
        const modeAResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_a');
        // Delay between modes
        await new Promise((r) => setTimeout(r, dutchie_1.dutchieConfig.modeDelayMs));
        // Mode B (MAX COVERAGE)
        const modeBResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_b');
        // Merge results
        const modeBMap = new Map();
        for (const product of modeBResult.products) {
            modeBMap.set(product._id, product);
        }
        const productMap = new Map();
        // Add Mode A products, merging with Mode B if exists
        for (const product of modeAResult.products) {
            const modeBProduct = modeBMap.get(product._id);
            const mergedProduct = mergeProducts(product, modeBProduct);
            productMap.set(product._id, mergedProduct);
        }
        // Add Mode B products not in Mode A
        for (const product of modeBResult.products) {
            if (!productMap.has(product._id)) {
                productMap.set(product._id, product);
            }
        }
        const mergedProducts = Array.from(productMap.values());
        console.log(`[GraphQL Client] Merged: ${mergedProducts.length} unique products`);
        console.log(`[GraphQL Client] Mode A: ${modeAResult.products.length}, Mode B: ${modeBResult.products.length}`);
        return {
            modeA: { products: modeAResult.products, totalCount: modeAResult.totalCount },
            modeB: { products: modeBResult.products, totalCount: modeBResult.totalCount },
            merged: { products: mergedProducts, totalCount: mergedProducts.length },
        };
    }
    finally {
        await closeSession(session);
    }
}
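// Usage sketch (hypothetical, not part of the source): a caller is expected to
// pass the store's own cName so the session matches the store being crawled.
// `platformId` and `cname` below are illustrative placeholders.
//
//   const { fetchAllProductsBothModes } = require('./graphql-client');
//   const result = await fetchAllProductsBothModes(platformId, 'rec', { cName: cname });
//   console.log(result.merged.products.length, 'unique products after A+B merge');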
414
backend/dist/dutchie-az/services/job-queue.js
vendored
Normal file
@@ -0,0 +1,414 @@
"use strict";
|
||||
/**
|
||||
* Job Queue Service
|
||||
*
|
||||
* DB-backed job queue with claiming/locking for distributed workers.
|
||||
* Ensures only one worker processes a given store at a time.
|
||||
*/
|
||||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
||||
if (k2 === undefined) k2 = k;
|
||||
var desc = Object.getOwnPropertyDescriptor(m, k);
|
||||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
||||
desc = { enumerable: true, get: function() { return m[k]; } };
|
||||
}
|
||||
Object.defineProperty(o, k2, desc);
|
||||
}) : (function(o, m, k, k2) {
|
||||
if (k2 === undefined) k2 = k;
|
||||
o[k2] = m[k];
|
||||
}));
|
||||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
||||
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
||||
}) : function(o, v) {
|
||||
o["default"] = v;
|
||||
});
|
||||
var __importStar = (this && this.__importStar) || (function () {
|
||||
var ownKeys = function(o) {
|
||||
ownKeys = Object.getOwnPropertyNames || function (o) {
|
||||
var ar = [];
|
||||
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
||||
return ar;
|
||||
};
|
||||
return ownKeys(o);
|
||||
};
|
||||
return function (mod) {
|
||||
if (mod && mod.__esModule) return mod;
|
||||
var result = {};
|
||||
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
||||
__setModuleDefault(result, mod);
|
||||
return result;
|
||||
};
|
||||
})();
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.getWorkerId = getWorkerId;
|
||||
exports.getWorkerHostname = getWorkerHostname;
|
||||
exports.enqueueJob = enqueueJob;
|
||||
exports.bulkEnqueueJobs = bulkEnqueueJobs;
|
||||
exports.claimNextJob = claimNextJob;
|
||||
exports.updateJobProgress = updateJobProgress;
|
||||
exports.heartbeat = heartbeat;
|
||||
exports.completeJob = completeJob;
|
||||
exports.failJob = failJob;
|
||||
exports.getQueueStats = getQueueStats;
|
||||
exports.getActiveWorkers = getActiveWorkers;
|
||||
exports.getRunningJobs = getRunningJobs;
|
||||
exports.recoverStaleJobs = recoverStaleJobs;
|
||||
exports.cleanupOldJobs = cleanupOldJobs;
|
||||
const connection_1 = require("../db/connection");
|
||||
const uuid_1 = require("uuid");
|
||||
const os = __importStar(require("os"));
|
||||
// ============================================================
|
||||
// WORKER IDENTITY
|
||||
// ============================================================
|
||||
let _workerId = null;
|
||||
/**
|
||||
* Get or create a unique worker ID for this process
|
||||
* In Kubernetes, uses POD_NAME for clarity; otherwise generates a unique ID
|
||||
*/
|
||||
function getWorkerId() {
|
||||
if (!_workerId) {
|
||||
// Prefer POD_NAME in K8s (set via fieldRef)
|
||||
const podName = process.env.POD_NAME;
|
||||
if (podName) {
|
||||
_workerId = podName;
|
||||
}
|
||||
else {
|
||||
const hostname = os.hostname();
|
||||
const pid = process.pid;
|
||||
const uuid = (0, uuid_1.v4)().slice(0, 8);
|
||||
_workerId = `${hostname}-${pid}-${uuid}`;
|
||||
}
|
||||
}
|
||||
return _workerId;
|
||||
}
|
||||
/**
|
||||
* Get hostname for worker tracking
|
||||
* In Kubernetes, uses POD_NAME; otherwise uses os.hostname()
|
||||
*/
|
||||
function getWorkerHostname() {
|
||||
return process.env.POD_NAME || os.hostname();
|
||||
}
|
||||
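// Illustrative note (not part of the source): outside Kubernetes the worker ID
// is derived from host, pid, and a short UUID, so two workers on one machine
// never collide. Example shapes:
//
//   getWorkerId();        // => 'crawler-host-31337-a1b2c3d4' (no POD_NAME set)
//   process.env.POD_NAME = 'crawler-7f9c-xk2p';
//   getWorkerHostname();  // => 'crawler-7f9c-xk2p' (read fresh from env each call)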
// ============================================================
// JOB ENQUEUEING
// ============================================================
/**
 * Enqueue a new job for processing
 * Returns null if a pending/running job already exists for this dispensary
 */
async function enqueueJob(options) {
    const { jobType, dispensaryId, priority = 0, metadata, maxRetries = 3, } = options;
    // Check if there's already a pending/running job for this dispensary
    if (dispensaryId) {
        const { rows: existing } = await (0, connection_1.query)(`SELECT id FROM dispensary_crawl_jobs
       WHERE dispensary_id = $1 AND status IN ('pending', 'running')
       LIMIT 1`, [dispensaryId]);
        if (existing.length > 0) {
            console.log(`[JobQueue] Skipping enqueue - job already exists for dispensary ${dispensaryId}`);
            return null;
        }
    }
    const { rows } = await (0, connection_1.query)(`INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
     VALUES ($1, $2, 'pending', $3, $4, $5, NOW())
     RETURNING id`, [jobType, dispensaryId || null, priority, maxRetries, metadata ? JSON.stringify(metadata) : null]);
    const jobId = rows[0].id;
    console.log(`[JobQueue] Enqueued job ${jobId} (type=${jobType}, dispensary=${dispensaryId})`);
    return jobId;
}
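// Usage sketch (hypothetical values, not part of the source): enqueue a crawl
// for one store; the null return signals a deduplicated (skipped) enqueue.
//
//   const jobId = await enqueueJob({ jobType: 'product_crawl', dispensaryId: 42, priority: 10 });
//   if (jobId === null) {
//       console.log('A pending/running job already covers dispensary 42');
//   }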
/**
 * Bulk enqueue jobs for multiple dispensaries
 * Skips dispensaries that already have pending/running jobs
 */
async function bulkEnqueueJobs(jobType, dispensaryIds, options = {}) {
    const { priority = 0, metadata } = options;
    // Get dispensaries that already have pending/running jobs
    const { rows: existing } = await (0, connection_1.query)(`SELECT DISTINCT dispensary_id FROM dispensary_crawl_jobs
     WHERE dispensary_id = ANY($1) AND status IN ('pending', 'running')`, [dispensaryIds]);
    const existingSet = new Set(existing.map((r) => r.dispensary_id));
    // Filter out dispensaries with existing jobs
    const toEnqueue = dispensaryIds.filter(id => !existingSet.has(id));
    if (toEnqueue.length === 0) {
        return { enqueued: 0, skipped: dispensaryIds.length };
    }
    // Bulk insert - each row needs 4 params: job_type, dispensary_id, priority, metadata
    const metadataJson = metadata ? JSON.stringify(metadata) : null;
    const values = toEnqueue.map((_, i) => {
        const offset = i * 4;
        return `($${offset + 1}, $${offset + 2}, 'pending', $${offset + 3}, 3, $${offset + 4}, NOW())`;
    }).join(', ');
    const params = [];
    toEnqueue.forEach(dispensaryId => {
        params.push(jobType, dispensaryId, priority, metadataJson);
    });
    await (0, connection_1.query)(`INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
     VALUES ${values}`, params);
    console.log(`[JobQueue] Bulk enqueued ${toEnqueue.length} jobs, skipped ${existingSet.size}`);
    return { enqueued: toEnqueue.length, skipped: existingSet.size };
}
// ============================================================
// JOB CLAIMING (with locking)
// ============================================================
/**
 * Claim the next available job from the queue
 * Uses SELECT FOR UPDATE SKIP LOCKED to prevent double-claims
 */
async function claimNextJob(options) {
    const { workerId, jobTypes, lockDurationMinutes = 30 } = options;
    const hostname = getWorkerHostname();
    const client = await (0, connection_1.getClient)();
    try {
        await client.query('BEGIN');
        // Build job type filter
        let typeFilter = '';
        const params = [workerId, hostname, lockDurationMinutes];
        let paramIndex = 4;
        if (jobTypes && jobTypes.length > 0) {
            typeFilter = `AND job_type = ANY($${paramIndex})`;
            params.push(jobTypes);
            paramIndex++;
        }
        // Claim the next pending job using FOR UPDATE SKIP LOCKED
        // This atomically selects and locks a row, skipping any already locked by other workers
        const { rows } = await client.query(`UPDATE dispensary_crawl_jobs
       SET
         status = 'running',
         claimed_by = $1,
         claimed_at = NOW(),
         worker_id = $1,
         worker_hostname = $2,
         started_at = NOW(),
         locked_until = NOW() + ($3 || ' minutes')::INTERVAL,
         last_heartbeat_at = NOW(),
         updated_at = NOW()
       WHERE id = (
         SELECT id FROM dispensary_crawl_jobs
         WHERE status = 'pending'
           ${typeFilter}
         ORDER BY priority DESC, created_at ASC
         FOR UPDATE SKIP LOCKED
         LIMIT 1
       )
       RETURNING *`, params);
        await client.query('COMMIT');
        if (rows.length === 0) {
            return null;
        }
        const job = mapDbRowToJob(rows[0]);
        console.log(`[JobQueue] Worker ${workerId} claimed job ${job.id} (type=${job.jobType}, dispensary=${job.dispensaryId})`);
        return job;
    }
    catch (error) {
        await client.query('ROLLBACK');
        throw error;
    }
    finally {
        client.release();
    }
}
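// Worker-loop sketch (hypothetical, not part of the source): the intended
// claim -> heartbeat -> complete/fail lifecycle, with `runCrawl` standing in
// for whatever actually processes a claimed job.
//
//   const workerId = getWorkerId();
//   const job = await claimNextJob({ workerId, jobTypes: ['product_crawl'] });
//   if (job) {
//       const timer = setInterval(() => heartbeat(job.id), 60000); // keep the lock alive
//       try {
//           const stats = await runCrawl(job);
//           await completeJob(job.id, stats);
//       }
//       catch (err) {
//           await failJob(job.id, err.message); // re-queues until max_retries is hit
//       }
//       finally {
//           clearInterval(timer);
//       }
//   }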
// ============================================================
// JOB PROGRESS & COMPLETION
// ============================================================
/**
 * Update job progress (for live monitoring)
 */
async function updateJobProgress(jobId, progress) {
    const updates = ['last_heartbeat_at = NOW()', 'updated_at = NOW()'];
    const params = [];
    let paramIndex = 1;
    if (progress.productsFound !== undefined) {
        updates.push(`products_found = $${paramIndex++}`);
        params.push(progress.productsFound);
    }
    if (progress.productsUpserted !== undefined) {
        updates.push(`products_upserted = $${paramIndex++}`);
        params.push(progress.productsUpserted);
    }
    if (progress.snapshotsCreated !== undefined) {
        updates.push(`snapshots_created = $${paramIndex++}`);
        params.push(progress.snapshotsCreated);
    }
    if (progress.currentPage !== undefined) {
        updates.push(`current_page = $${paramIndex++}`);
        params.push(progress.currentPage);
    }
    if (progress.totalPages !== undefined) {
        updates.push(`total_pages = $${paramIndex++}`);
        params.push(progress.totalPages);
    }
    params.push(jobId);
    await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params);
}
/**
 * Send heartbeat to keep job alive (prevents timeout)
 */
async function heartbeat(jobId) {
    await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
     SET last_heartbeat_at = NOW(), locked_until = NOW() + INTERVAL '30 minutes'
     WHERE id = $1 AND status = 'running'`, [jobId]);
}
/**
 * Mark job as completed
 */
async function completeJob(jobId, result) {
    await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
     SET
       status = 'completed',
       completed_at = NOW(),
       products_found = COALESCE($2, products_found),
       products_upserted = COALESCE($3, products_upserted),
       snapshots_created = COALESCE($4, snapshots_created),
       updated_at = NOW()
     WHERE id = $1`, [jobId, result.productsFound, result.productsUpserted, result.snapshotsCreated]);
    console.log(`[JobQueue] Job ${jobId} completed`);
}
/**
 * Mark job as failed
 */
async function failJob(jobId, errorMessage) {
    // Check if we should retry
    const { rows } = await (0, connection_1.query)(`SELECT retry_count, max_retries FROM dispensary_crawl_jobs WHERE id = $1`, [jobId]);
    if (rows.length === 0)
        return false;
    const { retry_count, max_retries } = rows[0];
    if (retry_count < max_retries) {
        // Re-queue for retry
        await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
       SET
         status = 'pending',
         retry_count = retry_count + 1,
         claimed_by = NULL,
         claimed_at = NULL,
         worker_id = NULL,
         worker_hostname = NULL,
         started_at = NULL,
         locked_until = NULL,
         last_heartbeat_at = NULL,
         error_message = $2,
         updated_at = NOW()
       WHERE id = $1`, [jobId, errorMessage]);
        console.log(`[JobQueue] Job ${jobId} failed, re-queued for retry (${retry_count + 1}/${max_retries})`);
        return true; // Will retry
    }
    else {
        // Mark as failed permanently
        await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
       SET
         status = 'failed',
         completed_at = NOW(),
         error_message = $2,
         updated_at = NOW()
       WHERE id = $1`, [jobId, errorMessage]);
        console.log(`[JobQueue] Job ${jobId} failed permanently after ${retry_count} retries`);
        return false; // No more retries
    }
}
// ============================================================
// QUEUE MONITORING
// ============================================================
/**
 * Get queue statistics
 */
async function getQueueStats() {
    const { rows } = await (0, connection_1.query)(`SELECT * FROM v_queue_stats`);
    const stats = rows[0] || {};
    return {
        pending: parseInt(stats.pending_jobs || '0', 10),
        running: parseInt(stats.running_jobs || '0', 10),
        completed1h: parseInt(stats.completed_1h || '0', 10),
        failed1h: parseInt(stats.failed_1h || '0', 10),
        activeWorkers: parseInt(stats.active_workers || '0', 10),
        avgDurationSeconds: stats.avg_duration_seconds ? parseFloat(stats.avg_duration_seconds) : null,
    };
}
/**
 * Get active workers
 */
async function getActiveWorkers() {
    const { rows } = await (0, connection_1.query)(`SELECT * FROM v_active_workers`);
    return rows.map((row) => ({
        workerId: row.worker_id,
        hostname: row.worker_hostname,
        currentJobs: parseInt(row.current_jobs || '0', 10),
        totalProductsFound: parseInt(row.total_products_found || '0', 10),
        totalProductsUpserted: parseInt(row.total_products_upserted || '0', 10),
        totalSnapshots: parseInt(row.total_snapshots || '0', 10),
        firstClaimedAt: new Date(row.first_claimed_at),
        lastHeartbeat: row.last_heartbeat ? new Date(row.last_heartbeat) : null,
    }));
}
/**
 * Get running jobs with worker info
 */
async function getRunningJobs() {
    const { rows } = await (0, connection_1.query)(`SELECT cj.*, d.name as dispensary_name, d.city
     FROM dispensary_crawl_jobs cj
     LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
     WHERE cj.status = 'running'
     ORDER BY cj.started_at DESC`);
    return rows.map(mapDbRowToJob);
}
/**
 * Recover stale jobs (workers that died without completing)
 */
async function recoverStaleJobs(staleMinutes = 15) {
    const { rowCount } = await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
     SET
       status = 'pending',
       claimed_by = NULL,
       claimed_at = NULL,
       worker_id = NULL,
       worker_hostname = NULL,
       started_at = NULL,
       locked_until = NULL,
       error_message = 'Recovered from stale worker',
       retry_count = retry_count + 1,
       updated_at = NOW()
     WHERE status = 'running'
       AND last_heartbeat_at < NOW() - ($1 || ' minutes')::INTERVAL
       AND retry_count < max_retries`, [staleMinutes]);
    if (rowCount && rowCount > 0) {
        console.log(`[JobQueue] Recovered ${rowCount} stale jobs`);
    }
    return rowCount || 0;
}
/**
 * Clean up old completed/failed jobs
 */
async function cleanupOldJobs(olderThanDays = 7) {
    const { rowCount } = await (0, connection_1.query)(`DELETE FROM dispensary_crawl_jobs
     WHERE status IN ('completed', 'failed')
       AND completed_at < NOW() - ($1 || ' days')::INTERVAL`, [olderThanDays]);
    if (rowCount && rowCount > 0) {
        console.log(`[JobQueue] Cleaned up ${rowCount} old jobs`);
    }
    return rowCount || 0;
}
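// Maintenance sketch (hypothetical, not part of the source): a scheduler or a
// long-lived worker could run the two housekeeping functions periodically.
//
//   setInterval(() => recoverStaleJobs(15).catch(console.error), 5 * 60 * 1000);
//   setInterval(() => cleanupOldJobs(7).catch(console.error), 24 * 60 * 60 * 1000);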
// ============================================================
// HELPERS
// ============================================================
function mapDbRowToJob(row) {
    return {
        id: row.id,
        jobType: row.job_type,
        dispensaryId: row.dispensary_id,
        status: row.status,
        priority: row.priority || 0,
        retryCount: row.retry_count || 0,
        maxRetries: row.max_retries || 3,
        claimedBy: row.claimed_by,
        claimedAt: row.claimed_at ? new Date(row.claimed_at) : null,
        workerHostname: row.worker_hostname,
        startedAt: row.started_at ? new Date(row.started_at) : null,
        completedAt: row.completed_at ? new Date(row.completed_at) : null,
        errorMessage: row.error_message,
        productsFound: row.products_found || 0,
        productsUpserted: row.products_upserted || 0,
        snapshotsCreated: row.snapshots_created || 0,
        currentPage: row.current_page || 0,
        totalPages: row.total_pages,
        lastHeartbeatAt: row.last_heartbeat_at ? new Date(row.last_heartbeat_at) : null,
        metadata: row.metadata,
        createdAt: new Date(row.created_at),
        // Add extra fields from join if present
        ...(row.dispensary_name && { dispensaryName: row.dispensary_name }),
        ...(row.city && { city: row.city }),
    };
}
837
backend/dist/dutchie-az/services/menu-detection.js
vendored
Normal file
@@ -0,0 +1,837 @@
"use strict";
|
||||
/**
|
||||
* Menu Detection Service
|
||||
*
|
||||
* Detects menu provider (dutchie, treez, jane, etc.) from dispensary menu_url
|
||||
* and resolves platform_dispensary_id for dutchie stores.
|
||||
*
|
||||
* This service:
|
||||
* 1. Iterates dispensaries with unknown/missing menu_type or platform_dispensary_id
|
||||
* 2. Detects provider from menu_url patterns
|
||||
* 3. For dutchie: extracts cName and resolves platform_dispensary_id via GraphQL
|
||||
* 4. Logs results to job_run_logs
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.crawlWebsiteForMenuLinks = crawlWebsiteForMenuLinks;
|
||||
exports.detectProviderFromUrl = detectProviderFromUrl;
|
||||
exports.detectAndResolveDispensary = detectAndResolveDispensary;
|
||||
exports.runBulkDetection = runBulkDetection;
|
||||
exports.executeMenuDetectionJob = executeMenuDetectionJob;
|
||||
exports.getDetectionStats = getDetectionStats;
|
||||
exports.getDispensariesNeedingDetection = getDispensariesNeedingDetection;
|
||||
const connection_1 = require("../db/connection");
|
||||
const discovery_1 = require("./discovery");
|
||||
const graphql_client_1 = require("./graphql-client");
|
||||
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
|
||||
const DISPENSARY_COLUMNS = `
|
||||
id, name, slug, city, state, zip, address, latitude, longitude,
|
||||
menu_type, menu_url, platform_dispensary_id, website,
|
||||
provider_detection_data, created_at, updated_at
|
||||
`;
|
||||
// ============================================================
|
||||
// PROVIDER DETECTION PATTERNS
|
||||
// ============================================================
|
||||
const PROVIDER_URL_PATTERNS = [
|
||||
// IMPORTANT: Curaleaf and Sol must come BEFORE dutchie to take precedence
|
||||
// These stores have their own proprietary menu systems (not crawlable via Dutchie)
|
||||
{
|
||||
provider: 'curaleaf',
|
||||
patterns: [
|
||||
/curaleaf\.com\/stores\//i, // e.g., https://curaleaf.com/stores/curaleaf-az-glendale-east
|
||||
/curaleaf\.com\/dispensary\//i, // e.g., https://curaleaf.com/dispensary/arizona
|
||||
],
|
||||
},
|
||||
{
|
||||
provider: 'sol',
|
||||
patterns: [
|
||||
/livewithsol\.com/i, // e.g., https://www.livewithsol.com/locations/sun-city/
|
||||
/solflower\.com/i, // alternate domain if any
|
||||
],
|
||||
},
|
||||
{
|
||||
provider: 'dutchie',
|
||||
patterns: [
|
||||
/dutchie\.com/i,
|
||||
/\/embedded-menu\//i,
|
||||
/\/dispensary\/[A-Z]{2}-/i, // e.g., /dispensary/AZ-store-name
|
||||
/dutchie-plus/i,
|
||||
],
|
||||
},
|
||||
{
|
||||
provider: 'treez',
|
||||
patterns: [
|
||||
/treez\.io/i,
|
||||
/shop\.treez/i,
|
||||
/treez-ecommerce/i,
|
||||
],
|
||||
},
|
||||
{
|
||||
provider: 'jane',
|
||||
patterns: [
|
||||
/jane\.co/i,
|
||||
/iheartjane\.com/i,
|
||||
/embed\.iheartjane/i,
|
||||
],
|
||||
},
|
||||
{
|
||||
provider: 'weedmaps',
|
||||
patterns: [
|
||||
/weedmaps\.com/i,
|
||||
/menu\.weedmaps/i,
|
||||
],
|
||||
},
|
||||
{
|
||||
provider: 'leafly',
|
||||
patterns: [
|
||||
/leafly\.com/i,
|
||||
/order\.leafly/i,
|
||||
],
|
||||
},
|
||||
{
|
||||
provider: 'meadow',
|
||||
patterns: [
|
||||
/getmeadow\.com/i,
|
||||
/meadow\.co/i,
|
||||
],
|
||||
},
|
||||
{
|
||||
provider: 'blaze',
|
||||
patterns: [
|
||||
/blaze\.me/i,
|
||||
/blazepos\.com/i,
|
||||
],
|
||||
},
|
||||
{
|
||||
provider: 'flowhub',
|
||||
patterns: [
|
||||
/flowhub\.com/i,
|
||||
/flowhub\.co/i,
|
||||
],
|
||||
},
|
||||
{
|
||||
provider: 'dispense',
|
||||
patterns: [
|
||||
/dispense\.io/i,
|
||||
/dispenseapp\.com/i,
|
||||
],
|
||||
},
|
||||
];
|
||||
/**
|
||||
* Link patterns that suggest a menu or ordering page
|
||||
*/
|
||||
const MENU_LINK_PATTERNS = [
|
||||
/\/menu/i,
|
||||
/\/order/i,
|
||||
/\/shop/i,
|
||||
/\/products/i,
|
||||
/\/dispensary/i,
|
||||
/\/store/i,
|
||||
/curaleaf\.com/i,
|
||||
/dutchie\.com/i,
|
||||
/treez\.io/i,
|
||||
/jane\.co/i,
|
||||
/iheartjane\.com/i,
|
||||
/weedmaps\.com/i,
|
||||
/leafly\.com/i,
|
||||
/getmeadow\.com/i,
|
||||
/blaze\.me/i,
|
||||
/flowhub\.com/i,
|
||||
/dispense\.io/i,
|
||||
];
|
||||
/**
 * Check if a URL is a Curaleaf store URL
 */
function isCuraleafUrl(url) {
    if (!url)
        return false;
    return /curaleaf\.com\/(stores|dispensary)\//i.test(url);
}
/**
 * Extract the Curaleaf store URL from a website URL
 * Handles both /stores/ and /dispensary/ formats
 */
function extractCuraleafStoreUrl(url) {
    if (!url)
        return null;
    // If it's already a Curaleaf stores/dispensary URL, use it
    if (isCuraleafUrl(url)) {
        return url;
    }
    return null;
}
/**
 * Fetch a page and extract all links
 */
async function fetchPageLinks(url, timeout = 10000) {
    try {
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), timeout);
        const response = await fetch(url, {
            signal: controller.signal,
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            },
            redirect: 'follow',
        });
        clearTimeout(timeoutId);
        if (!response.ok) {
            return { links: [], error: `HTTP ${response.status}` };
        }
        const html = await response.text();
        // Extract all href attributes from anchor tags
        const linkRegex = /href=["']([^"']+)["']/gi;
        const links = [];
        let match;
        while ((match = linkRegex.exec(html)) !== null) {
            const href = match[1];
            // Convert relative URLs to absolute
            try {
                const absoluteUrl = new URL(href, url).href;
                links.push(absoluteUrl);
            }
            catch {
                // Skip invalid URLs
            }
        }
        // Also scan src attributes (the regex matches any src, not just iframes,
        // but embedded menus usually arrive via iframes)
        const iframeRegex = /src=["']([^"']+)["']/gi;
        while ((match = iframeRegex.exec(html)) !== null) {
            const src = match[1];
            try {
                const absoluteUrl = new URL(src, url).href;
                // Only add if it matches a provider pattern
                for (const { patterns } of PROVIDER_URL_PATTERNS) {
                    if (patterns.some(p => p.test(absoluteUrl))) {
                        links.push(absoluteUrl);
                        break;
                    }
                }
            }
            catch {
                // Skip invalid URLs
            }
        }
        return { links: [...new Set(links)] }; // Deduplicate
    }
    catch (error) {
        if (error.name === 'AbortError') {
            return { links: [], error: 'Timeout' };
        }
        return { links: [], error: error.message };
    }
}
/**
 * Crawl a dispensary's website to find menu provider links
 *
 * Strategy:
 * 1. Fetch the homepage and extract all links
 * 2. Look for links that match known provider patterns (dutchie, treez, etc.)
 * 3. If no direct match, look for menu/order/shop links and follow them (1-2 hops)
 * 4. Check followed pages for provider patterns
 */
async function crawlWebsiteForMenuLinks(websiteUrl) {
    console.log(`[WebsiteCrawl] Crawling ${websiteUrl} for menu links...`);
    const result = {
        menuUrl: null,
        provider: 'unknown',
        foundLinks: [],
        crawledPages: [],
    };
    // Normalize URL. new URL() throws on protocol-less input like "example.com",
    // so retry with an https:// prefix before giving up.
    let baseUrl;
    try {
        baseUrl = new URL(websiteUrl);
    }
    catch {
        try {
            baseUrl = new URL(`https://${websiteUrl}`);
        }
        catch {
            result.error = 'Invalid website URL';
            return result;
        }
    }
    // Step 1: Fetch the homepage
    const homepage = baseUrl.href;
    result.crawledPages.push(homepage);
    const { links: homepageLinks, error: homepageError } = await fetchPageLinks(homepage);
    if (homepageError) {
        result.error = `Failed to fetch homepage: ${homepageError}`;
        return result;
    }
    result.foundLinks = homepageLinks;
    // Step 2: Check for direct provider matches in homepage links
    for (const link of homepageLinks) {
        for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
            if (patterns.some(p => p.test(link))) {
                console.log(`[WebsiteCrawl] Found ${provider} link on homepage: ${link}`);
                result.menuUrl = link;
                result.provider = provider;
                return result;
            }
        }
    }
    // Step 3: Find menu/order/shop links to follow
    const menuLinks = homepageLinks.filter(link => {
        // Must be same domain or a known provider domain
        try {
            const linkUrl = new URL(link);
            const isSameDomain = linkUrl.hostname === baseUrl.hostname ||
                linkUrl.hostname.endsWith(`.${baseUrl.hostname}`);
            const isProviderDomain = PROVIDER_URL_PATTERNS.some(({ patterns }) => patterns.some(p => p.test(link)));
            const isMenuPath = MENU_LINK_PATTERNS.some(p => p.test(link));
            return (isSameDomain && isMenuPath) || isProviderDomain;
        }
        catch {
            return false;
        }
    });
    console.log(`[WebsiteCrawl] Found ${menuLinks.length} potential menu links to follow`);
    // Step 4: Follow menu links (limit to 3 to avoid excessive crawling)
    for (const menuLink of menuLinks.slice(0, 3)) {
        // Skip if we've already crawled this page
        if (result.crawledPages.includes(menuLink))
            continue;
        // Check if this link itself is a provider URL
        for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
            if (patterns.some(p => p.test(menuLink))) {
                console.log(`[WebsiteCrawl] Menu link is a ${provider} URL: ${menuLink}`);
                result.menuUrl = menuLink;
                result.provider = provider;
                return result;
            }
        }
        result.crawledPages.push(menuLink);
        // Rate limit
        await new Promise(r => setTimeout(r, 500));
        const { links: pageLinks, error: pageError } = await fetchPageLinks(menuLink);
        if (pageError) {
            console.log(`[WebsiteCrawl] Failed to fetch ${menuLink}: ${pageError}`);
            continue;
        }
        result.foundLinks.push(...pageLinks);
        // Check for provider matches on this page
        for (const link of pageLinks) {
            for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
                if (patterns.some(p => p.test(link))) {
                    console.log(`[WebsiteCrawl] Found ${provider} link on ${menuLink}: ${link}`);
                    result.menuUrl = link;
                    result.provider = provider;
                    return result;
                }
            }
        }
    }
    console.log(`[WebsiteCrawl] No menu provider found on ${websiteUrl}`);
    return result;
}
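// Usage sketch (hypothetical, not part of the source):
//
//   const crawl = await crawlWebsiteForMenuLinks('https://example-dispensary.com');
//   if (crawl.provider !== 'unknown') {
//       console.log(`Menu found via ${crawl.provider}: ${crawl.menuUrl}`);
//   }
//   else {
//       console.log(`No provider found after ${crawl.crawledPages.length} page(s)`, crawl.error);
//   }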
// ============================================================
// CORE DETECTION FUNCTIONS
// ============================================================
/**
 * Detect menu provider from a URL
 */
function detectProviderFromUrl(menuUrl) {
    if (!menuUrl)
        return 'unknown';
    for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
        for (const pattern of patterns) {
            if (pattern.test(menuUrl)) {
                return provider;
            }
        }
    }
    // Check if it's a custom website (has a domain but doesn't match known providers)
    try {
        const url = new URL(menuUrl);
        if (url.hostname && !url.hostname.includes('localhost')) {
            return 'custom';
        }
    }
    catch {
        // Invalid URL
    }
    return 'unknown';
}
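// Illustrative expectations, derived from the patterns above (URLs are examples):
//
//   detectProviderFromUrl('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted');     // => 'dutchie'
//   detectProviderFromUrl('https://curaleaf.com/stores/curaleaf-az-glendale-east');  // => 'curaleaf'
//   detectProviderFromUrl('https://www.some-dispensary.com/menu');                   // => 'custom'
//   detectProviderFromUrl('not a url');                                              // => 'unknown'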
/**
 * Detect provider and resolve platform ID for a single dispensary
 */
async function detectAndResolveDispensary(dispensaryId) {
    console.log(`[MenuDetection] Processing dispensary ${dispensaryId}...`);
    // Get dispensary record
    const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [dispensaryId]);
    if (rows.length === 0) {
        return {
            dispensaryId,
            dispensaryName: 'Unknown',
            previousMenuType: null,
            detectedProvider: 'unknown',
            cName: null,
            platformDispensaryId: null,
            success: false,
            error: 'Dispensary not found',
        };
    }
    const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]);
    let menuUrl = dispensary.menuUrl;
    const previousMenuType = dispensary.menuType || null;
    const website = dispensary.website;
    // ============================================================
    // CURALEAF CHECK: If website is Curaleaf, override any stale Dutchie menu_url
    // This prevents 60s Dutchie timeouts for stores that have migrated to Curaleaf's platform
    // ============================================================
    if (isCuraleafUrl(website)) {
        console.log(`[MenuDetection] ${dispensary.name}: Website is Curaleaf - marking as curaleaf provider`);
        // Use the Curaleaf website URL as the menu_url (clearing stale Dutchie URL if any)
        // At this point we know website is defined since isCuraleafUrl returned true
        const curaleafUrl = extractCuraleafStoreUrl(website) || website;
        await (0, connection_1.query)(`
      UPDATE dispensaries SET
        menu_type = 'curaleaf',
        menu_url = $1,
        platform_dispensary_id = NULL,
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', 'curaleaf'::text,
            'detection_method', 'website_pattern'::text,
            'detected_at', NOW(),
            'curaleaf_store_url', $1::text,
            'stale_dutchie_url', $2::text,
            'not_crawlable', true,
            'not_crawlable_reason', 'Curaleaf proprietary menu - no Dutchie integration'::text
          ),
        updated_at = NOW()
      WHERE id = $3
    `, [curaleafUrl, menuUrl || null, dispensaryId]);
        return {
            dispensaryId,
            dispensaryName: dispensary.name,
            previousMenuType,
            detectedProvider: 'curaleaf',
            cName: null,
            platformDispensaryId: null,
            success: true,
            error: undefined,
        };
    }
    // If menu_url is null or empty, try to discover it by crawling the dispensary website
    if (!menuUrl || menuUrl.trim() === '') {
        console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`);
        // Check if website is available
        if (!website || website.trim() === '') {
            console.log(`[MenuDetection] ${dispensary.name}: No website available - marking as not crawlable`);
            await (0, connection_1.query)(`
        UPDATE dispensaries SET
          menu_type = 'unknown',
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'unknown'::text,
              'detection_method', 'no_data'::text,
              'detected_at', NOW(),
              'resolution_error', 'No menu_url and no website available'::text,
              'not_crawlable', true,
              'website_crawl_attempted', false
            ),
          updated_at = NOW()
        WHERE id = $1
      `, [dispensaryId]);
            return {
                dispensaryId,
                dispensaryName: dispensary.name,
                previousMenuType,
                detectedProvider: 'unknown',
                cName: null,
                platformDispensaryId: null,
                success: true,
                error: 'No menu_url and no website available - marked as not crawlable',
            };
        }
        // Crawl the website to find menu provider links
        console.log(`[MenuDetection] ${dispensary.name}: Crawling website ${website} for menu links...`);
        const crawlResult = await crawlWebsiteForMenuLinks(website);
        if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
            // SUCCESS: Found a menu URL from website crawl!
            console.log(`[MenuDetection] ${dispensary.name}: Found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
            menuUrl = crawlResult.menuUrl;
            // Update the dispensary with the discovered menu_url
            await (0, connection_1.query)(`
        UPDATE dispensaries SET
          menu_url = $1,
          menu_type = $2,
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', $2::text,
              'detection_method', 'website_crawl'::text,
              'detected_at', NOW(),
              'website_crawled', $3::text,
              'website_crawl_pages', $4::jsonb,
              'not_crawlable', false
            ),
          updated_at = NOW()
        WHERE id = $5
      `, [
                crawlResult.menuUrl,
                crawlResult.provider,
                website,
                JSON.stringify(crawlResult.crawledPages),
                dispensaryId
            ]);
            // Continue with full detection flow using the discovered menu_url
        }
        else {
            // Website crawl failed to find a menu provider
            const errorReason = crawlResult.error || 'No menu provider links found on website';
            console.log(`[MenuDetection] ${dispensary.name}: Website crawl failed - ${errorReason}`);
            await (0, connection_1.query)(`
        UPDATE dispensaries SET
          menu_type = 'unknown',
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'unknown'::text,
              'detection_method', 'website_crawl'::text,
              'detected_at', NOW(),
              'website_crawled', $1::text,
              'website_crawl_pages', $2::jsonb,
              'resolution_error', $3::text,
              'not_crawlable', true
            ),
          updated_at = NOW()
        WHERE id = $4
      `, [
                website,
                JSON.stringify(crawlResult.crawledPages),
                errorReason,
                dispensaryId
            ]);
            return {
                dispensaryId,
                dispensaryName: dispensary.name,
                previousMenuType,
                detectedProvider: 'unknown',
                cName: null,
                platformDispensaryId: null,
                success: true,
                error: `Website crawl failed: ${errorReason}`,
            };
        }
    }
    // Detect provider from URL
    const detectedProvider = detectProviderFromUrl(menuUrl);
    console.log(`[MenuDetection] ${dispensary.name}: Detected provider = ${detectedProvider} from URL: ${menuUrl}`);
    // Initialize result
    const result = {
        dispensaryId,
        dispensaryName: dispensary.name,
        previousMenuType,
        detectedProvider,
        cName: null,
        platformDispensaryId: null,
        success: false,
    };
    // If not dutchie, just update menu_type and return
    if (detectedProvider !== 'dutchie') {
        // Special handling for proprietary providers - mark as not_crawlable until we have crawlers
        const PROPRIETARY_PROVIDERS = ['curaleaf', 'sol'];
        const isProprietaryProvider = PROPRIETARY_PROVIDERS.includes(detectedProvider);
        const notCrawlableReason = isProprietaryProvider
            ? `${detectedProvider} proprietary menu - no crawler available`
            : null;
        await (0, connection_1.query)(`
      UPDATE dispensaries SET
        menu_type = $1,
        platform_dispensary_id = CASE WHEN $3 THEN NULL ELSE platform_dispensary_id END,
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', $1::text,
            'detection_method', 'url_pattern'::text,
            'detected_at', NOW(),
            'not_crawlable', $3,
            'not_crawlable_reason', $4::text
          ),
        updated_at = NOW()
      WHERE id = $2
    `, [detectedProvider, dispensaryId, isProprietaryProvider, notCrawlableReason]);
        result.success = true;
        console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}${isProprietaryProvider ? ' (not crawlable)' : ''}`);
        return result;
    }
    // For dutchie: extract cName and resolve platform ID
    const cName = (0, discovery_1.extractCNameFromMenuUrl)(menuUrl);
    result.cName = cName;
    if (!cName) {
        result.error = `Could not extract cName from menu_url: ${menuUrl}`;
        await (0, connection_1.query)(`
      UPDATE dispensaries SET
        menu_type = 'dutchie',
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', 'dutchie'::text,
            'detection_method', 'url_pattern'::text,
            'detected_at', NOW(),
            'resolution_error', $1::text,
            'not_crawlable', true
          ),
        updated_at = NOW()
      WHERE id = $2
    `, [result.error, dispensaryId]);
        console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
        return result;
    }
    // Resolve platform_dispensary_id from cName
    console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`);
    try {
        const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName);
        if (platformId) {
            result.platformDispensaryId = platformId;
            result.success = true;
            await (0, connection_1.query)(`
        UPDATE dispensaries SET
          menu_type = 'dutchie',
          platform_dispensary_id = $1,
          platform_dispensary_id_resolved_at = NOW(),
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'dutchie'::text,
              'detection_method', 'url_pattern'::text,
              'detected_at', NOW(),
              'cname_extracted', $2::text,
              'platform_id_resolved', true,
              'resolution_error', NULL::text,
              'not_crawlable', false
            ),
          updated_at = NOW()
        WHERE id = $3
      `, [platformId, cName, dispensaryId]);
            console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
        }
        else {
            result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`;
            await (0, connection_1.query)(`
        UPDATE dispensaries SET
          menu_type = 'dutchie',
          platform_dispensary_id = NULL,
          provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
            jsonb_build_object(
              'detected_provider', 'dutchie'::text,
              'detection_method', 'url_pattern'::text,
              'detected_at', NOW(),
              'cname_extracted', $1::text,
              'platform_id_resolved', false,
              'resolution_error', $2::text,
              'not_crawlable', true
            ),
          updated_at = NOW()
        WHERE id = $3
      `, [cName, result.error, dispensaryId]);
            console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
        }
    }
    catch (error) {
        result.error = `Resolution failed: ${error.message}`;
        await (0, connection_1.query)(`
      UPDATE dispensaries SET
        menu_type = 'dutchie',
        provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
          jsonb_build_object(
            'detected_provider', 'dutchie'::text,
            'detection_method', 'url_pattern'::text,
            'detected_at', NOW(),
            'cname_extracted', $1::text,
            'platform_id_resolved', false,
            'resolution_error', $2::text,
            'not_crawlable', true
          ),
        updated_at = NOW()
      WHERE id = $3
    `, [cName, result.error, dispensaryId]);
        console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
    }
    return result;
}
/**
 * Run bulk detection on all dispensaries with unknown/missing menu_type or platform_dispensary_id
 * Also includes dispensaries with no menu_url but with a website (for website crawl discovery)
 */
async function runBulkDetection(options = {}) {
    const { state, onlyUnknown = true, onlyMissingPlatformId = false, includeWebsiteCrawl = true, limit } = options;
    console.log('[MenuDetection] Starting bulk detection...');
    // Build query to find dispensaries needing detection
    // Now includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
    let whereClause = `WHERE (
    menu_url IS NOT NULL
    ${includeWebsiteCrawl ? `OR (
      menu_url IS NULL
      AND website IS NOT NULL
      AND website != ''
      AND (provider_detection_data IS NULL OR NOT (provider_detection_data->>'not_crawlable')::boolean)
    )` : ''}
  )`;
    const params = [];
    let paramIndex = 1;
    if (state) {
        whereClause += ` AND state = $${paramIndex++}`;
        params.push(state);
    }
    if (onlyUnknown) {
        whereClause += ` AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')`;
    }
    if (onlyMissingPlatformId) {
        whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
    }
    let query_str = `
    SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
    ${whereClause}
    ORDER BY name
  `;
    if (limit) {
        query_str += ` LIMIT $${paramIndex}`;
        params.push(limit);
    }
    const { rows: dispensaries } = await (0, connection_1.query)(query_str, params);
    console.log(`[MenuDetection] Found ${dispensaries.length} dispensaries to process (includeWebsiteCrawl=${includeWebsiteCrawl})`);
    const result = {
        totalProcessed: 0,
        totalSucceeded: 0,
        totalFailed: 0,
        totalSkipped: 0,
        results: [],
        errors: [],
    };
    for (const row of dispensaries) {
        result.totalProcessed++;
        try {
            const detectionResult = await detectAndResolveDispensary(row.id);
            result.results.push(detectionResult);
            if (detectionResult.success) {
                result.totalSucceeded++;
            }
            else {
                result.totalFailed++;
                if (detectionResult.error) {
                    result.errors.push(`${detectionResult.dispensaryName}: ${detectionResult.error}`);
                }
            }
            // Rate limit between requests
            await new Promise(r => setTimeout(r, 1000));
        }
        catch (error) {
            result.totalFailed++;
            result.errors.push(`${row.name || row.id}: ${error.message}`);
        }
    }
    console.log(`[MenuDetection] Bulk detection complete: ${result.totalSucceeded} succeeded, ${result.totalFailed} failed`);
    return result;
}
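// Usage sketch (hypothetical, not part of the source): limit a manual run to a
// handful of Arizona stores that still lack a platform ID.
//
//   const summary = await runBulkDetection({
//       state: 'AZ',
//       onlyUnknown: false,
//       onlyMissingPlatformId: true,
//       limit: 25,
//   });
//   console.log(summary.totalSucceeded, 'resolved;', summary.errors.length, 'errors');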
// ============================================================
// SCHEDULED JOB EXECUTOR
// ============================================================
/**
 * Execute the menu detection job (called by scheduler)
 */
async function executeMenuDetectionJob(config = {}) {
    const state = config.state || 'AZ';
    const onlyUnknown = config.onlyUnknown !== false;
    const onlyMissingPlatformId = config.onlyMissingPlatformId || false;
    console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);
    try {
        const result = await runBulkDetection({
            state,
            onlyUnknown,
            onlyMissingPlatformId,
        });
        const status = result.totalFailed === 0 ? 'success' :
            result.totalSucceeded === 0 ? 'error' : 'partial';
        return {
            status,
            itemsProcessed: result.totalProcessed,
            itemsSucceeded: result.totalSucceeded,
            itemsFailed: result.totalFailed,
            errorMessage: result.errors.length > 0 ? result.errors.slice(0, 5).join('; ') : undefined,
            metadata: {
                state,
                onlyUnknown,
                onlyMissingPlatformId,
                providerCounts: countByProvider(result.results),
            },
        };
    }
    catch (error) {
        return {
            status: 'error',
            itemsProcessed: 0,
            itemsSucceeded: 0,
            itemsFailed: 0,
            errorMessage: error.message,
        };
    }
}
/**
 * Count results by detected provider
 */
function countByProvider(results) {
    const counts = {};
    for (const r of results) {
        counts[r.detectedProvider] = (counts[r.detectedProvider] || 0) + 1;
    }
    return counts;
}
// ============================================================
// UTILITY FUNCTIONS
// ============================================================
/**
 * Get detection stats for dashboard
 */
async function getDetectionStats() {
    const { rows } = await (0, connection_1.query)(`
    SELECT
      COUNT(*) as total,
      COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != '' AND menu_type != 'unknown') as with_menu_type,
      COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
      COUNT(*) FILTER (WHERE menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')) as needs_detection
    FROM dispensaries
    WHERE state = 'AZ'
  `);
    const stats = rows[0] || {};
    // Get provider breakdown
    const { rows: providerRows } = await (0, connection_1.query)(`
    SELECT menu_type, COUNT(*) as count
    FROM dispensaries
    WHERE state = 'AZ' AND menu_type IS NOT NULL AND menu_type != ''
    GROUP BY menu_type
    ORDER BY count DESC
  `);
    const byProvider = {};
    for (const row of providerRows) {
        byProvider[row.menu_type] = parseInt(row.count, 10);
    }
    return {
        totalDispensaries: parseInt(stats.total || '0', 10),
        withMenuType: parseInt(stats.with_menu_type || '0', 10),
        withPlatformId: parseInt(stats.with_platform_id || '0', 10),
        needsDetection: parseInt(stats.needs_detection || '0', 10),
        byProvider,
    };
}
/**
 * Get dispensaries needing detection
 * Includes dispensaries with website but no menu_url for website crawl discovery
 */
async function getDispensariesNeedingDetection(options = {}) {
    const { state = 'AZ', limit = 100, includeWebsiteCrawl = true } = options;
    const { rows } = await (0, connection_1.query)(`
    SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
    WHERE state = $1
      AND (
        (menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown'
          OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)))
        ${includeWebsiteCrawl ? `OR (
          menu_url IS NULL
          AND website IS NOT NULL
          AND website != ''
          AND (provider_detection_data IS NULL OR NOT (provider_detection_data->>'not_crawlable')::boolean)
        )` : ''}
      )
    ORDER BY name
    LIMIT $2
  `, [state, limit]);
    return rows.map(discovery_1.mapDbRowToDispensary);
}
843
backend/dist/dutchie-az/services/product-crawler.js
vendored
Normal file
@@ -0,0 +1,843 @@
"use strict";
|
||||
/**
|
||||
* Dutchie AZ Product Crawler Service
|
||||
*
|
||||
* Crawls products from Dutchie dispensaries and stores them in the dutchie_az database.
|
||||
* Handles normalization from GraphQL response to database entities.
|
||||
*
|
||||
* IMPORTANT: Uses chunked batch processing per CLAUDE.md Rule #15 to avoid OOM.
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.normalizeProduct = normalizeProduct;
|
||||
exports.normalizeSnapshot = normalizeSnapshot;
|
||||
exports.crawlDispensaryProducts = crawlDispensaryProducts;
|
||||
exports.crawlAllArizonaDispensaries = crawlAllArizonaDispensaries;
|
||||
const connection_1 = require("../db/connection");
|
||||
const graphql_client_1 = require("./graphql-client");
|
||||
const discovery_1 = require("./discovery");
|
||||
const types_1 = require("../types");
|
||||
const image_storage_1 = require("../../utils/image-storage");
|
||||
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
|
||||
const DISPENSARY_COLUMNS = `
|
||||
id, name, slug, city, state, zip, address, latitude, longitude,
|
||||
menu_type, menu_url, platform_dispensary_id, website,
|
||||
provider_detection_data, created_at, updated_at
|
||||
`;
|
||||
// ============================================================
|
||||
// BATCH PROCESSING CONFIGURATION
|
||||
// ============================================================
|
||||
/** Chunk size for batch DB writes (per CLAUDE.md Rule #15) */
|
||||
const BATCH_CHUNK_SIZE = 100;
|
||||
// ============================================================
|
||||
// NORMALIZATION FUNCTIONS
|
||||
// ============================================================
|
||||
/**
|
||||
* Convert price to cents
|
||||
*/
|
||||
function toCents(price) {
|
||||
if (price === undefined || price === null)
|
||||
return undefined;
|
||||
return Math.round(price * 100);
|
||||
}
|
||||
/**
|
||||
* Get min value from array of numbers
|
||||
*/
|
||||
function getMin(arr) {
|
||||
if (!arr || arr.length === 0)
|
||||
return undefined;
|
||||
return Math.min(...arr.filter((n) => n !== null && n !== undefined));
|
||||
}
|
||||
/**
|
||||
* Get max value from array of numbers
|
||||
*/
|
||||
function getMax(arr) {
|
||||
if (!arr || arr.length === 0)
|
||||
return undefined;
|
||||
return Math.max(...arr.filter((n) => n !== null && n !== undefined));
|
||||
}
|
||||
/**
|
||||
* Normalize a value to boolean
|
||||
* Handles Dutchie API returning {} or [] or other non-boolean values
|
||||
* that would cause "invalid input syntax for type boolean" errors
|
||||
*/
|
||||
function normBool(v, defaultVal = false) {
|
||||
if (v === true)
|
||||
return true;
|
||||
if (v === false)
|
||||
return false;
|
||||
// Log unexpected object/array values once for debugging
|
||||
if (v !== null && v !== undefined && typeof v === 'object') {
|
||||
console.warn(`[normBool] Unexpected object value, coercing to ${defaultVal}:`, JSON.stringify(v));
|
||||
}
|
||||
return defaultVal;
|
||||
}
|
||||
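// Illustrative expectations for the guards above (not part of the source):
//
//   toCents(12.99);          // => 1299
//   getMin([null, 5, 3]);    // => 3
//   getMin([null, null]);    // => undefined (not Infinity, thanks to the filter guard)
//   normBool({});            // => false, with a one-line warning logged
//   normBool(1, true);       // => true (non-boolean scalar falls back to the default)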
/**
|
||||
* Normalize a value to Date or undefined
|
||||
* Handles Dutchie API returning {} or [] or other non-date values
|
||||
* that would cause "invalid input syntax for type timestamp" errors
|
||||
*/
|
||||
function normDate(v) {
|
||||
if (!v)
|
||||
return undefined;
|
||||
// Reject objects/arrays that aren't dates
|
||||
if (typeof v === 'object' && !(v instanceof Date)) {
|
||||
console.warn(`[normDate] Unexpected object value, ignoring:`, JSON.stringify(v));
|
||||
return undefined;
|
||||
}
|
||||
// Try parsing
|
||||
const d = new Date(v);
|
||||
if (isNaN(d.getTime())) {
|
||||
console.warn(`[normDate] Invalid date value, ignoring:`, v);
|
||||
return undefined;
|
||||
}
|
||||
return d;
|
||||
}
|
||||
/**
|
||||
* Extract cName (Dutchie slug) from menuUrl or dispensary slug
|
||||
* Handles URL formats:
|
||||
* - https://dutchie.com/embedded-menu/AZ-Deeply-Rooted -> AZ-Deeply-Rooted
|
||||
* - https://dutchie.com/dispensary/sol-flower-dispensary-mcclintock -> sol-flower-dispensary-mcclintock
|
||||
* Falls back to dispensary.slug if menuUrl extraction fails
|
||||
*/
|
||||
function extractCName(dispensary) {
|
||||
if (dispensary.menuUrl) {
|
||||
try {
|
||||
const url = new URL(dispensary.menuUrl);
|
||||
// Extract last path segment: /embedded-menu/X or /dispensary/X
|
||||
const segments = url.pathname.split('/').filter(Boolean);
|
||||
if (segments.length >= 2) {
|
||||
const cName = segments[segments.length - 1];
|
||||
if (cName) {
|
||||
console.log(`[ProductCrawler] Extracted cName "${cName}" from menuUrl`);
|
||||
return cName;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
console.warn(`[ProductCrawler] Failed to parse menuUrl: ${dispensary.menuUrl}`);
|
||||
}
|
||||
}
|
||||
// Fallback to slug
|
||||
console.log(`[ProductCrawler] Using dispensary slug "${dispensary.slug}" as cName`);
|
||||
return dispensary.slug;
|
||||
}
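// Illustrative sketch (reading aid, not part of the compiled bundle): the two
// menuUrl shapes documented above, plus the slug fallback. The dispensary
// objects are hypothetical.
//
//   extractCName({ menuUrl: 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted', slug: 'x' })
//     // -> 'AZ-Deeply-Rooted'
//   extractCName({ menuUrl: 'https://dutchie.com/dispensary/sol-flower-dispensary-mcclintock', slug: 'x' })
//     // -> 'sol-flower-dispensary-mcclintock'
//   extractCName({ menuUrl: 'not a url', slug: 'fallback-slug' })
//     // -> 'fallback-slug' (URL constructor throws, caught, slug used)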
/**
 * Normalize a POSMetaData.children entry to DutchieProductOptionSnapshot
 */
function normalizeOption(child) {
    return {
        optionId: child.canonicalID || child.canonicalPackageId || child.canonicalSKU || child.option || 'unknown',
        canonicalId: child.canonicalID,
        canonicalPackageId: child.canonicalPackageId,
        canonicalSKU: child.canonicalSKU,
        canonicalName: child.canonicalName,
        canonicalCategory: child.canonicalCategory,
        canonicalCategoryId: child.canonicalCategoryId,
        canonicalBrandId: child.canonicalBrandId,
        canonicalBrandName: child.canonicalBrandName,
        canonicalStrainId: child.canonicalStrainId,
        canonicalVendorId: child.canonicalVendorId,
        optionLabel: child.option,
        packageQuantity: child.packageQuantity,
        recEquivalent: child.recEquivalent,
        standardEquivalent: child.standardEquivalent,
        priceCents: toCents(child.price),
        recPriceCents: toCents(child.recPrice),
        medPriceCents: toCents(child.medPrice),
        quantity: child.quantity,
        quantityAvailable: child.quantityAvailable,
        kioskQuantityAvailable: child.kioskQuantityAvailable,
        activeBatchTags: child.activeBatchTags,
        canonicalImgUrl: child.canonicalImgUrl,
        canonicalLabResultUrl: child.canonicalLabResultUrl,
        canonicalEffectivePotencyMg: child.canonicalEffectivePotencyMg,
        rawChildPayload: child,
    };
}
/**
 * Normalize a raw Dutchie product to DutchieProduct (canonical identity)
 */
function normalizeProduct(raw, dispensaryId, platformDispensaryId) {
    return {
        dispensaryId,
        platform: 'dutchie',
        externalProductId: raw._id || raw.id || '',
        platformDispensaryId,
        cName: raw.cName,
        name: raw.Name,
        // Brand
        brandName: raw.brandName || raw.brand?.name,
        brandId: raw.brandId || raw.brand?.id,
        brandLogoUrl: raw.brandLogo || raw.brand?.imageUrl,
        // Classification
        type: raw.type,
        subcategory: raw.subcategory,
        strainType: raw.strainType,
        provider: raw.provider,
        // Potency
        thc: raw.THC,
        thcContent: raw.THCContent?.range?.[0],
        cbd: raw.CBD,
        cbdContent: raw.CBDContent?.range?.[0],
        cannabinoidsV2: raw.cannabinoidsV2,
        effects: raw.effects,
        // Status / flags
        status: raw.Status,
        medicalOnly: normBool(raw.medicalOnly, false),
        recOnly: normBool(raw.recOnly, false),
        featured: normBool(raw.featured, false),
        comingSoon: normBool(raw.comingSoon, false),
        certificateOfAnalysisEnabled: normBool(raw.certificateOfAnalysisEnabled, false),
        isBelowThreshold: normBool(raw.isBelowThreshold, false),
        isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false),
        optionsBelowThreshold: normBool(raw.optionsBelowThreshold, false),
        optionsBelowKioskThreshold: normBool(raw.optionsBelowKioskThreshold, false),
        // Derived stock status
        stockStatus: (0, types_1.deriveStockStatus)(raw),
        totalQuantityAvailable: (0, types_1.calculateTotalQuantity)(raw),
        // Images
        primaryImageUrl: raw.Image || raw.images?.[0]?.url,
        images: raw.images,
        // Misc
        measurements: raw.measurements,
        weight: typeof raw.weight === 'number' ? String(raw.weight) : raw.weight,
        pastCNames: raw.pastCNames,
        createdAtDutchie: normDate(raw.createdAt),
        updatedAtDutchie: normDate(raw.updatedAt),
        latestRawPayload: raw,
    };
}
/**
 * Normalize a raw Dutchie product to DutchieProductSnapshot (time-series data)
 */
function normalizeSnapshot(raw, dutchieProductId, dispensaryId, platformDispensaryId, pricingType, crawlMode = 'mode_a') {
    const children = raw.POSMetaData?.children || [];
    const options = children.map(normalizeOption);
    // Aggregate prices from various sources
    const recPrices = raw.recPrices || [];
    const medPrices = raw.medicalPrices || [];
    const recSpecialPrices = raw.recSpecialPrices || [];
    const medSpecialPrices = raw.medicalSpecialPrices || [];
    const wholesalePrices = raw.wholesalePrices || [];
    // Also consider child prices
    const childRecPrices = children.map((c) => c.recPrice).filter((p) => p !== undefined);
    const childMedPrices = children.map((c) => c.medPrice).filter((p) => p !== undefined);
    const childPrices = children.map((c) => c.price).filter((p) => p !== undefined);
    // Aggregate inventory - use calculateTotalQuantity for proper null handling
    const totalQty = (0, types_1.calculateTotalQuantity)(raw);
    const hasAnyKioskQty = children.some(c => typeof c.kioskQuantityAvailable === 'number');
    const totalKioskQty = hasAnyKioskQty
        ? children.reduce((sum, c) => sum + (c.kioskQuantityAvailable || 0), 0)
        : null;
    // Determine if on special
    const isOnSpecial = raw.special === true ||
        (raw.specialData?.saleSpecials && raw.specialData.saleSpecials.length > 0) ||
        (recSpecialPrices.length > 0 && recSpecialPrices[0] !== null) ||
        (medSpecialPrices.length > 0 && medSpecialPrices[0] !== null);
    return {
        dutchieProductId,
        dispensaryId,
        platformDispensaryId,
        externalProductId: raw._id || raw.id || '',
        pricingType,
        crawlMode,
        status: raw.Status,
        featured: normBool(raw.featured, false),
        special: normBool(isOnSpecial, false),
        medicalOnly: normBool(raw.medicalOnly, false),
        recOnly: normBool(raw.recOnly, false),
        // Product was present in feed
        isPresentInFeed: true,
        // Derived stock status
        stockStatus: (0, types_1.deriveStockStatus)(raw),
        // Price summary
        recMinPriceCents: toCents(getMin([...recPrices, ...childRecPrices, ...childPrices])),
        recMaxPriceCents: toCents(getMax([...recPrices, ...childRecPrices, ...childPrices])),
        recMinSpecialPriceCents: toCents(getMin(recSpecialPrices)),
        medMinPriceCents: toCents(getMin([...medPrices, ...childMedPrices])),
        medMaxPriceCents: toCents(getMax([...medPrices, ...childMedPrices])),
        medMinSpecialPriceCents: toCents(getMin(medSpecialPrices)),
        wholesaleMinPriceCents: toCents(getMin(wholesalePrices)),
        // Inventory summary - null = unknown, 0 = all OOS
        totalQuantityAvailable: totalQty,
        totalKioskQuantityAvailable: totalKioskQty,
        manualInventory: normBool(raw.manualInventory, false),
        isBelowThreshold: normBool(raw.isBelowThreshold, false),
        isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false),
        options,
        rawPayload: raw,
        crawledAt: new Date(),
    };
}
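// Illustrative sketch (reading aid, not part of the compiled bundle): how the
// rec price summary is derived. With hypothetical recPrices = [40, 35] and
// child option prices = [38], the snapshot stores:
//
//   recMinPriceCents = toCents(getMin([40, 35, 38]))  // -> 3500
//   recMaxPriceCents = toCents(getMax([40, 35, 38]))  // -> 4000
//
// i.e. top-level and per-option prices are pooled before taking min/max.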
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
 * Upsert a DutchieProduct record
 */
async function upsertProduct(product) {
    const result = await (0, connection_1.query)(`
    INSERT INTO dutchie_products (
      dispensary_id, platform, external_product_id, platform_dispensary_id,
      c_name, name, brand_name, brand_id, brand_logo_url,
      type, subcategory, strain_type, provider,
      thc, thc_content, cbd, cbd_content, cannabinoids_v2, effects,
      status, medical_only, rec_only, featured, coming_soon, certificate_of_analysis_enabled,
      is_below_threshold, is_below_kiosk_threshold, options_below_threshold, options_below_kiosk_threshold,
      stock_status, total_quantity_available,
      primary_image_url, images, measurements, weight, past_c_names,
      created_at_dutchie, updated_at_dutchie, latest_raw_payload, updated_at
    ) VALUES (
      $1, $2, $3, $4,
      $5, $6, $7, $8, $9,
      $10, $11, $12, $13,
      $14, $15, $16, $17, $18, $19,
      $20, $21, $22, $23, $24, $25,
      $26, $27, $28, $29,
      $30, $31,
      $32, $33, $34, $35, $36,
      $37, $38, $39, NOW()
    )
    ON CONFLICT (dispensary_id, external_product_id) DO UPDATE SET
      c_name = EXCLUDED.c_name,
      name = EXCLUDED.name,
      brand_name = EXCLUDED.brand_name,
      brand_id = EXCLUDED.brand_id,
      brand_logo_url = EXCLUDED.brand_logo_url,
      type = EXCLUDED.type,
      subcategory = EXCLUDED.subcategory,
      strain_type = EXCLUDED.strain_type,
      provider = EXCLUDED.provider,
      thc = EXCLUDED.thc,
      thc_content = EXCLUDED.thc_content,
      cbd = EXCLUDED.cbd,
      cbd_content = EXCLUDED.cbd_content,
      cannabinoids_v2 = EXCLUDED.cannabinoids_v2,
      effects = EXCLUDED.effects,
      status = EXCLUDED.status,
      medical_only = EXCLUDED.medical_only,
      rec_only = EXCLUDED.rec_only,
      featured = EXCLUDED.featured,
      coming_soon = EXCLUDED.coming_soon,
      certificate_of_analysis_enabled = EXCLUDED.certificate_of_analysis_enabled,
      is_below_threshold = EXCLUDED.is_below_threshold,
      is_below_kiosk_threshold = EXCLUDED.is_below_kiosk_threshold,
      options_below_threshold = EXCLUDED.options_below_threshold,
      options_below_kiosk_threshold = EXCLUDED.options_below_kiosk_threshold,
      stock_status = EXCLUDED.stock_status,
      total_quantity_available = EXCLUDED.total_quantity_available,
      primary_image_url = EXCLUDED.primary_image_url,
      images = EXCLUDED.images,
      measurements = EXCLUDED.measurements,
      weight = EXCLUDED.weight,
      past_c_names = EXCLUDED.past_c_names,
      created_at_dutchie = EXCLUDED.created_at_dutchie,
      updated_at_dutchie = EXCLUDED.updated_at_dutchie,
      latest_raw_payload = EXCLUDED.latest_raw_payload,
      updated_at = NOW()
    RETURNING id
  `, [
        product.dispensaryId,
        product.platform,
        product.externalProductId,
        product.platformDispensaryId,
        product.cName,
        product.name,
        product.brandName,
        product.brandId,
        product.brandLogoUrl,
        product.type,
        product.subcategory,
        product.strainType,
        product.provider,
        product.thc,
        product.thcContent,
        product.cbd,
        product.cbdContent,
        product.cannabinoidsV2 ? JSON.stringify(product.cannabinoidsV2) : null,
        product.effects ? JSON.stringify(product.effects) : null,
        product.status,
        product.medicalOnly,
        product.recOnly,
        product.featured,
        product.comingSoon,
        product.certificateOfAnalysisEnabled,
        product.isBelowThreshold,
        product.isBelowKioskThreshold,
        product.optionsBelowThreshold,
        product.optionsBelowKioskThreshold,
        product.stockStatus,
        product.totalQuantityAvailable,
        product.primaryImageUrl,
        product.images ? JSON.stringify(product.images) : null,
        product.measurements ? JSON.stringify(product.measurements) : null,
        product.weight,
        product.pastCNames,
        product.createdAtDutchie,
        product.updatedAtDutchie,
        product.latestRawPayload ? JSON.stringify(product.latestRawPayload) : null,
    ]);
    return result.rows[0].id;
}
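// Note on the upsert semantics above (a reading aid, not new behavior):
// (dispensary_id, external_product_id) is the conflict key, so re-crawling a
// store updates rows in place while first-seen products insert. A minimal
// hypothetical call (the IDs are made up):
//
//   const id = await upsertProduct(normalizeProduct(raw, 42, 'abc123'));
//   // crawling the same raw product again -> same id, refreshed columns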
/**
 * Download product image and update local image URLs
 * Skips download if local image already exists for this product+URL combo
 */
async function downloadAndUpdateProductImage(productId, dispensaryId, externalProductId, primaryImageUrl) {
    if (!primaryImageUrl) {
        return { downloaded: false, error: 'No image URL' };
    }
    try {
        // Check if we already have this image locally
        const exists = await (0, image_storage_1.imageExists)(dispensaryId, externalProductId, primaryImageUrl);
        if (exists) {
            return { downloaded: false };
        }
        // Download and process the image
        const result = await (0, image_storage_1.downloadProductImage)(primaryImageUrl, dispensaryId, externalProductId);
        if (!result.success || !result.urls) {
            return { downloaded: false, error: result.error };
        }
        // Update the product record with local image URLs
        await (0, connection_1.query)(`
      UPDATE dutchie_products
      SET
        local_image_url = $1,
        local_image_thumb_url = $2,
        local_image_medium_url = $3,
        original_image_url = COALESCE(original_image_url, primary_image_url),
        updated_at = NOW()
      WHERE id = $4
    `, [result.urls.full, result.urls.thumb, result.urls.medium, productId]);
        return { downloaded: true };
    }
    catch (error) {
        return { downloaded: false, error: error.message };
    }
}
/**
 * Insert a snapshot record
 */
async function insertSnapshot(snapshot) {
    const result = await (0, connection_1.query)(`
    INSERT INTO dutchie_product_snapshots (
      dutchie_product_id, dispensary_id, platform_dispensary_id, external_product_id,
      pricing_type, crawl_mode, status, featured, special, medical_only, rec_only,
      is_present_in_feed, stock_status,
      rec_min_price_cents, rec_max_price_cents, rec_min_special_price_cents,
      med_min_price_cents, med_max_price_cents, med_min_special_price_cents,
      wholesale_min_price_cents,
      total_quantity_available, total_kiosk_quantity_available, manual_inventory,
      is_below_threshold, is_below_kiosk_threshold,
      options, raw_payload, crawled_at
    ) VALUES (
      $1, $2, $3, $4,
      $5, $6, $7, $8, $9, $10, $11,
      $12, $13,
      $14, $15, $16,
      $17, $18, $19,
      $20,
      $21, $22, $23,
      $24, $25,
      $26, $27, $28
    )
    RETURNING id
  `, [
        snapshot.dutchieProductId,
        snapshot.dispensaryId,
        snapshot.platformDispensaryId,
        snapshot.externalProductId,
        snapshot.pricingType,
        snapshot.crawlMode,
        snapshot.status,
        snapshot.featured,
        snapshot.special,
        snapshot.medicalOnly,
        snapshot.recOnly,
        snapshot.isPresentInFeed ?? true,
        snapshot.stockStatus,
        snapshot.recMinPriceCents,
        snapshot.recMaxPriceCents,
        snapshot.recMinSpecialPriceCents,
        snapshot.medMinPriceCents,
        snapshot.medMaxPriceCents,
        snapshot.medMinSpecialPriceCents,
        snapshot.wholesaleMinPriceCents,
        snapshot.totalQuantityAvailable,
        snapshot.totalKioskQuantityAvailable,
        snapshot.manualInventory,
        snapshot.isBelowThreshold,
        snapshot.isBelowKioskThreshold,
        JSON.stringify(snapshot.options || []),
        JSON.stringify(snapshot.rawPayload || {}),
        snapshot.crawledAt,
    ]);
    return result.rows[0].id;
}
// ============================================================
// BATCH DATABASE OPERATIONS (per CLAUDE.md Rule #15)
// ============================================================
/**
 * Helper to chunk an array into smaller arrays
 */
function chunkArray(array, size) {
    const chunks = [];
    for (let i = 0; i < array.length; i += size) {
        chunks.push(array.slice(i, i + size));
    }
    return chunks;
}
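// Illustrative sketch (reading aid, not part of the compiled bundle):
// chunking per CLAUDE.md Rule #15, with BATCH_CHUNK_SIZE = 100:
//
//   chunkArray([...Array(250).keys()], 100).map(c => c.length)  // -> [100, 100, 50]
//
// so a 250-product store is written in three bounded batches instead of one
// unbounded write that risks OOM.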
/**
 * Batch upsert products - processes in chunks to avoid OOM
 * Returns a Map of externalProductId -> database id
 */
async function batchUpsertProducts(products) {
    const productIdMap = new Map();
    const chunks = chunkArray(products, BATCH_CHUNK_SIZE);
    console.log(`[ProductCrawler] Batch upserting ${products.length} products in ${chunks.length} chunks of ${BATCH_CHUNK_SIZE}...`);
    for (let i = 0; i < chunks.length; i++) {
        const chunk = chunks[i];
        // Process each product in the chunk
        for (const product of chunk) {
            try {
                const id = await upsertProduct(product);
                if (product.externalProductId) {
                    productIdMap.set(product.externalProductId, id);
                }
            }
            catch (error) {
                console.error(`[ProductCrawler] Error upserting product ${product.externalProductId}:`, error.message);
            }
        }
        // Log progress
        if ((i + 1) % 5 === 0 || i === chunks.length - 1) {
            console.log(`[ProductCrawler] Upserted chunk ${i + 1}/${chunks.length} (${productIdMap.size} products so far)`);
        }
    }
    return productIdMap;
}
/**
 * Batch insert snapshots - processes in chunks to avoid OOM
 */
async function batchInsertSnapshots(snapshots) {
    const chunks = chunkArray(snapshots, BATCH_CHUNK_SIZE);
    let inserted = 0;
    console.log(`[ProductCrawler] Batch inserting ${snapshots.length} snapshots in ${chunks.length} chunks of ${BATCH_CHUNK_SIZE}...`);
    for (let i = 0; i < chunks.length; i++) {
        const chunk = chunks[i];
        // Process each snapshot in the chunk
        for (const snapshot of chunk) {
            try {
                await insertSnapshot(snapshot);
                inserted++;
            }
            catch (error) {
                console.error(`[ProductCrawler] Error inserting snapshot for ${snapshot.externalProductId}:`, error.message);
            }
        }
        // Log progress
        if ((i + 1) % 5 === 0 || i === chunks.length - 1) {
            console.log(`[ProductCrawler] Inserted snapshot chunk ${i + 1}/${chunks.length} (${inserted} snapshots so far)`);
        }
    }
    return inserted;
}
/**
 * Update dispensary last_crawl_at (product_count is intentionally skipped)
 */
async function updateDispensaryCrawlStats(dispensaryId, productCount) {
    // Update last_crawl_at to track when we last crawled
    // Skip product_count as that column may not exist
    await (0, connection_1.query)(`
    UPDATE dispensaries
    SET last_crawl_at = NOW(), updated_at = NOW()
    WHERE id = $1
  `, [dispensaryId]);
}
/**
 * Mark products as missing from feed
 * Creates a snapshot with isPresentInFeed=false and stockStatus='missing_from_feed'
 * for products that were NOT in the UNION of Mode A and Mode B product lists
 *
 * IMPORTANT: Uses UNION of both modes to avoid false positives
 * If the union is empty (possible outage), we skip marking to avoid data corruption
 */
async function markMissingProducts(dispensaryId, platformDispensaryId, modeAProductIds, modeBProductIds, pricingType) {
    // Build UNION of Mode A + Mode B product IDs
    const unionProductIds = new Set([...Array.from(modeAProductIds), ...Array.from(modeBProductIds)]);
    // OUTAGE DETECTION: If union is empty, something went wrong - don't mark anything as missing
    if (unionProductIds.size === 0) {
        console.warn('[ProductCrawler] OUTAGE DETECTED: Both Mode A and Mode B returned 0 products. Skipping missing product marking.');
        return 0;
    }
    // Get all existing products for this dispensary that were not in the UNION
    const { rows: missingProducts } = await (0, connection_1.query)(`
    SELECT id, external_product_id, name
    FROM dutchie_products
    WHERE dispensary_id = $1
      AND external_product_id NOT IN (SELECT unnest($2::text[]))
  `, [dispensaryId, Array.from(unionProductIds)]);
    if (missingProducts.length === 0) {
        return 0;
    }
    console.log(`[ProductCrawler] Marking ${missingProducts.length} products as missing from feed (union of ${modeAProductIds.size} Mode A + ${modeBProductIds.size} Mode B = ${unionProductIds.size} unique)...`);
    const crawledAt = new Date();
    // Build all missing snapshots first (per CLAUDE.md Rule #15 - batch writes)
    const missingSnapshots = missingProducts.map(product => ({
        dutchieProductId: product.id,
        dispensaryId,
        platformDispensaryId,
        externalProductId: product.external_product_id,
        pricingType,
        crawlMode: 'mode_a', // Use mode_a for missing snapshots (convention)
        status: undefined,
        featured: false,
        special: false,
        medicalOnly: false,
        recOnly: false,
        isPresentInFeed: false,
        stockStatus: 'missing_from_feed',
        totalQuantityAvailable: undefined, // null = unknown, not 0
        manualInventory: false,
        isBelowThreshold: false,
        isBelowKioskThreshold: false,
        options: [],
        rawPayload: { _missingFromFeed: true, lastKnownName: product.name },
        crawledAt,
    }));
    // Batch insert missing snapshots
    const snapshotsInserted = await batchInsertSnapshots(missingSnapshots);
    // Batch update product stock status in chunks
    const productIds = missingProducts.map(p => p.id);
    const productChunks = chunkArray(productIds, BATCH_CHUNK_SIZE);
    console.log(`[ProductCrawler] Updating ${productIds.length} product statuses in ${productChunks.length} chunks...`);
    for (const chunk of productChunks) {
        await (0, connection_1.query)(`
      UPDATE dutchie_products
      SET stock_status = 'missing_from_feed', total_quantity_available = NULL, updated_at = NOW()
      WHERE id = ANY($1::int[])
    `, [chunk]);
    }
    console.log(`[ProductCrawler] Marked ${snapshotsInserted} products as missing from feed`);
    return snapshotsInserted;
}
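// Illustrative sketch (reading aid, not part of the compiled bundle): the
// union/outage guard above, with hypothetical ID sets.
//
//   Mode A saw {p1, p2}, Mode B saw {p2, p3}  -> union {p1, p2, p3};
//     any stored product outside that union gets a missing_from_feed snapshot.
//   Mode A saw {},       Mode B saw {}        -> union is empty;
//     treated as a probable outage and nothing is marked missing.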
/**
 * Process a batch of products from a single crawl mode
 * IMPORTANT: Stores ALL products, never filters before DB
 * Uses chunked batch processing per CLAUDE.md Rule #15 to avoid OOM
 * Returns the set of external product IDs that were processed
 */
async function processProducts(products, dispensary, pricingType, crawlMode, options = {}) {
    const { downloadImages = true } = options;
    const productIds = new Set();
    let imagesDownloaded = 0;
    let imageErrors = 0;
    console.log(`[ProductCrawler] Processing ${products.length} products using chunked batch processing...`);
    // Step 1: Normalize all products and collect IDs
    const normalizedProducts = [];
    const rawByExternalId = new Map();
    for (const raw of products) {
        const externalId = raw._id || raw.id || '';
        productIds.add(externalId);
        rawByExternalId.set(externalId, raw);
        const normalized = normalizeProduct(raw, dispensary.id, dispensary.platformDispensaryId);
        normalizedProducts.push(normalized);
    }
    // Step 2: Batch upsert products (chunked)
    const productIdMap = await batchUpsertProducts(normalizedProducts);
    const upserted = productIdMap.size;
    // Step 3: Create and batch insert snapshots (chunked)
    // IMPORTANT: Do this BEFORE image downloads to ensure snapshots are created even if images fail
    const snapshots = [];
    for (const [externalId, productId] of Array.from(productIdMap.entries())) {
        const raw = rawByExternalId.get(externalId);
        if (raw) {
            const snapshot = normalizeSnapshot(raw, productId, dispensary.id, dispensary.platformDispensaryId, pricingType, crawlMode);
            snapshots.push(snapshot);
        }
    }
    const snapshotsInserted = await batchInsertSnapshots(snapshots);
    // Step 4: Download images in chunks (if enabled)
    // This is done AFTER snapshots to ensure core data is saved even if image downloads fail
    if (downloadImages) {
        const imageChunks = chunkArray(Array.from(productIdMap.entries()), BATCH_CHUNK_SIZE);
        console.log(`[ProductCrawler] Downloading images in ${imageChunks.length} chunks...`);
        for (let i = 0; i < imageChunks.length; i++) {
            const chunk = imageChunks[i];
            for (const [externalId, productId] of chunk) {
                const normalized = normalizedProducts.find(p => p.externalProductId === externalId);
                if (normalized?.primaryImageUrl) {
                    try {
                        const imageResult = await downloadAndUpdateProductImage(productId, dispensary.id, externalId, normalized.primaryImageUrl);
                        if (imageResult.downloaded) {
                            imagesDownloaded++;
                        }
                        else if (imageResult.error && imageResult.error !== 'No image URL') {
                            imageErrors++;
                        }
                    }
                    catch (error) {
                        imageErrors++;
                    }
                }
            }
            if ((i + 1) % 5 === 0 || i === imageChunks.length - 1) {
                console.log(`[ProductCrawler] Image download chunk ${i + 1}/${imageChunks.length} (${imagesDownloaded} downloaded, ${imageErrors} errors)`);
            }
        }
    }
    // Clear references to help GC
    normalizedProducts.length = 0;
    rawByExternalId.clear();
    return { upserted, snapshots: snapshotsInserted, productIds, imagesDownloaded, imageErrors };
}
async function crawlDispensaryProducts(dispensary, pricingType = 'rec', options = {}) {
    const { useBothModes = true, downloadImages = true, onProgress } = options;
    const startTime = Date.now();
    if (!dispensary.platformDispensaryId) {
        return {
            success: false,
            dispensaryId: dispensary.id,
            productsFound: 0,
            productsFetched: 0,
            productsUpserted: 0,
            snapshotsCreated: 0,
            errorMessage: 'Missing platformDispensaryId',
            durationMs: Date.now() - startTime,
        };
    }
    try {
        console.log(`[ProductCrawler] Crawling ${dispensary.name} (${dispensary.platformDispensaryId})...`);
        let totalUpserted = 0;
        let totalSnapshots = 0;
        let totalImagesDownloaded = 0;
        let totalImageErrors = 0;
        let modeAProducts = 0;
        let modeBProducts = 0;
        let missingMarked = 0;
        // Track product IDs separately for each mode (needed for missing product detection)
        const modeAProductIds = new Set();
        const modeBProductIds = new Set();
        // Extract cName for this specific dispensary (used for Puppeteer session & headers)
        const cName = extractCName(dispensary);
        console.log(`[ProductCrawler] Using cName="${cName}" for dispensary ${dispensary.name}`);
        if (useBothModes) {
            // Run two-mode crawl for maximum coverage
            const bothResults = await (0, graphql_client_1.fetchAllProductsBothModes)(dispensary.platformDispensaryId, pricingType, { cName });
            modeAProducts = bothResults.modeA.products.length;
            modeBProducts = bothResults.modeB.products.length;
            console.log(`[ProductCrawler] Two-mode crawl: Mode A=${modeAProducts}, Mode B=${modeBProducts}, Merged=${bothResults.merged.products.length}`);
            // Collect Mode A product IDs
            for (const p of bothResults.modeA.products) {
                modeAProductIds.add(p._id);
            }
            // Collect Mode B product IDs
            for (const p of bothResults.modeB.products) {
                modeBProductIds.add(p._id);
            }
            // Process MERGED products (includes options from both modes)
            if (bothResults.merged.products.length > 0) {
                // Use mode_a for merged products (convention)
                const mergedResult = await processProducts(bothResults.merged.products, dispensary, pricingType, 'mode_a', { downloadImages });
                totalUpserted = mergedResult.upserted;
                totalSnapshots = mergedResult.snapshots;
                totalImagesDownloaded = mergedResult.imagesDownloaded;
                totalImageErrors = mergedResult.imageErrors;
                // Report progress
                if (onProgress) {
                    await onProgress({
                        productsFound: bothResults.merged.products.length,
                        productsUpserted: totalUpserted,
                        snapshotsCreated: totalSnapshots,
                        currentPage: 1,
                        totalPages: 1,
                    });
                }
            }
        }
        else {
            // Single mode crawl (Mode A only)
            const { products, crawlMode } = await (0, graphql_client_1.fetchAllProducts)(dispensary.platformDispensaryId, pricingType, { crawlMode: 'mode_a', cName });
            modeAProducts = products.length;
            // Collect Mode A product IDs
            for (const p of products) {
                modeAProductIds.add(p._id);
            }
            const result = await processProducts(products, dispensary, pricingType, crawlMode, { downloadImages });
            totalUpserted = result.upserted;
            totalSnapshots = result.snapshots;
            totalImagesDownloaded = result.imagesDownloaded;
            totalImageErrors = result.imageErrors;
            // Report progress
            if (onProgress) {
                await onProgress({
                    productsFound: products.length,
                    productsUpserted: totalUpserted,
                    snapshotsCreated: totalSnapshots,
                    currentPage: 1,
                    totalPages: 1,
                });
            }
        }
        // Mark products as missing using UNION of Mode A + Mode B
        // The function handles outage detection (empty union = skip marking)
        missingMarked = await markMissingProducts(dispensary.id, dispensary.platformDispensaryId, modeAProductIds, modeBProductIds, pricingType);
        totalSnapshots += missingMarked;
        // Update dispensary stats
        await updateDispensaryCrawlStats(dispensary.id, totalUpserted);
        console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing, ${totalImagesDownloaded} images downloaded`);
        const totalProductsFound = modeAProducts + modeBProducts;
        return {
            success: true,
            dispensaryId: dispensary.id,
            productsFound: totalProductsFound,
            productsFetched: totalProductsFound,
            productsUpserted: totalUpserted,
            snapshotsCreated: totalSnapshots,
            modeAProducts,
            modeBProducts,
            missingProductsMarked: missingMarked,
            imagesDownloaded: totalImagesDownloaded,
            imageErrors: totalImageErrors,
            durationMs: Date.now() - startTime,
        };
    }
    catch (error) {
        console.error(`[ProductCrawler] Failed to crawl ${dispensary.name}:`, error.message);
        return {
            success: false,
            dispensaryId: dispensary.id,
            productsFound: 0,
            productsFetched: 0,
            productsUpserted: 0,
            snapshotsCreated: 0,
            errorMessage: error.message,
            durationMs: Date.now() - startTime,
        };
    }
}
/**
 * Crawl all Arizona dispensaries
 */
async function crawlAllArizonaDispensaries(pricingType = 'rec') {
    const results = [];
    // Get all AZ dispensaries with platform IDs
    const { rows: rawRows } = await (0, connection_1.query)(`
    SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
    WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
    ORDER BY id
  `);
    const dispensaries = rawRows.map(discovery_1.mapDbRowToDispensary);
    console.log(`[ProductCrawler] Starting crawl of ${dispensaries.length} dispensaries...`);
    for (const dispensary of dispensaries) {
        const result = await crawlDispensaryProducts(dispensary, pricingType);
        results.push(result);
        // Delay between dispensaries
        await new Promise((r) => setTimeout(r, 2000));
    }
    const successful = results.filter((r) => r.success).length;
    const totalProducts = results.reduce((sum, r) => sum + r.productsUpserted, 0);
    const totalSnapshots = results.reduce((sum, r) => sum + r.snapshotsCreated, 0);
    console.log(`[ProductCrawler] Completed: ${successful}/${dispensaries.length} stores, ${totalProducts} products, ${totalSnapshots} snapshots`);
    return results;
}
595
backend/dist/dutchie-az/services/scheduler.js
vendored
Normal file
@@ -0,0 +1,595 @@
"use strict";
/**
 * Dutchie AZ Scheduler Service
 *
 * Handles scheduled crawling with JITTER - no fixed intervals!
 * Each job re-schedules itself with a NEW random offset after each run.
 * This makes timing "wander" around the clock, avoiding detectable patterns.
 *
 * Jitter Logic:
 *   nextRunAt = lastRunAt + baseIntervalMinutes + random(-jitterMinutes, +jitterMinutes)
 *
 * Example: 4-hour base with ±30min jitter = runs anywhere from 3h30m to 4h30m apart
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlSingleDispensary = void 0;
exports.getAllSchedules = getAllSchedules;
exports.getScheduleById = getScheduleById;
exports.createSchedule = createSchedule;
exports.updateSchedule = updateSchedule;
exports.deleteSchedule = deleteSchedule;
exports.getRunLogs = getRunLogs;
exports.startScheduler = startScheduler;
exports.stopScheduler = stopScheduler;
exports.getSchedulerStatus = getSchedulerStatus;
exports.triggerScheduleNow = triggerScheduleNow;
exports.initializeDefaultSchedules = initializeDefaultSchedules;
exports.triggerImmediateCrawl = triggerImmediateCrawl;
const connection_1 = require("../db/connection");
const menu_detection_1 = require("./menu-detection");
const job_queue_1 = require("./job-queue");
// Scheduler poll interval (how often we check for due jobs)
const SCHEDULER_POLL_INTERVAL_MS = 60 * 1000; // 1 minute
// Track running state
let isSchedulerRunning = false;
let schedulerInterval = null;
// ============================================================
// JITTER CALCULATION
// ============================================================
/**
 * Generate a random jitter value in minutes
 * Returns a value between -jitterMinutes and +jitterMinutes
 */
function getRandomJitterMinutes(jitterMinutes) {
    // random() returns [0, 1), we want [-jitter, +jitter]
    return (Math.random() * 2 - 1) * jitterMinutes;
}
/**
 * Calculate next run time with jitter
 * nextRunAt = baseTime + baseIntervalMinutes + random(-jitter, +jitter)
 */
function calculateNextRunAt(baseTime, baseIntervalMinutes, jitterMinutes) {
    const jitter = getRandomJitterMinutes(jitterMinutes);
    const totalMinutes = baseIntervalMinutes + jitter;
    const totalMs = totalMinutes * 60 * 1000;
    return new Date(baseTime.getTime() + totalMs);
}
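// Illustrative sketch (reading aid, not part of the compiled bundle): the
// default product crawl schedule (240 min base, ±30 min jitter) re-schedules
// into a window, never onto a fixed point. The timestamp is hypothetical.
//
//   const next = calculateNextRunAt(new Date('2024-01-01T00:00:00Z'), 240, 30);
//   // next falls somewhere in [03:30Z, 04:30Z] on 2024-01-01, and each
//   // completed run draws a fresh offset, so timings wander over days.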
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
 * Get all job schedules
 */
async function getAllSchedules() {
    const { rows } = await (0, connection_1.query)(`
    SELECT
      id, job_name, description, enabled,
      base_interval_minutes, jitter_minutes,
      last_run_at, last_status, last_error_message, last_duration_ms,
      next_run_at, job_config, created_at, updated_at
    FROM job_schedules
    ORDER BY job_name
  `);
    return rows.map(row => ({
        id: row.id,
        jobName: row.job_name,
        description: row.description,
        enabled: row.enabled,
        baseIntervalMinutes: row.base_interval_minutes,
        jitterMinutes: row.jitter_minutes,
        lastRunAt: row.last_run_at,
        lastStatus: row.last_status,
        lastErrorMessage: row.last_error_message,
        lastDurationMs: row.last_duration_ms,
        nextRunAt: row.next_run_at,
        jobConfig: row.job_config,
        createdAt: row.created_at,
        updatedAt: row.updated_at,
    }));
}
/**
 * Get a single schedule by ID
 */
async function getScheduleById(id) {
    const { rows } = await (0, connection_1.query)(`SELECT * FROM job_schedules WHERE id = $1`, [id]);
    if (rows.length === 0)
        return null;
    const row = rows[0];
    return {
        id: row.id,
        jobName: row.job_name,
        description: row.description,
        enabled: row.enabled,
        baseIntervalMinutes: row.base_interval_minutes,
        jitterMinutes: row.jitter_minutes,
        lastRunAt: row.last_run_at,
        lastStatus: row.last_status,
        lastErrorMessage: row.last_error_message,
        lastDurationMs: row.last_duration_ms,
        nextRunAt: row.next_run_at,
        jobConfig: row.job_config,
        createdAt: row.created_at,
        updatedAt: row.updated_at,
    };
}
/**
 * Create a new schedule
 */
async function createSchedule(schedule) {
    // Calculate initial nextRunAt
    const nextRunAt = schedule.startImmediately
        ? new Date() // Start immediately
        : calculateNextRunAt(new Date(), schedule.baseIntervalMinutes, schedule.jitterMinutes);
    const { rows } = await (0, connection_1.query)(`
    INSERT INTO job_schedules (
      job_name, description, enabled,
      base_interval_minutes, jitter_minutes,
      next_run_at, job_config
    ) VALUES ($1, $2, $3, $4, $5, $6, $7)
    RETURNING *
  `, [
        schedule.jobName,
        schedule.description || null,
        schedule.enabled ?? true,
        schedule.baseIntervalMinutes,
        schedule.jitterMinutes,
        nextRunAt,
        schedule.jobConfig ? JSON.stringify(schedule.jobConfig) : null,
    ]);
    const row = rows[0];
    console.log(`[Scheduler] Created schedule "${schedule.jobName}" - next run at ${nextRunAt.toISOString()}`);
    return {
        id: row.id,
        jobName: row.job_name,
        description: row.description,
        enabled: row.enabled,
        baseIntervalMinutes: row.base_interval_minutes,
        jitterMinutes: row.jitter_minutes,
        lastRunAt: row.last_run_at,
        lastStatus: row.last_status,
        lastErrorMessage: row.last_error_message,
        lastDurationMs: row.last_duration_ms,
        nextRunAt: row.next_run_at,
        jobConfig: row.job_config,
        createdAt: row.created_at,
        updatedAt: row.updated_at,
    };
}
/**
 * Update a schedule
 */
async function updateSchedule(id, updates) {
    const setClauses = [];
    const params = [];
    let paramIndex = 1;
    if (updates.description !== undefined) {
        setClauses.push(`description = $${paramIndex++}`);
        params.push(updates.description);
    }
    if (updates.enabled !== undefined) {
        setClauses.push(`enabled = $${paramIndex++}`);
        params.push(updates.enabled);
    }
    if (updates.baseIntervalMinutes !== undefined) {
        setClauses.push(`base_interval_minutes = $${paramIndex++}`);
        params.push(updates.baseIntervalMinutes);
    }
    if (updates.jitterMinutes !== undefined) {
        setClauses.push(`jitter_minutes = $${paramIndex++}`);
        params.push(updates.jitterMinutes);
    }
    if (updates.jobConfig !== undefined) {
        setClauses.push(`job_config = $${paramIndex++}`);
        params.push(JSON.stringify(updates.jobConfig));
    }
    if (setClauses.length === 0) {
        return getScheduleById(id);
    }
    setClauses.push(`updated_at = NOW()`);
    params.push(id);
    const { rows } = await (0, connection_1.query)(`UPDATE job_schedules SET ${setClauses.join(', ')} WHERE id = $${paramIndex} RETURNING *`, params);
    if (rows.length === 0)
        return null;
    const row = rows[0];
    return {
        id: row.id,
        jobName: row.job_name,
        description: row.description,
        enabled: row.enabled,
        baseIntervalMinutes: row.base_interval_minutes,
        jitterMinutes: row.jitter_minutes,
        lastRunAt: row.last_run_at,
        lastStatus: row.last_status,
        lastErrorMessage: row.last_error_message,
        lastDurationMs: row.last_duration_ms,
        nextRunAt: row.next_run_at,
        jobConfig: row.job_config,
        createdAt: row.created_at,
        updatedAt: row.updated_at,
    };
}
/**
 * Delete a schedule
 */
async function deleteSchedule(id) {
    const result = await (0, connection_1.query)(`DELETE FROM job_schedules WHERE id = $1`, [id]);
    return (result.rowCount || 0) > 0;
}
/**
 * Mark a schedule as running
 */
async function markScheduleRunning(id) {
    await (0, connection_1.query)(`UPDATE job_schedules SET last_status = 'running', updated_at = NOW() WHERE id = $1`, [id]);
}
/**
 * Update schedule after job completion with NEW jittered next_run_at
 */
async function updateScheduleAfterRun(id, status, durationMs, errorMessage) {
    // Get current schedule to calculate new nextRunAt
    const schedule = await getScheduleById(id);
    if (!schedule)
        return;
    const now = new Date();
    const newNextRunAt = calculateNextRunAt(now, schedule.baseIntervalMinutes, schedule.jitterMinutes);
    console.log(`[Scheduler] Schedule "${schedule.jobName}" completed (${status}). Next run: ${newNextRunAt.toISOString()}`);
    await (0, connection_1.query)(`
    UPDATE job_schedules SET
      last_run_at = $2,
      last_status = $3,
      last_error_message = $4,
      last_duration_ms = $5,
      next_run_at = $6,
      updated_at = NOW()
    WHERE id = $1
  `, [id, now, status, errorMessage || null, durationMs, newNextRunAt]);
}
/**
 * Create a job run log entry
 */
async function createRunLog(scheduleId, jobName, status) {
    const { rows } = await (0, connection_1.query)(`
    INSERT INTO job_run_logs (schedule_id, job_name, status, started_at)
    VALUES ($1, $2, $3, NOW())
    RETURNING id
  `, [scheduleId, jobName, status]);
    return rows[0].id;
}
/**
 * Update a job run log entry
 */
async function updateRunLog(runLogId, status, results) {
    await (0, connection_1.query)(`
    UPDATE job_run_logs SET
      status = $2,
      completed_at = NOW(),
      duration_ms = $3,
      error_message = $4,
      items_processed = $5,
      items_succeeded = $6,
      items_failed = $7,
      metadata = $8
    WHERE id = $1
  `, [
        runLogId,
        status,
        results.durationMs,
        results.errorMessage || null,
        results.itemsProcessed || 0,
        results.itemsSucceeded || 0,
        results.itemsFailed || 0,
        results.metadata ? JSON.stringify(results.metadata) : null,
    ]);
}
/**
 * Get job run logs
 */
async function getRunLogs(options) {
    const { scheduleId, jobName, limit = 50, offset = 0 } = options;
    let whereClause = 'WHERE 1=1';
    const params = [];
    let paramIndex = 1;
    if (scheduleId) {
        whereClause += ` AND schedule_id = $${paramIndex++}`;
        params.push(scheduleId);
    }
    if (jobName) {
        whereClause += ` AND job_name = $${paramIndex++}`;
        params.push(jobName);
    }
    params.push(limit, offset);
    const { rows } = await (0, connection_1.query)(`
    SELECT * FROM job_run_logs
    ${whereClause}
    ORDER BY created_at DESC
    LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
  `, params);
    const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM job_run_logs ${whereClause}`, params.slice(0, -2));
    return {
        logs: rows,
        total: parseInt(countRows[0]?.total || '0', 10),
    };
}
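// Illustrative sketch (reading aid, not part of the compiled bundle): a
// hypothetical call and how the two-query pagination above resolves.
//
//   const { logs, total } = await getRunLogs({ jobName: 'dutchie_az_product_crawl', limit: 10 });
//   // logs  -> newest 10 rows for that job (ORDER BY created_at DESC)
//   // total -> full matching row count; params.slice(0, -2) drops LIMIT/OFFSET
//   //          so the COUNT(*) query reuses only the WHERE parameters.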
// ============================================================
// JOB EXECUTION
// ============================================================
/**
 * Execute a job based on its name
 */
async function executeJob(schedule) {
    const config = schedule.jobConfig || {};
    switch (schedule.jobName) {
        case 'dutchie_az_product_crawl':
            return executeProductCrawl(config);
        case 'dutchie_az_discovery':
            return executeDiscovery(config);
        case 'dutchie_az_menu_detection':
            return (0, menu_detection_1.executeMenuDetectionJob)(config);
        default:
            throw new Error(`Unknown job type: ${schedule.jobName}`);
    }
}
/**
 * Execute the AZ Dutchie product crawl job
 *
 * NEW BEHAVIOR: Instead of running crawls directly, this now ENQUEUES jobs
 * into the crawl_jobs queue. Workers (running as separate replicas) will
 * pick up and process these jobs.
 *
 * This allows:
 * - Multiple workers to process jobs in parallel
 * - No double-crawls (DB-level locking per dispensary)
 * - Better scalability (add more worker replicas)
 * - Live monitoring of individual job progress
 */
async function executeProductCrawl(config) {
    const pricingType = config.pricingType || 'rec';
    const useBothModes = config.useBothModes !== false;
    // Get all "ready" dispensaries (menu_type='dutchie' AND platform_dispensary_id IS NOT NULL AND not failed)
    // Note: Menu detection is handled separately by the dutchie_az_menu_detection schedule
    const { rows: rawRows } = await (0, connection_1.query)(`
    SELECT id FROM dispensaries
    WHERE state = 'AZ'
      AND menu_type = 'dutchie'
      AND platform_dispensary_id IS NOT NULL
      AND failed_at IS NULL
    ORDER BY last_crawl_at ASC NULLS FIRST
  `);
    const dispensaryIds = rawRows.map((r) => r.id);
    if (dispensaryIds.length === 0) {
        return {
            status: 'success',
            itemsProcessed: 0,
            itemsSucceeded: 0,
            itemsFailed: 0,
            metadata: { message: 'No ready dispensaries to crawl. Run menu detection to discover more.' },
        };
    }
    console.log(`[Scheduler] Enqueueing crawl jobs for ${dispensaryIds.length} dispensaries...`);
    // Bulk enqueue jobs (skips dispensaries that already have pending/running jobs)
    const { enqueued, skipped } = await (0, job_queue_1.bulkEnqueueJobs)('dutchie_product_crawl', dispensaryIds, {
        priority: 0,
        metadata: { pricingType, useBothModes },
    });
    console.log(`[Scheduler] Enqueued ${enqueued} jobs, skipped ${skipped} (already queued)`);
    // Get current queue stats
    const queueStats = await (0, job_queue_1.getQueueStats)();
    return {
        status: 'success',
        itemsProcessed: dispensaryIds.length,
        itemsSucceeded: enqueued,
        itemsFailed: 0, // Enqueue itself doesn't fail
        metadata: {
            enqueued,
            skipped,
            queueStats,
            pricingType,
            useBothModes,
            message: `Enqueued ${enqueued} jobs. Workers will process them. Check /scraper-monitor for progress.`,
        },
    };
}
/**
 * Execute the AZ Dutchie discovery job (placeholder)
 */
async function executeDiscovery(_config) {
    // Placeholder - implement discovery logic
    return {
        status: 'success',
        itemsProcessed: 0,
        itemsSucceeded: 0,
        itemsFailed: 0,
        metadata: { message: 'Discovery not yet implemented' },
    };
}
// ============================================================
// SCHEDULER RUNNER
// ============================================================
/**
 * Check for due jobs and run them
 */
async function checkAndRunDueJobs() {
    try {
        // Get enabled schedules where nextRunAt <= now
        const { rows } = await (0, connection_1.query)(`
      SELECT * FROM job_schedules
      WHERE enabled = true
        AND next_run_at IS NOT NULL
        AND next_run_at <= NOW()
        AND (last_status IS NULL OR last_status != 'running')
      ORDER BY next_run_at ASC
    `);
        if (rows.length === 0)
            return;
        console.log(`[Scheduler] Found ${rows.length} due job(s)`);
        for (const row of rows) {
            const schedule = {
                id: row.id,
                jobName: row.job_name,
                description: row.description,
                enabled: row.enabled,
                baseIntervalMinutes: row.base_interval_minutes,
                jitterMinutes: row.jitter_minutes,
                lastRunAt: row.last_run_at,
                lastStatus: row.last_status,
                lastErrorMessage: row.last_error_message,
                lastDurationMs: row.last_duration_ms,
                nextRunAt: row.next_run_at,
                jobConfig: row.job_config,
                createdAt: row.created_at,
                updatedAt: row.updated_at,
            };
            await runScheduledJob(schedule);
        }
    }
    catch (error) {
        console.error('[Scheduler] Error checking for due jobs:', error);
    }
}
/**
 * Run a single scheduled job
 */
async function runScheduledJob(schedule) {
    const startTime = Date.now();
    console.log(`[Scheduler] Starting job "${schedule.jobName}"...`);
    // Mark as running
    await markScheduleRunning(schedule.id);
    // Create run log entry
    const runLogId = await createRunLog(schedule.id, schedule.jobName, 'running');
    try {
        // Execute the job
        const result = await executeJob(schedule);
        const durationMs = Date.now() - startTime;
        // Determine final status (exclude 'running' and null)
        const finalStatus = result.status === 'running' || result.status === null
            ? 'success'
            : result.status;
        // Update run log
        await updateRunLog(runLogId, finalStatus, {
            durationMs,
            errorMessage: result.errorMessage,
            itemsProcessed: result.itemsProcessed,
            itemsSucceeded: result.itemsSucceeded,
            itemsFailed: result.itemsFailed,
            metadata: result.metadata,
        });
        // Update schedule with NEW jittered next_run_at
        await updateScheduleAfterRun(schedule.id, result.status, durationMs, result.errorMessage);
        console.log(`[Scheduler] Job "${schedule.jobName}" completed in ${Math.round(durationMs / 1000)}s (${result.status})`);
    }
    catch (error) {
        const durationMs = Date.now() - startTime;
        console.error(`[Scheduler] Job "${schedule.jobName}" failed:`, error.message);
        // Update run log with error
        await updateRunLog(runLogId, 'error', {
            durationMs,
            errorMessage: error.message,
            itemsProcessed: 0,
            itemsSucceeded: 0,
            itemsFailed: 0,
        });
        // Update schedule with NEW jittered next_run_at
        await updateScheduleAfterRun(schedule.id, 'error', durationMs, error.message);
    }
}
// ============================================================
// PUBLIC API
// ============================================================
/**
 * Start the scheduler
 */
function startScheduler() {
    if (isSchedulerRunning) {
        console.log('[Scheduler] Scheduler is already running');
        return;
    }
    isSchedulerRunning = true;
    console.log(`[Scheduler] Starting scheduler (polling every ${SCHEDULER_POLL_INTERVAL_MS / 1000}s)...`);
    // Immediately check for due jobs
    checkAndRunDueJobs();
    // Set up interval to check for due jobs
    schedulerInterval = setInterval(checkAndRunDueJobs, SCHEDULER_POLL_INTERVAL_MS);
}
/**
 * Stop the scheduler
 */
function stopScheduler() {
    if (!isSchedulerRunning) {
        console.log('[Scheduler] Scheduler is not running');
        return;
    }
    isSchedulerRunning = false;
    if (schedulerInterval) {
        clearInterval(schedulerInterval);
        schedulerInterval = null;
    }
    console.log('[Scheduler] Scheduler stopped');
}
/**
 * Get scheduler status
 */
function getSchedulerStatus() {
    return {
        running: isSchedulerRunning,
        pollIntervalMs: SCHEDULER_POLL_INTERVAL_MS,
    };
}
/**
 * Trigger immediate execution of a schedule
 */
async function triggerScheduleNow(scheduleId) {
    const schedule = await getScheduleById(scheduleId);
    if (!schedule) {
        return { success: false, message: 'Schedule not found' };
    }
    if (schedule.lastStatus === 'running') {
        return { success: false, message: 'Job is already running' };
    }
    // Run the job
    await runScheduledJob(schedule);
    return { success: true, message: 'Job triggered successfully' };
}
/**
 * Initialize default schedules if they don't exist
 */
async function initializeDefaultSchedules() {
    const schedules = await getAllSchedules();
    // Check if product crawl schedule exists
    const productCrawlExists = schedules.some(s => s.jobName === 'dutchie_az_product_crawl');
    if (!productCrawlExists) {
        await createSchedule({
            jobName: 'dutchie_az_product_crawl',
            description: 'Crawl all AZ Dutchie dispensary products',
            enabled: true,
            baseIntervalMinutes: 240, // 4 hours
            jitterMinutes: 30, // ±30 minutes
            jobConfig: { pricingType: 'rec', useBothModes: true },
            startImmediately: false,
        });
        console.log('[Scheduler] Created default product crawl schedule');
    }
    // Check if menu detection schedule exists
    const menuDetectionExists = schedules.some(s => s.jobName === 'dutchie_az_menu_detection');
    if (!menuDetectionExists) {
        await createSchedule({
            jobName: 'dutchie_az_menu_detection',
            description: 'Detect menu providers and resolve platform IDs for AZ dispensaries',
            enabled: true,
            baseIntervalMinutes: 1440, // 24 hours
            jitterMinutes: 60, // ±1 hour
            jobConfig: { state: 'AZ', onlyUnknown: true },
            startImmediately: false,
        });
        console.log('[Scheduler] Created default menu detection schedule');
    }
}
// Re-export for backward compatibility
var product_crawler_1 = require("./product-crawler");
Object.defineProperty(exports, "crawlSingleDispensary", { enumerable: true, get: function () { return product_crawler_1.crawlDispensaryProducts; } });
async function triggerImmediateCrawl() {
    const schedules = await getAllSchedules();
    const productCrawl = schedules.find(s => s.jobName === 'dutchie_az_product_crawl');
    if (productCrawl) {
        return triggerScheduleNow(productCrawl.id);
    }
    return { success: false, message: 'Product crawl schedule not found' };
}
440
backend/dist/dutchie-az/services/worker.js
vendored
Normal file
@@ -0,0 +1,440 @@
"use strict";
/**
 * Worker Service
 *
 * Polls the job queue and processes crawl jobs.
 * Each worker instance runs independently, claiming jobs atomically.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.startWorker = startWorker;
exports.stopWorker = stopWorker;
exports.getWorkerStatus = getWorkerStatus;
const job_queue_1 = require("./job-queue");
const product_crawler_1 = require("./product-crawler");
const discovery_1 = require("./discovery");
const connection_1 = require("../db/connection");
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
// NOTE: failed_at is included for worker compatibility checks
const DISPENSARY_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  provider_detection_data, created_at, updated_at, failed_at
`;
// ============================================================
// WORKER CONFIG
// ============================================================
const POLL_INTERVAL_MS = 5000; // Check for jobs every 5 seconds
const HEARTBEAT_INTERVAL_MS = 60000; // Send heartbeat every 60 seconds
const STALE_CHECK_INTERVAL_MS = 300000; // Check for stale jobs every 5 minutes
const SHUTDOWN_GRACE_PERIOD_MS = 30000; // Wait 30s for job to complete on shutdown
// ============================================================
// WORKER STATE
// ============================================================
let isRunning = false;
let currentJob = null;
let pollTimer = null;
let heartbeatTimer = null;
let staleCheckTimer = null;
let shutdownPromise = null;
// ============================================================
// WORKER LIFECYCLE
// ============================================================
/**
 * Start the worker
 */
async function startWorker() {
    if (isRunning) {
        console.log('[Worker] Already running');
        return;
    }
    const workerId = (0, job_queue_1.getWorkerId)();
    const hostname = (0, job_queue_1.getWorkerHostname)();
    console.log(`[Worker] Starting worker ${workerId} on ${hostname}`);
    isRunning = true;
    // Set up graceful shutdown
    setupShutdownHandlers();
    // Start polling for jobs
    pollTimer = setInterval(pollForJobs, POLL_INTERVAL_MS);
    // Start stale job recovery (only one worker should do this, but it's idempotent)
    staleCheckTimer = setInterval(async () => {
        try {
            await (0, job_queue_1.recoverStaleJobs)(15);
        }
        catch (error) {
            console.error('[Worker] Error recovering stale jobs:', error);
        }
    }, STALE_CHECK_INTERVAL_MS);
    // Immediately poll for a job
    await pollForJobs();
    console.log(`[Worker] Worker ${workerId} started, polling every ${POLL_INTERVAL_MS}ms`);
}
/**
 * Stop the worker gracefully
 */
async function stopWorker() {
    if (!isRunning)
        return;
    console.log('[Worker] Stopping worker...');
    isRunning = false;
    // Clear timers
    if (pollTimer) {
        clearInterval(pollTimer);
        pollTimer = null;
    }
    if (heartbeatTimer) {
        clearInterval(heartbeatTimer);
        heartbeatTimer = null;
    }
    if (staleCheckTimer) {
        clearInterval(staleCheckTimer);
        staleCheckTimer = null;
    }
    // Wait for current job to complete
    if (currentJob) {
        console.log(`[Worker] Waiting for job ${currentJob.id} to complete...`);
        const startWait = Date.now();
        while (currentJob && Date.now() - startWait < SHUTDOWN_GRACE_PERIOD_MS) {
            await new Promise(r => setTimeout(r, 1000));
        }
        if (currentJob) {
            console.log(`[Worker] Job ${currentJob.id} did not complete in time, marking for retry`);
            await (0, job_queue_1.failJob)(currentJob.id, 'Worker shutdown');
        }
    }
    console.log('[Worker] Worker stopped');
}
/**
 * Get worker status
 */
function getWorkerStatus() {
    return {
        isRunning,
        workerId: (0, job_queue_1.getWorkerId)(),
        hostname: (0, job_queue_1.getWorkerHostname)(),
        currentJob,
    };
}
// ============================================================
// JOB PROCESSING
// ============================================================
/**
 * Poll for and process the next available job
 */
async function pollForJobs() {
    if (!isRunning || currentJob) {
        return; // Already processing a job
    }
    try {
        const workerId = (0, job_queue_1.getWorkerId)();
        // Try to claim a job
        const job = await (0, job_queue_1.claimNextJob)({
            workerId,
            jobTypes: ['dutchie_product_crawl', 'menu_detection', 'menu_detection_single'],
            lockDurationMinutes: 30,
        });
        if (!job) {
            return; // No jobs available
        }
        currentJob = job;
        console.log(`[Worker] Processing job ${job.id} (type=${job.jobType}, dispensary=${job.dispensaryId})`);
        // Start heartbeat for this job
        heartbeatTimer = setInterval(async () => {
            if (currentJob) {
                try {
                    await (0, job_queue_1.heartbeat)(currentJob.id);
                }
                catch (error) {
                    console.error('[Worker] Heartbeat error:', error);
                }
            }
        }, HEARTBEAT_INTERVAL_MS);
        // Process the job
        await processJob(job);
    }
    catch (error) {
        console.error('[Worker] Error polling for jobs:', error);
        if (currentJob) {
            try {
                await (0, job_queue_1.failJob)(currentJob.id, error.message);
            }
            catch (failError) {
                console.error('[Worker] Error failing job:', failError);
            }
        }
    }
    finally {
        // Clear heartbeat timer
        if (heartbeatTimer) {
            clearInterval(heartbeatTimer);
            heartbeatTimer = null;
        }
        currentJob = null;
    }
}
/**
 * Process a single job
 */
async function processJob(job) {
    try {
        switch (job.jobType) {
            case 'dutchie_product_crawl':
                await processProductCrawlJob(job);
                break;
            case 'menu_detection':
                await processMenuDetectionJob(job);
                break;
            case 'menu_detection_single':
                await processSingleDetectionJob(job);
                break;
            default:
                throw new Error(`Unknown job type: ${job.jobType}`);
        }
    }
    catch (error) {
        console.error(`[Worker] Job ${job.id} failed:`, error);
        await (0, job_queue_1.failJob)(job.id, error.message);
    }
}
// Maximum consecutive failures before flagging a dispensary
const MAX_CONSECUTIVE_FAILURES = 3;
/**
 * Record a successful crawl - resets failure counter
 */
async function recordCrawlSuccess(dispensaryId) {
    await (0, connection_1.query)(`UPDATE dispensaries
     SET consecutive_failures = 0,
         last_crawl_at = NOW(),
         updated_at = NOW()
     WHERE id = $1`, [dispensaryId]);
}
/**
 * Record a crawl failure - increments counter and may flag dispensary
 * Returns true if dispensary was flagged as failed
 */
async function recordCrawlFailure(dispensaryId, errorMessage) {
    // Increment failure counter
    const { rows } = await (0, connection_1.query)(`UPDATE dispensaries
     SET consecutive_failures = consecutive_failures + 1,
         last_failure_at = NOW(),
         last_failure_reason = $2,
         updated_at = NOW()
     WHERE id = $1
     RETURNING consecutive_failures`, [dispensaryId, errorMessage]);
    const failures = rows[0]?.consecutive_failures || 0;
    // If we've hit the threshold, flag the dispensary as failed
    if (failures >= MAX_CONSECUTIVE_FAILURES) {
        await (0, connection_1.query)(`UPDATE dispensaries
       SET failed_at = NOW(),
           menu_type = NULL,
           platform_dispensary_id = NULL,
           failure_notes = $2,
           updated_at = NOW()
       WHERE id = $1`, [dispensaryId, `Auto-flagged after ${failures} consecutive failures. Last error: ${errorMessage}`]);
        console.log(`[Worker] Dispensary ${dispensaryId} flagged as FAILED after ${failures} consecutive failures`);
        return true;
    }
    console.log(`[Worker] Dispensary ${dispensaryId} failure recorded (${failures}/${MAX_CONSECUTIVE_FAILURES})`);
    return false;
}
/**
 * Process a product crawl job for a single dispensary
 */
async function processProductCrawlJob(job) {
    if (!job.dispensaryId) {
        throw new Error('Product crawl job requires dispensary_id');
    }
    // Get dispensary details
    const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [job.dispensaryId]);
    if (rows.length === 0) {
        throw new Error(`Dispensary ${job.dispensaryId} not found`);
    }
    const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]);
    // Check if dispensary is already flagged as failed
    if (rows[0].failed_at) {
        console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already flagged as failed`);
        await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
        return;
    }
    if (!dispensary.platformDispensaryId) {
        // Record failure and potentially flag
        await recordCrawlFailure(job.dispensaryId, 'Missing platform_dispensary_id');
        throw new Error(`Dispensary ${job.dispensaryId} has no platform_dispensary_id`);
    }
    // Get crawl options from job metadata
    const pricingType = job.metadata?.pricingType || 'rec';
    const useBothModes = job.metadata?.useBothModes !== false;
    try {
        // Crawl the dispensary
        const result = await (0, product_crawler_1.crawlDispensaryProducts)(dispensary, pricingType, {
            useBothModes,
            onProgress: async (progress) => {
                // Update progress for live monitoring
                await (0, job_queue_1.updateJobProgress)(job.id, {
                    productsFound: progress.productsFound,
                    productsUpserted: progress.productsUpserted,
                    snapshotsCreated: progress.snapshotsCreated,
                    currentPage: progress.currentPage,
                    totalPages: progress.totalPages,
                });
            },
        });
        if (result.success) {
            // Success! Reset failure counter
            await recordCrawlSuccess(job.dispensaryId);
            await (0, job_queue_1.completeJob)(job.id, {
                productsFound: result.productsFetched,
                productsUpserted: result.productsUpserted,
                snapshotsCreated: result.snapshotsCreated,
            });
        }
        else {
            // Crawl returned failure - record it
            const wasFlagged = await recordCrawlFailure(job.dispensaryId, result.errorMessage || 'Crawl failed');
            if (wasFlagged) {
                // Don't throw - the dispensary is now flagged, job is "complete"
                await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
            }
            else {
                throw new Error(result.errorMessage || 'Crawl failed');
            }
        }
    }
    catch (error) {
        // Record the failure
        const wasFlagged = await recordCrawlFailure(job.dispensaryId, error.message);
        if (wasFlagged) {
            // Dispensary is now flagged - complete the job rather than fail it
            await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
        }
        else {
            throw error;
        }
    }
}
/**
 * Process a menu detection job (bulk)
 */
async function processMenuDetectionJob(job) {
    const { executeMenuDetectionJob } = await Promise.resolve().then(() => __importStar(require('./menu-detection')));
    const config = job.metadata || {};
    const result = await executeMenuDetectionJob(config);
    if (result.status === 'error') {
        throw new Error(result.errorMessage || 'Menu detection failed');
    }
    await (0, job_queue_1.completeJob)(job.id, {
        productsFound: result.itemsProcessed,
        productsUpserted: result.itemsSucceeded,
    });
}
/**
 * Process a single dispensary menu detection job
 * This is the parallelizable version - each worker can detect one dispensary at a time
 */
async function processSingleDetectionJob(job) {
    if (!job.dispensaryId) {
        throw new Error('Single detection job requires dispensary_id');
    }
    const { detectAndResolveDispensary } = await Promise.resolve().then(() => __importStar(require('./menu-detection')));
    // Get dispensary details
    const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [job.dispensaryId]);
    if (rows.length === 0) {
        throw new Error(`Dispensary ${job.dispensaryId} not found`);
    }
    const dispensary = rows[0];
    // Skip if already detected or failed
    if (dispensary.failed_at) {
        console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already flagged as failed`);
        await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
        return;
    }
    if (dispensary.menu_type && dispensary.menu_type !== 'unknown') {
        console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already detected as ${dispensary.menu_type}`);
        await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 1 });
        return;
    }
    console.log(`[Worker] Detecting menu for dispensary ${job.dispensaryId} (${dispensary.name})...`);
    try {
        const result = await detectAndResolveDispensary(job.dispensaryId);
        if (result.success) {
            console.log(`[Worker] Dispensary ${job.dispensaryId}: detected ${result.detectedProvider}, platformId=${result.platformDispensaryId || 'none'}`);
            await (0, job_queue_1.completeJob)(job.id, {
                productsFound: 1,
                productsUpserted: result.platformDispensaryId ? 1 : 0,
            });
        }
        else {
            // Detection failed - record failure
            await recordCrawlFailure(job.dispensaryId, result.error || 'Detection failed');
            throw new Error(result.error || 'Detection failed');
        }
    }
    catch (error) {
        // Record the failure
        const wasFlagged = await recordCrawlFailure(job.dispensaryId, error.message);
        if (wasFlagged) {
            // Dispensary is now flagged - complete the job rather than fail it
            await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
        }
        else {
            throw error;
        }
    }
}
// ============================================================
// SHUTDOWN HANDLING
// ============================================================
function setupShutdownHandlers() {
    const shutdown = async (signal) => {
        if (shutdownPromise)
            return shutdownPromise;
        console.log(`\n[Worker] Received ${signal}, shutting down...`);
        shutdownPromise = stopWorker();
        await shutdownPromise;
        process.exit(0);
    };
    process.on('SIGTERM', () => shutdown('SIGTERM'));
    process.on('SIGINT', () => shutdown('SIGINT'));
}
// ============================================================
// STANDALONE WORKER ENTRY POINT
// ============================================================
if (require.main === module) {
    // Run as standalone worker
    startWorker().catch((error) => {
        console.error('[Worker] Fatal error:', error);
        process.exit(1);
    });
}
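claimNextJob's implementation is not part of this diff; for context, an atomic claim in Postgres is typically a single UPDATE over a FOR UPDATE SKIP LOCKED subquery, so concurrent workers never take the same row. A sketch against dispensary_crawl_jobs (the status and lock column names are assumptions):
// Sketch only - the real claimNextJob lives in job-queue and is not shown here.
async function claimNextJobSketch(pool, workerId, jobTypes, lockDurationMinutes) {
    const { rows } = await pool.query(`
        UPDATE dispensary_crawl_jobs
        SET status = 'running',
            worker_id = $1,
            locked_until = NOW() + make_interval(mins => $2::int)
        WHERE id = (
            SELECT id FROM dispensary_crawl_jobs
            WHERE status = 'pending' AND job_type = ANY($3)
            ORDER BY created_at
            FOR UPDATE SKIP LOCKED
            LIMIT 1
        )
        RETURNING *
    `, [workerId, lockDurationMinutes, jobTypes]);
    return rows[0] || null;
}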
96
backend/dist/dutchie-az/types/index.js
vendored
Normal file
96
backend/dist/dutchie-az/types/index.js
vendored
Normal file
@@ -0,0 +1,96 @@
"use strict";
/**
 * Dutchie AZ Data Types
 *
 * Complete TypeScript interfaces for the isolated Dutchie Arizona data pipeline.
 * These types map directly to Dutchie's GraphQL FilteredProducts response.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.getOptionQuantity = getOptionQuantity;
exports.deriveOptionStockStatus = deriveOptionStockStatus;
exports.deriveStockStatus = deriveStockStatus;
exports.calculateTotalQuantity = calculateTotalQuantity;
exports.calculateTotalKioskQuantity = calculateTotalKioskQuantity;
/**
 * Get available quantity for a single option
 * Priority: quantityAvailable > kioskQuantityAvailable > quantity
 */
function getOptionQuantity(child) {
    if (typeof child.quantityAvailable === 'number')
        return child.quantityAvailable;
    if (typeof child.kioskQuantityAvailable === 'number')
        return child.kioskQuantityAvailable;
    if (typeof child.quantity === 'number')
        return child.quantity;
    return null; // No quantity data available
}
/**
 * Derive stock status for a single option
 * Returns: 'in_stock' if qty > 0, 'out_of_stock' if qty === 0, 'unknown' if no data
 */
function deriveOptionStockStatus(child) {
    const qty = getOptionQuantity(child);
    if (qty === null)
        return 'unknown';
    return qty > 0 ? 'in_stock' : 'out_of_stock';
}
/**
 * Derive product-level stock status from POSMetaData.children
 *
 * Logic per spec:
 * - If ANY child is "in_stock" → product is "in_stock"
 * - Else if ALL children are "out_of_stock" → product is "out_of_stock"
 * - Else → product is "unknown"
 *
 * IMPORTANT: Threshold flags (isBelowThreshold, etc.) do NOT override stock status.
 * They only indicate "low stock" - if qty > 0, status stays "in_stock".
 */
function deriveStockStatus(product) {
    const children = product.POSMetaData?.children;
    // No children data - unknown
    if (!children || children.length === 0) {
        return 'unknown';
    }
    // Get stock status for each option
    const optionStatuses = children.map(deriveOptionStockStatus);
    // If ANY option is in_stock → product is in_stock
    if (optionStatuses.some(status => status === 'in_stock')) {
        return 'in_stock';
    }
    // If ALL options are out_of_stock → product is out_of_stock
    if (optionStatuses.every(status => status === 'out_of_stock')) {
        return 'out_of_stock';
    }
    // Otherwise (mix of out_of_stock and unknown) → unknown
    return 'unknown';
}
/**
 * Calculate total quantity available across all options
 * Returns null if no children data (unknown inventory), 0 if children exist but all have 0 qty
 */
function calculateTotalQuantity(product) {
    const children = product.POSMetaData?.children;
    // No children = unknown inventory, return null (NOT 0)
    if (!children || children.length === 0)
        return null;
    // Check if any child has quantity data
    const hasAnyQtyData = children.some(child => getOptionQuantity(child) !== null);
    if (!hasAnyQtyData)
        return null; // All children lack qty data = unknown
    return children.reduce((sum, child) => {
        const qty = getOptionQuantity(child);
        return sum + (qty ?? 0);
    }, 0);
}
/**
 * Calculate total kiosk quantity available across all options
 */
function calculateTotalKioskQuantity(product) {
    const children = product.POSMetaData?.children;
    if (!children || children.length === 0)
        return null;
    const hasAnyKioskQty = children.some(child => typeof child.kioskQuantityAvailable === 'number');
    if (!hasAnyKioskQty)
        return null;
    return children.reduce((sum, child) => sum + (child.kioskQuantityAvailable ?? 0), 0);
}
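A quick worked example of the derivation rules above (the product shape is illustrative):
// Illustrative only - field names follow the POSMetaData shape used above.
const example = {
    POSMetaData: {
        children: [
            { quantityAvailable: 0 },  // out_of_stock
            { quantityAvailable: 4 },  // in_stock
            {},                        // no quantity data -> unknown
        ],
    },
};
deriveStockStatus(example);      // 'in_stock'  (ANY in-stock child wins)
calculateTotalQuantity(example); // 4  (children without data contribute 0 once any child has data)
deriveStockStatus({});           // 'unknown'   (no children at all)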
64
backend/dist/index.js
vendored
64
backend/dist/index.js
vendored
@@ -7,18 +7,39 @@ const express_1 = __importDefault(require("express"));
const cors_1 = __importDefault(require("cors"));
const dotenv_1 = __importDefault(require("dotenv"));
const minio_1 = require("./utils/minio");
const image_storage_1 = require("./utils/image-storage");
const logger_1 = require("./services/logger");
const proxyTestQueue_1 = require("./services/proxyTestQueue");
dotenv_1.default.config();
const app = (0, express_1.default)();
const PORT = process.env.PORT || 3010;
app.use((0, cors_1.default)());
app.use(express_1.default.json());
// Serve static images when MinIO is not configured
const LOCAL_IMAGES_PATH = process.env.LOCAL_IMAGES_PATH || '/app/public/images';
app.use('/images', express_1.default.static(LOCAL_IMAGES_PATH));
// Serve static downloads (plugin files, etc.)
const LOCAL_DOWNLOADS_PATH = process.env.LOCAL_DOWNLOADS_PATH || '/app/public/downloads';
app.use('/downloads', express_1.default.static(LOCAL_DOWNLOADS_PATH));
app.get('/health', (req, res) => {
    res.json({ status: 'ok', timestamp: new Date().toISOString() });
});
// Endpoint to check server's outbound IP (for proxy whitelist setup)
app.get('/outbound-ip', async (req, res) => {
    try {
        const axios = require('axios');
        const response = await axios.get('https://api.ipify.org?format=json', { timeout: 10000 });
        res.json({ outbound_ip: response.data.ip });
    }
    catch (error) {
        res.status(500).json({ error: error.message });
    }
});
const auth_1 = __importDefault(require("./routes/auth"));
const dashboard_1 = __importDefault(require("./routes/dashboard"));
const stores_1 = __importDefault(require("./routes/stores"));
const dispensaries_1 = __importDefault(require("./routes/dispensaries"));
const changes_1 = __importDefault(require("./routes/changes"));
const categories_1 = __importDefault(require("./routes/categories"));
const products_1 = __importDefault(require("./routes/products"));
const campaigns_1 = __importDefault(require("./routes/campaigns"));
@@ -27,9 +48,27 @@ const settings_1 = __importDefault(require("./routes/settings"));
const proxies_1 = __importDefault(require("./routes/proxies"));
const logs_1 = __importDefault(require("./routes/logs"));
const scraper_monitor_1 = __importDefault(require("./routes/scraper-monitor"));
const api_tokens_1 = __importDefault(require("./routes/api-tokens"));
const api_permissions_1 = __importDefault(require("./routes/api-permissions"));
const parallel_scrape_1 = __importDefault(require("./routes/parallel-scrape"));
const schedule_1 = __importDefault(require("./routes/schedule"));
const crawler_sandbox_1 = __importDefault(require("./routes/crawler-sandbox"));
const version_1 = __importDefault(require("./routes/version"));
const public_api_1 = __importDefault(require("./routes/public-api"));
const dutchie_az_1 = require("./dutchie-az");
const apiTokenTracker_1 = require("./middleware/apiTokenTracker");
const crawl_scheduler_1 = require("./services/crawl-scheduler");
const wordpressPermissions_1 = require("./middleware/wordpressPermissions");
// Apply WordPress permissions validation first (sets req.apiToken)
app.use(wordpressPermissions_1.validateWordPressPermissions);
// Apply API tracking middleware globally
app.use(apiTokenTracker_1.trackApiUsage);
app.use(apiTokenTracker_1.checkRateLimit);
app.use('/api/auth', auth_1.default);
app.use('/api/dashboard', dashboard_1.default);
app.use('/api/stores', stores_1.default);
app.use('/api/dispensaries', dispensaries_1.default);
app.use('/api/changes', changes_1.default);
app.use('/api/categories', categories_1.default);
app.use('/api/products', products_1.default);
app.use('/api/campaigns', campaigns_1.default);
@@ -38,11 +77,34 @@ app.use('/api/settings', settings_1.default);
app.use('/api/proxies', proxies_1.default);
app.use('/api/logs', logs_1.default);
app.use('/api/scraper-monitor', scraper_monitor_1.default);
app.use('/api/api-tokens', api_tokens_1.default);
app.use('/api/api-permissions', api_permissions_1.default);
app.use('/api/parallel-scrape', parallel_scrape_1.default);
app.use('/api/schedule', schedule_1.default);
app.use('/api/crawler-sandbox', crawler_sandbox_1.default);
app.use('/api/version', version_1.default);
// Vendor-agnostic AZ data pipeline routes (new public surface)
app.use('/api/az', dutchie_az_1.dutchieAZRouter);
// Legacy alias (kept temporarily for backward compatibility)
app.use('/api/dutchie-az', dutchie_az_1.dutchieAZRouter);
// Public API v1 - External consumer endpoints (WordPress, etc.)
// Uses dutchie_az data pipeline with per-dispensary API key auth
app.use('/api/v1', public_api_1.default);
async function startServer() {
    try {
        logger_1.logger.info('system', 'Starting server...');
        await (0, minio_1.initializeMinio)();
        logger_1.logger.info('system', 'Minio initialized');
        await (0, image_storage_1.initializeImageStorage)();
        logger_1.logger.info('system', (0, minio_1.isMinioEnabled)() ? 'MinIO storage initialized' : 'Local filesystem storage initialized');
        // Clean up any orphaned proxy test jobs from previous server runs
        await (0, proxyTestQueue_1.cleanupOrphanedJobs)();
        // Start the crawl scheduler (checks every minute for jobs to run)
        (0, crawl_scheduler_1.startCrawlScheduler)();
        logger_1.logger.info('system', 'Crawl scheduler started');
        // Start the Dutchie AZ scheduler (enqueues jobs for workers)
        await (0, dutchie_az_1.initializeDefaultSchedules)();
        (0, dutchie_az_1.startScheduler)();
        logger_1.logger.info('system', 'Dutchie AZ scheduler started');
        app.listen(PORT, () => {
            logger_1.logger.info('system', `Server running on port ${PORT}`);
            console.log(`🚀 Server running on port ${PORT}`);
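Given the middleware order above (WordPress key validation first, then usage tracking and rate limiting), an external consumer call looks roughly like this; the host, key, and exact /api/v1 route are placeholders since public-api's routes are not shown in this diff:
// Consumer sketch - x-api-key is the header validateWordPressPermissions reads.
const res = await fetch('http://localhost:3010/api/v1/products', {
    headers: { 'x-api-key': '<64-char hex key from api-permissions>' },
});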
94
backend/dist/middleware/apiTokenTracker.js
vendored
Normal file
94
backend/dist/middleware/apiTokenTracker.js
vendored
Normal file
@@ -0,0 +1,94 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.trackApiUsage = trackApiUsage;
exports.checkRateLimit = checkRateLimit;
const migrate_1 = require("../db/migrate");
async function trackApiUsage(req, res, next) {
    // Only track if authenticated via API token
    if (!req.apiToken) {
        return next();
    }
    const startTime = Date.now();
    req.startTime = startTime;
    // Get request size
    const requestSize = req.headers['content-length']
        ? parseInt(req.headers['content-length'])
        : 0;
    // Capture original res.json to measure response
    const originalJson = res.json.bind(res);
    let responseSize = 0;
    res.json = function (body) {
        responseSize = JSON.stringify(body).length;
        return originalJson(body);
    };
    // Track after response is sent
    res.on('finish', async () => {
        const responseTime = Date.now() - startTime;
        try {
            await migrate_1.pool.query(`
        INSERT INTO api_token_usage (
          token_id,
          endpoint,
          method,
          status_code,
          response_time_ms,
          request_size,
          response_size,
          ip_address,
          user_agent
        )
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
      `, [
                req.apiToken.id,
                req.path,
                req.method,
                res.statusCode,
                responseTime,
                requestSize,
                responseSize,
                req.ip,
                req.headers['user-agent'] || null
            ]);
            // Update last_used_at
            await migrate_1.pool.query('UPDATE api_tokens SET last_used_at = CURRENT_TIMESTAMP WHERE id = $1', [req.apiToken.id]);
        }
        catch (error) {
            console.error('Error tracking API usage:', error);
        }
    });
    next();
}
// Rate limiting check
async function checkRateLimit(req, res, next) {
    if (!req.apiToken) {
        return next();
    }
    const { id, rate_limit } = req.apiToken;
    try {
        // Count requests in the last minute
        const result = await migrate_1.pool.query(`
      SELECT COUNT(*) as request_count
      FROM api_token_usage
      WHERE token_id = $1
        AND created_at > NOW() - INTERVAL '1 minute'
    `, [id]);
        const requestCount = parseInt(result.rows[0].request_count);
        if (requestCount >= rate_limit) {
            return res.status(429).json({
                error: 'Rate limit exceeded',
                limit: rate_limit,
                current: requestCount,
                retry_after: 60
            });
        }
        // Add rate limit headers
        res.setHeader('X-RateLimit-Limit', rate_limit.toString());
        res.setHeader('X-RateLimit-Remaining', (rate_limit - requestCount).toString());
        res.setHeader('X-RateLimit-Reset', new Date(Date.now() + 60000).toISOString());
        next();
    }
    catch (error) {
        console.error('Error checking rate limit:', error);
        next();
    }
}
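A small sketch of a client honoring the 429 payload and headers emitted above:
// Client-side sketch: back off using retry_after, then retry once.
async function callWithBackoff(url, opts) {
    const res = await fetch(url, opts);
    if (res.status === 429) {
        const body = await res.json(); // { error, limit, current, retry_after }
        await new Promise(r => setTimeout(r, body.retry_after * 1000));
        return fetch(url, opts);
    }
    console.log('remaining this minute:', res.headers.get('X-RateLimit-Remaining'));
    return res;
}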
163
backend/dist/middleware/wordpressPermissions.js
vendored
Normal file
163
backend/dist/middleware/wordpressPermissions.js
vendored
Normal file
@@ -0,0 +1,163 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.validateWordPressPermissions = validateWordPressPermissions;
const migrate_1 = require("../db/migrate");
const ipaddr_js_1 = __importDefault(require("ipaddr.js"));
/**
 * Validates if an IP address matches any of the allowed IP patterns
 * Supports exact addresses and CIDR notation
 */
function isIpAllowed(clientIp, allowedIps) {
    try {
        const clientAddr = ipaddr_js_1.default.process(clientIp);
        for (const allowedIp of allowedIps) {
            const trimmed = allowedIp.trim();
            if (!trimmed)
                continue;
            // Check for CIDR notation
            if (trimmed.includes('/')) {
                try {
                    const range = ipaddr_js_1.default.parseCIDR(trimmed);
                    if (clientAddr.match(range)) {
                        return true;
                    }
                }
                catch (e) {
                    console.warn(`Invalid CIDR notation: ${trimmed}`);
                    continue;
                }
            }
            else {
                // Exact match
                try {
                    const allowedAddr = ipaddr_js_1.default.process(trimmed);
                    if (clientAddr.toString() === allowedAddr.toString()) {
                        return true;
                    }
                }
                catch (e) {
                    console.warn(`Invalid IP address: ${trimmed}`);
                    continue;
                }
            }
        }
        return false;
    }
    catch (error) {
        console.error('Error processing client IP:', error);
        return false;
    }
}
/**
 * Validates if a domain matches any of the allowed domain patterns
 * Supports wildcard subdomains (*.example.com)
 */
function isDomainAllowed(origin, allowedDomains) {
    try {
        // Extract domain from origin URL
        const url = new URL(origin);
        const domain = url.hostname;
        for (const allowedDomain of allowedDomains) {
            const trimmed = allowedDomain.trim();
            if (!trimmed)
                continue;
            // Wildcard subdomain support
            if (trimmed.startsWith('*.')) {
                const baseDomain = trimmed.substring(2);
                if (domain === baseDomain || domain.endsWith('.' + baseDomain)) {
                    return true;
                }
            }
            else {
                // Exact match
                if (domain === trimmed) {
                    return true;
                }
            }
        }
        return false;
    }
    catch (error) {
        console.error('Error processing domain:', error);
        return false;
    }
}
/**
 * WordPress API Permissions Middleware
 * Validates API access based on WordPress permissions table
 */
async function validateWordPressPermissions(req, res, next) {
    // Get API key from header
    const apiKey = req.headers['x-api-key'];
    // If no API key provided, skip WordPress validation
    if (!apiKey) {
        return next();
    }
    try {
        // Query WordPress permissions table
        const result = await migrate_1.pool.query(`
      SELECT id, user_name, api_key, allowed_ips, allowed_domains, is_active
      FROM wp_dutchie_api_permissions
      WHERE api_key = $1 AND is_active = 1
    `, [apiKey]);
        if (result.rows.length === 0) {
            return res.status(401).json({
                error: 'Invalid API key'
            });
        }
        const permission = result.rows[0];
        // Get client IP
        const clientIp = req.headers['x-forwarded-for']?.split(',')[0].trim() ||
            req.headers['x-real-ip'] ||
            req.ip ||
            req.connection.remoteAddress ||
            '';
        // Validate IP if configured
        if (permission.allowed_ips) {
            const allowedIps = permission.allowed_ips.split('\n').filter((ip) => ip.trim());
            if (allowedIps.length > 0 && !isIpAllowed(clientIp, allowedIps)) {
                return res.status(403).json({
                    error: 'IP address not allowed',
                    client_ip: clientIp
                });
            }
        }
        // Validate domain if configured
        const origin = req.get('origin') || req.get('referer') || '';
        if (permission.allowed_domains && origin) {
            const allowedDomains = permission.allowed_domains.split('\n').filter((d) => d.trim());
            if (allowedDomains.length > 0 && !isDomainAllowed(origin, allowedDomains)) {
                return res.status(403).json({
                    error: 'Domain not allowed',
                    origin: origin
                });
            }
        }
        // Update last_used_at timestamp (async, don't wait)
        migrate_1.pool.query(`
      UPDATE wp_dutchie_api_permissions
      SET last_used_at = CURRENT_TIMESTAMP
      WHERE id = $1
    `, [permission.id]).catch((err) => {
            console.error('Error updating last_used_at:', err);
        });
        // Set apiToken on request for tracking middleware
        // Default rate limit of 100 requests/minute for WordPress permissions
        req.apiToken = {
            id: permission.id,
            name: permission.user_name,
            rate_limit: 100
        };
        next();
    }
    catch (error) {
        console.error('WordPress permissions validation error:', error);
        return res.status(500).json({
            error: 'Internal server error during API validation'
        });
    }
}
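Worked examples of the two matchers (values are illustrative):
// isIpAllowed: exact addresses or CIDR ranges.
isIpAllowed('10.1.2.3', ['10.0.0.0/8']);      // true  - inside the CIDR range
isIpAllowed('192.168.1.5', ['192.168.1.6']);  // false - exact match only
// isDomainAllowed: exact hostnames or *.wildcard subdomains.
isDomainAllowed('https://shop.example.com', ['*.example.com']); // true (the wildcard also matches example.com itself)
isDomainAllowed('https://example.org', ['example.com']);        // false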
32
backend/dist/migrations-runner/009_image_sizes.js
vendored
Normal file
32
backend/dist/migrations-runner/009_image_sizes.js
vendored
Normal file
@@ -0,0 +1,32 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
(async () => {
    try {
        console.log('🔄 Running image sizes migration...');
        // Add thumbnail and medium paths
        await migrate_1.pool.query(`
      ALTER TABLE products
      ADD COLUMN IF NOT EXISTS thumbnail_path TEXT,
      ADD COLUMN IF NOT EXISTS medium_path TEXT
    `);
        console.log('✅ Added thumbnail_path and medium_path columns');
        // Rename local_image_path to full_path
        await migrate_1.pool.query(`
      ALTER TABLE products
      RENAME COLUMN local_image_path TO full_path
    `);
        console.log('✅ Renamed local_image_path to full_path');
        // Add index
        await migrate_1.pool.query(`
      CREATE INDEX IF NOT EXISTS idx_products_images ON products(full_path, thumbnail_path, medium_path)
    `);
        console.log('✅ Created image index');
        console.log('✅ Migration complete!');
        process.exit(0);
    }
    catch (error) {
        console.error('❌ Migration failed:', error);
        process.exit(1);
    }
})();
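One caveat: unlike the ADD COLUMN IF NOT EXISTS steps, RENAME COLUMN fails if the migration runs twice. A re-runnable guard would look roughly like this (sketch, not part of the commit):
// Sketch: only rename when the old column still exists.
const { rows } = await migrate_1.pool.query(`
    SELECT 1 FROM information_schema.columns
    WHERE table_name = 'products' AND column_name = 'local_image_path'
`);
if (rows.length > 0) {
    await migrate_1.pool.query(`ALTER TABLE products RENAME COLUMN local_image_path TO full_path`);
}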
174
backend/dist/routes/api-permissions.js
vendored
Normal file
174
backend/dist/routes/api-permissions.js
vendored
Normal file
@@ -0,0 +1,174 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const crypto_1 = __importDefault(require("crypto"));
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Generate secure random API key (64-character hex)
function generateApiKey() {
    return crypto_1.default.randomBytes(32).toString('hex');
}
// Get all API permissions
router.get('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const result = await migrate_1.pool.query(`
      SELECT *
      FROM wp_dutchie_api_permissions
      ORDER BY created_at DESC
    `);
        res.json({ permissions: result.rows });
    }
    catch (error) {
        console.error('Error fetching API permissions:', error);
        res.status(500).json({ error: 'Failed to fetch API permissions' });
    }
});
// Get all dispensaries for dropdown (must be before /:id to avoid route conflict)
router.get('/dispensaries', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const result = await migrate_1.pool.query(`
      SELECT id, name
      FROM dispensaries
      ORDER BY name
    `);
        res.json({ dispensaries: result.rows });
    }
    catch (error) {
        console.error('Error fetching dispensaries:', error);
        res.status(500).json({ error: 'Failed to fetch dispensaries' });
    }
});
// Get single API permission
router.get('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const result = await migrate_1.pool.query(`
      SELECT *
      FROM wp_dutchie_api_permissions
      WHERE id = $1
    `, [id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Permission not found' });
        }
        res.json({ permission: result.rows[0] });
    }
    catch (error) {
        console.error('Error fetching API permission:', error);
        res.status(500).json({ error: 'Failed to fetch API permission' });
    }
});
// Create new API permission
router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        // Support both store_id (existing) and dispensary_id (for compatibility)
        const { user_name, allowed_ips, allowed_domains, store_id, dispensary_id } = req.body;
        const storeIdToUse = store_id || dispensary_id;
        if (!user_name) {
            return res.status(400).json({ error: 'User name is required' });
        }
        if (!storeIdToUse) {
            return res.status(400).json({ error: 'Store/Dispensary is required' });
        }
        // Get dispensary name for display
        const dispensaryResult = await migrate_1.pool.query('SELECT name FROM dispensaries WHERE id = $1', [storeIdToUse]);
        if (dispensaryResult.rows.length === 0) {
            return res.status(400).json({ error: 'Invalid store/dispensary ID' });
        }
        const storeName = dispensaryResult.rows[0].name;
        const apiKey = generateApiKey();
        const result = await migrate_1.pool.query(`
      INSERT INTO wp_dutchie_api_permissions (
        user_name,
        api_key,
        allowed_ips,
        allowed_domains,
        is_active,
        store_id,
        store_name
      )
      VALUES ($1, $2, $3, $4, 1, $5, $6)
      RETURNING *
    `, [
            user_name,
            apiKey,
            allowed_ips || null,
            allowed_domains || null,
            storeIdToUse,
            storeName
        ]);
        res.status(201).json({
            permission: result.rows[0],
            message: 'API permission created successfully. Save the API key securely - it cannot be retrieved later.'
        });
    }
    catch (error) {
        console.error('Error creating API permission:', error);
        res.status(500).json({ error: 'Failed to create API permission' });
    }
});
// Update API permission
router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const { user_name, allowed_ips, allowed_domains, is_active } = req.body;
        const result = await migrate_1.pool.query(`
      UPDATE wp_dutchie_api_permissions
      SET
        user_name = COALESCE($1, user_name),
        allowed_ips = COALESCE($2, allowed_ips),
        allowed_domains = COALESCE($3, allowed_domains),
        is_active = COALESCE($4, is_active)
      WHERE id = $5
      RETURNING *
    `, [user_name, allowed_ips, allowed_domains, is_active, id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Permission not found' });
        }
        res.json({ permission: result.rows[0] });
    }
    catch (error) {
        console.error('Error updating API permission:', error);
        res.status(500).json({ error: 'Failed to update API permission' });
    }
});
// Toggle permission active status
router.patch('/:id/toggle', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const result = await migrate_1.pool.query(`
      UPDATE wp_dutchie_api_permissions
      SET is_active = NOT is_active
      WHERE id = $1
      RETURNING *
    `, [id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Permission not found' });
        }
        res.json({ permission: result.rows[0] });
    }
    catch (error) {
        console.error('Error toggling API permission:', error);
        res.status(500).json({ error: 'Failed to toggle API permission' });
    }
});
// Delete API permission
router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
    try {
        const { id } = req.params;
        const result = await migrate_1.pool.query('DELETE FROM wp_dutchie_api_permissions WHERE id = $1 RETURNING *', [id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Permission not found' });
        }
        res.json({ message: 'API permission deleted successfully' });
    }
    catch (error) {
        console.error('Error deleting API permission:', error);
        res.status(500).json({ error: 'Failed to delete API permission' });
    }
});
exports.default = router;
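End to end, creating a permission from an admin session looks like this (the route prefix comes from the /api/api-permissions mount in index.js; the JWT and field values are placeholders):
// Admin-side sketch: POST a new permission, then hand the one-time key to the consumer.
const res = await fetch('http://localhost:3010/api/api-permissions', {
    method: 'POST',
    headers: {
        'Authorization': 'Bearer <admin JWT>',
        'Content-Type': 'application/json',
    },
    body: JSON.stringify({
        user_name: 'example-wp-site',
        dispensary_id: 42,            // store_id is accepted too
        allowed_domains: '*.example.com',
    }),
});
const { permission } = await res.json(); // permission.api_key is shown only once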
265
backend/dist/routes/api-tokens.js
vendored
Normal file
265
backend/dist/routes/api-tokens.js
vendored
Normal file
@@ -0,0 +1,265 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const crypto_1 = __importDefault(require("crypto"));
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Generate secure random token
function generateToken() {
    return crypto_1.default.randomBytes(32).toString('hex');
}
// Get all API tokens
router.get('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const result = await migrate_1.pool.query(`
      SELECT
        t.*,
        u.email as created_by_email,
        (
          SELECT COUNT(*)
          FROM api_token_usage
          WHERE token_id = t.id
            AND created_at > NOW() - INTERVAL '24 hours'
        ) as requests_24h,
        (
          SELECT COUNT(*)
          FROM api_token_usage
          WHERE token_id = t.id
            AND created_at > NOW() - INTERVAL '7 days'
        ) as requests_7d,
        (
          SELECT COUNT(*)
          FROM api_token_usage
          WHERE token_id = t.id
        ) as total_requests
      FROM api_tokens t
      LEFT JOIN users u ON t.user_id = u.id
      ORDER BY t.created_at DESC
    `);
        res.json({ tokens: result.rows });
    }
    catch (error) {
        console.error('Error fetching API tokens:', error);
        res.status(500).json({ error: 'Failed to fetch API tokens' });
    }
});
// Get single API token
router.get('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const result = await migrate_1.pool.query(`
      SELECT
        t.*,
        u.email as created_by_email
      FROM api_tokens t
      LEFT JOIN users u ON t.user_id = u.id
      WHERE t.id = $1
    `, [id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Token not found' });
        }
        res.json({ token: result.rows[0] });
    }
    catch (error) {
        console.error('Error fetching API token:', error);
        res.status(500).json({ error: 'Failed to fetch API token' });
    }
});
// Create new API token
router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const { name, description, rate_limit, allowed_endpoints, expires_at } = req.body;
        const userId = req.user.id;
        if (!name) {
            return res.status(400).json({ error: 'Name is required' });
        }
        const token = generateToken();
        const result = await migrate_1.pool.query(`
      INSERT INTO api_tokens (
        name,
        token,
        description,
        user_id,
        rate_limit,
        allowed_endpoints,
        expires_at
      )
      VALUES ($1, $2, $3, $4, $5, $6, $7)
      RETURNING *
    `, [
            name,
            token,
            description || null,
            userId,
            rate_limit || 100,
            allowed_endpoints || null,
            expires_at || null
        ]);
        res.status(201).json({
            token: result.rows[0],
            message: 'API token created successfully. Save this token securely - it cannot be retrieved later.'
        });
    }
    catch (error) {
        console.error('Error creating API token:', error);
        res.status(500).json({ error: 'Failed to create API token' });
    }
});
// Update API token
router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const { name, description, active, rate_limit, allowed_endpoints, expires_at } = req.body;
        const result = await migrate_1.pool.query(`
      UPDATE api_tokens
      SET
        name = COALESCE($1, name),
        description = COALESCE($2, description),
        active = COALESCE($3, active),
        rate_limit = COALESCE($4, rate_limit),
        allowed_endpoints = COALESCE($5, allowed_endpoints),
        expires_at = COALESCE($6, expires_at)
      WHERE id = $7
      RETURNING *
    `, [name, description, active, rate_limit, allowed_endpoints, expires_at, id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Token not found' });
        }
        res.json({ token: result.rows[0] });
    }
    catch (error) {
        console.error('Error updating API token:', error);
        res.status(500).json({ error: 'Failed to update API token' });
    }
});
// Delete API token
router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
    try {
        const { id } = req.params;
        const result = await migrate_1.pool.query('DELETE FROM api_tokens WHERE id = $1 RETURNING *', [id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Token not found' });
        }
        res.json({ message: 'API token deleted successfully' });
    }
    catch (error) {
        console.error('Error deleting API token:', error);
        res.status(500).json({ error: 'Failed to delete API token' });
    }
});
// Get token usage statistics
router.get('/:id/usage', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const { days = 7 } = req.query;
        // Get hourly usage for the past N days
        const hourlyUsage = await migrate_1.pool.query(`
      SELECT
        DATE_TRUNC('hour', created_at) as hour,
        COUNT(*) as requests,
        AVG(response_time_ms) as avg_response_time,
        SUM(CASE WHEN status_code >= 200 AND status_code < 300 THEN 1 ELSE 0 END) as successful_requests,
        SUM(CASE WHEN status_code >= 400 THEN 1 ELSE 0 END) as failed_requests
      FROM api_token_usage
      WHERE token_id = $1
        AND created_at > NOW() - INTERVAL '${parseInt(days)} days'
      GROUP BY hour
      ORDER BY hour DESC
    `, [id]);
        // Get endpoint usage
        const endpointUsage = await migrate_1.pool.query(`
      SELECT
        endpoint,
        method,
        COUNT(*) as requests,
        AVG(response_time_ms) as avg_response_time
      FROM api_token_usage
      WHERE token_id = $1
        AND created_at > NOW() - INTERVAL '${parseInt(days)} days'
      GROUP BY endpoint, method
      ORDER BY requests DESC
      LIMIT 20
    `, [id]);
        // Get recent requests
        const recentRequests = await migrate_1.pool.query(`
      SELECT
        endpoint,
        method,
        status_code,
        response_time_ms,
        ip_address,
        created_at
      FROM api_token_usage
      WHERE token_id = $1
      ORDER BY created_at DESC
      LIMIT 100
    `, [id]);
        res.json({
            hourly_usage: hourlyUsage.rows,
            endpoint_usage: endpointUsage.rows,
            recent_requests: recentRequests.rows
        });
    }
    catch (error) {
        console.error('Error fetching token usage:', error);
        res.status(500).json({ error: 'Failed to fetch token usage' });
    }
});
// Get overall API usage statistics
router.get('/stats/overview', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const { days = 7 } = req.query;
        const stats = await migrate_1.pool.query(`
      SELECT
        COUNT(DISTINCT token_id) as active_tokens,
        COUNT(*) as total_requests,
        AVG(response_time_ms) as avg_response_time,
        SUM(CASE WHEN status_code >= 200 AND status_code < 300 THEN 1 ELSE 0 END) as successful_requests,
        SUM(CASE WHEN status_code >= 400 THEN 1 ELSE 0 END) as failed_requests
      FROM api_token_usage
      WHERE created_at > NOW() - INTERVAL '${parseInt(days)} days'
    `);
        // Top tokens by usage
        const topTokens = await migrate_1.pool.query(`
      SELECT
        t.id,
        t.name,
        COUNT(u.id) as requests,
        AVG(u.response_time_ms) as avg_response_time
      FROM api_tokens t
      LEFT JOIN api_token_usage u ON t.id = u.token_id
      WHERE u.created_at > NOW() - INTERVAL '${parseInt(days)} days'
      GROUP BY t.id, t.name
      ORDER BY requests DESC
      LIMIT 10
    `);
        // Most used endpoints
        const topEndpoints = await migrate_1.pool.query(`
      SELECT
        endpoint,
        method,
        COUNT(*) as requests,
        AVG(response_time_ms) as avg_response_time
      FROM api_token_usage
      WHERE created_at > NOW() - INTERVAL '${parseInt(days)} days'
      GROUP BY endpoint, method
      ORDER BY requests DESC
      LIMIT 10
    `);
        res.json({
            overview: stats.rows[0],
            top_tokens: topTokens.rows,
            top_endpoints: topEndpoints.rows
        });
    }
    catch (error) {
        console.error('Error fetching API stats:', error);
        res.status(500).json({ error: 'Failed to fetch API stats' });
    }
});
exports.default = router;
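The INTERVAL '${parseInt(days)} days' interpolation is safe only because of the parseInt guard; a fully parameterized variant would use make_interval (sketch of the first query):
// Sketch: bind the window instead of interpolating it into the SQL string.
const hourlyUsage = await migrate_1.pool.query(`
    SELECT DATE_TRUNC('hour', created_at) as hour, COUNT(*) as requests
    FROM api_token_usage
    WHERE token_id = $1
      AND created_at > NOW() - make_interval(days => $2::int)
    GROUP BY hour
    ORDER BY hour DESC
`, [id, days]);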
4
backend/dist/routes/categories.js
vendored
4
backend/dist/routes/categories.js
vendored
@@ -58,11 +58,11 @@ router.get('/tree', async (req, res) => {
    const categoryMap = new Map();
    const tree = [];
    // First pass: create map
    categories.forEach(cat => {
    categories.forEach((cat) => {
        categoryMap.set(cat.id, { ...cat, children: [] });
    });
    // Second pass: build tree
    categories.forEach(cat => {
    categories.forEach((cat) => {
        const node = categoryMap.get(cat.id);
        if (cat.parent_id) {
            const parent = categoryMap.get(cat.parent_id);
152
backend/dist/routes/changes.js
vendored
Normal file
152
backend/dist/routes/changes.js
vendored
Normal file
@@ -0,0 +1,152 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get all changes with optional status filter
router.get('/', async (req, res) => {
    try {
        const { status } = req.query;
        let query = `
            SELECT
                dc.id,
                dc.dispensary_id,
                dc.field_name,
                dc.old_value,
                dc.new_value,
                dc.source,
                dc.confidence_score,
                dc.change_notes,
                dc.status,
                dc.requires_recrawl,
                dc.created_at,
                dc.reviewed_at,
                dc.reviewed_by,
                dc.rejection_reason,
                d.name as dispensary_name,
                d.slug as dispensary_slug,
                d.city,
                d.state
            FROM dispensary_changes dc
            JOIN dispensaries d ON dc.dispensary_id = d.id
        `;
        const params = [];
        if (status) {
            query += ` WHERE dc.status = $1`;
            params.push(status);
        }
        query += ` ORDER BY dc.created_at DESC`;
        const result = await migrate_1.pool.query(query, params);
        res.json({ changes: result.rows });
    }
    catch (error) {
        console.error('Error fetching changes:', error);
        res.status(500).json({ error: 'Failed to fetch changes' });
    }
});
// Get changes statistics (for alert banner)
router.get('/stats', async (req, res) => {
    try {
        const result = await migrate_1.pool.query(`
            SELECT
                COUNT(*) FILTER (WHERE status = 'pending') as pending_count,
                COUNT(*) FILTER (WHERE status = 'pending' AND requires_recrawl = TRUE) as pending_recrawl_count,
                COUNT(*) FILTER (WHERE status = 'approved') as approved_count,
                COUNT(*) FILTER (WHERE status = 'rejected') as rejected_count
            FROM dispensary_changes
        `);
        res.json(result.rows[0]);
    }
    catch (error) {
        console.error('Error fetching change stats:', error);
        res.status(500).json({ error: 'Failed to fetch change stats' });
    }
});
// Approve a change and apply it to the dispensary
router.post('/:id/approve', async (req, res) => {
    const client = await migrate_1.pool.connect();
    try {
        await client.query('BEGIN');
        const { id } = req.params;
        const userId = req.user?.id; // From auth middleware
        // Get the change record
        const changeResult = await client.query(`
            SELECT * FROM dispensary_changes WHERE id = $1 AND status = 'pending'
        `, [id]);
        if (changeResult.rows.length === 0) {
            await client.query('ROLLBACK');
            return res.status(404).json({ error: 'Pending change not found' });
        }
        const change = changeResult.rows[0];
        // Apply the change to the dispensary table
        const updateQuery = `
            UPDATE dispensaries
            SET ${change.field_name} = $1, updated_at = CURRENT_TIMESTAMP
            WHERE id = $2
            RETURNING *
        `;
        const dispensaryResult = await client.query(updateQuery, [
            change.new_value,
            change.dispensary_id
        ]);
        if (dispensaryResult.rows.length === 0) {
            await client.query('ROLLBACK');
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        // Mark the change as approved
        await client.query(`
            UPDATE dispensary_changes
            SET
                status = 'approved',
                reviewed_at = CURRENT_TIMESTAMP,
                reviewed_by = $1
            WHERE id = $2
        `, [userId, id]);
        await client.query('COMMIT');
        res.json({
            message: 'Change approved and applied',
            dispensary: dispensaryResult.rows[0],
            requires_recrawl: change.requires_recrawl
        });
    }
    catch (error) {
        await client.query('ROLLBACK');
        console.error('Error approving change:', error);
        res.status(500).json({ error: 'Failed to approve change' });
    }
    finally {
        client.release();
    }
});
// Reject a change with optional reason
router.post('/:id/reject', async (req, res) => {
    try {
        const { id } = req.params;
        const { reason } = req.body;
        const userId = req.user?.id; // From auth middleware
        const result = await migrate_1.pool.query(`
            UPDATE dispensary_changes
            SET
                status = 'rejected',
                reviewed_at = CURRENT_TIMESTAMP,
                reviewed_by = $1,
                rejection_reason = $2
            WHERE id = $3 AND status = 'pending'
            RETURNING *
        `, [userId, reason, id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Pending change not found' });
        }
        res.json({
            message: 'Change rejected',
            change: result.rows[0]
        });
    }
    catch (error) {
        console.error('Error rejecting change:', error);
        res.status(500).json({ error: 'Failed to reject change' });
    }
});
exports.default = router;
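Note that the approve handler splices `change.field_name` directly into the `UPDATE dispensaries SET ...` statement, so any row in `dispensary_changes` effectively names a column. A hedged sketch of an allow-list guard that could sit just before that query (the column names are illustrative, not taken from this commit):

// Hypothetical guard: only let vetted columns reach the dynamic UPDATE.
const UPDATABLE_FIELDS = new Set([
    'dba_name', 'website', 'phone', 'email', 'menu_url' // assumed examples
]);
if (!UPDATABLE_FIELDS.has(change.field_name)) {
    await client.query('ROLLBACK');
    return res.status(400).json({ error: `Field not updatable: ${change.field_name}` });
}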
497
backend/dist/routes/crawler-sandbox.js
vendored
Normal file
@@ -0,0 +1,497 @@
"use strict";
/**
 * Crawler Sandbox API Routes
 *
 * Endpoints for managing sandbox crawls, templates, and provider detection
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = __importDefault(require("express"));
const migrate_1 = require("../db/migrate");
const middleware_1 = require("../auth/middleware");
const logger_1 = require("../services/logger");
const crawler_jobs_1 = require("../services/crawler-jobs");
const router = express_1.default.Router();
// Apply auth middleware to all routes
router.use(middleware_1.authMiddleware);
// ========================================
// Sandbox Entries
// ========================================
/**
 * GET /api/crawler-sandbox
 * List sandbox entries with optional filters
 */
router.get('/', async (req, res) => {
    try {
        const { status, dispensaryId, limit = 50, offset = 0 } = req.query;
        let query = `
            SELECT cs.*, d.name as dispensary_name, d.website, d.menu_provider, d.crawler_status
            FROM crawler_sandboxes cs
            JOIN dispensaries d ON d.id = cs.dispensary_id
            WHERE 1=1
        `;
        const params = [];
        let paramIndex = 1;
        if (status) {
            query += ` AND cs.status = $${paramIndex}`;
            params.push(status);
            paramIndex++;
        }
        if (dispensaryId) {
            query += ` AND cs.dispensary_id = $${paramIndex}`;
            params.push(Number(dispensaryId));
            paramIndex++;
        }
        query += ` ORDER BY cs.created_at DESC LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`;
        params.push(Number(limit), Number(offset));
        const result = await migrate_1.pool.query(query, params);
        // Get total count
        const countResult = await migrate_1.pool.query(`SELECT COUNT(*) FROM crawler_sandboxes cs WHERE 1=1
            ${status ? 'AND cs.status = $1' : ''}
            ${dispensaryId ? `AND cs.dispensary_id = $${status ? 2 : 1}` : ''}`, status && dispensaryId ? [status, dispensaryId] : status ? [status] : dispensaryId ? [dispensaryId] : []);
        res.json({
            sandboxes: result.rows,
            total: parseInt(countResult.rows[0].count),
            limit: Number(limit),
            offset: Number(offset),
        });
    }
    catch (error) {
        logger_1.logger.error('api', `Get sandboxes error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
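The list route builds its WHERE clause twice: incrementally for the page query, then again through nested ternaries for the count. A sketch of assembling the conditions once and feeding both statements from the same arrays (names are illustrative, not from this commit):

// Hypothetical refactor: one condition list drives both the page and count queries.
const conditions = [];
const filterParams = [];
if (status) {
    filterParams.push(status);
    conditions.push(`cs.status = $${filterParams.length}`);
}
if (dispensaryId) {
    filterParams.push(Number(dispensaryId));
    conditions.push(`cs.dispensary_id = $${filterParams.length}`);
}
const where = conditions.length ? `AND ${conditions.join(' AND ')}` : '';
const countResult = await migrate_1.pool.query(
    `SELECT COUNT(*) FROM crawler_sandboxes cs WHERE 1=1 ${where}`,
    filterParams
);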
/**
 * GET /api/crawler-sandbox/:id
 * Get a single sandbox entry with full details
 */
router.get('/:id', async (req, res) => {
    try {
        const { id } = req.params;
        const result = await migrate_1.pool.query(`SELECT cs.*, d.name as dispensary_name, d.website, d.menu_url,
                d.menu_provider, d.menu_provider_confidence, d.crawler_mode, d.crawler_status
            FROM crawler_sandboxes cs
            JOIN dispensaries d ON d.id = cs.dispensary_id
            WHERE cs.id = $1`, [id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Sandbox entry not found' });
        }
        // Get related jobs
        const jobs = await migrate_1.pool.query(`SELECT * FROM sandbox_crawl_jobs
            WHERE sandbox_id = $1 OR dispensary_id = $2
            ORDER BY created_at DESC
            LIMIT 10`, [id, result.rows[0].dispensary_id]);
        res.json({
            sandbox: result.rows[0],
            jobs: jobs.rows,
        });
    }
    catch (error) {
        logger_1.logger.error('api', `Get sandbox error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
/**
 * POST /api/crawler-sandbox/:id/analyze
 * Trigger re-analysis of a sandbox entry
 */
router.post('/:id/analyze', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const sandbox = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [id]);
        if (sandbox.rows.length === 0) {
            return res.status(404).json({ error: 'Sandbox entry not found' });
        }
        // Queue a new sandbox job
        const job = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
            VALUES ($1, $2, 'deep_crawl', 'pending', 20)
            RETURNING id`, [sandbox.rows[0].dispensary_id, id]);
        // Update sandbox status
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'pending', updated_at = NOW() WHERE id = $1`, [id]);
        res.json({
            message: 'Analysis job queued',
            jobId: job.rows[0].id,
        });
    }
    catch (error) {
        logger_1.logger.error('api', `Analyze sandbox error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
/**
 * POST /api/crawler-sandbox/:id/move-to-production
 * Move a sandbox entry to production (for Dutchie dispensaries)
 */
router.post('/:id/move-to-production', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const sandbox = await migrate_1.pool.query(`SELECT cs.*, d.menu_provider
            FROM crawler_sandboxes cs
            JOIN dispensaries d ON d.id = cs.dispensary_id
            WHERE cs.id = $1`, [id]);
        if (sandbox.rows.length === 0) {
            return res.status(404).json({ error: 'Sandbox entry not found' });
        }
        // Can only move to production if provider is dutchie
        if (sandbox.rows[0].menu_provider !== 'dutchie') {
            return res.status(400).json({
                error: 'Only Dutchie dispensaries can be moved to production currently',
            });
        }
        // Update dispensary to production mode
        await migrate_1.pool.query(`UPDATE dispensaries
            SET crawler_mode = 'production', crawler_status = 'idle', updated_at = NOW()
            WHERE id = $1`, [sandbox.rows[0].dispensary_id]);
        // Mark sandbox as moved
        await migrate_1.pool.query(`UPDATE crawler_sandboxes
            SET status = 'moved_to_production', updated_at = NOW()
            WHERE id = $1`, [id]);
        res.json({ message: 'Dispensary moved to production' });
    }
    catch (error) {
        logger_1.logger.error('api', `Move to production error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
/**
 * PATCH /api/crawler-sandbox/:id
 * Update sandbox entry (e.g., add human review notes)
 */
router.patch('/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const { human_review_notes, status, suspected_menu_provider } = req.body;
        const updates = [];
        const params = [];
        let paramIndex = 1;
        if (human_review_notes !== undefined) {
            updates.push(`human_review_notes = $${paramIndex}`);
            params.push(human_review_notes);
            paramIndex++;
        }
        if (status) {
            updates.push(`status = $${paramIndex}`);
            params.push(status);
            paramIndex++;
        }
        if (suspected_menu_provider !== undefined) {
            updates.push(`suspected_menu_provider = $${paramIndex}`);
            params.push(suspected_menu_provider);
            paramIndex++;
        }
        if (updates.length === 0) {
            return res.status(400).json({ error: 'No updates provided' });
        }
        updates.push('updated_at = NOW()');
        if (human_review_notes !== undefined) {
            updates.push('reviewed_at = NOW()');
        }
        params.push(id);
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params);
        res.json({ message: 'Sandbox updated' });
    }
    catch (error) {
        logger_1.logger.error('api', `Update sandbox error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
// ========================================
// Templates
// ========================================
/**
 * GET /api/crawler-sandbox/templates/list
 * List all crawler templates
 */
router.get('/templates/list', async (req, res) => {
    try {
        const result = await migrate_1.pool.query(`SELECT * FROM crawler_templates ORDER BY provider, is_default_for_provider DESC, name`);
        res.json({ templates: result.rows });
    }
    catch (error) {
        logger_1.logger.error('api', `Get templates error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
/**
 * GET /api/crawler-sandbox/templates/:id
 * Get a single template
 */
router.get('/templates/:id', async (req, res) => {
    try {
        const { id } = req.params;
        const result = await migrate_1.pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Template not found' });
        }
        res.json({ template: result.rows[0] });
    }
    catch (error) {
        logger_1.logger.error('api', `Get template error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
/**
 * POST /api/crawler-sandbox/templates
 * Create a new template
 */
router.post('/templates', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, } = req.body;
        if (!provider || !name) {
            return res.status(400).json({ error: 'provider and name are required' });
        }
        const result = await migrate_1.pool.query(`INSERT INTO crawler_templates
            (provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, created_by)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
            RETURNING *`, [
            provider,
            name,
            JSON.stringify(selector_config || {}),
            JSON.stringify(navigation_config || {}),
            JSON.stringify(transform_config || {}),
            JSON.stringify(validation_rules || {}),
            notes,
            req.user?.email || 'system',
        ]);
        res.status(201).json({ template: result.rows[0] });
    }
    catch (error) {
        logger_1.logger.error('api', `Create template error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
/**
 * PUT /api/crawler-sandbox/templates/:id
 * Update a template
 */
router.put('/templates/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const { is_active, is_default_for_provider, selector_config, navigation_config, transform_config, validation_rules, notes, } = req.body;
        const updates = [];
        const params = [];
        let paramIndex = 1;
        if (is_active !== undefined) {
            updates.push(`is_active = $${paramIndex}`);
            params.push(is_active);
            paramIndex++;
        }
        if (is_default_for_provider !== undefined) {
            updates.push(`is_default_for_provider = $${paramIndex}`);
            params.push(is_default_for_provider);
            paramIndex++;
        }
        if (selector_config !== undefined) {
            updates.push(`selector_config = $${paramIndex}`);
            params.push(JSON.stringify(selector_config));
            paramIndex++;
        }
        if (navigation_config !== undefined) {
            updates.push(`navigation_config = $${paramIndex}`);
            params.push(JSON.stringify(navigation_config));
            paramIndex++;
        }
        if (transform_config !== undefined) {
            updates.push(`transform_config = $${paramIndex}`);
            params.push(JSON.stringify(transform_config));
            paramIndex++;
        }
        if (validation_rules !== undefined) {
            updates.push(`validation_rules = $${paramIndex}`);
            params.push(JSON.stringify(validation_rules));
            paramIndex++;
        }
        if (notes !== undefined) {
            updates.push(`notes = $${paramIndex}`);
            params.push(notes);
            paramIndex++;
        }
        if (updates.length === 0) {
            return res.status(400).json({ error: 'No updates provided' });
        }
        updates.push('updated_at = NOW()');
        params.push(id);
        await migrate_1.pool.query(`UPDATE crawler_templates SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params);
        const result = await migrate_1.pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]);
        res.json({ template: result.rows[0] });
    }
    catch (error) {
        logger_1.logger.error('api', `Update template error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
// ========================================
// Jobs
// ========================================
/**
 * GET /api/crawler-sandbox/jobs/list
 * List sandbox crawl jobs
 */
router.get('/jobs/list', async (req, res) => {
    try {
        const { status, dispensaryId, limit = 50 } = req.query;
        let query = `
            SELECT sj.*, d.name as dispensary_name
            FROM sandbox_crawl_jobs sj
            JOIN dispensaries d ON d.id = sj.dispensary_id
            WHERE 1=1
        `;
        const params = [];
        let paramIndex = 1;
        if (status) {
            query += ` AND sj.status = $${paramIndex}`;
            params.push(status);
            paramIndex++;
        }
        if (dispensaryId) {
            query += ` AND sj.dispensary_id = $${paramIndex}`;
            params.push(Number(dispensaryId));
            paramIndex++;
        }
        query += ` ORDER BY sj.created_at DESC LIMIT $${paramIndex}`;
        params.push(Number(limit));
        const result = await migrate_1.pool.query(query, params);
        res.json({ jobs: result.rows });
    }
    catch (error) {
        logger_1.logger.error('api', `Get jobs error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
/**
 * POST /api/crawler-sandbox/jobs/detect/:dispensaryId
 * Trigger provider detection for a dispensary
 */
router.post('/jobs/detect/:dispensaryId', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { dispensaryId } = req.params;
        // Create detection job
        const job = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
            VALUES ($1, 'detection', 'pending', 30)
            RETURNING id`, [dispensaryId]);
        // Update dispensary status
        await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`, [dispensaryId]);
        res.json({
            message: 'Detection job queued',
            jobId: job.rows[0].id,
        });
    }
    catch (error) {
        logger_1.logger.error('api', `Queue detection error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
/**
 * POST /api/crawler-sandbox/jobs/run/:id
 * Immediately run a sandbox job
 */
router.post('/jobs/run/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const job = await migrate_1.pool.query('SELECT * FROM sandbox_crawl_jobs WHERE id = $1', [id]);
        if (job.rows.length === 0) {
            return res.status(404).json({ error: 'Job not found' });
        }
        const jobData = job.rows[0];
        // Run the job immediately
        let result;
        if (jobData.job_type === 'detection') {
            result = await (0, crawler_jobs_1.runDetectMenuProviderJob)(jobData.dispensary_id);
        }
        else {
            result = await (0, crawler_jobs_1.runSandboxCrawlJob)(jobData.dispensary_id, jobData.sandbox_id);
        }
        // Update job status
        await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
            SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
            WHERE id = $4`, [
            result.success ? 'completed' : 'failed',
            JSON.stringify(result.data || {}),
            result.success ? null : result.message,
            id,
        ]);
        res.json(result);
    }
    catch (error) {
        logger_1.logger.error('api', `Run job error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
// ========================================
// Stats
// ========================================
/**
 * GET /api/crawler-sandbox/stats/overview
 * Get sandbox/crawler statistics
 */
router.get('/stats/overview', async (req, res) => {
    try {
        // Dispensary provider stats
        const providerStats = await migrate_1.pool.query(`
            SELECT
                menu_provider,
                COUNT(*) as count,
                AVG(menu_provider_confidence)::integer as avg_confidence
            FROM dispensaries
            WHERE menu_provider IS NOT NULL
            GROUP BY menu_provider
            ORDER BY count DESC
        `);
        // Mode stats
        const modeStats = await migrate_1.pool.query(`
            SELECT
                crawler_mode,
                COUNT(*) as count
            FROM dispensaries
            GROUP BY crawler_mode
        `);
        // Status stats
        const statusStats = await migrate_1.pool.query(`
            SELECT
                crawler_status,
                COUNT(*) as count
            FROM dispensaries
            GROUP BY crawler_status
            ORDER BY count DESC
        `);
        // Sandbox stats
        const sandboxStats = await migrate_1.pool.query(`
            SELECT
                status,
                COUNT(*) as count
            FROM crawler_sandboxes
            GROUP BY status
        `);
        // Job stats
        const jobStats = await migrate_1.pool.query(`
            SELECT
                status,
                job_type,
                COUNT(*) as count
            FROM sandbox_crawl_jobs
            GROUP BY status, job_type
        `);
        // Recent activity
        const recentActivity = await migrate_1.pool.query(`
            SELECT 'sandbox' as type, id, dispensary_id, status, created_at
            FROM crawler_sandboxes
            ORDER BY created_at DESC
            LIMIT 5
        `);
        res.json({
            providers: providerStats.rows,
            modes: modeStats.rows,
            statuses: statusStats.rows,
            sandbox: sandboxStats.rows,
            jobs: jobStats.rows,
            recentActivity: recentActivity.rows,
        });
    }
    catch (error) {
        logger_1.logger.error('api', `Get stats error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
exports.default = router;
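How a client might exercise the detection flow end to end, as a hedged sketch: the base URL and token are placeholders, a fetch-capable Node runtime is assumed, and the response shapes are read off the handlers above.

// Hypothetical client usage of the sandbox job endpoints.
const base = 'http://localhost:3000/api/crawler-sandbox'; // assumed host/prefix
const headers = {
    Authorization: `Bearer ${process.env.API_TOKEN}`, // assumed token source
    'Content-Type': 'application/json',
};
// Queue provider detection for dispensary 42, then run the job inline.
const queued = await fetch(`${base}/jobs/detect/42`, { method: 'POST', headers }).then(r => r.json());
const run = await fetch(`${base}/jobs/run/${queued.jobId}`, { method: 'POST', headers }).then(r => r.json());
console.log(run.success, run.message);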
134
backend/dist/routes/dashboard.js
vendored
@@ -2,63 +2,70 @@
 Object.defineProperty(exports, "__esModule", { value: true });
 const express_1 = require("express");
 const middleware_1 = require("../auth/middleware");
 const migrate_1 = require("../db/migrate");
+const connection_1 = require("../dutchie-az/db/connection");
 const router = (0, express_1.Router)();
 router.use(middleware_1.authMiddleware);
-// Get dashboard stats
+// Get dashboard stats - uses consolidated dutchie-az DB
 router.get('/stats', async (req, res) => {
     try {
-        // Store stats
-        const storesResult = await migrate_1.pool.query(`
-            SELECT
+        // Store stats from dispensaries table in consolidated DB
+        const dispensariesResult = await (0, connection_1.query)(`
+            SELECT
                 COUNT(*) as total,
-                COUNT(*) FILTER (WHERE active = true) as active,
-                MIN(last_scraped_at) as oldest_scrape,
-                MAX(last_scraped_at) as latest_scrape
-            FROM stores
+                COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != 'unknown') as active,
+                COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
+                COUNT(*) FILTER (WHERE menu_url IS NOT NULL) as with_menu_url,
+                MIN(last_crawled_at) as oldest_crawl,
+                MAX(last_crawled_at) as latest_crawl
+            FROM dispensaries
         `);
-        // Product stats
-        const productsResult = await migrate_1.pool.query(`
-            SELECT
+        // Product stats from dutchie_products table
+        const productsResult = await (0, connection_1.query)(`
+            SELECT
                 COUNT(*) as total,
-                COUNT(*) FILTER (WHERE in_stock = true) as in_stock,
-                COUNT(*) FILTER (WHERE local_image_path IS NOT NULL) as with_images
-            FROM products
+                COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock,
+                COUNT(*) FILTER (WHERE primary_image_url IS NOT NULL) as with_images,
+                COUNT(DISTINCT brand_name) FILTER (WHERE brand_name IS NOT NULL AND brand_name != '') as unique_brands,
+                COUNT(DISTINCT dispensary_id) as dispensaries_with_products
+            FROM dutchie_products
         `);
-        // Campaign stats
-        const campaignsResult = await migrate_1.pool.query(`
-            SELECT
-                COUNT(*) as total,
-                COUNT(*) FILTER (WHERE active = true) as active
-            FROM campaigns
-        `);
-        // Recent clicks (last 24 hours)
-        const clicksResult = await migrate_1.pool.query(`
-            SELECT COUNT(*) as clicks_24h
-            FROM clicks
-            WHERE clicked_at >= NOW() - INTERVAL '24 hours'
+        // Brand stats from dutchie_products
+        const brandResult = await (0, connection_1.query)(`
+            SELECT COUNT(DISTINCT brand_name) as total
+            FROM dutchie_products
+            WHERE brand_name IS NOT NULL AND brand_name != ''
         `);
         // Recent products added (last 24 hours)
-        const recentProductsResult = await migrate_1.pool.query(`
+        const recentProductsResult = await (0, connection_1.query)(`
             SELECT COUNT(*) as new_products_24h
-            FROM products
-            WHERE first_seen_at >= NOW() - INTERVAL '24 hours'
-        `);
-        // Proxy stats
-        const proxiesResult = await migrate_1.pool.query(`
-            SELECT
-                COUNT(*) as total,
-                COUNT(*) FILTER (WHERE active = true) as active,
-                COUNT(*) FILTER (WHERE is_anonymous = true) as anonymous
-            FROM proxies
+            FROM dutchie_products
+            WHERE created_at >= NOW() - INTERVAL '24 hours'
         `);
+        // Combine results
+        const storeStats = dispensariesResult.rows[0];
+        const productStats = productsResult.rows[0];
         res.json({
-            stores: storesResult.rows[0],
-            products: productsResult.rows[0],
-            campaigns: campaignsResult.rows[0],
-            clicks: clicksResult.rows[0],
-            recent: recentProductsResult.rows[0],
-            proxies: proxiesResult.rows[0]
+            stores: {
+                total: parseInt(storeStats.total) || 0,
+                active: parseInt(storeStats.active) || 0,
+                with_menu_url: parseInt(storeStats.with_menu_url) || 0,
+                with_platform_id: parseInt(storeStats.with_platform_id) || 0,
+                oldest_crawl: storeStats.oldest_crawl,
+                latest_crawl: storeStats.latest_crawl
+            },
+            products: {
+                total: parseInt(productStats.total) || 0,
+                in_stock: parseInt(productStats.in_stock) || 0,
+                with_images: parseInt(productStats.with_images) || 0,
+                unique_brands: parseInt(productStats.unique_brands) || 0,
+                dispensaries_with_products: parseInt(productStats.dispensaries_with_products) || 0
+            },
+            brands: {
+                total: parseInt(brandResult.rows[0].total) || 0
+            },
+            campaigns: { total: 0, active: 0 }, // Legacy - no longer used
+            clicks: { clicks_24h: 0 }, // Legacy - no longer used
+            recent: recentProductsResult.rows[0]
         });
     }
     catch (error) {
@@ -66,27 +73,34 @@ router.get('/stats', async (req, res) => {
         res.status(500).json({ error: 'Failed to fetch dashboard stats' });
     }
 });
-// Get recent activity
+// Get recent activity - from consolidated dutchie-az DB
 router.get('/activity', async (req, res) => {
     try {
         const { limit = 20 } = req.query;
-        // Recent scrapes
-        const scrapesResult = await migrate_1.pool.query(`
-            SELECT s.name, s.last_scraped_at,
-                COUNT(p.id) as product_count
-            FROM stores s
-            LEFT JOIN products p ON s.id = p.store_id AND p.last_seen_at = s.last_scraped_at
-            WHERE s.last_scraped_at IS NOT NULL
-            GROUP BY s.id, s.name, s.last_scraped_at
-            ORDER BY s.last_scraped_at DESC
+        // Recent crawls from dispensaries (with product counts from dutchie_products)
+        const scrapesResult = await (0, connection_1.query)(`
+            SELECT
+                d.name,
+                d.last_crawled_at as last_scraped_at,
+                d.product_count
+            FROM dispensaries d
+            WHERE d.last_crawled_at IS NOT NULL
+            ORDER BY d.last_crawled_at DESC
             LIMIT $1
         `, [limit]);
-        // Recent products
-        const productsResult = await migrate_1.pool.query(`
-            SELECT p.name, p.price, s.name as store_name, p.first_seen_at
-            FROM products p
-            JOIN stores s ON p.store_id = s.id
-            ORDER BY p.first_seen_at DESC
+        // Recent products from dutchie_products
+        const productsResult = await (0, connection_1.query)(`
+            SELECT
+                p.name,
+                0 as price,
+                p.brand_name as brand,
+                p.thc as thc_percentage,
+                p.cbd as cbd_percentage,
+                d.name as store_name,
+                p.created_at as first_seen_at
+            FROM dutchie_products p
+            JOIN dispensaries d ON p.dispensary_id = d.id
+            ORDER BY p.created_at DESC
             LIMIT $1
         `, [limit]);
         res.json({
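node-postgres returns COUNT(*) and other bigint aggregates as strings, which is why every stat in the new handler passes through `parseInt(...) || 0`. A tiny helper the same conversion could hang off (illustrative only, not part of the commit):

// Hypothetical helper: normalize pg's string-typed bigint aggregates.
const toInt = (value) => Number.parseInt(value, 10) || 0;
// e.g. stores: { total: toInt(storeStats.total), active: toInt(storeStats.active), ... }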
437
backend/dist/routes/dispensaries.js
vendored
Normal file
@@ -0,0 +1,437 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Valid menu_type values
const VALID_MENU_TYPES = ['dutchie', 'treez', 'jane', 'weedmaps', 'leafly', 'meadow', 'blaze', 'flowhub', 'dispense', 'cova', 'other', 'unknown'];
// Get all dispensaries
router.get('/', async (req, res) => {
    try {
        const { menu_type } = req.query;
        let query = `
            SELECT
                id,
                azdhs_id,
                name,
                company_name,
                slug,
                address,
                city,
                state,
                zip,
                phone,
                email,
                website,
                dba_name,
                google_rating,
                google_review_count,
                status_line,
                azdhs_url,
                latitude,
                longitude,
                menu_url,
                menu_type,
                menu_provider,
                menu_provider_confidence,
                scraper_template,
                last_menu_scrape,
                menu_scrape_status,
                platform_dispensary_id,
                created_at,
                updated_at
            FROM dispensaries
        `;
        const params = [];
        // Filter by menu_type if provided
        if (menu_type) {
            query += ` WHERE menu_type = $1`;
            params.push(menu_type);
        }
        query += ` ORDER BY name`;
        const result = await migrate_1.pool.query(query, params);
        res.json({ dispensaries: result.rows });
    }
    catch (error) {
        console.error('Error fetching dispensaries:', error);
        res.status(500).json({ error: 'Failed to fetch dispensaries' });
    }
});
// Get menu type stats
router.get('/stats/menu-types', async (req, res) => {
    try {
        const result = await migrate_1.pool.query(`
            SELECT menu_type, COUNT(*) as count
            FROM dispensaries
            GROUP BY menu_type
            ORDER BY count DESC
        `);
        res.json({ menu_types: result.rows, valid_types: VALID_MENU_TYPES });
    }
    catch (error) {
        console.error('Error fetching menu type stats:', error);
        res.status(500).json({ error: 'Failed to fetch menu type stats' });
    }
});
// Get single dispensary by slug
router.get('/:slug', async (req, res) => {
    try {
        const { slug } = req.params;
        const result = await migrate_1.pool.query(`
            SELECT
                id,
                azdhs_id,
                name,
                company_name,
                slug,
                address,
                city,
                state,
                zip,
                phone,
                email,
                website,
                dba_name,
                google_rating,
                google_review_count,
                status_line,
                azdhs_url,
                latitude,
                longitude,
                menu_url,
                menu_type,
                menu_provider,
                menu_provider_confidence,
                scraper_template,
                scraper_config,
                last_menu_scrape,
                menu_scrape_status,
                platform_dispensary_id,
                created_at,
                updated_at
            FROM dispensaries
            WHERE slug = $1
        `, [slug]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        res.json(result.rows[0]);
    }
    catch (error) {
        console.error('Error fetching dispensary:', error);
        res.status(500).json({ error: 'Failed to fetch dispensary' });
    }
});
// Update dispensary
router.put('/:id', async (req, res) => {
    try {
        const { id } = req.params;
        const { dba_name, website, phone, email, google_rating, google_review_count, menu_url, menu_type, scraper_template, scraper_config, menu_scrape_status } = req.body;
        // Validate menu_type if provided
        if (menu_type !== undefined && menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
            return res.status(400).json({
                error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')}`
            });
        }
        const result = await migrate_1.pool.query(`
            UPDATE dispensaries
            SET
                dba_name = COALESCE($1, dba_name),
                website = COALESCE($2, website),
                phone = COALESCE($3, phone),
                email = COALESCE($4, email),
                google_rating = COALESCE($5, google_rating),
                google_review_count = COALESCE($6, google_review_count),
                menu_url = COALESCE($7, menu_url),
                menu_type = COALESCE($8, menu_type),
                scraper_template = COALESCE($9, scraper_template),
                scraper_config = COALESCE($10, scraper_config),
                menu_scrape_status = COALESCE($11, menu_scrape_status),
                updated_at = CURRENT_TIMESTAMP
            WHERE id = $12
            RETURNING *
        `, [
            dba_name,
            website,
            phone,
            email,
            google_rating,
            google_review_count,
            menu_url,
            menu_type,
            scraper_template,
            scraper_config,
            menu_scrape_status,
            id
        ]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        res.json(result.rows[0]);
    }
    catch (error) {
        console.error('Error updating dispensary:', error);
        res.status(500).json({ error: 'Failed to update dispensary' });
    }
});
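The PUT handler COALESCEs every column against its current value, so omitted fields stay untouched; a side effect is that a field cannot be cleared to NULL through this route. A hedged example of a partial update request (host, prefix, and id are placeholders):

// Hypothetical partial update: only phone and menu_type change;
// every other column keeps its current value via COALESCE.
await fetch('http://localhost:3000/api/dispensaries/42', { // assumed host/prefix
    method: 'PUT',
    headers: { Authorization: `Bearer ${token}`, 'Content-Type': 'application/json' },
    body: JSON.stringify({ phone: '602-555-0100', menu_type: 'dutchie' })
});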
// Get products for a dispensary by slug
router.get('/:slug/products', async (req, res) => {
    try {
        const { slug } = req.params;
        const { category } = req.query;
        // First get the dispensary ID from slug
        const dispensaryResult = await migrate_1.pool.query(`
            SELECT id FROM dispensaries WHERE slug = $1
        `, [slug]);
        if (dispensaryResult.rows.length === 0) {
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        const dispensaryId = dispensaryResult.rows[0].id;
        // Build query for products
        let query = `
            SELECT
                p.id,
                p.name,
                p.brand,
                p.variant,
                p.slug,
                p.description,
                p.regular_price,
                p.sale_price,
                p.thc_percentage,
                p.cbd_percentage,
                p.strain_type,
                p.terpenes,
                p.effects,
                p.flavors,
                p.image_url,
                p.dutchie_url,
                p.in_stock,
                p.created_at,
                p.updated_at
            FROM products p
            WHERE p.dispensary_id = $1
        `;
        const params = [dispensaryId];
        if (category) {
            query += ` AND p.category = $2`;
            params.push(category);
        }
        query += ` ORDER BY p.created_at DESC`;
        const result = await migrate_1.pool.query(query, params);
        res.json({ products: result.rows });
    }
    catch (error) {
        console.error('Error fetching dispensary products:', error);
        res.status(500).json({ error: 'Failed to fetch products' });
    }
});
// Get unique brands for a dispensary by slug
router.get('/:slug/brands', async (req, res) => {
    try {
        const { slug } = req.params;
        const { search } = req.query;
        // First get the dispensary ID from slug
        const dispensaryResult = await migrate_1.pool.query(`
            SELECT id FROM dispensaries WHERE slug = $1
        `, [slug]);
        if (dispensaryResult.rows.length === 0) {
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        const dispensaryId = dispensaryResult.rows[0].id;
        // Build query with optional search filter
        let query = `
            SELECT DISTINCT
                brand,
                COUNT(*) as product_count
            FROM products
            WHERE dispensary_id = $1 AND brand IS NOT NULL
        `;
        const params = [dispensaryId];
        // Add search filter if provided
        if (search) {
            query += ` AND brand ILIKE $2`;
            params.push(`%${search}%`);
        }
        query += ` GROUP BY brand ORDER BY product_count DESC, brand ASC`;
        const result = await migrate_1.pool.query(query, params);
        res.json({ brands: result.rows });
    }
    catch (error) {
        console.error('Error fetching dispensary brands:', error);
        res.status(500).json({ error: 'Failed to fetch brands' });
    }
});
// Get products with discounts/specials for a dispensary by slug
router.get('/:slug/specials', async (req, res) => {
    try {
        const { slug } = req.params;
        const { search } = req.query;
        // First get the dispensary ID from slug
        const dispensaryResult = await migrate_1.pool.query(`
            SELECT id FROM dispensaries WHERE slug = $1
        `, [slug]);
        if (dispensaryResult.rows.length === 0) {
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        const dispensaryId = dispensaryResult.rows[0].id;
        // Build query to get products with discounts
        let query = `
            SELECT
                p.id,
                p.name,
                p.brand,
                p.variant,
                p.slug,
                p.description,
                p.regular_price,
                p.sale_price,
                p.discount_type,
                p.discount_value,
                p.thc_percentage,
                p.cbd_percentage,
                p.strain_type,
                p.terpenes,
                p.effects,
                p.flavors,
                p.image_url,
                p.dutchie_url,
                p.in_stock,
                p.created_at,
                p.updated_at
            FROM products p
            WHERE p.dispensary_id = $1
                AND p.discount_type IS NOT NULL
                AND p.discount_value IS NOT NULL
        `;
        const params = [dispensaryId];
        // Add search filter if provided
        if (search) {
            query += ` AND (p.name ILIKE $2 OR p.brand ILIKE $2 OR p.description ILIKE $2)`;
            params.push(`%${search}%`);
        }
        query += ` ORDER BY p.created_at DESC`;
        const result = await migrate_1.pool.query(query, params);
        res.json({ specials: result.rows });
    }
    catch (error) {
        console.error('Error fetching dispensary specials:', error);
        res.status(500).json({ error: 'Failed to fetch specials' });
    }
});
// Trigger scraping for a dispensary
router.post('/:slug/scrape', async (req, res) => {
    try {
        const { slug } = req.params;
        const { type } = req.body; // 'products' | 'brands' | 'specials' | 'all'
        if (!['products', 'brands', 'specials', 'all'].includes(type)) {
            return res.status(400).json({ error: 'Invalid type. Must be: products, brands, specials, or all' });
        }
        // Get the dispensary
        const dispensaryResult = await migrate_1.pool.query(`
            SELECT id, name, slug, website, menu_url, scraper_template, scraper_config
            FROM dispensaries
            WHERE slug = $1
        `, [slug]);
        if (dispensaryResult.rows.length === 0) {
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        const dispensary = dispensaryResult.rows[0];
        if (!dispensary.menu_url && !dispensary.website) {
            return res.status(400).json({ error: 'Dispensary has no menu URL or website configured' });
        }
        // Update last_menu_scrape time and status
        await migrate_1.pool.query(`
            UPDATE dispensaries
            SET
                last_menu_scrape = CURRENT_TIMESTAMP,
                menu_scrape_status = 'pending',
                updated_at = CURRENT_TIMESTAMP
            WHERE id = $1
        `, [dispensary.id]);
        // Log the scrape request
        console.log(`[SCRAPE REQUEST] Dispensary: ${dispensary.name} (${slug}), Type: ${type}`);
        console.log(`  Menu URL: ${dispensary.menu_url || dispensary.website}`);
        console.log(`  Template: ${dispensary.scraper_template || 'N/A'}`);
        // TODO: Actually trigger the scraper here
        // For now, this is a placeholder that updates the status
        // You can integrate with your existing scraper infrastructure
        res.json({
            success: true,
            message: `Scraping queued for ${dispensary.name}`,
            type,
            dispensary: {
                id: dispensary.id,
                name: dispensary.name,
                slug: dispensary.slug
            }
        });
    }
    catch (error) {
        console.error('Error triggering scrape:', error);
        res.status(500).json({ error: 'Failed to trigger scraping' });
    }
});
// Update menu_type for a dispensary (dedicated endpoint)
router.patch('/:id/menu-type', async (req, res) => {
    try {
        const { id } = req.params;
        const { menu_type } = req.body;
        // Validate menu_type
        if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
            return res.status(400).json({
                error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)`
            });
        }
        const result = await migrate_1.pool.query(`
            UPDATE dispensaries
            SET menu_type = $1, updated_at = CURRENT_TIMESTAMP
            WHERE id = $2
            RETURNING id, name, slug, menu_type, menu_provider, menu_url
        `, [menu_type || null, id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        res.json({
            success: true,
            dispensary: result.rows[0]
        });
    }
    catch (error) {
        console.error('Error updating menu_type:', error);
        res.status(500).json({ error: 'Failed to update menu_type' });
    }
});
// Bulk update menu_type for multiple dispensaries
router.post('/bulk/menu-type', async (req, res) => {
    try {
        const { dispensary_ids, menu_type } = req.body;
        if (!Array.isArray(dispensary_ids) || dispensary_ids.length === 0) {
            return res.status(400).json({ error: 'dispensary_ids must be a non-empty array' });
        }
        // Validate menu_type
        if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
            return res.status(400).json({
                error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)`
            });
        }
        const result = await migrate_1.pool.query(`
            UPDATE dispensaries
            SET menu_type = $1, updated_at = CURRENT_TIMESTAMP
            WHERE id = ANY($2::int[])
            RETURNING id, name, slug, menu_type
        `, [menu_type || null, dispensary_ids]);
        res.json({
            success: true,
            updated_count: result.rowCount,
            dispensaries: result.rows
        });
    }
    catch (error) {
        console.error('Error bulk updating menu_type:', error);
        res.status(500).json({ error: 'Failed to bulk update menu_type' });
    }
});
exports.default = router;
182
backend/dist/routes/parallel-scrape.js
vendored
Normal file
@@ -0,0 +1,182 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
const middleware_1 = require("../auth/middleware");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
// In-memory job tracking
const activeJobs = new Map();
// Get job status
router.get('/status/:jobId', (req, res) => {
    const job = activeJobs.get(req.params.jobId);
    if (!job) {
        return res.status(404).json({ error: 'Job not found' });
    }
    res.json(job);
});
// List active jobs
router.get('/jobs', (req, res) => {
    const jobs = Array.from(activeJobs.values());
    res.json({ jobs });
});
// Start parallel scrape
router.post('/start', async (req, res) => {
    const { storeName = 'Deeply Rooted', workers = 15, useProxies = true } = req.body;
    try {
        // Find the store
        const storeResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${storeName}%`]);
        if (storeResult.rows.length === 0) {
            return res.status(404).json({ error: `Store not found: ${storeName}` });
        }
        const store = storeResult.rows[0];
        // Get categories
        const categoriesResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [store.id]);
        if (categoriesResult.rows.length === 0) {
            return res.status(404).json({ error: 'No categories found for this store' });
        }
        const categories = categoriesResult.rows;
        // Create job
        const jobId = `scrape-${Date.now()}`;
        const job = {
            id: jobId,
            storeName: store.name,
            status: 'running',
            workers,
            startedAt: new Date(),
            results: []
        };
        activeJobs.set(jobId, job);
        // Start scraping in background
        runParallelScrape(job, store, categories, workers, useProxies).catch(err => {
            console.error('Parallel scrape error:', err);
            job.status = 'failed';
        });
        res.json({
            message: 'Parallel scrape started',
            jobId,
            store: store.name,
            categories: categories.length,
            workers
        });
    }
    catch (error) {
        console.error('Failed to start parallel scrape:', error);
        res.status(500).json({ error: error.message });
    }
});
async function runParallelScrape(job, store, categories, numWorkers, useProxies) {
    const puppeteer = require('puppeteer-extra');
    const StealthPlugin = require('puppeteer-extra-plugin-stealth');
    puppeteer.use(StealthPlugin());
    // Expand categories for multiple passes
    const expandedCategories = [];
    const passes = Math.ceil(numWorkers / Math.max(categories.length, 1));
    for (let i = 0; i < passes; i++) {
        expandedCategories.push(...categories);
    }
    const categoryIndex = { current: 0 };
    const worker = async (workerId) => {
        while (categoryIndex.current < expandedCategories.length) {
            const idx = categoryIndex.current++;
            const category = expandedCategories[idx];
            if (!category)
                break;
            const result = await scrapeCategory(puppeteer, workerId, category, useProxies);
            job.results.push({
                category: category.name,
                success: result.success,
                products: result.products,
                error: result.error
            });
            // Delay between requests
            await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
        }
    };
    // Start workers with staggered starts
    const workers = [];
    for (let i = 0; i < numWorkers; i++) {
        workers.push(worker(i + 1));
        await new Promise(resolve => setTimeout(resolve, 500));
    }
    await Promise.all(workers);
    job.status = 'completed';
    job.completedAt = new Date();
    // Clean up job after 1 hour
    setTimeout(() => activeJobs.delete(job.id), 60 * 60 * 1000);
}
async function scrapeCategory(puppeteer, workerId, category, useProxies) {
    let browser = null;
    let proxyId = null;
    try {
        let proxy = null;
        if (useProxies) {
            proxy = await (0, proxy_1.getActiveProxy)();
        }
        const args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--window-size=1920,1080',
        ];
        if (proxy) {
            proxyId = proxy.id;
            if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
                args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
            }
            else {
                args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
            }
        }
        browser = await puppeteer.launch({
            headless: 'new',
            args,
            executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium',
        });
        const page = await browser.newPage();
        await page.setUserAgent(FIREFOX_USER_AGENT);
        await page.setViewport({ width: 1920, height: 1080 });
        if (proxy?.username && proxy?.password) {
            await page.authenticate({
                username: proxy.username,
                password: proxy.password,
            });
        }
        console.log(`[Worker ${workerId}] Scraping: ${category.name} (${category.url})`);
        const response = await page.goto(category.url, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        if (!response || !response.ok()) {
            throw new Error(`Failed to load page: ${response?.status()}`);
        }
        await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
            timeout: 30000,
        }).catch(() => { });
        const products = await page.evaluate(() => {
            // Try data-testid first, then fall back to product links
            const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
            if (listItems.length > 0)
                return listItems.length;
            return document.querySelectorAll('a[href*="/product/"]').length;
        });
        console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
        await browser.close();
        return { success: true, products };
    }
    catch (error) {
        console.error(`[Worker ${workerId}] Error:`, error.message);
        if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
            (0, proxy_1.putProxyInTimeout)(proxyId, error.message);
        }
        if (browser) {
            await browser.close().catch(() => { });
        }
        return { success: false, products: 0, error: error.message };
    }
}
exports.default = router;
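runParallelScrape fans work out by letting each worker pull the next index from a shared counter until the queue is drained, with staggered startup so the browsers do not all launch at once; the single-threaded event loop is what makes the unsynchronized `current++` safe. The same pattern in a minimal generic form (a sketch, not code from this commit):

// Hypothetical generic version of the shared-index worker pool used above.
async function runPool(items, numWorkers, handler, staggerMs = 500) {
    let next = 0; // shared cursor; read-and-increment happens synchronously
    const worker = async () => {
        while (next < items.length) {
            const item = items[next++];
            await handler(item);
        }
    };
    const workers = [];
    for (let i = 0; i < numWorkers; i++) {
        workers.push(worker());
        await new Promise(resolve => setTimeout(resolve, staggerMs));
    }
    await Promise.all(workers);
}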
251
backend/dist/routes/products.js
vendored
@@ -6,10 +6,69 @@ const migrate_1 = require("../db/migrate");
 const minio_1 = require("../utils/minio");
 const router = (0, express_1.Router)();
 router.use(middleware_1.authMiddleware);
-// Get all products with filters
+// Freshness threshold: data older than this is considered stale
+const STALE_THRESHOLD_HOURS = 4;
+function calculateFreshness(lastCrawlAt) {
+    if (!lastCrawlAt) {
+        return {
+            last_crawl_at: null,
+            is_stale: true,
+            freshness: 'Never crawled',
+            hours_since_crawl: null
+        };
+    }
+    const now = new Date();
+    const diffMs = now.getTime() - lastCrawlAt.getTime();
+    const diffHours = diffMs / (1000 * 60 * 60);
+    const isStale = diffHours > STALE_THRESHOLD_HOURS;
+    let freshnessText;
+    if (diffHours < 1) {
+        const mins = Math.round(diffHours * 60);
+        freshnessText = `Last crawled ${mins} minute${mins !== 1 ? 's' : ''} ago`;
+    }
+    else if (diffHours < 24) {
+        const hrs = Math.round(diffHours);
+        freshnessText = `Last crawled ${hrs} hour${hrs !== 1 ? 's' : ''} ago`;
+    }
+    else {
+        const days = Math.round(diffHours / 24);
+        freshnessText = `Last crawled ${days} day${days !== 1 ? 's' : ''} ago`;
+    }
+    if (isStale) {
+        freshnessText += ' (STALE)';
+    }
+    return {
+        last_crawl_at: lastCrawlAt.toISOString(),
+        is_stale: isStale,
+        freshness: freshnessText,
+        hours_since_crawl: Math.round(diffHours * 10) / 10
+    };
+}
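Given the 4-hour threshold, calculateFreshness turns a crawl timestamp into the metadata blob the list endpoint later merges into its response. A quick illustration of the shape it returns (the input timestamp is made up):

// Hypothetical call: a store last crawled six hours ago is past
// STALE_THRESHOLD_HOURS and gets flagged.
const sixHoursAgo = new Date(Date.now() - 6 * 60 * 60 * 1000);
console.log(calculateFreshness(sixHoursAgo));
// => { last_crawl_at: '...', is_stale: true,
//      freshness: 'Last crawled 6 hours ago (STALE)', hours_since_crawl: 6 }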
||||
// Helper function to filter fields from object
|
||||
function selectFields(obj, fields) {
|
||||
if (!fields || fields.length === 0)
|
||||
return obj;
|
||||
const result = {};
|
||||
fields.forEach(field => {
|
||||
if (obj.hasOwnProperty(field)) {
|
||||
result[field] = obj[field];
|
||||
}
|
||||
});
|
||||
return result;
|
||||
}
|
||||
// Get all products with filters, sorting, and field selection
|
||||
router.get('/', async (req, res) => {
try {
const { store_id, category_id, in_stock, search, limit = 50, offset = 0 } = req.query;
const { store_id, category_id, in_stock, search, brand, min_price, max_price, min_thc, max_thc, strain_type, sort_by = 'last_seen_at', sort_order = 'desc', limit = 50, offset = 0, fields } = req.query;
// Validate sort field to prevent SQL injection
const allowedSortFields = [
'id', 'name', 'brand', 'price', 'thc_percentage',
'cbd_percentage', 'last_seen_at', 'created_at'
];
const sortField = allowedSortFields.includes(sort_by)
? sort_by
: 'last_seen_at';
const sortDirection = sort_order.toLowerCase() === 'asc' ? 'ASC' : 'DESC';
let query = `
SELECT p.*, s.name as store_name, c.name as category_name
FROM products p
@@ -19,35 +78,81 @@ router.get('/', async (req, res) => {
`;
const params = [];
let paramCount = 1;
// Store filter
if (store_id) {
query += ` AND p.store_id = $${paramCount}`;
params.push(store_id);
paramCount++;
}
// Category filter
if (category_id) {
query += ` AND p.category_id = $${paramCount}`;
params.push(category_id);
paramCount++;
}
// Stock filter
if (in_stock !== undefined) {
query += ` AND p.in_stock = $${paramCount}`;
params.push(in_stock === 'true');
paramCount++;
}
// Search filter
if (search) {
query += ` AND (p.name ILIKE $${paramCount} OR p.brand ILIKE $${paramCount})`;
query += ` AND (p.name ILIKE $${paramCount} OR p.brand ILIKE $${paramCount} OR p.description ILIKE $${paramCount})`;
params.push(`%${search}%`);
paramCount++;
}
query += ` ORDER BY p.last_seen_at DESC LIMIT $${paramCount} OFFSET $${paramCount + 1}`;
// Brand filter
if (brand) {
query += ` AND p.brand ILIKE $${paramCount}`;
params.push(`%${brand}%`);
paramCount++;
}
// Price range filter
if (min_price) {
query += ` AND p.price >= $${paramCount}`;
params.push(parseFloat(min_price));
paramCount++;
}
if (max_price) {
query += ` AND p.price <= $${paramCount}`;
params.push(parseFloat(max_price));
paramCount++;
}
// THC range filter
if (min_thc) {
query += ` AND p.thc_percentage >= $${paramCount}`;
params.push(parseFloat(min_thc));
paramCount++;
}
if (max_thc) {
query += ` AND p.thc_percentage <= $${paramCount}`;
params.push(parseFloat(max_thc));
paramCount++;
}
// Strain type filter
if (strain_type) {
query += ` AND p.strain_type = $${paramCount}`;
params.push(strain_type);
paramCount++;
}
// Sorting
query += ` ORDER BY p.${sortField} ${sortDirection} LIMIT $${paramCount} OFFSET $${paramCount + 1}`;
params.push(limit, offset);
const result = await migrate_1.pool.query(query, params);
// Add image URLs
const products = result.rows.map(p => ({
let products = result.rows.map((p) => ({
...p,
image_url_full: p.local_image_path ? (0, minio_1.getImageUrl)(p.local_image_path) : p.image_url
image_url_full: p.local_image_path ? (0, minio_1.getImageUrl)(p.local_image_path) : p.image_url,
thumbnail_url: p.thumbnail_path ? (0, minio_1.getImageUrl)(p.thumbnail_path) : null,
medium_url: p.medium_path ? (0, minio_1.getImageUrl)(p.medium_path) : null,
}));
// Get total count
// Field selection
if (fields) {
const selectedFields = fields.split(',').map(f => f.trim());
products = products.map((p) => selectFields(p, selectedFields));
}
// Get total count (reuse same filters)
let countQuery = `SELECT COUNT(*) FROM products p WHERE 1=1`;
const countParams = [];
let countParamCount = 1;
@@ -67,16 +172,79 @@ router.get('/', async (req, res) => {
countParamCount++;
}
if (search) {
countQuery += ` AND (p.name ILIKE $${countParamCount} OR p.brand ILIKE $${countParamCount})`;
countQuery += ` AND (p.name ILIKE $${countParamCount} OR p.brand ILIKE $${countParamCount} OR p.description ILIKE $${countParamCount})`;
countParams.push(`%${search}%`);
countParamCount++;
}
if (brand) {
countQuery += ` AND p.brand ILIKE $${countParamCount}`;
countParams.push(`%${brand}%`);
countParamCount++;
}
if (min_price) {
countQuery += ` AND p.price >= $${countParamCount}`;
countParams.push(parseFloat(min_price));
countParamCount++;
}
if (max_price) {
countQuery += ` AND p.price <= $${countParamCount}`;
countParams.push(parseFloat(max_price));
countParamCount++;
}
if (min_thc) {
countQuery += ` AND p.thc_percentage >= $${countParamCount}`;
countParams.push(parseFloat(min_thc));
countParamCount++;
}
if (max_thc) {
countQuery += ` AND p.thc_percentage <= $${countParamCount}`;
countParams.push(parseFloat(max_thc));
countParamCount++;
}
if (strain_type) {
countQuery += ` AND p.strain_type = $${countParamCount}`;
countParams.push(strain_type);
countParamCount++;
}
const countResult = await migrate_1.pool.query(countQuery, countParams);
// Get freshness info if store_id is specified
let freshnessInfo = null;
let storeInfo = null;
if (store_id) {
const storeResult = await migrate_1.pool.query('SELECT id, name, last_scraped_at FROM stores WHERE id = $1', [store_id]);
if (storeResult.rows.length > 0) {
const store = storeResult.rows[0];
storeInfo = { id: store.id, name: store.name };
freshnessInfo = calculateFreshness(store.last_scraped_at);
}
}
res.json({
products,
total: parseInt(countResult.rows[0].count),
limit: parseInt(limit),
offset: parseInt(offset)
offset: parseInt(offset),
// Add freshness metadata when store_id is provided
...(freshnessInfo && {
store: storeInfo,
last_crawl_at: freshnessInfo.last_crawl_at,
is_stale: freshnessInfo.is_stale,
freshness: freshnessInfo.freshness,
hours_since_crawl: freshnessInfo.hours_since_crawl
}),
filters: {
store_id,
category_id,
in_stock,
search,
brand,
min_price,
max_price,
min_thc,
max_thc,
strain_type,
sort_by: sortField,
sort_order: sortDirection
}
});
}
catch (error) {
@@ -84,10 +252,11 @@ router.get('/', async (req, res) => {
res.status(500).json({ error: 'Failed to fetch products' });
}
});
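// Example: consuming the filtered listing above. A minimal sketch, assuming the
// router is mounted at /api/products on localhost:3000 (both hypothetical);
// the query params mirror the handler's destructuring.
async function fetchCheapestHighThc() {
    const qs = new URLSearchParams({
        strain_type: 'indica',
        min_thc: '20',
        sort_by: 'price',
        sort_order: 'asc',
        limit: '25',
        fields: 'id,name,brand,price,thc_percentage'
    });
    const res = await fetch(`http://localhost:3000/api/products?${qs}`, {
        headers: { Authorization: `Bearer ${process.env.API_TOKEN}` }
    });
    const { products, total } = await res.json();
    console.log(`Showing ${products.length} of ${total} products`);
    return products;
}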
// Get single product
// Get single product with optional field selection
router.get('/:id', async (req, res) => {
try {
const { id } = req.params;
const { fields } = req.query;
const result = await migrate_1.pool.query(`
SELECT p.*, s.name as store_name, c.name as category_name
FROM products p
@@ -98,10 +267,17 @@ router.get('/:id', async (req, res) => {
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Product not found' });
}
const product = result.rows[0];
let product = result.rows[0];
product.image_url_full = product.local_image_path
? (0, minio_1.getImageUrl)(product.local_image_path)
: product.image_url;
product.thumbnail_url = product.thumbnail_path ? (0, minio_1.getImageUrl)(product.thumbnail_path) : null;
product.medium_url = product.medium_path ? (0, minio_1.getImageUrl)(product.medium_path) : null;
// Field selection
if (fields) {
const selectedFields = fields.split(',').map(f => f.trim());
product = selectFields(product, selectedFields);
}
res.json({ product });
}
catch (error) {
@@ -109,4 +285,57 @@ router.get('/:id', async (req, res) => {
res.status(500).json({ error: 'Failed to fetch product' });
}
});
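// selectFields is used by both handlers above but defined outside this hunk;
// a minimal sketch of what such a helper could look like (the committed
// implementation may differ):
function selectFields(obj, fields) {
    const out = {};
    for (const f of fields) {
        // Copy only the explicitly requested keys that actually exist
        if (Object.prototype.hasOwnProperty.call(obj, f)) {
            out[f] = obj[f];
        }
    }
    return out;
}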
// Get available brands (for filter dropdowns)
router.get('/meta/brands', async (req, res) => {
try {
const { store_id } = req.query;
let query = `
SELECT DISTINCT brand
FROM products
WHERE brand IS NOT NULL AND brand != ''
`;
const params = [];
if (store_id) {
query += ' AND store_id = $1';
params.push(store_id);
}
query += ' ORDER BY brand';
const result = await migrate_1.pool.query(query, params);
const brands = result.rows.map((row) => row.brand);
res.json({ brands });
}
catch (error) {
console.error('Error fetching brands:', error);
res.status(500).json({ error: 'Failed to fetch brands' });
}
});
// Get price range (for filter sliders)
router.get('/meta/price-range', async (req, res) => {
try {
const { store_id } = req.query;
let query = `
SELECT
MIN(price) as min_price,
MAX(price) as max_price,
AVG(price) as avg_price
FROM products
WHERE price IS NOT NULL
`;
const params = [];
if (store_id) {
query += ' AND store_id = $1';
params.push(store_id);
}
const result = await migrate_1.pool.query(query, params);
res.json({
min_price: parseFloat(result.rows[0].min_price) || 0,
max_price: parseFloat(result.rows[0].max_price) || 0,
avg_price: parseFloat(result.rows[0].avg_price) || 0
});
}
catch (error) {
console.error('Error fetching price range:', error);
res.status(500).json({ error: 'Failed to fetch price range' });
}
});
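// Example: seeding filter UI controls from the two meta endpoints above. A
// sketch, under the same hypothetical mount point and auth as before.
async function loadFilterOptions(storeId) {
    const headers = { Authorization: `Bearer ${process.env.API_TOKEN}` };
    const [brandsRes, rangeRes] = await Promise.all([
        fetch(`http://localhost:3000/api/products/meta/brands?store_id=${storeId}`, { headers }),
        fetch(`http://localhost:3000/api/products/meta/price-range?store_id=${storeId}`, { headers })
    ]);
    const { brands } = await brandsRes.json();
    const { min_price, max_price } = await rangeRes.json();
    return { brands, priceBounds: [min_price, max_price] };
}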
exports.default = router;
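// calculateFreshness is referenced in the listing handler above but defined
// outside this hunk; a plausible sketch of its shape, inferred from the
// response fields it feeds (last_crawl_at, is_stale, freshness,
// hours_since_crawl). The thresholds here are assumptions, not committed values.
function calculateFreshness(lastScrapedAt) {
    if (!lastScrapedAt) {
        return { last_crawl_at: null, is_stale: true, freshness: 'never_crawled', hours_since_crawl: null };
    }
    const hours = (Date.now() - new Date(lastScrapedAt).getTime()) / 36e5;
    return {
        last_crawl_at: lastScrapedAt,
        is_stale: hours > 24,
        freshness: hours < 6 ? 'fresh' : hours < 24 ? 'recent' : 'stale',
        hours_since_crawl: Math.round(hours * 10) / 10
    };
}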
108
backend/dist/routes/proxies.js
vendored
108
backend/dist/routes/proxies.js
vendored
@@ -1,17 +1,52 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
const proxyTestQueue_1 = require("../services/proxyTestQueue");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// Get all proxies
router.get('/', async (req, res) => {
try {
const result = await migrate_1.pool.query(`
SELECT id, host, port, protocol, active, is_anonymous,
last_tested_at, test_result, response_time_ms, created_at
SELECT id, host, port, protocol, active, is_anonymous,
last_tested_at, test_result, response_time_ms, created_at,
city, state, country, country_code, location_updated_at
FROM proxies
ORDER BY created_at DESC
`);
@@ -22,6 +57,32 @@ router.get('/', async (req, res) => {
res.status(500).json({ error: 'Failed to fetch proxies' });
}
});
// Get active proxy test job (must be before /:id route)
router.get('/test-job', async (req, res) => {
try {
const job = await (0, proxyTestQueue_1.getActiveProxyTestJob)();
res.json({ job });
}
catch (error) {
console.error('Error fetching active job:', error);
res.status(500).json({ error: 'Failed to fetch active job' });
}
});
// Get proxy test job status (must be before /:id route)
router.get('/test-job/:jobId', async (req, res) => {
try {
const { jobId } = req.params;
const job = await (0, proxyTestQueue_1.getProxyTestJob)(parseInt(jobId));
if (!job) {
return res.status(404).json({ error: 'Job not found' });
}
res.json({ job });
}
catch (error) {
console.error('Error fetching job status:', error);
res.status(500).json({ error: 'Failed to fetch job status' });
}
});
// Get single proxy
router.get('/:id', async (req, res) => {
try {
@@ -113,18 +174,30 @@ router.post('/:id/test', (0, middleware_1.requireRole)('superadmin', 'admin'), a
res.status(500).json({ error: 'Failed to test proxy' });
}
});
// Test all proxies
// Start proxy test job
router.post('/test-all', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
// Run in background
(0, proxy_1.testAllProxies)().catch(err => {
console.error('Background proxy testing error:', err);
});
res.json({ message: 'Proxy testing started in background' });
const jobId = await (0, proxyTestQueue_1.createProxyTestJob)();
res.json({ jobId, message: 'Proxy test job started' });
}
catch (error) {
console.error('Error starting proxy tests:', error);
res.status(500).json({ error: 'Failed to start proxy tests' });
console.error('Error starting proxy test job:', error);
res.status(500).json({ error: 'Failed to start proxy test job' });
}
});
// Cancel proxy test job
router.post('/test-job/:jobId/cancel', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { jobId } = req.params;
const cancelled = await (0, proxyTestQueue_1.cancelProxyTestJob)(parseInt(jobId));
if (!cancelled) {
return res.status(404).json({ error: 'Job not found or already completed' });
}
res.json({ message: 'Job cancelled successfully' });
}
catch (error) {
console.error('Error cancelling job:', error);
res.status(500).json({ error: 'Failed to cancel job' });
}
});
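// Example: the full test-job lifecycle from a client's point of view --
// start, then poll until the job settles. A sketch; the /api/proxies mount
// point and the job status names ('pending', 'running') are assumptions.
async function runProxyTestJob(baseUrl, token) {
    const headers = { Authorization: `Bearer ${token}` };
    // Kick off the background job
    const startRes = await fetch(`${baseUrl}/api/proxies/test-all`, { method: 'POST', headers });
    const { jobId } = await startRes.json();
    // Poll the status endpoint until the job leaves the queue
    let job;
    do {
        await new Promise(r => setTimeout(r, 2000));
        ({ job } = await (await fetch(`${baseUrl}/api/proxies/test-job/${jobId}`, { headers })).json());
    } while (job && (job.status === 'pending' || job.status === 'running'));
    // POST `${baseUrl}/api/proxies/test-job/${jobId}/cancel` would abort it instead
    return job;
}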
// Update proxy
@@ -171,4 +244,19 @@ router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, r
res.status(500).json({ error: 'Failed to delete proxy' });
}
});
// Update all proxy locations
router.post('/update-locations', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { updateAllProxyLocations } = await Promise.resolve().then(() => __importStar(require('../services/geolocation')));
// Run in background
updateAllProxyLocations().catch(err => {
console.error('❌ Location update failed:', err);
});
res.json({ message: 'Location update job started' });
}
catch (error) {
console.error('Error starting location update:', error);
res.status(500).json({ error: 'Failed to start location update' });
}
});
exports.default = router;
668
backend/dist/routes/public-api.js
vendored
Normal file
668
backend/dist/routes/public-api.js
vendored
Normal file
@@ -0,0 +1,668 @@
"use strict";
/**
* Public API Routes for External Consumers (WordPress, etc.)
*
* These routes use the dutchie_az data pipeline and are protected by API key auth.
* Designed for Deeply Rooted and other WordPress sites consuming menu data.
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const migrate_1 = require("../db/migrate");
const connection_1 = require("../dutchie-az/db/connection");
const ipaddr_js_1 = __importDefault(require("ipaddr.js"));
const router = (0, express_1.Router)();
// ============================================================
// MIDDLEWARE
// ============================================================
/**
* Validates if an IP address matches any of the allowed IP patterns
*/
function isIpAllowed(clientIp, allowedIps) {
try {
const clientAddr = ipaddr_js_1.default.process(clientIp);
for (const allowedIp of allowedIps) {
const trimmed = allowedIp.trim();
if (!trimmed)
continue;
if (trimmed.includes('/')) {
try {
const range = ipaddr_js_1.default.parseCIDR(trimmed);
if (clientAddr.match(range)) {
return true;
}
}
catch (e) {
console.warn(`Invalid CIDR notation: ${trimmed}`);
continue;
}
}
else {
try {
const allowedAddr = ipaddr_js_1.default.process(trimmed);
if (clientAddr.toString() === allowedAddr.toString()) {
return true;
}
}
catch (e) {
console.warn(`Invalid IP address: ${trimmed}`);
continue;
}
}
}
return false;
}
catch (error) {
console.error('Error processing client IP:', error);
return false;
}
}
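// Quick sanity check of the CIDR branch above, using the same ipaddr.js calls
// (process, parseCIDR, match) the middleware relies on; addresses are from the
// documentation ranges.
const ipaddr = require('ipaddr.js');
const addr = ipaddr.process('203.0.113.7');
console.log(addr.match(ipaddr.parseCIDR('203.0.113.0/24'))); // true
console.log(addr.match(ipaddr.parseCIDR('198.51.100.0/24'))); // false
// isIpAllowed('203.0.113.7', ['198.51.100.0/24', '203.0.113.7']) also returns
// true, via the exact-match branch rather than the CIDR branch.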
/**
* Validates if a domain matches any of the allowed domain patterns
*/
function isDomainAllowed(origin, allowedDomains) {
try {
const url = new URL(origin);
const domain = url.hostname;
for (const allowedDomain of allowedDomains) {
const trimmed = allowedDomain.trim();
if (!trimmed)
continue;
if (trimmed.startsWith('*.')) {
const baseDomain = trimmed.substring(2);
if (domain === baseDomain || domain.endsWith('.' + baseDomain)) {
return true;
}
}
else {
if (domain === trimmed) {
return true;
}
}
}
return false;
}
catch (error) {
console.error('Error processing domain:', error);
return false;
}
}
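// The wildcard branch above admits both the bare apex and any subdomain, but
// not lookalike registrations; for instance:
console.log(isDomainAllowed('https://shop.example.com/page', ['*.example.com'])); // true
console.log(isDomainAllowed('https://example.com', ['*.example.com']));           // true (apex match)
console.log(isDomainAllowed('https://evil-example.com', ['*.example.com']));      // false (no '.' boundary)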
/**
* Middleware to validate API key and resolve dispensary -> dutchie_az store mapping
*/
async function validatePublicApiKey(req, res, next) {
const apiKey = req.headers['x-api-key'];
if (!apiKey) {
return res.status(401).json({
error: 'Missing API key',
message: 'Provide your API key in the X-API-Key header'
});
}
try {
// Query WordPress permissions table with store info
const result = await migrate_1.pool.query(`
SELECT
p.id,
p.user_name,
p.api_key,
p.allowed_ips,
p.allowed_domains,
p.is_active,
p.store_id,
p.store_name
FROM wp_dutchie_api_permissions p
WHERE p.api_key = $1 AND p.is_active = 1
`, [apiKey]);
if (result.rows.length === 0) {
return res.status(401).json({
error: 'Invalid API key'
});
}
const permission = result.rows[0];
// Validate IP if configured
const clientIp = req.headers['x-forwarded-for']?.split(',')[0].trim() ||
req.headers['x-real-ip'] ||
req.ip ||
req.connection.remoteAddress ||
'';
if (permission.allowed_ips) {
const allowedIps = permission.allowed_ips.split('\n').filter((ip) => ip.trim());
if (allowedIps.length > 0 && !isIpAllowed(clientIp, allowedIps)) {
return res.status(403).json({
error: 'IP address not allowed',
client_ip: clientIp
});
}
}
// Validate domain if configured
const origin = req.get('origin') || req.get('referer') || '';
if (permission.allowed_domains && origin) {
const allowedDomains = permission.allowed_domains.split('\n').filter((d) => d.trim());
if (allowedDomains.length > 0 && !isDomainAllowed(origin, allowedDomains)) {
return res.status(403).json({
error: 'Domain not allowed',
origin: origin
});
}
}
// Resolve the dutchie_az store for this store
// Match by store name (from main DB) to dutchie_az.dispensaries.name
const storeResult = await (0, connection_1.query)(`
SELECT id FROM dispensaries
WHERE LOWER(TRIM(name)) = LOWER(TRIM($1))
OR LOWER(TRIM(name)) LIKE LOWER(TRIM($1)) || '%'
OR LOWER(TRIM($1)) LIKE LOWER(TRIM(name)) || '%'
ORDER BY
CASE WHEN LOWER(TRIM(name)) = LOWER(TRIM($1)) THEN 0 ELSE 1 END,
id
LIMIT 1
`, [permission.store_name]);
if (storeResult.rows.length > 0) {
permission.dutchie_az_store_id = storeResult.rows[0].id;
}
// Update last_used_at timestamp (async, don't wait)
migrate_1.pool.query(`
UPDATE wp_dutchie_api_permissions
SET last_used_at = CURRENT_TIMESTAMP
WHERE id = $1
`, [permission.id]).catch((err) => {
console.error('Error updating last_used_at:', err);
});
req.apiPermission = permission;
next();
}
catch (error) {
console.error('Public API validation error:', error);
return res.status(500).json({
error: 'Internal server error during API validation'
});
}
}
// Apply middleware to all routes
router.use(validatePublicApiKey);
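// Example: what a consuming site (say, a WordPress plugin) sends through this
// middleware. A sketch; the host and key are placeholders, and the /api/v1
// prefix follows the route doc comments below.
async function getPublicMenu() {
    const res = await fetch('https://api.example.com/api/v1/products?in_stock_only=true', {
        headers: { 'X-API-Key': process.env.DUTCHIE_API_KEY }
    });
    if (res.status === 403) {
        // The IP or domain allowlists enforced above rejected the caller
        throw new Error((await res.json()).error);
    }
    return res.json();
}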
// ============================================================
// PRODUCT ENDPOINTS
// ============================================================
/**
* GET /api/v1/products
* Get products for the authenticated dispensary
*
* Query params:
* - category: Filter by product type (e.g., 'flower', 'edible')
* - brand: Filter by brand name
* - in_stock_only: Only return in-stock products (default: false)
* - limit: Max products to return (default: 100, max: 500)
* - offset: Pagination offset (default: 0)
*/
router.get('/products', async (req, res) => {
try {
const permission = req.apiPermission;
// Check if we have a dutchie_az store mapping
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available. The dispensary may not be set up in the new data pipeline.`,
dispensary_name: permission.store_name
});
}
const { category, brand, in_stock_only = 'false', limit = '100', offset = '0' } = req.query;
// Build query
let whereClause = 'WHERE p.dispensary_id = $1';
const params = [permission.dutchie_az_store_id];
let paramIndex = 2;
// Filter by stock status if requested
if (in_stock_only === 'true' || in_stock_only === '1') {
whereClause += ` AND p.stock_status = 'in_stock'`;
}
// Filter by category (maps to 'type' in dutchie_az)
if (category) {
whereClause += ` AND LOWER(p.type) = LOWER($${paramIndex})`;
params.push(category);
paramIndex++;
}
// Filter by brand
if (brand) {
whereClause += ` AND LOWER(p.brand_name) LIKE LOWER($${paramIndex})`;
params.push(`%${brand}%`);
paramIndex++;
}
// Enforce limits
const limitNum = Math.min(parseInt(limit, 10) || 100, 500);
const offsetNum = parseInt(offset, 10) || 0;
params.push(limitNum, offsetNum);
// Query products with latest snapshot data
const { rows: products } = await (0, connection_1.query)(`
SELECT
p.id,
p.external_product_id as dutchie_id,
p.name,
p.brand_name as brand,
p.type as category,
p.subcategory,
p.strain_type,
p.stock_status,
p.thc,
p.cbd,
p.primary_image_url as image_url,
p.images,
p.effects,
p.created_at,
p.updated_at,
-- Latest snapshot data for pricing
s.rec_min_price_cents,
s.rec_max_price_cents,
s.rec_min_special_price_cents,
s.med_min_price_cents,
s.med_max_price_cents,
s.med_min_special_price_cents,
s.total_quantity_available,
s.options,
s.special,
s.crawled_at as snapshot_at
FROM dutchie_products p
LEFT JOIN LATERAL (
SELECT * FROM dutchie_product_snapshots
WHERE dutchie_product_id = p.id
ORDER BY crawled_at DESC
LIMIT 1
) s ON true
${whereClause}
ORDER BY p.name ASC
LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
`, params);
// Get total count for pagination
const { rows: countRows } = await (0, connection_1.query)(`
SELECT COUNT(*) as total FROM dutchie_products p ${whereClause}
`, params.slice(0, -2));
// Transform products to backward-compatible format
const transformedProducts = products.map((p) => {
// Extract first image URL from images array
let imageUrl = p.image_url;
if (!imageUrl && p.images && Array.isArray(p.images) && p.images.length > 0) {
const firstImage = p.images[0];
imageUrl = typeof firstImage === 'string' ? firstImage : firstImage?.url;
}
// Convert prices from cents to dollars
const regularPrice = p.rec_min_price_cents
? (p.rec_min_price_cents / 100).toFixed(2)
: null;
const salePrice = p.rec_min_special_price_cents
? (p.rec_min_special_price_cents / 100).toFixed(2)
: null;
return {
id: p.id,
dutchie_id: p.dutchie_id,
name: p.name,
brand: p.brand || null,
category: p.category || null,
subcategory: p.subcategory || null,
strain_type: p.strain_type || null,
description: null, // Not stored in dutchie_products, would need snapshot
regular_price: regularPrice,
sale_price: salePrice,
thc_percentage: p.thc ? parseFloat(p.thc) : null,
cbd_percentage: p.cbd ? parseFloat(p.cbd) : null,
image_url: imageUrl || null,
in_stock: p.stock_status === 'in_stock',
on_special: p.special || false,
effects: p.effects || [],
options: p.options || [],
quantity_available: p.total_quantity_available || 0,
created_at: p.created_at,
updated_at: p.updated_at,
snapshot_at: p.snapshot_at
};
});
res.json({
success: true,
dispensary: permission.store_name,
products: transformedProducts,
pagination: {
total: parseInt(countRows[0]?.total || '0', 10),
limit: limitNum,
offset: offsetNum,
has_more: offsetNum + products.length < parseInt(countRows[0]?.total || '0', 10)
}
});
}
catch (error) {
console.error('Public API products error:', error);
res.status(500).json({
error: 'Failed to fetch products',
message: error.message
});
}
});
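// The LEFT JOIN LATERAL above is the "latest row per parent" idiom: for each
// product the inner query runs once and keeps only the newest snapshot.
// Stripped to its essentials (table and column names as used in this file):
//
//   SELECT p.id, s.crawled_at
//   FROM dutchie_products p
//   LEFT JOIN LATERAL (
//       SELECT * FROM dutchie_product_snapshots
//       WHERE dutchie_product_id = p.id
//       ORDER BY crawled_at DESC
//       LIMIT 1
//   ) s ON true;
//
// LEFT (rather than INNER) keeps products that have no snapshots yet, with
// every s.* column NULL -- which is why the price fields above can be null.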
/**
* GET /api/v1/products/:id
* Get a single product by ID
*/
router.get('/products/:id', async (req, res) => {
try {
const permission = req.apiPermission;
const { id } = req.params;
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available.`
});
}
// Get product with latest snapshot
const { rows: products } = await (0, connection_1.query)(`
SELECT
p.*,
s.rec_min_price_cents,
s.rec_max_price_cents,
s.rec_min_special_price_cents,
s.med_min_price_cents,
s.med_max_price_cents,
s.total_quantity_available,
s.options,
s.special,
s.crawled_at as snapshot_at
FROM dutchie_products p
LEFT JOIN LATERAL (
SELECT * FROM dutchie_product_snapshots
WHERE dutchie_product_id = p.id
ORDER BY crawled_at DESC
LIMIT 1
) s ON true
WHERE p.id = $1 AND p.dispensary_id = $2
`, [id, permission.dutchie_az_store_id]);
if (products.length === 0) {
return res.status(404).json({
error: 'Product not found'
});
}
const p = products[0];
// Extract first image URL
let imageUrl = p.primary_image_url;
if (!imageUrl && p.images && Array.isArray(p.images) && p.images.length > 0) {
const firstImage = p.images[0];
imageUrl = typeof firstImage === 'string' ? firstImage : firstImage?.url;
}
res.json({
success: true,
product: {
id: p.id,
dutchie_id: p.external_product_id,
name: p.name,
brand: p.brand_name || null,
category: p.type || null,
subcategory: p.subcategory || null,
strain_type: p.strain_type || null,
regular_price: p.rec_min_price_cents ? (p.rec_min_price_cents / 100).toFixed(2) : null,
sale_price: p.rec_min_special_price_cents ? (p.rec_min_special_price_cents / 100).toFixed(2) : null,
thc_percentage: p.thc ? parseFloat(p.thc) : null,
cbd_percentage: p.cbd ? parseFloat(p.cbd) : null,
image_url: imageUrl || null,
images: p.images || [],
in_stock: p.stock_status === 'in_stock',
on_special: p.special || false,
effects: p.effects || [],
options: p.options || [],
quantity_available: p.total_quantity_available || 0,
created_at: p.created_at,
updated_at: p.updated_at,
snapshot_at: p.snapshot_at
}
});
}
catch (error) {
console.error('Public API product detail error:', error);
res.status(500).json({
error: 'Failed to fetch product',
message: error.message
});
}
});
/**
* GET /api/v1/categories
* Get all categories for the authenticated dispensary
*/
router.get('/categories', async (req, res) => {
try {
const permission = req.apiPermission;
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available.`
});
}
const { rows: categories } = await (0, connection_1.query)(`
SELECT
type as category,
subcategory,
COUNT(*) as product_count,
COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count
FROM dutchie_products
WHERE dispensary_id = $1 AND type IS NOT NULL
GROUP BY type, subcategory
ORDER BY type, subcategory
`, [permission.dutchie_az_store_id]);
res.json({
success: true,
dispensary: permission.store_name,
categories
});
}
catch (error) {
console.error('Public API categories error:', error);
res.status(500).json({
error: 'Failed to fetch categories',
message: error.message
});
}
});
/**
* GET /api/v1/brands
* Get all brands for the authenticated dispensary
*/
router.get('/brands', async (req, res) => {
try {
const permission = req.apiPermission;
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available.`
});
}
const { rows: brands } = await (0, connection_1.query)(`
SELECT
brand_name as brand,
COUNT(*) as product_count,
COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count
FROM dutchie_products
WHERE dispensary_id = $1 AND brand_name IS NOT NULL
GROUP BY brand_name
ORDER BY product_count DESC
`, [permission.dutchie_az_store_id]);
res.json({
success: true,
dispensary: permission.store_name,
brands
});
}
catch (error) {
console.error('Public API brands error:', error);
res.status(500).json({
error: 'Failed to fetch brands',
message: error.message
});
}
});
/**
* GET /api/v1/specials
* Get products on special/sale for the authenticated dispensary
*/
router.get('/specials', async (req, res) => {
try {
const permission = req.apiPermission;
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available.`
});
}
const { limit = '100', offset = '0' } = req.query;
const limitNum = Math.min(parseInt(limit, 10) || 100, 500);
const offsetNum = parseInt(offset, 10) || 0;
// Get products with special pricing from latest snapshot
const { rows: products } = await (0, connection_1.query)(`
SELECT
p.id,
p.external_product_id as dutchie_id,
p.name,
p.brand_name as brand,
p.type as category,
p.subcategory,
p.strain_type,
p.stock_status,
p.primary_image_url as image_url,
s.rec_min_price_cents,
s.rec_min_special_price_cents,
s.special,
s.options,
p.updated_at,
s.crawled_at as snapshot_at
FROM dutchie_products p
INNER JOIN LATERAL (
SELECT * FROM dutchie_product_snapshots
WHERE dutchie_product_id = p.id
ORDER BY crawled_at DESC
LIMIT 1
) s ON true
WHERE p.dispensary_id = $1
AND s.special = true
AND p.stock_status = 'in_stock'
ORDER BY p.name ASC
LIMIT $2 OFFSET $3
`, [permission.dutchie_az_store_id, limitNum, offsetNum]);
// Get total count
const { rows: countRows } = await (0, connection_1.query)(`
SELECT COUNT(*) as total
FROM dutchie_products p
INNER JOIN LATERAL (
SELECT special FROM dutchie_product_snapshots
WHERE dutchie_product_id = p.id
ORDER BY crawled_at DESC
LIMIT 1
) s ON true
WHERE p.dispensary_id = $1
AND s.special = true
AND p.stock_status = 'in_stock'
`, [permission.dutchie_az_store_id]);
const transformedProducts = products.map((p) => ({
id: p.id,
dutchie_id: p.dutchie_id,
name: p.name,
brand: p.brand || null,
category: p.category || null,
strain_type: p.strain_type || null,
regular_price: p.rec_min_price_cents ? (p.rec_min_price_cents / 100).toFixed(2) : null,
sale_price: p.rec_min_special_price_cents ? (p.rec_min_special_price_cents / 100).toFixed(2) : null,
image_url: p.image_url || null,
in_stock: p.stock_status === 'in_stock',
options: p.options || [],
updated_at: p.updated_at,
snapshot_at: p.snapshot_at
}));
res.json({
success: true,
dispensary: permission.store_name,
specials: transformedProducts,
pagination: {
total: parseInt(countRows[0]?.total || '0', 10),
limit: limitNum,
offset: offsetNum,
has_more: offsetNum + products.length < parseInt(countRows[0]?.total || '0', 10)
}
});
}
catch (error) {
console.error('Public API specials error:', error);
res.status(500).json({
error: 'Failed to fetch specials',
message: error.message
});
}
});
/**
* GET /api/v1/menu
* Get complete menu summary for the authenticated dispensary
*/
router.get('/menu', async (req, res) => {
try {
const permission = req.apiPermission;
if (!permission.dutchie_az_store_id) {
return res.status(503).json({
error: 'No menu data available',
message: `Menu data for ${permission.store_name} is not yet available.`
});
}
// Get counts by category
const { rows: categoryCounts } = await (0, connection_1.query)(`
SELECT
type as category,
COUNT(*) as total,
COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock
FROM dutchie_products
WHERE dispensary_id = $1 AND type IS NOT NULL
GROUP BY type
ORDER BY total DESC
`, [permission.dutchie_az_store_id]);
// Get overall stats
const { rows: stats } = await (0, connection_1.query)(`
SELECT
COUNT(*) as total_products,
COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count,
COUNT(DISTINCT brand_name) as brand_count,
COUNT(DISTINCT type) as category_count,
MAX(updated_at) as last_updated
FROM dutchie_products
WHERE dispensary_id = $1
`, [permission.dutchie_az_store_id]);
// Get specials count
const { rows: specialsCount } = await (0, connection_1.query)(`
SELECT COUNT(*) as count
FROM dutchie_products p
INNER JOIN LATERAL (
SELECT special FROM dutchie_product_snapshots
WHERE dutchie_product_id = p.id
ORDER BY crawled_at DESC
LIMIT 1
) s ON true
WHERE p.dispensary_id = $1
AND s.special = true
AND p.stock_status = 'in_stock'
`, [permission.dutchie_az_store_id]);
const summary = stats[0] || {};
res.json({
success: true,
dispensary: permission.store_name,
menu: {
total_products: parseInt(summary.total_products || '0', 10),
in_stock_count: parseInt(summary.in_stock_count || '0', 10),
brand_count: parseInt(summary.brand_count || '0', 10),
category_count: parseInt(summary.category_count || '0', 10),
specials_count: parseInt(specialsCount[0]?.count || '0', 10),
last_updated: summary.last_updated,
categories: categoryCounts.map((c) => ({
name: c.category,
total: parseInt(c.total, 10),
in_stock: parseInt(c.in_stock, 10)
}))
}
});
}
catch (error) {
console.error('Public API menu error:', error);
res.status(500).json({
error: 'Failed to fetch menu summary',
message: error.message
});
}
});
exports.default = router;
887
backend/dist/routes/schedule.js
vendored
Normal file
887
backend/dist/routes/schedule.js
vendored
Normal file
@@ -0,0 +1,887 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const middleware_1 = require("../auth/middleware");
const crawl_scheduler_1 = require("../services/crawl-scheduler");
const store_crawl_orchestrator_1 = require("../services/store-crawl-orchestrator");
const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
const migrate_1 = require("../db/migrate");
const graphql_client_1 = require("../dutchie-az/services/graphql-client");
const router = (0, express_1.Router)();
router.use(middleware_1.authMiddleware);
// ============================================
// Global Schedule Endpoints
// ============================================
/**
* GET /api/schedule/global
* Get global schedule settings
*/
router.get('/global', async (req, res) => {
try {
const schedules = await (0, crawl_scheduler_1.getGlobalSchedule)();
res.json({ schedules });
}
catch (error) {
console.error('Error fetching global schedule:', error);
res.status(500).json({ error: 'Failed to fetch global schedule' });
}
});
/**
* PUT /api/schedule/global/:type
* Update global schedule setting
*/
router.put('/global/:type', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { type } = req.params;
const { enabled, interval_hours, run_time } = req.body;
if (type !== 'global_interval' && type !== 'daily_special') {
return res.status(400).json({ error: 'Invalid schedule type' });
}
const schedule = await (0, crawl_scheduler_1.updateGlobalSchedule)(type, {
enabled,
interval_hours,
run_time
});
// Restart scheduler to apply changes
await (0, crawl_scheduler_1.restartCrawlScheduler)();
res.json({ schedule, message: 'Schedule updated and scheduler restarted' });
}
catch (error) {
console.error('Error updating global schedule:', error);
res.status(500).json({ error: 'Failed to update global schedule' });
}
});
// ============================================
// Store Schedule Endpoints
// ============================================
/**
* GET /api/schedule/stores
* Get all store schedule statuses
*/
router.get('/stores', async (req, res) => {
try {
const stores = await (0, crawl_scheduler_1.getStoreScheduleStatuses)();
res.json({ stores });
}
catch (error) {
console.error('Error fetching store schedules:', error);
res.status(500).json({ error: 'Failed to fetch store schedules' });
}
});
/**
* GET /api/schedule/stores/:storeId
* Get schedule for a specific store
*/
router.get('/stores/:storeId', async (req, res) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const schedule = await (0, crawl_scheduler_1.getStoreSchedule)(storeId);
res.json({ schedule });
}
catch (error) {
console.error('Error fetching store schedule:', error);
res.status(500).json({ error: 'Failed to fetch store schedule' });
}
});
/**
* PUT /api/schedule/stores/:storeId
* Update schedule for a specific store
*/
router.put('/stores/:storeId', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const { enabled, interval_hours, daily_special_enabled, daily_special_time, priority } = req.body;
const schedule = await (0, crawl_scheduler_1.updateStoreSchedule)(storeId, {
enabled,
interval_hours,
daily_special_enabled,
daily_special_time,
priority
});
res.json({ schedule });
}
catch (error) {
console.error('Error updating store schedule:', error);
res.status(500).json({ error: 'Failed to update store schedule' });
}
});
// ============================================
// Job Queue Endpoints
// ============================================
/**
* GET /api/schedule/jobs
* Get recent jobs
*/
router.get('/jobs', async (req, res) => {
try {
const limit = parseInt(req.query.limit) || 50;
const jobs = await (0, crawl_scheduler_1.getAllRecentJobs)(Math.min(limit, 200));
res.json({ jobs });
}
catch (error) {
console.error('Error fetching jobs:', error);
res.status(500).json({ error: 'Failed to fetch jobs' });
}
});
/**
* GET /api/schedule/jobs/store/:storeId
* Get recent jobs for a specific store
*/
router.get('/jobs/store/:storeId', async (req, res) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const limit = parseInt(req.query.limit) || 10;
const jobs = await (0, crawl_scheduler_1.getRecentJobs)(storeId, Math.min(limit, 100));
res.json({ jobs });
}
catch (error) {
console.error('Error fetching store jobs:', error);
res.status(500).json({ error: 'Failed to fetch store jobs' });
}
});
/**
* POST /api/schedule/jobs/:jobId/cancel
* Cancel a pending job
*/
router.post('/jobs/:jobId/cancel', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const jobId = parseInt(req.params.jobId);
if (isNaN(jobId)) {
return res.status(400).json({ error: 'Invalid job ID' });
}
const cancelled = await (0, crawl_scheduler_1.cancelJob)(jobId);
if (cancelled) {
res.json({ success: true, message: 'Job cancelled' });
}
else {
res.status(400).json({ error: 'Job could not be cancelled (may not be pending)' });
}
}
catch (error) {
console.error('Error cancelling job:', error);
res.status(500).json({ error: 'Failed to cancel job' });
}
});
// ============================================
// Manual Trigger Endpoints
// ============================================
/**
* POST /api/schedule/trigger/store/:storeId
* Manually trigger orchestrated crawl for a specific store
* Uses the intelligent orchestrator which:
* - Checks provider detection status
* - Runs detection if needed
* - Queues appropriate crawl type (production/sandbox)
*/
router.post('/trigger/store/:storeId', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
// Use the orchestrator instead of simple triggerManualCrawl
const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId);
res.json({
result,
message: result.summary,
success: result.status === 'success' || result.status === 'sandbox_only',
});
}
catch (error) {
console.error('Error triggering orchestrated crawl:', error);
res.status(500).json({ error: 'Failed to trigger crawl' });
}
});
/**
* POST /api/schedule/trigger/store/:storeId/legacy
* Legacy: Simple job queue trigger (no orchestration)
*/
router.post('/trigger/store/:storeId/legacy', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const storeId = parseInt(req.params.storeId);
if (isNaN(storeId)) {
return res.status(400).json({ error: 'Invalid store ID' });
}
const job = await (0, crawl_scheduler_1.triggerManualCrawl)(storeId);
res.json({ job, message: 'Crawl job created' });
}
catch (error) {
console.error('Error triggering manual crawl:', error);
res.status(500).json({ error: 'Failed to trigger crawl' });
}
});
/**
* POST /api/schedule/trigger/all
* Manually trigger crawls for all stores
*/
router.post('/trigger/all', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const jobsCreated = await (0, crawl_scheduler_1.triggerAllStoresCrawl)();
res.json({ jobs_created: jobsCreated, message: `Created ${jobsCreated} crawl jobs` });
}
catch (error) {
console.error('Error triggering all crawls:', error);
res.status(500).json({ error: 'Failed to trigger crawls' });
}
});
/**
* POST /api/schedule/restart
* Restart the scheduler
*/
router.post('/restart', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
await (0, crawl_scheduler_1.restartCrawlScheduler)();
res.json({ message: 'Scheduler restarted', mode: (0, crawl_scheduler_1.getSchedulerMode)() });
}
catch (error) {
console.error('Error restarting scheduler:', error);
res.status(500).json({ error: 'Failed to restart scheduler' });
}
});
// ============================================
// Scheduler Mode Endpoints
// ============================================
/**
* GET /api/schedule/mode
* Get current scheduler mode
*/
router.get('/mode', async (req, res) => {
try {
const mode = (0, crawl_scheduler_1.getSchedulerMode)();
res.json({ mode });
}
catch (error) {
console.error('Error getting scheduler mode:', error);
res.status(500).json({ error: 'Failed to get scheduler mode' });
}
});
/**
* PUT /api/schedule/mode
* Set scheduler mode (legacy or orchestrator)
*/
router.put('/mode', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const { mode } = req.body;
if (mode !== 'legacy' && mode !== 'orchestrator') {
return res.status(400).json({ error: 'Invalid mode. Must be "legacy" or "orchestrator"' });
}
(0, crawl_scheduler_1.setSchedulerMode)(mode);
// Restart scheduler with new mode
await (0, crawl_scheduler_1.restartCrawlScheduler)();
res.json({ mode, message: `Scheduler mode set to ${mode} and restarted` });
}
catch (error) {
console.error('Error setting scheduler mode:', error);
res.status(500).json({ error: 'Failed to set scheduler mode' });
}
});
/**
* GET /api/schedule/due
* Get stores that are due for orchestration
*/
router.get('/due', async (req, res) => {
try {
const limit = parseInt(req.query.limit) || 10;
const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(Math.min(limit, 50));
res.json({ stores_due: storeIds, count: storeIds.length });
}
catch (error) {
console.error('Error getting stores due for orchestration:', error);
res.status(500).json({ error: 'Failed to get stores due' });
}
});
// ============================================
// Dispensary Schedule Endpoints (NEW - dispensary-centric)
// ============================================
/**
* GET /api/schedule/dispensaries
* Get all dispensary schedule statuses with optional filters
* Query params:
* - state: filter by state (e.g., 'AZ')
* - search: search by name or slug
*/
router.get('/dispensaries', async (req, res) => {
try {
const { state, search } = req.query;
// Build dynamic query with optional filters
const conditions = [];
const params = [];
let paramIndex = 1;
if (state) {
conditions.push(`d.state = $${paramIndex}`);
params.push(state);
paramIndex++;
}
if (search) {
conditions.push(`(d.name ILIKE $${paramIndex} OR d.slug ILIKE $${paramIndex} OR d.dba_name ILIKE $${paramIndex})`);
params.push(`%${search}%`);
paramIndex++;
}
const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
const query = `
SELECT
d.id AS dispensary_id,
COALESCE(d.dba_name, d.name) AS dispensary_name,
d.slug AS dispensary_slug,
d.city,
d.state,
d.menu_url,
d.menu_type,
d.platform_dispensary_id,
d.scrape_enabled,
d.last_crawl_at,
d.crawl_status,
d.product_crawler_mode,
d.product_provider,
cs.interval_minutes,
cs.is_active,
cs.priority,
cs.last_run_at,
cs.next_run_at,
cs.last_status AS schedule_last_status,
cs.last_error AS schedule_last_error,
cs.consecutive_failures,
j.id AS latest_job_id,
j.status AS latest_job_status,
j.job_type AS latest_job_type,
j.started_at AS latest_job_started,
j.completed_at AS latest_job_completed,
j.products_found AS latest_products_found,
j.products_new AS latest_products_created,
j.products_updated AS latest_products_updated,
j.error_message AS latest_job_error,
CASE
WHEN d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL THEN true
ELSE false
END AS can_crawl,
CASE
WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'menu_type not detected'
WHEN d.menu_type != 'dutchie' THEN 'not dutchie platform'
WHEN d.platform_dispensary_id IS NULL THEN 'platform ID not resolved'
WHEN d.scrape_enabled = false THEN 'scraping disabled'
ELSE 'ready'
END AS schedule_status_reason
FROM public.dispensaries d
LEFT JOIN public.dispensary_crawl_schedule cs ON cs.dispensary_id = d.id
LEFT JOIN LATERAL (
SELECT *
FROM public.dispensary_crawl_jobs dj
WHERE dj.dispensary_id = d.id
ORDER BY dj.created_at DESC
LIMIT 1
) j ON true
${whereClause}
ORDER BY cs.priority DESC NULLS LAST, COALESCE(d.dba_name, d.name)
`;
const result = await migrate_1.pool.query(query, params);
res.json({ dispensaries: result.rows });
}
catch (error) {
console.error('Error fetching dispensary schedules:', error);
res.status(500).json({ error: 'Failed to fetch dispensary schedules' });
}
});
/**
* GET /api/schedule/dispensaries/:id
* Get schedule for a specific dispensary
*/
router.get('/dispensaries/:id', async (req, res) => {
try {
const dispensaryId = parseInt(req.params.id);
if (isNaN(dispensaryId)) {
return res.status(400).json({ error: 'Invalid dispensary ID' });
}
const result = await migrate_1.pool.query(`
SELECT * FROM dispensary_crawl_status
WHERE dispensary_id = $1
`, [dispensaryId]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Dispensary not found' });
}
res.json({ schedule: result.rows[0] });
}
catch (error) {
console.error('Error fetching dispensary schedule:', error);
res.status(500).json({ error: 'Failed to fetch dispensary schedule' });
}
});
/**
* PUT /api/schedule/dispensaries/:id
* Update schedule for a specific dispensary
*/
router.put('/dispensaries/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
try {
const dispensaryId = parseInt(req.params.id);
if (isNaN(dispensaryId)) {
return res.status(400).json({ error: 'Invalid dispensary ID' });
}
const { is_active, interval_minutes, priority } = req.body;
// Upsert schedule
const result = await migrate_1.pool.query(`
INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
VALUES ($1, COALESCE($2, TRUE), COALESCE($3, 240), COALESCE($4, 0))
ON CONFLICT (dispensary_id) DO UPDATE SET
is_active = COALESCE($2, dispensary_crawl_schedule.is_active),
interval_minutes = COALESCE($3, dispensary_crawl_schedule.interval_minutes),
priority = COALESCE($4, dispensary_crawl_schedule.priority),
updated_at = NOW()
RETURNING *
`, [dispensaryId, is_active, interval_minutes, priority]);
res.json({ schedule: result.rows[0] });
}
catch (error) {
console.error('Error updating dispensary schedule:', error);
res.status(500).json({ error: 'Failed to update dispensary schedule' });
}
});
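// The COALESCE pattern in the upsert above makes updates partial: any field
// omitted from the request body arrives as NULL and keeps its current value
// (or takes the TRUE/240/0 defaults on first insert). For instance, this
// request changes only the interval (route prefix per the doc comments):
//
//   PUT /api/schedule/dispensaries/42
//   { "interval_minutes": 120 }
//
// while is_active and priority are left untouched.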
|
||||
/**
|
||||
* GET /api/schedule/dispensary-jobs
|
||||
* Get recent dispensary crawl jobs
|
||||
*/
|
||||
router.get('/dispensary-jobs', async (req, res) => {
|
||||
try {
|
||||
const limit = parseInt(req.query.limit) || 50;
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT dcj.*, d.name as dispensary_name
|
||||
FROM dispensary_crawl_jobs dcj
|
||||
JOIN dispensaries d ON d.id = dcj.dispensary_id
|
||||
ORDER BY dcj.created_at DESC
|
||||
LIMIT $1
|
||||
`, [Math.min(limit, 200)]);
|
||||
res.json({ jobs: result.rows });
|
||||
}
|
||||
catch (error) {
|
||||
console.error('Error fetching dispensary jobs:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch dispensary jobs' });
|
||||
}
|
||||
});
|
||||
/**
|
||||
* GET /api/schedule/dispensary-jobs/:dispensaryId
|
||||
* Get recent jobs for a specific dispensary
|
||||
*/
|
||||
router.get('/dispensary-jobs/:dispensaryId', async (req, res) => {
|
||||
try {
|
||||
const dispensaryId = parseInt(req.params.dispensaryId);
|
||||
if (isNaN(dispensaryId)) {
|
||||
return res.status(400).json({ error: 'Invalid dispensary ID' });
|
||||
}
|
||||
const limit = parseInt(req.query.limit) || 10;
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT dcj.*, d.name as dispensary_name
|
||||
FROM dispensary_crawl_jobs dcj
|
||||
JOIN dispensaries d ON d.id = dcj.dispensary_id
|
||||
WHERE dcj.dispensary_id = $1
|
||||
ORDER BY dcj.created_at DESC
|
||||
LIMIT $2
|
||||
`, [dispensaryId, Math.min(limit, 100)]);
|
||||
res.json({ jobs: result.rows });
|
||||
}
|
||||
catch (error) {
|
||||
console.error('Error fetching dispensary jobs:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch dispensary jobs' });
|
||||
}
|
||||
});
|
||||
/**
|
||||
* POST /api/schedule/trigger/dispensary/:id
|
||||
* Trigger orchestrator for a specific dispensary (Run Now button)
|
||||
*/
|
||||
router.post('/trigger/dispensary/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
|
||||
try {
|
||||
const dispensaryId = parseInt(req.params.id);
|
||||
if (isNaN(dispensaryId)) {
|
||||
return res.status(400).json({ error: 'Invalid dispensary ID' });
|
||||
}
|
||||
// Run the dispensary orchestrator
|
||||
const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(dispensaryId);
|
||||
res.json({
|
||||
result,
|
||||
message: result.summary,
|
||||
success: result.status === 'success' || result.status === 'sandbox_only' || result.status === 'detection_only',
|
||||
});
|
||||
}
|
||||
catch (error) {
|
||||
console.error('Error triggering dispensary orchestrator:', error);
|
||||
res.status(500).json({ error: 'Failed to trigger orchestrator' });
|
||||
}
|
||||
});
|
||||
/**
|
||||
* POST /api/schedule/trigger/dispensaries/batch
|
||||
* Trigger orchestrator for multiple dispensaries
|
||||
*/
|
||||
router.post('/trigger/dispensaries/batch', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
|
||||
try {
|
||||
const { dispensary_ids, concurrency } = req.body;
|
||||
if (!Array.isArray(dispensary_ids) || dispensary_ids.length === 0) {
|
||||
return res.status(400).json({ error: 'dispensary_ids must be a non-empty array' });
|
||||
}
|
||||
const results = await (0, dispensary_orchestrator_1.runBatchDispensaryOrchestrator)(dispensary_ids, concurrency || 3);
|
||||
const summary = {
|
||||
total: results.length,
|
||||
success: results.filter(r => r.status === 'success').length,
|
||||
sandbox_only: results.filter(r => r.status === 'sandbox_only').length,
|
||||
detection_only: results.filter(r => r.status === 'detection_only').length,
|
||||
error: results.filter(r => r.status === 'error').length,
|
||||
};
|
||||
res.json({ results, summary });
|
||||
}
|
||||
catch (error) {
|
||||
console.error('Error triggering batch orchestrator:', error);
|
||||
res.status(500).json({ error: 'Failed to trigger batch orchestrator' });
|
||||
}
|
||||
});
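A minimal sketch of calling the batch trigger from a client; the base URL and token are assumptions, not part of this commit, and the response shape mirrors the summary object built above.

// Hypothetical client call (host, port, and auth token are assumed):
const res = await fetch('http://localhost:3000/api/schedule/trigger/dispensaries/batch', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` },
    body: JSON.stringify({ dispensary_ids: [12, 34, 56], concurrency: 3 }),
});
const { results, summary } = await res.json();
// summary, e.g.: { total: 3, success: 2, sandbox_only: 0, detection_only: 0, error: 1 }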
/**
 * GET /api/schedule/dispensary-due
 * Get dispensaries that are due for orchestration
 */
router.get('/dispensary-due', async (req, res) => {
    try {
        const limit = parseInt(req.query.limit) || 10;
        const dispensaryIds = await (0, dispensary_orchestrator_1.getDispensariesDueForOrchestration)(Math.min(limit, 50));
        // Get details for the due dispensaries
        if (dispensaryIds.length > 0) {
            const details = await migrate_1.pool.query(`
                SELECT d.id, d.name, d.product_provider, d.product_crawler_mode,
                       dcs.next_run_at, dcs.last_status, dcs.priority
                FROM dispensaries d
                LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
                WHERE d.id = ANY($1)
                ORDER BY COALESCE(dcs.priority, 0) DESC, dcs.last_run_at ASC NULLS FIRST
            `, [dispensaryIds]);
            res.json({ dispensaries_due: details.rows, count: dispensaryIds.length });
        }
        else {
            res.json({ dispensaries_due: [], count: 0 });
        }
    }
    catch (error) {
        console.error('Error getting dispensaries due for orchestration:', error);
        res.status(500).json({ error: 'Failed to get dispensaries due' });
    }
});
/**
 * POST /api/schedule/dispensaries/bootstrap
 * Ensure all dispensaries have schedule entries
 */
router.post('/dispensaries/bootstrap', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const { interval_minutes } = req.body;
        const result = await (0, dispensary_orchestrator_1.ensureAllDispensariesHaveSchedules)(interval_minutes || 240);
        res.json({
            message: `Created ${result.created} new schedules, ${result.existing} already existed`,
            created: result.created,
            existing: result.existing,
        });
    }
    catch (error) {
        console.error('Error bootstrapping dispensary schedules:', error);
        res.status(500).json({ error: 'Failed to bootstrap schedules' });
    }
});
// ============================================
// Platform ID & Menu Type Detection Endpoints
// ============================================
/**
 * POST /api/schedule/dispensaries/:id/resolve-platform-id
 * Resolve the Dutchie platform_dispensary_id from menu_url slug
 */
router.post('/dispensaries/:id/resolve-platform-id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const dispensaryId = parseInt(req.params.id);
        if (isNaN(dispensaryId)) {
            return res.status(400).json({ error: 'Invalid dispensary ID' });
        }
        // Get dispensary info
        const dispensaryResult = await migrate_1.pool.query(`
            SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id
            FROM dispensaries WHERE id = $1
        `, [dispensaryId]);
        if (dispensaryResult.rows.length === 0) {
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        const dispensary = dispensaryResult.rows[0];
        // Check if already resolved
        if (dispensary.platform_dispensary_id) {
            return res.json({
                success: true,
                message: 'Platform ID already resolved',
                platform_dispensary_id: dispensary.platform_dispensary_id,
                already_resolved: true
            });
        }
        // Extract slug from menu_url for Dutchie URLs
        let slugToResolve = dispensary.slug;
        if (dispensary.menu_url) {
            // Match embedded-menu or dispensary URLs
            const match = dispensary.menu_url.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i);
            if (match) {
                slugToResolve = match[1];
            }
        }
        if (!slugToResolve) {
            return res.status(400).json({
                error: 'No slug available to resolve platform ID',
                menu_url: dispensary.menu_url
            });
        }
        console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`);
        // Resolve platform ID using GraphQL client
        const platformId = await (0, graphql_client_1.resolveDispensaryId)(slugToResolve);
        if (!platformId) {
            return res.status(404).json({
                error: 'Could not resolve platform ID',
                slug_tried: slugToResolve,
                message: 'The dispensary might not be on Dutchie or the slug is incorrect'
            });
        }
        // Update the dispensary with resolved platform ID
        await migrate_1.pool.query(`
            UPDATE dispensaries
            SET platform_dispensary_id = $1,
                menu_type = COALESCE(menu_type, 'dutchie'),
                updated_at = NOW()
            WHERE id = $2
        `, [platformId, dispensaryId]);
        res.json({
            success: true,
            platform_dispensary_id: platformId,
            slug_resolved: slugToResolve,
            message: `Platform ID resolved: ${platformId}`
        });
    }
    catch (error) {
        console.error('Error resolving platform ID:', error);
        res.status(500).json({ error: 'Failed to resolve platform ID', details: error.message });
    }
});
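The slug regex above accepts both the embedded-menu and dispensary/dispensaries URL shapes; a quick sanity check (the sample URLs are made up for illustration):

const SLUG_RE = /(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i;
'https://dutchie.com/embedded-menu/green-leaf?menuType=rec'.match(SLUG_RE)?.[1]; // 'green-leaf'
'https://dutchie.com/dispensary/desert-bloom#menu'.match(SLUG_RE)?.[1];          // 'desert-bloom'
'https://example.com/our-menu'.match(SLUG_RE);                                   // null, so the stored slug is used instead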
/**
 * POST /api/schedule/dispensaries/:id/detect-menu-type
 * Detect menu type from menu_url
 */
router.post('/dispensaries/:id/detect-menu-type', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const dispensaryId = parseInt(req.params.id);
        if (isNaN(dispensaryId)) {
            return res.status(400).json({ error: 'Invalid dispensary ID' });
        }
        // Get dispensary info
        const dispensaryResult = await migrate_1.pool.query(`
            SELECT id, name, menu_url, website FROM dispensaries WHERE id = $1
        `, [dispensaryId]);
        if (dispensaryResult.rows.length === 0) {
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        const dispensary = dispensaryResult.rows[0];
        const urlToCheck = dispensary.menu_url || dispensary.website;
        if (!urlToCheck) {
            return res.status(400).json({ error: 'No menu_url or website to detect from' });
        }
        // Detect menu type from URL patterns
        let detectedType = 'unknown';
        if (urlToCheck.includes('dutchie.com') || urlToCheck.includes('embedded-menu')) {
            detectedType = 'dutchie';
        }
        else if (urlToCheck.includes('iheartjane.com') || urlToCheck.includes('jane.co')) {
            detectedType = 'jane';
        }
        else if (urlToCheck.includes('weedmaps.com')) {
            detectedType = 'weedmaps';
        }
        else if (urlToCheck.includes('leafly.com')) {
            detectedType = 'leafly';
        }
        else if (urlToCheck.includes('treez.io') || urlToCheck.includes('treez.co')) {
            detectedType = 'treez';
        }
        else if (urlToCheck.includes('meadow.com')) {
            detectedType = 'meadow';
        }
        else if (urlToCheck.includes('blaze.me') || urlToCheck.includes('blazepay')) {
            detectedType = 'blaze';
        }
        else if (urlToCheck.includes('flowhub.com')) {
            detectedType = 'flowhub';
        }
        else if (urlToCheck.includes('dispense.app')) {
            detectedType = 'dispense';
        }
        else if (urlToCheck.includes('covasoft.com')) {
            detectedType = 'cova';
        }
        // Update menu_type
        await migrate_1.pool.query(`
            UPDATE dispensaries
            SET menu_type = $1, updated_at = NOW()
            WHERE id = $2
        `, [detectedType, dispensaryId]);
        res.json({
            success: true,
            menu_type: detectedType,
            url_checked: urlToCheck,
            message: `Menu type detected: ${detectedType}`
        });
    }
    catch (error) {
        console.error('Error detecting menu type:', error);
        res.status(500).json({ error: 'Failed to detect menu type' });
    }
});
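The same if/else chain reappears verbatim in refresh-detection below; a table-driven helper would keep the two copies in sync. A sketch only, not part of this commit:

// Sketch: equivalent table-driven detection; order matters, first match wins.
const MENU_TYPE_PATTERNS = [
    ['dutchie', ['dutchie.com', 'embedded-menu']],
    ['jane', ['iheartjane.com', 'jane.co']],
    ['weedmaps', ['weedmaps.com']],
    ['leafly', ['leafly.com']],
    ['treez', ['treez.io', 'treez.co']],
    ['meadow', ['meadow.com']],
    ['blaze', ['blaze.me', 'blazepay']],
    ['flowhub', ['flowhub.com']],
    ['dispense', ['dispense.app']],
    ['cova', ['covasoft.com']],
];
function detectMenuType(url) {
    const hit = MENU_TYPE_PATTERNS.find(([, needles]) => needles.some(n => url.includes(n)));
    return hit ? hit[0] : 'unknown';
}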
/**
 * POST /api/schedule/dispensaries/:id/refresh-detection
 * Combined: detect menu_type AND resolve platform_dispensary_id if dutchie
 */
router.post('/dispensaries/:id/refresh-detection', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const dispensaryId = parseInt(req.params.id);
        if (isNaN(dispensaryId)) {
            return res.status(400).json({ error: 'Invalid dispensary ID' });
        }
        // Get dispensary info
        const dispensaryResult = await migrate_1.pool.query(`
            SELECT id, name, slug, menu_url, website FROM dispensaries WHERE id = $1
        `, [dispensaryId]);
        if (dispensaryResult.rows.length === 0) {
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        const dispensary = dispensaryResult.rows[0];
        const urlToCheck = dispensary.menu_url || dispensary.website;
        if (!urlToCheck) {
            return res.status(400).json({ error: 'No menu_url or website to detect from' });
        }
        // Detect menu type from URL patterns
        let detectedType = 'unknown';
        if (urlToCheck.includes('dutchie.com') || urlToCheck.includes('embedded-menu')) {
            detectedType = 'dutchie';
        }
        else if (urlToCheck.includes('iheartjane.com') || urlToCheck.includes('jane.co')) {
            detectedType = 'jane';
        }
        else if (urlToCheck.includes('weedmaps.com')) {
            detectedType = 'weedmaps';
        }
        else if (urlToCheck.includes('leafly.com')) {
            detectedType = 'leafly';
        }
        else if (urlToCheck.includes('treez.io') || urlToCheck.includes('treez.co')) {
            detectedType = 'treez';
        }
        else if (urlToCheck.includes('meadow.com')) {
            detectedType = 'meadow';
        }
        else if (urlToCheck.includes('blaze.me') || urlToCheck.includes('blazepay')) {
            detectedType = 'blaze';
        }
        else if (urlToCheck.includes('flowhub.com')) {
            detectedType = 'flowhub';
        }
        else if (urlToCheck.includes('dispense.app')) {
            detectedType = 'dispense';
        }
        else if (urlToCheck.includes('covasoft.com')) {
            detectedType = 'cova';
        }
        // Update menu_type first
        await migrate_1.pool.query(`
            UPDATE dispensaries SET menu_type = $1, updated_at = NOW() WHERE id = $2
        `, [detectedType, dispensaryId]);
        let platformId = null;
        // If dutchie, also try to resolve platform ID
        if (detectedType === 'dutchie') {
            let slugToResolve = dispensary.slug;
            const match = urlToCheck.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i);
            if (match) {
                slugToResolve = match[1];
            }
            if (slugToResolve) {
                try {
                    console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`);
                    platformId = await (0, graphql_client_1.resolveDispensaryId)(slugToResolve);
                    if (platformId) {
                        await migrate_1.pool.query(`
                            UPDATE dispensaries SET platform_dispensary_id = $1, updated_at = NOW() WHERE id = $2
                        `, [platformId, dispensaryId]);
                    }
                }
                catch (err) {
                    console.warn(`[Schedule] Failed to resolve platform ID: ${err.message}`);
                }
            }
        }
        res.json({
            success: true,
            menu_type: detectedType,
            platform_dispensary_id: platformId,
            url_checked: urlToCheck,
            can_crawl: detectedType === 'dutchie' && !!platformId
        });
    }
    catch (error) {
        console.error('Error refreshing detection:', error);
        res.status(500).json({ error: 'Failed to refresh detection' });
    }
});
/**
 * PUT /api/schedule/dispensaries/:id/toggle-active
 * Enable or disable schedule for a dispensary
 */
router.put('/dispensaries/:id/toggle-active', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const dispensaryId = parseInt(req.params.id);
        if (isNaN(dispensaryId)) {
            return res.status(400).json({ error: 'Invalid dispensary ID' });
        }
        const { is_active } = req.body;
        // Upsert schedule with new is_active value
        const result = await migrate_1.pool.query(`
            INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
            VALUES ($1, $2, 240, 0)
            ON CONFLICT (dispensary_id) DO UPDATE SET
                is_active = $2,
                updated_at = NOW()
            RETURNING *
        `, [dispensaryId, is_active]);
        res.json({
            success: true,
            schedule: result.rows[0],
            message: is_active ? 'Schedule enabled' : 'Schedule disabled'
        });
    }
    catch (error) {
        console.error('Error toggling schedule active status:', error);
        res.status(500).json({ error: 'Failed to toggle schedule' });
    }
});
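Because the write is an upsert, the toggle works even for dispensaries that never had a schedule row; the first call inserts the row with the 240-minute interval and priority 0 defaults. A hypothetical call (base URL, dispensary id, and token are assumptions):

// Hypothetical client call:
await fetch('http://localhost:3000/api/schedule/dispensaries/42/toggle-active', {
    method: 'PUT',
    headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` },
    body: JSON.stringify({ is_active: false }),
}); // -> { success: true, schedule: { ... }, message: 'Schedule disabled' }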
/**
 * DELETE /api/schedule/dispensaries/:id/schedule
 * Delete schedule for a dispensary
 */
router.delete('/dispensaries/:id/schedule', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const dispensaryId = parseInt(req.params.id);
        if (isNaN(dispensaryId)) {
            return res.status(400).json({ error: 'Invalid dispensary ID' });
        }
        const result = await migrate_1.pool.query(`
            DELETE FROM dispensary_crawl_schedule WHERE dispensary_id = $1 RETURNING id
        `, [dispensaryId]);
        const deleted = (result.rowCount ?? 0) > 0;
        res.json({
            success: true,
            deleted,
            message: deleted ? 'Schedule deleted' : 'No schedule to delete'
        });
    }
    catch (error) {
        console.error('Error deleting schedule:', error);
        res.status(500).json({ error: 'Failed to delete schedule' });
    }
});
exports.default = router;
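For orientation, the route docs above all assume this router is mounted at /api/schedule; a minimal sketch of that wiring (file path and app setup are assumptions, the server bootstrap is not part of this diff):

// Sketch only:
const express = require('express');
const scheduleRouter = require('./routes/schedule').default;
const app = express();
app.use(express.json());
app.use('/api/schedule', scheduleRouter);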
249
backend/dist/routes/scraper-monitor.js
vendored
@@ -1,4 +1,37 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.activeScrapers = void 0;
exports.registerScraper = registerScraper;
@@ -49,32 +82,42 @@ router.get('/active/:id', async (req, res) => {
// Get scraper history (last 50 completed scrapes)
router.get('/history', async (req, res) => {
    try {
        const { limit = 50, store_id } = req.query;
        const { limit = 50, dispensary_id } = req.query;
        let query = `
            SELECT
                s.id as store_id,
                s.name as store_name,
                c.id as category_id,
                c.name as category_name,
                c.last_scraped_at,
                d.id as dispensary_id,
                COALESCE(d.dba_name, d.name) as dispensary_name,
                d.city,
                d.state,
                dcj.id as job_id,
                dcj.job_type,
                dcj.status,
                dcj.products_found,
                dcj.products_new,
                dcj.products_updated,
                dcj.in_stock_count,
                dcj.out_of_stock_count,
                dcj.duration_ms,
                dcj.completed_at as last_scraped_at,
                dcj.error_message,
                (
                    SELECT COUNT(*)
                    FROM products p
                    WHERE p.store_id = s.id
                    AND p.category_id = c.id
                    WHERE p.dispensary_id = d.id
                    AND p.last_seen_at >= NOW() - INTERVAL '7 days'
                ) as product_count
            FROM stores s
            LEFT JOIN categories c ON c.store_id = s.id
            WHERE c.last_scraped_at IS NOT NULL
            FROM dispensary_crawl_jobs dcj
            JOIN dispensaries d ON d.id = dcj.dispensary_id
            WHERE dcj.completed_at IS NOT NULL
        `;
        const params = [];
        let paramCount = 1;
        if (store_id) {
            query += ` AND s.id = $${paramCount}`;
            params.push(store_id);
        if (dispensary_id) {
            query += ` AND d.id = $${paramCount}`;
            params.push(dispensary_id);
            paramCount++;
        }
        query += ` ORDER BY c.last_scraped_at DESC LIMIT $${paramCount}`;
        query += ` ORDER BY dcj.completed_at DESC LIMIT $${paramCount}`;
        params.push(limit);
        const result = await migrate_1.pool.query(query, params);
        res.json({ history: result.rows });
@@ -127,4 +170,180 @@ function completeScraper(id, error) {
    }, 5 * 60 * 1000);
}
}
// Dispensary crawl jobs endpoints
router.get('/jobs/stats', async (req, res) => {
    try {
        const { dispensary_id } = req.query;
        let whereClause = '';
        const params = [];
        if (dispensary_id) {
            whereClause = 'WHERE dispensary_id = $1';
            params.push(dispensary_id);
        }
        const result = await migrate_1.pool.query(`
            SELECT
                status,
                COUNT(*) as count,
                SUM(products_found) as total_products_found,
                SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved
            FROM dispensary_crawl_jobs
            ${whereClause}
            GROUP BY status
        `, params);
        const stats = {
            pending: 0,
            in_progress: 0,
            completed: 0,
            failed: 0,
            total_products_found: 0,
            total_products_saved: 0
        };
        result.rows.forEach((row) => {
            stats[row.status] = parseInt(row.count);
            if (row.status === 'completed') {
                stats.total_products_found += parseInt(row.total_products_found || '0');
                stats.total_products_saved += parseInt(row.total_products_saved || '0');
            }
        });
        res.json(stats);
    }
    catch (error) {
        console.error('Error fetching job stats:', error);
        res.status(500).json({ error: 'Failed to fetch job stats' });
    }
});
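Every status key starts at zero so callers always receive a complete object even when a status has no rows, and product totals only accumulate from completed jobs. An example response (the router's mount path is not shown in this diff; values are illustrative):

// GET <monitor-mount>/jobs/stats -> illustrative payload:
// {
//     "pending": 4,
//     "in_progress": 2,
//     "completed": 117,
//     "failed": 3,
//     "total_products_found": 45210,
//     "total_products_saved": 44987
// }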
router.get('/jobs/active', async (req, res) => {
    try {
        const { dispensary_id } = req.query;
        let whereClause = "WHERE dcj.status = 'in_progress'";
        const params = [];
        let paramCount = 1;
        if (dispensary_id) {
            whereClause += ` AND dcj.dispensary_id = $${paramCount}`;
            params.push(dispensary_id);
            paramCount++;
        }
        const result = await migrate_1.pool.query(`
            SELECT
                dcj.id,
                dcj.dispensary_id,
                COALESCE(d.dba_name, d.name) as dispensary_name,
                dcj.job_type,
                dcj.status,
                dcj.worker_id,
                dcj.started_at,
                dcj.products_found,
                COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
                EXTRACT(EPOCH FROM (NOW() - dcj.started_at)) as duration_seconds
            FROM dispensary_crawl_jobs dcj
            JOIN dispensaries d ON d.id = dcj.dispensary_id
            ${whereClause}
            ORDER BY dcj.started_at DESC
        `, params);
        res.json({ jobs: result.rows });
    }
    catch (error) {
        console.error('Error fetching active jobs:', error);
        res.status(500).json({ error: 'Failed to fetch active jobs' });
    }
});
router.get('/jobs/recent', async (req, res) => {
    try {
        const { limit = 50, dispensary_id, status } = req.query;
        let whereClause = '';
        const params = [];
        let paramCount = 1;
        const conditions = [];
        if (dispensary_id) {
            conditions.push(`dcj.dispensary_id = $${paramCount}`);
            params.push(dispensary_id);
            paramCount++;
        }
        if (status) {
            conditions.push(`dcj.status = $${paramCount}`);
            params.push(status);
            paramCount++;
        }
        if (conditions.length > 0) {
            whereClause = 'WHERE ' + conditions.join(' AND ');
        }
        params.push(limit);
        const result = await migrate_1.pool.query(`
            SELECT
                dcj.id,
                dcj.dispensary_id,
                COALESCE(d.dba_name, d.name) as dispensary_name,
                dcj.job_type,
                dcj.status,
                dcj.worker_id,
                dcj.started_at,
                dcj.completed_at,
                dcj.products_found,
                COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
                dcj.error_message,
                EXTRACT(EPOCH FROM (COALESCE(dcj.completed_at, NOW()) - dcj.started_at)) as duration_seconds
            FROM dispensary_crawl_jobs dcj
            JOIN dispensaries d ON d.id = dcj.dispensary_id
            ${whereClause}
            ORDER BY dcj.created_at DESC
            LIMIT $${paramCount}
        `, params);
        res.json({ jobs: result.rows });
    }
    catch (error) {
        console.error('Error fetching recent jobs:', error);
        res.status(500).json({ error: 'Failed to fetch recent jobs' });
    }
});
router.get('/jobs/workers', async (req, res) => {
    try {
        const { dispensary_id } = req.query;
        let whereClause = "WHERE status = 'in_progress' AND worker_id IS NOT NULL";
        const params = [];
        if (dispensary_id) {
            whereClause += ` AND dispensary_id = $1`;
            params.push(dispensary_id);
        }
        const result = await migrate_1.pool.query(`
            SELECT
                worker_id,
                COUNT(*) as active_jobs,
                SUM(products_found) as total_products_found,
                SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved,
                MIN(started_at) as earliest_start,
                MAX(started_at) as latest_start
            FROM dispensary_crawl_jobs
            ${whereClause}
            GROUP BY worker_id
            ORDER BY worker_id
        `, params);
        res.json({ workers: result.rows });
    }
    catch (error) {
        console.error('Error fetching worker stats:', error);
        res.status(500).json({ error: 'Failed to fetch worker stats' });
    }
});
router.get('/jobs/worker-logs/:workerId', async (req, res) => {
    try {
        const { workerId } = req.params;
        const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
        const path = await Promise.resolve().then(() => __importStar(require('path')));
        const logPath = path.join('/tmp', `worker-${workerId}.log`);
        try {
            const logs = await fs.readFile(logPath, 'utf-8');
            const lines = logs.split('\n');
            // Return last 100 lines
            const recentLogs = lines.slice(-100).join('\n');
            res.json({ logs: recentLogs });
        }
        catch (fileError) {
            res.json({ logs: 'No logs available for this worker yet.' });
        }
    }
    catch (error) {
        console.error('Failed to get worker logs:', error);
        res.status(500).json({ error: 'Failed to get worker logs' });
    }
});
exports.default = router;
171
backend/dist/routes/stores.js
vendored
@@ -60,31 +60,185 @@ router.get('/', async (req, res) => {
        res.status(500).json({ error: 'Failed to fetch stores' });
    }
});
// Get single store
// Freshness threshold in hours
const STALE_THRESHOLD_HOURS = 4;
function calculateFreshness(lastScrapedAt) {
    if (!lastScrapedAt) {
        return {
            last_scraped_at: null,
            is_stale: true,
            freshness: 'Never scraped',
            hours_since_scrape: null
        };
    }
    const now = new Date();
    const diffMs = now.getTime() - lastScrapedAt.getTime();
    const diffHours = diffMs / (1000 * 60 * 60);
    const isStale = diffHours > STALE_THRESHOLD_HOURS;
    let freshnessText;
    if (diffHours < 1) {
        const mins = Math.round(diffHours * 60);
        freshnessText = `${mins} minute${mins !== 1 ? 's' : ''} ago`;
    }
    else if (diffHours < 24) {
        const hrs = Math.round(diffHours);
        freshnessText = `${hrs} hour${hrs !== 1 ? 's' : ''} ago`;
    }
    else {
        const days = Math.round(diffHours / 24);
        freshnessText = `${days} day${days !== 1 ? 's' : ''} ago`;
    }
    return {
        last_scraped_at: lastScrapedAt.toISOString(),
        is_stale: isStale,
        freshness: freshnessText,
        hours_since_scrape: Math.round(diffHours * 10) / 10
    };
}
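With the 4-hour threshold above, a scrape 90 minutes old reads as fresh, while a missing timestamp is stale and flagged as never scraped:

// Illustrative calls:
calculateFreshness(new Date(Date.now() - 90 * 60 * 1000));
// -> { last_scraped_at: '...', is_stale: false, freshness: '2 hours ago', hours_since_scrape: 1.5 }
calculateFreshness(null);
// -> { last_scraped_at: null, is_stale: true, freshness: 'Never scraped', hours_since_scrape: null }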
function detectProvider(dutchieUrl) {
    if (!dutchieUrl)
        return 'unknown';
    if (dutchieUrl.includes('dutchie.com'))
        return 'Dutchie';
    if (dutchieUrl.includes('iheartjane.com') || dutchieUrl.includes('jane.co'))
        return 'Jane';
    if (dutchieUrl.includes('treez.io'))
        return 'Treez';
    if (dutchieUrl.includes('weedmaps.com'))
        return 'Weedmaps';
    if (dutchieUrl.includes('leafly.com'))
        return 'Leafly';
    return 'Custom';
}
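The checks are ordered and substring-based, so unrecognized hosts fall through to 'Custom' while a missing URL reports 'unknown' (sample URLs are illustrative):

detectProvider('https://dutchie.com/embedded-menu/some-store'); // 'Dutchie'
detectProvider('https://shop.example.com/menu');                // 'Custom'
detectProvider(null);                                           // 'unknown'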
// Get single store with full details
router.get('/:id', async (req, res) => {
    try {
        const { id } = req.params;
        // Get store with counts and linked dispensary
        const result = await migrate_1.pool.query(`
            SELECT
            SELECT
                s.*,
                d.id as dispensary_id,
                d.name as dispensary_name,
                d.slug as dispensary_slug,
                d.state as dispensary_state,
                d.city as dispensary_city,
                d.address as dispensary_address,
                d.menu_provider as dispensary_menu_provider,
                COUNT(DISTINCT p.id) as product_count,
                COUNT(DISTINCT c.id) as category_count
                COUNT(DISTINCT c.id) as category_count,
                COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = true) as in_stock_count,
                COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = false) as out_of_stock_count
            FROM stores s
            LEFT JOIN dispensaries d ON s.dispensary_id = d.id
            LEFT JOIN products p ON s.id = p.store_id
            LEFT JOIN categories c ON s.id = c.store_id
            WHERE s.id = $1
            GROUP BY s.id
            GROUP BY s.id, d.id, d.name, d.slug, d.state, d.city, d.address, d.menu_provider
        `, [id]);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Store not found' });
        }
        res.json(result.rows[0]);
        const store = result.rows[0];
        // Get recent crawl jobs for this store
        const jobsResult = await migrate_1.pool.query(`
            SELECT
                id, status, job_type, trigger_type,
                started_at, completed_at,
                products_found, products_new, products_updated,
                in_stock_count, out_of_stock_count,
                error_message
            FROM crawl_jobs
            WHERE store_id = $1
            ORDER BY created_at DESC
            LIMIT 10
        `, [id]);
        // Get schedule info if exists
        const scheduleResult = await migrate_1.pool.query(`
            SELECT
                enabled, interval_hours, next_run_at, last_run_at
            FROM store_crawl_schedule
            WHERE store_id = $1
        `, [id]);
        // Calculate freshness
        const freshness = calculateFreshness(store.last_scraped_at);
        // Detect provider from URL
        const provider = detectProvider(store.dutchie_url);
        // Build response
        const response = {
            ...store,
            provider,
            freshness: freshness.freshness,
            is_stale: freshness.is_stale,
            hours_since_scrape: freshness.hours_since_scrape,
            linked_dispensary: store.dispensary_id ? {
                id: store.dispensary_id,
                name: store.dispensary_name,
                slug: store.dispensary_slug,
                state: store.dispensary_state,
                city: store.dispensary_city,
                address: store.dispensary_address,
                menu_provider: store.dispensary_menu_provider
            } : null,
            schedule: scheduleResult.rows[0] || null,
            recent_jobs: jobsResult.rows
        };
        // Remove redundant dispensary fields from root
        delete response.dispensary_name;
        delete response.dispensary_slug;
        delete response.dispensary_state;
        delete response.dispensary_city;
        delete response.dispensary_address;
        delete response.dispensary_menu_provider;
        res.json(response);
    }
    catch (error) {
        console.error('Error fetching store:', error);
        res.status(500).json({ error: 'Failed to fetch store' });
    }
});
// Get store brands
router.get('/:id/brands', async (req, res) => {
    try {
        const { id } = req.params;
        const result = await migrate_1.pool.query(`
            SELECT name
            FROM brands
            WHERE store_id = $1
            ORDER BY name
        `, [id]);
        const brands = result.rows.map((row) => row.name);
        res.json({ brands });
    }
    catch (error) {
        console.error('Error fetching store brands:', error);
        res.status(500).json({ error: 'Failed to fetch store brands' });
    }
});
// Get store specials
router.get('/:id/specials', async (req, res) => {
    try {
        const { id } = req.params;
        const { date } = req.query;
        // Use provided date or today's date
        const queryDate = date || new Date().toISOString().split('T')[0];
        const result = await migrate_1.pool.query(`
            SELECT
                s.*,
                p.name as product_name,
                p.image_url as product_image
            FROM specials s
            LEFT JOIN products p ON s.product_id = p.id
            WHERE s.store_id = $1 AND s.valid_date = $2
            ORDER BY s.name
        `, [id, queryDate]);
        res.json({ specials: result.rows, date: queryDate });
    }
    catch (error) {
        console.error('Error fetching store specials:', error);
        res.status(500).json({ error: 'Failed to fetch store specials' });
    }
});
// Create store
router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
@@ -146,17 +300,18 @@ router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, r
router.post('/:id/scrape', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const { parallel = 3 } = req.body; // Default to 3 parallel scrapers
        const { parallel = 3, userAgent } = req.body; // Default to 3 parallel scrapers
        const storeResult = await migrate_1.pool.query('SELECT id FROM stores WHERE id = $1', [id]);
        if (storeResult.rows.length === 0) {
            return res.status(404).json({ error: 'Store not found' });
        }
        (0, scraper_v2_1.scrapeStore)(parseInt(id), parseInt(parallel)).catch(err => {
        (0, scraper_v2_1.scrapeStore)(parseInt(id), parseInt(parallel), userAgent).catch(err => {
            console.error('Background scrape error:', err);
        });
        res.json({
            message: 'Scrape started',
            parallel: parseInt(parallel)
            parallel: parseInt(parallel),
            userAgent: userAgent || 'random'
        });
    }
    catch (error) {
24
backend/dist/routes/version.js
vendored
Normal file
@@ -0,0 +1,24 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = require("express");
const router = (0, express_1.Router)();
/**
 * GET /api/version
 * Returns build version information for display in admin UI
 */
router.get('/', async (req, res) => {
    try {
        const versionInfo = {
            build_version: process.env.APP_BUILD_VERSION || 'dev',
            git_sha: process.env.APP_GIT_SHA || 'local',
            build_time: process.env.APP_BUILD_TIME || new Date().toISOString(),
            image_tag: process.env.CONTAINER_IMAGE_TAG || 'local',
        };
        res.json(versionInfo);
    }
    catch (error) {
        console.error('Error fetching version info:', error);
        res.status(500).json({ error: 'Failed to fetch version info' });
    }
});
exports.default = router;
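All four fields fall back to local/dev defaults, so the endpoint is safe to call outside CI-built images. An illustrative response:

// GET /api/version -> (illustrative values)
// { "build_version": "dev", "git_sha": "local", "build_time": "2025-01-01T00:00:00.000Z", "image_tag": "local" }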
234
backend/dist/scraper-v2/downloader.js
vendored
@@ -8,15 +8,87 @@ const puppeteer_1 = __importDefault(require("puppeteer"));
const axios_1 = __importDefault(require("axios"));
const types_1 = require("./types");
const logger_1 = require("../services/logger");
// Fingerprint profiles for randomization
const SCREEN_RESOLUTIONS = [
    { width: 1920, height: 1080 },
    { width: 1366, height: 768 },
    { width: 1536, height: 864 },
    { width: 1440, height: 900 },
    { width: 1280, height: 720 },
    { width: 2560, height: 1440 },
    { width: 1680, height: 1050 },
    { width: 1600, height: 900 },
];
const TIMEZONES = [
    'America/New_York',
    'America/Chicago',
    'America/Denver',
    'America/Los_Angeles',
    'America/Phoenix',
];
const LANGUAGES = [
    ['en-US', 'en'],
    ['en-US', 'en', 'es'],
    ['en-US'],
];
const PLATFORMS = [
    'Win32',
    'MacIntel',
    'Linux x86_64',
];
const WEBGL_VENDORS = [
    'Google Inc. (NVIDIA)',
    'Google Inc. (Intel)',
    'Google Inc. (AMD)',
    'Intel Inc.',
    'NVIDIA Corporation',
];
const WEBGL_RENDERERS = [
    'ANGLE (NVIDIA GeForce GTX 1080 Direct3D11 vs_5_0 ps_5_0)',
    'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)',
    'ANGLE (AMD Radeon RX 580 Series Direct3D11 vs_5_0 ps_5_0)',
    'Intel Iris OpenGL Engine',
    'NVIDIA GeForce RTX 3070/PCIe/SSE2',
    'AMD Radeon Pro 5500M OpenGL Engine',
];
function generateRandomFingerprint() {
    return {
        screen: SCREEN_RESOLUTIONS[Math.floor(Math.random() * SCREEN_RESOLUTIONS.length)],
        timezone: TIMEZONES[Math.floor(Math.random() * TIMEZONES.length)],
        languages: LANGUAGES[Math.floor(Math.random() * LANGUAGES.length)],
        platform: PLATFORMS[Math.floor(Math.random() * PLATFORMS.length)],
        hardwareConcurrency: [4, 8, 12, 16][Math.floor(Math.random() * 4)],
        deviceMemory: [4, 8, 16, 32][Math.floor(Math.random() * 4)],
        webglVendor: WEBGL_VENDORS[Math.floor(Math.random() * WEBGL_VENDORS.length)],
        webglRenderer: WEBGL_RENDERERS[Math.floor(Math.random() * WEBGL_RENDERERS.length)],
    };
}
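Each axis is drawn independently, giving roughly 8 x 5 x 3 x 3 x 4 x 4 x 5 x 6 = 172,800 distinct fingerprints. One possible draw (illustrative):

// One possible generateRandomFingerprint() result:
// {
//     screen: { width: 1536, height: 864 },
//     timezone: 'America/Denver',
//     languages: ['en-US', 'en'],
//     platform: 'Win32',
//     hardwareConcurrency: 8,
//     deviceMemory: 16,
//     webglVendor: 'Google Inc. (Intel)',
//     webglRenderer: 'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)'
// }
// Note: because the draws are independent, mismatched combinations (e.g. a Mac platform
// with a Direct3D renderer) can occur; nothing here correlates the axes.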
class Downloader {
    browser = null;
    page = null;
    pageInUse = false;
    currentFingerprint = generateRandomFingerprint();
    needsNewFingerprint = false;
    /**
     * Initialize browser instance (lazy initialization)
     * Force new fingerprint on next browser creation
     */
    async getBrowser() {
    rotateFingerprint() {
        this.needsNewFingerprint = true;
        logger_1.logger.info('scraper', '🔄 Fingerprint rotation scheduled');
    }
    /**
     * Initialize browser instance with fingerprint
     */
    async getBrowser(forceNew = false) {
        // Create new browser if needed for fingerprint rotation
        if (forceNew || this.needsNewFingerprint) {
            await this.close();
            this.currentFingerprint = generateRandomFingerprint();
            this.needsNewFingerprint = false;
            logger_1.logger.info('scraper', `🎭 New fingerprint: ${this.currentFingerprint.screen.width}x${this.currentFingerprint.screen.height}, ${this.currentFingerprint.timezone}, ${this.currentFingerprint.platform}`);
        }
        if (!this.browser || !this.browser.isConnected()) {
            const { screen } = this.currentFingerprint;
            const launchOptions = {
                headless: 'new',
                args: [
@@ -24,9 +96,11 @@ class Downloader {
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-blink-features=AutomationControlled',
                    '--window-size=1920,1080',
                    `--window-size=${screen.width},${screen.height}`,
                    '--disable-web-security',
                    '--disable-features=IsolateOrigins,site-per-process'
                    '--disable-features=IsolateOrigins,site-per-process',
                    '--disable-infobars',
                    '--disable-extensions',
                ]
            };
            this.browser = await puppeteer_1.default.launch(launchOptions);
@@ -35,45 +109,137 @@ class Downloader {
        return this.browser;
    }
    /**
     * Get or create a page instance
     * Get or create a page instance with current fingerprint
     */
    async getPage() {
        if (!this.page || this.page.isClosed()) {
            const browser = await this.getBrowser();
    async getPage(forceNew = false) {
        if (!this.page || this.page.isClosed() || forceNew) {
            const browser = await this.getBrowser(forceNew);
            this.page = await browser.newPage();
            await this.page.setViewport({ width: 1920, height: 1080 });
            logger_1.logger.debug('scraper', 'New page created');
            const { screen } = this.currentFingerprint;
            await this.page.setViewport({
                width: screen.width,
                height: screen.height,
                deviceScaleFactor: 1,
            });
            // Apply fingerprint
            await this.applyFingerprint(this.page);
            logger_1.logger.debug('scraper', 'New page created with fingerprint');
        }
        return this.page;
    }
    /**
     * Apply stealth mode to page
     * Apply full fingerprint to page
     */
    async makePageStealthy(page) {
        await page.evaluateOnNewDocument(() => {
            // @ts-ignore - runs in browser context
    async applyFingerprint(page) {
        const fp = this.currentFingerprint;
        await page.evaluateOnNewDocument((fingerprint) => {
            // Hide webdriver
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false,
            });
            // @ts-ignore - runs in browser context
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5],
            // Spoof platform
            Object.defineProperty(navigator, 'platform', {
                get: () => fingerprint.platform,
            });
            // @ts-ignore - runs in browser context
            // Spoof languages
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en'],
                get: () => fingerprint.languages,
            });
            // @ts-ignore - runs in browser context
            // Spoof hardware concurrency
            Object.defineProperty(navigator, 'hardwareConcurrency', {
                get: () => fingerprint.hardwareConcurrency,
            });
            // Spoof device memory
            Object.defineProperty(navigator, 'deviceMemory', {
                get: () => fingerprint.deviceMemory,
            });
            // Spoof plugins (realistic count)
            Object.defineProperty(navigator, 'plugins', {
                get: () => {
                    const plugins = [];
                    for (let i = 0; i < 5; i++) {
                        plugins.push({
                            name: `Plugin ${i}`,
                            filename: `plugin${i}.dll`,
                            description: `Description ${i}`,
                        });
                    }
                    plugins.length = 5;
                    return plugins;
                },
            });
            // Chrome object
            window.chrome = {
                runtime: {},
                loadTimes: () => ({}),
                csi: () => ({}),
                app: {},
            };
            // @ts-ignore - runs in browser context
            // Permissions
            const originalQuery = window.navigator.permissions.query;
            // @ts-ignore - runs in browser context
            window.navigator.permissions.query = (parameters) => parameters.name === 'notifications'
                ? Promise.resolve({ state: 'denied' })
                : originalQuery(parameters);
        });
            // WebGL fingerprint spoofing
            const getParameterProxyHandler = {
                apply: function (target, thisArg, argumentsList) {
                    const param = argumentsList[0];
                    // UNMASKED_VENDOR_WEBGL
                    if (param === 37445) {
                        return fingerprint.webglVendor;
                    }
                    // UNMASKED_RENDERER_WEBGL
                    if (param === 37446) {
                        return fingerprint.webglRenderer;
                    }
                    return Reflect.apply(target, thisArg, argumentsList);
                }
            };
            // Override WebGL
            const originalGetContext = HTMLCanvasElement.prototype.getContext;
            HTMLCanvasElement.prototype.getContext = function (type, ...args) {
                const context = originalGetContext.call(this, type, ...args);
                if (context && (type === 'webgl' || type === 'webgl2' || type === 'experimental-webgl')) {
                    const glContext = context;
                    const originalGetParameter = glContext.getParameter.bind(glContext);
                    glContext.getParameter = new Proxy(originalGetParameter, getParameterProxyHandler);
                }
                return context;
            };
            // Canvas fingerprint noise
            const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
            HTMLCanvasElement.prototype.toDataURL = function (type) {
                const context = this.getContext('2d');
                if (context) {
                    const imageData = context.getImageData(0, 0, this.width, this.height);
                    for (let i = 0; i < imageData.data.length; i += 4) {
                        // Add tiny noise to RGB values
                        imageData.data[i] = imageData.data[i] ^ (Math.random() > 0.5 ? 1 : 0);
                    }
                    context.putImageData(imageData, 0, 0);
                }
                return originalToDataURL.call(this, type);
            };
            // Screen dimensions
            Object.defineProperty(window.screen, 'width', { get: () => fingerprint.screen.width });
            Object.defineProperty(window.screen, 'height', { get: () => fingerprint.screen.height });
            Object.defineProperty(window.screen, 'availWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window.screen, 'availHeight', { get: () => fingerprint.screen.height - 40 });
            Object.defineProperty(window, 'innerWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window, 'innerHeight', { get: () => fingerprint.screen.height - 140 });
            Object.defineProperty(window, 'outerWidth', { get: () => fingerprint.screen.width });
            Object.defineProperty(window, 'outerHeight', { get: () => fingerprint.screen.height });
        }, fp);
        // Set timezone via CDP
        const client = await page.target().createCDPSession();
        await client.send('Emulation.setTimezoneOverride', { timezoneId: fp.timezone });
    }
    /**
     * Apply stealth mode to page (legacy - now uses applyFingerprint)
     */
    async makePageStealthy(page) {
        // Now handled by applyFingerprint
        await this.applyFingerprint(page);
    }
    /**
     * Configure proxy for browser
@@ -162,17 +328,29 @@ class Downloader {
        if (request.metadata.userAgent) {
            await page.setUserAgent(request.metadata.userAgent);
        }
        // Navigate to page
        // Navigate to page - use networkidle2 for SPAs like Dutchie
        // Increased timeout to 90s - Dutchie pages can take 30-40s to fully load
        const navigationPromise = page.goto(request.url, {
            waitUntil: 'domcontentloaded',
            timeout: 60000
            waitUntil: 'networkidle2',
            timeout: 90000
        });
        const response = await navigationPromise;
        if (!response) {
            throw new Error('Navigation failed - no response');
        }
        // Wait for initial render
        await page.waitForTimeout(3000);
        // Wait for React to render product content
        // Try to wait for products, but don't fail if they don't appear (empty category)
        try {
            await page.waitForSelector('[data-testid="product-list-item"], [data-testid="empty-state"]', {
                timeout: 10000
            });
        }
        catch {
            // Products might not exist in this category - continue anyway
            logger_1.logger.debug('scraper', 'No products found within timeout - continuing');
        }
        // Additional wait for any lazy-loaded content
        await page.waitForTimeout(2000);
        // Check for lazy-loaded content
        await this.autoScroll(page);
        // Get page content
57
backend/dist/scraper-v2/engine.js
vendored
@@ -346,7 +346,7 @@ class DutchieSpider {
        catch (error) {
            logger_1.logger.error('scraper', `Category scrape failed: ${error}`);
            if (completeScraper) {
                completeScraper(scraperId, error.toString());
                completeScraper(scraperId, String(error));
            }
            throw error;
        }
@@ -397,7 +397,28 @@ class DutchieSpider {
                // @ts-ignore - runs in browser context
                href = window.location.origin + href;
            }
            items.push({ name, price, originalPrice, href });
            // Extract image URL from product card
            let imageUrl = null;
            const imgSelectors = [
                'img[src*="images.dutchie.com"]',
                'img[src*="dutchie"]',
                'img[data-testid*="product"]',
                'img[class*="product"]',
                'img[class*="Product"]',
                'picture img',
                'img'
            ];
            for (const sel of imgSelectors) {
                const img = card.querySelector(sel);
                if (img) {
                    const src = img.getAttribute('src') || img.getAttribute('data-src') || '';
                    if (src && (src.includes('dutchie.com') || src.includes('images.'))) {
                        imageUrl = src;
                        break;
                    }
                }
            }
            items.push({ name, price, originalPrice, href, imageUrl });
        }
        catch (err) {
            console.error('Error parsing product card:', err);
@@ -416,6 +437,7 @@ class DutchieSpider {
                productName: card.name,
                productPrice: card.price,
                productOriginalPrice: card.originalPrice,
                productImageUrl: card.imageUrl, // Pass image from category page
                requiresBrowser: true
            },
            callback: this.parseProductPage.bind(this)
@@ -436,20 +458,26 @@ class DutchieSpider {
        const details = await page.evaluate(() => {
            // @ts-ignore - runs in browser context
            const allText = document.body.textContent || '';
            // Extract image
            // Extract image - expanded selectors for better coverage
            let fullSizeImage = null;
            const mainImageSelectors = [
                'img[src*="images.dutchie.com"]',
                'img[src*="dutchie"]',
                'img[class*="ProductImage"]',
                'img[class*="product-image"]',
                'img[class*="Product"]',
                '[class*="ImageGallery"] img',
                'main img',
                'img[src*="images.dutchie.com"]'
                '[data-testid*="product"] img',
                '[data-testid*="image"] img',
                'picture img',
                'main img'
            ];
            for (const sel of mainImageSelectors) {
                // @ts-ignore - runs in browser context
                const img = document.querySelector(sel);
                if (img?.src && img.src.includes('dutchie.com')) {
                    fullSizeImage = img.src;
                const src = img?.src || img?.getAttribute('data-src') || '';
                if (src && (src.includes('dutchie.com') || src.includes('images.'))) {
                    fullSizeImage = src;
                    break;
                }
            }
@@ -546,6 +574,8 @@ class DutchieSpider {
            };
        });
        // Create product item
        // Use image from product page, fallback to category page image
        const imageUrl = details.fullSizeImage || response.request.metadata.productImageUrl || undefined;
        const product = {
            dutchieProductId: `${response.request.metadata.storeSlug}-${response.request.metadata.categorySlug}-${Date.now()}-${Math.random()}`,
            name: productName || 'Unknown Product',
@@ -556,7 +586,7 @@ class DutchieSpider {
            cbdPercentage: details.cbd || undefined,
            strainType: details.strainType || undefined,
            brand: details.brand || undefined,
            imageUrl: details.fullSizeImage || undefined,
            imageUrl: imageUrl,
            dutchieUrl: response.url,
            metadata: {
                terpenes: details.terpenes,
@@ -573,6 +603,17 @@ class DutchieSpider {
    async scrapeStore(storeId, parallel = 3) {
        logger_1.logger.info('scraper', `🏪 Starting store scrape: ${storeId} (${parallel} parallel scrapers)`);
        try {
            // Check if categories exist, if not, discover them first
            const categoryCountResult = await migrate_1.pool.query(`
                SELECT COUNT(*) as count
                FROM categories
                WHERE store_id = $1
            `, [storeId]);
            if (parseInt(categoryCountResult.rows[0].count) === 0) {
                logger_1.logger.info('scraper', 'No categories found - running discovery first');
                const { discoverCategories } = await Promise.resolve().then(() => __importStar(require('./index')));
                await discoverCategories(storeId);
            }
            // Get all leaf categories (no children)
            const categoriesResult = await migrate_1.pool.query(`
                SELECT c.id, c.name
9
backend/dist/scraper-v2/index.js
vendored
@@ -2,6 +2,13 @@
/**
 * Scraper V2 - Scrapy-inspired web scraping framework
 *
 * IMPORTANT: For Dutchie stores, DO NOT USE scrapeStore() from this module.
 * Dutchie crawling must go through the dutchie-az GraphQL pipeline:
 * src/dutchie-az/services/product-crawler.ts
 *
 * This scraper-v2 module uses DOM-based extraction which is unreliable
 * for Dutchie. The new dutchie-az pipeline uses GraphQL directly.
 *
 * Architecture:
 * - Engine: Main orchestrator
 * - Scheduler: Priority queue with deduplication
@@ -77,7 +84,7 @@ async function scrapeCategory(storeId, categoryId) {
/**
 * Scrape an entire store
 */
async function scrapeStore(storeId, parallel = 3) {
async function scrapeStore(storeId, parallel = 3, _userAgent) {
    const engine = new engine_2.ScraperEngine(1);
    const spider = new engine_2.DutchieSpider(engine);
    try {
156
backend/dist/scraper-v2/middlewares.js
vendored
@@ -3,13 +3,31 @@ Object.defineProperty(exports, "__esModule", { value: true });
exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
const types_1 = require("./types");
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
const proxy_1 = require("../services/proxy");
// Diverse, realistic user agents - updated for 2024/2025
const USER_AGENTS = [
    // Chrome on Windows (most common)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    // Chrome on Mac
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    // Chrome on Linux
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    // Firefox
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',
    // Safari
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    // Edge
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
];
function getRandomUserAgent() {
    return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
@@ -18,55 +36,100 @@ function sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
}
/**
 * User Agent Rotation Middleware
 * User Agent Rotation Middleware - rotates UA on each request for better evasion
 */
class UserAgentMiddleware {
    name = 'UserAgentMiddleware';
    priority = 100;
    lastUserAgent = null;
    async processRequest(request) {
        if (!request.metadata.userAgent) {
            request.metadata.userAgent = getRandomUserAgent();
        // Always rotate UA on retries or bot detection
        const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
        if (!request.metadata.userAgent || forceRotation) {
            // Get a different UA than the last one used
            let newUA = getRandomUserAgent();
            let attempts = 0;
            while (newUA === this.lastUserAgent && attempts < 5) {
                newUA = getRandomUserAgent();
                attempts++;
            }
            request.metadata.userAgent = newUA;
            this.lastUserAgent = newUA;
            if (forceRotation) {
                logger_1.logger.debug('scraper', `🔄 Rotated User-Agent: ${newUA.substring(0, 50)}...`);
            }
        }
        return request;
    }
}
exports.UserAgentMiddleware = UserAgentMiddleware;
// Domains that should skip proxy (datacenter IPs are blocked)
const PROXY_SKIP_DOMAINS = [
    'dutchie.com',
];
function shouldSkipProxy(url) {
    try {
        const urlObj = new URL(url);
        return PROXY_SKIP_DOMAINS.some(domain => urlObj.hostname.includes(domain));
    }
    catch {
        return false;
    }
}
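Matching is a hostname substring test, so subdomains of a skip domain are covered, and unparseable URLs fail safe to false:

shouldSkipProxy('https://dutchie.com/embedded-menu/x'); // true
shouldSkipProxy('https://api.dutchie.com/graphql');     // true (hostname substring match)
shouldSkipProxy('https://weedmaps.com/listing');        // false
shouldSkipProxy('not a url');                           // false (new URL() throws, caught)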
|
||||
/**
|
||||
* Proxy Rotation Middleware
|
||||
* Proxy Rotation Middleware - uses the central proxy service with timeout handling
|
||||
*/
|
||||
class ProxyMiddleware {
|
||||
name = 'ProxyMiddleware';
|
||||
priority = 90;
|
||||
async getActiveProxy() {
|
||||
try {
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT host, port, protocol, username, password
|
||||
FROM proxies
|
||||
WHERE active = true AND is_anonymous = true
|
||||
ORDER BY RANDOM()
|
||||
LIMIT 1
|
||||
`);
|
||||
if (result.rows.length === 0) {
|
||||
return null;
|
||||
}
|
||||
return result.rows[0];
|
||||
}
|
||||
catch (error) {
|
||||
logger_1.logger.error('scraper', `Failed to get proxy: ${error}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
currentProxyId = null;
|
||||
async processRequest(request) {
|
||||
// Only add proxy if not already set
|
||||
if (!request.metadata.proxy && request.retryCount > 0) {
|
||||
// Use proxy on retries
|
||||
request.metadata.proxy = await this.getActiveProxy();
|
||||
if (request.metadata.proxy) {
|
||||
logger_1.logger.debug('scraper', `Using proxy for retry: ${request.metadata.proxy.host}:${request.metadata.proxy.port}`);
|
||||
// Skip proxy for domains that block datacenter IPs
|
||||
if (shouldSkipProxy(request.url)) {
|
||||
logger_1.logger.info('scraper', `⏭️ Skipping proxy for ${new URL(request.url).hostname} (datacenter IPs blocked)`);
|
||||
return request;
|
||||
}
|
||||
// Always try to use a proxy from the central proxy service
|
||||
// The service handles bot detection timeouts automatically
|
||||
const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
|
||||
if (!request.metadata.proxy || forceRotation) {
|
||||
// Get proxy from central service - it handles timeouts automatically
|
||||
const proxy = await (0, proxy_1.getActiveProxy)();
|
||||
if (proxy) {
|
||||
request.metadata.proxy = {
|
||||
host: proxy.host,
|
||||
port: proxy.port,
|
||||
protocol: proxy.protocol,
|
||||
username: proxy.username,
|
||||
password: proxy.password,
|
||||
};
|
||||
request.metadata.proxyId = proxy.id;
|
||||
this.currentProxyId = proxy.id;
|
||||
const reason = forceRotation ? 'rotation' : 'initial';
|
||||
logger_1.logger.info('scraper', `🔄 Using proxy (${reason}): ${proxy.protocol}://${proxy.host}:${proxy.port}`);
|
||||
}
|
||||
else {
|
||||
logger_1.logger.warn('scraper', '⚠️ No proxy available - running without proxy');
|
||||
}
|
||||
}
|
||||
return request;
|
||||
}
|
||||
async processResponse(response) {
|
||||
// If bot detection was triggered, put the proxy in timeout
|
||||
if (response.request.metadata.botDetected && response.request.metadata.proxyId) {
|
||||
(0, proxy_1.putProxyInTimeout)(response.request.metadata.proxyId, 'Bot detection triggered');
|
||||
logger_1.logger.info('scraper', `🚫 Proxy ${response.request.metadata.proxyId} put in timeout due to bot detection`);
|
||||
}
|
||||
return response;
|
||||
}
|
||||
async processError(error, request) {
|
||||
// If bot detection error, put proxy in timeout
|
||||
if ((0, proxy_1.isBotDetectionError)(error.message) && request.metadata.proxyId) {
|
||||
(0, proxy_1.putProxyInTimeout)(request.metadata.proxyId, error.message);
|
||||
logger_1.logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`);
|
||||
}
|
||||
return error;
|
||||
}
|
||||
}
|
||||
exports.ProxyMiddleware = ProxyMiddleware;
|
||||
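The priority values (UserAgentMiddleware 100, ProxyMiddleware 90, BotDetectionMiddleware 60 further down) suggest the middleware engine runs request hooks from highest priority to lowest; that ordering is an assumption here, since the engine itself is not part of this diff. A hedged sketch of such a dispatcher:

    // Assumed engine behavior: highest-priority middleware sees the request first.
    async function runRequestMiddlewares(middlewares, request) {
        const ordered = [...middlewares].sort((a, b) => b.priority - a.priority);
        for (const mw of ordered) {
            if (mw.processRequest) {
                request = await mw.processRequest(request);
            }
        }
        return request;
    }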
/**
@@ -165,13 +228,15 @@ class RetryMiddleware {
}
exports.RetryMiddleware = RetryMiddleware;
/**
 * Bot Detection Middleware
 * Bot Detection Middleware - detects bot blocking and triggers fingerprint rotation
 */
class BotDetectionMiddleware {
    name = 'BotDetectionMiddleware';
    priority = 60;
    detectedCount = 0;
    DETECTION_THRESHOLD = 3;
    // Export for use by other middlewares
    static shouldRotateFingerprint = false;
    async processResponse(response) {
        const content = typeof response.content === 'string'
            ? response.content
@@ -183,14 +248,24 @@ class BotDetectionMiddleware {
            /access denied/i,
            /you have been blocked/i,
            /unusual traffic/i,
            /robot/i
            /robot/i,
            /verify.*human/i,
            /security check/i,
            /please wait/i,
            /checking your browser/i,
            /ray id/i
        ];
        const detected = botIndicators.some(pattern => pattern.test(content));
        if (detected) {
            this.detectedCount++;
            BotDetectionMiddleware.shouldRotateFingerprint = true;
            logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
            logger_1.logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request');
            // Mark the request for rotation on retry
            response.request.metadata.botDetected = true;
            response.request.metadata.needsNewBrowser = true;
            if (this.detectedCount >= this.DETECTION_THRESHOLD) {
                const error = new Error('Bot detection threshold reached');
                const error = new Error('Bot detection threshold reached - rotating fingerprint');
                error.type = types_1.ErrorType.BOT_DETECTION;
                error.retryable = true;
                error.request = response.request;
@@ -200,9 +275,22 @@ class BotDetectionMiddleware {
        else {
            // Gradually decrease detection count on successful requests
            this.detectedCount = Math.max(0, this.detectedCount - 0.5);
            BotDetectionMiddleware.shouldRotateFingerprint = false;
        }
        return response;
    }
    async processError(error, request) {
        // If bot detection error, flag for rotation and allow retry
        if ('type' in error && error.type === types_1.ErrorType.BOT_DETECTION) {
            request.metadata.botDetected = true;
            request.metadata.needsNewBrowser = true;
            logger_1.logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry');
            // Add delay before retry to avoid rate limiting
            await sleep(5000 + Math.random() * 5000);
            return null; // Return null to trigger retry
        }
        return error;
    }
}
exports.BotDetectionMiddleware = BotDetectionMiddleware;
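To illustrate the expanded indicator list above (example only, not part of the commit): a single pattern hit is enough to flag the response, which is why broad patterns like /please wait/i and /ray id/i catch Cloudflare-style interstitials:

    // Any one match marks the response as bot-detected.
    const indicators = [/access denied/i, /checking your browser/i, /ray id/i];
    const sampleHtml = '<title>Just a moment...</title> Checking your browser before accessing';
    console.log(indicators.some(p => p.test(sampleHtml))); // true -> metadata.botDetected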
/**
213
backend/dist/scraper-v2/pipelines.js
vendored
@@ -4,6 +4,7 @@ exports.PipelineEngine = exports.StatsPipeline = exports.DatabasePipeline = expo
const logger_1 = require("../services/logger");
const migrate_1 = require("../db/migrate");
const minio_1 = require("../utils/minio");
const product_normalizer_1 = require("../utils/product-normalizer");
/**
 * Validation Pipeline - ensures data quality
 */
@@ -138,82 +139,182 @@ class ImagePipeline {
}
exports.ImagePipeline = ImagePipeline;
/**
 * Database Pipeline - saves items to database
 * Generate a URL-safe slug from a product name
 */
function generateSlug(name) {
    return name
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, '-')
        .replace(/^-+|-+$/g, '')
        .substring(0, 400);
}
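For example, generateSlug collapses every run of non-alphanumeric characters into a single dash and trims dashes from both ends:

    console.log(generateSlug('Blue Dream | 3.5g (Indica)')); // "blue-dream-3-5g-indica"
    console.log(generateSlug('  OG Kush!  '));               // "og-kush"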
/**
 * Database Pipeline - saves items to database with improved matching
 *
 * MATCHING PRIORITY:
 * 1. external_id (dutchie_product_id) - exact match
 * 2. normalized name + brand + category - strong match
 * 3. normalized name + category - weak match (same product, different/missing brand)
 *
 * ALWAYS creates a snapshot after upsert for historical tracking.
 */
class DatabasePipeline {
    name = 'DatabasePipeline';
    priority = 10; // Low priority - runs last
    crawlId = null;
    setCrawlId(id) {
        this.crawlId = id;
    }
    async process(item, spider) {
        const client = await migrate_1.pool.connect();
        try {
            // Extract store and category from metadata (set by spider)
            const storeId = item.storeId;
            const categoryId = item.categoryId;
            const dispensaryId = item.dispensaryId;
            const categoryName = item.categoryName;
            // Generate normalized values for matching
            const nameNormalized = (0, product_normalizer_1.normalizeProductName)(item.name);
            const brandNormalized = (0, product_normalizer_1.normalizeBrandName)(item.brand);
            const slug = generateSlug(item.name);
            const externalId = item.dutchieProductId || null;
            if (!storeId || !categoryId) {
                logger_1.logger.error('pipeline', `Missing storeId or categoryId for ${item.name}`);
                return null;
            }
            // Check if product exists
            const existingResult = await client.query(`
                SELECT id, image_url, local_image_path
                FROM products
                WHERE store_id = $1 AND name = $2 AND category_id = $3
            `, [storeId, item.name, categoryId]);
            let productId = null;
            let localImagePath = null;
            let productId;
            if (existingResult.rows.length > 0) {
            let isNewProduct = false;
            // STEP 1: Try to match by external_id (most reliable)
            if (externalId) {
                const extMatch = await client.query(`
                    SELECT id, image_url, local_image_path
                    FROM products
                    WHERE store_id = $1 AND (external_id = $2 OR dutchie_product_id = $2)
                `, [storeId, externalId]);
                if (extMatch.rows.length > 0) {
                    productId = extMatch.rows[0].id;
                    localImagePath = extMatch.rows[0].local_image_path;
                    logger_1.logger.debug('pipeline', `Matched by external_id: ${item.name}`);
                }
            }
            // STEP 2: Try to match by normalized name + brand + category
            if (!productId) {
                const normMatch = await client.query(`
                    SELECT id, image_url, local_image_path
                    FROM products
                    WHERE store_id = $1
                      AND name_normalized = $2
                      AND brand_normalized = $3
                      AND category_id = $4
                `, [storeId, nameNormalized, brandNormalized, categoryId]);
                if (normMatch.rows.length > 0) {
                    productId = normMatch.rows[0].id;
                    localImagePath = normMatch.rows[0].local_image_path;
                    logger_1.logger.debug('pipeline', `Matched by normalized name+brand+category: ${item.name}`);
                }
            }
            // STEP 3: Fallback to normalized name + category only (weaker match)
            if (!productId) {
                const weakMatch = await client.query(`
                    SELECT id, image_url, local_image_path
                    FROM products
                    WHERE store_id = $1
                      AND name_normalized = $2
                      AND category_id = $3
                    LIMIT 1
                `, [storeId, nameNormalized, categoryId]);
                if (weakMatch.rows.length === 1) {
                    productId = weakMatch.rows[0].id;
                    localImagePath = weakMatch.rows[0].local_image_path;
                    logger_1.logger.debug('pipeline', `Matched by normalized name+category: ${item.name}`);
                }
            }
            // STEP 4: Final fallback - exact name match (legacy compatibility)
            if (!productId) {
                const exactMatch = await client.query(`
                    SELECT id, image_url, local_image_path
                    FROM products
                    WHERE store_id = $1 AND name = $2 AND category_id = $3
                `, [storeId, item.name, categoryId]);
                if (exactMatch.rows.length > 0) {
                    productId = exactMatch.rows[0].id;
                    localImagePath = exactMatch.rows[0].local_image_path;
                    logger_1.logger.debug('pipeline', `Matched by exact name: ${item.name}`);
                }
            }
            // UPDATE or INSERT
            if (productId) {
                // Update existing product
                productId = existingResult.rows[0].id;
                localImagePath = existingResult.rows[0].local_image_path;
                await client.query(`
                    UPDATE products
                    SET name = $1, description = $2, price = $3,
                        strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
                        brand = $7, weight = $8, image_url = $9, dutchie_url = $10,
                        brand = $7, weight = $8, image_url = COALESCE($9, image_url), dutchie_url = $10,
                        in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
                        updated_at = CURRENT_TIMESTAMP
                        updated_at = CURRENT_TIMESTAMP, dispensary_id = $13, slug = COALESCE(slug, $14),
                        name_normalized = $15, brand_normalized = $16,
                        external_id = COALESCE(external_id, $17), source_platform = COALESCE(source_platform, 'dutchie')
                    WHERE id = $12
                `, [
                    item.name, item.description, item.price,
                    item.strainType, item.thcPercentage, item.cbdPercentage,
                    item.brand, item.weight, item.imageUrl, item.dutchieUrl,
                    JSON.stringify(item.metadata || {}), productId
                    JSON.stringify(item.metadata || {}), productId, dispensaryId, slug,
                    nameNormalized, brandNormalized, externalId
                ]);
                logger_1.logger.debug('pipeline', `Updated product: ${item.name}`);
            }
            else {
                // Insert new product
                isNewProduct = true;
                const insertResult = await client.query(`
                    INSERT INTO products (
                        store_id, category_id, dutchie_product_id, name, description,
                        store_id, category_id, dispensary_id, dutchie_product_id, external_id,
                        slug, name, name_normalized, description,
                        price, strain_type, thc_percentage, cbd_percentage,
                        brand, weight, image_url, dutchie_url, in_stock, metadata
                    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, true, $14)
                        brand, brand_normalized, weight, image_url, dutchie_url, in_stock, metadata,
                        source_platform
                    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, true, $19, 'dutchie')
                    RETURNING id
                `, [
                    storeId, categoryId, item.dutchieProductId, item.name, item.description,
                    storeId, categoryId, dispensaryId, externalId, externalId,
                    slug, item.name, nameNormalized, item.description,
                    item.price, item.strainType, item.thcPercentage, item.cbdPercentage,
                    item.brand, item.weight, item.imageUrl, item.dutchieUrl,
                    item.brand, brandNormalized, item.weight, item.imageUrl, item.dutchieUrl,
                    JSON.stringify(item.metadata || {})
                ]);
                productId = insertResult.rows[0].id;
                logger_1.logger.debug('pipeline', `Inserted new product: ${item.name}`);
                logger_1.logger.debug('pipeline', `Inserted NEW product: ${item.name}`);
            }
            // Download image if needed
            if (item.imageUrl && !localImagePath) {
            // ALWAYS create a snapshot for historical tracking
            await this.createSnapshot(client, {
                productId: productId,
                dispensaryId,
                externalId,
                slug,
                item,
                categoryName
            });
            // Download image if needed (only for new products or missing local image)
            if (item.imageUrl && !localImagePath && productId) {
                try {
                    localImagePath = await (0, minio_1.uploadImageFromUrl)(item.imageUrl, productId);
                    const storeResult = await client.query('SELECT slug FROM stores WHERE id = $1', [storeId]);
                    const storeSlug = storeResult.rows[0]?.slug || undefined;
                    const imageSizes = await (0, minio_1.uploadImageFromUrl)(item.imageUrl, productId, storeSlug);
                    localImagePath = imageSizes.thumbnail;
                    await client.query(`
                        UPDATE products
                        SET local_image_path = $1
                        WHERE id = $2
                    `, [localImagePath, productId]);
                        UPDATE products SET local_image_path = $1 WHERE id = $2
                    `, [imageSizes.thumbnail, productId]);
                    logger_1.logger.debug('pipeline', `Downloaded image for: ${item.name}`);
                }
                catch (error) {
                    logger_1.logger.error('pipeline', `Failed to download image for ${item.name}: ${error}`);
                }
            }
            // Attach metadata for stats tracking
            item.isNewProduct = isNewProduct;
            item.productId = productId;
            return item;
        }
        catch (error) {
@@ -224,6 +325,64 @@ class DatabasePipeline {
            client.release();
        }
    }
    /**
     * Create a snapshot record for historical tracking
     */
    async createSnapshot(client, params) {
        try {
            // Only create snapshots if the table exists (graceful degradation)
            const tableExists = await client.query(`
                SELECT EXISTS (
                    SELECT FROM information_schema.tables
                    WHERE table_name = 'product_snapshots'
                )
            `);
            if (!tableExists.rows[0].exists) {
                return; // Snapshot table not yet created
            }
            const crawlId = this.crawlId || crypto.randomUUID();
            const { productId, dispensaryId, externalId, slug, item, categoryName } = params;
            await client.query(`
                INSERT INTO product_snapshots (
                    crawl_id, dispensary_id, external_product_id, product_slug,
                    name, brand, category, price, original_price, sale_price,
                    discount_type, discount_value, availability_status, stock_quantity,
                    thc_percentage, cbd_percentage, strain_type, weight, variant,
                    description, image_url, effects, terpenes, captured_at
                ) VALUES (
                    $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, NOW()
                )
            `, [
                crawlId,
                dispensaryId,
                externalId,
                slug,
                item.name,
                item.brand || null,
                categoryName || null,
                item.price || null,
                item.originalPrice || null,
                item.metadata?.salePrice || null,
                item.metadata?.discountType || null,
                item.metadata?.discountValue || null,
                'in_stock', // availability_status - if we scraped it, it's in stock
                item.metadata?.stockQuantity || null,
                item.thcPercentage || null,
                item.cbdPercentage || null,
                item.strainType || null,
                item.weight || null,
                item.metadata?.variant || null,
                item.description || null,
                item.imageUrl || null,
                item.metadata?.effects || null,
                item.metadata?.terpenes || null
            ]);
        }
        catch (error) {
            // Don't fail the whole pipeline if snapshot creation fails
            logger_1.logger.warn('pipeline', `Failed to create snapshot for ${params.item.name}: ${error}`);
        }
    }
}
exports.DatabasePipeline = DatabasePipeline;
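One detail of the UPDATE above worth noting: the two COALESCE calls point in opposite directions. image_url = COALESCE($9, image_url) prefers the freshly scraped value but never nulls out a stored image, while slug = COALESCE(slug, $14) keeps an existing slug and only fills in a missing one. A JavaScript paraphrase with illustrative values (the variable names are not from the commit):

    // Illustrative values only (not from the commit).
    const scrapedImageUrl = null;                                   // scrape returned no image this run
    const existingImageUrl = 'https://images.example.com/old.jpg';  // hypothetical stored value
    const nextImageUrl = scrapedImageUrl ?? existingImageUrl;       // fresh value wins, but null never overwrites
    const existingSlug = 'blue-dream-3-5g';
    const generatedSlug = 'blue-dream-3-5g-indica';
    const nextSlug = existingSlug ?? generatedSlug;                 // stored slug wins, so URLs stay stable
    console.log(nextImageUrl, nextSlug);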
/**
360
backend/dist/scrapers/dutchie-graphql-direct.js
vendored
Normal file
@@ -0,0 +1,360 @@
"use strict";
// ============================================================================
// DEPRECATED: This scraper writes to the LEGACY products table.
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
//
// New pipeline location: src/dutchie-az/services/product-crawler.ts
// - Uses fetch-based GraphQL (no Puppeteer needed)
// - Writes to isolated dutchie_az_* tables with snapshot model
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
// ============================================================================
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fetchAllDutchieProducts = fetchAllDutchieProducts;
exports.upsertProductsDirect = upsertProductsDirect;
exports.scrapeAllDutchieProducts = scrapeAllDutchieProducts;
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 * This scraper writes to the legacy products table, not the new dutchie_az tables.
 *
 * Makes direct GraphQL requests from within the browser context to:
 * 1. Bypass Cloudflare (using browser session)
 * 2. Fetch ALL products including out-of-stock (Status: null)
 * 3. Paginate through complete menu
 */
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const dutchie_graphql_1 = require("./dutchie-graphql");
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
// GraphQL persisted query hashes
const GRAPHQL_HASHES = {
    FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
    GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
};
/**
 * Fetch all products via in-page GraphQL requests
 * This includes both in-stock and out-of-stock items
 */
async function fetchAllDutchieProducts(menuUrl, options = {}) {
    const { headless = 'new', timeout = 90000, perPage = 100, includeOutOfStock = true, } = options;
    let browser;
    try {
        browser = await puppeteer_extra_1.default.launch({
            headless,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
            ],
        });
        const page = await browser.newPage();
        // Stealth configuration
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        await page.setViewport({ width: 1920, height: 1080 });
        await page.evaluateOnNewDocument(() => {
            Object.defineProperty(navigator, 'webdriver', { get: () => false });
            window.chrome = { runtime: {} };
        });
        // Navigate to menu page to establish session
        console.log('[DutchieGraphQL] Loading menu page to establish session...');
        await page.goto(menuUrl, {
            waitUntil: 'networkidle2',
            timeout,
        });
        // Get dispensary ID from page
        const dispensaryId = await page.evaluate(() => {
            const env = window.reactEnv;
            return env?.dispensaryId || env?.retailerId || '';
        });
        if (!dispensaryId) {
            throw new Error('Could not determine dispensaryId from page');
        }
        console.log(`[DutchieGraphQL] Dispensary ID: ${dispensaryId}`);
        // Fetch all products via in-page GraphQL requests
        const allProducts = [];
        let page_num = 0;
        let hasMore = true;
        while (hasMore) {
            console.log(`[DutchieGraphQL] Fetching page ${page_num} (perPage=${perPage})...`);
            const result = await page.evaluate(async (dispensaryId, page_num, perPage, includeOutOfStock, hash) => {
                const variables = {
                    includeEnterpriseSpecials: false,
                    productsFilter: {
                        dispensaryId,
                        pricingType: 'rec',
                        Status: includeOutOfStock ? null : 'Active', // null = include out-of-stock
                        types: [],
                        useCache: false, // Don't cache to get fresh data
                        isDefaultSort: true,
                        sortBy: 'popularSortIdx',
                        sortDirection: 1,
                        bypassOnlineThresholds: true,
                        isKioskMenu: false,
                        removeProductsBelowOptionThresholds: false,
                    },
                    page: page_num,
                    perPage,
                };
                const qs = new URLSearchParams({
                    operationName: 'FilteredProducts',
                    variables: JSON.stringify(variables),
                    extensions: JSON.stringify({
                        persistedQuery: { version: 1, sha256Hash: hash },
                    }),
                });
                const response = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
                    method: 'GET',
                    headers: {
                        'content-type': 'application/json',
                        'apollographql-client-name': 'Marketplace (production)',
                    },
                    credentials: 'include', // Include cookies/session
                });
                if (!response.ok) {
                    throw new Error(`HTTP ${response.status}`);
                }
                return response.json();
            }, dispensaryId, page_num, perPage, includeOutOfStock, GRAPHQL_HASHES.FilteredProducts);
            if (result.errors) {
                console.error('[DutchieGraphQL] GraphQL errors:', result.errors);
                break;
            }
            const products = result?.data?.filteredProducts?.products || [];
            console.log(`[DutchieGraphQL] Page ${page_num}: ${products.length} products`);
            if (products.length === 0) {
                hasMore = false;
            }
            else {
                allProducts.push(...products);
                page_num++;
                // Safety limit
                if (page_num > 50) {
                    console.log('[DutchieGraphQL] Reached page limit, stopping');
                    hasMore = false;
                }
            }
        }
        // Count active vs inactive
        const activeCount = allProducts.filter((p) => p.Status === 'Active').length;
        const inactiveCount = allProducts.filter((p) => p.Status !== 'Active').length;
        console.log(`[DutchieGraphQL] Total: ${allProducts.length} products (${activeCount} active, ${inactiveCount} inactive)`);
        return {
            products: allProducts,
            dispensaryId,
            totalProducts: allProducts.length,
            activeCount,
            inactiveCount,
        };
    }
    finally {
        if (browser) {
            await browser.close();
        }
    }
}
/**
 * Upsert products to database
 */
async function upsertProductsDirect(pool, storeId, products) {
    const client = await pool.connect();
    let inserted = 0;
    let updated = 0;
    try {
        await client.query('BEGIN');
        for (const product of products) {
            const result = await client.query(`
                INSERT INTO products (
                    store_id, external_id, slug, name, enterprise_product_id,
                    brand, brand_external_id, brand_logo_url,
                    subcategory, strain_type, canonical_category,
                    price, rec_price, med_price, rec_special_price, med_special_price,
                    is_on_special, special_name, discount_percent, special_data,
                    sku, inventory_quantity, inventory_available, is_below_threshold, status,
                    thc_percentage, cbd_percentage, cannabinoids,
                    weight_mg, net_weight_value, net_weight_unit, options, raw_options,
                    image_url, additional_images,
                    is_featured, medical_only, rec_only,
                    source_created_at, source_updated_at,
                    description, raw_data,
                    dutchie_url, last_seen_at, updated_at
                )
                VALUES (
                    $1, $2, $3, $4, $5,
                    $6, $7, $8,
                    $9, $10, $11,
                    $12, $13, $14, $15, $16,
                    $17, $18, $19, $20,
                    $21, $22, $23, $24, $25,
                    $26, $27, $28,
                    $29, $30, $31, $32, $33,
                    $34, $35,
                    $36, $37, $38,
                    $39, $40,
                    $41, $42,
                    '', NOW(), NOW()
                )
                ON CONFLICT (store_id, slug) DO UPDATE SET
                    name = EXCLUDED.name,
                    enterprise_product_id = EXCLUDED.enterprise_product_id,
                    brand = EXCLUDED.brand,
                    brand_external_id = EXCLUDED.brand_external_id,
                    brand_logo_url = EXCLUDED.brand_logo_url,
                    subcategory = EXCLUDED.subcategory,
                    strain_type = EXCLUDED.strain_type,
                    canonical_category = EXCLUDED.canonical_category,
                    price = EXCLUDED.price,
                    rec_price = EXCLUDED.rec_price,
                    med_price = EXCLUDED.med_price,
                    rec_special_price = EXCLUDED.rec_special_price,
                    med_special_price = EXCLUDED.med_special_price,
                    is_on_special = EXCLUDED.is_on_special,
                    special_name = EXCLUDED.special_name,
                    discount_percent = EXCLUDED.discount_percent,
                    special_data = EXCLUDED.special_data,
                    sku = EXCLUDED.sku,
                    inventory_quantity = EXCLUDED.inventory_quantity,
                    inventory_available = EXCLUDED.inventory_available,
                    is_below_threshold = EXCLUDED.is_below_threshold,
                    status = EXCLUDED.status,
                    thc_percentage = EXCLUDED.thc_percentage,
                    cbd_percentage = EXCLUDED.cbd_percentage,
                    cannabinoids = EXCLUDED.cannabinoids,
                    weight_mg = EXCLUDED.weight_mg,
                    net_weight_value = EXCLUDED.net_weight_value,
                    net_weight_unit = EXCLUDED.net_weight_unit,
                    options = EXCLUDED.options,
                    raw_options = EXCLUDED.raw_options,
                    image_url = EXCLUDED.image_url,
                    additional_images = EXCLUDED.additional_images,
                    is_featured = EXCLUDED.is_featured,
                    medical_only = EXCLUDED.medical_only,
                    rec_only = EXCLUDED.rec_only,
                    source_created_at = EXCLUDED.source_created_at,
                    source_updated_at = EXCLUDED.source_updated_at,
                    description = EXCLUDED.description,
                    raw_data = EXCLUDED.raw_data,
                    last_seen_at = NOW(),
                    updated_at = NOW()
                RETURNING (xmax = 0) AS was_inserted
            `, [
                storeId,
                product.external_id,
                product.slug,
                product.name,
                product.enterprise_product_id,
                product.brand,
                product.brand_external_id,
                product.brand_logo_url,
                product.subcategory,
                product.strain_type,
                product.canonical_category,
                product.price,
                product.rec_price,
                product.med_price,
                product.rec_special_price,
                product.med_special_price,
                product.is_on_special,
                product.special_name,
                product.discount_percent,
                product.special_data ? JSON.stringify(product.special_data) : null,
                product.sku,
                product.inventory_quantity,
                product.inventory_available,
                product.is_below_threshold,
                product.status,
                product.thc_percentage,
                product.cbd_percentage,
                product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
                product.weight_mg,
                product.net_weight_value,
                product.net_weight_unit,
                product.options,
                product.raw_options,
                product.image_url,
                product.additional_images,
                product.is_featured,
                product.medical_only,
                product.rec_only,
                product.source_created_at,
                product.source_updated_at,
                product.description,
                product.raw_data ? JSON.stringify(product.raw_data) : null,
            ]);
            if (result.rows[0]?.was_inserted) {
                inserted++;
            }
            else {
                updated++;
            }
        }
        await client.query('COMMIT');
        return { inserted, updated };
    }
    catch (error) {
        await client.query('ROLLBACK');
        throw error;
    }
    finally {
        client.release();
    }
}
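The RETURNING (xmax = 0) AS was_inserted clause in the upsert above is a Postgres idiom: a freshly inserted row has system column xmax equal to 0, while a row rewritten by ON CONFLICT ... DO UPDATE carries a nonzero xmax, so one statement both upserts and reports which branch ran. A minimal standalone demo against a hypothetical table:

    // Hypothetical table: CREATE TABLE demo_items (slug text PRIMARY KEY, name text);
    const res = await client.query(`
        INSERT INTO demo_items (slug, name) VALUES ($1, $2)
        ON CONFLICT (slug) DO UPDATE SET name = EXCLUDED.name
        RETURNING (xmax = 0) AS was_inserted
    `, ['blue-dream', 'Blue Dream']);
    console.log(res.rows[0].was_inserted); // true on first run, false on re-run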
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 * This function is disabled and will throw an error if called.
 * Main entry point - scrape all products including out-of-stock
 */
async function scrapeAllDutchieProducts(pool, storeId, menuUrl) {
    // DEPRECATED: Throw error to prevent accidental use
    throw new Error('DEPRECATED: scrapeAllDutchieProducts() is deprecated. ' +
        'Use src/dutchie-az/services/product-crawler.ts instead. ' +
        'This scraper writes to the legacy products table.');
    // Original code below is unreachable but kept for reference
    try {
        console.log(`[DutchieGraphQL] Scraping ALL products (including out-of-stock): ${menuUrl}`);
        // Fetch all products via direct GraphQL
        const { products, totalProducts, activeCount, inactiveCount } = await fetchAllDutchieProducts(menuUrl, {
            includeOutOfStock: true,
            perPage: 100,
        });
        if (products.length === 0) {
            return {
                success: false,
                totalProducts: 0,
                activeCount: 0,
                inactiveCount: 0,
                inserted: 0,
                updated: 0,
                error: 'No products returned from GraphQL',
            };
        }
        // Normalize products
        const normalized = products.map(dutchie_graphql_1.normalizeDutchieProduct);
        // Upsert to database
        const { inserted, updated } = await upsertProductsDirect(pool, storeId, normalized);
        console.log(`[DutchieGraphQL] Complete: ${totalProducts} products (${activeCount} active, ${inactiveCount} inactive)`);
        console.log(`[DutchieGraphQL] Database: ${inserted} inserted, ${updated} updated`);
        return {
            success: true,
            totalProducts,
            activeCount,
            inactiveCount,
            inserted,
            updated,
        };
    }
    catch (error) {
        console.error(`[DutchieGraphQL] Error:`, error.message);
        return {
            success: false,
            totalProducts: 0,
            activeCount: 0,
            inactiveCount: 0,
            inserted: 0,
            updated: 0,
            error: error.message,
        };
    }
}
446
backend/dist/scrapers/dutchie-graphql.js
vendored
Normal file
@@ -0,0 +1,446 @@
"use strict";
// ============================================================================
// DEPRECATED: This scraper writes to the LEGACY products table.
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
//
// New pipeline location: src/dutchie-az/services/product-crawler.ts
// - Uses fetch-based GraphQL (no Puppeteer needed)
// - Writes to isolated dutchie_az_* tables with snapshot model
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
//
// The normalizer functions in this file (normalizeDutchieProduct) may still
// be imported for reference, but do NOT call scrapeDutchieMenu() or upsertProducts().
// ============================================================================
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeDutchieProduct = normalizeDutchieProduct;
exports.fetchDutchieMenuViaPuppeteer = fetchDutchieMenuViaPuppeteer;
exports.upsertProducts = upsertProducts;
exports.scrapeDutchieMenu = scrapeDutchieMenu;
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 * This scraper writes to the legacy products table, not the new dutchie_az tables.
 *
 * Fetches product data via Puppeteer interception of Dutchie's GraphQL API.
 * This bypasses Cloudflare by using a real browser to load the menu page.
 *
 * GraphQL Operations:
 * - FilteredProducts: Returns paginated product list with full details
 * - GetAddressBasedDispensaryData: Resolves dispensary cName to dispensaryId
 */
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
// =====================================================
// NORMALIZER: Dutchie GraphQL → DB Schema
// =====================================================
function normalizeDutchieProduct(product) {
    // Extract first special if exists
    const saleSpecial = product.specialData?.saleSpecials?.[0];
    // Calculate inventory from POSMetaData children
    const children = product.POSMetaData?.children || [];
    const totalQuantity = children.reduce((sum, c) => sum + (c.quantity || 0), 0);
    const availableQuantity = children.reduce((sum, c) => sum + (c.quantityAvailable || 0), 0);
    // Parse timestamps
    let sourceCreatedAt;
    if (product.createdAt) {
        // createdAt is a timestamp string like "1729044510543"
        const ts = parseInt(product.createdAt, 10);
        if (!isNaN(ts)) {
            sourceCreatedAt = new Date(ts);
        }
    }
    let sourceUpdatedAt;
    if (product.updatedAt) {
        sourceUpdatedAt = new Date(product.updatedAt);
    }
    return {
        // Identity
        external_id: product._id || product.id,
        slug: product.cName,
        name: product.Name,
        enterprise_product_id: product.enterpriseProductId,
        // Brand
        brand: product.brandName || product.brand?.name,
        brand_external_id: product.brandId || product.brand?.id,
        brand_logo_url: product.brandLogo || product.brand?.imageUrl,
        // Category
        subcategory: product.subcategory,
        strain_type: product.strainType,
        canonical_category: product.POSMetaData?.canonicalCategory,
        // Pricing
        price: product.Prices?.[0],
        rec_price: product.recPrices?.[0],
        med_price: product.medicalPrices?.[0],
        rec_special_price: product.recSpecialPrices?.[0],
        med_special_price: product.medicalSpecialPrices?.[0],
        // Specials
        is_on_special: product.special === true,
        special_name: saleSpecial?.specialName,
        discount_percent: saleSpecial?.percentDiscount ? saleSpecial.discount : undefined,
        special_data: product.specialData,
        // Inventory
        sku: product.POSMetaData?.canonicalSKU,
        inventory_quantity: totalQuantity || undefined,
        inventory_available: availableQuantity || undefined,
        is_below_threshold: product.isBelowThreshold === true,
        status: product.Status,
        // Potency
        thc_percentage: product.THCContent?.range?.[0],
        cbd_percentage: product.CBDContent?.range?.[0],
        cannabinoids: product.cannabinoidsV2,
        // Weight/Options
        weight_mg: product.weight,
        net_weight_value: product.measurements?.netWeight?.values?.[0],
        net_weight_unit: product.measurements?.netWeight?.unit,
        options: product.Options,
        raw_options: product.rawOptions,
        // Images
        image_url: product.Image,
        additional_images: product.images?.length ? product.images : undefined,
        // Flags
        is_featured: product.featured === true,
        medical_only: product.medicalOnly === true,
        rec_only: product.recOnly === true,
        // Timestamps
        source_created_at: sourceCreatedAt,
        source_updated_at: sourceUpdatedAt,
        // Description
        description: typeof product.description === 'string' ? product.description : undefined,
        // Raw
        raw_data: product,
    };
}
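A minimal input/output sketch of the normalizer above (the raw object is fabricated; _id, cName, Name, brandName, Prices, and Status are among the GraphQL fields the function actually reads):

    const raw = {
        _id: 'abc123',
        cName: 'blue-dream-3-5g',
        Name: 'Blue Dream 3.5g',
        brandName: 'Example Farms', // hypothetical brand
        Prices: [35],
        Status: 'Active',
    };
    const row = normalizeDutchieProduct(raw);
    console.log(row.external_id, row.slug, row.price); // 'abc123' 'blue-dream-3-5g' 35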
async function fetchDutchieMenuViaPuppeteer(menuUrl, options = {}) {
    const { headless = 'new', timeout = 90000, maxScrolls = 30, // Increased for full menu capture
     } = options;
    let browser;
    const capturedProducts = [];
    let dispensaryId = '';
    try {
        browser = await puppeteer_extra_1.default.launch({
            headless,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
            ],
        });
        const page = await browser.newPage();
        // Stealth configuration
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        await page.setViewport({ width: 1920, height: 1080 });
        await page.evaluateOnNewDocument(() => {
            Object.defineProperty(navigator, 'webdriver', { get: () => false });
            window.chrome = { runtime: {} };
        });
        // Track seen product IDs to avoid duplicates
        const seenIds = new Set();
        // Intercept GraphQL responses
        page.on('response', async (response) => {
            const url = response.url();
            if (!url.includes('graphql'))
                return;
            try {
                const contentType = response.headers()['content-type'] || '';
                if (!contentType.includes('application/json'))
                    return;
                const data = await response.json();
                // Capture dispensary ID
                if (data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId) {
                    dispensaryId = data.data.getAddressBasedDispensaryData.dispensaryData.dispensaryId;
                }
                // Capture products from FilteredProducts
                if (data?.data?.filteredProducts?.products) {
                    const products = data.data.filteredProducts.products;
                    for (const product of products) {
                        if (!seenIds.has(product._id)) {
                            seenIds.add(product._id);
                            capturedProducts.push(product);
                        }
                    }
                }
            }
            catch {
                // Ignore parse errors
            }
        });
        // Navigate to menu
        console.log('[DutchieGraphQL] Loading menu page...');
        await page.goto(menuUrl, {
            waitUntil: 'networkidle2',
            timeout,
        });
        // Get dispensary ID from window.reactEnv if not captured
        if (!dispensaryId) {
            dispensaryId = await page.evaluate(() => {
                const env = window.reactEnv;
                return env?.dispensaryId || env?.retailerId || '';
            });
        }
        // Helper function to scroll through a page until no more products load
        async function scrollToLoadAll(maxScrollAttempts = maxScrolls) {
            let scrollCount = 0;
            let previousCount = 0;
            let noNewProductsCount = 0;
            while (scrollCount < maxScrollAttempts && noNewProductsCount < 3) {
                await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
                await new Promise((r) => setTimeout(r, 1500));
                const currentCount = seenIds.size;
                if (currentCount === previousCount) {
                    noNewProductsCount++;
                }
                else {
                    noNewProductsCount = 0;
                }
                previousCount = currentCount;
                scrollCount++;
            }
        }
        // First, scroll through the main page (all products)
        console.log('[DutchieGraphQL] Scrolling main page...');
        await scrollToLoadAll();
        console.log(`[DutchieGraphQL] After main page: ${seenIds.size} products`);
        // Get category links from the navigation
        const categoryLinks = await page.evaluate(() => {
            const links = [];
            // Look for category navigation links
            const navLinks = document.querySelectorAll('a[href*="/products/"]');
            navLinks.forEach((link) => {
                const href = link.href;
                if (href && !links.includes(href)) {
                    links.push(href);
                }
            });
            return links;
        });
        console.log(`[DutchieGraphQL] Found ${categoryLinks.length} category links`);
        // Visit each category page to capture all products
        for (const categoryUrl of categoryLinks) {
            try {
                console.log(`[DutchieGraphQL] Visiting category: ${categoryUrl.split('/').pop()}`);
                await page.goto(categoryUrl, {
                    waitUntil: 'networkidle2',
                    timeout: 30000,
                });
                await scrollToLoadAll(15); // Fewer scrolls per category
                console.log(`[DutchieGraphQL] Total products: ${seenIds.size}`);
            }
            catch (e) {
                console.log(`[DutchieGraphQL] Category error: ${e.message}`);
            }
        }
        // Wait for any final responses
        await new Promise((r) => setTimeout(r, 2000));
        return {
            products: capturedProducts,
            dispensaryId,
            menuUrl,
        };
    }
    finally {
        if (browser) {
            await browser.close();
        }
    }
}
// =====================================================
// DATABASE OPERATIONS
// =====================================================
async function upsertProducts(pool, storeId, products) {
    const client = await pool.connect();
    let inserted = 0;
    let updated = 0;
    try {
        await client.query('BEGIN');
        for (const product of products) {
            // Upsert product
            const result = await client.query(`
                INSERT INTO products (
                    store_id, external_id, slug, name, enterprise_product_id,
                    brand, brand_external_id, brand_logo_url,
                    subcategory, strain_type, canonical_category,
                    price, rec_price, med_price, rec_special_price, med_special_price,
                    is_on_special, special_name, discount_percent, special_data,
                    sku, inventory_quantity, inventory_available, is_below_threshold, status,
                    thc_percentage, cbd_percentage, cannabinoids,
                    weight_mg, net_weight_value, net_weight_unit, options, raw_options,
                    image_url, additional_images,
                    is_featured, medical_only, rec_only,
                    source_created_at, source_updated_at,
                    description, raw_data,
                    dutchie_url, last_seen_at, updated_at
                )
                VALUES (
                    $1, $2, $3, $4, $5,
                    $6, $7, $8,
                    $9, $10, $11,
                    $12, $13, $14, $15, $16,
                    $17, $18, $19, $20,
                    $21, $22, $23, $24, $25,
                    $26, $27, $28,
                    $29, $30, $31, $32, $33,
                    $34, $35,
                    $36, $37, $38,
                    $39, $40,
                    $41, $42,
                    '', NOW(), NOW()
                )
                ON CONFLICT (store_id, slug) DO UPDATE SET
                    name = EXCLUDED.name,
                    enterprise_product_id = EXCLUDED.enterprise_product_id,
                    brand = EXCLUDED.brand,
                    brand_external_id = EXCLUDED.brand_external_id,
                    brand_logo_url = EXCLUDED.brand_logo_url,
                    subcategory = EXCLUDED.subcategory,
                    strain_type = EXCLUDED.strain_type,
                    canonical_category = EXCLUDED.canonical_category,
                    price = EXCLUDED.price,
                    rec_price = EXCLUDED.rec_price,
                    med_price = EXCLUDED.med_price,
                    rec_special_price = EXCLUDED.rec_special_price,
                    med_special_price = EXCLUDED.med_special_price,
                    is_on_special = EXCLUDED.is_on_special,
                    special_name = EXCLUDED.special_name,
                    discount_percent = EXCLUDED.discount_percent,
                    special_data = EXCLUDED.special_data,
                    sku = EXCLUDED.sku,
                    inventory_quantity = EXCLUDED.inventory_quantity,
                    inventory_available = EXCLUDED.inventory_available,
                    is_below_threshold = EXCLUDED.is_below_threshold,
                    status = EXCLUDED.status,
                    thc_percentage = EXCLUDED.thc_percentage,
                    cbd_percentage = EXCLUDED.cbd_percentage,
                    cannabinoids = EXCLUDED.cannabinoids,
                    weight_mg = EXCLUDED.weight_mg,
                    net_weight_value = EXCLUDED.net_weight_value,
                    net_weight_unit = EXCLUDED.net_weight_unit,
                    options = EXCLUDED.options,
                    raw_options = EXCLUDED.raw_options,
                    image_url = EXCLUDED.image_url,
                    additional_images = EXCLUDED.additional_images,
                    is_featured = EXCLUDED.is_featured,
                    medical_only = EXCLUDED.medical_only,
                    rec_only = EXCLUDED.rec_only,
                    source_created_at = EXCLUDED.source_created_at,
                    source_updated_at = EXCLUDED.source_updated_at,
                    description = EXCLUDED.description,
                    raw_data = EXCLUDED.raw_data,
                    last_seen_at = NOW(),
                    updated_at = NOW()
                RETURNING (xmax = 0) AS was_inserted
            `, [
                storeId,
                product.external_id,
                product.slug,
                product.name,
                product.enterprise_product_id,
                product.brand,
                product.brand_external_id,
                product.brand_logo_url,
                product.subcategory,
                product.strain_type,
                product.canonical_category,
                product.price,
                product.rec_price,
                product.med_price,
                product.rec_special_price,
                product.med_special_price,
                product.is_on_special,
                product.special_name,
                product.discount_percent,
                product.special_data ? JSON.stringify(product.special_data) : null,
                product.sku,
                product.inventory_quantity,
                product.inventory_available,
                product.is_below_threshold,
                product.status,
                product.thc_percentage,
                product.cbd_percentage,
                product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
                product.weight_mg,
                product.net_weight_value,
                product.net_weight_unit,
                product.options,
                product.raw_options,
                product.image_url,
                product.additional_images,
                product.is_featured,
                product.medical_only,
                product.rec_only,
                product.source_created_at,
                product.source_updated_at,
                product.description,
                product.raw_data ? JSON.stringify(product.raw_data) : null,
            ]);
            if (result.rows[0]?.was_inserted) {
                inserted++;
            }
            else {
                updated++;
            }
        }
        await client.query('COMMIT');
        return { inserted, updated };
    }
    catch (error) {
        await client.query('ROLLBACK');
        throw error;
    }
    finally {
        client.release();
    }
}
// =====================================================
// MAIN ENTRY POINT
// =====================================================
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 * This function is disabled and will throw an error if called.
 */
async function scrapeDutchieMenu(pool, storeId, menuUrl) {
    // DEPRECATED: Throw error to prevent accidental use
    throw new Error('DEPRECATED: scrapeDutchieMenu() is deprecated. ' +
        'Use src/dutchie-az/services/product-crawler.ts instead. ' +
        'This scraper writes to the legacy products table.');
    // Original code below is unreachable but kept for reference
    try {
        console.log(`[DutchieGraphQL] Scraping: ${menuUrl}`);
        // Fetch products via Puppeteer
        const { products, dispensaryId } = await fetchDutchieMenuViaPuppeteer(menuUrl);
        console.log(`[DutchieGraphQL] Captured ${products.length} products, dispensaryId: ${dispensaryId}`);
        if (products.length === 0) {
            return {
                success: false,
                productsFound: 0,
                inserted: 0,
                updated: 0,
                error: 'No products captured from GraphQL responses',
            };
        }
        // Normalize products
        const normalized = products.map(normalizeDutchieProduct);
        // Upsert to database
        const { inserted, updated } = await upsertProducts(pool, storeId, normalized);
        console.log(`[DutchieGraphQL] Upsert complete: ${inserted} inserted, ${updated} updated`);
        return {
            success: true,
            productsFound: products.length,
            inserted,
            updated,
        };
    }
    catch (error) {
        console.error(`[DutchieGraphQL] Error:`, error.message);
        return {
            success: false,
            productsFound: 0,
            inserted: 0,
            updated: 0,
            error: error.message,
        };
    }
}
85
backend/dist/scrapers/templates/dutchie.js
vendored
Normal file
@@ -0,0 +1,85 @@
"use strict";
// ============================================================================
// DEPRECATED: Dutchie now crawled via GraphQL only (see dutchie-az pipeline)
// DO NOT USE - This HTML scraper is unreliable and targets the legacy products table.
// All Dutchie crawling must go through: src/dutchie-az/services/product-crawler.ts
// ============================================================================
Object.defineProperty(exports, "__esModule", { value: true });
exports.dutchieTemplate = void 0;
exports.getTemplateForUrl = getTemplateForUrl;
const logger_1 = require("../../services/logger");
/**
 * @deprecated DEPRECATED - Dutchie HTML scraping is no longer supported.
 * Use the dutchie-az GraphQL pipeline instead: src/dutchie-az/services/product-crawler.ts
 * This template relied on unstable DOM selectors and wrote to legacy tables.
 */
exports.dutchieTemplate = {
    name: 'Dutchie Marketplace',
    urlPattern: /dutchie\.com\/dispensary\//,
    buildCategoryUrl: (baseUrl, category) => {
        // Remove trailing slash
        const base = baseUrl.replace(/\/$/, '');
        // Convert category name to URL-friendly slug
        const categorySlug = category.toLowerCase().replace(/\s+/g, '-');
        return `${base}/products/${categorySlug}`;
    },
    extractProducts: async (page) => {
        const products = [];
        try {
            // Wait for product cards to load
            await page.waitForSelector('a[data-testid="card-link"]', { timeout: 10000 }).catch(() => {
                logger_1.logger.warn('scraper', 'No product cards found with data-testid="card-link"');
            });
            // Get all product card links
            const productCards = await page.locator('a[href*="/product/"][data-testid="card-link"]').all();
            logger_1.logger.info('scraper', `Found ${productCards.length} Dutchie product cards`);
            for (const card of productCards) {
                try {
                    // Extract all data at once using evaluate for speed
                    const cardData = await card.evaluate((el) => {
                        const href = el.getAttribute('href') || '';
                        const img = el.querySelector('img');
                        const imageUrl = img ? img.getAttribute('src') || '' : '';
                        // Get all text nodes in order
                        const textElements = Array.from(el.querySelectorAll('*'))
                            .filter(el => el.textContent && el.children.length === 0)
                            .map(el => (el.textContent || '').trim())
                            .filter(text => text.length > 0);
                        const name = textElements[0] || '';
                        const brand = textElements[1] || '';
                        // Look for price
                        const priceMatch = el.textContent?.match(/\$(\d+(?:\.\d{2})?)/);
                        const price = priceMatch ? parseFloat(priceMatch[1]) : undefined;
                        return { href, imageUrl, name, brand, price };
                    });
                    if (cardData.name && cardData.href) {
                        products.push({
                            name: cardData.name,
                            brand: cardData.brand || undefined,
                            product_url: cardData.href.startsWith('http') ? cardData.href : `https://dutchie.com${cardData.href}`,
                            image_url: cardData.imageUrl || undefined,
                            price: cardData.price,
                            in_stock: true,
                        });
                    }
                }
                catch (err) {
                    logger_1.logger.warn('scraper', `Error extracting Dutchie product card: ${err}`);
                }
            }
        }
        catch (err) {
            logger_1.logger.error('scraper', `Error in Dutchie product extraction: ${err}`);
        }
        return products;
    },
};
/**
 * Get the appropriate scraper template based on URL
 */
function getTemplateForUrl(url) {
    if (exports.dutchieTemplate.urlPattern.test(url)) {
        return exports.dutchieTemplate;
    }
    return null;
}
287
backend/dist/scripts/backfill-store-dispensary.js
vendored
Normal file
@@ -0,0 +1,287 @@
#!/usr/bin/env npx tsx
"use strict";
/**
 * Backfill Store-Dispensary Mapping
 *
 * Links existing stores (scheduler) to dispensaries (master AZDHS directory)
 * by matching on name, city, and zip code.
 *
 * Usage:
 *   npx tsx src/scripts/backfill-store-dispensary.ts            # Preview matches
 *   npx tsx src/scripts/backfill-store-dispensary.ts --apply    # Apply matches
 *   npx tsx src/scripts/backfill-store-dispensary.ts --verbose  # Show all match details
 */
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const logger_1 = require("../services/logger");
const args = process.argv.slice(2);
const flags = {
    apply: args.includes('--apply'),
    verbose: args.includes('--verbose'),
    help: args.includes('--help') || args.includes('-h'),
};
/**
 * Normalize a store/dispensary name for comparison
 * Removes common suffixes, punctuation, and extra whitespace
 */
function normalizeName(name) {
    return name
        .toLowerCase()
        .replace(/\s*[-–—]\s*/g, ' ') // Normalize dashes to spaces
        .replace(/\s*(dispensary|cannabis|marijuana|weed|shop|store|llc|inc)\s*/gi, ' ')
        .replace(/['']/g, "'") // Normalize apostrophes
        .replace(/[^\w\s']/g, '') // Remove other punctuation
        .replace(/\s+/g, ' ') // Collapse whitespace
        .trim();
}
/**
 * Simple Levenshtein distance for fuzzy matching
 */
function levenshteinDistance(a, b) {
    const matrix = [];
    for (let i = 0; i <= b.length; i++) {
        matrix[i] = [i];
    }
    for (let j = 0; j <= a.length; j++) {
        matrix[0][j] = j;
    }
    for (let i = 1; i <= b.length; i++) {
        for (let j = 1; j <= a.length; j++) {
            if (b.charAt(i - 1) === a.charAt(j - 1)) {
                matrix[i][j] = matrix[i - 1][j - 1];
            }
            else {
                matrix[i][j] = Math.min(matrix[i - 1][j - 1] + 1, // substitution
                matrix[i][j - 1] + 1, // insertion
                matrix[i - 1][j] + 1 // deletion
                );
            }
        }
    }
    return matrix[b.length][a.length];
}
/**
 * Calculate similarity score (0-100)
 */
function similarityScore(a, b) {
    const maxLen = Math.max(a.length, b.length);
    if (maxLen === 0)
        return 100;
    const distance = levenshteinDistance(a, b);
    return Math.round((1 - distance / maxLen) * 100);
}
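A quick worked example of the scoring: 'sol flower' vs 'sol flowers' differ by one insertion, so the distance is 1 against a max length of 11, and Math.round((1 - 1/11) * 100) = 91, comfortably above the fuzzy-match floor of 70 used in findBestMatch below:

    console.log(levenshteinDistance('sol flower', 'sol flowers')); // 1
    console.log(similarityScore('sol flower', 'sol flowers'));     // 91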
/**
 * Find the best dispensary match for a store
 */
function findBestMatch(store, dispensaries) {
    const normalizedStoreName = normalizeName(store.name);
    const storeSlug = store.slug.toLowerCase();
    let bestMatch = {
        store,
        dispensary: null,
        matchType: 'none',
        score: 0,
    };
    for (const disp of dispensaries) {
        const normalizedDispName = normalizeName(disp.name);
        const normalizedCompanyName = disp.company_name ? normalizeName(disp.company_name) : '';
        const dispSlug = disp.slug.toLowerCase();
        // 1. Exact name match (case-insensitive)
        if (store.name.toLowerCase() === disp.name.toLowerCase()) {
            return {
                store,
                dispensary: disp,
                matchType: 'exact_name',
                score: 100,
            };
        }
        // 2. Normalized name match
        if (normalizedStoreName === normalizedDispName) {
            return {
                store,
                dispensary: disp,
                matchType: 'normalized_name',
                score: 95,
            };
        }
        // 3. Store name matches company name
        if (normalizedCompanyName && normalizedStoreName === normalizedCompanyName) {
            return {
                store,
                dispensary: disp,
                matchType: 'company_name',
                score: 90,
            };
        }
        // 4. Slug match
        if (storeSlug === dispSlug) {
            return {
                store,
                dispensary: disp,
                matchType: 'slug',
                score: 85,
            };
        }
        // 5. Fuzzy matching (only if score > 70)
        const nameScore = similarityScore(normalizedStoreName, normalizedDispName);
        const companyScore = normalizedCompanyName
            ? similarityScore(normalizedStoreName, normalizedCompanyName)
            : 0;
        const fuzzyScore = Math.max(nameScore, companyScore);
        if (fuzzyScore > bestMatch.score && fuzzyScore >= 70) {
            bestMatch = {
                store,
                dispensary: disp,
                matchType: 'fuzzy',
                score: fuzzyScore,
            };
        }
    }
    return bestMatch;
}
async function main() {
    if (flags.help) {
        console.log(`
Backfill Store-Dispensary Mapping

Links existing stores (scheduler) to dispensaries (master AZDHS directory)
by matching on name, company name, or slug similarity.

USAGE:
  npx tsx src/scripts/backfill-store-dispensary.ts [OPTIONS]

OPTIONS:
  --apply       Apply the mappings to the database (default: preview only)
  --verbose     Show detailed match information for all stores
  --help, -h    Show this help message

EXAMPLES:
  # Preview what would be matched
  npx tsx src/scripts/backfill-store-dispensary.ts

  # Apply the mappings
  npx tsx src/scripts/backfill-store-dispensary.ts --apply

  # Show verbose output
  npx tsx src/scripts/backfill-store-dispensary.ts --verbose
`);
        process.exit(0);
    }
    console.log('\n📦 Backfill Store-Dispensary Mapping');
    console.log('=====================================\n');
    try {
        // Fetch all stores without a dispensary_id
        const storesResult = await migrate_1.pool.query(`
            SELECT id, name, slug, dispensary_id
            FROM stores
            WHERE dispensary_id IS NULL
            ORDER BY name
        `);
        const unmappedStores = storesResult.rows;
        // Fetch all already-mapped stores for context
        const mappedResult = await migrate_1.pool.query(`
            SELECT id, name, slug, dispensary_id
            FROM stores
            WHERE dispensary_id IS NOT NULL
            ORDER BY name
        `);
        const mappedStores = mappedResult.rows;
        // Fetch all dispensaries
        const dispResult = await migrate_1.pool.query(`
            SELECT id, name, company_name, city, address, slug
            FROM dispensaries
            ORDER BY name
        `);
        const dispensaries = dispResult.rows;
        console.log(`📊 Current Status:`);
        console.log(`   Stores without dispensary_id: ${unmappedStores.length}`);
        console.log(`   Stores already mapped: ${mappedStores.length}`);
        console.log(`   Total dispensaries: ${dispensaries.length}\n`);
        if (unmappedStores.length === 0) {
            console.log('✅ All stores are already mapped to dispensaries!\n');
            await migrate_1.pool.end();
            process.exit(0);
        }
        // Find matches for each unmapped store
        const matches = [];
        const noMatches = [];
        for (const store of unmappedStores) {
            const match = findBestMatch(store, dispensaries);
|
||||
if (match.dispensary) {
|
||||
matches.push(match);
|
||||
}
|
||||
else {
|
||||
noMatches.push(store);
|
||||
}
|
||||
}
|
||||
// Sort matches by score (highest first)
|
||||
matches.sort((a, b) => b.score - a.score);
|
||||
// Display results
|
||||
console.log(`\n🔗 Matches Found: ${matches.length}`);
|
||||
console.log('----------------------------------\n');
|
||||
if (matches.length > 0) {
|
||||
// Group by match type
|
||||
const byType = {};
|
||||
for (const m of matches) {
|
||||
if (!byType[m.matchType])
|
||||
byType[m.matchType] = [];
|
||||
byType[m.matchType].push(m);
|
||||
}
|
||||
const typeLabels = {
|
||||
exact_name: '✅ Exact Name Match',
|
||||
normalized_name: '✅ Normalized Name Match',
|
||||
company_name: '🏢 Company Name Match',
|
||||
slug: '🔗 Slug Match',
|
||||
fuzzy: '🔍 Fuzzy Match',
|
||||
};
|
||||
for (const [type, results] of Object.entries(byType)) {
|
||||
console.log(`${typeLabels[type]} (${results.length}):`);
|
||||
for (const r of results) {
|
||||
const dispInfo = r.dispensary;
|
||||
console.log(` • "${r.store.name}" → "${dispInfo.name}" (${dispInfo.city}) [${r.score}%]`);
|
||||
}
|
||||
console.log('');
|
||||
}
|
||||
}
|
||||
if (noMatches.length > 0) {
|
||||
console.log(`\n❌ No Match Found: ${noMatches.length}`);
|
||||
console.log('----------------------------------\n');
|
||||
for (const store of noMatches) {
|
||||
console.log(` • "${store.name}" (slug: ${store.slug})`);
|
||||
}
|
||||
console.log('');
|
||||
}
|
||||
// Apply if requested
|
||||
if (flags.apply && matches.length > 0) {
|
||||
console.log('\n🔧 Applying mappings...\n');
|
||||
let updated = 0;
|
||||
for (const match of matches) {
|
||||
if (!match.dispensary)
|
||||
continue;
|
||||
await migrate_1.pool.query('UPDATE stores SET dispensary_id = $1 WHERE id = $2', [match.dispensary.id, match.store.id]);
|
||||
updated++;
|
||||
if (flags.verbose) {
|
||||
console.log(` ✓ Linked store ${match.store.id} to dispensary ${match.dispensary.id}`);
|
||||
}
|
||||
}
|
||||
console.log(`\n✅ Updated ${updated} stores with dispensary mappings\n`);
|
||||
logger_1.logger.info('system', `Backfill complete: linked ${updated} stores to dispensaries`);
|
||||
}
|
||||
else if (matches.length > 0 && !flags.apply) {
|
||||
console.log('\n💡 Run with --apply to update the database\n');
|
||||
}
|
||||
// Summary
|
||||
console.log('📈 Summary:');
|
||||
console.log(` Would match: ${matches.length} stores`);
|
||||
console.log(` No match: ${noMatches.length} stores`);
|
||||
console.log(` Match rate: ${Math.round((matches.length / unmappedStores.length) * 100)}%\n`);
|
||||
}
|
||||
catch (error) {
|
||||
console.error('Error:', error);
|
||||
process.exit(1);
|
||||
}
|
||||
finally {
|
||||
await migrate_1.pool.end();
|
||||
}
|
||||
}
|
||||
main().catch(console.error);
|
||||
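A quick illustration of how the matchers above compose. The names here are hypothetical, not rows from the stores table:

// Hypothetical inputs to the helpers defined in this script:
const a = normalizeName("Joe's Cannabis Dispensary - Phoenix"); // → "joe's phoenix"
const b = normalizeName("Joes Cannabis Phoenix");               // → "joes phoenix"
console.log(similarityScore(a, b)); // 92 for these inputs — clears the >= 70 fuzzy threshold in findBestMatch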
332
backend/dist/scripts/bootstrap-discovery.js
vendored
Normal file
@@ -0,0 +1,332 @@
#!/usr/bin/env npx tsx
"use strict";
/**
 * Bootstrap Discovery Script
 *
 * One-time (but reusable) bootstrap command that:
 * 1. Ensures every dispensary has a dispensary_crawl_schedule entry (4h default)
 * 2. Optionally runs runDispensaryOrchestrator for each dispensary
 *
 * Usage:
 *   npx tsx src/scripts/bootstrap-discovery.ts                   # Create schedules only
 *   npx tsx src/scripts/bootstrap-discovery.ts --run             # Create schedules + run orchestrator
 *   npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10  # Run for first 10 dispensaries
 *   npx tsx src/scripts/bootstrap-discovery.ts --dry-run         # Preview what would happen
 *   npx tsx src/scripts/bootstrap-discovery.ts --status          # Show current status only
 */
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
// Parse command line args
const args = process.argv.slice(2);
const flags = {
    run: args.includes('--run'),
    dryRun: args.includes('--dry-run'),
    status: args.includes('--status'),
    help: args.includes('--help') || args.includes('-h'),
    limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '0'),
    concurrency: parseInt(args.find(a => a.startsWith('--concurrency='))?.split('=')[1] || '3'),
    interval: parseInt(args.find(a => a.startsWith('--interval='))?.split('=')[1] || '240'),
    detectionOnly: args.includes('--detection-only'),
    productionOnly: args.includes('--production-only'),
    sandboxOnly: args.includes('--sandbox-only'),
};
async function showHelp() {
    console.log(`
Bootstrap Discovery - Initialize Dispensary Crawl System

USAGE:
  npx tsx src/scripts/bootstrap-discovery.ts [OPTIONS]

OPTIONS:
  --run              After creating schedules, run the orchestrator for each dispensary
  --dry-run          Show what would happen without making changes
  --status           Show current status and exit
  --limit=N          Limit how many dispensaries to process (0 = all, default: 0)
  --concurrency=N    How many dispensaries to process in parallel (default: 3)
  --interval=M       Default interval in minutes for new schedules (default: 240 = 4 hours)
  --detection-only   Only run detection, don't crawl
  --production-only  Only run dispensaries in production mode
  --sandbox-only     Only run dispensaries in sandbox mode
  --help, -h         Show this help message

EXAMPLES:
  # Create schedule entries for all dispensaries (no crawling)
  npx tsx src/scripts/bootstrap-discovery.ts

  # Create schedules and run orchestrator for all dispensaries
  npx tsx src/scripts/bootstrap-discovery.ts --run

  # Run orchestrator for first 10 dispensaries
  npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10

  # Run with higher concurrency
  npx tsx src/scripts/bootstrap-discovery.ts --run --concurrency=5

  # Show current status
  npx tsx src/scripts/bootstrap-discovery.ts --status

WHAT IT DOES:
  1. Creates dispensary_crawl_schedule entries for all dispensaries that don't have one
  2. If --run: For each dispensary, runs the orchestrator which:
     a. Checks if provider detection is needed (null/unknown/stale/low confidence)
     b. Runs detection if needed
     c. If Dutchie + production mode: runs production crawl
     d. Otherwise: runs sandbox crawl
  3. Updates schedule status and job records
`);
}
async function showStatus() {
    console.log('\n📊 Current Dispensary Crawl Status\n');
    console.log('═'.repeat(70));
    // Get dispensary counts by provider
    const providerStats = await migrate_1.pool.query(`
    SELECT
      COALESCE(product_provider, 'undetected') as provider,
      COUNT(*) as count,
      COUNT(*) FILTER (WHERE product_crawler_mode = 'production') as production,
      COUNT(*) FILTER (WHERE product_crawler_mode = 'sandbox') as sandbox,
      COUNT(*) FILTER (WHERE product_crawler_mode IS NULL) as no_mode
    FROM dispensaries
    GROUP BY COALESCE(product_provider, 'undetected')
    ORDER BY count DESC
  `);
    console.log('\nProvider Distribution:');
    console.log('-'.repeat(60));
    console.log('Provider'.padEnd(20) +
        'Total'.padStart(8) +
        'Production'.padStart(12) +
        'Sandbox'.padStart(10) +
        'No Mode'.padStart(10));
    console.log('-'.repeat(60));
    for (const row of providerStats.rows) {
        console.log(row.provider.padEnd(20) +
            row.count.toString().padStart(8) +
            row.production.toString().padStart(12) +
            row.sandbox.toString().padStart(10) +
            row.no_mode.toString().padStart(10));
    }
    // Get schedule stats
    const scheduleStats = await migrate_1.pool.query(`
    SELECT
      COUNT(DISTINCT d.id) as total_dispensaries,
      COUNT(DISTINCT dcs.id) as with_schedule,
      COUNT(DISTINCT d.id) - COUNT(DISTINCT dcs.id) as without_schedule,
      COUNT(*) FILTER (WHERE dcs.is_active = TRUE) as active_schedules,
      COUNT(*) FILTER (WHERE dcs.last_status = 'success') as last_success,
      COUNT(*) FILTER (WHERE dcs.last_status = 'error') as last_error,
      COUNT(*) FILTER (WHERE dcs.last_status = 'sandbox_only') as last_sandbox,
      COUNT(*) FILTER (WHERE dcs.last_status = 'detection_only') as last_detection,
      COUNT(*) FILTER (WHERE dcs.next_run_at <= NOW()) as due_now,
      AVG(dcs.interval_minutes)::INTEGER as avg_interval
    FROM dispensaries d
    LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
  `);
    const s = scheduleStats.rows[0];
    console.log('\n\nSchedule Status:');
    console.log('-'.repeat(60));
    console.log(`  Total Dispensaries: ${s.total_dispensaries}`);
    console.log(`  With Schedule:      ${s.with_schedule}`);
    console.log(`  Without Schedule:   ${s.without_schedule}`);
    console.log(`  Active Schedules:   ${s.active_schedules || 0}`);
    console.log(`  Average Interval:   ${s.avg_interval || 240} minutes`);
    console.log('\n  Last Run Status:');
    console.log(`    - Success:        ${s.last_success || 0}`);
    console.log(`    - Error:          ${s.last_error || 0}`);
    console.log(`    - Sandbox Only:   ${s.last_sandbox || 0}`);
    console.log(`    - Detection Only: ${s.last_detection || 0}`);
    console.log(`    - Due Now:        ${s.due_now || 0}`);
    // Get recent job stats
    const jobStats = await migrate_1.pool.query(`
    SELECT
      COUNT(*) as total,
      COUNT(*) FILTER (WHERE status = 'completed') as completed,
      COUNT(*) FILTER (WHERE status = 'failed') as failed,
      COUNT(*) FILTER (WHERE status = 'running') as running,
      COUNT(*) FILTER (WHERE status = 'pending') as pending,
      COUNT(*) FILTER (WHERE detection_ran = TRUE) as with_detection,
      COUNT(*) FILTER (WHERE crawl_ran = TRUE) as with_crawl,
      COUNT(*) FILTER (WHERE crawl_type = 'production') as production_crawls,
      COUNT(*) FILTER (WHERE crawl_type = 'sandbox') as sandbox_crawls,
      SUM(products_found) as total_products_found
    FROM dispensary_crawl_jobs
    WHERE created_at > NOW() - INTERVAL '24 hours'
  `);
    const j = jobStats.rows[0];
    console.log('\n\nJobs (Last 24 Hours):');
    console.log('-'.repeat(60));
    console.log(`  Total Jobs:     ${j.total || 0}`);
    console.log(`  Completed:      ${j.completed || 0}`);
    console.log(`  Failed:         ${j.failed || 0}`);
    console.log(`  Running:        ${j.running || 0}`);
    console.log(`  Pending:        ${j.pending || 0}`);
    console.log(`  With Detection: ${j.with_detection || 0}`);
    console.log(`  With Crawl:     ${j.with_crawl || 0}`);
    console.log(`    - Production: ${j.production_crawls || 0}`);
    console.log(`    - Sandbox:    ${j.sandbox_crawls || 0}`);
    console.log(`  Products Found: ${j.total_products_found || 0}`);
    console.log('\n' + '═'.repeat(70) + '\n');
}
async function createSchedules() {
    console.log('\n📅 Creating Dispensary Schedules...\n');
    if (flags.dryRun) {
        // Count how many would be created
        const result = await migrate_1.pool.query(`
      SELECT COUNT(*) as count
      FROM dispensaries d
      WHERE NOT EXISTS (
        SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
      )
    `);
        const wouldCreate = parseInt(result.rows[0].count);
        console.log(`  Would create ${wouldCreate} new schedule entries (${flags.interval} minute interval)`);
        return { created: wouldCreate, existing: 0 };
    }
    const result = await (0, dispensary_orchestrator_1.ensureAllDispensariesHaveSchedules)(flags.interval);
    console.log(`  ✓ Created ${result.created} new schedule entries`);
    console.log(`  ✓ ${result.existing} dispensaries already had schedules`);
    return result;
}
async function getDispensariesToProcess() {
    // Build query based on filters
    let whereClause = 'TRUE';
    if (flags.productionOnly) {
        whereClause += ` AND d.product_crawler_mode = 'production'`;
    }
    else if (flags.sandboxOnly) {
        whereClause += ` AND d.product_crawler_mode = 'sandbox'`;
    }
    if (flags.detectionOnly) {
        whereClause += ` AND (d.product_provider IS NULL OR d.product_provider = 'unknown' OR d.product_confidence < 50)`;
    }
    const limitClause = flags.limit > 0 ? `LIMIT ${flags.limit}` : '';
    const query = `
    SELECT d.id, d.name, d.product_provider, d.product_crawler_mode
    FROM dispensaries d
    LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
    WHERE ${whereClause}
    ORDER BY
      COALESCE(dcs.priority, 0) DESC,
      dcs.last_run_at ASC NULLS FIRST,
      d.id ASC
    ${limitClause}
  `;
    const result = await migrate_1.pool.query(query);
    return result.rows.map(row => row.id);
}
async function runOrchestrator() {
    console.log('\n🚀 Running Dispensary Orchestrator...\n');
    const dispensaryIds = await getDispensariesToProcess();
    if (dispensaryIds.length === 0) {
        console.log('  No dispensaries to process.');
        return;
    }
    console.log(`  Found ${dispensaryIds.length} dispensaries to process`);
    console.log(`  Concurrency: ${flags.concurrency}`);
    if (flags.dryRun) {
        console.log('\n  Would process these dispensaries:');
        const details = await migrate_1.pool.query(`SELECT id, name, product_provider, product_crawler_mode
       FROM dispensaries WHERE id = ANY($1) ORDER BY id`, [dispensaryIds]);
        for (const row of details.rows.slice(0, 20)) {
            console.log(`    - [${row.id}] ${row.name} (${row.product_provider || 'undetected'}, ${row.product_crawler_mode || 'no mode'})`);
        }
        if (details.rows.length > 20) {
            console.log(`    ... and ${details.rows.length - 20} more`);
        }
        return;
    }
    console.log('\n  Starting batch processing...\n');
    const results = await (0, dispensary_orchestrator_1.runBatchDispensaryOrchestrator)(dispensaryIds, flags.concurrency);
    // Summarize results
    const summary = {
        total: results.length,
        success: results.filter(r => r.status === 'success').length,
        sandboxOnly: results.filter(r => r.status === 'sandbox_only').length,
        detectionOnly: results.filter(r => r.status === 'detection_only').length,
        error: results.filter(r => r.status === 'error').length,
        detectionsRan: results.filter(r => r.detectionRan).length,
        crawlsRan: results.filter(r => r.crawlRan).length,
        productionCrawls: results.filter(r => r.crawlType === 'production').length,
        sandboxCrawls: results.filter(r => r.crawlType === 'sandbox').length,
        totalProducts: results.reduce((sum, r) => sum + (r.productsFound || 0), 0),
        totalDuration: results.reduce((sum, r) => sum + r.durationMs, 0),
    };
    console.log('\n' + '═'.repeat(70));
    console.log('  Orchestrator Results');
    console.log('═'.repeat(70));
    console.log(`
  Total Processed: ${summary.total}

  Status:
    - Success:        ${summary.success}
    - Sandbox Only:   ${summary.sandboxOnly}
    - Detection Only: ${summary.detectionOnly}
    - Error:          ${summary.error}

  Operations:
    - Detections Ran: ${summary.detectionsRan}
    - Crawls Ran:     ${summary.crawlsRan}
      - Production:   ${summary.productionCrawls}
      - Sandbox:      ${summary.sandboxCrawls}

  Results:
    - Products Found:     ${summary.totalProducts}
    - Total Duration:     ${(summary.totalDuration / 1000).toFixed(1)}s
    - Avg per Dispensary: ${(summary.totalDuration / summary.total / 1000).toFixed(1)}s
`);
    console.log('═'.repeat(70) + '\n');
    // Show errors if any
    const errors = results.filter(r => r.status === 'error');
    if (errors.length > 0) {
        console.log('\n⚠️ Errors encountered:');
        for (const err of errors.slice(0, 10)) {
            console.log(`  - [${err.dispensaryId}] ${err.dispensaryName}: ${err.error}`);
        }
        if (errors.length > 10) {
            console.log(`  ... and ${errors.length - 10} more errors`);
        }
    }
}
async function main() {
    if (flags.help) {
        await showHelp();
        process.exit(0);
    }
    console.log('\n' + '═'.repeat(70));
    console.log('  Dispensary Crawl Bootstrap Discovery');
    console.log('═'.repeat(70));
    if (flags.dryRun) {
        console.log('\n🔍 DRY RUN MODE - No changes will be made');
    }
    try {
        // Always show status first
        await showStatus();
        if (flags.status) {
            // Status-only mode, we're done
            await migrate_1.pool.end();
            process.exit(0);
        }
        // Step 1: Create schedule entries
        await createSchedules();
        // Step 2: Optionally run orchestrator
        if (flags.run) {
            await runOrchestrator();
        }
        else {
            console.log('\n💡 Tip: Use --run to also run the orchestrator for each dispensary');
        }
        // Show final status
        if (!flags.dryRun) {
            await showStatus();
        }
    }
    catch (error) {
        console.error('\n❌ Fatal error:', error.message);
        console.error(error.stack);
        process.exit(1);
    }
    finally {
        await migrate_1.pool.end();
    }
}
main();
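Note: runBatchDispensaryOrchestrator is imported from ../services/dispensary-orchestrator and its body is not part of this diff. As a minimal sketch (assumptions: it takes an id array and a concurrency cap, and runs one orchestrator call per id), a worker-pool batch runner of that shape can be as small as:

// Illustrative concurrency-limited batch runner, not the actual service code:
async function runBatch(ids, concurrency, runOne) {
    const results = [];
    let next = 0;
    async function workerLoop() {
        while (next < ids.length) {
            const id = ids[next++]; // shared cursor; safe because JS only yields between awaits
            results.push(await runOne(id));
        }
    }
    await Promise.all(Array.from({ length: Math.min(concurrency, ids.length) }, () => workerLoop()));
    return results;
}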
236
backend/dist/scripts/capture-dutchie-schema.js
vendored
Normal file
@@ -0,0 +1,236 @@
"use strict";
|
||||
/**
|
||||
* Capture Dutchie GraphQL response structure via Puppeteer interception
|
||||
* This script navigates to a Dutchie menu page and captures the GraphQL responses
|
||||
* to understand the exact product data structure
|
||||
*/
|
||||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
||||
if (k2 === undefined) k2 = k;
|
||||
var desc = Object.getOwnPropertyDescriptor(m, k);
|
||||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
||||
desc = { enumerable: true, get: function() { return m[k]; } };
|
||||
}
|
||||
Object.defineProperty(o, k2, desc);
|
||||
}) : (function(o, m, k, k2) {
|
||||
if (k2 === undefined) k2 = k;
|
||||
o[k2] = m[k];
|
||||
}));
|
||||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
||||
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
||||
}) : function(o, v) {
|
||||
o["default"] = v;
|
||||
});
|
||||
var __importStar = (this && this.__importStar) || (function () {
|
||||
var ownKeys = function(o) {
|
||||
ownKeys = Object.getOwnPropertyNames || function (o) {
|
||||
var ar = [];
|
||||
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
||||
return ar;
|
||||
};
|
||||
return ownKeys(o);
|
||||
};
|
||||
return function (mod) {
|
||||
if (mod && mod.__esModule) return mod;
|
||||
var result = {};
|
||||
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
||||
__setModuleDefault(result, mod);
|
||||
return result;
|
||||
};
|
||||
})();
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
||||
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
||||
const fs = __importStar(require("fs"));
|
||||
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
||||
async function captureSchema(menuUrl) {
|
||||
let browser;
|
||||
const capturedResponses = [];
|
||||
try {
|
||||
console.log('='.repeat(80));
|
||||
console.log('DUTCHIE GRAPHQL SCHEMA CAPTURE');
|
||||
console.log('='.repeat(80));
|
||||
console.log(`\nTarget URL: ${menuUrl}\n`);
|
||||
browser = await puppeteer_extra_1.default.launch({
|
||||
headless: 'new',
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
]
|
||||
});
|
||||
const page = await browser.newPage();
|
||||
// Use a realistic user agent
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
|
||||
// Set viewport to desktop size
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
// Hide webdriver flag
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
||||
window.chrome = { runtime: {} };
|
||||
});
|
||||
// Intercept all GraphQL responses
|
||||
page.on('response', async (response) => {
|
||||
const url = response.url();
|
||||
// Only capture GraphQL responses
|
||||
if (!url.includes('graphql'))
|
||||
return;
|
||||
try {
|
||||
const contentType = response.headers()['content-type'] || '';
|
||||
if (!contentType.includes('application/json'))
|
||||
return;
|
||||
const data = await response.json();
|
||||
// Extract operation name from URL if possible
|
||||
const urlParams = new URLSearchParams(url.split('?')[1] || '');
|
||||
const operationName = urlParams.get('operationName') || 'Unknown';
|
||||
capturedResponses.push({
|
||||
operationName,
|
||||
url: url.substring(0, 200),
|
||||
data,
|
||||
timestamp: new Date()
|
||||
});
|
||||
console.log(`📡 Captured: ${operationName}`);
|
||||
// Check for product data
|
||||
if (data?.data?.filteredProducts?.products) {
|
||||
const products = data.data.filteredProducts.products;
|
||||
console.log(` Found ${products.length} products`);
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
// Ignore parse errors
|
||||
}
|
||||
});
|
||||
console.log('Navigating to page...');
|
||||
await page.goto(menuUrl, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 90000
|
||||
});
|
||||
// Check if it's a Dutchie menu
|
||||
const isDutchie = await page.evaluate(() => {
|
||||
return typeof window.reactEnv !== 'undefined';
|
||||
});
|
||||
if (isDutchie) {
|
||||
console.log('✅ Dutchie menu detected\n');
|
||||
// Get environment info
|
||||
const reactEnv = await page.evaluate(() => window.reactEnv);
|
||||
console.log('Dutchie Environment:');
|
||||
console.log(` dispensaryId: ${reactEnv?.dispensaryId}`);
|
||||
console.log(` retailerId: ${reactEnv?.retailerId}`);
|
||||
console.log(` chainId: ${reactEnv?.chainId}`);
|
||||
}
|
||||
// Scroll to trigger lazy loading
|
||||
console.log('\nScrolling to load more products...');
|
||||
await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
// Click on a category to trigger more loads
|
||||
const categoryLinks = await page.$$('a[href*="/products/"]');
|
||||
if (categoryLinks.length > 0) {
|
||||
console.log(`Found ${categoryLinks.length} category links, clicking first one...`);
|
||||
try {
|
||||
await categoryLinks[0].click();
|
||||
await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 });
|
||||
}
|
||||
catch (e) {
|
||||
console.log('Category navigation failed, continuing...');
|
||||
}
|
||||
}
|
||||
// Wait a bit more for any final responses
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
console.log(`\n${'='.repeat(80)}`);
|
||||
console.log(`CAPTURED ${capturedResponses.length} GRAPHQL RESPONSES`);
|
||||
console.log('='.repeat(80));
|
||||
// Find product data
|
||||
let productSchema = null;
|
||||
let sampleProduct = null;
|
||||
for (const resp of capturedResponses) {
|
||||
console.log(`\n${resp.operationName}:`);
|
||||
console.log(` URL: ${resp.url.substring(0, 100)}...`);
|
||||
if (resp.data?.data?.filteredProducts?.products) {
|
||||
const products = resp.data.data.filteredProducts.products;
|
||||
console.log(` ✅ Contains ${products.length} products`);
|
||||
if (products.length > 0 && !sampleProduct) {
|
||||
sampleProduct = products[0];
|
||||
productSchema = extractSchema(products[0]);
|
||||
}
|
||||
}
|
||||
// Show top-level data keys
|
||||
if (resp.data?.data) {
|
||||
console.log(` Data keys: ${Object.keys(resp.data.data).join(', ')}`);
|
||||
}
|
||||
}
|
||||
// Output the product schema
|
||||
if (productSchema) {
|
||||
console.log('\n' + '='.repeat(80));
|
||||
console.log('PRODUCT SCHEMA (from first product):');
|
||||
console.log('='.repeat(80));
|
||||
console.log(JSON.stringify(productSchema, null, 2));
|
||||
console.log('\n' + '='.repeat(80));
|
||||
console.log('SAMPLE PRODUCT:');
|
||||
console.log('='.repeat(80));
|
||||
console.log(JSON.stringify(sampleProduct, null, 2));
|
||||
// Save to file
|
||||
const outputData = {
|
||||
capturedAt: new Date().toISOString(),
|
||||
menuUrl,
|
||||
schema: productSchema,
|
||||
sampleProduct,
|
||||
allResponses: capturedResponses.map(r => ({
|
||||
operationName: r.operationName,
|
||||
dataKeys: r.data?.data ? Object.keys(r.data.data) : [],
|
||||
productCount: r.data?.data?.filteredProducts?.products?.length || 0
|
||||
}))
|
||||
};
|
||||
const outputPath = '/tmp/dutchie-schema-capture.json';
|
||||
fs.writeFileSync(outputPath, JSON.stringify(outputData, null, 2));
|
||||
console.log(`\nSaved capture to: ${outputPath}`);
|
||||
}
|
||||
else {
|
||||
console.log('\n❌ No product data captured');
|
||||
// Debug: show all responses
|
||||
console.log('\nAll captured responses:');
|
||||
for (const resp of capturedResponses) {
|
||||
console.log(`\n${resp.operationName}:`);
|
||||
console.log(JSON.stringify(resp.data, null, 2).substring(0, 500));
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (error) {
|
||||
console.error('Error:', error.message);
|
||||
}
|
||||
finally {
|
||||
if (browser) {
|
||||
await browser.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Extract schema from an object (field names + types)
|
||||
*/
|
||||
function extractSchema(obj, prefix = '') {
|
||||
if (obj === null)
|
||||
return { type: 'null' };
|
||||
if (obj === undefined)
|
||||
return { type: 'undefined' };
|
||||
if (Array.isArray(obj)) {
|
||||
if (obj.length === 0)
|
||||
return { type: 'array', items: 'unknown' };
|
||||
return {
|
||||
type: 'array',
|
||||
items: extractSchema(obj[0], prefix + '[]')
|
||||
};
|
||||
}
|
||||
if (typeof obj === 'object') {
|
||||
const schema = { type: 'object', properties: {} };
|
||||
for (const [key, value] of Object.entries(obj)) {
|
||||
schema.properties[key] = extractSchema(value, prefix ? `${prefix}.${key}` : key);
|
||||
}
|
||||
return schema;
|
||||
}
|
||||
return { type: typeof obj, example: String(obj).substring(0, 100) };
|
||||
}
|
||||
// Run
|
||||
const url = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
|
||||
captureSchema(url).catch(console.error);
|
||||
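For reference, extractSchema recurses into objects and samples only the first element of each array, so a hypothetical input produces output like this:

// extractSchema({ id: 'abc', prices: [9.99] }) returns:
// {
//   type: 'object',
//   properties: {
//     id:     { type: 'string', example: 'abc' },
//     prices: { type: 'array', items: { type: 'number', example: '9.99' } }
//   }
// }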
56
backend/dist/scripts/crawl-all-dutchie.js
vendored
Normal file
@@ -0,0 +1,56 @@
"use strict";
|
||||
/**
|
||||
* Seed crawl: trigger dutchie crawls for all dispensaries with menu_type='dutchie'
|
||||
* and a resolved platform_dispensary_id. This uses the AZ orchestrator endpoint logic.
|
||||
*
|
||||
* Usage (local):
|
||||
* node dist/scripts/crawl-all-dutchie.js
|
||||
*
|
||||
* Requires:
|
||||
* - DATABASE_URL/CRAWLSY_DATABASE_URL pointing to the consolidated DB
|
||||
* - Dispensaries table populated with menu_type and platform_dispensary_id
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
const connection_1 = require("../dutchie-az/db/connection");
|
||||
const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
|
||||
async function main() {
|
||||
const { rows } = await (0, connection_1.query)(`
|
||||
SELECT id, name, slug, platform_dispensary_id
|
||||
FROM dispensaries
|
||||
WHERE menu_type = 'dutchie'
|
||||
AND platform_dispensary_id IS NOT NULL
|
||||
ORDER BY id
|
||||
`);
|
||||
if (!rows.length) {
|
||||
console.log('No dutchie dispensaries with resolved platform_dispensary_id found.');
|
||||
process.exit(0);
|
||||
}
|
||||
console.log(`Found ${rows.length} dutchie dispensaries with resolved IDs. Triggering crawls...`);
|
||||
let success = 0;
|
||||
let failed = 0;
|
||||
for (const row of rows) {
|
||||
try {
|
||||
console.log(`Crawling ${row.id} (${row.name})...`);
|
||||
const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(row.id);
|
||||
const ok = result.status === 'success' ||
|
||||
result.status === 'sandbox_only' ||
|
||||
result.status === 'detection_only';
|
||||
if (ok) {
|
||||
success++;
|
||||
}
|
||||
else {
|
||||
failed++;
|
||||
console.warn(`Crawl returned status ${result.status} for ${row.id} (${row.name})`);
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
failed++;
|
||||
console.error(`Failed crawl for ${row.id} (${row.name}): ${err.message}`);
|
||||
}
|
||||
}
|
||||
console.log(`Completed. Success: ${success}, Failed: ${failed}`);
|
||||
}
|
||||
main().catch((err) => {
|
||||
console.error('Fatal:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
24
backend/dist/scripts/crawl-five-sequential.js
vendored
Normal file
@@ -0,0 +1,24 @@
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
|
||||
// Run 5 crawlers sequentially to avoid OOM
|
||||
const dispensaryIds = [112, 81, 115, 140, 177];
|
||||
async function run() {
|
||||
console.log('Starting 5 crawlers SEQUENTIALLY...');
|
||||
for (const id of dispensaryIds) {
|
||||
console.log(`\n=== Starting crawler for dispensary ${id} ===`);
|
||||
try {
|
||||
const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(id);
|
||||
console.log(` Status: ${result.status}`);
|
||||
console.log(` Summary: ${result.summary}`);
|
||||
if (result.productsFound) {
|
||||
console.log(` Products: ${result.productsFound} found, ${result.productsNew} new, ${result.productsUpdated} updated`);
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
console.log(` ERROR: ${e.message}`);
|
||||
}
|
||||
}
|
||||
console.log('\n=== All 5 crawlers complete ===');
|
||||
}
|
||||
run().catch(e => console.log('Fatal:', e.message));
|
||||
181
backend/dist/scripts/parallel-scrape.js
vendored
Normal file
@@ -0,0 +1,181 @@
"use strict";
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const proxy_1 = require("../services/proxy");
|
||||
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
||||
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
||||
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
||||
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
|
||||
const NUM_WORKERS = parseInt(process.argv[2] || '15');
|
||||
const DISPENSARY_NAME = process.argv[3] || 'Deeply Rooted';
|
||||
const USE_PROXIES = process.argv[4] !== 'no-proxy';
|
||||
async function getStore(name) {
|
||||
const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${name}%`]);
|
||||
return result.rows[0] || null;
|
||||
}
|
||||
async function getCategories(storeId) {
|
||||
const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [storeId]);
|
||||
return result.rows;
|
||||
}
|
||||
async function scrapeWithProxy(workerId, store, category) {
|
||||
let browser = null;
|
||||
let proxyId = null;
|
||||
try {
|
||||
// Get a proxy (if enabled)
|
||||
let proxy = null;
|
||||
if (USE_PROXIES) {
|
||||
proxy = await (0, proxy_1.getActiveProxy)();
|
||||
if (proxy) {
|
||||
proxyId = proxy.id;
|
||||
console.log(`[Worker ${workerId}] Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
|
||||
}
|
||||
else {
|
||||
console.log(`[Worker ${workerId}] No proxy available, using direct connection`);
|
||||
}
|
||||
}
|
||||
else {
|
||||
console.log(`[Worker ${workerId}] Direct connection (proxies disabled)`);
|
||||
}
|
||||
// Build browser args
|
||||
const args = [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-accelerated-2d-canvas',
|
||||
'--disable-gpu',
|
||||
'--window-size=1920,1080',
|
||||
];
|
||||
if (proxy) {
|
||||
if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
|
||||
args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
|
||||
}
|
||||
else {
|
||||
args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
|
||||
}
|
||||
}
|
||||
browser = await puppeteer_extra_1.default.launch({
|
||||
headless: true,
|
||||
args,
|
||||
executablePath: process.env.PUPPETEER_EXECUTABLE_PATH,
|
||||
});
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent(FIREFOX_USER_AGENT);
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
// Handle proxy auth if needed
|
||||
if (proxy?.username && proxy?.password) {
|
||||
await page.authenticate({
|
||||
username: proxy.username,
|
||||
password: proxy.password,
|
||||
});
|
||||
}
|
||||
console.log(`[Worker ${workerId}] Scraping category: ${category.name} (${category.url})`);
|
||||
// Navigate to the category page
|
||||
const response = await page.goto(category.url, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 60000,
|
||||
});
|
||||
if (!response || !response.ok()) {
|
||||
throw new Error(`Failed to load page: ${response?.status()}`);
|
||||
}
|
||||
// Wait for products to load
|
||||
await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
|
||||
timeout: 30000,
|
||||
}).catch(() => {
|
||||
console.log(`[Worker ${workerId}] No products found on page`);
|
||||
});
|
||||
// Extract products
|
||||
const products = await page.evaluate(() => {
|
||||
// Try data-testid first, then fall back to product links
|
||||
const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
|
||||
if (listItems.length > 0)
|
||||
return listItems.length;
|
||||
return document.querySelectorAll('a[href*="/product/"]').length;
|
||||
});
|
||||
console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
|
||||
await browser.close();
|
||||
return { success: true, products };
|
||||
}
|
||||
catch (error) {
|
||||
console.error(`[Worker ${workerId}] Error:`, error.message);
|
||||
// Check for bot detection
|
||||
if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
|
||||
(0, proxy_1.putProxyInTimeout)(proxyId, error.message);
|
||||
}
|
||||
if (browser) {
|
||||
await browser.close().catch(() => { });
|
||||
}
|
||||
return { success: false, products: 0, error: error.message };
|
||||
}
|
||||
}
|
||||
async function worker(workerId, store, categories, categoryIndex) {
|
||||
while (categoryIndex.current < categories.length) {
|
||||
const idx = categoryIndex.current++;
|
||||
const category = categories[idx];
|
||||
if (!category)
|
||||
break;
|
||||
console.log(`[Worker ${workerId}] Starting category ${idx + 1}/${categories.length}: ${category.name}`);
|
||||
const result = await scrapeWithProxy(workerId, store, category);
|
||||
if (result.success) {
|
||||
console.log(`[Worker ${workerId}] Completed ${category.name}: ${result.products} products`);
|
||||
}
|
||||
else {
|
||||
console.log(`[Worker ${workerId}] Failed ${category.name}: ${result.error}`);
|
||||
}
|
||||
// Small delay between requests
|
||||
await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
|
||||
}
|
||||
console.log(`[Worker ${workerId}] Finished all assigned work`);
|
||||
}
|
||||
async function main() {
|
||||
console.log(`\n${'='.repeat(60)}`);
|
||||
console.log(`Parallel Scraper - ${NUM_WORKERS} workers`);
|
||||
console.log(`Target: ${DISPENSARY_NAME}`);
|
||||
console.log(`User Agent: Firefox`);
|
||||
console.log(`Proxies: ${USE_PROXIES ? 'Enabled' : 'Disabled'}`);
|
||||
console.log(`${'='.repeat(60)}\n`);
|
||||
// Find the store
|
||||
const store = await getStore(DISPENSARY_NAME);
|
||||
if (!store) {
|
||||
console.error(`Store not found: ${DISPENSARY_NAME}`);
|
||||
process.exit(1);
|
||||
}
|
||||
console.log(`Found store: ${store.name} (ID: ${store.id})`);
|
||||
// Get categories
|
||||
const categories = await getCategories(store.id);
|
||||
if (categories.length === 0) {
|
||||
console.error('No categories found for this store');
|
||||
process.exit(1);
|
||||
}
|
||||
console.log(`Found ${categories.length} categories to scrape`);
|
||||
console.log(`Categories: ${categories.map(c => c.name).join(', ')}\n`);
|
||||
// Check proxies
|
||||
const proxyResult = await migrate_1.pool.query('SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE active = true) as active FROM proxies');
|
||||
console.log(`Proxies: ${proxyResult.rows[0].active} active / ${proxyResult.rows[0].total} total\n`);
|
||||
// Shared index for work distribution
|
||||
const categoryIndex = { current: 0 };
|
||||
// For a store with few categories, we'll run multiple passes
|
||||
// Expand the work by duplicating categories for parallel workers
|
||||
const expandedCategories = [];
|
||||
const passes = Math.ceil(NUM_WORKERS / Math.max(categories.length, 1));
|
||||
for (let i = 0; i < passes; i++) {
|
||||
expandedCategories.push(...categories);
|
||||
}
|
||||
console.log(`Running ${NUM_WORKERS} workers across ${expandedCategories.length} category scrapes\n`);
|
||||
// Start workers
|
||||
const workers = [];
|
||||
for (let i = 0; i < NUM_WORKERS; i++) {
|
||||
workers.push(worker(i + 1, store, expandedCategories, categoryIndex));
|
||||
// Stagger worker starts
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
}
|
||||
// Wait for all workers
|
||||
await Promise.all(workers);
|
||||
console.log(`\n${'='.repeat(60)}`);
|
||||
console.log('All workers completed!');
|
||||
console.log(`${'='.repeat(60)}\n`);
|
||||
await migrate_1.pool.end();
|
||||
}
|
||||
main().catch(console.error);
|
||||
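Illustrative invocations (the arguments are positional: worker count, store name filter, proxy toggle):

  node dist/scripts/parallel-scrape.js                             # 15 workers, proxies enabled (defaults)
  node dist/scripts/parallel-scrape.js 5 "Deeply Rooted"           # 5 workers, proxies enabled
  node dist/scripts/parallel-scrape.js 5 "Deeply Rooted" no-proxy  # direct connections only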
344
backend/dist/scripts/queue-dispensaries.js
vendored
Normal file
@@ -0,0 +1,344 @@
#!/usr/bin/env npx tsx
|
||||
"use strict";
|
||||
/**
|
||||
* Queue Dispensaries Script
|
||||
*
|
||||
* Orchestrates the multi-provider crawler system:
|
||||
* 1. Queue dispensaries that need provider detection
|
||||
* 2. Queue Dutchie dispensaries for production crawl
|
||||
* 3. Queue sandbox dispensaries for learning crawls
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx src/scripts/queue-dispensaries.ts [--detection] [--production] [--sandbox] [--all]
|
||||
* npx tsx src/scripts/queue-dispensaries.ts --dry-run
|
||||
* npx tsx src/scripts/queue-dispensaries.ts --process # Process queued jobs
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const crawler_jobs_1 = require("../services/crawler-jobs");
|
||||
// Parse command line args
|
||||
const args = process.argv.slice(2);
|
||||
const flags = {
|
||||
detection: args.includes('--detection') || args.includes('--all'),
|
||||
production: args.includes('--production') || args.includes('--all'),
|
||||
sandbox: args.includes('--sandbox') || args.includes('--all'),
|
||||
dryRun: args.includes('--dry-run'),
|
||||
process: args.includes('--process'),
|
||||
help: args.includes('--help') || args.includes('-h'),
|
||||
limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
|
||||
};
|
||||
// If no specific flags, default to all
|
||||
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
|
||||
flags.detection = true;
|
||||
flags.production = true;
|
||||
flags.sandbox = true;
|
||||
}
|
||||
async function showHelp() {
|
||||
console.log(`
|
||||
Queue Dispensaries - Multi-Provider Crawler Orchestration
|
||||
|
||||
USAGE:
|
||||
npx tsx src/scripts/queue-dispensaries.ts [OPTIONS]
|
||||
|
||||
OPTIONS:
|
||||
--detection Queue dispensaries that need provider detection
|
||||
--production Queue Dutchie production crawls
|
||||
--sandbox Queue sandbox/learning crawls
|
||||
--all Queue all job types (default if no specific flag)
|
||||
--process Process queued jobs instead of just queuing
|
||||
--dry-run Show what would be queued without making changes
|
||||
--limit=N Maximum dispensaries to queue per type (default: 10)
|
||||
--help, -h Show this help message
|
||||
|
||||
EXAMPLES:
|
||||
# Queue all dispensaries for appropriate jobs
|
||||
npx tsx src/scripts/queue-dispensaries.ts
|
||||
|
||||
# Only queue detection jobs
|
||||
npx tsx src/scripts/queue-dispensaries.ts --detection --limit=20
|
||||
|
||||
# Dry run to see what would be queued
|
||||
npx tsx src/scripts/queue-dispensaries.ts --dry-run
|
||||
|
||||
# Process sandbox jobs
|
||||
npx tsx src/scripts/queue-dispensaries.ts --process
|
||||
`);
|
||||
}
|
||||
async function queueDetectionJobs() {
|
||||
console.log('\n📡 Queueing Detection Jobs...');
|
||||
// Find dispensaries that need provider detection:
|
||||
// - menu_provider is null OR
|
||||
// - menu_provider_confidence < 70 AND
|
||||
// - crawler_status is idle (not already queued/running)
|
||||
// - has a website URL
|
||||
const query = `
|
||||
SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence
|
||||
FROM dispensaries
|
||||
WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
|
||||
AND crawler_status = 'idle'
|
||||
AND (menu_provider IS NULL OR menu_provider_confidence < 70)
|
||||
ORDER BY
|
||||
CASE WHEN menu_provider IS NULL THEN 0 ELSE 1 END,
|
||||
menu_provider_confidence ASC
|
||||
LIMIT $1
|
||||
`;
|
||||
const result = await migrate_1.pool.query(query, [flags.limit]);
|
||||
if (flags.dryRun) {
|
||||
console.log(` Would queue ${result.rows.length} dispensaries for detection:`);
|
||||
for (const row of result.rows) {
|
||||
console.log(` - [${row.id}] ${row.name} (current: ${row.menu_provider || 'unknown'}, confidence: ${row.menu_provider_confidence}%)`);
|
||||
}
|
||||
return result.rows.length;
|
||||
}
|
||||
let queued = 0;
|
||||
for (const dispensary of result.rows) {
|
||||
try {
|
||||
// Update status to queued
|
||||
await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`, [dispensary.id]);
|
||||
// Create sandbox job for detection
|
||||
await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
|
||||
VALUES ($1, 'detection', 'pending', 10)`, [dispensary.id]);
|
||||
console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
|
||||
queued++;
|
||||
}
|
||||
catch (error) {
|
||||
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
|
||||
}
|
||||
}
|
||||
return queued;
|
||||
}
|
||||
async function queueProductionCrawls() {
|
||||
console.log('\n🏭 Queueing Production Dutchie Crawls...');
|
||||
// Find Dutchie dispensaries ready for production crawl:
|
||||
// - menu_provider = 'dutchie'
|
||||
// - crawler_mode = 'production'
|
||||
// - crawler_status is idle
|
||||
// - last_menu_scrape is old or null
|
||||
const query = `
|
||||
SELECT d.id, d.name, d.last_menu_scrape, d.menu_url
|
||||
FROM dispensaries d
|
||||
WHERE d.menu_provider = 'dutchie'
|
||||
AND d.crawler_mode = 'production'
|
||||
AND d.crawler_status = 'idle'
|
||||
AND (d.last_menu_scrape IS NULL OR d.last_menu_scrape < NOW() - INTERVAL '4 hours')
|
||||
ORDER BY
|
||||
CASE WHEN d.last_menu_scrape IS NULL THEN 0 ELSE 1 END,
|
||||
d.last_menu_scrape ASC
|
||||
LIMIT $1
|
||||
`;
|
||||
const result = await migrate_1.pool.query(query, [flags.limit]);
|
||||
if (flags.dryRun) {
|
||||
console.log(` Would queue ${result.rows.length} Dutchie dispensaries for production crawl:`);
|
||||
for (const row of result.rows) {
|
||||
const lastScrape = row.last_menu_scrape ? new Date(row.last_menu_scrape).toISOString() : 'never';
|
||||
console.log(` - [${row.id}] ${row.name} (last scrape: ${lastScrape})`);
|
||||
}
|
||||
return result.rows.length;
|
||||
}
|
||||
let queued = 0;
|
||||
for (const dispensary of result.rows) {
|
||||
try {
|
||||
// Update status to queued
|
||||
await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`, [dispensary.id]);
|
||||
// Create crawl job in the main crawl_jobs table (production queue)
|
||||
await migrate_1.pool.query(`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
|
||||
SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
|
||||
jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries')
|
||||
FROM stores s
|
||||
JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
|
||||
WHERE d.id = $1
|
||||
LIMIT 1`, [dispensary.id]);
|
||||
console.log(` ✓ Queued production crawl: [${dispensary.id}] ${dispensary.name}`);
|
||||
queued++;
|
||||
}
|
||||
catch (error) {
|
||||
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
|
||||
}
|
||||
}
|
||||
return queued;
|
||||
}
|
||||
async function queueSandboxCrawls() {
|
||||
console.log('\n🧪 Queueing Sandbox Crawls...');
|
||||
// Find sandbox dispensaries needing crawls:
|
||||
// - crawler_mode = 'sandbox'
|
||||
// - crawler_status in (idle, error_needs_review)
|
||||
// - No recent sandbox job
|
||||
const query = `
|
||||
SELECT d.id, d.name, d.menu_provider, d.crawler_status, d.website
|
||||
FROM dispensaries d
|
||||
WHERE d.crawler_mode = 'sandbox'
|
||||
AND d.crawler_status IN ('idle', 'error_needs_review')
|
||||
AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM sandbox_crawl_jobs sj
|
||||
WHERE sj.dispensary_id = d.id
|
||||
AND sj.status IN ('pending', 'running')
|
||||
)
|
||||
ORDER BY d.updated_at ASC
|
||||
LIMIT $1
|
||||
`;
|
||||
const result = await migrate_1.pool.query(query, [flags.limit]);
|
||||
if (flags.dryRun) {
|
||||
console.log(` Would queue ${result.rows.length} dispensaries for sandbox crawl:`);
|
||||
for (const row of result.rows) {
|
||||
console.log(` - [${row.id}] ${row.name} (provider: ${row.menu_provider || 'unknown'}, status: ${row.crawler_status})`);
|
||||
}
|
||||
return result.rows.length;
|
||||
}
|
||||
let queued = 0;
|
||||
for (const dispensary of result.rows) {
|
||||
try {
|
||||
// Update status
|
||||
await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`, [dispensary.id]);
|
||||
// Create sandbox job
|
||||
await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
|
||||
VALUES ($1, 'deep_crawl', 'pending', 5)`, [dispensary.id]);
|
||||
console.log(` ✓ Queued sandbox crawl: [${dispensary.id}] ${dispensary.name}`);
|
||||
queued++;
|
||||
}
|
||||
catch (error) {
|
||||
console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
|
||||
}
|
||||
}
|
||||
return queued;
|
||||
}
|
||||
async function processJobs() {
|
||||
console.log('\n⚙️ Processing Queued Jobs...\n');
|
||||
// Process sandbox jobs (detection + sandbox crawls)
|
||||
const sandboxJobs = await migrate_1.pool.query(`SELECT * FROM sandbox_crawl_jobs
|
||||
WHERE status = 'pending'
|
||||
ORDER BY priority DESC, scheduled_at ASC
|
||||
LIMIT $1`, [flags.limit]);
|
||||
console.log(`Found ${sandboxJobs.rows.length} pending sandbox jobs\n`);
|
||||
for (const job of sandboxJobs.rows) {
|
||||
console.log(`Processing job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}...`);
|
||||
try {
|
||||
// Mark as running
|
||||
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW() WHERE id = $1`, [job.id]);
|
||||
let result;
|
||||
if (job.job_type === 'detection') {
|
||||
result = await (0, crawler_jobs_1.runDetectMenuProviderJob)(job.dispensary_id);
|
||||
}
|
||||
else {
|
||||
result = await (0, crawler_jobs_1.runSandboxCrawlJob)(job.dispensary_id, job.sandbox_id);
|
||||
}
|
||||
// Update job status
|
||||
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
|
||||
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
|
||||
WHERE id = $4`, [
|
||||
result.success ? 'completed' : 'failed',
|
||||
JSON.stringify(result.data || {}),
|
||||
result.success ? null : result.message,
|
||||
job.id,
|
||||
]);
|
||||
console.log(` ${result.success ? '✓' : '✗'} ${result.message}\n`);
|
||||
}
|
||||
catch (error) {
|
||||
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]);
|
||||
console.log(` ✗ Error: ${error.message}\n`);
|
||||
}
|
||||
}
|
||||
}
|
||||
async function showStats() {
|
||||
console.log('\n📊 Current Stats:');
|
||||
// Dispensary stats
|
||||
const stats = await migrate_1.pool.query(`
|
||||
SELECT
|
||||
COUNT(*) as total,
|
||||
COUNT(*) FILTER (WHERE menu_provider IS NULL) as no_provider,
|
||||
COUNT(*) FILTER (WHERE menu_provider = 'dutchie') as dutchie,
|
||||
COUNT(*) FILTER (WHERE menu_provider NOT IN ('dutchie', 'unknown') AND menu_provider IS NOT NULL) as other_providers,
|
||||
COUNT(*) FILTER (WHERE menu_provider = 'unknown') as unknown,
|
||||
COUNT(*) FILTER (WHERE crawler_mode = 'production') as production_mode,
|
||||
COUNT(*) FILTER (WHERE crawler_mode = 'sandbox') as sandbox_mode,
|
||||
COUNT(*) FILTER (WHERE crawler_status = 'idle') as idle,
|
||||
COUNT(*) FILTER (WHERE crawler_status LIKE 'queued%') as queued,
|
||||
COUNT(*) FILTER (WHERE crawler_status = 'running') as running,
|
||||
COUNT(*) FILTER (WHERE crawler_status = 'ok') as ok,
|
||||
COUNT(*) FILTER (WHERE crawler_status = 'error_needs_review') as needs_review
|
||||
FROM dispensaries
|
||||
`);
    const s = stats.rows[0];
    console.log(`
Dispensaries: ${s.total}
  - No provider detected: ${s.no_provider}
  - Dutchie: ${s.dutchie}
  - Other providers: ${s.other_providers}
  - Unknown: ${s.unknown}

Crawler Mode:
  - Production: ${s.production_mode}
  - Sandbox: ${s.sandbox_mode}

Status:
  - Idle: ${s.idle}
  - Queued: ${s.queued}
  - Running: ${s.running}
  - OK: ${s.ok}
  - Needs Review: ${s.needs_review}
`);
    // Job stats
    const jobStats = await migrate_1.pool.query(`
      SELECT
        COUNT(*) FILTER (WHERE status = 'pending') as pending,
        COUNT(*) FILTER (WHERE status = 'running') as running,
        COUNT(*) FILTER (WHERE status = 'completed') as completed,
        COUNT(*) FILTER (WHERE status = 'failed') as failed
      FROM sandbox_crawl_jobs
    `);
    const j = jobStats.rows[0];
    console.log(`  Sandbox Jobs:
  - Pending: ${j.pending}
  - Running: ${j.running}
  - Completed: ${j.completed}
  - Failed: ${j.failed}
`);
}
async function main() {
    if (flags.help) {
        await showHelp();
        process.exit(0);
    }
    console.log('═══════════════════════════════════════════════════════');
    console.log('  Multi-Provider Crawler Queue Manager');
    console.log('═══════════════════════════════════════════════════════');
    if (flags.dryRun) {
        console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
    }
    try {
        // Show current stats first
        await showStats();
        if (flags.process) {
            // Process mode - run jobs instead of queuing
            await processJobs();
        }
        else {
            // Queuing mode
            let totalQueued = 0;
            if (flags.detection) {
                totalQueued += await queueDetectionJobs();
            }
            if (flags.production) {
                totalQueued += await queueProductionCrawls();
            }
            if (flags.sandbox) {
                totalQueued += await queueSandboxCrawls();
            }
            console.log('\n═══════════════════════════════════════════════════════');
            console.log(`  Total dispensaries queued: ${totalQueued}`);
            console.log('═══════════════════════════════════════════════════════\n');
        }
        // Show updated stats
        if (!flags.dryRun) {
            await showStats();
        }
    }
    catch (error) {
        console.error('Fatal error:', error);
        process.exit(1);
    }
    finally {
        await migrate_1.pool.end();
    }
}
main();
473
backend/dist/scripts/queue-intelligence.js
vendored
Normal file
@@ -0,0 +1,473 @@
#!/usr/bin/env npx tsx
"use strict";
/**
 * Queue Intelligence Script
 *
 * Orchestrates the multi-category intelligence crawler system:
 * 1. Queue dispensaries that need provider detection (all 4 categories)
 * 2. Queue per-category production crawls (Dutchie products only for now)
 * 3. Queue per-category sandbox crawls (all providers)
 *
 * Each category (product, specials, brand, metadata) is handled independently.
 * A failure in one category does NOT affect other categories.
 *
 * Usage:
 *   npx tsx src/scripts/queue-intelligence.ts [--detection] [--production] [--sandbox] [--all]
 *   npx tsx src/scripts/queue-intelligence.ts --category=product --sandbox
 *   npx tsx src/scripts/queue-intelligence.ts --process --category=product
 *   npx tsx src/scripts/queue-intelligence.ts --dry-run
 */
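// Illustrative sketch (not part of this file): the per-category isolation
// pattern the docblock above describes. Each category runs inside its own
// try/catch, so one provider's failure cannot abort the remaining categories:
//
//   for (const category of ['product', 'specials', 'brand', 'metadata']) {
//     try { await runCategory(category); }          // hypothetical helper
//     catch (err) { console.error(category, err); } // log and move on
//   }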
Object.defineProperty(exports, "__esModule", { value: true });
const migrate_1 = require("../db/migrate");
const intelligence_detector_1 = require("../services/intelligence-detector");
const category_crawler_jobs_1 = require("../services/category-crawler-jobs");
// Parse command line args
const args = process.argv.slice(2);
const flags = {
    detection: args.includes('--detection') || args.includes('--all'),
    production: args.includes('--production') || args.includes('--all'),
    sandbox: args.includes('--sandbox') || args.includes('--all'),
    dryRun: args.includes('--dry-run'),
    process: args.includes('--process'),
    help: args.includes('--help') || args.includes('-h'),
    limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
    category: args.find(a => a.startsWith('--category='))?.split('=')[1],
    dispensary: parseInt(args.find(a => a.startsWith('--dispensary='))?.split('=')[1] || '0'),
};
// If no specific flags, default to all
if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
    flags.detection = true;
    flags.production = true;
    flags.sandbox = true;
}
const CATEGORIES = ['product', 'specials', 'brand', 'metadata'];
async function showHelp() {
    console.log(`
Queue Intelligence - Multi-Category Crawler Orchestration

USAGE:
  npx tsx src/scripts/queue-intelligence.ts [OPTIONS]

OPTIONS:
  --detection          Queue dispensaries that need multi-category detection
  --production         Queue per-category production crawls
  --sandbox            Queue per-category sandbox crawls
  --all                Queue all job types (default if no specific flag)
  --process            Process queued jobs instead of just queuing
  --category=CATEGORY  Filter to specific category (product|specials|brand|metadata)
  --dispensary=ID      Process only a specific dispensary
  --dry-run            Show what would be queued without making changes
  --limit=N            Maximum dispensaries to queue per type (default: 10)
  --help, -h           Show this help message

CATEGORIES:
  product   - Product/menu data (Dutchie=production, others=sandbox)
  specials  - Deals and specials (all sandbox for now)
  brand     - Brand intelligence (all sandbox for now)
  metadata  - Categories/taxonomy (all sandbox for now)

EXAMPLES:
  # Queue all dispensaries for appropriate jobs
  npx tsx src/scripts/queue-intelligence.ts

  # Only queue product detection jobs
  npx tsx src/scripts/queue-intelligence.ts --detection --category=product

  # Process sandbox jobs for specials category
  npx tsx src/scripts/queue-intelligence.ts --process --category=specials --limit=5

  # Run full detection for a specific dispensary
  npx tsx src/scripts/queue-intelligence.ts --process --detection --dispensary=123

  # Dry run to see what would be queued
  npx tsx src/scripts/queue-intelligence.ts --dry-run
`);
}
async function queueMultiCategoryDetection() {
    console.log('\n📡 Queueing Multi-Category Detection Jobs...');
    // Find dispensaries that need provider detection for any category:
    // - Any *_provider is null OR
    // - Any *_confidence < 70
    // - has a website URL
    const query = `
    SELECT id, name, website, menu_url,
           product_provider, product_confidence, product_crawler_mode,
           specials_provider, specials_confidence, specials_crawler_mode,
           brand_provider, brand_confidence, brand_crawler_mode,
           metadata_provider, metadata_confidence, metadata_crawler_mode
    FROM dispensaries
    WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
      AND (
        product_provider IS NULL OR product_confidence < 70 OR
        specials_provider IS NULL OR specials_confidence < 70 OR
        brand_provider IS NULL OR brand_confidence < 70 OR
        metadata_provider IS NULL OR metadata_confidence < 70
      )
    ORDER BY
      CASE WHEN product_provider IS NULL THEN 0 ELSE 1 END,
      product_confidence ASC
    LIMIT $1
  `;
    const result = await migrate_1.pool.query(query, [flags.limit]);
    if (flags.dryRun) {
        console.log(`  Would queue ${result.rows.length} dispensaries for multi-category detection:`);
        for (const row of result.rows) {
            const needsDetection = [];
            if (!row.product_provider || row.product_confidence < 70)
                needsDetection.push('product');
            if (!row.specials_provider || row.specials_confidence < 70)
                needsDetection.push('specials');
            if (!row.brand_provider || row.brand_confidence < 70)
                needsDetection.push('brand');
            if (!row.metadata_provider || row.metadata_confidence < 70)
                needsDetection.push('metadata');
            console.log(`    - [${row.id}] ${row.name} (needs: ${needsDetection.join(', ')})`);
        }
        return result.rows.length;
    }
    let queued = 0;
    for (const dispensary of result.rows) {
        try {
            // Create detection jobs for each category that needs it
            for (const category of CATEGORIES) {
                const provider = dispensary[`${category}_provider`];
                const confidence = dispensary[`${category}_confidence`];
                if (!provider || confidence < 70) {
                    await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, category, job_type, status, priority)
             VALUES ($1, $2, 'detection', 'pending', 10)
             ON CONFLICT DO NOTHING`, [dispensary.id, category]);
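                    // Note: a bare ON CONFLICT DO NOTHING only dedupes if
                    // sandbox_crawl_jobs has a unique constraint covering these
                    // columns (presumably something like (dispensary_id, category,
                    // job_type)); without one, re-running this script can enqueue
                    // duplicate detection jobs.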
                }
            }
            console.log(`  ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
            queued++;
        }
        catch (error) {
            console.error(`  ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
        }
    }
    return queued;
}
async function queueCategoryProductionCrawls(category) {
    const categories = category ? [category] : CATEGORIES;
    let totalQueued = 0;
    for (const cat of categories) {
        console.log(`\n🏭 Queueing Production ${cat.toUpperCase()} Crawls...`);
        // For now, only products have production-ready crawlers (Dutchie only)
        if (cat !== 'product') {
            console.log(`  ⏭️  No production crawler for ${cat} yet - skipping`);
            continue;
        }
        // Find dispensaries ready for production crawl
        const query = `
      SELECT id, name, ${cat}_provider as provider, last_${cat}_scan_at as last_scan
      FROM dispensaries
      WHERE ${cat}_provider = 'dutchie'
        AND ${cat}_crawler_mode = 'production'
        AND ${cat}_confidence >= 70
        AND (last_${cat}_scan_at IS NULL OR last_${cat}_scan_at < NOW() - INTERVAL '4 hours')
      ORDER BY
        CASE WHEN last_${cat}_scan_at IS NULL THEN 0 ELSE 1 END,
        last_${cat}_scan_at ASC
      LIMIT $1
    `;
        const result = await migrate_1.pool.query(query, [flags.limit]);
        if (flags.dryRun) {
            console.log(`  Would queue ${result.rows.length} dispensaries for ${cat} production crawl:`);
            for (const row of result.rows) {
                const lastScan = row.last_scan ? new Date(row.last_scan).toISOString() : 'never';
                console.log(`    - [${row.id}] ${row.name} (provider: ${row.provider}, last: ${lastScan})`);
            }
            totalQueued += result.rows.length;
            continue;
        }
        for (const dispensary of result.rows) {
            try {
                // For products, use the existing crawl_jobs table for production
                await migrate_1.pool.query(`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
           SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
                  jsonb_build_object('dispensary_id', $1, 'category', $2, 'source', 'queue-intelligence')
           FROM stores s
           JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
           WHERE d.id = $1
           LIMIT 1`, [dispensary.id, cat]);
                console.log(`  ✓ Queued ${cat} production: [${dispensary.id}] ${dispensary.name}`);
                totalQueued++;
            }
            catch (error) {
                console.error(`  ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
            }
        }
    }
    return totalQueued;
}
async function queueCategorySandboxCrawls(category) {
    const categories = category ? [category] : CATEGORIES;
    let totalQueued = 0;
    for (const cat of categories) {
        console.log(`\n🧪 Queueing Sandbox ${cat.toUpperCase()} Crawls...`);
        // Find dispensaries in sandbox mode for this category
        const query = `
      SELECT d.id, d.name, d.${cat}_provider as provider, d.${cat}_confidence as confidence,
             d.website, d.menu_url
      FROM dispensaries d
      WHERE d.${cat}_crawler_mode = 'sandbox'
        AND d.${cat}_provider IS NOT NULL
        AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
        AND NOT EXISTS (
          SELECT 1 FROM sandbox_crawl_jobs sj
          WHERE sj.dispensary_id = d.id
            AND sj.category = $1
            AND sj.status IN ('pending', 'running')
        )
      ORDER BY d.${cat}_confidence DESC, d.updated_at ASC
      LIMIT $2
    `;
        const result = await migrate_1.pool.query(query, [cat, flags.limit]);
        if (flags.dryRun) {
            console.log(`  Would queue ${result.rows.length} dispensaries for ${cat} sandbox crawl:`);
            for (const row of result.rows) {
                console.log(`    - [${row.id}] ${row.name} (provider: ${row.provider}, confidence: ${row.confidence}%)`);
            }
            totalQueued += result.rows.length;
            continue;
        }
        for (const dispensary of result.rows) {
            try {
                // Create sandbox entry if needed
                const sandboxResult = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, mode, status)
           VALUES ($1, $2, $3, 'template_learning', 'pending')
           ON CONFLICT (dispensary_id, category) WHERE status NOT IN ('moved_to_production', 'failed')
           DO UPDATE SET updated_at = NOW()
           RETURNING id`, [dispensary.id, cat, dispensary.provider]);
                const sandboxId = sandboxResult.rows[0]?.id;
                // Create sandbox job
                await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, job_type, status, priority)
           VALUES ($1, $2, $3, 'crawl', 'pending', 5)`, [dispensary.id, sandboxId, cat]);
                console.log(`  ✓ Queued ${cat} sandbox: [${dispensary.id}] ${dispensary.name} (${dispensary.provider})`);
                totalQueued++;
            }
            catch (error) {
                console.error(`  ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
            }
        }
    }
    return totalQueued;
}
async function processDetectionJobs() {
    console.log('\n🔍 Processing Detection Jobs...');
    // Get pending detection jobs
    const jobs = await migrate_1.pool.query(`SELECT DISTINCT dispensary_id
     FROM sandbox_crawl_jobs
     WHERE job_type = 'detection' AND status = 'pending'
     ${flags.category ? `AND category = $2` : ''}
     ${flags.dispensary ? `AND dispensary_id = $${flags.category ? '3' : '2'}` : ''}
     LIMIT $1`, flags.category
        ? (flags.dispensary ? [flags.limit, flags.category, flags.dispensary] : [flags.limit, flags.category])
        : (flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit]));
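    // The parameter list above is assembled positionally: $1 is always the
    // LIMIT; when --category is set it becomes $2; --dispensary then takes
    // whichever placeholder is next ($2 or $3). The interpolated SQL fragments
    // and the array branches must stay in sync.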
    for (const job of jobs.rows) {
        console.log(`\nProcessing detection for dispensary ${job.dispensary_id}...`);
        try {
            // Get dispensary info
            const dispResult = await migrate_1.pool.query('SELECT id, name, website, menu_url FROM dispensaries WHERE id = $1', [job.dispensary_id]);
            const dispensary = dispResult.rows[0];
            if (!dispensary) {
                console.log(`  ✗ Dispensary not found`);
                continue;
            }
            const websiteUrl = dispensary.website || dispensary.menu_url;
            if (!websiteUrl) {
                console.log(`  ✗ No website URL`);
                continue;
            }
            // Mark jobs as running
            await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW()
         WHERE dispensary_id = $1 AND job_type = 'detection' AND status = 'pending'`, [job.dispensary_id]);
            // Run multi-category detection
            console.log(`  Detecting providers for ${dispensary.name}...`);
            const detection = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl, { timeout: 45000 });
            // Update all categories
            await (0, intelligence_detector_1.updateAllCategoryProviders)(job.dispensary_id, detection);
            // Mark jobs as completed
            await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'completed', completed_at = NOW(),
         result_summary = $1
         WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`, [JSON.stringify({
                    product: { provider: detection.product.provider, confidence: detection.product.confidence },
                    specials: { provider: detection.specials.provider, confidence: detection.specials.confidence },
                    brand: { provider: detection.brand.provider, confidence: detection.brand.confidence },
                    metadata: { provider: detection.metadata.provider, confidence: detection.metadata.confidence },
                }), job.dispensary_id]);
            console.log(`  ✓ Detection complete:`);
            console.log(`    Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
            console.log(`    Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
            console.log(`    Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
            console.log(`    Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
        }
        catch (error) {
            console.log(`  ✗ Error: ${error.message}`);
            await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1
         WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`, [error.message, job.dispensary_id]);
        }
    }
}
async function processCrawlJobs() {
    const categories = flags.category ? [flags.category] : CATEGORIES;
    for (const cat of categories) {
        console.log(`\n⚙️  Processing ${cat.toUpperCase()} Crawl Jobs...\n`);
        // Process sandbox jobs for this category
        if (flags.sandbox || !flags.production) {
            await (0, category_crawler_jobs_1.processCategorySandboxJobs)(cat, flags.limit);
        }
        // Process production jobs for this category
        if (flags.production && cat === 'product') {
            // Get pending production crawls
            const prodJobs = await migrate_1.pool.query(`SELECT d.id
         FROM dispensaries d
         WHERE d.product_provider = 'dutchie'
           AND d.product_crawler_mode = 'production'
           AND d.product_confidence >= 70
           ${flags.dispensary ? 'AND d.id = $2' : ''}
         LIMIT $1`, flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit]);
            for (const job of prodJobs.rows) {
                console.log(`Processing production ${cat} crawl for dispensary ${job.id}...`);
                const result = await (0, category_crawler_jobs_1.runCrawlProductsJob)(job.id);
                console.log(`  ${result.success ? '✓' : '✗'} ${result.message}`);
            }
        }
    }
}
async function processSpecificDispensary() {
    if (!flags.dispensary)
        return;
    console.log(`\n🎯 Processing Dispensary ${flags.dispensary}...\n`);
    const dispResult = await migrate_1.pool.query('SELECT * FROM dispensaries WHERE id = $1', [flags.dispensary]);
    if (dispResult.rows.length === 0) {
        console.log('Dispensary not found');
        return;
    }
    const dispensary = dispResult.rows[0];
    console.log(`Name: ${dispensary.name}`);
    console.log(`Website: ${dispensary.website || dispensary.menu_url || 'none'}`);
    console.log('');
    if (flags.detection) {
        console.log('Running multi-category detection...');
        const websiteUrl = dispensary.website || dispensary.menu_url;
        if (websiteUrl) {
            const detection = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
            await (0, intelligence_detector_1.updateAllCategoryProviders)(flags.dispensary, detection);
            console.log('Detection results:');
            console.log(`  Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
            console.log(`  Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
            console.log(`  Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
            console.log(`  Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
        }
    }
    if (flags.production) {
        console.log('\nRunning production crawls...');
        const results = await (0, category_crawler_jobs_1.runAllCategoryProductionCrawls)(flags.dispensary);
        console.log(`  ${results.summary}`);
    }
    if (flags.sandbox) {
        console.log('\nRunning sandbox crawls...');
        const results = await (0, category_crawler_jobs_1.runAllCategorySandboxCrawls)(flags.dispensary);
        console.log(`  ${results.summary}`);
    }
}
async function showStats() {
    console.log('\n📊 Multi-Category Intelligence Stats:');
    // Per-category stats
    for (const cat of CATEGORIES) {
        const stats = await migrate_1.pool.query(`
      SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE ${cat}_provider IS NULL) as no_provider,
        COUNT(*) FILTER (WHERE ${cat}_provider = 'dutchie') as dutchie,
        COUNT(*) FILTER (WHERE ${cat}_provider = 'treez') as treez,
        COUNT(*) FILTER (WHERE ${cat}_provider NOT IN ('dutchie', 'treez', 'unknown') AND ${cat}_provider IS NOT NULL) as other,
        COUNT(*) FILTER (WHERE ${cat}_provider = 'unknown') as unknown,
        COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'production') as production,
        COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'sandbox') as sandbox,
        AVG(${cat}_confidence) as avg_confidence
      FROM dispensaries
    `);
        const s = stats.rows[0];
        console.log(`
  ${cat.toUpperCase()}:
    Providers: Dutchie=${s.dutchie}, Treez=${s.treez}, Other=${s.other}, Unknown=${s.unknown}, None=${s.no_provider}
    Modes: Production=${s.production}, Sandbox=${s.sandbox}
    Avg Confidence: ${Math.round(s.avg_confidence || 0)}%`);
    }
    // Job stats per category
    console.log('\n  Sandbox Jobs by Category:');
    const jobStats = await migrate_1.pool.query(`
      SELECT
        category,
        COUNT(*) FILTER (WHERE status = 'pending') as pending,
        COUNT(*) FILTER (WHERE status = 'running') as running,
        COUNT(*) FILTER (WHERE status = 'completed') as completed,
        COUNT(*) FILTER (WHERE status = 'failed') as failed
      FROM sandbox_crawl_jobs
      GROUP BY category
      ORDER BY category
    `);
    for (const row of jobStats.rows) {
        console.log(`    ${row.category}: pending=${row.pending}, running=${row.running}, completed=${row.completed}, failed=${row.failed}`);
    }
}
async function main() {
    if (flags.help) {
        await showHelp();
        process.exit(0);
    }
    console.log('═══════════════════════════════════════════════════════');
    console.log('  Multi-Category Intelligence Queue Manager');
    console.log('═══════════════════════════════════════════════════════');
    if (flags.dryRun) {
        console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
    }
    if (flags.category) {
        console.log(`\n📌 Filtering to category: ${flags.category}\n`);
    }
    try {
        // Show current stats first
        await showStats();
        // If specific dispensary specified, process it directly
        if (flags.dispensary && flags.process) {
            await processSpecificDispensary();
        }
        else if (flags.process) {
            // Process mode - run jobs
            if (flags.detection) {
                await processDetectionJobs();
            }
            await processCrawlJobs();
        }
        else {
            // Queuing mode
            let totalQueued = 0;
            if (flags.detection) {
                totalQueued += await queueMultiCategoryDetection();
            }
            if (flags.production) {
                totalQueued += await queueCategoryProductionCrawls(flags.category);
            }
            if (flags.sandbox) {
                totalQueued += await queueCategorySandboxCrawls(flags.category);
            }
            console.log('\n═══════════════════════════════════════════════════════');
            console.log(`  Total queued: ${totalQueued}`);
            console.log('═══════════════════════════════════════════════════════\n');
        }
        // Show updated stats
        if (!flags.dryRun) {
            await showStats();
        }
    }
    catch (error) {
        console.error('Fatal error:', error);
        process.exit(1);
    }
    finally {
        await migrate_1.pool.end();
    }
}
main();
125
backend/dist/scripts/run-dutchie-scrape.js
vendored
Normal file
@@ -0,0 +1,125 @@
"use strict";
|
||||
/**
|
||||
* Run Dutchie GraphQL Scrape
|
||||
*
|
||||
* This script demonstrates the full pipeline:
|
||||
* 1. Puppeteer navigates to Dutchie menu
|
||||
* 2. GraphQL responses are intercepted
|
||||
* 3. Products are normalized to our schema
|
||||
* 4. Products are upserted to database
|
||||
* 5. Derived views (brands, categories, specials) are automatically updated
|
||||
*/
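// Rough sketch of step 2 (GraphQL interception), assuming scrapeDutchieMenu
// listens on Puppeteer response events; the URL check and the collect()
// helper are illustrative, not this project's actual implementation:
//
//   page.on('response', async (response) => {
//     if (response.url().includes('/graphql') && response.ok()) {
//       const body = await response.json();   // parsed GraphQL payload
//       if (body?.data?.filteredProducts) {
//         collect(body.data.filteredProducts.products);
//       }
//     }
//   });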
Object.defineProperty(exports, "__esModule", { value: true });
const pg_1 = require("pg");
const dutchie_graphql_1 = require("../scrapers/dutchie-graphql");
const DATABASE_URL = process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
async function main() {
    const pool = new pg_1.Pool({ connectionString: DATABASE_URL });
    try {
        console.log('='.repeat(80));
        console.log('DUTCHIE GRAPHQL SCRAPER - FULL PIPELINE TEST');
        console.log('='.repeat(80));
        console.log(`Database: ${DATABASE_URL.replace(/:[^:@]+@/, ':***@')}`);
        // Configuration
        const storeId = 1; // Deeply Rooted
        const menuUrl = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
        console.log(`\nStore ID: ${storeId}`);
        console.log(`Menu URL: ${menuUrl}`);
        console.log('\n' + '-'.repeat(80));
        // Run the scrape
        console.log('\n🚀 Starting scrape...\n');
        const result = await (0, dutchie_graphql_1.scrapeDutchieMenu)(pool, storeId, menuUrl);
        console.log('\n' + '-'.repeat(80));
        console.log('📊 SCRAPE RESULTS:');
        console.log('-'.repeat(80));
        console.log(`  Success: ${result.success}`);
        console.log(`  Products Found: ${result.productsFound}`);
        console.log(`  Inserted: ${result.inserted}`);
        console.log(`  Updated: ${result.updated}`);
        if (result.error) {
            console.log(`  Error: ${result.error}`);
        }
        // Query derived views to show the result
        if (result.success) {
            console.log('\n' + '-'.repeat(80));
            console.log('📈 DERIVED DATA (from products table):');
            console.log('-'.repeat(80));
            // Brands
            const brandsResult = await pool.query(`
        SELECT brand_name, product_count, min_price, max_price
        FROM derived_brands
        WHERE store_id = $1
        ORDER BY product_count DESC
        LIMIT 5
      `, [storeId]);
            console.log('\nTop 5 Brands:');
            brandsResult.rows.forEach(row => {
                console.log(`  - ${row.brand_name}: ${row.product_count} products ($${row.min_price} - $${row.max_price})`);
            });
            // Specials
            const specialsResult = await pool.query(`
        SELECT name, brand, rec_price, rec_special_price, discount_percent
        FROM current_specials
        WHERE store_id = $1
        LIMIT 5
      `, [storeId]);
            console.log('\nTop 5 Specials:');
            if (specialsResult.rows.length === 0) {
                console.log('  (No specials found - is_on_special may not be populated yet)');
            }
            else {
                specialsResult.rows.forEach(row => {
                    console.log(`  - ${row.name} (${row.brand}): $${row.rec_price} → $${row.rec_special_price} (${row.discount_percent}% off)`);
                });
            }
            // Categories
            const categoriesResult = await pool.query(`
        SELECT category_name, product_count
        FROM derived_categories
        WHERE store_id = $1
        ORDER BY product_count DESC
        LIMIT 5
      `, [storeId]);
            console.log('\nTop 5 Categories:');
            if (categoriesResult.rows.length === 0) {
                console.log('  (No categories found - subcategory may not be populated yet)');
            }
            else {
                categoriesResult.rows.forEach(row => {
                    console.log(`  - ${row.category_name}: ${row.product_count} products`);
                });
            }
            // Sample product
            const sampleResult = await pool.query(`
        SELECT name, brand, subcategory, rec_price, rec_special_price, is_on_special, thc_percentage, status
        FROM products
        WHERE store_id = $1 AND subcategory IS NOT NULL
        ORDER BY updated_at DESC
        LIMIT 1
      `, [storeId]);
            if (sampleResult.rows.length > 0) {
                const sample = sampleResult.rows[0];
                console.log('\nSample Product (with new fields):');
                console.log(`  Name: ${sample.name}`);
                console.log(`  Brand: ${sample.brand}`);
                console.log(`  Category: ${sample.subcategory}`);
                console.log(`  Price: $${sample.rec_price}`);
                console.log(`  Sale Price: ${sample.rec_special_price ? `$${sample.rec_special_price}` : 'N/A'}`);
                console.log(`  On Special: ${sample.is_on_special}`);
                console.log(`  THC: ${sample.thc_percentage}%`);
                console.log(`  Status: ${sample.status}`);
            }
        }
        console.log('\n' + '='.repeat(80));
        console.log('✅ SCRAPE COMPLETE');
        console.log('='.repeat(80));
    }
    catch (error) {
        console.error('\n❌ Error:', error.message);
        throw error;
    }
    finally {
        await pool.end();
    }
}
main().catch(console.error);
279
backend/dist/scripts/scrape-all-active.js
vendored
Normal file
@@ -0,0 +1,279 @@
"use strict";
|
||||
/**
|
||||
* Scrape ALL active products via direct GraphQL pagination
|
||||
* This is more reliable than category navigation
|
||||
*/
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
||||
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
||||
const pg_1 = require("pg");
|
||||
const dutchie_graphql_1 = require("../scrapers/dutchie-graphql");
|
||||
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
||||
const DATABASE_URL = process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
|
||||
const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
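// This is an Apollo "persisted query" hash: rather than sending the query
// text, the client sends extensions.persistedQuery = { version: 1, sha256Hash }
// and the server looks the query up by its SHA-256. Because the hash is
// derived from the exact query text, it will need re-capturing if Dutchie
// ever changes the FilteredProducts operation.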
async function scrapeAllProducts(menuUrl, storeId) {
    const pool = new pg_1.Pool({ connectionString: DATABASE_URL });
    const browser = await puppeteer_extra_1.default.launch({
        headless: 'new',
        args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    try {
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
        console.log('Loading menu to establish session...');
        await page.goto(menuUrl, {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        await new Promise((r) => setTimeout(r, 3000));
        const dispensaryId = await page.evaluate(() => window.reactEnv?.dispensaryId);
        console.log('Dispensary ID:', dispensaryId);
        // Paginate through all products
        const allProducts = [];
        let pageNum = 0;
        const perPage = 100;
        console.log('\nFetching all products via paginated GraphQL...');
        while (true) {
            const result = await page.evaluate(async (dispId, hash, page, perPage) => {
                const variables = {
                    includeEnterpriseSpecials: false,
                    productsFilter: {
                        dispensaryId: dispId,
                        pricingType: 'rec',
                        Status: 'Active',
                        types: [],
                        useCache: false,
                        isDefaultSort: true,
                        sortBy: 'popularSortIdx',
                        sortDirection: 1,
                        bypassOnlineThresholds: true,
                        isKioskMenu: false,
                        removeProductsBelowOptionThresholds: false,
                    },
                    page,
                    perPage,
                };
                const qs = new URLSearchParams({
                    operationName: 'FilteredProducts',
                    variables: JSON.stringify(variables),
                    extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
                });
                const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
                    method: 'GET',
                    headers: {
                        'content-type': 'application/json',
                        'apollographql-client-name': 'Marketplace (production)',
                    },
                    credentials: 'include',
                });
                const json = await resp.json();
                return {
                    products: json?.data?.filteredProducts?.products || [],
                    totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount,
                };
            }, dispensaryId, GRAPHQL_HASH, pageNum, perPage);
            if (result.products.length === 0) {
                break;
            }
            allProducts.push(...result.products);
            console.log(`Page ${pageNum}: ${result.products.length} products (total so far: ${allProducts.length}/${result.totalCount})`);
            pageNum++;
            // Safety limit
            if (pageNum > 50) {
                console.log('Reached page limit');
                break;
            }
        }
        console.log(`\nTotal products fetched: ${allProducts.length}`);
        // Normalize and upsert
        console.log('\nNormalizing and upserting to database...');
        const normalized = allProducts.map(dutchie_graphql_1.normalizeDutchieProduct);
        const client = await pool.connect();
        let inserted = 0;
        let updated = 0;
        try {
            await client.query('BEGIN');
            for (const product of normalized) {
                const result = await client.query(`
          INSERT INTO products (
            store_id, external_id, slug, name, enterprise_product_id,
            brand, brand_external_id, brand_logo_url,
            subcategory, strain_type, canonical_category,
            price, rec_price, med_price, rec_special_price, med_special_price,
            is_on_special, special_name, discount_percent, special_data,
            sku, inventory_quantity, inventory_available, is_below_threshold, status,
            thc_percentage, cbd_percentage, cannabinoids,
            weight_mg, net_weight_value, net_weight_unit, options, raw_options,
            image_url, additional_images,
            is_featured, medical_only, rec_only,
            source_created_at, source_updated_at,
            description, raw_data,
            dutchie_url, last_seen_at, updated_at
          )
          VALUES (
            $1, $2, $3, $4, $5,
            $6, $7, $8,
            $9, $10, $11,
            $12, $13, $14, $15, $16,
            $17, $18, $19, $20,
            $21, $22, $23, $24, $25,
            $26, $27, $28,
            $29, $30, $31, $32, $33,
            $34, $35,
            $36, $37, $38,
            $39, $40,
            $41, $42,
            '', NOW(), NOW()
          )
          ON CONFLICT (store_id, slug) DO UPDATE SET
            name = EXCLUDED.name,
            enterprise_product_id = EXCLUDED.enterprise_product_id,
            brand = EXCLUDED.brand,
            brand_external_id = EXCLUDED.brand_external_id,
            brand_logo_url = EXCLUDED.brand_logo_url,
            subcategory = EXCLUDED.subcategory,
            strain_type = EXCLUDED.strain_type,
            canonical_category = EXCLUDED.canonical_category,
            price = EXCLUDED.price,
            rec_price = EXCLUDED.rec_price,
            med_price = EXCLUDED.med_price,
            rec_special_price = EXCLUDED.rec_special_price,
            med_special_price = EXCLUDED.med_special_price,
            is_on_special = EXCLUDED.is_on_special,
            special_name = EXCLUDED.special_name,
            discount_percent = EXCLUDED.discount_percent,
            special_data = EXCLUDED.special_data,
            sku = EXCLUDED.sku,
            inventory_quantity = EXCLUDED.inventory_quantity,
            inventory_available = EXCLUDED.inventory_available,
            is_below_threshold = EXCLUDED.is_below_threshold,
            status = EXCLUDED.status,
            thc_percentage = EXCLUDED.thc_percentage,
            cbd_percentage = EXCLUDED.cbd_percentage,
            cannabinoids = EXCLUDED.cannabinoids,
            weight_mg = EXCLUDED.weight_mg,
            net_weight_value = EXCLUDED.net_weight_value,
            net_weight_unit = EXCLUDED.net_weight_unit,
            options = EXCLUDED.options,
            raw_options = EXCLUDED.raw_options,
            image_url = EXCLUDED.image_url,
            additional_images = EXCLUDED.additional_images,
            is_featured = EXCLUDED.is_featured,
            medical_only = EXCLUDED.medical_only,
            rec_only = EXCLUDED.rec_only,
            source_created_at = EXCLUDED.source_created_at,
            source_updated_at = EXCLUDED.source_updated_at,
            description = EXCLUDED.description,
            raw_data = EXCLUDED.raw_data,
            last_seen_at = NOW(),
            updated_at = NOW()
          RETURNING (xmax = 0) AS was_inserted
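          -- (xmax = 0) is a common Postgres upsert heuristic: a freshly
          -- inserted row has xmax = 0, while a row rewritten by the
          -- DO UPDATE branch carries a non-zero xmax.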
        `, [
                    storeId,
                    product.external_id,
                    product.slug,
                    product.name,
                    product.enterprise_product_id,
                    product.brand,
                    product.brand_external_id,
                    product.brand_logo_url,
                    product.subcategory,
                    product.strain_type,
                    product.canonical_category,
                    product.price,
                    product.rec_price,
                    product.med_price,
                    product.rec_special_price,
                    product.med_special_price,
                    product.is_on_special,
                    product.special_name,
                    product.discount_percent,
                    product.special_data ? JSON.stringify(product.special_data) : null,
                    product.sku,
                    product.inventory_quantity,
                    product.inventory_available,
                    product.is_below_threshold,
                    product.status,
                    product.thc_percentage,
                    product.cbd_percentage,
                    product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
                    product.weight_mg,
                    product.net_weight_value,
                    product.net_weight_unit,
                    product.options,
                    product.raw_options,
                    product.image_url,
                    product.additional_images,
                    product.is_featured,
                    product.medical_only,
                    product.rec_only,
                    product.source_created_at,
                    product.source_updated_at,
                    product.description,
                    product.raw_data ? JSON.stringify(product.raw_data) : null,
                ]);
                if (result.rows[0]?.was_inserted) {
                    inserted++;
                }
                else {
                    updated++;
                }
            }
            await client.query('COMMIT');
        }
        catch (error) {
            await client.query('ROLLBACK');
            throw error;
        }
        finally {
            client.release();
        }
        console.log(`\nDatabase: ${inserted} inserted, ${updated} updated`);
        // Show summary stats
        const stats = await pool.query(`
      SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE is_on_special) as specials,
        COUNT(DISTINCT brand) as brands,
        COUNT(DISTINCT subcategory) as categories
      FROM products WHERE store_id = $1
    `, [storeId]);
        console.log('\nStore summary:');
        console.log(`  Total products: ${stats.rows[0].total}`);
        console.log(`  On special: ${stats.rows[0].specials}`);
        console.log(`  Unique brands: ${stats.rows[0].brands}`);
        console.log(`  Categories: ${stats.rows[0].categories}`);
        return {
            success: true,
            totalProducts: allProducts.length,
            inserted,
            updated,
        };
    }
    finally {
        await browser.close();
        await pool.end();
    }
}
// Run
const menuUrl = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
const storeId = parseInt(process.argv[3] || '1', 10);
console.log('='.repeat(60));
console.log('DUTCHIE GRAPHQL FULL SCRAPE');
console.log('='.repeat(60));
console.log(`Menu URL: ${menuUrl}`);
console.log(`Store ID: ${storeId}`);
console.log('');
scrapeAllProducts(menuUrl, storeId)
    .then((result) => {
    console.log('\n' + '='.repeat(60));
    console.log('COMPLETE');
    console.log(JSON.stringify(result, null, 2));
})
    .catch((error) => {
    console.error('Error:', error.message);
    process.exit(1);
});
169
backend/dist/scripts/test-dutchie-e2e.js
vendored
Normal file
@@ -0,0 +1,169 @@
"use strict";
|
||||
/**
|
||||
* Test script: End-to-end Dutchie GraphQL → DB → Dashboard flow
|
||||
*
|
||||
* This demonstrates the complete data pipeline:
|
||||
* 1. Fetch one product from Dutchie GraphQL via Puppeteer
|
||||
* 2. Normalize it to our schema
|
||||
* 3. Show the mapping
|
||||
*/
|
||||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
||||
if (k2 === undefined) k2 = k;
|
||||
var desc = Object.getOwnPropertyDescriptor(m, k);
|
||||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
||||
desc = { enumerable: true, get: function() { return m[k]; } };
|
||||
}
|
||||
Object.defineProperty(o, k2, desc);
|
||||
}) : (function(o, m, k, k2) {
|
||||
if (k2 === undefined) k2 = k;
|
||||
o[k2] = m[k];
|
||||
}));
|
||||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
||||
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
||||
}) : function(o, v) {
|
||||
o["default"] = v;
|
||||
});
|
||||
var __importStar = (this && this.__importStar) || (function () {
|
||||
var ownKeys = function(o) {
|
||||
ownKeys = Object.getOwnPropertyNames || function (o) {
|
||||
var ar = [];
|
||||
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
||||
return ar;
|
||||
};
|
||||
return ownKeys(o);
|
||||
};
|
||||
return function (mod) {
|
||||
if (mod && mod.__esModule) return mod;
|
||||
var result = {};
|
||||
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
||||
__setModuleDefault(result, mod);
|
||||
return result;
|
||||
};
|
||||
})();
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
const dutchie_graphql_1 = require("../scrapers/dutchie-graphql");
|
||||
const fs = __importStar(require("fs"));
|
||||
// Load the captured sample product from schema capture
|
||||
const capturedData = JSON.parse(fs.readFileSync('/tmp/dutchie-schema-capture.json', 'utf-8'));
|
||||
const sampleProduct = capturedData.sampleProduct;
|
||||
console.log('='.repeat(80));
|
||||
console.log('DUTCHIE GRAPHQL → DATABASE MAPPING DEMONSTRATION');
|
||||
console.log('='.repeat(80));
|
||||
console.log('\n📥 RAW DUTCHIE GRAPHQL PRODUCT:');
|
||||
console.log('-'.repeat(80));
|
||||
// Show key fields from raw product
|
||||
const keyRawFields = {
|
||||
'_id': sampleProduct._id,
|
||||
'Name': sampleProduct.Name,
|
||||
'cName': sampleProduct.cName,
|
||||
'brandName': sampleProduct.brandName,
|
||||
'brand.id': sampleProduct.brand?.id,
|
||||
'type': sampleProduct.type,
|
||||
'subcategory': sampleProduct.subcategory,
|
||||
'strainType': sampleProduct.strainType,
|
||||
'Prices': sampleProduct.Prices,
|
||||
'recPrices': sampleProduct.recPrices,
|
||||
'recSpecialPrices': sampleProduct.recSpecialPrices,
|
||||
'special': sampleProduct.special,
|
||||
'specialData.saleSpecials[0].specialName': sampleProduct.specialData?.saleSpecials?.[0]?.specialName,
|
||||
'specialData.saleSpecials[0].discount': sampleProduct.specialData?.saleSpecials?.[0]?.discount,
|
||||
'THCContent.range[0]': sampleProduct.THCContent?.range?.[0],
|
||||
'CBDContent.range[0]': sampleProduct.CBDContent?.range?.[0],
|
||||
'Status': sampleProduct.Status,
|
||||
'Image': sampleProduct.Image,
|
||||
'POSMetaData.canonicalSKU': sampleProduct.POSMetaData?.canonicalSKU,
|
||||
'POSMetaData.children[0].quantity': sampleProduct.POSMetaData?.children?.[0]?.quantity,
|
||||
'POSMetaData.children[0].quantityAvailable': sampleProduct.POSMetaData?.children?.[0]?.quantityAvailable,
|
||||
};
|
||||
Object.entries(keyRawFields).forEach(([key, value]) => {
|
||||
console.log(` ${key}: ${JSON.stringify(value)}`);
|
||||
});
|
||||
console.log('\n📤 NORMALIZED DATABASE ROW:');
|
||||
console.log('-'.repeat(80));
|
||||
// Normalize the product
|
||||
const normalized = (0, dutchie_graphql_1.normalizeDutchieProduct)(sampleProduct);
|
||||
// Show the normalized result (excluding raw_data for readability)
|
||||
const { raw_data, cannabinoids, special_data, ...displayFields } = normalized;
|
||||
Object.entries(displayFields).forEach(([key, value]) => {
|
||||
if (value !== undefined && value !== null) {
|
||||
console.log(` ${key}: ${JSON.stringify(value)}`);
|
||||
}
|
||||
});
|
||||
console.log('\n🔗 FIELD MAPPING:');
|
||||
console.log('-'.repeat(80));
|
||||
const fieldMappings = [
|
||||
['_id / id', 'external_id', sampleProduct._id, normalized.external_id],
|
||||
['Name', 'name', sampleProduct.Name, normalized.name],
|
||||
['cName', 'slug', sampleProduct.cName, normalized.slug],
|
||||
['brandName', 'brand', sampleProduct.brandName, normalized.brand],
|
||||
['brand.id', 'brand_external_id', sampleProduct.brand?.id, normalized.brand_external_id],
|
||||
['subcategory', 'subcategory', sampleProduct.subcategory, normalized.subcategory],
|
||||
['strainType', 'strain_type', sampleProduct.strainType, normalized.strain_type],
|
||||
['recPrices[0]', 'rec_price', sampleProduct.recPrices?.[0], normalized.rec_price],
|
||||
['recSpecialPrices[0]', 'rec_special_price', sampleProduct.recSpecialPrices?.[0], normalized.rec_special_price],
|
||||
['special', 'is_on_special', sampleProduct.special, normalized.is_on_special],
|
||||
['specialData...specialName', 'special_name', sampleProduct.specialData?.saleSpecials?.[0]?.specialName?.substring(0, 40) + '...', normalized.special_name?.substring(0, 40) + '...'],
|
||||
['THCContent.range[0]', 'thc_percentage', sampleProduct.THCContent?.range?.[0], normalized.thc_percentage],
|
||||
['CBDContent.range[0]', 'cbd_percentage', sampleProduct.CBDContent?.range?.[0], normalized.cbd_percentage],
|
||||
['Status', 'status', sampleProduct.Status, normalized.status],
|
||||
['Image', 'image_url', sampleProduct.Image?.substring(0, 50) + '...', normalized.image_url?.substring(0, 50) + '...'],
|
||||
['POSMetaData.canonicalSKU', 'sku', sampleProduct.POSMetaData?.canonicalSKU, normalized.sku],
|
||||
];
|
||||
console.log(' GraphQL Field → DB Column | Value');
|
||||
console.log(' ' + '-'.repeat(75));
|
||||
fieldMappings.forEach(([gqlField, dbCol, gqlVal, dbVal]) => {
|
||||
const gqlStr = String(gqlField).padEnd(30);
|
||||
const dbStr = String(dbCol).padEnd(20);
|
||||
console.log(` ${gqlStr} → ${dbStr} | ${JSON.stringify(dbVal)}`);
|
||||
});
|
||||
console.log('\n📊 SQL INSERT STATEMENT:');
|
||||
console.log('-'.repeat(80));
|
||||
// Generate example SQL
|
||||
const sqlExample = `
|
||||
INSERT INTO products (
|
||||
store_id, external_id, slug, name,
|
||||
brand, brand_external_id,
|
||||
subcategory, strain_type,
|
||||
rec_price, rec_special_price,
|
||||
is_on_special, special_name, discount_percent,
|
||||
thc_percentage, cbd_percentage,
|
||||
status, image_url, sku
|
||||
) VALUES (
|
||||
1, -- store_id (Deeply Rooted)
|
||||
'${normalized.external_id}', -- external_id
|
||||
'${normalized.slug}', -- slug
|
||||
'${normalized.name}', -- name
|
||||
'${normalized.brand}', -- brand
|
||||
'${normalized.brand_external_id}', -- brand_external_id
|
||||
'${normalized.subcategory}', -- subcategory
|
||||
'${normalized.strain_type}', -- strain_type
|
||||
${normalized.rec_price}, -- rec_price
|
||||
${normalized.rec_special_price}, -- rec_special_price
|
||||
${normalized.is_on_special}, -- is_on_special
|
||||
'${normalized.special_name?.substring(0, 50)}...', -- special_name
|
||||
${normalized.discount_percent || 'NULL'}, -- discount_percent
|
||||
${normalized.thc_percentage}, -- thc_percentage
|
||||
${normalized.cbd_percentage}, -- cbd_percentage
|
||||
'${normalized.status}', -- status
|
||||
'${normalized.image_url}', -- image_url
|
||||
'${normalized.sku}' -- sku
|
||||
)
|
||||
ON CONFLICT (store_id, slug) DO UPDATE SET ...;
|
||||
`;
|
||||
console.log(sqlExample);
|
||||
console.log('\n✅ SUMMARY:');
|
||||
console.log('-'.repeat(80));
|
||||
console.log(` Product: ${normalized.name}`);
|
||||
console.log(` Brand: ${normalized.brand}`);
|
||||
console.log(` Category: ${normalized.subcategory}`);
|
||||
console.log(` Price: $${normalized.rec_price} → $${normalized.rec_special_price} (${normalized.discount_percent}% off)`);
|
||||
console.log(` THC: ${normalized.thc_percentage}%`);
|
||||
console.log(` Status: ${normalized.status}`);
|
||||
console.log(` On Special: ${normalized.is_on_special}`);
|
||||
console.log(` SKU: ${normalized.sku}`);
|
||||
console.log('\n🎯 DERIVED VIEWS (computed from products table):');
|
||||
console.log('-'.repeat(80));
|
||||
console.log(' - current_specials: Products where is_on_special = true');
|
||||
console.log(' - derived_brands: Aggregated by brand name with counts/prices');
|
||||
console.log(' - derived_categories: Aggregated by subcategory');
|
||||
console.log('\nAll views are computed from the single products table - no separate tables needed!');
179
backend/dist/scripts/test-dutchie-graphql.js
vendored
Normal file
@@ -0,0 +1,179 @@
"use strict";
|
||||
/**
|
||||
* Test script to validate Dutchie GraphQL API access and capture response structure
|
||||
*/
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
// @ts-ignore - node-fetch type declaration not installed
|
||||
const node_fetch_1 = __importDefault(require("node-fetch"));
|
||||
const GRAPHQL_HASHES = {
|
||||
ConsumerDispensaries: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
|
||||
GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
|
||||
FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
|
||||
MenuFiltersV2: '2f0b3233b8a2426b391649ca3f0f7a5d43b9aefd683f6286d7261a2517e3568e',
|
||||
FilteredSpecials: '0dfb85a4fc138c55a076d4d11bf6d1a25f7cbd511428e1cf5a5b863b3eb23f25',
|
||||
};
|
||||
async function fetchProducts(dispensaryId, page = 0, perPage = 25) {
|
||||
const session = 'crawlsy-session-' + Date.now();
|
||||
const variables = {
|
||||
includeEnterpriseSpecials: false,
|
||||
productsFilter: {
|
||||
dispensaryId,
|
||||
pricingType: 'rec',
|
||||
Status: null, // null to include all (in-stock and out-of-stock)
|
||||
types: [],
|
||||
useCache: true,
|
||||
isDefaultSort: true,
|
||||
sortBy: 'popularSortIdx',
|
||||
sortDirection: 1,
|
||||
bypassOnlineThresholds: true,
|
||||
isKioskMenu: false,
|
||||
removeProductsBelowOptionThresholds: false
|
||||
},
|
||||
page,
|
||||
perPage
|
||||
};
|
||||
const qs = new URLSearchParams({
|
||||
operationName: 'FilteredProducts',
|
||||
variables: JSON.stringify(variables),
|
||||
extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.FilteredProducts } })
|
||||
});
|
||||
const res = await (0, node_fetch_1.default)(`https://dutchie.com/api-3/graphql?${qs.toString()}`, {
|
||||
headers: {
|
||||
'x-dutchie-session': session,
|
||||
'apollographql-client-name': 'Marketplace (production)',
|
||||
'content-type': 'application/json',
|
||||
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
||||
}
|
||||
});
|
||||
if (!res.ok) {
|
||||
const text = await res.text();
|
||||
console.error('HTTP Status:', res.status);
|
||||
console.error('Response:', text.substring(0, 500));
|
||||
throw new Error(`HTTP ${res.status}: ${text.substring(0, 200)}`);
|
||||
}
|
||||
return res.json();
|
||||
}
|
||||
async function resolveDispensaryId(cName) {
|
||||
const session = 'crawlsy-session-' + Date.now();
|
||||
const variables = { input: { dispensaryId: cName } };
|
||||
const qs = new URLSearchParams({
|
||||
operationName: 'GetAddressBasedDispensaryData',
|
||||
variables: JSON.stringify(variables),
|
||||
extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.GetAddressBasedDispensaryData } })
|
||||
});
|
||||
const res = await (0, node_fetch_1.default)(`https://dutchie.com/graphql?${qs.toString()}`, {
|
||||
headers: {
|
||||
'x-dutchie-session': session,
|
||||
'apollographql-client-name': 'Marketplace (production)',
|
||||
'content-type': 'application/json',
|
||||
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
}
|
||||
});
|
||||
if (!res.ok) {
|
||||
console.error('Failed to resolve dispensary ID:', res.status);
|
||||
return null;
|
||||
}
|
||||
const data = await res.json();
|
||||
return data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId || null;
|
||||
}
|
||||
function enumerateFields(obj, prefix = '') {
|
||||
const fields = [];
|
||||
for (const [key, value] of Object.entries(obj)) {
|
||||
const path = prefix ? `${prefix}.${key}` : key;
|
||||
if (value === null) {
|
||||
fields.push(`${path}: null`);
|
||||
}
|
||||
else if (Array.isArray(value)) {
|
||||
fields.push(`${path}: Array[${value.length}]`);
|
||||
if (value.length > 0 && typeof value[0] === 'object') {
|
||||
const subFields = enumerateFields(value[0], `${path}[0]`);
|
||||
fields.push(...subFields);
|
||||
}
|
||||
}
|
||||
else if (typeof value === 'object') {
|
||||
fields.push(`${path}: Object`);
|
||||
const subFields = enumerateFields(value, path);
|
||||
fields.push(...subFields);
|
||||
}
|
||||
else {
|
||||
const typeStr = typeof value;
|
||||
const preview = String(value).substring(0, 50);
|
||||
fields.push(`${path}: ${typeStr} = "${preview}"`);
|
||||
}
|
||||
}
|
||||
return fields;
|
||||
}
|
||||
async function main() {
|
||||
console.log('='.repeat(80));
|
||||
console.log('DUTCHIE GRAPHQL API TEST');
|
||||
console.log('='.repeat(80));
|
||||
const cName = 'AZ-Deeply-Rooted';
|
||||
// Step 1: Resolve dispensary ID
|
||||
console.log(`\n1. Resolving dispensary ID for "${cName}"...`);
|
||||
const dispensaryId = await resolveDispensaryId(cName);
|
||||
const finalDispensaryId = dispensaryId || '6405ef617056e8014d79101b'; // Fallback to known ID
|
||||
if (!dispensaryId) {
|
||||
console.log(' Failed to resolve via API, using hardcoded ID: 6405ef617056e8014d79101b');
|
||||
}
|
||||
console.log(` Final ID: ${finalDispensaryId}`);
|
||||
// Step 2: Fetch first page of products
|
||||
console.log('\n2. Fetching products (page 0, perPage 5)...');
|
||||
const result = await fetchProducts(finalDispensaryId, 0, 5);
|
||||
if (result.errors) {
|
||||
console.error('\nGraphQL Errors:');
|
||||
console.error(JSON.stringify(result.errors, null, 2));
|
||||
return;
|
||||
}
|
||||
const products = result?.data?.filteredProducts?.products || [];
|
||||
console.log(` Found ${products.length} products in this page`);
|
||||
if (products.length === 0) {
|
||||
console.log('No products returned. Full response:');
|
||||
console.log(JSON.stringify(result, null, 2));
|
||||
return;
|
||||
}
|
||||
// Step 3: Enumerate all fields from first product
|
||||
console.log('\n3. PRODUCT FIELD STRUCTURE (from first product):');
|
||||
console.log('-'.repeat(80));
|
||||
const product = products[0];
|
||||
const fields = enumerateFields(product);
|
||||
fields.forEach(f => console.log(` ${f}`));
|
||||
// Step 4: Show full sample product JSON
|
||||
console.log('\n4. FULL SAMPLE PRODUCT JSON:');
|
||||
console.log('-'.repeat(80));
|
||||
console.log(JSON.stringify(product, null, 2));
|
||||
// Step 5: Summary of key fields for schema design
|
||||
console.log('\n5. KEY FIELDS FOR SCHEMA DESIGN:');
|
||||
console.log('-'.repeat(80));
|
||||
const keyFields = [
|
||||
{ field: 'id', value: product.id },
|
||||
{ field: 'name', value: product.name },
|
||||
{ field: 'slug', value: product.slug },
|
||||
{ field: 'brand', value: product.brand },
|
||||
{ field: 'brandId', value: product.brandId },
|
||||
{ field: 'type', value: product.type },
|
||||
{ field: 'category', value: product.category },
|
||||
{ field: 'subcategory', value: product.subcategory },
|
||||
{ field: 'strainType', value: product.strainType },
|
||||
{ field: 'THCContent', value: product.THCContent },
|
||||
{ field: 'CBDContent', value: product.CBDContent },
|
||||
{ field: 'description', value: product.description?.substring(0, 100) + '...' },
|
||||
{ field: 'image', value: product.image },
|
||||
{ field: 'options.length', value: product.options?.length },
|
||||
{ field: 'pricing', value: product.pricing },
|
||||
{ field: 'terpenes.length', value: product.terpenes?.length },
|
||||
{ field: 'effects.length', value: product.effects?.length },
|
||||
];
|
||||
keyFields.forEach(({ field, value }) => {
|
||||
console.log(` ${field}: ${JSON.stringify(value)}`);
|
||||
});
|
||||
// Step 6: Show an option (variant) if available
|
||||
if (product.options && product.options.length > 0) {
|
||||
console.log('\n6. SAMPLE OPTION/VARIANT:');
|
||||
console.log('-'.repeat(80));
|
||||
console.log(JSON.stringify(product.options[0], null, 2));
|
||||
}
|
||||
}
|
||||
main().catch(console.error);
84
backend/dist/scripts/test-status-filter.js
vendored
Normal file
@@ -0,0 +1,84 @@
"use strict";
|
||||
/**
|
||||
* Test different Status filter values in Dutchie GraphQL
|
||||
*/
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
||||
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
||||
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
||||
const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
|
||||
async function main() {
|
||||
const browser = await puppeteer_extra_1.default.launch({
|
||||
headless: 'new',
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||
});
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
|
||||
console.log('Loading menu...');
|
||||
await page.goto('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted', {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 60000,
|
||||
});
|
||||
await new Promise((r) => setTimeout(r, 3000));
|
||||
const dispensaryId = await page.evaluate(() => window.reactEnv?.dispensaryId);
|
||||
console.log('Dispensary ID:', dispensaryId);
|
||||
// Test different status values
|
||||
const testCases = [
|
||||
{ label: 'Active', status: 'Active', includeStatus: true },
|
||||
{ label: 'Inactive', status: 'Inactive', includeStatus: true },
|
||||
{ label: 'null', status: null, includeStatus: true },
|
||||
{ label: 'omitted', status: null, includeStatus: false },
|
||||
];
|
||||
for (const testCase of testCases) {
|
||||
const result = await page.evaluate(async (dispId, hash, status, includeStatus) => {
|
||||
const filter = {
|
||||
dispensaryId: dispId,
|
||||
pricingType: 'rec',
|
||||
types: [],
|
||||
useCache: false,
|
||||
isDefaultSort: true,
|
||||
sortBy: 'popularSortIdx',
|
||||
sortDirection: 1,
|
||||
bypassOnlineThresholds: true,
|
||||
isKioskMenu: false,
|
||||
removeProductsBelowOptionThresholds: false,
|
||||
};
|
||||
if (includeStatus) {
|
||||
filter.Status = status;
|
||||
}
|
||||
const variables = {
|
||||
includeEnterpriseSpecials: false,
|
||||
productsFilter: filter,
|
||||
page: 0,
|
||||
perPage: 100,
|
||||
};
|
||||
const qs = new URLSearchParams({
|
||||
operationName: 'FilteredProducts',
|
||||
variables: JSON.stringify(variables),
|
||||
extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
|
||||
});
|
||||
const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
|
||||
method: 'GET',
|
||||
headers: {
|
||||
'content-type': 'application/json',
|
||||
'apollographql-client-name': 'Marketplace (production)',
|
||||
},
|
||||
credentials: 'include',
|
||||
});
|
||||
const json = await resp.json();
|
||||
const products = json?.data?.filteredProducts?.products || [];
|
||||
return {
|
||||
count: products.length,
|
||||
totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount,
|
||||
sampleStatus: products[0]?.Status,
|
||||
statuses: [...new Set(products.map((p) => p.Status))],
|
||||
};
|
||||
}, dispensaryId, GRAPHQL_HASH, testCase.status, testCase.includeStatus);
|
||||
console.log(`Status ${testCase.label}: Products=${result.count}, Total=${result.totalCount}, Statuses=${JSON.stringify(result.statuses)}`);
|
||||
}
|
||||
await browser.close();
|
||||
}
|
||||
main().catch(console.error);
|
||||
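For quick experiments, the same persisted query can be sketched with plain Node 18+ fetch instead of a headless browser. This is an illustrative assumption: outside a real browser session Dutchie's bot protection may reject the request, which is why the script above drives it through puppeteer-extra with the stealth plugin.

async function fetchFilteredProducts(dispensaryId, hash) {
    // Same persisted-query GET shape as the in-page call above.
    const variables = {
        includeEnterpriseSpecials: false,
        productsFilter: { dispensaryId, pricingType: 'rec', types: [], useCache: false },
        page: 0,
        perPage: 100,
    };
    const qs = new URLSearchParams({
        operationName: 'FilteredProducts',
        variables: JSON.stringify(variables),
        extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
    });
    const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
        headers: { 'content-type': 'application/json' },
    });
    return resp.json();
}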
201
backend/dist/services/availability.js
vendored
Normal file
@@ -0,0 +1,201 @@
"use strict";
|
||||
/**
|
||||
* Availability Service
|
||||
*
|
||||
* Normalizes product availability from various menu providers and tracks
|
||||
* state transitions for inventory analytics.
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.normalizeAvailability = normalizeAvailability;
|
||||
exports.extractAvailabilityHints = extractAvailabilityHints;
|
||||
exports.hintsToAvailability = hintsToAvailability;
|
||||
exports.aggregateAvailability = aggregateAvailability;
|
||||
// Threshold for considering stock as "limited"
|
||||
const LIMITED_THRESHOLD = 5;
|
||||
/**
|
||||
* Normalize availability from a Dutchie product
|
||||
*
|
||||
* Dutchie products can have various availability indicators:
|
||||
* - potencyAmount.quantity: explicit stock count
|
||||
* - status: sometimes includes stock status
|
||||
* - variants[].quantity: stock per variant
|
||||
* - isInStock / inStock: boolean flags
|
||||
*/
|
||||
function normalizeAvailability(dutchieProduct) {
|
||||
const raw = {};
|
||||
// Collect raw availability data for debugging
|
||||
if (dutchieProduct.potencyAmount?.quantity !== undefined) {
|
||||
raw.potencyQuantity = dutchieProduct.potencyAmount.quantity;
|
||||
}
|
||||
if (dutchieProduct.status !== undefined) {
|
||||
raw.status = dutchieProduct.status;
|
||||
}
|
||||
if (dutchieProduct.isInStock !== undefined) {
|
||||
raw.isInStock = dutchieProduct.isInStock;
|
||||
}
|
||||
if (dutchieProduct.inStock !== undefined) {
|
||||
raw.inStock = dutchieProduct.inStock;
|
||||
}
|
||||
if (dutchieProduct.variants?.length) {
|
||||
const variantQuantities = dutchieProduct.variants
|
||||
.filter((v) => v.quantity !== undefined)
|
||||
.map((v) => ({ option: v.option, quantity: v.quantity }));
|
||||
if (variantQuantities.length) {
|
||||
raw.variantQuantities = variantQuantities;
|
||||
}
|
||||
}
|
||||
// Try to extract quantity
|
||||
let quantity = null;
|
||||
// Check potencyAmount.quantity first (most reliable for Dutchie)
|
||||
if (typeof dutchieProduct.potencyAmount?.quantity === 'number') {
|
||||
quantity = dutchieProduct.potencyAmount.quantity;
|
||||
}
|
||||
// Sum variant quantities if available
|
||||
else if (dutchieProduct.variants?.length) {
|
||||
const totalVariantQty = dutchieProduct.variants.reduce((sum, v) => {
|
||||
return sum + (typeof v.quantity === 'number' ? v.quantity : 0);
|
||||
}, 0);
|
||||
if (totalVariantQty > 0) {
|
||||
quantity = totalVariantQty;
|
||||
}
|
||||
}
|
||||
// Determine status
|
||||
let status = 'unknown';
|
||||
// Explicit boolean flags take precedence
|
||||
if (dutchieProduct.isInStock === false || dutchieProduct.inStock === false) {
|
||||
status = 'out_of_stock';
|
||||
}
|
||||
else if (dutchieProduct.isInStock === true || dutchieProduct.inStock === true) {
|
||||
status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
|
||||
}
|
||||
// Check status string
|
||||
else if (typeof dutchieProduct.status === 'string') {
|
||||
const statusLower = dutchieProduct.status.toLowerCase();
|
||||
if (statusLower.includes('out') || statusLower.includes('unavailable')) {
|
||||
status = 'out_of_stock';
|
||||
}
|
||||
else if (statusLower.includes('limited') || statusLower.includes('low')) {
|
||||
status = 'limited';
|
||||
}
|
||||
else if (statusLower.includes('in') || statusLower.includes('available')) {
|
||||
status = 'in_stock';
|
||||
}
|
||||
}
|
||||
// Infer from quantity
|
||||
else if (quantity !== null) {
|
||||
if (quantity === 0) {
|
||||
status = 'out_of_stock';
|
||||
}
|
||||
else if (quantity <= LIMITED_THRESHOLD) {
|
||||
status = 'limited';
|
||||
}
|
||||
else {
|
||||
status = 'in_stock';
|
||||
}
|
||||
}
|
||||
return { status, quantity, raw };
|
||||
}
|
||||
/**
|
||||
* Extract availability hints from page content or product card HTML
|
||||
*
|
||||
* Used for sandbox provider scraping where we don't have structured data
|
||||
*/
|
||||
function extractAvailabilityHints(pageContent, productElement) {
|
||||
const hints = {};
|
||||
const content = (productElement || pageContent).toLowerCase();
|
||||
// Check for out-of-stock indicators
|
||||
const oosPatterns = [
|
||||
'out of stock',
|
||||
'out-of-stock',
|
||||
'sold out',
|
||||
'soldout',
|
||||
'unavailable',
|
||||
'not available',
|
||||
'coming soon',
|
||||
'notify me'
|
||||
];
|
||||
hints.hasOutOfStockBadge = oosPatterns.some(p => content.includes(p));
|
||||
// Check for limited stock indicators
|
||||
const limitedPatterns = [
|
||||
'limited stock',
|
||||
'limited quantity',
|
||||
'low stock',
|
||||
'only \\d+ left',
|
||||
'few remaining',
|
||||
'almost gone',
|
||||
'selling fast'
|
||||
];
|
||||
hints.hasLimitedBadge = limitedPatterns.some(p => {
|
||||
if (p.includes('\\d')) {
|
||||
return new RegExp(p, 'i').test(content);
|
||||
}
|
||||
return content.includes(p);
|
||||
});
|
||||
// Check for in-stock indicators
|
||||
const inStockPatterns = [
|
||||
'in stock',
|
||||
'in-stock',
|
||||
'add to cart',
|
||||
'add to bag',
|
||||
'buy now',
|
||||
'available'
|
||||
];
|
||||
hints.hasInStockBadge = inStockPatterns.some(p => content.includes(p));
|
||||
// Try to extract quantity text
|
||||
const qtyMatch = content.match(/(\d+)\s*(left|remaining|in stock|available)/i);
|
||||
if (qtyMatch) {
|
||||
hints.quantityText = qtyMatch[0];
|
||||
}
|
||||
// Look for explicit stock text
|
||||
const stockTextMatch = content.match(/(out of stock|in stock|low stock|limited|sold out)[^<]*/i);
|
||||
if (stockTextMatch) {
|
||||
hints.stockText = stockTextMatch[0].trim();
|
||||
}
|
||||
return hints;
|
||||
}
|
||||
/**
|
||||
* Convert availability hints to normalized availability
|
||||
*/
|
||||
function hintsToAvailability(hints) {
|
||||
let status = 'unknown';
|
||||
let quantity = null;
|
||||
// Extract quantity if present
|
||||
if (hints.quantityText) {
|
||||
const match = hints.quantityText.match(/(\d+)/);
|
||||
if (match) {
|
||||
quantity = parseInt(match[1], 10);
|
||||
}
|
||||
}
|
||||
// Determine status from hints
|
||||
if (hints.hasOutOfStockBadge) {
|
||||
status = 'out_of_stock';
|
||||
}
|
||||
else if (hints.hasLimitedBadge) {
|
||||
status = 'limited';
|
||||
}
|
||||
else if (hints.hasInStockBadge) {
|
||||
status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
|
||||
}
|
||||
return {
|
||||
status,
|
||||
quantity,
|
||||
raw: hints
|
||||
};
|
||||
}
|
||||
function aggregateAvailability(products) {
|
||||
const counts = {
|
||||
in_stock: 0,
|
||||
out_of_stock: 0,
|
||||
limited: 0,
|
||||
unknown: 0,
|
||||
changed: 0
|
||||
};
|
||||
for (const product of products) {
|
||||
const status = product.availability_status || 'unknown';
|
||||
counts[status]++;
|
||||
if (product.previous_status && product.previous_status !== status) {
|
||||
counts.changed++;
|
||||
}
|
||||
}
|
||||
return counts;
|
||||
}
|
||||
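A minimal usage sketch of the service above (the sample product shape and the require path are illustrative assumptions, not captured Dutchie data):

const { normalizeAvailability, extractAvailabilityHints, hintsToAvailability } = require('./availability');

// Structured path: boolean flag plus per-variant quantities.
const sample = { isInStock: true, variants: [{ option: '1g', quantity: 3 }] };
console.log(normalizeAvailability(sample));
// -> { status: 'limited', quantity: 3, raw: { isInStock: true, variantQuantities: [...] } }

// Unstructured path: badge text scraped from a product card.
const hints = extractAvailabilityHints('<button>Add to cart</button> Only 2 left');
console.log(hintsToAvailability(hints));
// -> { status: 'limited', quantity: 2, raw: { ...hint flags... } }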
1098
backend/dist/services/category-crawler-jobs.js
vendored
Normal file
File diff suppressed because it is too large
114
backend/dist/services/category-discovery.js
vendored
@@ -4,9 +4,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.discoverCategories = discoverCategories;
const puppeteer_1 = __importDefault(require("puppeteer"));
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const migrate_1 = require("../db/migrate");
const logger_1 = require("./logger");
const age_gate_1 = require("../utils/age-gate");
const dutchie_1 = require("../scrapers/templates/dutchie");
// Apply stealth plugin
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
const DUTCHIE_CATEGORIES = [
    { name: 'Shop', slug: 'shop' },
    { name: 'Flower', slug: 'flower', parentSlug: 'shop' },
@@ -19,6 +24,18 @@ const DUTCHIE_CATEGORIES = [
    { name: 'Brands', slug: 'brands' },
    { name: 'Specials', slug: 'specials' }
];
const CURALEAF_CATEGORIES = [
    { name: 'Shop', slug: 'shop' },
    { name: 'Flower', slug: 'flower', parentSlug: 'shop' },
    { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
    { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
    { name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
    { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
    { name: 'Tinctures', slug: 'tinctures', parentSlug: 'shop' },
    { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
    { name: 'Capsules', slug: 'capsules', parentSlug: 'shop' },
    { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' }
];
async function makePageStealthy(page) {
    await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
@@ -72,7 +89,7 @@ async function discoverCategories(storeId) {
    const store = storeResult.rows[0];
    const baseUrl = store.dutchie_url;
    // Launch browser to check page source
    browser = await puppeteer_1.default.launch({
    browser = await puppeteer_extra_1.default.launch({
        headless: 'new',
        args: [
            '--no-sandbox',
@@ -85,9 +102,14 @@ async function discoverCategories(storeId) {
    await makePageStealthy(page);
    await page.setViewport({ width: 1920, height: 1080 });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
    // Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites)
    const state = (0, age_gate_1.detectStateFromUrl)(baseUrl);
    await (0, age_gate_1.setAgeGateCookies)(page, baseUrl, state);
    logger_1.logger.info('categories', `Loading page to detect menu type: ${baseUrl}`);
    await page.goto(baseUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
    await page.waitForTimeout(3000);
    // If age gate still appears, try to bypass it
    await (0, age_gate_1.bypassAgeGate)(page, state);
    // Detect if it's a Dutchie menu by inspecting page source
    const isDutchie = await isDutchieMenu(page);
    await browser.close();
@@ -97,8 +119,9 @@ async function discoverCategories(storeId) {
        await createDutchieCategories(storeId, store);
    }
    else {
        logger_1.logger.info('categories', `⚠️ Non-Dutchie menu detected, would need custom scraping logic`);
        throw new Error('Non-Dutchie menus not yet supported. Please contact support.');
        // Fallback: Use standard cannabis categories for non-Dutchie sites
        logger_1.logger.info('categories', `Non-Dutchie menu detected, using standard cannabis categories for ${store.name}`);
        await createCuraleafCategories(storeId, store);
    }
}
catch (error) {
@@ -116,24 +139,24 @@ async function createDutchieCategories(storeId, store) {
    const baseUrl = store.dutchie_url;
    for (const category of DUTCHIE_CATEGORIES) {
        let categoryUrl;
        // Use Dutchie template to build correct category URLs
        if (category.parentSlug) {
            // Subcategory: /embedded-menu/{slug}/shop/flower
            categoryUrl = `${baseUrl}/${category.parentSlug}/${category.slug}`;
            // Subcategory: Use template's buildCategoryUrl (e.g., /products/flower)
            categoryUrl = dutchie_1.dutchieTemplate.buildCategoryUrl(baseUrl, category.name);
        }
        else {
            // Top-level: /embedded-menu/{slug}/shop
            // Top-level: Use base URL with slug
            categoryUrl = `${baseUrl}/${category.slug}`;
        }
        const path = category.parentSlug ? `${category.parentSlug}/${category.slug}` : category.slug;
        if (!category.parentSlug) {
            // Create parent category
            await client.query(`
        INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
        VALUES ($1, $2, $3, $4, $5, true, NULL)
        ON CONFLICT (store_id, slug)
        DO UPDATE SET name = $2, dutchie_url = $4, path = $5
        INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
        VALUES ($1, $2, $3, $4, true)
        ON CONFLICT (store_id, slug)
        DO UPDATE SET name = $2, dutchie_url = $4
        RETURNING id
      `, [storeId, category.name, category.slug, categoryUrl, path]);
      `, [storeId, category.name, category.slug, categoryUrl]);
            logger_1.logger.info('categories', `📁 ${category.name}`);
        }
        else {
@@ -143,13 +166,12 @@ async function createDutchieCategories(storeId, store) {
        WHERE store_id = $1 AND slug = $2
      `, [storeId, category.parentSlug]);
            if (parentResult.rows.length > 0) {
                const parentId = parentResult.rows[0].id;
                await client.query(`
          INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
          VALUES ($1, $2, $3, $4, $5, true, $6)
          INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
          VALUES ($1, $2, $3, $4, true)
          ON CONFLICT (store_id, slug)
          DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6
        `, [storeId, category.name, category.slug, categoryUrl, path, parentId]);
          DO UPDATE SET name = $2, dutchie_url = $4
        `, [storeId, category.name, category.slug, categoryUrl]);
                logger_1.logger.info('categories', ` └── ${category.name}`);
            }
        }
@@ -166,3 +188,59 @@ async function createDutchieCategories(storeId, store) {
        client.release();
    }
}
async function createCuraleafCategories(storeId, store) {
    const client = await migrate_1.pool.connect();
    try {
        await client.query('BEGIN');
        logger_1.logger.info('categories', `Creating predefined Curaleaf category structure`);
        const baseUrl = store.dutchie_url;
        for (const category of CURALEAF_CATEGORIES) {
            let categoryUrl;
            if (category.parentSlug) {
                // Subcategory URL - Curaleaf uses pattern like: /stores/{store-slug}/{category}
                categoryUrl = `${baseUrl}?category=${category.slug}`;
            }
            else {
                // Top-level category
                categoryUrl = baseUrl;
            }
            if (!category.parentSlug) {
                // Create parent category
                await client.query(`
          INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
          VALUES ($1, $2, $3, $4, true)
          ON CONFLICT (store_id, slug)
          DO UPDATE SET name = $2, dutchie_url = $4
          RETURNING id
        `, [storeId, category.name, category.slug, categoryUrl]);
                logger_1.logger.info('categories', `📁 ${category.name}`);
            }
            else {
                // Create subcategory
                const parentResult = await client.query(`
          SELECT id FROM categories
          WHERE store_id = $1 AND slug = $2
        `, [storeId, category.parentSlug]);
                if (parentResult.rows.length > 0) {
                    await client.query(`
            INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
            VALUES ($1, $2, $3, $4, true)
            ON CONFLICT (store_id, slug)
            DO UPDATE SET name = $2, dutchie_url = $4
          `, [storeId, category.name, category.slug, categoryUrl]);
                    logger_1.logger.info('categories', ` └── ${category.name}`);
                }
            }
        }
        await client.query('COMMIT');
        logger_1.logger.info('categories', `✅ Created ${CURALEAF_CATEGORIES.length} Curaleaf categories successfully`);
    }
    catch (error) {
        await client.query('ROLLBACK');
        logger_1.logger.error('categories', `Failed to create Curaleaf categories: ${error}`);
        throw error;
    }
    finally {
        client.release();
    }
}
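Usage sketch for the discovery entry point above (assumed wiring; the route or job that invokes it lives elsewhere in the backend):

const { discoverCategories } = require('./category-discovery');

// Detects the menu type for the store, then upserts the matching
// predefined category tree (Dutchie or the Curaleaf-style fallback).
discoverCategories(42)
    .then(() => console.log('category discovery complete'))
    .catch((err) => console.error('category discovery failed:', err));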
536
backend/dist/services/crawl-scheduler.js
vendored
Normal file
@@ -0,0 +1,536 @@
"use strict";
|
||||
/**
|
||||
* Crawl Scheduler Service
|
||||
*
|
||||
* This service manages crawl scheduling using a job queue approach.
|
||||
* It does NOT modify the crawler - it only TRIGGERS the existing crawler.
|
||||
*
|
||||
* Features:
|
||||
* - Global schedule: crawl all stores every N hours
|
||||
* - Daily special run: 12:01 AM local store time
|
||||
* - Per-store schedule overrides
|
||||
* - Job queue for tracking pending/running crawls
|
||||
*/
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.getGlobalSchedule = getGlobalSchedule;
|
||||
exports.updateGlobalSchedule = updateGlobalSchedule;
|
||||
exports.getStoreScheduleStatuses = getStoreScheduleStatuses;
|
||||
exports.getStoreSchedule = getStoreSchedule;
|
||||
exports.updateStoreSchedule = updateStoreSchedule;
|
||||
exports.createCrawlJob = createCrawlJob;
|
||||
exports.getPendingJobs = getPendingJobs;
|
||||
exports.claimJob = claimJob;
|
||||
exports.completeJob = completeJob;
|
||||
exports.getRecentJobs = getRecentJobs;
|
||||
exports.getAllRecentJobs = getAllRecentJobs;
|
||||
exports.checkAndCreateScheduledJobs = checkAndCreateScheduledJobs;
|
||||
exports.checkAndCreateDailySpecialJobs = checkAndCreateDailySpecialJobs;
|
||||
exports.processJobs = processJobs;
|
||||
exports.processOrchestrator = processOrchestrator;
|
||||
exports.setSchedulerMode = setSchedulerMode;
|
||||
exports.getSchedulerMode = getSchedulerMode;
|
||||
exports.startCrawlScheduler = startCrawlScheduler;
|
||||
exports.stopCrawlScheduler = stopCrawlScheduler;
|
||||
exports.restartCrawlScheduler = restartCrawlScheduler;
|
||||
exports.triggerManualCrawl = triggerManualCrawl;
|
||||
exports.triggerAllStoresCrawl = triggerAllStoresCrawl;
|
||||
exports.cancelJob = cancelJob;
|
||||
const node_cron_1 = __importDefault(require("node-cron"));
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const scraper_v2_1 = require("../scraper-v2");
|
||||
const store_crawl_orchestrator_1 = require("./store-crawl-orchestrator");
|
||||
// Worker identification
|
||||
const WORKER_ID = `worker-${process.pid}-${Date.now()}`;
|
||||
let schedulerCronJob = null;
|
||||
let jobProcessorRunning = false;
|
||||
let orchestratorProcessorRunning = false;
|
||||
// Scheduler mode: 'legacy' uses job queue, 'orchestrator' uses intelligent orchestration
|
||||
let schedulerMode = 'orchestrator';
|
||||
// ============================================
|
||||
// Schedule Management
|
||||
// ============================================
|
||||
/**
|
||||
* Get global schedule settings
|
||||
*/
|
||||
async function getGlobalSchedule() {
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT * FROM crawler_schedule ORDER BY id
|
||||
`);
|
||||
return result.rows;
|
||||
}
|
||||
/**
|
||||
* Update global schedule setting
|
||||
*/
|
||||
async function updateGlobalSchedule(scheduleType, updates) {
|
||||
const setClauses = [];
|
||||
const values = [];
|
||||
let paramIndex = 1;
|
||||
if (updates.enabled !== undefined) {
|
||||
setClauses.push(`enabled = $${paramIndex++}`);
|
||||
values.push(updates.enabled);
|
||||
}
|
||||
if (updates.interval_hours !== undefined) {
|
||||
setClauses.push(`interval_hours = $${paramIndex++}`);
|
||||
values.push(updates.interval_hours);
|
||||
}
|
||||
if (updates.run_time !== undefined) {
|
||||
setClauses.push(`run_time = $${paramIndex++}`);
|
||||
values.push(updates.run_time);
|
||||
}
|
||||
values.push(scheduleType);
|
||||
const result = await migrate_1.pool.query(`
|
||||
UPDATE crawler_schedule
|
||||
SET ${setClauses.join(', ')}
|
||||
WHERE schedule_type = $${paramIndex}
|
||||
RETURNING *
|
||||
`, values);
|
||||
return result.rows[0];
|
||||
}
|
||||
/**
|
||||
* Get all store schedule statuses
|
||||
*/
|
||||
async function getStoreScheduleStatuses() {
|
||||
const result = await migrate_1.pool.query(`SELECT * FROM crawl_schedule_status ORDER BY priority DESC, store_name`);
|
||||
return result.rows;
|
||||
}
|
||||
/**
|
||||
* Get or create per-store schedule override
|
||||
*/
|
||||
async function getStoreSchedule(storeId) {
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT * FROM store_crawl_schedule WHERE store_id = $1
|
||||
`, [storeId]);
|
||||
if (result.rows.length > 0) {
|
||||
return result.rows[0];
|
||||
}
|
||||
// Return default (use global)
|
||||
return {
|
||||
store_id: storeId,
|
||||
enabled: true,
|
||||
interval_hours: null,
|
||||
daily_special_enabled: true,
|
||||
daily_special_time: null,
|
||||
priority: 0
|
||||
};
|
||||
}
|
||||
/**
|
||||
* Update per-store schedule override
|
||||
*/
|
||||
async function updateStoreSchedule(storeId, updates) {
|
||||
const result = await migrate_1.pool.query(`
|
||||
INSERT INTO store_crawl_schedule (store_id, enabled, interval_hours, daily_special_enabled, daily_special_time, priority)
|
||||
VALUES ($1, $2, $3, $4, $5, $6)
|
||||
ON CONFLICT (store_id) DO UPDATE SET
|
||||
enabled = COALESCE(EXCLUDED.enabled, store_crawl_schedule.enabled),
|
||||
interval_hours = EXCLUDED.interval_hours,
|
||||
daily_special_enabled = COALESCE(EXCLUDED.daily_special_enabled, store_crawl_schedule.daily_special_enabled),
|
||||
daily_special_time = EXCLUDED.daily_special_time,
|
||||
priority = COALESCE(EXCLUDED.priority, store_crawl_schedule.priority),
|
||||
updated_at = NOW()
|
||||
RETURNING *
|
||||
`, [
|
||||
storeId,
|
||||
updates.enabled ?? true,
|
||||
updates.interval_hours ?? null,
|
||||
updates.daily_special_enabled ?? true,
|
||||
updates.daily_special_time ?? null,
|
||||
updates.priority ?? 0
|
||||
]);
|
||||
return result.rows[0];
|
||||
}
|
||||
// ============================================
|
||||
// Job Queue Management
|
||||
// ============================================
|
||||
/**
|
||||
* Create a new crawl job
|
||||
*/
|
||||
async function createCrawlJob(storeId, jobType = 'full_crawl', triggerType = 'scheduled', scheduledAt = new Date(), priority = 0) {
|
||||
// Check if there's already a pending or running job for this store
|
||||
const existing = await migrate_1.pool.query(`
|
||||
SELECT id FROM crawl_jobs
|
||||
WHERE store_id = $1 AND status IN ('pending', 'running')
|
||||
LIMIT 1
|
||||
`, [storeId]);
|
||||
if (existing.rows.length > 0) {
|
||||
console.log(`Skipping job creation for store ${storeId} - already has pending/running job`);
|
||||
return existing.rows[0];
|
||||
}
|
||||
const result = await migrate_1.pool.query(`
|
||||
INSERT INTO crawl_jobs (store_id, job_type, trigger_type, scheduled_at, priority, status)
|
||||
VALUES ($1, $2, $3, $4, $5, 'pending')
|
||||
RETURNING *
|
||||
`, [storeId, jobType, triggerType, scheduledAt, priority]);
|
||||
console.log(`Created crawl job ${result.rows[0].id} for store ${storeId} (${triggerType})`);
|
||||
return result.rows[0];
|
||||
}
|
||||
/**
|
||||
* Get pending jobs ready to run
|
||||
*/
|
||||
async function getPendingJobs(limit = 5) {
|
||||
const result = await migrate_1.pool.query(`
|
||||
SELECT cj.*, s.name as store_name
|
||||
FROM crawl_jobs cj
|
||||
JOIN stores s ON s.id = cj.store_id
|
||||
WHERE cj.status = 'pending'
|
||||
AND cj.scheduled_at <= NOW()
|
||||
ORDER BY cj.priority DESC, cj.scheduled_at ASC
|
||||
LIMIT $1
|
||||
`, [limit]);
|
||||
return result.rows;
|
||||
}
|
||||
/**
|
||||
* Claim a job for processing
|
||||
*/
|
||||
async function claimJob(jobId) {
|
||||
const result = await migrate_1.pool.query(`
|
||||
UPDATE crawl_jobs
|
||||
SET status = 'running', started_at = NOW(), worker_id = $2
|
||||
WHERE id = $1 AND status = 'pending'
|
||||
RETURNING id
|
||||
`, [jobId, WORKER_ID]);
|
||||
return result.rows.length > 0;
|
||||
}
|
||||
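// Note on claimJob: the UPDATE above is an optimistic compare-and-set. It only
// matches while the row is still 'pending', so when two workers race for the
// same job exactly one gets a row back (claimed) and the other sees an empty
// result and moves on. Usage sketch:
//
//   if (await claimJob(job.id)) {
//       // this worker owns the job and may run it
//   }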
/**
 * Complete a job
 */
async function completeJob(jobId, success, results) {
    await migrate_1.pool.query(`
    UPDATE crawl_jobs
    SET
      status = $2,
      completed_at = NOW(),
      products_found = $3,
      error_message = $4
    WHERE id = $1
  `, [
        jobId,
        success ? 'completed' : 'failed',
        results?.products_found ?? null,
        results?.error_message ?? null
    ]);
}
/**
 * Get recent jobs for a store
 */
async function getRecentJobs(storeId, limit = 10) {
    const result = await migrate_1.pool.query(`
    SELECT * FROM crawl_jobs
    WHERE store_id = $1
    ORDER BY created_at DESC
    LIMIT $2
  `, [storeId, limit]);
    return result.rows;
}
/**
 * Get all recent jobs
 */
async function getAllRecentJobs(limit = 50) {
    const result = await migrate_1.pool.query(`
    SELECT cj.*, s.name as store_name, s.slug as store_slug
    FROM crawl_jobs cj
    JOIN stores s ON s.id = cj.store_id
    ORDER BY cj.created_at DESC
    LIMIT $1
  `, [limit]);
    return result.rows;
}
// ============================================
// Scheduler Logic
// ============================================
/**
 * Check which stores are due for a crawl and create jobs
 */
async function checkAndCreateScheduledJobs() {
    console.log('Checking for stores due for crawl...');
    // Get global schedule settings
    const globalSchedule = await migrate_1.pool.query(`
    SELECT * FROM crawler_schedule WHERE schedule_type = 'global_interval'
  `);
    if (globalSchedule.rows.length === 0 || !globalSchedule.rows[0].enabled) {
        console.log('Global scheduler is disabled');
        return 0;
    }
    const intervalHours = globalSchedule.rows[0].interval_hours || 4;
    // Find stores due for crawl
    const result = await migrate_1.pool.query(`
    SELECT
      s.id,
      s.name,
      s.timezone,
      s.last_scraped_at,
      COALESCE(scs.enabled, TRUE) as schedule_enabled,
      COALESCE(scs.interval_hours, $1) as interval_hours,
      COALESCE(scs.priority, 0) as priority
    FROM stores s
    LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
    WHERE s.active = TRUE
      AND s.scrape_enabled = TRUE
      AND COALESCE(scs.enabled, TRUE) = TRUE
      AND (
        s.last_scraped_at IS NULL
        OR s.last_scraped_at < NOW() - (COALESCE(scs.interval_hours, $1) || ' hours')::INTERVAL
      )
      AND NOT EXISTS (
        SELECT 1 FROM crawl_jobs cj
        WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
      )
    ORDER BY COALESCE(scs.priority, 0) DESC, s.last_scraped_at ASC NULLS FIRST
  `, [intervalHours]);
    let jobsCreated = 0;
    for (const store of result.rows) {
        try {
            await createCrawlJob(store.id, 'full_crawl', 'scheduled', new Date(), store.priority);
            jobsCreated++;
            console.log(`Scheduled crawl job for: ${store.name}`);
        }
        catch (error) {
            console.error(`Failed to create job for store ${store.name}:`, error);
        }
    }
    console.log(`Created ${jobsCreated} scheduled crawl jobs`);
    return jobsCreated;
}
/**
 * Check for daily special runs (12:01 AM local time)
 */
async function checkAndCreateDailySpecialJobs() {
    console.log('Checking for daily special runs...');
    // Get daily special schedule
    const dailySchedule = await migrate_1.pool.query(`
    SELECT * FROM crawler_schedule WHERE schedule_type = 'daily_special'
  `);
    if (dailySchedule.rows.length === 0 || !dailySchedule.rows[0].enabled) {
        console.log('Daily special scheduler is disabled');
        return 0;
    }
    const targetTime = dailySchedule.rows[0].run_time || '00:01';
    // Find stores where it's currently the target time in their local timezone
    // and they haven't had a daily special run today
    const result = await migrate_1.pool.query(`
    SELECT
      s.id,
      s.name,
      s.timezone,
      COALESCE(scs.daily_special_enabled, TRUE) as daily_special_enabled,
      COALESCE(scs.daily_special_time, $1::TIME) as daily_special_time,
      COALESCE(scs.priority, 0) as priority
    FROM stores s
    LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
    WHERE s.active = TRUE
      AND s.scrape_enabled = TRUE
      AND COALESCE(scs.daily_special_enabled, TRUE) = TRUE
      -- Check if current time in store timezone matches the target time (within 2 minutes)
      AND ABS(
        EXTRACT(EPOCH FROM (
          (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::TIME
          - COALESCE(scs.daily_special_time, $1::TIME)
        ))
      ) < 120 -- within 2 minutes
      -- Ensure we haven't already created a daily_special job today for this store
      AND NOT EXISTS (
        SELECT 1 FROM crawl_jobs cj
        WHERE cj.store_id = s.id
          AND cj.trigger_type = 'daily_special'
          AND cj.created_at > (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::DATE
      )
      AND NOT EXISTS (
        SELECT 1 FROM crawl_jobs cj
        WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
      )
    ORDER BY COALESCE(scs.priority, 0) DESC
  `, [targetTime]);
    let jobsCreated = 0;
    for (const store of result.rows) {
        try {
            await createCrawlJob(store.id, 'full_crawl', 'daily_special', new Date(), store.priority + 10);
            jobsCreated++;
            console.log(`Created daily special job for: ${store.name} (${store.timezone})`);
        }
        catch (error) {
            console.error(`Failed to create daily special job for store ${store.name}:`, error);
        }
    }
    if (jobsCreated > 0) {
        console.log(`Created ${jobsCreated} daily special crawl jobs`);
    }
    return jobsCreated;
}
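// Note on the daily-special window: the scheduler ticks once per minute, so the
// ABS(...) < 120 check compares the store-local wall-clock TIME to the target
// TIME with a two-minute tolerance; the per-day NOT EXISTS guard then keeps the
// result to a single daily_special job per store per local day.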
/**
 * Process pending jobs
 */
async function processJobs() {
    if (jobProcessorRunning) {
        console.log('Job processor already running, skipping...');
        return;
    }
    jobProcessorRunning = true;
    try {
        const jobs = await getPendingJobs(1); // Process one at a time for safety
        for (const job of jobs) {
            console.log(`Processing job ${job.id} for store: ${job.store_name}`);
            const claimed = await claimJob(job.id);
            if (!claimed) {
                console.log(`Job ${job.id} already claimed by another worker`);
                continue;
            }
            try {
                // Call the existing scraper - DO NOT MODIFY SCRAPER LOGIC
                await (0, scraper_v2_1.scrapeStore)(job.store_id);
                // Update store's last_scraped_at
                await migrate_1.pool.query(`
          UPDATE stores SET last_scraped_at = NOW() WHERE id = $1
        `, [job.store_id]);
                await completeJob(job.id, true, {});
                console.log(`Job ${job.id} completed successfully`);
            }
            catch (error) {
                console.error(`Job ${job.id} failed:`, error);
                await completeJob(job.id, false, { error_message: error.message });
            }
        }
    }
    finally {
        jobProcessorRunning = false;
    }
}
/**
 * Process stores using the intelligent orchestrator
 * This replaces the simple job queue approach with intelligent provider detection
 */
async function processOrchestrator() {
    if (orchestratorProcessorRunning) {
        console.log('Orchestrator processor already running, skipping...');
        return;
    }
    orchestratorProcessorRunning = true;
    try {
        // Get stores due for orchestration (respects schedule, intervals, etc.)
        const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(3); // Process up to 3 at a time
        if (storeIds.length === 0) {
            return;
        }
        console.log(`Orchestrator: Processing ${storeIds.length} stores due for crawl`);
        // Process each store through the orchestrator
        for (const storeId of storeIds) {
            try {
                console.log(`Orchestrator: Starting crawl for store ${storeId}`);
                const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId);
                console.log(`Orchestrator: Store ${storeId} completed - ${result.summary}`);
            }
            catch (error) {
                console.error(`Orchestrator: Store ${storeId} failed - ${error.message}`);
            }
        }
        console.log(`Orchestrator: Finished processing ${storeIds.length} stores`);
    }
    finally {
        orchestratorProcessorRunning = false;
    }
}
// ============================================
// Scheduler Control
// ============================================
/**
 * Set scheduler mode
 */
function setSchedulerMode(mode) {
    schedulerMode = mode;
    console.log(`Scheduler mode set to: ${mode}`);
}
/**
 * Get current scheduler mode
 */
function getSchedulerMode() {
    return schedulerMode;
}
/**
 * Start the scheduler (runs every minute to check for due jobs)
 */
async function startCrawlScheduler() {
    stopCrawlScheduler();
    console.log(`Starting crawl scheduler in ${schedulerMode} mode...`);
    // Run every minute
    schedulerCronJob = node_cron_1.default.schedule('* * * * *', async () => {
        try {
            if (schedulerMode === 'orchestrator') {
                // Use intelligent orchestrator (handles detection + crawl)
                await processOrchestrator();
            }
            else {
                // Legacy mode: job queue approach
                // Check for interval-based scheduled jobs
                await checkAndCreateScheduledJobs();
                // Check for daily special runs
                await checkAndCreateDailySpecialJobs();
                // Process any pending jobs
                await processJobs();
            }
        }
        catch (error) {
            console.error('Scheduler tick error:', error);
        }
    });
    console.log(`Crawl scheduler started in ${schedulerMode} mode (checking every minute)`);
}
/**
 * Stop the scheduler
 */
function stopCrawlScheduler() {
    if (schedulerCronJob) {
        schedulerCronJob.stop();
        schedulerCronJob = null;
        console.log('Crawl scheduler stopped');
    }
}
/**
 * Restart the scheduler
 */
async function restartCrawlScheduler() {
    await startCrawlScheduler();
}
// ============================================
// Manual Triggers
// ============================================
/**
 * Manually trigger a crawl for a specific store (creates a job immediately)
 */
async function triggerManualCrawl(storeId) {
    console.log(`Manual crawl triggered for store ID: ${storeId}`);
    return await createCrawlJob(storeId, 'full_crawl', 'manual', new Date(), 100); // High priority
}
/**
 * Manually trigger crawls for all stores
 */
async function triggerAllStoresCrawl() {
    console.log('Manual crawl triggered for all stores');
    const result = await migrate_1.pool.query(`
    SELECT id, name FROM stores
    WHERE active = TRUE AND scrape_enabled = TRUE
    AND NOT EXISTS (
      SELECT 1 FROM crawl_jobs cj
      WHERE cj.store_id = stores.id AND cj.status IN ('pending', 'running')
    )
  `);
    let jobsCreated = 0;
    for (const store of result.rows) {
        await createCrawlJob(store.id, 'full_crawl', 'manual', new Date(), 50);
        jobsCreated++;
    }
    console.log(`Created ${jobsCreated} manual crawl jobs`);
    return jobsCreated;
}
/**
 * Cancel a pending job
 */
async function cancelJob(jobId) {
    const result = await migrate_1.pool.query(`
    UPDATE crawl_jobs
    SET status = 'cancelled'
    WHERE id = $1 AND status = 'pending'
    RETURNING id
  `, [jobId]);
    return result.rows.length > 0;
}
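Usage sketch for the scheduler API above (assumed server wiring; typically invoked from app startup and admin routes):

const scheduler = require('./services/crawl-scheduler');

scheduler.setSchedulerMode('orchestrator'); // or 'legacy' for the job-queue path
scheduler.startCrawlScheduler();            // cron tick every minute

// Later, e.g. from an admin route handler:
// await scheduler.triggerManualCrawl(storeId); // high-priority immediate job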
476
backend/dist/services/crawler-jobs.js
vendored
Normal file
@@ -0,0 +1,476 @@
"use strict";
|
||||
/**
|
||||
* Crawler Jobs Service
|
||||
*
|
||||
* Handles three types of jobs:
|
||||
* 1. DetectMenuProviderJob - Detect menu provider for a dispensary
|
||||
* 2. DutchieMenuCrawlJob - Production Dutchie crawl
|
||||
* 3. SandboxCrawlJob - Learning/testing crawl for unknown providers
|
||||
*/
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.runDetectMenuProviderJob = runDetectMenuProviderJob;
|
||||
exports.runDutchieMenuCrawlJob = runDutchieMenuCrawlJob;
|
||||
exports.runSandboxCrawlJob = runSandboxCrawlJob;
|
||||
exports.processSandboxJobs = processSandboxJobs;
|
||||
const migrate_1 = require("../db/migrate");
|
||||
const logger_1 = require("./logger");
|
||||
const menu_provider_detector_1 = require("./menu-provider-detector");
|
||||
const scraper_v2_1 = require("../scraper-v2");
|
||||
const puppeteer_1 = __importDefault(require("puppeteer"));
|
||||
const fs_1 = require("fs");
|
||||
const path_1 = __importDefault(require("path"));
|
||||
const availability_1 = require("./availability");
|
||||
const WORKER_ID = `crawler-${process.pid}-${Date.now()}`;
|
||||
// ========================================
|
||||
// Helper Functions
|
||||
// ========================================
|
||||
async function getDispensary(dispensaryId) {
|
||||
const result = await migrate_1.pool.query(`SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence,
|
||||
crawler_mode, crawler_status, scraper_template
|
||||
FROM dispensaries WHERE id = $1`, [dispensaryId]);
|
||||
return result.rows[0] || null;
|
||||
}
|
||||
async function updateDispensary(dispensaryId, updates) {
|
||||
const setClauses = [];
|
||||
const values = [];
|
||||
let paramIndex = 1;
|
||||
for (const [key, value] of Object.entries(updates)) {
|
||||
setClauses.push(`${key} = $${paramIndex}`);
|
||||
values.push(value);
|
||||
paramIndex++;
|
||||
}
|
||||
setClauses.push(`updated_at = NOW()`);
|
||||
values.push(dispensaryId);
|
||||
await migrate_1.pool.query(`UPDATE dispensaries SET ${setClauses.join(', ')} WHERE id = $${paramIndex}`, values);
|
||||
}
|
||||
async function createSandboxEntry(dispensaryId, suspectedProvider, mode, detectionSignals) {
|
||||
// First, check if there's an existing active sandbox
|
||||
const existing = await migrate_1.pool.query(`SELECT id FROM crawler_sandboxes
|
||||
WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')`, [dispensaryId]);
|
||||
if (existing.rows.length > 0) {
|
||||
// Update existing
|
||||
await migrate_1.pool.query(`UPDATE crawler_sandboxes
|
||||
SET suspected_menu_provider = $2, mode = $3, detection_signals = COALESCE($4, detection_signals), updated_at = NOW()
|
||||
WHERE id = $1`, [existing.rows[0].id, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : null]);
|
||||
return existing.rows[0].id;
|
||||
}
|
||||
// Create new
|
||||
const result = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, suspected_menu_provider, mode, detection_signals, status)
|
||||
VALUES ($1, $2, $3, $4, 'pending')
|
||||
RETURNING id`, [dispensaryId, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : '{}']);
|
||||
return result.rows[0].id;
|
||||
}
|
||||
async function createSandboxJob(dispensaryId, sandboxId, jobType, priority = 0) {
|
||||
const result = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
|
||||
VALUES ($1, $2, $3, 'pending', $4)
|
||||
RETURNING id`, [dispensaryId, sandboxId, jobType, priority]);
|
||||
return result.rows[0].id;
|
||||
}
|
||||
// Get linked store ID for a dispensary (for using existing scraper)
|
||||
async function getStoreIdForDispensary(dispensaryId) {
|
||||
// Check if there's a stores entry linked to this dispensary
|
||||
const result = await migrate_1.pool.query(`SELECT s.id FROM stores s
|
||||
JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%'
|
||||
WHERE d.id = $1
|
||||
LIMIT 1`, [dispensaryId]);
|
||||
if (result.rows.length > 0) {
|
||||
return result.rows[0].id;
|
||||
}
|
||||
// Try to find by website
|
||||
const result2 = await migrate_1.pool.query(`SELECT s.id FROM stores s
|
||||
JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%'
|
||||
WHERE d.id = $1
|
||||
LIMIT 1`, [dispensaryId]);
|
||||
return result2.rows[0]?.id || null;
|
||||
}
|
||||
// ========================================
|
||||
// Job 1: Detect Menu Provider
|
||||
// ========================================
|
||||
async function runDetectMenuProviderJob(dispensaryId) {
|
||||
logger_1.logger.info('crawler-jobs', `Starting menu provider detection for dispensary ${dispensaryId}`);
|
||||
const dispensary = await getDispensary(dispensaryId);
|
||||
if (!dispensary) {
|
||||
return { success: false, message: `Dispensary ${dispensaryId} not found` };
|
||||
}
|
||||
// Check for website URL
|
||||
const websiteUrl = dispensary.website || dispensary.menu_url;
|
||||
if (!websiteUrl) {
|
||||
await updateDispensary(dispensaryId, {
|
||||
crawler_status: 'error_needs_review',
|
||||
last_menu_error_at: new Date(),
|
||||
last_error_message: 'No website URL available for detection',
|
||||
});
|
||||
return { success: false, message: 'No website URL available' };
|
||||
}
|
||||
try {
|
||||
// Run detection
|
||||
const detection = await (0, menu_provider_detector_1.detectMenuProvider)(websiteUrl, {
|
||||
checkMenuPaths: true,
|
||||
timeout: 30000,
|
||||
});
|
||||
// Update dispensary with results
|
||||
const updates = {
|
||||
menu_provider: detection.provider,
|
||||
menu_provider_confidence: detection.confidence,
|
||||
provider_detection_data: JSON.stringify({
|
||||
signals: detection.signals,
|
||||
urlsTested: detection.urlsTested,
|
||||
menuEntryPoints: detection.menuEntryPoints,
|
||||
rawSignals: detection.rawSignals,
|
||||
detectedAt: new Date().toISOString(),
|
||||
}),
|
||||
crawler_status: 'idle',
|
||||
};
|
||||
// Decide crawler mode based on provider
|
||||
if (detection.provider === 'dutchie' && detection.confidence >= 70) {
|
||||
// Dutchie with high confidence -> production
|
||||
updates.crawler_mode = 'production';
|
||||
logger_1.logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as Dutchie (${detection.confidence}%), setting to production`);
|
||||
}
|
||||
else {
|
||||
// Unknown or non-Dutchie -> sandbox
|
||||
updates.crawler_mode = 'sandbox';
|
||||
// Create sandbox entry for further analysis
|
||||
const sandboxId = await createSandboxEntry(dispensaryId, detection.provider, 'detection', {
|
||||
signals: detection.signals,
|
||||
rawSignals: detection.rawSignals,
|
||||
});
|
||||
// Queue sandbox crawl job
|
||||
await createSandboxJob(dispensaryId, sandboxId, 'detection');
|
||||
logger_1.logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as ${detection.provider} (${detection.confidence}%), setting to sandbox`);
|
||||
}
|
||||
// Update menu entry points if found
|
||||
if (detection.menuEntryPoints.length > 0 && !dispensary.menu_url) {
|
||||
updates.menu_url = detection.menuEntryPoints[0];
|
||||
}
|
||||
await updateDispensary(dispensaryId, updates);
|
||||
return {
|
||||
success: true,
|
||||
message: `Detected provider: ${detection.provider} (${detection.confidence}%)`,
|
||||
data: {
|
||||
provider: detection.provider,
|
||||
confidence: detection.confidence,
|
||||
mode: updates.crawler_mode,
|
||||
menuEntryPoints: detection.menuEntryPoints,
|
||||
},
|
||||
};
|
||||
}
|
||||
catch (error) {
|
||||
logger_1.logger.error('crawler-jobs', `Detection failed for dispensary ${dispensaryId}: ${error.message}`);
|
||||
await updateDispensary(dispensaryId, {
|
||||
crawler_status: 'error_needs_review',
|
||||
last_menu_error_at: new Date(),
|
||||
last_error_message: `Detection failed: ${error.message}`,
|
||||
});
|
||||
return { success: false, message: error.message };
|
||||
}
|
||||
}
|
||||
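// Usage sketch for the detection job (assumed wiring; in practice these run
// via the sandbox job queue at the bottom of this file rather than directly):
//
//   const outcome = await runDetectMenuProviderJob(dispensaryId);
//   console.log(outcome.success ? outcome.message : `failed: ${outcome.message}`);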
// ========================================
// Job 2: Dutchie Menu Crawl (Production)
// ========================================
async function runDutchieMenuCrawlJob(dispensaryId) {
    logger_1.logger.info('crawler-jobs', `Starting Dutchie production crawl for dispensary ${dispensaryId}`);
    const dispensary = await getDispensary(dispensaryId);
    if (!dispensary) {
        return { success: false, message: `Dispensary ${dispensaryId} not found` };
    }
    // Verify it's a Dutchie production dispensary
    if (dispensary.menu_provider !== 'dutchie') {
        logger_1.logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not Dutchie, skipping production crawl`);
        return { success: false, message: 'Not a Dutchie dispensary' };
    }
    if (dispensary.crawler_mode !== 'production') {
        logger_1.logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not in production mode, skipping`);
        return { success: false, message: 'Not in production mode' };
    }
    // Find linked store ID
    const storeId = await getStoreIdForDispensary(dispensaryId);
    if (!storeId) {
        // Need to create a store entry or handle differently
        logger_1.logger.warn('crawler-jobs', `No linked store found for dispensary ${dispensaryId}`);
        return { success: false, message: 'No linked store found - needs setup' };
    }
    try {
        // Update status to running
        await updateDispensary(dispensaryId, { crawler_status: 'running' });
        // Run the existing Dutchie scraper
        await (0, scraper_v2_1.scrapeStore)(storeId, 3); // 3 parallel workers
        // Update success status
        await updateDispensary(dispensaryId, {
            crawler_status: 'ok',
            last_menu_scrape: new Date(),
            menu_scrape_status: 'active',
        });
        logger_1.logger.info('crawler-jobs', `Dutchie crawl completed for dispensary ${dispensaryId}`);
        return {
            success: true,
            message: 'Dutchie crawl completed successfully',
            data: { storeId },
        };
    }
    catch (error) {
        logger_1.logger.error('crawler-jobs', `Dutchie crawl failed for dispensary ${dispensaryId}: ${error.message}`);
        // Check if this might be a provider change
        let providerChanged = false;
        try {
            const browser = await puppeteer_1.default.launch({ headless: true, args: ['--no-sandbox'] });
            const page = await browser.newPage();
            const url = dispensary.menu_url || dispensary.website;
            if (url) {
                await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
                const changeResult = await (0, menu_provider_detector_1.detectProviderChange)(page, 'dutchie');
                providerChanged = changeResult.changed;
                if (providerChanged) {
                    // Provider changed - move to sandbox
                    await updateDispensary(dispensaryId, {
                        crawler_mode: 'sandbox',
                        crawler_status: 'error_needs_review',
                        last_menu_error_at: new Date(),
                        last_error_message: `Provider appears to have changed from Dutchie to ${changeResult.newProvider}`,
                    });
                    const sandboxId = await createSandboxEntry(dispensaryId, changeResult.newProvider || 'unknown', 'detection', { providerChangeDetected: true, previousProvider: 'dutchie' });
                    await createSandboxJob(dispensaryId, sandboxId, 'detection');
                    logger_1.logger.warn('crawler-jobs', `Provider change detected for dispensary ${dispensaryId}: Dutchie -> ${changeResult.newProvider}`);
                }
            }
            await browser.close();
        }
        catch {
            // Ignore detection errors during failure handling
        }
        if (!providerChanged) {
            await updateDispensary(dispensaryId, {
                crawler_status: 'error_needs_review',
                last_menu_error_at: new Date(),
                last_error_message: error.message,
            });
        }
        return { success: false, message: error.message };
    }
}
// ========================================
// Job 3: Sandbox Crawl (Learning Mode)
// ========================================
async function runSandboxCrawlJob(dispensaryId, sandboxId) {
    logger_1.logger.info('crawler-jobs', `Starting sandbox crawl for dispensary ${dispensaryId}`);
    const dispensary = await getDispensary(dispensaryId);
    if (!dispensary) {
        return { success: false, message: `Dispensary ${dispensaryId} not found` };
    }
    // Get or create sandbox entry
    let sandbox;
    if (sandboxId) {
        const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
        sandbox = result.rows[0];
    }
    else {
        const result = await migrate_1.pool.query(`SELECT * FROM crawler_sandboxes
       WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')
       ORDER BY created_at DESC LIMIT 1`, [dispensaryId]);
        sandbox = result.rows[0];
        if (!sandbox) {
            const newSandboxId = await createSandboxEntry(dispensaryId, dispensary.menu_provider, 'template_learning');
            const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
            sandbox = result.rows[0];
        }
    }
    const websiteUrl = dispensary.menu_url || dispensary.website;
    if (!websiteUrl) {
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = 'No website URL' WHERE id = $1`, [sandbox.id]);
        return { success: false, message: 'No website URL available' };
    }
    let browser = null;
    try {
        // Update status
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
        await updateDispensary(dispensaryId, { crawler_status: 'running' });
        // Launch browser
        browser = await puppeteer_1.default.launch({
            headless: true,
            args: ['--no-sandbox', '--disable-setuid-sandbox'],
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        // URLs to crawl (limited depth for sandbox)
        const urlsToVisit = [websiteUrl];
        const menuPaths = ['/menu', '/shop', '/products', '/order'];
        for (const path of menuPaths) {
            const baseUrl = new URL(websiteUrl).origin;
            urlsToVisit.push(`${baseUrl}${path}`);
        }
        const urlsTested = [];
        const menuEntryPoints = [];
        const capturedHtml = [];
        const analysisData = {
            provider_signals: {},
            selector_candidates: [],
            page_structures: [],
        };
        // Crawl each URL
        for (const url of urlsToVisit) {
            try {
                urlsTested.push(url);
                await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
                await new Promise(r => setTimeout(r, 2000)); // Wait for dynamic content
                // Get page HTML
                const html = await page.content();
                // Check if this looks like a menu page
                const hasMenuContent = await page.evaluate(() => {
                    const text = document.body.innerText.toLowerCase();
                    return (text.includes('add to cart') ||
                        text.includes('thc') ||
                        text.includes('indica') ||
                        text.includes('sativa'));
                });
                if (hasMenuContent) {
                    menuEntryPoints.push(url);
                    capturedHtml.push({ url, html });
                    // Analyze page structure for selector candidates
                    const structure = await page.evaluate(() => {
                        const candidates = [];
                        // Look for product-like containers
                        const productSelectors = [
                            '.product', '.product-card', '.menu-item', '.item-card',
                            '[data-product]', '[data-item]', '.strain', '.listing',
                        ];
                        for (const selector of productSelectors) {
                            const els = document.querySelectorAll(selector);
                            if (els.length > 3) { // Likely a list
                                candidates.push({
                                    selector,
                                    count: els.length,
                                    type: 'product_container',
                                });
                            }
                        }
                        // Look for price patterns
                        const pricePattern = /\$\d+(\.\d{2})?/;
                        const textNodes = document.body.innerText;
                        const priceMatches = textNodes.match(/\$\d+(\.\d{2})?/g);
                        return {
                            candidates,
                            priceCount: priceMatches?.length || 0,
                            hasAddToCart: textNodes.toLowerCase().includes('add to cart'),
                        };
                    });
                    // Extract availability hints from page content
                    const availabilityHints = (0, availability_1.extractAvailabilityHints)(html);
                    analysisData.page_structures.push({
                        url,
                        ...structure,
                        availabilityHints,
                    });
                }
            }
            catch (pageError) {
                if (!pageError.message.includes('404')) {
                    logger_1.logger.warn('crawler-jobs', `Sandbox crawl error for ${url}: ${pageError.message}`);
                }
            }
        }
        // Save HTML to storage (local for now, S3 later)
        let rawHtmlLocation = null;
        if (capturedHtml.length > 0) {
            const htmlDir = path_1.default.join(process.cwd(), 'sandbox-data', `dispensary-${dispensaryId}`);
            await fs_1.promises.mkdir(htmlDir, { recursive: true });
            for (const { url, html } of capturedHtml) {
                const filename = `${Date.now()}-${url.replace(/[^a-z0-9]/gi, '_')}.html`;
                await fs_1.promises.writeFile(path_1.default.join(htmlDir, filename), html);
            }
            rawHtmlLocation = htmlDir;
        }
        // Update sandbox with results
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET
       status = $1,
       urls_tested = $2,
       menu_entry_points = $3,
       raw_html_location = $4,
       analysis_json = $5,
       confidence_score = $6,
       analyzed_at = NOW(),
       updated_at = NOW()
     WHERE id = $7`, [
            menuEntryPoints.length > 0 ? 'needs_human_review' : 'pending',
            JSON.stringify(urlsTested),
            JSON.stringify(menuEntryPoints),
            rawHtmlLocation,
            JSON.stringify(analysisData),
            menuEntryPoints.length > 0 ? 50 : 20,
            sandbox.id,
        ]);
        // Update dispensary status
        await updateDispensary(dispensaryId, {
            crawler_status: 'error_needs_review', // Sandbox results need review
        });
        logger_1.logger.info('crawler-jobs', `Sandbox crawl completed for dispensary ${dispensaryId}: ${menuEntryPoints.length} menu pages found`);
        return {
            success: true,
            message: `Sandbox crawl completed. Found ${menuEntryPoints.length} menu entry points.`,
            data: {
                sandboxId: sandbox.id,
                urlsTested: urlsTested.length,
                menuEntryPoints,
                analysisData,
            },
        };
    }
    catch (error) {
        logger_1.logger.error('crawler-jobs', `Sandbox crawl failed for dispensary ${dispensaryId}: ${error.message}`);
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]);
        await updateDispensary(dispensaryId, {
            crawler_status: 'error_needs_review',
            last_menu_error_at: new Date(),
            last_error_message: `Sandbox crawl failed: ${error.message}`,
        });
        return { success: false, message: error.message };
    }
    finally {
        if (browser) {
            await browser.close();
        }
    }
}
// ========================================
// Queue Processing Functions
// ========================================
/**
 * Process pending sandbox jobs
 */
async function processSandboxJobs(limit = 5) {
// Claim pending jobs
|
||||
const jobs = await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
|
||||
SET status = 'running', worker_id = $1, started_at = NOW()
|
||||
WHERE id IN (
|
||||
SELECT id FROM sandbox_crawl_jobs
|
||||
WHERE status = 'pending' AND scheduled_at <= NOW()
|
||||
ORDER BY priority DESC, scheduled_at ASC
|
||||
LIMIT $2
|
||||
FOR UPDATE SKIP LOCKED
|
||||
)
|
||||
RETURNING *`, [WORKER_ID, limit]);
|
||||
for (const job of jobs.rows) {
|
||||
try {
|
||||
let result;
|
||||
if (job.job_type === 'detection') {
|
||||
result = await runDetectMenuProviderJob(job.dispensary_id);
|
||||
}
|
||||
else {
|
||||
result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
|
||||
}
|
||||
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
|
||||
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
|
||||
WHERE id = $4`, [
|
||||
result.success ? 'completed' : 'failed',
|
||||
JSON.stringify(result.data || {}),
|
||||
result.success ? null : result.message,
|
||||
job.id,
|
||||
]);
|
||||
}
|
||||
catch (error) {
|
||||
await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]);
|
||||
}
|
||||
}
|
||||
}
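// Usage sketch (not part of this commit; the poll interval and error handling are
// assumptions). Because the claim query uses UPDATE ... FOR UPDATE SKIP LOCKED,
// several workers can run this tick concurrently without double-claiming a job:
//   setInterval(() => {
//       processSandboxJobs(5).catch(err => console.error('[sandbox-queue] tick failed:', err));
//   }, 60000);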
202
backend/dist/services/crawler-logger.js
vendored
Normal file
@@ -0,0 +1,202 @@
"use strict";
/**
 * CrawlerLogger - Structured logging for crawler operations
 *
 * High-signal, low-noise logging with JSON output for:
 * - Job lifecycle (one summary per job)
 * - Provider/mode changes
 * - Sandbox events
 * - Queue failures
 *
 * NO per-product logging - that's too noisy.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlerLogger = void 0;
class CrawlerLoggerService {
    formatLog(payload) {
        return JSON.stringify(payload);
    }
    log(payload) {
        const formatted = this.formatLog(payload);
        switch (payload.level) {
            case 'error':
                console.error(`[CRAWLER] ${formatted}`);
                break;
            case 'warn':
                console.warn(`[CRAWLER] ${formatted}`);
                break;
            case 'debug':
                console.debug(`[CRAWLER] ${formatted}`);
                break;
            default:
                console.log(`[CRAWLER] ${formatted}`);
        }
    }
    /**
     * Log when a crawl job starts
     */
    jobStarted(params) {
        this.log({
            timestamp: new Date().toISOString(),
            level: 'info',
            event: 'job_started',
            job_id: params.job_id,
            store_id: params.store_id,
            store_name: params.store_name,
            job_type: params.job_type,
            trigger_type: params.trigger_type,
            provider: params.provider,
        });
    }
    /**
     * Log when a crawl job completes successfully
     */
    jobCompleted(params) {
        this.log({
            timestamp: new Date().toISOString(),
            level: 'info',
            event: 'job_completed',
            job_id: params.job_id,
            store_id: params.store_id,
            store_name: params.store_name,
            duration_ms: params.duration_ms,
            products_found: params.products_found,
            products_new: params.products_new,
            products_updated: params.products_updated,
            products_marked_oos: params.products_marked_oos,
            provider: params.provider,
        });
    }
    /**
     * Log when a crawl job fails
     */
    jobFailed(params) {
        this.log({
            timestamp: new Date().toISOString(),
            level: 'error',
            event: 'job_failed',
            job_id: params.job_id,
            store_id: params.store_id,
            store_name: params.store_name,
            duration_ms: params.duration_ms,
            error_message: params.error_message,
            error_code: params.error_code,
            provider: params.provider,
        });
    }
    /**
     * Log when a provider is detected for a dispensary
     */
    providerDetected(params) {
        this.log({
            timestamp: new Date().toISOString(),
            level: 'info',
            event: 'provider_detected',
            dispensary_id: params.dispensary_id,
            dispensary_name: params.dispensary_name,
            detected_provider: params.detected_provider,
            confidence: params.confidence,
            detection_method: params.detection_method,
            menu_url: params.menu_url,
            category: params.category,
        });
    }
    /**
     * Log when a dispensary's provider changes
     */
    providerChanged(params) {
        this.log({
            timestamp: new Date().toISOString(),
            level: 'info',
            event: 'provider_changed',
            dispensary_id: params.dispensary_id,
            dispensary_name: params.dispensary_name,
            old_provider: params.old_provider,
            new_provider: params.new_provider,
            old_confidence: params.old_confidence,
            new_confidence: params.new_confidence,
            category: params.category,
        });
    }
    /**
     * Log when a dispensary's crawler mode changes (sandbox -> production, etc.)
     */
    modeChanged(params) {
        this.log({
            timestamp: new Date().toISOString(),
            level: 'info',
            event: 'mode_changed',
            dispensary_id: params.dispensary_id,
            dispensary_name: params.dispensary_name,
            old_mode: params.old_mode,
            new_mode: params.new_mode,
            reason: params.reason,
            category: params.category,
            provider: params.provider,
        });
    }
    /**
     * Log sandbox crawl events
     */
    sandboxEvent(params) {
        const level = params.event === 'sandbox_failed' ? 'error' : 'info';
        this.log({
            timestamp: new Date().toISOString(),
            level,
            event: params.event,
            dispensary_id: params.dispensary_id,
            dispensary_name: params.dispensary_name,
            template_name: params.template_name,
            category: params.category,
            quality_score: params.quality_score,
            products_extracted: params.products_extracted,
            fields_missing: params.fields_missing,
            error_message: params.error_message,
            provider: params.provider,
        });
    }
    /**
     * Log queue processing failures
     */
    queueFailure(params) {
        this.log({
            timestamp: new Date().toISOString(),
            level: 'error',
            event: 'queue_failure',
            queue_type: params.queue_type,
            error_message: params.error_message,
            affected_items: params.affected_items,
        });
    }
    /**
     * Log detection scan summary
     */
    detectionScan(params) {
        this.log({
            timestamp: new Date().toISOString(),
            level: 'info',
            event: 'detection_scan',
            total_scanned: params.total_scanned,
            detected: params.detected,
            failed: params.failed,
            skipped: params.skipped,
            duration_ms: params.duration_ms,
        });
    }
    /**
     * Log intelligence run summary
     */
    intelligenceRun(params) {
        this.log({
            timestamp: new Date().toISOString(),
            level: 'info',
            event: 'intelligence_run',
            run_type: params.run_type,
            dispensaries_processed: params.dispensaries_processed,
            jobs_queued: params.jobs_queued,
            duration_ms: params.duration_ms,
        });
    }
}
// Export singleton instance
exports.crawlerLogger = new CrawlerLoggerService();
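// Example call (all field values invented for illustration); emits exactly one JSON line:
//   crawlerLogger.jobCompleted({ job_id: 7, store_id: 12, store_name: 'Example Store',
//       duration_ms: 8421, products_found: 112, products_new: 3, products_updated: 14,
//       products_marked_oos: 2, provider: 'dutchie' });
//   -> [CRAWLER] {"timestamp":"...","level":"info","event":"job_completed","job_id":7,...}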
383
backend/dist/services/dispensary-orchestrator.js
vendored
Normal file
@@ -0,0 +1,383 @@
"use strict";
/**
 * Dispensary Crawl Orchestrator
 *
 * Orchestrates the complete crawl workflow for a dispensary:
 * 1. Load dispensary data
 * 2. Check if provider detection is needed
 * 3. Run provider detection if needed
 * 4. Queue appropriate crawl jobs based on provider/mode
 * 5. Update dispensary_crawl_schedule with meaningful status
 *
 * This works DIRECTLY with dispensaries (not through stores table).
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.runDispensaryOrchestrator = runDispensaryOrchestrator;
exports.runBatchDispensaryOrchestrator = runBatchDispensaryOrchestrator;
exports.getDispensariesDueForOrchestration = getDispensariesDueForOrchestration;
exports.ensureAllDispensariesHaveSchedules = ensureAllDispensariesHaveSchedules;
exports.processDispensaryScheduler = processDispensaryScheduler;
const uuid_1 = require("uuid");
const migrate_1 = require("../db/migrate");
const crawler_logger_1 = require("./crawler-logger");
const intelligence_detector_1 = require("./intelligence-detector");
const category_crawler_jobs_1 = require("./category-crawler-jobs");
// ========================================
// Main Orchestrator Function
// ========================================
/**
 * Run the complete crawl orchestration for a dispensary
 *
 * Behavior:
 * 1. Load the dispensary info
 * 2. If product_provider is missing or stale (>7 days), run detection
 * 3. After detection:
 *    - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
 *    - Otherwise: Run sandbox crawl
 * 4. Update dispensary_crawl_schedule with status/summary
 */
async function runDispensaryOrchestrator(dispensaryId, scheduleId) {
    const startTime = Date.now();
    const runId = (0, uuid_1.v4)();
    let result = {
        status: 'pending',
        summary: '',
        runId,
        dispensaryId,
        dispensaryName: '',
        detectionRan: false,
        crawlRan: false,
        durationMs: 0,
    };
    try {
        // Mark schedule as running
        await updateScheduleStatus(dispensaryId, 'running', 'Starting orchestrator...', null, runId);
        // 1. Load dispensary info
        const dispensary = await getDispensaryInfo(dispensaryId);
        if (!dispensary) {
            throw new Error(`Dispensary ${dispensaryId} not found`);
        }
        result.dispensaryName = dispensary.name;
        // 2. Check if provider detection is needed
        const needsDetection = await checkNeedsDetection(dispensary);
        if (needsDetection) {
            // Run provider detection
            const websiteUrl = dispensary.menu_url || dispensary.website;
            if (!websiteUrl) {
                result.status = 'error';
                result.summary = 'No website URL available for detection';
                result.error = 'Dispensary has no menu_url or website configured';
                await updateScheduleStatus(dispensaryId, 'error', result.summary, result.error, runId);
                result.durationMs = Date.now() - startTime;
                await createJobRecord(dispensaryId, scheduleId, result);
                return result;
            }
            await updateScheduleStatus(dispensaryId, 'running', 'Running provider detection...', null, runId);
            const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
            result.detectionRan = true;
            result.detectionResult = detectionResult;
            // Save detection results to dispensary
            await (0, intelligence_detector_1.updateAllCategoryProviders)(dispensaryId, detectionResult);
            crawler_logger_1.crawlerLogger.providerDetected({
                dispensary_id: dispensaryId,
                dispensary_name: dispensary.name,
                detected_provider: detectionResult.product.provider,
                confidence: detectionResult.product.confidence,
                detection_method: 'dispensary_orchestrator',
                menu_url: websiteUrl,
                category: 'product',
            });
            // Refresh dispensary info after detection
            const updatedDispensary = await getDispensaryInfo(dispensaryId);
            if (updatedDispensary) {
                Object.assign(dispensary, updatedDispensary);
            }
        }
        // 3. Determine crawl type and run
        const provider = dispensary.product_provider;
        const mode = dispensary.product_crawler_mode;
        if (provider === 'dutchie' && mode === 'production') {
            // Production Dutchie crawl
            await updateScheduleStatus(dispensaryId, 'running', 'Running Dutchie production crawl...', null, runId);
            try {
                // Run the category-specific crawl job
                const crawlResult = await (0, category_crawler_jobs_1.runCrawlProductsJob)(dispensaryId);
                result.crawlRan = true;
                result.crawlType = 'production';
                if (crawlResult.success) {
                    result.productsFound = crawlResult.data?.productsFound || 0;
                    const detectionPart = result.detectionRan ? 'Detection + ' : '';
                    result.summary = `${detectionPart}Dutchie products crawl completed`;
                    result.status = 'success';
                    crawler_logger_1.crawlerLogger.jobCompleted({
                        job_id: 0,
                        store_id: 0,
                        store_name: dispensary.name,
                        duration_ms: Date.now() - startTime,
                        products_found: result.productsFound || 0,
                        products_new: 0,
                        products_updated: 0,
                        provider: 'dutchie',
                    });
                }
                else {
                    result.status = 'error';
                    result.error = crawlResult.message;
                    result.summary = `Dutchie crawl failed: ${crawlResult.message.slice(0, 100)}`;
                }
            }
            catch (crawlError) {
                result.status = 'error';
                result.error = crawlError.message;
                result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
                result.crawlRan = true;
                result.crawlType = 'production';
                crawler_logger_1.crawlerLogger.jobFailed({
                    job_id: 0,
                    store_id: 0,
                    store_name: dispensary.name,
                    duration_ms: Date.now() - startTime,
                    error_message: crawlError.message,
                    provider: 'dutchie',
                });
            }
        }
        else if (provider && provider !== 'unknown') {
            // Sandbox crawl for non-Dutchie or sandbox mode
            await updateScheduleStatus(dispensaryId, 'running', `Running ${provider} sandbox crawl...`, null, runId);
            try {
                const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(dispensaryId);
                result.crawlRan = true;
                result.crawlType = 'sandbox';
                result.productsFound = sandboxResult.data?.productsExtracted || 0;
                const detectionPart = result.detectionRan ? 'Detection + ' : '';
                if (sandboxResult.success) {
                    result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
                    result.status = 'sandbox_only';
                }
                else {
                    result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
                    result.status = 'error';
                    result.error = sandboxResult.message;
                }
            }
            catch (sandboxError) {
                result.status = 'error';
                result.error = sandboxError.message;
                result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
                result.crawlRan = true;
                result.crawlType = 'sandbox';
            }
        }
        else {
            // No provider detected - detection only
            if (result.detectionRan) {
                result.summary = `Detection complete: provider=${dispensary.product_provider || 'unknown'}, confidence=${dispensary.product_confidence || 0}%`;
                result.status = 'detection_only';
            }
            else {
                result.summary = 'No provider detected and no crawl possible';
                result.status = 'error';
                result.error = 'Could not determine menu provider';
            }
        }
    }
    catch (error) {
        result.status = 'error';
        result.error = error.message;
        result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
        crawler_logger_1.crawlerLogger.queueFailure({
            queue_type: 'dispensary_orchestrator',
            error_message: error.message,
        });
    }
    result.durationMs = Date.now() - startTime;
    // Update final schedule status
    await updateScheduleStatus(dispensaryId, result.status, result.summary, result.error || null, runId);
    // Create job record
    await createJobRecord(dispensaryId, scheduleId, result);
    return result;
}
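// Illustrative direct invocation (the id is hypothetical; production callers go through
// the scheduler integration below):
//   const result = await runDispensaryOrchestrator(101);
//   result.status  -> 'success' | 'sandbox_only' | 'detection_only' | 'error'
//   result.summary -> e.g. 'Detection + Dutchie products crawl completed'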
// ========================================
// Helper Functions
// ========================================
async function getDispensaryInfo(dispensaryId) {
    const result = await migrate_1.pool.query(`SELECT id, name, city, website, menu_url,
            product_provider, product_confidence, product_crawler_mode, last_product_scan_at
        FROM dispensaries
        WHERE id = $1`, [dispensaryId]);
    return result.rows[0] || null;
}
async function checkNeedsDetection(dispensary) {
    // No provider = definitely needs detection
    if (!dispensary.product_provider)
        return true;
    // Unknown provider = needs detection
    if (dispensary.product_provider === 'unknown')
        return true;
    // Low confidence = needs re-detection
    if (dispensary.product_confidence !== null && dispensary.product_confidence < 50)
        return true;
    // Stale detection (> 7 days) = needs refresh
    if (dispensary.last_product_scan_at) {
        const daysSince = (Date.now() - new Date(dispensary.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24);
        if (daysSince > 7)
            return true;
    }
    return false;
}
async function updateScheduleStatus(dispensaryId, status, summary, error, runId) {
    await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, last_status, last_summary, last_error, last_run_at, updated_at)
        VALUES ($1, $2, $3, $4, NOW(), NOW())
        ON CONFLICT (dispensary_id) DO UPDATE SET
            last_status = $2,
            last_summary = $3,
            last_error = $4,
            last_run_at = NOW(),
            updated_at = NOW()`, [dispensaryId, status, summary, error]);
}
async function createJobRecord(dispensaryId, scheduleId, result) {
    await migrate_1.pool.query(`INSERT INTO dispensary_crawl_jobs (
        dispensary_id, schedule_id, job_type, trigger_type, status, priority,
        scheduled_at, started_at, completed_at, duration_ms,
        detection_ran, crawl_ran, crawl_type,
        products_found, products_new, products_updated,
        detected_provider, detected_confidence, detected_mode,
        error_message, run_id
    ) VALUES (
        $1, $2, 'orchestrator', 'manual', $3, 100,
        NOW(), NOW(), NOW(), $4,
        $5, $6, $7,
        $8, $9, $10,
        $11, $12, $13,
        $14, $15
    )`, [
        dispensaryId,
        scheduleId || null,
        result.status === 'error' ? 'failed' : 'completed', // non-error outcomes (incl. sandbox_only/detection_only) are recorded as completed
        result.durationMs,
        result.detectionRan,
        result.crawlRan,
        result.crawlType || null,
        result.productsFound || null,
        result.productsNew || null,
        result.productsUpdated || null,
        result.detectionResult?.product.provider || null,
        result.detectionResult?.product.confidence || null,
        result.detectionResult?.product.mode || null,
        result.error || null,
        result.runId,
    ]);
    // Update schedule stats
    if (result.status === 'success' || result.status === 'sandbox_only' || result.status === 'detection_only') {
        await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET
            total_runs = COALESCE(total_runs, 0) + 1,
            successful_runs = COALESCE(successful_runs, 0) + 1,
            consecutive_failures = 0,
            next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL,
            last_duration_ms = $2
        WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]);
    }
    else if (result.status === 'error') {
        await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET
            total_runs = COALESCE(total_runs, 0) + 1,
            consecutive_failures = COALESCE(consecutive_failures, 0) + 1,
            next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL,
            last_duration_ms = $2
        WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]);
    }
}
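// Note: in the next_run_at expressions above, || is Postgres string concatenation,
// not JavaScript logical-or. Worked example, assuming interval_minutes = 240:
//   NOW() + ('240' || ' minutes')::INTERVAL  =>  NOW() + INTERVAL '240 minutes' (4 hours)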
// ========================================
// Batch Processing
// ========================================
/**
 * Run orchestrator for multiple dispensaries
 */
async function runBatchDispensaryOrchestrator(dispensaryIds, concurrency = 3) {
    const results = [];
    // Process in batches
    for (let i = 0; i < dispensaryIds.length; i += concurrency) {
        const batch = dispensaryIds.slice(i, i + concurrency);
        console.log(`Processing batch ${Math.floor(i / concurrency) + 1}: dispensaries ${batch.join(', ')}`);
        const batchResults = await Promise.all(batch.map(id => runDispensaryOrchestrator(id)));
        results.push(...batchResults);
        // Small delay between batches to avoid overwhelming the system
        if (i + concurrency < dispensaryIds.length) {
            await new Promise(r => setTimeout(r, 1000));
        }
    }
    return results;
}
/**
 * Get dispensaries that are due for orchestration
 */
async function getDispensariesDueForOrchestration(limit = 10) {
    const result = await migrate_1.pool.query(`SELECT d.id
        FROM dispensaries d
        LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
        WHERE COALESCE(dcs.is_active, TRUE) = TRUE
          AND (
            dcs.next_run_at IS NULL
            OR dcs.next_run_at <= NOW()
          )
          AND (dcs.last_status IS NULL OR dcs.last_status NOT IN ('running', 'pending'))
        ORDER BY COALESCE(dcs.priority, 0) DESC, dcs.last_run_at ASC NULLS FIRST
        LIMIT $1`, [limit]);
    return result.rows.map(row => row.id);
}
/**
 * Ensure all dispensaries have schedule entries
 */
async function ensureAllDispensariesHaveSchedules(intervalMinutes = 240) {
    // Insert a schedule row for every dispensary that doesn't have one yet
    const result = await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
        SELECT d.id, TRUE, $1, 0
        FROM dispensaries d
        WHERE NOT EXISTS (
            SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
        )
        RETURNING id`, [intervalMinutes]);
    const existingCount = await migrate_1.pool.query('SELECT COUNT(*) FROM dispensary_crawl_schedule');
    return {
        created: result.rowCount || 0,
        existing: parseInt(existingCount.rows[0].count) - (result.rowCount || 0),
    };
}
// ========================================
// Scheduler Integration
// ========================================
let dispensarySchedulerRunning = false;
/**
 * Process dispensaries using the intelligent orchestrator
 * Called periodically by the scheduler
 */
async function processDispensaryScheduler() {
    if (dispensarySchedulerRunning) {
        console.log('Dispensary scheduler already running, skipping...');
        return;
    }
    dispensarySchedulerRunning = true;
    try {
        // Get dispensaries due for orchestration
        const dispensaryIds = await getDispensariesDueForOrchestration(3);
        if (dispensaryIds.length === 0) {
            return;
        }
        console.log(`Dispensary Scheduler: Processing ${dispensaryIds.length} dispensaries due for crawl`);
        // Process each dispensary through the orchestrator
        for (const dispensaryId of dispensaryIds) {
            try {
                console.log(`Dispensary Scheduler: Starting crawl for dispensary ${dispensaryId}`);
                const result = await runDispensaryOrchestrator(dispensaryId);
                console.log(`Dispensary Scheduler: Dispensary ${dispensaryId} completed - ${result.summary}`);
            }
            catch (error) {
                console.error(`Dispensary Scheduler: Dispensary ${dispensaryId} failed - ${error.message}`);
            }
        }
        console.log(`Dispensary Scheduler: Finished processing ${dispensaryIds.length} dispensaries`);
    }
    finally {
        dispensarySchedulerRunning = false;
    }
}
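// Illustrative batch usage (ids hypothetical): with concurrency = 3, ids [1..7] run as
// [1,2,3], [4,5,6], [7], with a 1s pause between batches:
//   const results = await runBatchDispensaryOrchestrator([1, 2, 3, 4, 5, 6, 7], 3);
//   console.log(results.filter(r => r.status === 'error').length, 'failures');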
125
backend/dist/services/geolocation.js
vendored
Normal file
@@ -0,0 +1,125 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.lookupProxyLocation = lookupProxyLocation;
exports.updateProxyLocation = updateProxyLocation;
exports.updateAllProxyLocations = updateAllProxyLocations;
exports.queueProxyLocationUpdate = queueProxyLocationUpdate;
const axios_1 = __importDefault(require("axios"));
const migrate_1 = require("../db/migrate");
// Free API - 45 requests/minute limit
const GEOLOCATION_API = 'http://ip-api.com/json/';
async function lookupProxyLocation(host) {
    try {
        const response = await axios_1.default.get(`${GEOLOCATION_API}${host}?fields=status,message,country,countryCode,regionName,city,query`);
        const data = response.data;
        if (data.status === 'fail') {
            console.log(`❌ Geolocation lookup failed for ${host}: ${data.message}`);
            return null;
        }
        return data;
    }
    catch (error) {
        console.error(`❌ Error looking up location for ${host}:`, error.message);
        return null;
    }
}
async function updateProxyLocation(proxyId, location) {
    await migrate_1.pool.query(`
        UPDATE proxies
        SET city = $1,
            state = $2,
            country = $3,
            country_code = $4,
            location_updated_at = CURRENT_TIMESTAMP
        WHERE id = $5
    `, [
        location.city,
        location.regionName,
        location.country,
        location.countryCode,
        proxyId
    ]);
}
async function updateAllProxyLocations(batchSize = 45) {
    console.log('🌍 Starting proxy location update job...');
    // Get all proxies without location data
    const result = await migrate_1.pool.query(`
        SELECT id, host
        FROM proxies
        WHERE location_updated_at IS NULL
           OR location_updated_at < CURRENT_TIMESTAMP - INTERVAL '30 days'
        ORDER BY id
    `);
    const proxies = result.rows;
    console.log(`📊 Found ${proxies.length} proxies to update`);
    let updated = 0;
    let failed = 0;
    // Process in batches to respect rate limit (45 req/min)
    for (let i = 0; i < proxies.length; i += batchSize) {
        const batch = proxies.slice(i, i + batchSize);
        console.log(`🔄 Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(proxies.length / batchSize)} (${batch.length} proxies)`);
        // Process batch
        for (const proxy of batch) {
            const location = await lookupProxyLocation(proxy.host);
            if (location) {
                await updateProxyLocation(proxy.id, location);
                console.log(`✅ Updated ${proxy.id}: ${location.city}, ${location.regionName} - ${location.country}`);
                updated++;
            }
            else {
                console.log(`⚠️ Failed to get location for proxy ${proxy.id} (${proxy.host})`);
                failed++;
            }
            // Small delay between requests
            await new Promise(resolve => setTimeout(resolve, 100));
        }
        // Wait 60 seconds before next batch to respect rate limit
        if (i + batchSize < proxies.length) {
            console.log(`⏳ Waiting 60s before next batch (rate limit: 45 req/min)...`);
            await new Promise(resolve => setTimeout(resolve, 60000));
        }
    }
    console.log(`✅ Proxy location update complete!`);
    console.log(`   Updated: ${updated}`);
    console.log(`   Failed: ${failed}`);
}
// Queue for background processing
const locationUpdateQueue = new Set();
let isProcessing = false;
function queueProxyLocationUpdate(proxyId) {
    locationUpdateQueue.add(proxyId);
    processLocationQueue();
}
async function processLocationQueue() {
    if (isProcessing || locationUpdateQueue.size === 0)
        return;
    isProcessing = true;
    try {
        const proxyIds = Array.from(locationUpdateQueue);
        locationUpdateQueue.clear();
        console.log(`🌍 Processing ${proxyIds.length} proxy location updates from queue`);
        for (const proxyId of proxyIds) {
            const result = await migrate_1.pool.query('SELECT host FROM proxies WHERE id = $1', [proxyId]);
            if (result.rows.length === 0)
                continue;
            const host = result.rows[0].host;
            const location = await lookupProxyLocation(host);
            if (location) {
                await updateProxyLocation(proxyId, location);
                console.log(`✅ Queue: Updated ${proxyId}: ${location.city}, ${location.regionName} - ${location.country}`);
            }
            // Respect rate limit
            await new Promise(resolve => setTimeout(resolve, 1500)); // ~40 req/min
        }
    }
    finally {
        isProcessing = false;
        // Process any new items that were added while we were processing
        if (locationUpdateQueue.size > 0) {
            processLocationQueue();
        }
    }
}
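// For reference, a successful ip-api.com lookup resolves to an object with exactly the
// fields requested in the query string above (values invented for illustration):
//   { status: 'success', country: 'United States', countryCode: 'US',
//     regionName: 'Arizona', city: 'Phoenix', query: '203.0.113.7' }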
493
backend/dist/services/intelligence-detector.js
vendored
Normal file
@@ -0,0 +1,493 @@
"use strict";
/**
 * Multi-Category Intelligence Detector
 *
 * Detects providers for each intelligence category independently:
 * - Products: Which provider serves product data
 * - Specials: Which provider serves deals/specials
 * - Brand: Which provider serves brand information
 * - Metadata: Which provider serves taxonomy/category data
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.detectMultiCategoryProviders = detectMultiCategoryProviders;
exports.detectCategoryProviderChange = detectCategoryProviderChange;
exports.updateDispensaryCategoryProvider = updateDispensaryCategoryProvider;
exports.updateAllCategoryProviders = updateAllCategoryProviders;
exports.moveCategoryToSandbox = moveCategoryToSandbox;
const migrate_1 = require("../db/migrate");
const logger_1 = require("./logger");
const puppeteer_1 = __importDefault(require("puppeteer"));
// Production-ready providers per category
// Only these combinations can be set to production mode
const PRODUCTION_READY = {
    product: ['dutchie'], // Only Dutchie products are production-ready
    specials: [], // None yet
    brand: [], // None yet
    metadata: [], // None yet
};
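// Worked examples of how this table combines with the confidence threshold applied
// in analyzeCategorySignals below (all numbers hypothetical):
//   product  + 'dutchie' @ confidence 85 -> 'production'
//   product  + 'treez'   @ confidence 95 -> 'sandbox' (treez not production-ready)
//   specials + 'dutchie' @ any score     -> 'sandbox' (no production-ready specials provider)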
// Provider detection patterns
const PROVIDER_PATTERNS = {
    dutchie: {
        scripts: [/dutchie\.com/i, /dutchie-plus/i, /dutchie\.js/i, /__DUTCHIE__/i, /dutchie-embed/i],
        iframes: [/dutchie\.com/i, /dutchie-plus\.com/i, /embed\.dutchie/i],
        html: [/class="dutchie/i, /id="dutchie/i, /data-dutchie/i, /"menuType":\s*"dutchie"/i],
        apiEndpoints: [/dutchie\.com\/graphql/i, /plus\.dutchie\.com/i],
        metaTags: [/dutchie/i],
    },
    treez: {
        scripts: [/treez\.io/i, /treez-ecommerce/i, /treez\.js/i],
        iframes: [/treez\.io/i, /shop\.treez/i],
        html: [/class="treez/i, /data-treez/i, /treez-menu/i],
        apiEndpoints: [/api\.treez\.io/i, /treez\.io\/api/i],
        metaTags: [],
    },
    jane: {
        scripts: [/jane\.co/i, /iheartjane\.com/i, /jane-frame/i, /jane\.js/i],
        iframes: [/jane\.co/i, /iheartjane\.com/i, /embed\.iheartjane/i],
        html: [/class="jane/i, /data-jane/i, /jane-embed/i],
        apiEndpoints: [/api\.iheartjane/i, /jane\.co\/api/i],
        metaTags: [],
    },
    weedmaps: {
        scripts: [/weedmaps\.com/i, /wm-menu/i],
        iframes: [/weedmaps\.com/i, /menu\.weedmaps/i],
        html: [/data-weedmaps/i, /wm-menu/i],
        apiEndpoints: [/api-g\.weedmaps/i, /weedmaps\.com\/api/i],
        metaTags: [],
    },
    leafly: {
        scripts: [/leafly\.com/i, /leafly-menu/i],
        iframes: [/leafly\.com/i, /order\.leafly/i],
        html: [/data-leafly/i, /leafly-embed/i],
        apiEndpoints: [/api\.leafly/i],
        metaTags: [],
    },
};
// Category-specific detection signals
const CATEGORY_SIGNALS = {
    product: {
        urlPatterns: [/\/menu/i, /\/products/i, /\/shop/i, /\/order/i],
        htmlPatterns: [/product-card/i, /menu-item/i, /product-list/i, /product-grid/i],
        jsonKeys: ['products', 'menuItems', 'items', 'inventory'],
    },
    specials: {
        urlPatterns: [/\/specials/i, /\/deals/i, /\/promotions/i, /\/offers/i],
        htmlPatterns: [/special/i, /deal/i, /promotion/i, /discount/i, /sale/i],
        jsonKeys: ['specials', 'deals', 'promotions', 'offers'],
    },
    brand: {
        urlPatterns: [/\/brands/i, /\/vendors/i, /\/producers/i],
        htmlPatterns: [/brand-list/i, /vendor/i, /producer/i, /manufacturer/i],
        jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'],
    },
    metadata: {
        urlPatterns: [/\/categories/i, /\/taxonomy/i],
        htmlPatterns: [/category-nav/i, /menu-categories/i, /filter-category/i],
        jsonKeys: ['categories', 'taxonomy', 'filters', 'types'],
    },
};
// ========================================
// Main Detection Function
// ========================================
async function detectMultiCategoryProviders(websiteUrl, options = {}) {
    const { timeout = 30000, headless = true, existingBrowser } = options;
    let browser = null;
    let page = null;
    const urlsTested = [];
    const rawSignals = {};
    try {
        browser = existingBrowser || await puppeteer_1.default.launch({
            headless,
            args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
        });
        page = await browser.newPage();
        await page.setViewport({ width: 1920, height: 1080 });
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
        // Navigate to main site
        const baseUrl = normalizeUrl(websiteUrl);
        urlsTested.push(baseUrl);
        await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout });
        // Collect signals from main page
        const mainPageSignals = await collectPageSignals(page);
        rawSignals.mainPage = mainPageSignals;
        // Try common menu URLs
        const menuUrls = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands'];
        for (const path of menuUrls) {
            try {
                const fullUrl = new URL(path, baseUrl).toString();
                urlsTested.push(fullUrl);
                await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 15000 });
                const signals = await collectPageSignals(page);
                rawSignals[path] = signals;
            }
            catch {
                // URL doesn't exist or timed out
            }
        }
        // Analyze signals for each category
        const result = {
            product: analyzeCategorySignals('product', rawSignals),
            specials: analyzeCategorySignals('specials', rawSignals),
            brand: analyzeCategorySignals('brand', rawSignals),
            metadata: analyzeCategorySignals('metadata', rawSignals),
            urlsTested,
            rawSignals,
        };
        logger_1.logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`);
        return result;
    }
    catch (error) {
        logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
        // Return unknown results for all categories
        return {
            product: createUnknownResult(),
            specials: createUnknownResult(),
            brand: createUnknownResult(),
            metadata: createUnknownResult(),
            urlsTested,
            rawSignals: { error: error.message },
        };
    }
    finally {
        if (page)
            await page.close().catch(() => { });
        if (browser && !existingBrowser)
            await browser.close().catch(() => { });
    }
}
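// Usage sketch (URL and values hypothetical):
//   const detection = await detectMultiCategoryProviders('https://example-dispensary.com');
//   detection.product  -> { provider: 'dutchie', confidence: 85, mode: 'production', ... }
//   detection.specials -> { provider: 'unknown', confidence: 0, mode: 'sandbox', ... }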
// ========================================
// Helper Functions
// ========================================
function normalizeUrl(url) {
    if (!url.startsWith('http')) {
        url = 'https://' + url;
    }
    return url.replace(/\/$/, '');
}
async function collectPageSignals(page) {
    return page.evaluate(() => {
        const signals = {
            scripts: [],
            iframes: [],
            links: [],
            metaTags: [],
            bodyClasses: document.body?.className || '',
            bodyId: document.body?.id || '',
            htmlSnippet: document.documentElement.outerHTML.slice(0, 10000),
        };
        // Collect script sources
        document.querySelectorAll('script[src]').forEach((el) => {
            signals.scripts.push(el.src);
        });
        // Collect inline scripts
        document.querySelectorAll('script:not([src])').forEach((el) => {
            const content = el.textContent || '';
            if (content.length < 5000) {
                signals.scripts.push(`inline:${content.slice(0, 500)}`);
            }
        });
        // Collect iframes
        document.querySelectorAll('iframe').forEach((el) => {
            signals.iframes.push(el.src);
        });
        // Collect links
        document.querySelectorAll('a[href]').forEach((el) => {
            signals.links.push(el.href);
        });
        // Collect meta tags
        document.querySelectorAll('meta').forEach((el) => {
            const content = el.getAttribute('content') || '';
            const name = el.getAttribute('name') || el.getAttribute('property') || '';
            if (content || name) {
                signals.metaTags.push(`${name}:${content}`);
            }
        });
        // Look for JSON data
        const jsonBlocks = [];
        document.querySelectorAll('script[type="application/json"]').forEach((el) => {
            jsonBlocks.push(el.textContent?.slice(0, 2000) || '');
        });
        signals.jsonBlocks = jsonBlocks;
        return signals;
    });
}
function analyzeCategorySignals(category, allSignals) {
    const providerScores = {};
    const detectedSignals = {};
    // Initialize scores
    for (const provider of Object.keys(PROVIDER_PATTERNS)) {
        providerScores[provider] = 0;
    }
    // Analyze each page's signals
    for (const [pagePath, signals] of Object.entries(allSignals)) {
        if (!signals || typeof signals !== 'object')
            continue;
        // Check for provider-specific patterns
        for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
            let score = 0;
            // Check scripts
            if (signals.scripts) {
                for (const script of signals.scripts) {
                    for (const pattern of patterns.scripts) {
                        if (pattern.test(script)) {
                            score += 20;
                            detectedSignals[`${provider}_script_${pagePath}`] = script;
                        }
                    }
                }
            }
            // Check iframes
            if (signals.iframes) {
                for (const iframe of signals.iframes) {
                    for (const pattern of patterns.iframes) {
                        if (pattern.test(iframe)) {
                            score += 25;
                            detectedSignals[`${provider}_iframe_${pagePath}`] = iframe;
                        }
                    }
                }
            }
            // Check HTML content
            if (signals.htmlSnippet) {
                for (const pattern of patterns.html) {
                    if (pattern.test(signals.htmlSnippet)) {
                        score += 15;
                        detectedSignals[`${provider}_html_${pagePath}`] = true;
                    }
                }
            }
            providerScores[provider] += score;
        }
        // Check for category-specific signals on relevant pages
        const categorySignals = CATEGORY_SIGNALS[category];
        const isRelevantPage = categorySignals.urlPatterns.some((p) => p.test(pagePath));
        if (isRelevantPage && signals.htmlSnippet) {
            for (const pattern of categorySignals.htmlPatterns) {
                if (pattern.test(signals.htmlSnippet)) {
                    detectedSignals[`${category}_html_pattern`] = true;
                }
            }
        }
        // Check JSON blocks for category data
        if (signals.jsonBlocks) {
            for (const json of signals.jsonBlocks) {
                for (const key of categorySignals.jsonKeys) {
                    if (json.toLowerCase().includes(`"${key}"`)) {
                        detectedSignals[`${category}_json_key_${key}`] = true;
                    }
                }
            }
        }
    }
    // Determine winning provider
    let bestProvider = 'unknown';
    let bestScore = 0;
    for (const [provider, score] of Object.entries(providerScores)) {
        if (score > bestScore) {
            bestScore = score;
            bestProvider = provider;
        }
    }
    // Calculate confidence (0-100)
    const confidence = Math.min(100, bestScore);
    // Determine mode based on provider and confidence
    const isProductionReady = PRODUCTION_READY[category].includes(bestProvider);
    const mode = isProductionReady && confidence >= 70
        ? 'production'
        : 'sandbox';
    // Get template name if available
    let templateName;
    if (bestProvider === 'dutchie' && category === 'product') {
        templateName = 'dutchie_standard';
    }
    else if (bestProvider === 'treez') {
        templateName = 'treez_products_v0';
    }
    return {
        provider: bestProvider,
        confidence,
        mode,
        signals: detectedSignals,
        templateName,
    };
}
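// Worked scoring example (hypothetical signals): a dutchie iframe on /menu (+25) plus a
// dutchie script on the main page (+20) gives bestScore 45, so confidence = min(100, 45)
// = 45, which is below the 70 threshold; mode stays 'sandbox' even though dutchie is
// production-ready for products.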
function createUnknownResult() {
    return {
        provider: 'unknown',
        confidence: 0,
        mode: 'sandbox',
        signals: {},
    };
}
// ========================================
// Lightweight Per-Category Change Detection
// ========================================
async function detectCategoryProviderChange(page, category, expectedProvider) {
    try {
        const signals = await collectPageSignals(page);
        const result = analyzeCategorySignals(category, { currentPage: signals });
        if (result.provider !== expectedProvider && result.confidence > 50) {
            logger_1.logger.warn('provider-detection', `Provider change detected for ${category}: ${expectedProvider} -> ${result.provider}`);
            return {
                changed: true,
                newProvider: result.provider,
                confidence: result.confidence,
            };
        }
        return { changed: false };
    }
    catch (error) {
        logger_1.logger.error('provider-detection', `Change detection failed: ${error.message}`);
        return { changed: false };
    }
}
// ========================================
// Database Operations
// ========================================
async function updateDispensaryCategoryProvider(dispensaryId, category, result) {
    const columnPrefix = category === 'product' ? 'product' :
        category === 'specials' ? 'specials' :
            category === 'brand' ? 'brand' : 'metadata';
    await migrate_1.pool.query(`UPDATE dispensaries SET
        ${columnPrefix}_provider = $1,
        ${columnPrefix}_confidence = $2,
        ${columnPrefix}_crawler_mode = $3,
        ${columnPrefix}_detection_data = $4,
        updated_at = NOW()
    WHERE id = $5`, [
        result.provider,
        result.confidence,
        result.mode,
        JSON.stringify(result.signals),
        dispensaryId,
    ]);
}
async function updateAllCategoryProviders(dispensaryId, result) {
    await migrate_1.pool.query(`UPDATE dispensaries SET
        product_provider = $1,
        product_confidence = $2,
        product_crawler_mode = $3,
        product_detection_data = $4,
        specials_provider = $5,
        specials_confidence = $6,
        specials_crawler_mode = $7,
        specials_detection_data = $8,
        brand_provider = $9,
        brand_confidence = $10,
        brand_crawler_mode = $11,
        brand_detection_data = $12,
        metadata_provider = $13,
        metadata_confidence = $14,
        metadata_crawler_mode = $15,
        metadata_detection_data = $16,
        updated_at = NOW()
    WHERE id = $17`, [
        result.product.provider,
        result.product.confidence,
        result.product.mode,
        JSON.stringify(result.product.signals),
        result.specials.provider,
        result.specials.confidence,
        result.specials.mode,
        JSON.stringify(result.specials.signals),
        result.brand.provider,
        result.brand.confidence,
        result.brand.mode,
        JSON.stringify(result.brand.signals),
        result.metadata.provider,
        result.metadata.confidence,
        result.metadata.mode,
        JSON.stringify(result.metadata.signals),
        dispensaryId,
    ]);
}
async function moveCategoryToSandbox(dispensaryId, category, reason) {
    const columnPrefix = category === 'product' ? 'product' :
        category === 'specials' ? 'specials' :
            category === 'brand' ? 'brand' : 'metadata';
    await migrate_1.pool.query(`UPDATE dispensaries SET
        ${columnPrefix}_crawler_mode = 'sandbox',
        ${columnPrefix}_detection_data = ${columnPrefix}_detection_data || $1::jsonb,
        updated_at = NOW()
    WHERE id = $2`, [
        JSON.stringify({ sandbox_reason: reason, sandbox_at: new Date().toISOString() }),
        dispensaryId,
    ]);
    logger_1.logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`);
}
612
backend/dist/services/menu-provider-detector.js
vendored
Normal file
@@ -0,0 +1,612 @@
|
||||
"use strict";
|
||||
/**
|
||||
* Menu Provider Detection Service
|
||||
*
|
||||
* Detects which menu platform a dispensary is using by analyzing:
|
||||
* - HTML content patterns (scripts, iframes, classes)
|
||||
* - URL patterns (embedded menu paths)
|
||||
* - API endpoint signatures
|
||||
* - Meta tags and headers
|
||||
*/
|
||||
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||
};
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.detectMenuProvider = detectMenuProvider;
|
||||
exports.quickDutchieCheck = quickDutchieCheck;
|
||||
exports.detectProviderChange = detectProviderChange;
|
||||
const puppeteer_1 = __importDefault(require("puppeteer"));
|
||||
const logger_1 = require("./logger");
|
||||
// Provider detection patterns
|
||||
const PROVIDER_PATTERNS = {
|
||||
dutchie: {
|
||||
scripts: [
|
||||
/dutchie/i,
|
||||
/dutchie-plus/i,
|
||||
/dutchie\.com/i,
|
||||
/dutchie-embed/i,
|
||||
],
|
||||
iframes: [
|
||||
/dutchie\.com/i,
|
||||
/embed\.dutchie/i,
|
||||
/iframe\.dutchie/i,
|
||||
],
|
||||
classes: [
|
||||
/dutchie-/i,
|
||||
/DutchieEmbed/i,
|
||||
],
|
||||
urls: [
|
||||
/dutchie\.com/i,
|
||||
/\.dutchie\./i,
|
||||
],
|
||||
meta: [
|
||||
/dutchie/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/graphql.*dutchie/i,
|
||||
/api\.dutchie/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-dutchie/i,
|
||||
/__DUTCHIE__/i,
|
||||
/dutchie-plus-iframe/i,
|
||||
],
|
||||
},
|
||||
treez: {
|
||||
scripts: [
|
||||
/treez/i,
|
||||
/treez\.io/i,
|
||||
/treezpay/i,
|
||||
],
|
||||
iframes: [
|
||||
/treez\.io/i,
|
||||
/menu\.treez/i,
|
||||
],
|
||||
classes: [
|
||||
/treez-/i,
|
||||
],
|
||||
urls: [
|
||||
/treez\.io/i,
|
||||
/\.treez\./i,
|
||||
],
|
||||
meta: [
|
||||
/treez/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.treez/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-treez/i,
|
||||
/treez-embed/i,
|
||||
],
|
||||
},
|
||||
jane: {
|
||||
scripts: [
|
||||
/jane\.co/i,
|
||||
/iheartjane/i,
|
||||
/jane-embed/i,
|
||||
/janetechnologies/i,
|
||||
],
|
||||
iframes: [
|
||||
/jane\.co/i,
|
||||
/iheartjane\.com/i,
|
||||
/menu\.jane/i,
|
||||
],
|
||||
classes: [
|
||||
/jane-/i,
|
||||
/iheartjane/i,
|
||||
],
|
||||
urls: [
|
||||
/jane\.co/i,
|
||||
/iheartjane\.com/i,
|
||||
],
|
||||
meta: [
|
||||
/jane/i,
|
||||
/iheartjane/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.iheartjane/i,
|
||||
/api\.jane\.co/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-jane/i,
|
||||
/jane-root/i,
|
||||
/jane-embed/i,
|
||||
],
|
||||
},
|
||||
weedmaps: {
|
||||
scripts: [
|
||||
/weedmaps/i,
|
||||
/wm\.com/i,
|
||||
],
|
||||
iframes: [
|
||||
/weedmaps\.com/i,
|
||||
/menu\.weedmaps/i,
|
||||
],
|
||||
classes: [
|
||||
/weedmaps-/i,
|
||||
/wm-/i,
|
||||
],
|
||||
urls: [
|
||||
/weedmaps\.com/i,
|
||||
],
|
||||
meta: [
|
||||
/weedmaps/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api.*weedmaps/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-weedmaps/i,
|
||||
],
|
||||
},
|
||||
leafly: {
|
||||
scripts: [
|
||||
/leafly/i,
|
||||
/leafly\.com/i,
|
||||
],
|
||||
iframes: [
|
||||
/leafly\.com/i,
|
||||
/menu\.leafly/i,
|
||||
],
|
||||
classes: [
|
||||
/leafly-/i,
|
||||
],
|
||||
urls: [
|
||||
/leafly\.com/i,
|
||||
],
|
||||
meta: [
|
||||
/leafly/i,
|
||||
],
|
||||
apiEndpoints: [
|
||||
/api\.leafly/i,
|
||||
],
|
||||
htmlPatterns: [
|
||||
/data-leafly/i,
|
||||
],
|
||||
},
|
||||
meadow: {
|
||||
scripts: [
|
||||
/meadow/i,
|
||||
/getmeadow/i,
|
||||
],
|
||||
iframes: [
|
||||
/getmeadow\.com/i,
|
||||
],
|
||||
classes: [
|
||||
/meadow-/i,
|
||||
],
|
||||
urls: [
|
||||
/getmeadow\.com/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [
|
||||
/api\.getmeadow/i,
|
||||
],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
greenlight: {
|
||||
scripts: [
|
||||
/greenlight/i,
|
||||
/greenlightmenu/i,
|
||||
],
|
||||
iframes: [
|
||||
/greenlight/i,
|
||||
],
|
||||
classes: [
|
||||
/greenlight-/i,
|
||||
],
|
||||
urls: [
|
||||
/greenlight/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
blaze: {
|
||||
scripts: [
|
||||
/blaze\.me/i,
|
||||
/blazepos/i,
|
||||
],
|
||||
iframes: [
|
||||
/blaze\.me/i,
|
||||
],
|
||||
classes: [
|
||||
/blaze-/i,
|
||||
],
|
||||
urls: [
|
||||
/blaze\.me/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [
|
||||
/api\.blaze/i,
|
||||
],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
flowhub: {
|
||||
scripts: [
|
||||
/flowhub/i,
|
||||
],
|
||||
iframes: [
|
||||
/flowhub\.com/i,
|
||||
],
|
||||
classes: [
|
||||
/flowhub-/i,
|
||||
],
|
||||
urls: [
|
||||
/flowhub\.com/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
dispense: {
|
||||
scripts: [
|
||||
/dispenseapp/i,
|
||||
],
|
||||
iframes: [
|
||||
/dispenseapp\.com/i,
|
||||
],
|
||||
classes: [
|
||||
/dispense-/i,
|
||||
],
|
||||
urls: [
|
||||
/dispenseapp\.com/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
cova: {
|
||||
scripts: [
|
||||
/covasoftware/i,
|
||||
/cova\.software/i,
|
||||
],
|
||||
iframes: [
|
||||
/cova/i,
|
||||
],
|
||||
classes: [
|
||||
/cova-/i,
|
||||
],
|
||||
urls: [
|
||||
/cova/i,
|
||||
],
|
||||
meta: [],
|
||||
apiEndpoints: [],
|
||||
htmlPatterns: [],
|
||||
},
|
||||
};
|
||||
// Common menu URL paths to check
|
||||
const MENU_PATHS = [
|
||||
'/menu',
|
||||
'/shop',
|
||||
'/products',
|
||||
'/order',
|
||||
'/store',
|
||||
'/dispensary-menu',
|
||||
'/online-menu',
|
||||
'/shop-all',
|
||||
'/browse',
|
||||
'/catalog',
|
||||
];
/**
 * Analyze a single page for provider signals
 */
async function analyzePageForProviders(page, url) {
    const signals = [];
    try {
        // Get page HTML
        const html = await page.content();
        const lowerHtml = html.toLowerCase();
        // Check each provider's patterns
        for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
            // Check script sources
            const scripts = await page.$$eval('script[src]', els => els.map(el => el.getAttribute('src') || ''));
            for (const script of scripts) {
                for (const pattern of patterns.scripts) {
                    if (pattern.test(script)) {
                        signals.push({
                            provider: provider,
                            confidence: 90,
                            source: 'script_src',
                            details: script,
                        });
                    }
                }
            }
            // Check inline scripts
            const inlineScripts = await page.$$eval('script:not([src])', els => els.map(el => el.textContent || ''));
            for (const scriptContent of inlineScripts) {
                for (const pattern of patterns.scripts) {
                    if (pattern.test(scriptContent)) {
                        signals.push({
                            provider: provider,
                            confidence: 70,
                            source: 'inline_script',
                            details: `Pattern: ${pattern}`,
                        });
                    }
                }
            }
            // Check iframes
            const iframes = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
            for (const iframe of iframes) {
                for (const pattern of patterns.iframes) {
                    if (pattern.test(iframe)) {
                        signals.push({
                            provider: provider,
                            confidence: 95,
                            source: 'iframe_src',
                            details: iframe,
                        });
                    }
                }
            }
            // Check HTML patterns
            for (const pattern of patterns.htmlPatterns) {
                if (pattern.test(html)) {
                    signals.push({
                        provider: provider,
                        confidence: 85,
                        source: 'html_pattern',
                        details: `Pattern: ${pattern}`,
                    });
                }
            }
            // Check CSS classes
            for (const pattern of patterns.classes) {
                if (pattern.test(html)) {
                    signals.push({
                        provider: provider,
                        confidence: 60,
                        source: 'css_class',
                        details: `Pattern: ${pattern}`,
                    });
                }
            }
            // Check meta tags
            const metaTags = await page.$$eval('meta', els => els.map(el => `${el.getAttribute('name')} ${el.getAttribute('content')}`));
            for (const meta of metaTags) {
                for (const pattern of patterns.meta) {
                    if (pattern.test(meta)) {
                        signals.push({
                            provider: provider,
                            confidence: 80,
                            source: 'meta_tag',
                            details: meta,
                        });
                    }
                }
            }
        }
        // Check for network requests (if we intercepted them)
        // This would be enhanced with request interception
    }
    catch (error) {
        logger_1.logger.error('provider-detection', `Error analyzing page ${url}: ${error}`);
    }
    return signals;
}
/**
 * Aggregate signals into a final detection result
 */
function aggregateSignals(signals) {
    if (signals.length === 0) {
        return { provider: 'unknown', confidence: 0 };
    }
    // Group signals by provider
    const providerScores = {};
    for (const signal of signals) {
        if (!providerScores[signal.provider]) {
            providerScores[signal.provider] = [];
        }
        providerScores[signal.provider].push(signal.confidence);
    }
    // Calculate weighted score for each provider
    const scores = [];
    for (const [provider, confidences] of Object.entries(providerScores)) {
        // Use max confidence + bonus for multiple signals
        const maxConf = Math.max(...confidences);
        const multiSignalBonus = Math.min(10, (confidences.length - 1) * 3);
        const score = Math.min(100, maxConf + multiSignalBonus);
        scores.push({ provider: provider, score });
    }
    // Sort by score descending
    scores.sort((a, b) => b.score - a.score);
    const best = scores[0];
    // If there's a clear winner (20+ point lead), use it
    if (scores.length === 1 || best.score - scores[1].score >= 20) {
        return { provider: best.provider, confidence: best.score };
    }
    // Multiple contenders - reduce confidence
    return { provider: best.provider, confidence: Math.max(50, best.score - 20) };
}
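A quick worked example of the scoring rule above, with hypothetical signals (the values are illustrative, not from a real page):

// Two dutchie signals and one weak jane signal (hypothetical values):
const example = aggregateSignals([
    { provider: 'dutchie', confidence: 95, source: 'iframe_src', details: 'https://dutchie.com/embed' },
    { provider: 'dutchie', confidence: 70, source: 'inline_script', details: 'Pattern: /dutchie/i' },
    { provider: 'jane', confidence: 60, source: 'css_class', details: 'Pattern: /jane-/i' },
]);
// dutchie: max(95, 70) + min(10, (2 - 1) * 3) = 98
// jane:    max(60)     + 0                    = 60
// 98 - 60 >= 20, so the clear-winner branch returns:
// { provider: 'dutchie', confidence: 98 }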
/**
 * Detect the menu provider for a dispensary
 */
async function detectMenuProvider(websiteUrl, options = {}) {
    const { checkMenuPaths = true, timeout = 30000 } = options;
    const result = {
        provider: 'unknown',
        confidence: 0,
        signals: [],
        urlsTested: [],
        menuEntryPoints: [],
        rawSignals: {},
    };
    let browser = null;
    try {
        // Normalize URL
        let baseUrl = websiteUrl.trim();
        if (!baseUrl.startsWith('http')) {
            baseUrl = `https://${baseUrl}`;
        }
        baseUrl = baseUrl.replace(/\/$/, ''); // Remove trailing slash
        // Launch browser
        browser = await puppeteer_1.default.launch({
            headless: true,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
            ],
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        // Track network requests for API detection
        const apiRequests = [];
        await page.setRequestInterception(true);
        page.on('request', (request) => {
            const url = request.url();
            if (url.includes('api') || url.includes('graphql')) {
                apiRequests.push(url);
            }
            request.continue();
        });
        // URLs to check
        const urlsToCheck = [baseUrl];
        if (checkMenuPaths) {
            for (const path of MENU_PATHS) {
                urlsToCheck.push(`${baseUrl}${path}`);
            }
        }
        // Check each URL
        for (const url of urlsToCheck) {
            try {
                result.urlsTested.push(url);
                await page.goto(url, {
                    waitUntil: 'networkidle2',
                    timeout,
                });
                // Wait a bit for dynamic content
                await new Promise(r => setTimeout(r, 2000));
                // Analyze page
                const pageSignals = await analyzePageForProviders(page, url);
                result.signals.push(...pageSignals);
                // Track if this URL has menu content
                const hasMenuContent = await page.evaluate(() => {
                    const text = document.body.innerText.toLowerCase();
                    return (text.includes('add to cart') ||
                        text.includes('add to bag') ||
                        text.includes('product') ||
                        text.includes('indica') ||
                        text.includes('sativa') ||
                        text.includes('hybrid') ||
                        text.includes('thc') ||
                        text.includes('cbd'));
                });
                if (hasMenuContent && url !== baseUrl) {
                    result.menuEntryPoints.push(url);
                }
            }
            catch (pageError) {
                // 404s are fine, just skip
                if (!pageError.message?.includes('404')) {
                    logger_1.logger.warn('provider-detection', `Could not load ${url}: ${pageError.message}`);
                }
            }
        }
        // Check API requests for provider hints
        for (const apiUrl of apiRequests) {
            for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
                for (const pattern of patterns.apiEndpoints) {
                    if (pattern.test(apiUrl)) {
                        result.signals.push({
                            provider: provider,
                            confidence: 95,
                            source: 'api_request',
                            details: apiUrl,
                        });
                    }
                }
            }
        }
        // Record raw signals
        result.rawSignals = {
            apiRequestsFound: apiRequests.length,
            menuEntryPointsFound: result.menuEntryPoints.length,
            totalSignals: result.signals.length,
            uniqueProviders: [...new Set(result.signals.map(s => s.provider))].length,
        };
        // Aggregate signals into final result
        const aggregated = aggregateSignals(result.signals);
        result.provider = aggregated.provider;
        result.confidence = aggregated.confidence;
    }
    catch (error) {
        result.error = error.message;
        logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
    }
    finally {
        if (browser) {
            await browser.close();
        }
    }
    return result;
}
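A minimal usage sketch for the detector; the domain is a placeholder, and the options shown are just the defaults spelled out:

// Placeholder domain; checkMenuPaths/timeout are the defaults made explicit.
const detection = await detectMenuProvider('example-dispensary.com', {
    checkMenuPaths: true,
    timeout: 30000,
});
console.log(`${detection.provider} @ ${detection.confidence}% confidence`);
console.log('Menu entry points:', detection.menuEntryPoints);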
/**
 * Quick check if a site has Dutchie - used during production crawls
 */
async function quickDutchieCheck(page) {
    try {
        const html = await page.content();
        // Check for Dutchie-specific patterns
        const dutchiePatterns = [
            /dutchie/i,
            /dutchie-plus/i,
            /__DUTCHIE__/i,
            /data-dutchie/i,
            /embed\.dutchie/i,
        ];
        for (const pattern of dutchiePatterns) {
            if (pattern.test(html)) {
                return true;
            }
        }
        // Check iframes
        const iframes = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
        for (const iframe of iframes) {
            if (/dutchie/i.test(iframe)) {
                return true;
            }
        }
        return false;
    }
    catch {
        return false;
    }
}
/**
 * Check if provider has changed from expected
 */
async function detectProviderChange(page, expectedProvider) {
    try {
        const signals = await analyzePageForProviders(page, page.url());
        const aggregated = aggregateSignals(signals);
        // If we expected Dutchie but found something else with high confidence
        if (expectedProvider === 'dutchie' && aggregated.provider !== 'dutchie' && aggregated.confidence >= 70) {
            return {
                changed: true,
                newProvider: aggregated.provider,
                confidence: aggregated.confidence,
            };
        }
        // If we expected Dutchie and found nothing/low confidence, might have switched
        if (expectedProvider === 'dutchie' && aggregated.confidence < 30) {
            // Check if Dutchie is definitely NOT present
            const hasDutchie = await quickDutchieCheck(page);
            if (!hasDutchie) {
                return {
                    changed: true,
                    newProvider: aggregated.provider !== 'unknown' ? aggregated.provider : 'other',
                    confidence: Math.max(30, aggregated.confidence),
                };
            }
        }
        return { changed: false };
    }
    catch {
        return { changed: false };
    }
}
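A sketch of how a production crawl might consult the change check before scraping; the shape of the `dispensary` record is assumed here, not defined in this diff:

// `page` is an already-navigated Puppeteer page; `dispensary.product_provider`
// is assumed to hold the last detected provider (e.g. 'dutchie').
const change = await detectProviderChange(page, dispensary.product_provider);
if (change.changed) {
    // Skip scraping with the stale template and flag the store for re-detection.
    console.warn(`Provider changed to ${change.newProvider} (confidence ${change.confidence})`);
}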
171
backend/dist/services/proxy.js
vendored
@@ -3,22 +3,92 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.isBotDetectionError = isBotDetectionError;
exports.putProxyInTimeout = putProxyInTimeout;
exports.isProxyInTimeout = isProxyInTimeout;
exports.getActiveProxy = getActiveProxy;
exports.testProxy = testProxy;
exports.saveProxyTestResult = saveProxyTestResult;
exports.testAllProxies = testAllProxies;
exports.addProxy = addProxy;
exports.addProxiesFromList = addProxiesFromList;
exports.moveProxyToFailed = moveProxyToFailed;
exports.incrementProxyFailure = incrementProxyFailure;
const axios_1 = __importDefault(require("axios"));
const socks_proxy_agent_1 = require("socks-proxy-agent");
const https_proxy_agent_1 = require("https-proxy-agent");
const migrate_1 = require("../db/migrate");
// In-memory proxy timeout tracking
// Maps proxy ID to timestamp when timeout expires
const proxyTimeouts = new Map();
const PROXY_TIMEOUT_MS = 35000; // 35 seconds timeout for bot-detected proxies
// Check if error message indicates bot detection
function isBotDetectionError(errorMsg) {
    const botPatterns = [
        /bot detection/i,
        /captcha/i,
        /challenge/i,
        /cloudflare/i,
        /access denied/i,
        /rate limit/i,
        /too many requests/i,
        /temporarily blocked/i,
        /suspicious activity/i,
    ];
    return botPatterns.some(pattern => pattern.test(errorMsg));
}
// Put proxy in timeout (bot detection cooldown)
function putProxyInTimeout(proxyId, reason) {
    const timeoutUntil = Date.now() + PROXY_TIMEOUT_MS;
    proxyTimeouts.set(proxyId, timeoutUntil);
    console.log(`🚫 Proxy ${proxyId} in timeout for ${PROXY_TIMEOUT_MS / 1000}s: ${reason}`);
}
// Check if proxy is currently in timeout
function isProxyInTimeout(proxyId) {
    const timeoutUntil = proxyTimeouts.get(proxyId);
    if (!timeoutUntil)
        return false;
    if (Date.now() >= timeoutUntil) {
        // Timeout expired, remove it
        proxyTimeouts.delete(proxyId);
        console.log(`✅ Proxy ${proxyId} timeout expired, back in rotation`);
        return false;
    }
    return true;
}
// Get active proxy that's not in timeout
async function getActiveProxy() {
    const result = await migrate_1.pool.query(`
        SELECT id, host, port, protocol, username, password
        FROM proxies
        WHERE active = true
        ORDER BY RANDOM()
    `);
    // Filter out proxies in timeout
    for (const proxy of result.rows) {
        if (!isProxyInTimeout(proxy.id)) {
            return proxy;
        }
    }
    // All proxies are in timeout, wait for first one to expire
    if (proxyTimeouts.size > 0) {
        const nextAvailable = Math.min(...Array.from(proxyTimeouts.values()));
        const waitTime = Math.max(0, nextAvailable - Date.now());
        console.log(`⏳ All proxies in timeout, waiting ${Math.ceil(waitTime / 1000)}s for next available...`);
        await new Promise(resolve => setTimeout(resolve, waitTime));
        // Try again after waiting
        return getActiveProxy();
    }
    console.log('⚠️ No active proxies available');
    return null;
}
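A sketch of how these helpers are intended to compose in a crawl attempt; `fetchThroughProxy` is a placeholder for whatever request the caller actually makes:

// Acquire a proxy, attempt the request, and triage the failure mode.
async function crawlWithRotation(fetchThroughProxy) {
    const proxy = await getActiveProxy();
    if (!proxy)
        return null; // nothing usable right now
    try {
        return await fetchThroughProxy(proxy);
    }
    catch (err) {
        const msg = String(err);
        if (isBotDetectionError(msg)) {
            putProxyInTimeout(proxy.id, msg); // 35s cooldown; proxy stays active
        }
        else {
            await incrementProxyFailure(proxy.id, msg); // third strike moves it to failed_proxies
        }
        throw err;
    }
}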
async function getSettings() {
    const result = await migrate_1.pool.query(`
        SELECT key, value FROM settings
        WHERE key IN ('proxy_timeout_ms', 'proxy_test_url')
    `);
    const settings = {};
    result.rows.forEach(row => {
    result.rows.forEach((row) => {
        settings[row.key] = row.value;
    });
    return {
@@ -146,12 +216,44 @@ async function addProxy(host, port, protocol, username, password) {
async function addProxiesFromList(proxies) {
    let added = 0;
    let failed = 0;
    let duplicates = 0;
    const errors = [];
    console.log(`📥 Importing ${proxies.length} proxies without testing...`);
    for (const proxy of proxies) {
        try {
            await addProxy(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
            added++;
            console.log(`✅ Added proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
            // Insert without testing first
            await migrate_1.pool.query(`
                INSERT INTO proxies (host, port, protocol, username, password, active)
                VALUES ($1, $2, $3, $4, $5, false)
                ON CONFLICT (host, port, protocol) DO NOTHING
            `, [
                proxy.host,
                proxy.port,
                proxy.protocol,
                proxy.username,
                proxy.password
            ]);
            // Check if it was actually inserted
            const result = await migrate_1.pool.query(`
                SELECT id FROM proxies
                WHERE host = $1 AND port = $2 AND protocol = $3
            `, [proxy.host, proxy.port, proxy.protocol]);
            if (result.rows.length > 0) {
                // Check if it was just inserted (no last_tested_at means new)
                const checkResult = await migrate_1.pool.query(`
                    SELECT last_tested_at FROM proxies
                    WHERE host = $1 AND port = $2 AND protocol = $3
                `, [proxy.host, proxy.port, proxy.protocol]);
                if (checkResult.rows[0].last_tested_at === null) {
                    added++;
                    if (added % 100 === 0) {
                        console.log(`📥 Imported ${added} proxies...`);
                    }
                }
                else {
                    duplicates++;
                }
            }
        }
        catch (error) {
            failed++;
@@ -159,8 +261,63 @@ async function addProxiesFromList(proxies) {
            errors.push(errorMsg);
            console.log(`❌ Failed to add proxy: ${errorMsg}`);
        }
        // Small delay between adds
        await new Promise(resolve => setTimeout(resolve, 500));
    }
    return { added, failed, errors };
    console.log(`✅ Import complete: ${added} added, ${duplicates} duplicates, ${failed} failed`);
    return { added, failed, duplicates, errors };
}
async function moveProxyToFailed(proxyId, errorMsg) {
    // Get proxy details
    const proxyResult = await migrate_1.pool.query(`
        SELECT host, port, protocol, username, password, failure_count
        FROM proxies
        WHERE id = $1
    `, [proxyId]);
    if (proxyResult.rows.length === 0) {
        return;
    }
    const proxy = proxyResult.rows[0];
    // Insert into failed_proxies table
    await migrate_1.pool.query(`
        INSERT INTO failed_proxies (host, port, protocol, username, password, failure_count, last_error)
        VALUES ($1, $2, $3, $4, $5, $6, $7)
        ON CONFLICT (host, port, protocol)
        DO UPDATE SET
            failure_count = $6,
            last_error = $7,
            failed_at = CURRENT_TIMESTAMP
    `, [
        proxy.host,
        proxy.port,
        proxy.protocol,
        proxy.username,
        proxy.password,
        proxy.failure_count,
        errorMsg
    ]);
    // Delete from active proxies
    await migrate_1.pool.query(`DELETE FROM proxies WHERE id = $1`, [proxyId]);
    console.log(`🔴 Moved proxy to failed: ${proxy.protocol}://${proxy.host}:${proxy.port} (${proxy.failure_count} failures)`);
}
async function incrementProxyFailure(proxyId, errorMsg) {
    // Increment failure count
    const result = await migrate_1.pool.query(`
        UPDATE proxies
        SET failure_count = failure_count + 1,
            active = false,
            updated_at = CURRENT_TIMESTAMP
        WHERE id = $1
        RETURNING failure_count, host, port, protocol
    `, [proxyId]);
    if (result.rows.length === 0) {
        return false;
    }
    const proxy = result.rows[0];
    const failureCount = proxy.failure_count;
    console.log(`⚠️ Proxy failure #${failureCount}: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
    // If failed 3 times, move to failed table
    if (failureCount >= 3) {
        await moveProxyToFailed(proxyId, errorMsg);
        return true; // Moved to failed
    }
    return false; // Still in active proxies
}
174
backend/dist/services/proxyTestQueue.js
vendored
Normal file
@@ -0,0 +1,174 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.cleanupOrphanedJobs = cleanupOrphanedJobs;
exports.createProxyTestJob = createProxyTestJob;
exports.getProxyTestJob = getProxyTestJob;
exports.getActiveProxyTestJob = getActiveProxyTestJob;
exports.cancelProxyTestJob = cancelProxyTestJob;
const migrate_1 = require("../db/migrate");
const proxy_1 = require("./proxy");
// Simple in-memory queue - could be replaced with Bull/Bee-Queue for production
const activeJobs = new Map();
// Clean up orphaned jobs on server startup
async function cleanupOrphanedJobs() {
    try {
        const result = await migrate_1.pool.query(`
            UPDATE proxy_test_jobs
            SET status = 'cancelled',
                completed_at = CURRENT_TIMESTAMP,
                updated_at = CURRENT_TIMESTAMP
            WHERE status IN ('pending', 'running')
            RETURNING id
        `);
        if (result.rows.length > 0) {
            console.log(`🧹 Cleaned up ${result.rows.length} orphaned proxy test jobs`);
        }
    }
    catch (error) {
        console.error('Error cleaning up orphaned jobs:', error);
    }
}
async function createProxyTestJob() {
    // Check for existing running jobs first
    const existingJob = await getActiveProxyTestJob();
    if (existingJob) {
        throw new Error('A proxy test job is already running. Please cancel it first.');
    }
    const result = await migrate_1.pool.query(`
        SELECT COUNT(*) as count FROM proxies
    `);
    const totalProxies = parseInt(result.rows[0].count);
    const jobResult = await migrate_1.pool.query(`
        INSERT INTO proxy_test_jobs (status, total_proxies)
        VALUES ('pending', $1)
        RETURNING id
    `, [totalProxies]);
    const jobId = jobResult.rows[0].id;
    // Start job in background
    runProxyTestJob(jobId).catch(err => {
        console.error(`❌ Proxy test job ${jobId} failed:`, err);
    });
    return jobId;
}
async function getProxyTestJob(jobId) {
    const result = await migrate_1.pool.query(`
        SELECT id, status, total_proxies, tested_proxies, passed_proxies, failed_proxies
        FROM proxy_test_jobs
        WHERE id = $1
    `, [jobId]);
    if (result.rows.length === 0) {
        return null;
    }
    return result.rows[0];
}
async function getActiveProxyTestJob() {
    const result = await migrate_1.pool.query(`
        SELECT id, status, total_proxies, tested_proxies, passed_proxies, failed_proxies
        FROM proxy_test_jobs
        WHERE status IN ('pending', 'running')
        ORDER BY created_at DESC
        LIMIT 1
    `);
    if (result.rows.length === 0) {
        return null;
    }
    return result.rows[0];
}
async function cancelProxyTestJob(jobId) {
    // Try to cancel in-memory job first
    const jobControl = activeJobs.get(jobId);
    if (jobControl) {
        jobControl.cancelled = true;
    }
    // Always update database to handle orphaned jobs
    const result = await migrate_1.pool.query(`
        UPDATE proxy_test_jobs
        SET status = 'cancelled',
            completed_at = CURRENT_TIMESTAMP,
            updated_at = CURRENT_TIMESTAMP
        WHERE id = $1 AND status IN ('pending', 'running')
        RETURNING id
    `, [jobId]);
    return result.rows.length > 0;
}
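A small lifecycle sketch using only the functions exported above (the 5-second poll interval is arbitrary):

// Start a run, poll its progress, and stop polling once it settles.
const jobId = await createProxyTestJob(); // throws if a job is already active
const poll = setInterval(async () => {
    const job = await getProxyTestJob(jobId);
    if (!job || !['pending', 'running'].includes(job.status)) {
        clearInterval(poll);
        console.log(`Job ${jobId} finished with status: ${job?.status}`);
    }
}, 5000);
// From an admin action, the same job can be stopped early:
// await cancelProxyTestJob(jobId);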
async function runProxyTestJob(jobId) {
    // Register job as active
    activeJobs.set(jobId, { cancelled: false });
    try {
        // Update status to running
        await migrate_1.pool.query(`
            UPDATE proxy_test_jobs
            SET status = 'running',
                started_at = CURRENT_TIMESTAMP,
                updated_at = CURRENT_TIMESTAMP
            WHERE id = $1
        `, [jobId]);
        console.log(`🔍 Starting proxy test job ${jobId}...`);
        // Get all proxies
        const result = await migrate_1.pool.query(`
            SELECT id, host, port, protocol, username, password
            FROM proxies
            ORDER BY id
        `);
        let tested = 0;
        let passed = 0;
        let failed = 0;
        for (const proxy of result.rows) {
            // Check if job was cancelled
            const jobControl = activeJobs.get(jobId);
            if (jobControl?.cancelled) {
                console.log(`⏸️ Proxy test job ${jobId} cancelled`);
                break;
            }
            // Test the proxy
            const testResult = await (0, proxy_1.testProxy)(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
            // Save result
            await (0, proxy_1.saveProxyTestResult)(proxy.id, testResult);
            tested++;
            if (testResult.success) {
                passed++;
            }
            else {
                failed++;
            }
            // Update job progress
            await migrate_1.pool.query(`
                UPDATE proxy_test_jobs
                SET tested_proxies = $1,
                    passed_proxies = $2,
                    failed_proxies = $3,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id = $4
            `, [tested, passed, failed, jobId]);
            // Log progress every 10 proxies
            if (tested % 10 === 0) {
                console.log(`📊 Job ${jobId}: ${tested}/${result.rows.length} proxies tested (${passed} passed, ${failed} failed)`);
            }
        }
        // Mark job as completed
        const jobControl = activeJobs.get(jobId);
        const finalStatus = jobControl?.cancelled ? 'cancelled' : 'completed';
        await migrate_1.pool.query(`
            UPDATE proxy_test_jobs
            SET status = $1,
                completed_at = CURRENT_TIMESTAMP,
                updated_at = CURRENT_TIMESTAMP
            WHERE id = $2
        `, [finalStatus, jobId]);
        console.log(`✅ Proxy test job ${jobId} ${finalStatus}: ${tested} tested, ${passed} passed, ${failed} failed`);
    }
    catch (error) {
        console.error(`❌ Proxy test job ${jobId} error:`, error);
        await migrate_1.pool.query(`
            UPDATE proxy_test_jobs
            SET status = 'failed',
                completed_at = CURRENT_TIMESTAMP,
                updated_at = CURRENT_TIMESTAMP
            WHERE id = $1
        `, [jobId]);
    }
    finally {
        // Remove from active jobs
        activeJobs.delete(jobId);
    }
}
2
backend/dist/services/scheduler.js
vendored
@@ -18,7 +18,7 @@ async function getSettings() {
        WHERE key IN ('scrape_interval_hours', 'scrape_specials_time')
    `);
    const settings = {};
    result.rows.forEach(row => {
    result.rows.forEach((row) => {
        settings[row.key] = row.value;
    });
    return {
7
backend/dist/services/scraper-debug.js
vendored
@@ -4,10 +4,13 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.debugDutchiePage = debugDutchiePage;
const puppeteer_1 = __importDefault(require("puppeteer"));
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const logger_1 = require("./logger");
// Apply stealth plugin
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
async function debugDutchiePage(url) {
    const browser = await puppeteer_1.default.launch({
    const browser = await puppeteer_extra_1.default.launch({
        headless: 'new',
        args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
    });
236
backend/dist/services/scraper-playwright.js
vendored
Normal file
@@ -0,0 +1,236 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.scrapeCategoryPlaywright = scrapeCategoryPlaywright;
exports.testScrapeCategoryPlaywright = testScrapeCategoryPlaywright;
const age_gate_playwright_1 = require("../utils/age-gate-playwright");
const logger_1 = require("./logger");
const stealthBrowser_1 = require("../utils/stealthBrowser");
const dutchie_1 = require("../scrapers/templates/dutchie");
/**
 * Scrapes a category page using Playwright with stealth mode to extract product information
 */
async function scrapeCategoryPlaywright(categoryUrl, categoryName, state = 'Arizona', proxy) {
    logger_1.logger.info('scraper', `Scraping category: ${categoryName}`);
    logger_1.logger.info('scraper', `URL: ${categoryUrl}`);
    // Create stealth browser with optional proxy
    const browser = await (0, stealthBrowser_1.createStealthBrowser)({ proxy, headless: true });
    try {
        // Create stealth context with age gate cookies
        const context = await (0, stealthBrowser_1.createStealthContext)(browser, { state });
        // Try to load saved session cookies
        const cookiesPath = `/tmp/dutchie-session-${state.toLowerCase()}.json`;
        await (0, stealthBrowser_1.loadCookies)(context, cookiesPath);
        const page = await context.newPage();
        // Navigate to category page
        logger_1.logger.info('scraper', `Loading page: ${categoryUrl}`);
        await page.goto(categoryUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
        // Random delay to appear more human
        await (0, stealthBrowser_1.randomDelay)(1000, 2000);
        // Check for Cloudflare challenge
        if (await (0, stealthBrowser_1.isCloudflareChallenge)(page)) {
            logger_1.logger.info('scraper', '🛡️ Cloudflare challenge detected, waiting...');
            const passed = await (0, stealthBrowser_1.waitForCloudflareChallenge)(page, 30000);
            if (!passed) {
                logger_1.logger.error('scraper', '❌ Failed to pass Cloudflare challenge');
                await browser.close();
                return [];
            }
            // Save successful session cookies
            await (0, stealthBrowser_1.saveCookies)(context, cookiesPath);
        }
        // Wait for page to be fully loaded
        await (0, stealthBrowser_1.waitForPageLoad)(page);
        // Simulate human behavior
        await (0, stealthBrowser_1.simulateHumanBehavior)(page);
        // Check for and bypass age gate
        const bypassed = await (0, age_gate_playwright_1.bypassAgeGatePlaywright)(page, state);
        if (!bypassed) {
            logger_1.logger.error('scraper', 'Failed to bypass age gate');
            await browser.close();
            return [];
        }
        // Wait for products to load with random delay
        logger_1.logger.info('scraper', 'Waiting for products to load...');
        await (0, stealthBrowser_1.randomDelay)(2000, 4000);
        // Scroll to load all products with human-like behavior
        logger_1.logger.info('scraper', 'Scrolling to load all products...');
        await scrollToBottomHuman(page);
        // Extract products
        logger_1.logger.info('scraper', 'Extracting products from page...');
        const products = await extractProducts(page, categoryUrl, categoryName);
        logger_1.logger.info('scraper', `Found ${products.length} products`);
        await browser.close();
        return products;
    }
    catch (error) {
        logger_1.logger.error('scraper', `Error scraping category: ${error}`);
        await browser.close();
        return [];
    }
}
/**
 * Scrolls to the bottom of the page with human-like behavior
 */
async function scrollToBottomHuman(page) {
    let previousHeight = 0;
    let currentHeight = await page.evaluate(() => document.body.scrollHeight);
    let attempts = 0;
    const maxAttempts = 20;
    while (previousHeight < currentHeight && attempts < maxAttempts) {
        previousHeight = currentHeight;
        // Scroll down in chunks with randomized delays
        const scrollAmount = Math.floor(Math.random() * 200) + 300; // 300-500px
        await (0, stealthBrowser_1.humanScroll)(page, scrollAmount);
        // Random pause like a human reading
        await (0, stealthBrowser_1.randomDelay)(500, 1500);
        // Check new height
        currentHeight = await page.evaluate(() => document.body.scrollHeight);
        attempts++;
    }
    // Final wait for any lazy-loaded content
    await (0, stealthBrowser_1.randomDelay)(1000, 2000);
}
/**
 * Extracts product information from the page
 */
async function extractProducts(page, categoryUrl, categoryName) {
    let products = [];
    // Check if we have a template for this URL
    const template = (0, dutchie_1.getTemplateForUrl)(categoryUrl);
    if (template) {
        logger_1.logger.info('scraper', `Using ${template.name} template for extraction`);
        try {
            const templateProducts = await template.extractProducts(page);
            // Add category to products from template
            products = templateProducts.map(p => ({
                ...p,
                category: categoryName,
            }));
            logger_1.logger.info('scraper', `Template extracted ${products.length} products`);
            return products;
        }
        catch (err) {
            logger_1.logger.error('scraper', `Template extraction failed: ${err}`);
            // Fall through to fallback methods
        }
    }
    // Fallback Method 1: Dutchie products (for Sol Flower, etc.)
    try {
        const dutchieProducts = await page.locator('[data-testid^="product-"], .product-card, [class*="ProductCard"]').all();
        if (dutchieProducts.length > 0) {
            logger_1.logger.info('scraper', `Found ${dutchieProducts.length} Dutchie-style products`);
            for (const productEl of dutchieProducts) {
                try {
                    const name = await productEl.locator('[data-testid="product-name"], .product-name, h3, h4').first().textContent() || '';
                    const brand = await productEl.locator('[data-testid="product-brand"], .product-brand, .brand').first().textContent().catch(() => '');
                    const priceText = await productEl.locator('[data-testid="product-price"], .product-price, .price').first().textContent().catch(() => '');
                    const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => '');
                    const productLink = await productEl.locator('a').first().getAttribute('href').catch(() => '');
                    // Parse price
                    const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined;
                    if (name) {
                        products.push({
                            name: name.trim(),
                            brand: brand ? brand.trim() : undefined,
                            category: categoryName,
                            price,
                            image_url: imageUrl || undefined,
                            product_url: productLink ? new URL(productLink, categoryUrl).toString() : categoryUrl,
                            in_stock: true
                        });
                    }
                }
                catch (err) {
                    logger_1.logger.warn('scraper', `Error extracting Dutchie product: ${err}`);
                }
            }
        }
    }
    catch (err) {
        logger_1.logger.warn('scraper', `Dutchie product extraction failed: ${err}`);
    }
    // Method 2: Curaleaf products
    if (products.length === 0) {
        try {
            const curaleafProducts = await page.locator('.product, [class*="Product"], [class*="item"]').all();
            if (curaleafProducts.length > 0) {
                logger_1.logger.info('scraper', `Found ${curaleafProducts.length} Curaleaf-style products`);
                for (const productEl of curaleafProducts) {
                    try {
                        const name = await productEl.locator('h1, h2, h3, h4, .title, .name').first().textContent() || '';
                        const priceText = await productEl.locator('.price, [class*="price"]').first().textContent().catch(() => '');
                        const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => '');
                        const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined;
                        if (name && name.length > 3) {
                            products.push({
                                name: name.trim(),
                                category: categoryName,
                                price,
                                image_url: imageUrl || undefined,
                                product_url: categoryUrl,
                                in_stock: true
                            });
                        }
                    }
                    catch (err) {
                        logger_1.logger.warn('scraper', `Error extracting Curaleaf product: ${err}`);
                    }
                }
            }
        }
        catch (err) {
            logger_1.logger.warn('scraper', `Curaleaf product extraction failed: ${err}`);
        }
    }
    // Method 3: Generic product cards
    if (products.length === 0) {
        try {
            const genericProducts = await page.locator('article, [role="article"], .card, [class*="card"]').all();
            logger_1.logger.info('scraper', `Trying generic selectors, found ${genericProducts.length} elements`);
            for (const productEl of genericProducts) {
                try {
                    const text = await productEl.textContent() || '';
                    // Only consider elements that look like products
                    if (text.includes('$') || text.toLowerCase().includes('price') || text.toLowerCase().includes('thc')) {
                        const name = await productEl.locator('h1, h2, h3, h4').first().textContent() || '';
                        if (name && name.length > 3) {
                            products.push({
                                name: name.trim(),
                                category: categoryName,
                                product_url: categoryUrl,
                                in_stock: true
                            });
                        }
                    }
                }
                catch (err) {
                    // Skip this element
                }
            }
        }
        catch (err) {
            logger_1.logger.warn('scraper', `Generic product extraction failed: ${err}`);
        }
    }
    return products;
}
/**
 * Test function to scrape a single category
 */
async function testScrapeCategoryPlaywright(url, categoryName, state = 'Arizona') {
    console.log(`\n🎭 Testing Playwright Category Scraper\n`);
    console.log(`Category: ${categoryName}`);
    console.log(`URL: ${url}\n`);
    const products = await scrapeCategoryPlaywright(url, categoryName, state);
    console.log(`\n✅ Found ${products.length} products\n`);
    products.slice(0, 5).forEach((p, i) => {
        console.log(`${i + 1}. ${p.name}`);
        if (p.brand)
            console.log(`   Brand: ${p.brand}`);
        if (p.price)
            console.log(`   Price: $${p.price}`);
        console.log(`   URL: ${p.product_url}`);
        console.log('');
    });
    return products;
}
258
backend/dist/services/scraper.js
vendored
@@ -3,20 +3,52 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.USER_AGENT_GROUPS = exports.USER_AGENTS = void 0;
exports.getUserAgent = getUserAgent;
exports.scrapeCategory = scrapeCategory;
exports.saveProducts = saveProducts;
exports.scrapeStore = scrapeStore;
const puppeteer_1 = __importDefault(require("puppeteer"));
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const migrate_1 = require("../db/migrate");
const minio_1 = require("../utils/minio");
const logger_1 = require("./logger");
const USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
];
function getRandomUserAgent() {
    return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
const scraper_monitor_1 = require("../routes/scraper-monitor");
const proxy_1 = require("./proxy");
const age_gate_1 = require("../utils/age-gate");
const availability_1 = require("./availability");
// Apply stealth plugin for antidetect/anti-fingerprinting
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
exports.USER_AGENTS = {
    'chrome-windows': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'chrome-mac': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'chrome-linux': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'mobile-ios': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
    'mobile-android': 'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
    'googlebot': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'bingbot': 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'
};
exports.USER_AGENT_GROUPS = {
    desktop: ['chrome-windows', 'chrome-mac', 'chrome-linux'],
    mobile: ['mobile-ios', 'mobile-android'],
    serp: ['googlebot', 'bingbot']
};
function getRandomUserAgentFromGroup(group) {
    const randomKey = group[Math.floor(Math.random() * group.length)];
    return exports.USER_AGENTS[randomKey];
}
function getUserAgent(key) {
    if (!key)
        return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
    // Check if it's a group
    if (key === 'rotate-desktop')
        return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
    if (key === 'rotate-mobile')
        return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.mobile);
    if (key === 'rotate-serp')
        return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.serp);
    // Otherwise treat as specific UA
    return exports.USER_AGENTS[key] || getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
}
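For reference, what getUserAgent resolves for each kind of key:

// Group keys rotate within a group; named keys return the exact string;
// unknown keys fall back to a random desktop UA.
getUserAgent();                 // random desktop UA
getUserAgent('rotate-mobile');  // one of mobile-ios / mobile-android
getUserAgent('googlebot');      // the exact Googlebot UA string
getUserAgent('not-a-key');      // falls back to a random desktop UA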
function extractImageIdFromUrl(url) {
    try {
@@ -44,19 +76,6 @@ function sanitizeProductData(product) {
        cbd: product.cbd && product.cbd < 100 ? product.cbd : null
    };
}
async function getActiveProxy() {
    const result = await migrate_1.pool.query(`
        SELECT host, port, protocol, username, password
        FROM proxies
        WHERE active = true AND is_anonymous = true
        ORDER BY RANDOM()
        LIMIT 1
    `);
    if (result.rows.length === 0) {
        return null;
    }
    return result.rows[0];
}
async function makePageStealthy(page) {
    await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', {
@@ -86,12 +105,11 @@ async function makePageStealthy(page) {
    });
}
async function scrapeProductDetails(page, productUrl, productName) {
    const maxRetries = 2;
    const maxRetries = 3;
    let lastError = null;
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 20000 });
            await page.waitForTimeout(3000);
            await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
            const details = await page.evaluate(() => {
                const allText = document.body.textContent || '';
                let fullSizeImage = null;
@@ -233,9 +251,7 @@ async function scrapeProductDetails(page, productUrl, productName) {
        catch (error) {
            lastError = error;
            logger_1.logger.warn('scraper', `  Attempt ${attempt}/${maxRetries} failed for ${productName}: ${error}`);
            if (attempt < maxRetries) {
                await page.waitForTimeout(2000);
            }
            // No delays - just retry immediately
        }
    }
    logger_1.logger.error('scraper', `  ✗ All attempts failed for ${productName}`);
@@ -253,8 +269,10 @@ async function scrapeProductDetails(page, productUrl, productName) {
        weights: []
    };
}
async function scrapeCategory(storeId, categoryId) {
async function scrapeCategory(storeId, categoryId, userAgent) {
    let browser = null;
    const scraperId = `cat-${categoryId}-${Date.now()}`;
    let proxyId = null;
    try {
        const categoryResult = await migrate_1.pool.query(`
            SELECT c.*, s.slug as store_slug, s.name as store_name
@@ -267,7 +285,12 @@ async function scrapeCategory(storeId, categoryId) {
        }
        const category = categoryResult.rows[0];
        logger_1.logger.info('scraper', `Scraping category: ${category.name} for ${category.store_name}`);
        const proxy = await getActiveProxy();
        // Register scraper with monitoring system
        (0, scraper_monitor_1.registerScraper)(scraperId, storeId, category.store_name, categoryId, category.name);
        const proxy = await (0, proxy_1.getActiveProxy)();
        if (proxy) {
            proxyId = proxy.id;
        }
        const launchOptions = {
            headless: 'new',
            args: [
@@ -287,24 +310,51 @@ async function scrapeCategory(storeId, categoryId) {
            }
            logger_1.logger.info('scraper', `Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
        }
        browser = await puppeteer_1.default.launch(launchOptions);
        browser = await puppeteer_extra_1.default.launch(launchOptions);
        const page = await browser.newPage();
        await makePageStealthy(page);
        await page.setViewport({ width: 1920, height: 1080 });
        await page.setUserAgent(getRandomUserAgent());
        // Use provided userAgent or random if not specified
        const ua = getUserAgent(userAgent);
        await page.setUserAgent(ua);
        // Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites)
        const state = (0, age_gate_1.detectStateFromUrl)(category.dutchie_url);
        await (0, age_gate_1.setAgeGateCookies)(page, category.dutchie_url, state);
        logger_1.logger.info('scraper', `Loading page: ${category.dutchie_url}`);
        try {
            await page.goto(category.dutchie_url, {
                waitUntil: 'domcontentloaded',
                waitUntil: 'networkidle2',
                timeout: 60000
            });
            await page.waitForTimeout(5000);
            // If age gate still appears, try to bypass it
            await (0, age_gate_1.bypassAgeGate)(page, state);
            // Wait for products to load
            await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
                timeout: 30000,
            }).catch(() => {
                logger_1.logger.warn('scraper', 'No product selectors found, trying anyway...');
            });
            logger_1.logger.info('scraper', 'Scrolling to load all products...');
            await autoScroll(page);
            await page.waitForTimeout(3000);
        }
        catch (navError) {
            logger_1.logger.error('scraper', `Navigation error: ${navError}`);
            // Check if this is bot detection - put proxy in timeout instead of hard failure
            if (proxyId) {
                const errorMsg = String(navError);
                if ((0, proxy_1.isBotDetectionError)(errorMsg)) {
                    // Bot detection! Put this proxy in timeout and get a new one
                    logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
                    (0, proxy_1.putProxyInTimeout)(proxyId, errorMsg);
                    throw new Error(`Bot detection: ${errorMsg}`);
                }
                else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
                    errorMsg.includes('ERR_') || errorMsg.includes('Navigation')) {
                    // Regular proxy failure - increment failure count
                    logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
                    await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg);
                }
            }
            throw navError;
        }
        logger_1.logger.info('scraper', 'Extracting product list from page...');
@@ -336,6 +386,21 @@ async function scrapeCategory(storeId, categoryId) {
                        originalPrice = parseFloat(priceMatches[1].replace('$', ''));
                    }
                }
                // Extract variant (weight/size) - look for common patterns
                let variant = null;
                const variantPatterns = [
                    /(\d+\.?\d*\s*(?:g|oz|mg|ml|gram|ounce))/i, // Weight units
                    /(\d+\s*pack)/i, // Pack sizes
                    /(\d+\s*ct)/i, // Count
                    /(\d+\s*x\s*\d+\.?\d*\s*(?:g|mg|ml))/i // Multi-pack (e.g., 5x0.5g)
                ];
                for (const pattern of variantPatterns) {
                    const match = allText.match(pattern);
                    if (match) {
                        variant = match[1].trim();
                        break;
                    }
                }
                const linkEl = card.querySelector('a[href*="/product/"]');
                let href = linkEl?.href || linkEl?.getAttribute('href') || '';
                if (href && href.startsWith('/')) {
@@ -343,6 +408,7 @@ async function scrapeCategory(storeId, categoryId) {
                }
                items.push({
                    name,
                    variant,
                    price,
                    originalPrice,
                    href: href || window.location.href
@@ -358,10 +424,19 @@ async function scrapeCategory(storeId, categoryId) {
        logger_1.logger.info('scraper', `Now visiting each product page for complete details...`);
        let successCount = 0;
        let failCount = 0;
        // Update initial stats
        (0, scraper_monitor_1.updateScraperStats)(scraperId, {
            productsProcessed: 0,
            productsTotal: products.length
        });
        for (let i = 0; i < products.length; i++) {
            const product = products[i];
            try {
                logger_1.logger.info('scraper', `  [${i + 1}/${products.length}] ${product.name}`);
                (0, scraper_monitor_1.updateScraperStats)(scraperId, {
                    productsProcessed: i + 1,
                    productsTotal: products.length
                }, `Processing: ${product.name}`);
                if (!product.href) {
                    logger_1.logger.warn('scraper', `  ⚠ No product URL, skipping details`);
                    product.metadata = {};
@@ -391,7 +466,7 @@ async function scrapeCategory(storeId, categoryId) {
                    logger_1.logger.warn('scraper', `  ⚠ Limited data extracted`);
                    failCount++;
                }
                await page.waitForTimeout(1500);
                // No delays - scrape fast!
            }
            catch (error) {
                logger_1.logger.error('scraper', `  ✗ Unexpected error: ${error}`);
@@ -411,11 +486,16 @@ async function scrapeCategory(storeId, categoryId) {
            SET last_scraped_at = CURRENT_TIMESTAMP
            WHERE id = $1
        `, [categoryId]);
        // Mark scraper as complete
        (0, scraper_monitor_1.completeScraper)(scraperId);
        const formattedProducts = products.map((p, index) => {
            const sanitized = sanitizeProductData(p);
            // Normalize availability from Dutchie product data
            const availability = (0, availability_1.normalizeAvailability)(p);
            return {
                dutchieProductId: `${category.store_slug}-${category.slug}-${Date.now()}-${index}`,
                name: sanitized.name,
                variant: p.variant || null,
                description: sanitized.description,
                price: p.price,
                originalPrice: p.originalPrice,
@@ -426,13 +506,34 @@ async function scrapeCategory(storeId, categoryId) {
                weight: sanitized.weight,
                imageUrl: p.imageUrl,
                dutchieUrl: p.href,
                metadata: p.metadata || {}
                metadata: p.metadata || {},
                availabilityStatus: availability.status,
                availabilityRaw: availability.raw,
                stockQuantity: availability.quantity
            };
        });
        return formattedProducts;
    }
    catch (error) {
        logger_1.logger.error('scraper', `❌ Category scraping error: ${error}`);
        // Smart proxy error handling
        if (proxyId) {
            const errorMsg = String(error);
            if ((0, proxy_1.isBotDetectionError)(errorMsg)) {
                // Bot detection! Put this proxy in timeout
                logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
                (0, proxy_1.putProxyInTimeout)(proxyId, errorMsg);
            }
            else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
                errorMsg.includes('ERR_') || errorMsg.includes('Navigation') ||
                errorMsg.includes('Protocol error') || errorMsg.includes('Target closed')) {
                // Regular proxy failure - increment failure count
                logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
                await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg);
            }
        }
        // Mark scraper as failed
        (0, scraper_monitor_1.completeScraper)(scraperId, String(error));
        if (browser) {
            try {
                await browser.close();
@@ -466,51 +567,84 @@ async function saveProducts(storeId, categoryId, products) {
    try {
        await client.query('BEGIN');
        logger_1.logger.info('scraper', `Saving ${products.length} products to database...`);
        // Mark all products as out-of-stock before processing (they'll be re-marked if found)
        // Also update availability_status and last_seen_out_of_stock_at for state transition tracking
        await client.query(`
            UPDATE products
            SET in_stock = false
            WHERE store_id = $1 AND category_id = $2
            SET in_stock = false,
                availability_status = 'out_of_stock',
                last_seen_out_of_stock_at = CASE
                    WHEN availability_status != 'out_of_stock' THEN CURRENT_TIMESTAMP
                    ELSE last_seen_out_of_stock_at
                END
            WHERE store_id = $1 AND category_id = $2 AND in_stock = true
        `, [storeId, categoryId]);
        for (const product of products) {
            try {
                // Get availability from product (defaults to in_stock if product exists in scraped data)
                const availStatus = product.availabilityStatus || 'in_stock';
                const availRaw = product.availabilityRaw ? JSON.stringify(product.availabilityRaw) : null;
                const stockQty = product.stockQuantity ?? null;
                const existingResult = await client.query(`
                    SELECT id, image_url, local_image_path
                    SELECT id, image_url, local_image_path, availability_status
                    FROM products
                    WHERE store_id = $1 AND name = $2 AND category_id = $3
                `, [storeId, product.name, categoryId]);
                    AND (variant = $4 OR (variant IS NULL AND $4 IS NULL))
                `, [storeId, product.name, categoryId, product.variant || null]);
                let localImagePath = null;
                let productId;
                if (existingResult.rows.length > 0) {
                    productId = existingResult.rows[0].id;
                    localImagePath = existingResult.rows[0].local_image_path;
                    const prevStatus = existingResult.rows[0].availability_status;
                    // Determine if we need to update last_seen_in_stock_at
                    const isNowInStock = availStatus === 'in_stock' || availStatus === 'limited';
                    const wasOutOfStock = prevStatus === 'out_of_stock' || prevStatus === 'unknown';
                    await client.query(`
                        UPDATE products
                        SET name = $1, description = $2, price = $3,
                            strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
                            brand = $7, weight = $8, image_url = $9, dutchie_url = $10,
                            in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
                            updated_at = CURRENT_TIMESTAMP
                        WHERE id = $12
                        SET name = $1, variant = $2, description = $3, price = $4,
                            strain_type = $5, thc_percentage = $6, cbd_percentage = $7,
                            brand = $8, weight = $9, image_url = $10, dutchie_url = $11,
                            in_stock = true, metadata = $12, last_seen_at = CURRENT_TIMESTAMP,
                            updated_at = CURRENT_TIMESTAMP,
                            availability_status = $14,
                            availability_raw = $15,
                            stock_quantity = $16,
                            last_seen_in_stock_at = CASE
                                WHEN $17 THEN CURRENT_TIMESTAMP
                                ELSE last_seen_in_stock_at
                            END
                        WHERE id = $13
                    `, [
                        product.name, product.description, product.price,
                        product.name, product.variant, product.description, product.price,
                        product.strainType, product.thcPercentage, product.cbdPercentage,
                        product.brand, product.weight, product.imageUrl, product.dutchieUrl,
                        JSON.stringify(product.metadata), productId
                        JSON.stringify(product.metadata), productId, availStatus, availRaw, stockQty,
                        isNowInStock && wasOutOfStock
                    ]);
                }
                else {
                    // Generate unique slug from product name + timestamp + random suffix
                    const baseSlug = product.name
                        .toLowerCase()
                        .replace(/[^a-z0-9]+/g, '-')
                        .replace(/^-|-$/g, '')
                        .substring(0, 150);
                    const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).substr(2, 6)}`;
                    const slug = `${baseSlug}-${uniqueSuffix}`;
                    const insertResult = await client.query(`
                        INSERT INTO products (
                            store_id, category_id, dutchie_product_id, name, description,
                            store_id, category_id, dutchie_product_id, name, slug, variant, description,
                            price, strain_type, thc_percentage, cbd_percentage,
                            brand, weight, image_url, dutchie_url, in_stock, metadata
                        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, true, $14)
                            brand, weight, image_url, dutchie_url, in_stock, metadata,
                            availability_status, availability_raw, stock_quantity, last_seen_in_stock_at
                        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16, $17, $18, $19, CURRENT_TIMESTAMP)
                        RETURNING id
                    `, [
                        storeId, categoryId, product.dutchieProductId, product.name, product.description,
                        storeId, categoryId, product.dutchieProductId, product.name, slug, product.variant, product.description,
                        product.price, product.strainType, product.thcPercentage, product.cbdPercentage,
                        product.brand, product.weight, product.imageUrl, product.dutchieUrl,
                        JSON.stringify(product.metadata)
                        JSON.stringify(product.metadata), availStatus, availRaw, stockQty
                    ]);
                    productId = insertResult.rows[0].id;
                }
@@ -544,19 +678,15 @@ async function saveProducts(storeId, categoryId, products) {
        client.release();
    }
}
async function scrapeStore(storeId) {
async function scrapeStore(storeId, parallel = 3, userAgent) {
    try {
        logger_1.logger.info('scraper', `🏪 Starting scrape for store ID: ${storeId}`);
        logger_1.logger.info('scraper', `🏪 Starting scrape for store ID: ${storeId} (${parallel} parallel, UA: ${userAgent || 'random'})`);
        const categoriesResult = await migrate_1.pool.query(`
            SELECT c.id, c.name, c.slug, c.dutchie_url
            FROM categories c
            WHERE c.store_id = $1
              AND c.scrape_enabled = true
              AND NOT EXISTS (
                SELECT 1 FROM categories child
                WHERE child.parent_id = c.id
              )
            ORDER BY c.display_order, c.name
            WHERE c.store_id = $1
              AND c.scrape_enabled = true
            ORDER BY c.name
        `, [storeId]);
        logger_1.logger.info('scraper', `Found ${categoriesResult.rows.length} categories to scrape`);
        for (const category of categoriesResult.rows) {
@@ -564,14 +694,14 @@ async function scrapeStore(storeId) {
            logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
            logger_1.logger.info('scraper', `📂 Scraping: ${category.name}`);
            logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
            const products = await scrapeCategory(storeId, category.id);
            const products = await scrapeCategory(storeId, category.id, userAgent);
            await saveProducts(storeId, category.id, products);
            logger_1.logger.info('scraper', `✅ Completed ${category.name} - ${products.length} products saved`);
        }
        catch (error) {
            logger_1.logger.error('scraper', `❌ Failed to scrape ${category.name}: ${error}`);
        }
        await new Promise(resolve => setTimeout(resolve, 5000));
        // No delays - scrape fast!
        }
        await migrate_1.pool.query(`
            UPDATE stores
351
backend/dist/services/store-crawl-orchestrator.js
vendored
Normal file
@@ -0,0 +1,351 @@
"use strict";
/**
* Store Crawl Orchestrator
*
* Orchestrates the complete crawl workflow for a store:
* 1. Load store and its linked dispensary
* 2. Check if provider detection is needed
* 3. Run provider detection if needed
* 4. Queue appropriate crawl jobs based on provider/mode
* 5. Update store_crawl_schedule with meaningful status
*
* This replaces the simple "triggerManualCrawl" with intelligent orchestration.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.runStoreCrawlOrchestrator = runStoreCrawlOrchestrator;
exports.runBatchOrchestrator = runBatchOrchestrator;
exports.getStoresDueForOrchestration = getStoresDueForOrchestration;
const uuid_1 = require("uuid");
const migrate_1 = require("../db/migrate");
const crawler_logger_1 = require("./crawler-logger");
const intelligence_detector_1 = require("./intelligence-detector");
const category_crawler_jobs_1 = require("./category-crawler-jobs");
// DEPRECATED: scrapeStore writes to legacy products table
// import { scrapeStore } from '../scraper-v2';
// Import the new dutchie-az pipeline for Dutchie crawling
const product_crawler_1 = require("../dutchie-az/services/product-crawler");
const connection_1 = require("../dutchie-az/db/connection");
// ========================================
// Main Orchestrator Function
// ========================================
/**
* Run the complete crawl orchestration for a store
*
* Behavior:
* 1. Load the store and its linked dispensary
* 2. If no dispensary is linked, report error
* 3. If product_provider is missing or stale (>7 days), run detection
* 4. After detection:
* - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
* - Otherwise: Run sandbox crawl
* 5. Update store_crawl_schedule with status/summary
*/
async function runStoreCrawlOrchestrator(storeId) {
const startTime = Date.now();
const runId = (0, uuid_1.v4)();
let result = {
status: 'pending',
summary: '',
runId,
storeId,
dispensaryId: null,
detectionRan: false,
crawlRan: false,
durationMs: 0,
};
try {
// Mark schedule as running
await updateScheduleStatus(storeId, 'running', 'Starting orchestrator...', runId);
// 1. Load store with dispensary info
const store = await getStoreWithDispensary(storeId);
if (!store) {
throw new Error(`Store ${storeId} not found`);
}
result.dispensaryId = store.dispensary_id;
// 2. Check if dispensary is linked
if (!store.dispensary_id) {
result.status = 'error';
result.summary = 'No dispensary linked - cannot determine provider';
result.error = 'Store is not linked to a dispensary. Link it in the Dispensaries page.';
await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
result.durationMs = Date.now() - startTime;
return result;
}
// 3. Check if provider detection is needed
const needsDetection = await checkNeedsDetection(store);
if (needsDetection) {
// Run provider detection
const websiteUrl = store.dispensary_menu_url || store.dispensary_website;
if (!websiteUrl) {
result.status = 'error';
result.summary = 'No website URL available for detection';
result.error = 'Dispensary has no menu_url or website configured';
await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
result.durationMs = Date.now() - startTime;
return result;
}
await updateScheduleStatus(storeId, 'running', 'Running provider detection...', runId);
const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
result.detectionRan = true;
result.detectionResult = detectionResult;
// Save detection results to dispensary
await (0, intelligence_detector_1.updateAllCategoryProviders)(store.dispensary_id, detectionResult);
crawler_logger_1.crawlerLogger.providerDetected({
dispensary_id: store.dispensary_id,
dispensary_name: store.dispensary_name || store.name,
detected_provider: detectionResult.product.provider,
confidence: detectionResult.product.confidence,
detection_method: 'orchestrator_run',
menu_url: websiteUrl,
category: 'product',
});
// Refresh store info after detection
const updatedStore = await getStoreWithDispensary(storeId);
if (updatedStore) {
Object.assign(store, updatedStore);
}
}
// 4. Determine crawl type and run
const provider = store.product_provider;
const mode = store.product_crawler_mode;
if (provider === 'dutchie' && mode === 'production') {
// Production Dutchie crawl - now uses the new dutchie-az GraphQL pipeline
await updateScheduleStatus(storeId, 'running', 'Running Dutchie GraphQL crawl (dutchie-az)...', runId);
try {
// Look up the dispensary in the dutchie-az database
// The dutchie-az pipeline has its own dispensaries table
// We try multiple matching strategies: name, slug, or platform_dispensary_id
const dispensaryResult = await (0, connection_1.query)(`SELECT * FROM dispensaries
WHERE name ILIKE $1
OR slug ILIKE $2
LIMIT 1`, [store.dispensary_name, store.slug]);
if (dispensaryResult.rows.length === 0) {
throw new Error(`Dispensary not found in dutchie-az database. ` +
`You must add this dispensary to the dutchie-az pipeline first. ` +
`Store: ${store.name} (${store.dispensary_name})`);
}
const dutchieDispensary = dispensaryResult.rows[0];
// Run the new dutchie-az GraphQL crawler
const crawlResult = await (0, product_crawler_1.crawlDispensaryProducts)(dutchieDispensary, 'rec', { useBothModes: true });
result.crawlRan = true;
result.crawlType = 'production';
result.productsFound = crawlResult.productsFound ?? undefined;
result.productsNew = crawlResult.productsUpserted ?? undefined;
result.productsUpdated = crawlResult.snapshotsCreated ?? undefined;
if (crawlResult.success) {
const detectionPart = result.detectionRan ? 'Detection + ' : '';
result.summary = `${detectionPart}Dutchie GraphQL crawl (${crawlResult.productsFound || 0} items, ${crawlResult.productsUpserted || 0} upserted, ${crawlResult.snapshotsCreated || 0} snapshots)`;
result.status = 'success';
// Update store's last_scraped_at
await migrate_1.pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]);
crawler_logger_1.crawlerLogger.jobCompleted({
job_id: 0, // Orchestrator doesn't create traditional jobs
store_id: storeId,
store_name: store.name,
duration_ms: crawlResult.durationMs,
products_found: crawlResult.productsFound || 0,
products_new: crawlResult.productsUpserted || 0,
products_updated: crawlResult.snapshotsCreated || 0,
provider: 'dutchie',
});
}
else {
throw new Error(crawlResult.errorMessage || 'Crawl failed');
}
}
catch (crawlError) {
result.status = 'error';
result.error = crawlError.message;
result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
result.crawlRan = true;
result.crawlType = 'production';
crawler_logger_1.crawlerLogger.jobFailed({
job_id: 0,
store_id: storeId,
store_name: store.name,
duration_ms: Date.now() - startTime,
error_message: crawlError.message,
provider: 'dutchie',
});
}
}
else if (provider && provider !== 'unknown') {
// Sandbox crawl for non-Dutchie or sandbox mode
await updateScheduleStatus(storeId, 'running', `Running ${provider} sandbox crawl...`, runId);
try {
const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(store.dispensary_id);
result.crawlRan = true;
result.crawlType = 'sandbox';
result.productsFound = sandboxResult.data?.productsExtracted || 0;
const detectionPart = result.detectionRan ? 'Detection + ' : '';
if (sandboxResult.success) {
result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
result.status = 'sandbox_only';
}
else {
result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
result.status = 'error';
result.error = sandboxResult.message;
}
}
catch (sandboxError) {
result.status = 'error';
result.error = sandboxError.message;
result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
result.crawlRan = true;
result.crawlType = 'sandbox';
}
}
else {
// No provider detected - detection only
if (result.detectionRan) {
result.summary = `Detection complete: provider=${store.product_provider || 'unknown'}, confidence=${store.product_confidence || 0}%`;
result.status = 'detection_only';
}
else {
result.summary = 'No provider detected and no crawl possible';
result.status = 'error';
result.error = 'Could not determine menu provider';
}
}
}
catch (error) {
result.status = 'error';
result.error = error.message;
result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
crawler_logger_1.crawlerLogger.queueFailure({
queue_type: 'orchestrator',
error_message: error.message,
});
}
result.durationMs = Date.now() - startTime;
// Update final schedule status
await updateScheduleStatus(storeId, result.status, result.summary, runId, result.error);
// Create a crawl_job record for tracking
await createOrchestratorJobRecord(storeId, result);
return result;
}
// ========================================
// Helper Functions
// ========================================
async function getStoreWithDispensary(storeId) {
const result = await migrate_1.pool.query(`SELECT
s.id, s.name, s.slug, s.timezone, s.dispensary_id,
d.name as dispensary_name,
d.menu_url as dispensary_menu_url,
d.website as dispensary_website,
d.product_provider,
d.product_confidence,
d.product_crawler_mode,
d.last_product_scan_at
FROM stores s
LEFT JOIN dispensaries d ON d.id = s.dispensary_id
WHERE s.id = $1`, [storeId]);
return result.rows[0] || null;
}
async function checkNeedsDetection(store) {
// No dispensary = can't detect
if (!store.dispensary_id)
return false;
// No provider = definitely needs detection
if (!store.product_provider)
return true;
// Unknown provider = needs detection
if (store.product_provider === 'unknown')
return true;
// Low confidence = needs re-detection
if (store.product_confidence !== null && store.product_confidence < 50)
return true;
// Stale detection (> 7 days) = needs refresh
if (store.last_product_scan_at) {
const daysSince = (Date.now() - new Date(store.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24);
if (daysSince > 7)
return true;
}
return false;
}
async function updateScheduleStatus(storeId, status, summary, runId, error) {
await migrate_1.pool.query(`INSERT INTO store_crawl_schedule (store_id, last_status, last_summary, last_run_at, last_error)
VALUES ($1, $2, $3, NOW(), $4)
ON CONFLICT (store_id) DO UPDATE SET
last_status = $2,
last_summary = $3,
last_run_at = NOW(),
last_error = $4,
updated_at = NOW()`, [storeId, status, summary, error || null]);
}
async function getLatestCrawlStats(storeId) {
// Get count of products for this store
const result = await migrate_1.pool.query(`SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '1 hour') as recent_new,
COUNT(*) FILTER (WHERE updated_at > NOW() - INTERVAL '1 hour' AND created_at < NOW() - INTERVAL '1 hour') as recent_updated
FROM products
WHERE store_id = $1`, [storeId]);
return {
products_found: parseInt(result.rows[0]?.total || '0'),
products_new: parseInt(result.rows[0]?.recent_new || '0'),
products_updated: parseInt(result.rows[0]?.recent_updated || '0'),
};
}
async function createOrchestratorJobRecord(storeId, result) {
await migrate_1.pool.query(`INSERT INTO crawl_jobs (
store_id, job_type, trigger_type, status, priority,
scheduled_at, started_at, completed_at,
products_found, products_new, products_updated,
error_message, orchestrator_run_id, detection_result
) VALUES (
$1, 'orchestrator', 'manual', $2, 100,
NOW(), NOW(), NOW(),
$3, $4, $5,
$6, $7, $8
)`, [
storeId,
result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed',
result.productsFound || null,
result.productsNew || null,
result.productsUpdated || null,
result.error || null,
result.runId,
result.detectionResult ? JSON.stringify({
product_provider: result.detectionResult.product.provider,
product_confidence: result.detectionResult.product.confidence,
product_mode: result.detectionResult.product.mode,
}) : null,
]);
}
// ========================================
// Batch Orchestration
// ========================================
/**
* Run orchestrator for multiple stores
*/
async function runBatchOrchestrator(storeIds, concurrency = 3) {
const results = [];
// Process in batches
for (let i = 0; i < storeIds.length; i += concurrency) {
const batch = storeIds.slice(i, i + concurrency);
const batchResults = await Promise.all(batch.map(storeId => runStoreCrawlOrchestrator(storeId)));
results.push(...batchResults);
}
return results;
}
/**
* Get stores that are due for orchestration
*/
async function getStoresDueForOrchestration(limit = 10) {
const result = await migrate_1.pool.query(`SELECT s.id
FROM stores s
LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
WHERE s.active = TRUE
AND s.scrape_enabled = TRUE
AND COALESCE(scs.enabled, TRUE) = TRUE
AND (
scs.last_run_at IS NULL
OR scs.last_run_at < NOW() - (COALESCE(scs.interval_hours, 4) || ' hours')::INTERVAL
)
AND (scs.last_status IS NULL OR scs.last_status NOT IN ('running', 'pending'))
ORDER BY COALESCE(scs.priority, 0) DESC, scs.last_run_at ASC NULLS FIRST
LIMIT $1`, [limit]);
return result.rows.map(row => row.id);
}
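For reference, a minimal sketch of how the two batch exports above could be wired into a scheduler loop. The polling interval, require path, and logging are assumptions for illustration, not part of this commit:

// Hypothetical wiring - not part of this commit
const { getStoresDueForOrchestration, runBatchOrchestrator } = require('./services/store-crawl-orchestrator');
async function orchestratorTick() {
// Pick up to 10 stores whose interval has elapsed and that are not already running/pending
const storeIds = await getStoresDueForOrchestration(10);
if (storeIds.length === 0) return;
// Run them 3 at a time, matching runBatchOrchestrator's default concurrency
const results = await runBatchOrchestrator(storeIds, 3);
for (const r of results) {
console.log(`store=${r.storeId} status=${r.status} duration=${r.durationMs}ms ${r.summary}`);
}
}
// Poll every 5 minutes; the SQL above already excludes stores mid-run
setInterval(() => orchestratorTick().catch(console.error), 5 * 60 * 1000);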
175
backend/dist/utils/age-gate-playwright.js
vendored
Normal file
@@ -0,0 +1,175 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.hasAgeGatePlaywright = hasAgeGatePlaywright;
exports.bypassAgeGatePlaywright = bypassAgeGatePlaywright;
exports.detectStateFromUrlPlaywright = detectStateFromUrlPlaywright;
const logger_1 = require("../services/logger");
/**
* Detects if a Playwright page has an age verification gate
*/
async function hasAgeGatePlaywright(page) {
try {
const url = page.url();
const bodyText = await page.textContent('body') || '';
const hasAgeVerification = url.includes('/age-gate') ||
bodyText.includes('age verification') ||
bodyText.includes('Please select your state') ||
bodyText.includes('are you 21') ||
bodyText.includes('are you 18') ||
bodyText.includes('Enter your date of birth') ||
bodyText.toLowerCase().includes('verify your age');
return hasAgeVerification;
}
catch (err) {
logger_1.logger.warn('age-gate', `Error detecting age gate: ${err}`);
return false;
}
}
/**
* Attempts to bypass an age gate using Playwright
* Handles multiple age gate patterns including Curaleaf's complex React-based gate
*
* @param page - Playwright page object
* @param state - State to select (e.g., 'Arizona', 'California')
* @returns Promise<boolean> - true if bypass succeeded, false otherwise
*/
async function bypassAgeGatePlaywright(page, state = 'Arizona') {
try {
const hasGate = await hasAgeGatePlaywright(page);
if (!hasGate) {
logger_1.logger.info('age-gate', 'No age gate detected');
return true;
}
logger_1.logger.info('age-gate', `Age gate detected - attempting to bypass with state: ${state}...`);
// Wait for age gate to fully render
await page.waitForTimeout(2000);
// Method 1: Curaleaf-style (state dropdown + "I'm over 21" button)
try {
const stateButton = page.locator('button#state, button[id="state"]').first();
const stateButtonExists = await stateButton.count() > 0;
if (stateButtonExists) {
logger_1.logger.info('age-gate', 'Found Curaleaf-style state dropdown...');
await stateButton.click();
await page.waitForTimeout(1000);
// Select state
const stateOption = page.locator('[role="option"]').filter({ hasText: new RegExp(`^${state}$`, 'i') });
const stateExists = await stateOption.count() > 0;
if (stateExists) {
logger_1.logger.info('age-gate', `Clicking ${state} option...`);
await stateOption.first().click();
await page.waitForTimeout(2000);
// Look for "I'm over 21" button
const ageButton = page.locator('button').filter({ hasText: /I'm over 21|I am 21|I'm 21|over 21/i });
const ageButtonExists = await ageButton.count() > 0;
if (ageButtonExists) {
logger_1.logger.info('age-gate', 'Clicking age verification button...');
await ageButton.first().click();
await page.waitForLoadState('domcontentloaded', { timeout: 15000 });
await page.waitForTimeout(3000);
// Check if we successfully bypassed
const finalUrl = page.url();
if (!finalUrl.includes('/age-gate')) {
logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
return true;
}
}
}
}
}
catch (e) {
logger_1.logger.warn('age-gate', `Curaleaf method failed: ${e}`);
}
// Method 2: Simple "Yes" or "I'm 21" button (for simpler age gates)
try {
const simpleButton = page.locator('button, a, [role="button"]').filter({
hasText: /yes|i am 21|i'm 21|enter the site|continue|confirm/i
});
const simpleExists = await simpleButton.count() > 0;
if (simpleExists) {
logger_1.logger.info('age-gate', 'Found simple age gate button...');
await simpleButton.first().click();
await page.waitForLoadState('domcontentloaded', { timeout: 10000 });
await page.waitForTimeout(2000);
const finalUrl = page.url();
if (!finalUrl.includes('/age-gate')) {
logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
return true;
}
}
}
catch (e) {
logger_1.logger.warn('age-gate', `Simple button method failed: ${e}`);
}
// Method 3: Standard select dropdown
try {
const selectExists = await page.locator('select').count() > 0;
if (selectExists) {
logger_1.logger.info('age-gate', 'Found select dropdown...');
const select = page.locator('select').first();
await select.selectOption({ label: state });
await page.waitForTimeout(1000);
// Look for submit button
const submitButton = page.locator('button[type="submit"], input[type="submit"]');
const submitExists = await submitButton.count() > 0;
if (submitExists) {
await submitButton.first().click();
await page.waitForLoadState('domcontentloaded', { timeout: 10000 });
await page.waitForTimeout(2000);
const finalUrl = page.url();
if (!finalUrl.includes('/age-gate')) {
logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
return true;
}
}
}
}
catch (e) {
logger_1.logger.warn('age-gate', `Select dropdown method failed: ${e}`);
}
// Verify final state
const finalUrl = page.url();
if (finalUrl.includes('/age-gate')) {
logger_1.logger.error('age-gate', `❌ Age gate bypass failed - still at: ${finalUrl}`);
return false;
}
logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
return true;
}
catch (err) {
logger_1.logger.error('age-gate', `Error bypassing age gate: ${err}`);
return false;
}
}
/**
* Helper to detect the state from a store URL
*/
function detectStateFromUrlPlaywright(url) {
const stateMap = {
'-az-': 'Arizona',
'arizona': 'Arizona',
'-ca-': 'California',
'california': 'California',
'-co-': 'Colorado',
'colorado': 'Colorado',
'-fl-': 'Florida',
'florida': 'Florida',
'-il-': 'Illinois',
'illinois': 'Illinois',
'-ma-': 'Massachusetts',
'-mi-': 'Michigan',
'-nv-': 'Nevada',
'-nj-': 'New Jersey',
'-ny-': 'New York',
'-or-': 'Oregon',
'-pa-': 'Pennsylvania',
'-wa-': 'Washington',
};
const lowerUrl = url.toLowerCase();
for (const [pattern, stateName] of Object.entries(stateMap)) {
if (lowerUrl.includes(pattern)) {
return stateName;
}
}
// Default to Arizona
return 'Arizona';
}
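A minimal usage sketch for the Playwright helpers above. It assumes `playwright` is installed; the menu URL and require path are placeholders, not from this commit:

// Hypothetical usage - not part of this commit
const { chromium } = require('playwright');
const { bypassAgeGatePlaywright, detectStateFromUrlPlaywright } = require('./utils/age-gate-playwright');
(async () => {
const browser = await chromium.launch({ headless: true });
const page = await browser.newPage();
const menuUrl = 'https://example-dispensary.com/stores/example-az-menu'; // placeholder URL
await page.goto(menuUrl, { waitUntil: 'domcontentloaded' });
// Try the three bypass methods in order, using the state implied by the URL slug
const ok = await bypassAgeGatePlaywright(page, detectStateFromUrlPlaywright(menuUrl));
console.log(ok ? 'menu reachable' : 'still blocked by age gate');
await browser.close();
})();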
263
backend/dist/utils/age-gate.js
vendored
Normal file
@@ -0,0 +1,263 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.setAgeGateCookies = setAgeGateCookies;
exports.hasAgeGate = hasAgeGate;
exports.bypassAgeGate = bypassAgeGate;
exports.detectStateFromUrl = detectStateFromUrl;
const logger_1 = require("../services/logger");
/**
* Sets age gate bypass cookies before navigating to a page
* This should be called BEFORE page.goto() to prevent the age gate from showing
*
* @param page - Puppeteer page object
* @param url - URL to extract domain from
* @param state - State to set in cookie
*/
async function setAgeGateCookies(page, url, state = 'Arizona') {
try {
const urlObj = new URL(url);
const domain = urlObj.hostname.replace('www.', '');
// Set cookies that bypass age gates
await page.setCookie({
name: 'age_gate_passed',
value: 'true',
domain: `.${domain}`,
path: '/',
expires: Date.now() / 1000 + 365 * 24 * 60 * 60, // 1 year
httpOnly: false,
secure: false,
sameSite: 'Lax'
}, {
name: 'selected_state',
value: state,
domain: `.${domain}`,
path: '/',
expires: Date.now() / 1000 + 365 * 24 * 60 * 60, // 1 year
httpOnly: false,
secure: false,
sameSite: 'Lax'
}, {
name: 'age_verified',
value: 'true',
domain: `.${domain}`,
path: '/',
expires: Date.now() / 1000 + 365 * 24 * 60 * 60,
httpOnly: false,
secure: false,
sameSite: 'Lax'
});
logger_1.logger.info('age-gate', `Set age gate bypass cookies for ${domain} (state: ${state})`);
}
catch (err) {
logger_1.logger.warn('age-gate', `Failed to set age gate cookies: ${err}`);
}
}
/**
* Detects if a page has an age verification gate
*/
async function hasAgeGate(page) {
return await page.evaluate(() => {
const bodyText = document.body.textContent || '';
const hasAgeVerification = bodyText.includes('age verification') ||
bodyText.includes('Please select your state') ||
bodyText.includes('are you 21') ||
bodyText.includes('are you 18') ||
bodyText.includes('Enter your date of birth') ||
bodyText.toLowerCase().includes('verify');
return hasAgeVerification;
});
}
/**
* Attempts to bypass an age gate by selecting the appropriate state
* Works with multiple age gate patterns used by cannabis dispensaries
*
* @param page - Puppeteer page object
* @param state - State to select (e.g., 'Arizona', 'California'). Defaults to 'Arizona'
* @returns Promise<boolean> - true if bypass was attempted, false if no age gate found
*/
async function bypassAgeGate(page, state = 'Arizona', useSavedCookies = true) {
try {
const hasGate = await hasAgeGate(page);
if (!hasGate) {
logger_1.logger.info('age-gate', 'No age gate detected');
return false;
}
logger_1.logger.info('age-gate', `Age gate detected - attempting to bypass with state: ${state}...`);
// Wait a bit for React components to fully render
await page.waitForTimeout(2000);
// Try Method 0: Custom dropdown button (shadcn/radix style - Curaleaf)
let customDropdownWorked = false;
try {
// Click button to open dropdown
const dropdownButton = await page.$('button#state, button[id="state"]');
if (dropdownButton) {
logger_1.logger.info('age-gate', 'Found state dropdown button, clicking...');
await dropdownButton.click();
await page.waitForTimeout(800);
// Click the state option and trigger React events
const stateClicked = await page.evaluate((selectedState) => {
const options = Array.from(document.querySelectorAll('[role="option"]'));
const stateOption = options.find(el => el.textContent?.toLowerCase() === selectedState.toLowerCase());
if (stateOption instanceof HTMLElement) {
// Trigger multiple events that React might be listening for
stateOption.dispatchEvent(new MouseEvent('mousedown', { bubbles: true }));
stateOption.dispatchEvent(new MouseEvent('mouseup', { bubbles: true }));
stateOption.click();
stateOption.dispatchEvent(new MouseEvent('click', { bubbles: true }));
stateOption.dispatchEvent(new Event('change', { bubbles: true }));
stateOption.dispatchEvent(new Event('input', { bubbles: true }));
return true;
}
return false;
}, state);
if (stateClicked) {
logger_1.logger.info('age-gate', `Clicked ${state} option with React events`);
await page.waitForTimeout(1000);
// Look for and click any submit/continue button that appeared
const submitClicked = await page.evaluate(() => {
const buttons = Array.from(document.querySelectorAll('button, [role="button"], a'));
const submitBtn = buttons.find(el => {
const text = el.textContent?.toLowerCase() || '';
const ariaLabel = el.getAttribute('aria-label')?.toLowerCase() || '';
return text.includes('continue') || text.includes('submit') ||
text.includes('enter') || text.includes('confirm') ||
ariaLabel.includes('continue') || ariaLabel.includes('submit');
});
if (submitBtn instanceof HTMLElement && submitBtn.offsetParent !== null) {
submitBtn.click();
return true;
}
return false;
});
if (submitClicked) {
logger_1.logger.info('age-gate', `Found and clicked submit button`);
}
customDropdownWorked = true;
}
}
}
catch (e) {
logger_1.logger.warn('age-gate', `Dropdown method failed: ${e}`);
}
// Try Method 1: Dropdown select
const selectFound = await page.evaluate((selectedState) => {
const selects = Array.from(document.querySelectorAll('select'));
for (const select of selects) {
const options = Array.from(select.options);
const stateOption = options.find(opt => opt.text.toLowerCase().includes(selectedState.toLowerCase()) ||
opt.value.toLowerCase().includes(selectedState.toLowerCase()));
if (stateOption) {
select.value = stateOption.value;
select.dispatchEvent(new Event('change', { bubbles: true }));
select.dispatchEvent(new Event('input', { bubbles: true }));
return true;
}
}
return false;
}, state);
// Try Method 2: State button/card (click state, then click confirm)
let stateClicked = false;
if (!selectFound) {
stateClicked = await page.evaluate((selectedState) => {
const allElements = Array.from(document.querySelectorAll('button, a, div, span, [role="button"], [class*="state"], [class*="State"], [class*="card"], [class*="option"]'));
const stateButton = allElements.find(el => el.textContent?.toLowerCase().includes(selectedState.toLowerCase()));
if (stateButton instanceof HTMLElement) {
stateButton.click();
return true;
}
return false;
}, state);
if (stateClicked) {
// Wait for confirm button to appear and click it
await page.waitForTimeout(1000);
await page.evaluate(() => {
const confirmBtns = Array.from(document.querySelectorAll('button, a, [role="button"]'));
const confirmBtn = confirmBtns.find(el => {
const text = el.textContent?.toLowerCase() || '';
return text.includes('enter') || text.includes('continue') || text.includes('yes') || text.includes('confirm');
});
if (confirmBtn instanceof HTMLElement) {
confirmBtn.click();
}
});
}
}
// Try Method 3: Direct "Yes" or age confirmation button
const yesClicked = await page.evaluate(() => {
const confirmButtons = Array.from(document.querySelectorAll('button, a, [role="button"]'));
const yesButton = confirmButtons.find(el => {
const text = el.textContent?.toLowerCase() || '';
return text.includes('yes') ||
text.includes('i am 21') ||
text.includes('i am 18') ||
text.includes('enter the site') ||
text.includes('enter') ||
text.includes('continue');
});
if (yesButton instanceof HTMLElement) {
yesButton.click();
return true;
}
return false;
});
const bypassed = customDropdownWorked || selectFound || stateClicked || yesClicked;
if (bypassed) {
// Wait for navigation to complete after clicking age gate button
logger_1.logger.info('age-gate', `Waiting for navigation after age gate bypass...`);
try {
await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 15000 });
}
catch (e) {
// Navigation might not trigger, that's ok - wait a bit anyway
await page.waitForTimeout(3000);
}
// Give the page extra time to load content
await page.waitForTimeout(3000);
// Verify we actually bypassed by checking the URL
const finalUrl = page.url();
if (finalUrl.includes('/age-gate')) {
logger_1.logger.error('age-gate', `❌ Age gate bypass failed - still at age gate URL: ${finalUrl}`);
return false;
}
logger_1.logger.info('age-gate', `✅ Age gate bypass completed - now at: ${finalUrl}`);
return true;
}
else {
logger_1.logger.warn('age-gate', `Could not find ${state} option or confirmation button in age gate`);
return false;
}
}
catch (err) {
logger_1.logger.error('age-gate', `Error bypassing age gate: ${err}`);
return false;
}
}
/**
* Helper to detect the state from a store URL
* @param url - Store URL
* @returns State name (e.g., 'Arizona', 'California')
*/
function detectStateFromUrl(url) {
const stateMap = {
'-az-': 'Arizona',
'-ca-': 'California',
'-co-': 'Colorado',
'-fl-': 'Florida',
'-il-': 'Illinois',
'-ma-': 'Massachusetts',
'-mi-': 'Michigan',
'-nv-': 'Nevada',
'-nj-': 'New Jersey',
'-ny-': 'New York',
'-or-': 'Oregon',
'-pa-': 'Pennsylvania',
'-wa-': 'Washington',
};
for (const [pattern, stateName] of Object.entries(stateMap)) {
if (url.toLowerCase().includes(pattern)) {
return stateName;
}
}
// Default to Arizona if state not detected
return 'Arizona';
}
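A minimal sketch of the intended call order for the Puppeteer variant, following the file's own doc comment that cookies must be set before page.goto(). The URL and require path are placeholders:

// Hypothetical usage - not part of this commit
const puppeteer = require('puppeteer');
const { setAgeGateCookies, bypassAgeGate, detectStateFromUrl } = require('./utils/age-gate');
(async () => {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
const url = 'https://example-dispensary.com/stores/example-az-menu'; // placeholder URL
const state = detectStateFromUrl(url);
// Cookies go in before goto so most gates never render at all
await setAgeGateCookies(page, url, state);
await page.goto(url, { waitUntil: 'domcontentloaded' });
// Fall back to clicking through any gate that still appears
await bypassAgeGate(page, state);
await browser.close();
})();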
296
backend/dist/utils/image-storage.js
vendored
Normal file
@@ -0,0 +1,296 @@
"use strict";
/**
* Local Image Storage Utility
*
* Downloads and stores product images to local filesystem.
* Replaces MinIO-based storage with simple local file storage.
*
* Directory structure:
* /images/products/<dispensary_id>/<product_id>.webp
* /images/products/<dispensary_id>/<product_id>-thumb.webp
* /images/products/<dispensary_id>/<product_id>-medium.webp
* /images/brands/<brand_slug>.webp
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.downloadProductImage = downloadProductImage;
exports.downloadBrandLogo = downloadBrandLogo;
exports.imageExists = imageExists;
exports.deleteProductImages = deleteProductImages;
exports.initializeImageStorage = initializeImageStorage;
exports.getStorageStats = getStorageStats;
const axios_1 = __importDefault(require("axios"));
const sharp_1 = __importDefault(require("sharp"));
const fs = __importStar(require("fs/promises"));
const path = __importStar(require("path"));
const crypto_1 = require("crypto");
// Base path for image storage - configurable via env
const IMAGES_BASE_PATH = process.env.IMAGES_PATH || '/app/public/images';
// Public URL base for serving images
const IMAGES_PUBLIC_URL = process.env.IMAGES_PUBLIC_URL || '/images';
/**
* Ensure a directory exists
*/
async function ensureDir(dirPath) {
try {
await fs.mkdir(dirPath, { recursive: true });
}
catch (error) {
if (error.code !== 'EEXIST')
throw error;
}
}
/**
* Generate a short hash from a URL for deduplication
*/
function hashUrl(url) {
return (0, crypto_1.createHash)('md5').update(url).digest('hex').substring(0, 8);
}
/**
* Download an image from a URL and return the buffer
*/
async function downloadImage(imageUrl) {
const response = await axios_1.default.get(imageUrl, {
responseType: 'arraybuffer',
timeout: 30000,
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
},
});
return Buffer.from(response.data);
}
/**
* Process and save image in multiple sizes
* Returns the file paths relative to IMAGES_BASE_PATH
*/
async function processAndSaveImage(buffer, outputDir, baseFilename) {
await ensureDir(outputDir);
const fullPath = path.join(outputDir, `${baseFilename}.webp`);
const mediumPath = path.join(outputDir, `${baseFilename}-medium.webp`);
const thumbPath = path.join(outputDir, `${baseFilename}-thumb.webp`);
// Process images in parallel
const [fullBuffer, mediumBuffer, thumbBuffer] = await Promise.all([
// Full: max 1200x1200, high quality
(0, sharp_1.default)(buffer)
.resize(1200, 1200, { fit: 'inside', withoutEnlargement: true })
.webp({ quality: 85 })
.toBuffer(),
// Medium: 600x600
(0, sharp_1.default)(buffer)
.resize(600, 600, { fit: 'inside', withoutEnlargement: true })
.webp({ quality: 80 })
.toBuffer(),
// Thumb: 200x200
(0, sharp_1.default)(buffer)
.resize(200, 200, { fit: 'inside', withoutEnlargement: true })
.webp({ quality: 75 })
.toBuffer(),
]);
// Save all sizes
await Promise.all([
fs.writeFile(fullPath, fullBuffer),
fs.writeFile(mediumPath, mediumBuffer),
fs.writeFile(thumbPath, thumbBuffer),
]);
const totalBytes = fullBuffer.length + mediumBuffer.length + thumbBuffer.length;
return {
full: fullPath,
medium: mediumPath,
thumb: thumbPath,
totalBytes,
};
}
/**
* Convert a file path to a public URL
*/
function pathToUrl(filePath) {
const relativePath = filePath.replace(IMAGES_BASE_PATH, '');
return `${IMAGES_PUBLIC_URL}${relativePath}`;
}
/**
* Download and store a product image locally
*
* @param imageUrl - The third-party image URL to download
* @param dispensaryId - The dispensary ID (for directory organization)
* @param productId - The product ID or external ID (for filename)
* @returns Download result with local URLs
*/
async function downloadProductImage(imageUrl, dispensaryId, productId) {
try {
if (!imageUrl) {
return { success: false, error: 'No image URL provided' };
}
// Download the image
const buffer = await downloadImage(imageUrl);
// Organize by dispensary ID
const outputDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId));
// Use product ID + URL hash for uniqueness
const urlHash = hashUrl(imageUrl);
const baseFilename = `${productId}-${urlHash}`;
// Process and save
const result = await processAndSaveImage(buffer, outputDir, baseFilename);
return {
success: true,
urls: {
full: pathToUrl(result.full),
medium: pathToUrl(result.medium),
thumb: pathToUrl(result.thumb),
},
bytesDownloaded: result.totalBytes,
};
}
catch (error) {
return {
success: false,
error: error.message || 'Failed to download image',
};
}
}
/**
* Download and store a brand logo locally
*
* @param logoUrl - The brand logo URL
* @param brandId - The brand ID or slug
* @returns Download result with local URL
*/
async function downloadBrandLogo(logoUrl, brandId) {
try {
if (!logoUrl) {
return { success: false, error: 'No logo URL provided' };
}
// Download the image
const buffer = await downloadImage(logoUrl);
// Brand logos go in /images/brands/
const outputDir = path.join(IMAGES_BASE_PATH, 'brands');
// Sanitize brand ID for filename
const safeBrandId = brandId.replace(/[^a-zA-Z0-9-_]/g, '_');
const urlHash = hashUrl(logoUrl);
const baseFilename = `${safeBrandId}-${urlHash}`;
// Process and save (single size for logos)
await ensureDir(outputDir);
const logoPath = path.join(outputDir, `${baseFilename}.webp`);
const logoBuffer = await (0, sharp_1.default)(buffer)
.resize(400, 400, { fit: 'inside', withoutEnlargement: true })
.webp({ quality: 85 })
.toBuffer();
await fs.writeFile(logoPath, logoBuffer);
return {
success: true,
urls: {
full: pathToUrl(logoPath),
medium: pathToUrl(logoPath),
thumb: pathToUrl(logoPath),
},
bytesDownloaded: logoBuffer.length,
};
}
catch (error) {
return {
success: false,
error: error.message || 'Failed to download brand logo',
};
}
}
/**
* Check if a local image already exists
*/
async function imageExists(dispensaryId, productId, imageUrl) {
const urlHash = hashUrl(imageUrl);
const imagePath = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId), `${productId}-${urlHash}.webp`);
try {
await fs.access(imagePath);
return true;
}
catch {
return false;
}
}
/**
* Delete a product's local images
*/
async function deleteProductImages(dispensaryId, productId, imageUrl) {
const productDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId));
const prefix = imageUrl
? `${productId}-${hashUrl(imageUrl)}`
: String(productId);
try {
const files = await fs.readdir(productDir);
const toDelete = files.filter(f => f.startsWith(prefix));
await Promise.all(toDelete.map(f => fs.unlink(path.join(productDir, f))));
}
catch {
// Directory might not exist, that's fine
}
}
/**
* Initialize the image storage directories
*/
async function initializeImageStorage() {
await ensureDir(path.join(IMAGES_BASE_PATH, 'products'));
await ensureDir(path.join(IMAGES_BASE_PATH, 'brands'));
console.log(`✅ Image storage initialized at ${IMAGES_BASE_PATH}`);
}
/**
* Get storage stats
*/
async function getStorageStats() {
const productsDir = path.join(IMAGES_BASE_PATH, 'products');
const brandsDir = path.join(IMAGES_BASE_PATH, 'brands');
let productCount = 0;
let brandCount = 0;
try {
const productDirs = await fs.readdir(productsDir);
for (const dir of productDirs) {
const files = await fs.readdir(path.join(productsDir, dir));
productCount += files.filter(f => f.endsWith('.webp') && !f.includes('-')).length;
}
}
catch { /* ignore */ }
try {
const brandFiles = await fs.readdir(brandsDir);
brandCount = brandFiles.filter(f => f.endsWith('.webp')).length;
}
catch { /* ignore */ }
return {
productsDir,
brandsDir,
productCount,
brandCount,
};
}
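A sketch of how this utility composes in a crawl pipeline, using only the exports above (function name and wiring are illustrative assumptions, not from this commit):

// Hypothetical usage - not part of this commit
const { initializeImageStorage, imageExists, downloadProductImage } = require('./utils/image-storage');
async function mirrorProductImage(dispensaryId, productId, imageUrl) {
await initializeImageStorage();
// The URL hash in the filename makes this check cheap and idempotent
if (await imageExists(dispensaryId, productId, imageUrl)) return null;
const result = await downloadProductImage(imageUrl, dispensaryId, productId);
if (!result.success) throw new Error(result.error);
return result.urls; // { full, medium, thumb } ready to store on the product row
}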
202
backend/dist/utils/minio.js
vendored
@@ -36,30 +36,61 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.minioClient = void 0;
exports.isMinioEnabled = isMinioEnabled;
exports.initializeMinio = initializeMinio;
exports.uploadImageFromUrl = uploadImageFromUrl;
exports.getImageUrl = getImageUrl;
exports.deleteImage = deleteImage;
exports.minioClient = getMinioClient;
const Minio = __importStar(require("minio"));
const axios_1 = __importDefault(require("axios"));
const uuid_1 = require("uuid");
const minioClient = new Minio.Client({
endPoint: process.env.MINIO_ENDPOINT || 'minio',
port: parseInt(process.env.MINIO_PORT || '9000'),
useSSL: process.env.MINIO_USE_SSL === 'true',
accessKey: process.env.MINIO_ACCESS_KEY || 'minioadmin',
secretKey: process.env.MINIO_SECRET_KEY || 'minioadmin',
});
exports.minioClient = minioClient;
const sharp_1 = __importDefault(require("sharp"));
const fs = __importStar(require("fs/promises"));
const path = __importStar(require("path"));
let minioClient = null;
// Check if MinIO is configured
function isMinioEnabled() {
return !!process.env.MINIO_ENDPOINT;
}
// Local storage path for images when MinIO is not configured
const LOCAL_IMAGES_PATH = process.env.LOCAL_IMAGES_PATH || '/app/public/images';
function getMinioClient() {
if (!minioClient) {
minioClient = new Minio.Client({
endPoint: process.env.MINIO_ENDPOINT || 'minio',
port: parseInt(process.env.MINIO_PORT || '9000'),
useSSL: process.env.MINIO_USE_SSL === 'true',
accessKey: process.env.MINIO_ACCESS_KEY || 'minioadmin',
secretKey: process.env.MINIO_SECRET_KEY || 'minioadmin',
});
}
return minioClient;
}
const BUCKET_NAME = process.env.MINIO_BUCKET || 'dutchie';
async function initializeMinio() {
// Skip MinIO initialization if not configured
if (!isMinioEnabled()) {
console.log('ℹ️ MinIO not configured (MINIO_ENDPOINT not set), using local filesystem storage');
// Ensure local images directory exists
try {
await fs.mkdir(LOCAL_IMAGES_PATH, { recursive: true });
await fs.mkdir(path.join(LOCAL_IMAGES_PATH, 'products'), { recursive: true });
console.log(`✅ Local images directory ready: ${LOCAL_IMAGES_PATH}`);
}
catch (error) {
console.error('❌ Failed to create local images directory:', error);
throw error;
}
return;
}
try {
const client = getMinioClient();
// Check if bucket exists
const exists = await minioClient.bucketExists(BUCKET_NAME);
const exists = await client.bucketExists(BUCKET_NAME);
if (!exists) {
// Create bucket
await minioClient.makeBucket(BUCKET_NAME, 'us-east-1');
await client.makeBucket(BUCKET_NAME, 'us-east-1');
console.log(`✅ Minio bucket created: ${BUCKET_NAME}`);
// Set public read policy
const policy = {
@@ -73,7 +104,7 @@ async function initializeMinio() {
},
],
};
await minioClient.setBucketPolicy(BUCKET_NAME, JSON.stringify(policy));
await client.setBucketPolicy(BUCKET_NAME, JSON.stringify(policy));
console.log(`✅ Bucket policy set to public read`);
}
else {
@@ -85,36 +116,145 @@ async function initializeMinio() {
throw error;
}
}
async function uploadImageFromUrl(imageUrl, productId) {
async function removeBackground(buffer) {
try {
// Get image metadata to check if it has an alpha channel
const metadata = await (0, sharp_1.default)(buffer).metadata();
// If image already has transparency, trim and optimize it
if (metadata.hasAlpha) {
return await (0, sharp_1.default)(buffer)
.trim() // Remove transparent borders
.toBuffer();
}
// For images without alpha (like JPEGs with solid backgrounds),
// we'll use a threshold-based approach to detect and remove solid backgrounds
// This works well for product images on solid color backgrounds
// Convert to PNG with alpha channel, then flatten with transparency
const withAlpha = await (0, sharp_1.default)(buffer)
.ensureAlpha() // Add alpha channel
.toBuffer();
// Use threshold to make similar colors transparent (targets solid backgrounds)
// This is a simple approach - for better results, use remove.bg API or ML models
return await (0, sharp_1.default)(withAlpha)
.flatten({ background: { r: 0, g: 0, b: 0, alpha: 0 } })
.trim()
.toBuffer();
}
catch (error) {
console.warn('Background removal failed, using original image:', error);
return buffer;
}
}
async function uploadToLocalFilesystem(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename) {
const thumbnailPath = `${baseFilename}-thumb.png`;
const mediumPath = `${baseFilename}-medium.png`;
const fullPath = `${baseFilename}-full.png`;
// Ensure the target directory exists (in case initializeMinio wasn't called)
// Extract directory from baseFilename (e.g., 'products/store-slug' or just 'products')
const targetDir = path.join(LOCAL_IMAGES_PATH, path.dirname(baseFilename));
await fs.mkdir(targetDir, { recursive: true });
await Promise.all([
fs.writeFile(path.join(LOCAL_IMAGES_PATH, thumbnailPath), thumbnailBuffer),
fs.writeFile(path.join(LOCAL_IMAGES_PATH, mediumPath), mediumBuffer),
fs.writeFile(path.join(LOCAL_IMAGES_PATH, fullPath), fullBuffer),
]);
return {
thumbnail: thumbnailPath,
medium: mediumPath,
full: fullPath,
};
}
async function uploadToMinio(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename) {
const client = getMinioClient();
const thumbnailPath = `${baseFilename}-thumb.png`;
const mediumPath = `${baseFilename}-medium.png`;
const fullPath = `${baseFilename}-full.png`;
await Promise.all([
client.putObject(BUCKET_NAME, thumbnailPath, thumbnailBuffer, thumbnailBuffer.length, {
'Content-Type': 'image/png',
}),
client.putObject(BUCKET_NAME, mediumPath, mediumBuffer, mediumBuffer.length, {
'Content-Type': 'image/png',
}),
client.putObject(BUCKET_NAME, fullPath, fullBuffer, fullBuffer.length, {
'Content-Type': 'image/png',
}),
]);
return {
thumbnail: thumbnailPath,
medium: mediumPath,
full: fullPath,
};
}
async function uploadImageFromUrl(imageUrl, productId, storeSlug, removeBackgrounds = true) {
try {
// Download image
const response = await axios_1.default.get(imageUrl, { responseType: 'arraybuffer' });
const buffer = Buffer.from(response.data);
// Generate unique filename
const ext = imageUrl.split('.').pop()?.split('?')[0] || 'jpg';
const filename = `products/${productId}-${(0, uuid_1.v4)()}.${ext}`;
// Get content type
const contentType = response.headers['content-type'] || 'image/jpeg';
// Upload to Minio
await minioClient.putObject(BUCKET_NAME, filename, buffer, buffer.length, {
'Content-Type': contentType,
});
// Return the path (URL will be constructed when serving)
return filename;
let buffer = Buffer.from(response.data);
// Remove background if enabled
if (removeBackgrounds) {
buffer = await removeBackground(buffer);
}
// Generate unique base filename - organize by store if slug provided
const storeDir = storeSlug ? `products/${storeSlug}` : 'products';
const baseFilename = `${storeDir}/${productId}-${(0, uuid_1.v4)()}`;
// Create multiple sizes with Sharp and convert to WebP/PNG for better compression
// Use PNG for images with transparency
const [thumbnailBuffer, mediumBuffer, fullBuffer] = await Promise.all([
// Thumbnail: 300x300
(0, sharp_1.default)(buffer)
.resize(300, 300, { fit: 'inside', background: { r: 0, g: 0, b: 0, alpha: 0 } })
.png({ quality: 80, compressionLevel: 9 })
.toBuffer(),
// Medium: 800x800
(0, sharp_1.default)(buffer)
.resize(800, 800, { fit: 'inside', background: { r: 0, g: 0, b: 0, alpha: 0 } })
.png({ quality: 85, compressionLevel: 9 })
.toBuffer(),
// Full: 2000x2000 (optimized)
(0, sharp_1.default)(buffer)
.resize(2000, 2000, { fit: 'inside', withoutEnlargement: true, background: { r: 0, g: 0, b: 0, alpha: 0 } })
.png({ quality: 90, compressionLevel: 9 })
.toBuffer(),
]);
// Upload to appropriate storage backend
let result;
if (isMinioEnabled()) {
result = await uploadToMinio(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename);
}
else {
result = await uploadToLocalFilesystem(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename);
}
console.log(`✅ Uploaded 3 sizes for product ${productId}: ${thumbnailBuffer.length + mediumBuffer.length + fullBuffer.length} bytes total`);
return result;
}
catch (error) {
console.error('Error uploading image:', error);
throw error;
}
}
function getImageUrl(path) {
// Use localhost:9020 for browser access since Minio is exposed on host port 9020
const endpoint = process.env.MINIO_PUBLIC_ENDPOINT || 'http://localhost:9020';
return `${endpoint}/${BUCKET_NAME}/${path}`;
function getImageUrl(imagePath) {
if (isMinioEnabled()) {
// Use MinIO endpoint for browser access
const endpoint = process.env.MINIO_PUBLIC_ENDPOINT || 'http://localhost:9020';
return `${endpoint}/${BUCKET_NAME}/${imagePath}`;
}
else {
// Use local path - served via Express static middleware
const publicUrl = process.env.PUBLIC_URL || '';
return `${publicUrl}/images/${imagePath}`;
}
}
async function deleteImage(path) {
async function deleteImage(imagePath) {
try {
await minioClient.removeObject(BUCKET_NAME, path);
if (isMinioEnabled()) {
const client = getMinioClient();
await client.removeObject(BUCKET_NAME, imagePath);
}
else {
const fullPath = path.join(LOCAL_IMAGES_PATH, imagePath);
await fs.unlink(fullPath);
}
}
catch (error) {
console.error('Error deleting image:', error);
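For reference, a sketch of what the branched getImageUrl in this diff resolves to, assuming the default bucket and endpoints (illustrative values, not from this commit):

// With MINIO_ENDPOINT set:
// getImageUrl('products/42-abc-thumb.png') -> 'http://localhost:9020/dutchie/products/42-abc-thumb.png'
// Without it (local filesystem mode, PUBLIC_URL unset):
// getImageUrl('products/42-abc-thumb.png') -> '/images/products/42-abc-thumb.png' (served by Express static middleware)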
181
backend/dist/utils/product-normalizer.js
vendored
Normal file
@@ -0,0 +1,181 @@
"use strict";
/**
* Product Normalizer Utility
*
* Functions for normalizing product data to enable consistent matching
* and prevent duplicate product entries.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeProductName = normalizeProductName;
exports.normalizeBrandName = normalizeBrandName;
exports.normalizeWeight = normalizeWeight;
exports.generateProductFingerprint = generateProductFingerprint;
exports.stringSimilarity = stringSimilarity;
exports.areProductsSimilar = areProductsSimilar;
/**
* Normalize product name for matching
* - Lowercase
* - Remove punctuation
* - Remove THC/CBD percentages often appended to names
* - Remove weight suffixes
* - Remove emoji
* - Normalize whitespace
*/
function normalizeProductName(name) {
if (!name)
return '';
return name
.toLowerCase()
.trim()
// Remove special characters except alphanumeric and spaces
.replace(/[^\w\s]/g, ' ')
// Remove common suffixes like THC/CBD percentages appended to names
.replace(/\s*(thc|cbd|cbg|cbn|tac)\s*[:=]?\s*[\d.]+\s*%?/gi, '')
// Remove weight/size suffixes often appended
.replace(/\s*\d+(\.\d+)?\s*(mg|g|oz|ml|gram|grams|ounce|ounces)\b/gi, '')
// Remove emoji
.replace(/[\u{1F300}-\u{1F9FF}]/gu, '')
// Remove "special offer" type suffixes
.replace(/\s*special\s*offer\s*/gi, '')
// Normalize multiple spaces to single space
.replace(/\s+/g, ' ')
.trim();
}
/**
* Normalize brand name for matching
*/
function normalizeBrandName(brand) {
if (!brand)
return '';
return brand
.toLowerCase()
.trim()
// Remove special characters
.replace(/[^\w\s]/g, ' ')
// Normalize whitespace
.replace(/\s+/g, ' ')
.trim();
}
/**
* Normalize weight string to standard format
* e.g., "3.5 grams" -> "3.5g", "1/8 oz" -> "3.5g"
*/
function normalizeWeight(weight) {
if (!weight)
return '';
const w = weight.toLowerCase().trim();
// Handle fractional ounces
if (w.includes('1/8') || w.includes('eighth')) {
return '3.5g';
}
if (w.includes('1/4') || w.includes('quarter')) {
return '7g';
}
if (w.includes('1/2') || w.includes('half')) {
return '14g';
}
if (w.includes('1 oz') || w === 'oz' || w === '1oz') {
return '28g';
}
// Extract numeric value and unit
const match = w.match(/([\d.]+)\s*(mg|g|oz|ml|gram|grams?|ounce|ounces?)?/i);
if (!match)
return w;
const value = parseFloat(match[1]);
let unit = (match[2] || 'g').toLowerCase();
// Normalize unit names
unit = unit.replace(/gram(s)?/, 'g').replace(/ounce(s)?/, 'oz');
// Convert oz to grams for consistency
if (unit === 'oz') {
return `${(value * 28).toFixed(1)}g`;
}
return `${value}${unit}`;
}
/**
* Generate a matching fingerprint for a product
* Used for deduplication
*/
function generateProductFingerprint(name, brand, weight, categoryId) {
const parts = [
normalizeProductName(name),
normalizeBrandName(brand),
normalizeWeight(weight),
categoryId?.toString() || ''
];
return parts.filter(Boolean).join('|');
}
|
||||
/**
|
||||
* Calculate similarity between two strings (0-100)
|
||||
* Uses Levenshtein distance
|
||||
*/
|
||||
function stringSimilarity(str1, str2) {
|
||||
if (str1 === str2)
|
||||
return 100;
|
||||
if (!str1 || !str2)
|
||||
return 0;
|
||||
const s1 = str1.toLowerCase();
|
||||
const s2 = str2.toLowerCase();
|
||||
if (s1 === s2)
|
||||
return 100;
|
||||
const longer = s1.length > s2.length ? s1 : s2;
|
||||
const shorter = s1.length > s2.length ? s2 : s1;
|
||||
const longerLength = longer.length;
|
||||
if (longerLength === 0)
|
||||
return 100;
|
||||
const distance = levenshteinDistance(longer, shorter);
|
||||
return Math.round(((longerLength - distance) / longerLength) * 100);
|
||||
}
|
||||
/**
|
||||
* Levenshtein distance between two strings
|
||||
*/
|
||||
function levenshteinDistance(str1, str2) {
|
||||
const m = str1.length;
|
||||
const n = str2.length;
|
||||
// Create distance matrix
|
||||
const dp = Array(m + 1).fill(null).map(() => Array(n + 1).fill(0));
|
||||
// Initialize first row and column
|
||||
for (let i = 0; i <= m; i++)
|
||||
dp[i][0] = i;
|
||||
for (let j = 0; j <= n; j++)
|
||||
dp[0][j] = j;
|
||||
// Fill in the rest
|
||||
for (let i = 1; i <= m; i++) {
|
||||
for (let j = 1; j <= n; j++) {
|
||||
const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
|
||||
dp[i][j] = Math.min(dp[i - 1][j] + 1, // deletion
|
||||
dp[i][j - 1] + 1, // insertion
|
||||
dp[i - 1][j - 1] + cost // substitution
|
||||
);
|
||||
}
|
||||
}
|
||||
return dp[m][n];
|
||||
}
|
||||
/**
|
||||
* Check if two products are likely the same
|
||||
* Returns confidence score (0-100)
|
||||
*/
|
||||
function areProductsSimilar(product1, product2, threshold = 92) {
|
||||
const name1 = normalizeProductName(product1.name);
|
||||
const name2 = normalizeProductName(product2.name);
|
||||
const nameSimilarity = stringSimilarity(name1, name2);
|
||||
// If names are very similar, likely same product
|
||||
if (nameSimilarity >= threshold) {
|
||||
return { isSimilar: true, confidence: nameSimilarity };
|
||||
}
|
||||
// Check brand match for additional confidence
|
||||
const brand1 = normalizeBrandName(product1.brand);
|
||||
const brand2 = normalizeBrandName(product2.brand);
|
||||
if (brand1 && brand2 && brand1 === brand2) {
|
||||
// Same brand, lower threshold for name match
|
||||
if (nameSimilarity >= threshold - 10) {
|
||||
return { isSimilar: true, confidence: nameSimilarity + 5 };
|
||||
}
|
||||
}
|
||||
// Check weight match
|
||||
const weight1 = normalizeWeight(product1.weight);
|
||||
const weight2 = normalizeWeight(product2.weight);
|
||||
if (weight1 && weight2 && weight1 === weight2 && nameSimilarity >= threshold - 15) {
|
||||
return { isSimilar: true, confidence: nameSimilarity + 3 };
|
||||
}
|
||||
return { isSimilar: false, confidence: nameSimilarity };
|
||||
}
|
||||
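A short usage sketch of the normalizer above. The outputs follow directly from the functions as written; only the require path is an assumption:

// Require path is an assumption; the functions are exported by the file above.
const {
    normalizeProductName,
    normalizeWeight,
    generateProductFingerprint,
    areProductsSimilar,
} = require('./backend/dist/utils/product-normalizer');

console.log(normalizeProductName('OG Kush 28g Special Offer!!')); // 'og kush'
console.log(normalizeWeight('1/8 oz'));                           // '3.5g'
console.log(normalizeWeight('0.5 ounces'));                       // '14.0g'

// Fingerprints join the normalized fields with '|' for use as dedupe keys:
console.log(generateProductFingerprint('Blue Dream', 'Cresco Labs', '3.5 grams', 7));
// 'blue dream|cresco labs|3.5g|7'

// Name similarity of 94 clears the default threshold of 92:
console.log(areProductsSimilar(
    { name: 'Blue Dream Flower', brand: 'Cresco' },
    { name: 'Blue Dream Flowers', brand: 'Cresco' }
)); // { isSimilar: true, confidence: 94 }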
112
backend/dist/utils/proxyManager.js
vendored
Normal file
112
backend/dist/utils/proxyManager.js
vendored
Normal file
@@ -0,0 +1,112 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.getProxy = getProxy;
exports.getPhoenixProxy = getPhoenixProxy;
exports.getStateProxy = getStateProxy;
exports.getCityProxy = getCityProxy;
exports.getRandomProxy = getRandomProxy;
exports.getProxyLocationStats = getProxyLocationStats;
const migrate_1 = require("../db/migrate");
const logger_1 = require("../services/logger");
/**
 * Get an active proxy from the database, optionally filtered by location
 */
async function getProxy(locationFilter) {
    try {
        let query = `
      SELECT protocol, host, port, username, password
      FROM proxies
      WHERE active = true
    `;
        const params = [];
        let paramIndex = 1;
        if (locationFilter) {
            if (locationFilter.city) {
                query += ` AND LOWER(city) = LOWER($${paramIndex})`;
                params.push(locationFilter.city);
                paramIndex++;
            }
            if (locationFilter.state) {
                query += ` AND LOWER(state) = LOWER($${paramIndex})`;
                params.push(locationFilter.state);
                paramIndex++;
            }
            if (locationFilter.country) {
                query += ` AND LOWER(country) = LOWER($${paramIndex})`;
                params.push(locationFilter.country);
                paramIndex++;
            }
            if (locationFilter.countryCode) {
                query += ` AND LOWER(country_code) = LOWER($${paramIndex})`;
                params.push(locationFilter.countryCode);
                paramIndex++;
            }
        }
        // Use RANDOM() for true randomization instead of least recently used
        query += ` ORDER BY RANDOM() LIMIT 1`;
        const result = await migrate_1.pool.query(query, params);
        if (result.rows.length === 0) {
            logger_1.logger.warn('proxy', `No active proxies found with filter: ${JSON.stringify(locationFilter)}`);
            return null;
        }
        const proxy = result.rows[0];
        return {
            server: `${proxy.protocol}://${proxy.host}:${proxy.port}`,
            username: proxy.username || undefined,
            password: proxy.password || undefined,
        };
    }
    catch (error) {
        logger_1.logger.error('proxy', `Error fetching proxy: ${error}`);
        return null;
    }
}
/**
 * Get a proxy from Phoenix, AZ, USA (ideal for Arizona dispensaries)
 */
async function getPhoenixProxy() {
    return getProxy({ city: 'Phoenix', state: 'Arizona', country: 'United States' });
}
/**
 * Get a proxy from a specific US state
 */
async function getStateProxy(state) {
    return getProxy({ state, country: 'United States' });
}
/**
 * Get a proxy from a specific city
 */
async function getCityProxy(city, state) {
    return getProxy({ city, state });
}
/**
 * Get a random active proxy (no location filter)
 */
async function getRandomProxy() {
    return getProxy();
}
/**
 * Get proxy location statistics
 */
async function getProxyLocationStats() {
    try {
        const result = await migrate_1.pool.query(`
      SELECT
        country,
        state,
        city,
        COUNT(*) as count,
        SUM(CASE WHEN active THEN 1 ELSE 0 END) as active_count
      FROM proxies
      WHERE country IS NOT NULL
      GROUP BY country, state, city
      ORDER BY count DESC
      LIMIT 50
    `);
        return result.rows;
    }
    catch (error) {
        logger_1.logger.error('proxy', `Error fetching proxy stats: ${error}`);
        return [];
    }
}
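For reference, a sketch of how the location-filtered lookup composes; the require path and the logged server value are assumptions, and the returned shape is the one built by getProxy above (it also matches Playwright's proxy launch option):

// Require path is an assumption; getProxy/getPhoenixProxy are exported above.
const { getProxy, getPhoenixProxy } = require('./backend/dist/utils/proxyManager');

(async () => {
    // Each filter key appends an AND LOWER(col) = LOWER($n) clause, then
    // ORDER BY RANDOM() LIMIT 1 picks one active proxy at random.
    const proxy = (await getProxy({ city: 'Tempe', state: 'Arizona' }))
        || (await getPhoenixProxy()); // fall back to the Phoenix pool
    if (proxy) {
        console.log(proxy.server); // e.g. 'http://203.0.113.10:8080' (illustrative)
    }
})();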
264
backend/dist/utils/stealthBrowser.js
vendored
Normal file
264
backend/dist/utils/stealthBrowser.js
vendored
Normal file
@@ -0,0 +1,264 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.createStealthBrowser = createStealthBrowser;
exports.createStealthContext = createStealthContext;
exports.randomDelay = randomDelay;
exports.humanMouseMove = humanMouseMove;
exports.humanScroll = humanScroll;
exports.humanType = humanType;
exports.simulateHumanBehavior = simulateHumanBehavior;
exports.waitForPageLoad = waitForPageLoad;
exports.isCloudflareChallenge = isCloudflareChallenge;
exports.waitForCloudflareChallenge = waitForCloudflareChallenge;
exports.saveCookies = saveCookies;
exports.loadCookies = loadCookies;
const playwright_extra_1 = require("playwright-extra");
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
// Add stealth plugin
playwright_extra_1.chromium.use((0, puppeteer_extra_plugin_stealth_1.default)());
/**
 * Create a stealth browser instance with anti-detection measures
 */
async function createStealthBrowser(options = {}) {
    const launchOptions = {
        headless: options.headless !== false,
        args: [
            '--disable-blink-features=AutomationControlled',
            '--disable-features=IsolateOrigins,site-per-process',
            '--disable-web-security',
            '--disable-features=VizDisplayCompositor',
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--no-first-run',
            '--no-zygote',
            '--disable-gpu',
        ],
    };
    if (options.proxy) {
        launchOptions.proxy = options.proxy;
    }
    const browser = await playwright_extra_1.chromium.launch(launchOptions);
    return browser;
}
/**
 * Create a stealth context with realistic browser fingerprint
 */
async function createStealthContext(browser, options = {}) {
    const userAgent = options.userAgent ||
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
    const context = await browser.newContext({
        userAgent,
        viewport: { width: 1920, height: 1080 },
        locale: 'en-US',
        timezoneId: 'America/Phoenix',
        permissions: ['geolocation'],
        geolocation: { latitude: 33.4484, longitude: -112.074 }, // Phoenix, AZ
        colorScheme: 'light',
        deviceScaleFactor: 1,
        hasTouch: false,
        isMobile: false,
        javaScriptEnabled: true,
        extraHTTPHeaders: {
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Upgrade-Insecure-Requests': '1',
        },
    });
    // Set age verification cookies for Dutchie
    await context.addCookies([
        {
            name: 'age_verified',
            value: 'true',
            domain: '.dutchie.com',
            path: '/',
            expires: Math.floor(Date.now() / 1000) + 86400 * 30, // 30 days
        },
        {
            name: 'initial_location',
            value: JSON.stringify({ state: options.state || 'Arizona' }),
            domain: '.dutchie.com',
            path: '/',
            expires: Math.floor(Date.now() / 1000) + 86400 * 30,
        },
    ]);
    return context;
}
/**
 * Random delay between min and max milliseconds
 */
function randomDelay(min, max) {
    const delay = Math.floor(Math.random() * (max - min + 1)) + min;
    return new Promise((resolve) => setTimeout(resolve, delay));
}
/**
 * Simulate human-like mouse movement
 */
async function humanMouseMove(page, x, y) {
    const steps = 20;
    const currentPos = await page.evaluate(() => ({ x: 0, y: 0 }));
    for (let i = 0; i <= steps; i++) {
        const progress = i / steps;
        const easeProgress = easeInOutQuad(progress);
        const nextX = currentPos.x + (x - currentPos.x) * easeProgress;
        const nextY = currentPos.y + (y - currentPos.y) * easeProgress;
        await page.mouse.move(nextX, nextY);
        await randomDelay(5, 15);
    }
}
/**
 * Easing function for smooth mouse movement
 */
function easeInOutQuad(t) {
    return t < 0.5 ? 2 * t * t : -1 + (4 - 2 * t) * t;
}
/**
 * Simulate human-like scrolling
 */
async function humanScroll(page, scrollAmount = 500) {
    const scrollSteps = 10;
    const stepSize = scrollAmount / scrollSteps;
    for (let i = 0; i < scrollSteps; i++) {
        await page.mouse.wheel(0, stepSize);
        await randomDelay(50, 150);
    }
}
/**
 * Simulate human-like typing
 */
async function humanType(page, selector, text) {
    await page.click(selector);
    await randomDelay(100, 300);
    for (const char of text) {
        await page.keyboard.type(char);
        await randomDelay(50, 150);
    }
}
/**
 * Random realistic behavior before interacting with page
 */
async function simulateHumanBehavior(page) {
    // Random small mouse movements
    for (let i = 0; i < 3; i++) {
        const x = Math.random() * 500 + 100;
        const y = Math.random() * 300 + 100;
        await humanMouseMove(page, x, y);
        await randomDelay(200, 500);
    }
    // Small scroll
    await humanScroll(page, 100);
    await randomDelay(300, 700);
}
/**
 * Wait for page to be fully loaded with human-like delay
 */
async function waitForPageLoad(page, timeout = 60000) {
    try {
        await page.waitForLoadState('networkidle', { timeout });
        await randomDelay(500, 1500); // Random delay after load
    }
    catch (error) {
        // If networkidle times out, try domcontentloaded as fallback
        console.log('⚠️ networkidle timeout, waiting for domcontentloaded...');
        await page.waitForLoadState('domcontentloaded', { timeout: 30000 });
        await randomDelay(1000, 2000);
    }
}
/**
 * Check if we're on a Cloudflare challenge page
 */
async function isCloudflareChallenge(page) {
    const title = await page.title();
    const content = await page.content();
    return (title.includes('Cloudflare') ||
        title.includes('Just a moment') ||
        title.includes('Attention Required') ||
        content.includes('challenge-platform') ||
        content.includes('cf-challenge') ||
        content.includes('Checking your browser'));
}
/**
 * Wait for Cloudflare challenge to complete
 */
async function waitForCloudflareChallenge(page, maxWaitMs = 60000) {
    const startTime = Date.now();
    let attempts = 0;
    while (Date.now() - startTime < maxWaitMs) {
        attempts++;
        if (!(await isCloudflareChallenge(page))) {
            console.log(`✅ Cloudflare challenge passed after ${attempts} attempts (${Math.floor((Date.now() - startTime) / 1000)}s)`);
            return true;
        }
        const remaining = Math.floor((maxWaitMs - (Date.now() - startTime)) / 1000);
        console.log(`⏳ Waiting for Cloudflare challenge... (attempt ${attempts}, ${remaining}s remaining)`);
        // Random delay between checks
        await randomDelay(2000, 3000);
    }
    console.log('❌ Cloudflare challenge timeout - may need residential proxy or manual intervention');
    return false;
}
/**
 * Save session cookies to file
 */
async function saveCookies(context, filepath) {
    const cookies = await context.cookies();
    const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
    await fs.writeFile(filepath, JSON.stringify(cookies, null, 2));
}
/**
 * Load session cookies from file
 */
async function loadCookies(context, filepath) {
    try {
        const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
        const cookiesString = await fs.readFile(filepath, 'utf-8');
        const cookies = JSON.parse(cookiesString);
        await context.addCookies(cookies);
        return true;
    }
    catch (error) {
        return false;
    }
}
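Putting the pieces together, a minimal end-to-end sketch of the intended flow; the require paths and target URL are illustrative assumptions, while the function names and signatures come from the files above:

// Require paths and the target URL are assumptions; signatures match the file above.
const {
    createStealthBrowser, createStealthContext, waitForPageLoad,
    isCloudflareChallenge, waitForCloudflareChallenge, simulateHumanBehavior,
} = require('./backend/dist/utils/stealthBrowser');
const { getPhoenixProxy } = require('./backend/dist/utils/proxyManager');

(async () => {
    const proxy = await getPhoenixProxy(); // null if no matching proxy is configured
    const browser = await createStealthBrowser({ headless: true, proxy: proxy || undefined });
    const context = await createStealthContext(browser, { state: 'Arizona' }); // sets Dutchie age/location cookies
    const page = await context.newPage();
    await page.goto('https://dutchie.com/dispensary/example'); // illustrative URL
    await waitForPageLoad(page);
    if (await isCloudflareChallenge(page)) {
        await waitForCloudflareChallenge(page); // polls every 2-3s, up to 60s by default
    }
    await simulateHumanBehavior(page); // small mouse movements plus a short scroll
    // ... scrape the rendered page here ...
    await browser.close();
})();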