diff --git a/backend/dist/auth/middleware.js b/backend/dist/auth/middleware.js
deleted file mode 100644
index 280a8cf7..00000000
--- a/backend/dist/auth/middleware.js
+++ /dev/null
@@ -1,113 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.generateToken = generateToken;
-exports.verifyToken = verifyToken;
-exports.authenticateUser = authenticateUser;
-exports.authMiddleware = authMiddleware;
-exports.requireRole = requireRole;
-const jsonwebtoken_1 = __importDefault(require("jsonwebtoken"));
-const bcrypt_1 = __importDefault(require("bcrypt"));
-const migrate_1 = require("../db/migrate");
-const JWT_SECRET = process.env.JWT_SECRET || 'change_this_in_production';
-function generateToken(user) {
- return jsonwebtoken_1.default.sign({ id: user.id, email: user.email, role: user.role }, JWT_SECRET, { expiresIn: '7d' });
-}
-function verifyToken(token) {
- try {
- return jsonwebtoken_1.default.verify(token, JWT_SECRET);
- }
- catch (error) {
- return null;
- }
-}
-async function authenticateUser(email, password) {
- const result = await migrate_1.pool.query('SELECT id, email, password_hash, role FROM users WHERE email = $1', [email]);
- if (result.rows.length === 0) {
- return null;
- }
- const user = result.rows[0];
- const isValid = await bcrypt_1.default.compare(password, user.password_hash);
- if (!isValid) {
- return null;
- }
- return {
- id: user.id,
- email: user.email,
- role: user.role
- };
-}
-async function authMiddleware(req, res, next) {
- const authHeader = req.headers.authorization;
- if (!authHeader || !authHeader.startsWith('Bearer ')) {
- return res.status(401).json({ error: 'No token provided' });
- }
- const token = authHeader.substring(7);
- // Try JWT first
- const jwtUser = verifyToken(token);
- if (jwtUser) {
- req.user = jwtUser;
- return next();
- }
- // If JWT fails, try API token
- try {
- const result = await migrate_1.pool.query(`
- SELECT id, name, rate_limit, active, expires_at, allowed_endpoints
- FROM api_tokens
- WHERE token = $1
- `, [token]);
- if (result.rows.length === 0) {
- return res.status(401).json({ error: 'Invalid token' });
- }
- const apiToken = result.rows[0];
- // Check if token is active
- if (!apiToken.active) {
- return res.status(401).json({ error: 'Token is disabled' });
- }
- // Check if token is expired
- if (apiToken.expires_at && new Date(apiToken.expires_at) < new Date()) {
- return res.status(401).json({ error: 'Token has expired' });
- }
- // Check allowed endpoints
- if (apiToken.allowed_endpoints && apiToken.allowed_endpoints.length > 0) {
- const isAllowed = apiToken.allowed_endpoints.some((pattern) => {
- // Simple wildcard matching
- const regex = new RegExp('^' + pattern.replace('*', '.*') + '$');
- return regex.test(req.path);
- });
- if (!isAllowed) {
- return res.status(403).json({ error: 'Endpoint not allowed for this token' });
- }
- }
- // Set API token on request for tracking
- req.apiToken = {
- id: apiToken.id,
- name: apiToken.name,
- rate_limit: apiToken.rate_limit
- };
- // Set a generic user for compatibility with existing code
- req.user = {
- id: apiToken.id,
- email: `api-token-${apiToken.id}@system`,
- role: 'api'
- };
- next();
- }
- catch (error) {
- console.error('Error verifying API token:', error);
- return res.status(500).json({ error: 'Authentication failed' });
- }
-}
-function requireRole(...roles) {
- return (req, res, next) => {
- if (!req.user) {
- return res.status(401).json({ error: 'Not authenticated' });
- }
- if (!roles.includes(req.user.role)) {
- return res.status(403).json({ error: 'Insufficient permissions' });
- }
- next();
- };
-}
diff --git a/backend/dist/db/add-jobs-table.js b/backend/dist/db/add-jobs-table.js
deleted file mode 100644
index 58db75bc..00000000
--- a/backend/dist/db/add-jobs-table.js
+++ /dev/null
@@ -1,41 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const migrate_1 = require("./migrate");
-async function addJobsTable() {
- const client = await migrate_1.pool.connect();
- try {
- await client.query('BEGIN');
- await client.query(`
- CREATE TABLE IF NOT EXISTS jobs (
- id SERIAL PRIMARY KEY,
- type VARCHAR(50) NOT NULL,
- status VARCHAR(50) DEFAULT 'pending',
- store_id INTEGER REFERENCES stores(id) ON DELETE CASCADE,
- progress INTEGER DEFAULT 0,
- total_items INTEGER,
- processed_items INTEGER DEFAULT 0,
- error TEXT,
- started_at TIMESTAMP,
- completed_at TIMESTAMP,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
-
- CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
- CREATE INDEX IF NOT EXISTS idx_jobs_type ON jobs(type);
- CREATE INDEX IF NOT EXISTS idx_jobs_store_id ON jobs(store_id);
- `);
- await client.query('COMMIT');
- console.log('✅ Jobs table created successfully');
- }
- catch (error) {
- await client.query('ROLLBACK');
- console.error('❌ Failed to create jobs table:', error);
- throw error;
- }
- finally {
- client.release();
- }
-}
-addJobsTable()
- .then(() => process.exit(0))
- .catch(() => process.exit(1));
diff --git a/backend/dist/db/migrate.js b/backend/dist/db/migrate.js
deleted file mode 100644
index 5af42b0c..00000000
--- a/backend/dist/db/migrate.js
+++ /dev/null
@@ -1,321 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.pool = void 0;
-exports.runMigrations = runMigrations;
-const pg_1 = require("pg");
-// Consolidated DB connection:
-// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
-// - Then DATABASE_URL (default)
-const DATABASE_URL = process.env.CRAWLSY_DATABASE_URL ||
- process.env.DATABASE_URL ||
- 'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
-const pool = new pg_1.Pool({
- connectionString: DATABASE_URL,
-});
-exports.pool = pool;
-async function runMigrations() {
- const client = await pool.connect();
- try {
- await client.query('BEGIN');
- // Users table
- await client.query(`
- CREATE TABLE IF NOT EXISTS users (
- id SERIAL PRIMARY KEY,
- email VARCHAR(255) UNIQUE NOT NULL,
- password_hash VARCHAR(255) NOT NULL,
- role VARCHAR(50) DEFAULT 'admin',
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
- `);
- // Stores table
- await client.query(`
- CREATE TABLE IF NOT EXISTS stores (
- id SERIAL PRIMARY KEY,
- name VARCHAR(255) NOT NULL,
- slug VARCHAR(255) UNIQUE NOT NULL,
- dutchie_url TEXT NOT NULL,
- active BOOLEAN DEFAULT true,
- scrape_enabled BOOLEAN DEFAULT true,
- last_scraped_at TIMESTAMP,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
- `);
- // Categories table (shop, brands, specials)
- await client.query(`
- CREATE TABLE IF NOT EXISTS categories (
- id SERIAL PRIMARY KEY,
- store_id INTEGER REFERENCES stores(id) ON DELETE CASCADE,
- name VARCHAR(255) NOT NULL,
- slug VARCHAR(255) NOT NULL,
- dutchie_url TEXT NOT NULL,
- scrape_enabled BOOLEAN DEFAULT true,
- last_scraped_at TIMESTAMP,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- UNIQUE(store_id, slug)
- );
- `);
- // Products table
- await client.query(`
- CREATE TABLE IF NOT EXISTS products (
- id SERIAL PRIMARY KEY,
- store_id INTEGER REFERENCES stores(id) ON DELETE CASCADE,
- category_id INTEGER REFERENCES categories(id) ON DELETE SET NULL,
- dutchie_product_id VARCHAR(255),
- name VARCHAR(500) NOT NULL,
- slug VARCHAR(500),
- description TEXT,
- price DECIMAL(10, 2),
- original_price DECIMAL(10, 2),
- strain_type VARCHAR(100),
- thc_percentage DECIMAL(5, 2),
- cbd_percentage DECIMAL(5, 2),
- brand VARCHAR(255),
- weight VARCHAR(100),
- image_url TEXT,
- local_image_path TEXT,
- dutchie_url TEXT NOT NULL,
- in_stock BOOLEAN DEFAULT true,
- is_special BOOLEAN DEFAULT false,
- metadata JSONB,
- first_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- last_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- UNIQUE(store_id, dutchie_product_id)
- );
- `);
- // Campaigns table
- await client.query(`
- CREATE TABLE IF NOT EXISTS campaigns (
- id SERIAL PRIMARY KEY,
- name VARCHAR(255) NOT NULL,
- slug VARCHAR(255) UNIQUE NOT NULL,
- description TEXT,
- display_style VARCHAR(50) DEFAULT 'grid',
- active BOOLEAN DEFAULT true,
- start_date TIMESTAMP,
- end_date TIMESTAMP,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
- `);
- // Add variant column to products table (for different sizes/options of same product)
- await client.query(`
- ALTER TABLE products ADD COLUMN IF NOT EXISTS variant VARCHAR(255);
- `);
- // Add special tracking columns (DEPRECATED - not used with new approach)
- await client.query(`
- ALTER TABLE products ADD COLUMN IF NOT EXISTS special_ends_at TIMESTAMP;
- ALTER TABLE products ADD COLUMN IF NOT EXISTS special_text TEXT;
- ALTER TABLE products ADD COLUMN IF NOT EXISTS special_type VARCHAR(100);
- `);
- // ====== NEW SCHEMA ADDITIONS ======
- // Add array columns for product attributes
- await client.query(`
- ALTER TABLE products ADD COLUMN IF NOT EXISTS terpenes TEXT[];
- ALTER TABLE products ADD COLUMN IF NOT EXISTS effects TEXT[];
- ALTER TABLE products ADD COLUMN IF NOT EXISTS flavors TEXT[];
- `);
- // Add new price columns (regular_price = market price, sale_price = discount price)
- await client.query(`
- ALTER TABLE products ADD COLUMN IF NOT EXISTS regular_price DECIMAL(10, 2);
- ALTER TABLE products ADD COLUMN IF NOT EXISTS sale_price DECIMAL(10, 2);
- `);
- // Migrate existing price data
- await client.query(`
- UPDATE products
- SET regular_price = original_price
- WHERE regular_price IS NULL AND original_price IS NOT NULL;
- `);
- await client.query(`
- UPDATE products
- SET sale_price = price
- WHERE sale_price IS NULL AND price IS NOT NULL AND original_price IS NOT NULL AND price < original_price;
- `);
- // Make slug NOT NULL and add unique constraint
- await client.query(`
- UPDATE products SET slug = dutchie_product_id WHERE slug IS NULL;
- ALTER TABLE products ALTER COLUMN slug SET NOT NULL;
- `);
- // Drop old unique constraint and add new one on slug
- await client.query(`
- ALTER TABLE products DROP CONSTRAINT IF EXISTS products_store_id_dutchie_product_id_key;
- DO $$
- BEGIN
- IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'products_store_id_slug_unique') THEN
- ALTER TABLE products ADD CONSTRAINT products_store_id_slug_unique UNIQUE (store_id, slug);
- END IF;
- END$$;
- `);
- // Product Categories (many-to-many) - products can appear in multiple categories
- await client.query(`
- CREATE TABLE IF NOT EXISTS product_categories (
- id SERIAL PRIMARY KEY,
- product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
- category_slug VARCHAR(255) NOT NULL,
- first_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- last_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- UNIQUE(product_id, category_slug)
- );
- `);
- await client.query(`
- CREATE INDEX IF NOT EXISTS idx_product_categories_slug ON product_categories(category_slug, last_seen_at DESC);
- CREATE INDEX IF NOT EXISTS idx_product_categories_product ON product_categories(product_id);
- `);
- // Price History - track regular and sale price changes over time
- await client.query(`
- CREATE TABLE IF NOT EXISTS price_history (
- id SERIAL PRIMARY KEY,
- product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
- regular_price DECIMAL(10, 2),
- sale_price DECIMAL(10, 2),
- recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
- `);
- await client.query(`
- CREATE INDEX IF NOT EXISTS idx_price_history_product ON price_history(product_id, recorded_at DESC);
- CREATE INDEX IF NOT EXISTS idx_price_history_recorded ON price_history(recorded_at DESC);
- `);
- // Batch History - track cannabinoid/terpene changes (different batches)
- await client.query(`
- CREATE TABLE IF NOT EXISTS batch_history (
- id SERIAL PRIMARY KEY,
- product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
- thc_percentage DECIMAL(5, 2),
- cbd_percentage DECIMAL(5, 2),
- terpenes TEXT[],
- strain_type VARCHAR(100),
- recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
- `);
- await client.query(`
- CREATE INDEX IF NOT EXISTS idx_batch_history_product ON batch_history(product_id, recorded_at DESC);
- CREATE INDEX IF NOT EXISTS idx_batch_history_recorded ON batch_history(recorded_at DESC);
- `);
- // Campaign products (many-to-many with ordering)
- await client.query(`
- CREATE TABLE IF NOT EXISTS campaign_products (
- id SERIAL PRIMARY KEY,
- campaign_id INTEGER REFERENCES campaigns(id) ON DELETE CASCADE,
- product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
- display_order INTEGER DEFAULT 0,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- UNIQUE(campaign_id, product_id)
- );
- `);
- // Click tracking
- await client.query(`
- CREATE TABLE IF NOT EXISTS clicks (
- id SERIAL PRIMARY KEY,
- product_id INTEGER REFERENCES products(id) ON DELETE CASCADE,
- campaign_id INTEGER REFERENCES campaigns(id) ON DELETE SET NULL,
- ip_address VARCHAR(45),
- user_agent TEXT,
- referrer TEXT,
- clicked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
- `);
- // Create index on clicked_at for analytics queries
- await client.query(`
- CREATE INDEX IF NOT EXISTS idx_clicks_clicked_at ON clicks(clicked_at);
- CREATE INDEX IF NOT EXISTS idx_clicks_product_id ON clicks(product_id);
- CREATE INDEX IF NOT EXISTS idx_clicks_campaign_id ON clicks(campaign_id);
- `);
- // Proxies table
- await client.query(`
- CREATE TABLE IF NOT EXISTS proxies (
- id SERIAL PRIMARY KEY,
- host VARCHAR(255) NOT NULL,
- port INTEGER NOT NULL,
- protocol VARCHAR(10) NOT NULL,
- username VARCHAR(255),
- password VARCHAR(255),
- active BOOLEAN DEFAULT true,
- is_anonymous BOOLEAN DEFAULT false,
- last_tested_at TIMESTAMP,
- test_result VARCHAR(50),
- response_time_ms INTEGER,
- failure_count INTEGER DEFAULT 0,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- UNIQUE(host, port, protocol)
- );
- `);
- // Add failure_count column if it doesn't exist
- await client.query(`
- ALTER TABLE proxies ADD COLUMN IF NOT EXISTS failure_count INTEGER DEFAULT 0;
- `);
- // Failed proxies table
- await client.query(`
- CREATE TABLE IF NOT EXISTS failed_proxies (
- id SERIAL PRIMARY KEY,
- host VARCHAR(255) NOT NULL,
- port INTEGER NOT NULL,
- protocol VARCHAR(10) NOT NULL,
- username VARCHAR(255),
- password VARCHAR(255),
- failure_count INTEGER NOT NULL,
- last_error TEXT,
- failed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- UNIQUE(host, port, protocol)
- );
- `);
- // Proxy test jobs table
- await client.query(`
- CREATE TABLE IF NOT EXISTS proxy_test_jobs (
- id SERIAL PRIMARY KEY,
- status VARCHAR(20) NOT NULL DEFAULT 'pending',
- total_proxies INTEGER NOT NULL DEFAULT 0,
- tested_proxies INTEGER NOT NULL DEFAULT 0,
- passed_proxies INTEGER NOT NULL DEFAULT 0,
- failed_proxies INTEGER NOT NULL DEFAULT 0,
- started_at TIMESTAMP,
- completed_at TIMESTAMP,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
- `);
- await client.query(`
- CREATE INDEX IF NOT EXISTS idx_proxy_test_jobs_status ON proxy_test_jobs(status);
- CREATE INDEX IF NOT EXISTS idx_proxy_test_jobs_created_at ON proxy_test_jobs(created_at DESC);
- `);
- // Settings table
- await client.query(`
- CREATE TABLE IF NOT EXISTS settings (
- key VARCHAR(255) PRIMARY KEY,
- value TEXT NOT NULL,
- description TEXT,
- updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
- );
- `);
- // Insert default settings
- await client.query(`
- INSERT INTO settings (key, value, description) VALUES
- ('scrape_interval_hours', '4', 'How often to scrape stores (in hours)'),
- ('scrape_specials_time', '00:01', 'Time to scrape specials daily (HH:MM in 24h format)'),
- ('analytics_retention_days', '365', 'How many days to keep analytics data'),
- ('proxy_timeout_ms', '3000', 'Proxy timeout in milliseconds'),
- ('proxy_test_url', 'https://httpbin.org/ip', 'URL to test proxies against')
- ON CONFLICT (key) DO NOTHING;
- `);
- await client.query('COMMIT');
- console.log('✅ Migrations completed successfully');
- }
- catch (error) {
- await client.query('ROLLBACK');
- console.error('❌ Migration failed:', error);
- throw error;
- }
- finally {
- client.release();
- }
-}
-// Run migrations if this file is executed directly
-if (require.main === module) {
- runMigrations()
- .then(() => process.exit(0))
- .catch(() => process.exit(1));
-}
diff --git a/backend/dist/db/run-notifications-migration.js b/backend/dist/db/run-notifications-migration.js
deleted file mode 100644
index 008b33d1..00000000
--- a/backend/dist/db/run-notifications-migration.js
+++ /dev/null
@@ -1,56 +0,0 @@
-"use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-const migrate_1 = require("./migrate");
-const fs = __importStar(require("fs"));
-const path = __importStar(require("path"));
-async function runNotificationsMigration() {
- const client = await migrate_1.pool.connect();
- try {
- console.log('Running notifications migration...');
- const migrationSQL = fs.readFileSync(path.join(__dirname, '../../migrations/005_notifications.sql'), 'utf-8');
- await client.query(migrationSQL);
- console.log('✅ Notifications migration completed successfully');
- process.exit(0);
- }
- catch (error) {
- console.error('❌ Migration failed:', error);
- process.exit(1);
- }
- finally {
- client.release();
- }
-}
-runNotificationsMigration();
diff --git a/backend/dist/db/seed.js b/backend/dist/db/seed.js
deleted file mode 100644
index 638b9bc2..00000000
--- a/backend/dist/db/seed.js
+++ /dev/null
@@ -1,72 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.seedDatabase = seedDatabase;
-const migrate_1 = require("./migrate");
-const bcrypt_1 = __importDefault(require("bcrypt"));
-async function seedDatabase() {
- const client = await migrate_1.pool.connect();
- try {
- // Create admin user
- const adminEmail = process.env.ADMIN_EMAIL || 'admin@example.com';
- const adminPassword = process.env.ADMIN_PASSWORD || 'password';
- const passwordHash = await bcrypt_1.default.hash(adminPassword, 10);
- await client.query(`
- INSERT INTO users (email, password_hash, role)
- VALUES ($1, $2, 'superadmin')
- ON CONFLICT (email) DO UPDATE
- SET password_hash = $2, role = 'superadmin'
- `, [adminEmail, passwordHash]);
- console.log(`✅ Admin user created: ${adminEmail}`);
- // Create Deeply Rooted store
- const storeResult = await client.query(`
- INSERT INTO stores (name, slug, dutchie_url, active, scrape_enabled)
- VALUES ('Deeply Rooted', 'AZ-Deeply-Rooted', 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted', true, true)
- ON CONFLICT (slug) DO UPDATE
- SET name = 'Deeply Rooted', dutchie_url = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted'
- RETURNING id
- `);
- const storeId = storeResult.rows[0].id;
- console.log(`✅ Store created: Deeply Rooted (ID: ${storeId})`);
- // Create categories for the store
- const categories = [
- { name: 'Shop', slug: 'shop', url: 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted' },
- { name: 'Brands', slug: 'brands', url: 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/brands' },
- { name: 'Specials', slug: 'specials', url: 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/specials/sale/66501e094faefa00079b1835' }
- ];
- for (const cat of categories) {
- await client.query(`
- INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
- VALUES ($1, $2, $3, $4, true)
- ON CONFLICT (store_id, slug) DO UPDATE
- SET name = $2, dutchie_url = $4
- `, [storeId, cat.name, cat.slug, cat.url]);
- }
- console.log('✅ Categories created: Shop, Brands, Specials');
- // Create a default "Featured Products" campaign
- await client.query(`
- INSERT INTO campaigns (name, slug, description, display_style, active)
- VALUES ('Featured Products', 'featured', 'Default featured products campaign', 'grid', true)
- ON CONFLICT (slug) DO NOTHING
- `);
- console.log('✅ Default campaign created: Featured Products');
- console.log('\n🎉 Seeding completed successfully!');
- console.log(`\n📧 Login: ${adminEmail}`);
- console.log(`🔑 Password: ${adminPassword}`);
- }
- catch (error) {
- console.error('❌ Seeding failed:', error);
- throw error;
- }
- finally {
- client.release();
- }
-}
-// Run seed if this file is executed directly
-if (require.main === module) {
- seedDatabase()
- .then(() => process.exit(0))
- .catch(() => process.exit(1));
-}
diff --git a/backend/dist/db/update-categories-hierarchy.js b/backend/dist/db/update-categories-hierarchy.js
deleted file mode 100644
index 02f15fee..00000000
--- a/backend/dist/db/update-categories-hierarchy.js
+++ /dev/null
@@ -1,48 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const migrate_1 = require("./migrate");
-async function updateCategoriesHierarchy() {
- const client = await migrate_1.pool.connect();
- try {
- await client.query('BEGIN');
- // Add parent_id for nested categories
- await client.query(`
- ALTER TABLE categories
- ADD COLUMN IF NOT EXISTS parent_id INTEGER REFERENCES categories(id) ON DELETE CASCADE;
-
- ALTER TABLE categories
- ADD COLUMN IF NOT EXISTS display_order INTEGER DEFAULT 0;
-
- ALTER TABLE categories
- ADD COLUMN IF NOT EXISTS description TEXT;
-
- CREATE INDEX IF NOT EXISTS idx_categories_parent_id ON categories(parent_id);
- `);
- // Add category_path for easy searching (e.g., 'shop/flower')
- await client.query(`
- ALTER TABLE categories
- ADD COLUMN IF NOT EXISTS path VARCHAR(500);
-
- CREATE INDEX IF NOT EXISTS idx_categories_path ON categories(path);
- `);
- // Update existing categories to have paths
- await client.query(`
- UPDATE categories
- SET path = slug
- WHERE path IS NULL;
- `);
- await client.query('COMMIT');
- console.log('✅ Categories hierarchy updated successfully');
- }
- catch (error) {
- await client.query('ROLLBACK');
- console.error('❌ Failed to update categories:', error);
- throw error;
- }
- finally {
- client.release();
- }
-}
-updateCategoriesHierarchy()
- .then(() => process.exit(0))
- .catch(() => process.exit(1));
diff --git a/backend/dist/dutchie-az/config/dutchie.js b/backend/dist/dutchie-az/config/dutchie.js
deleted file mode 100644
index f9b2088b..00000000
--- a/backend/dist/dutchie-az/config/dutchie.js
+++ /dev/null
@@ -1,106 +0,0 @@
-"use strict";
-/**
- * Dutchie Configuration
- *
- * Centralized configuration for Dutchie GraphQL API interaction.
- * Update hashes here when Dutchie changes their persisted query system.
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = exports.dutchieConfig = void 0;
-exports.dutchieConfig = {
- // ============================================================
- // GRAPHQL ENDPOINT
- // ============================================================
- /** GraphQL endpoint - must be the api-3 graphql endpoint (NOT api-gw.dutchie.com which no longer exists) */
- graphqlEndpoint: 'https://dutchie.com/api-3/graphql',
- // ============================================================
- // GRAPHQL PERSISTED QUERY HASHES
- // ============================================================
- //
- // These hashes identify specific GraphQL operations.
- // If Dutchie changes their schema, you may need to capture
- // new hashes from live browser traffic (Network tab → graphql requests).
- /** FilteredProducts - main product listing query */
- filteredProductsHash: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
- /** GetAddressBasedDispensaryData - resolve slug to internal ID */
- getDispensaryDataHash: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
- /**
- * ConsumerDispensaries - geo-based discovery
- * NOTE: This is a placeholder guess. If discovery fails, either:
- * 1. Capture the real hash from live traffic
- * 2. Rely on known AZDHS slugs instead (set useDiscovery: false)
- */
- consumerDispensariesHash: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
- // ============================================================
- // BEHAVIOR FLAGS
- // ============================================================
- /** Enable geo-based discovery (false = use known AZDHS slugs only) */
- useDiscovery: true,
- /** Prefer GET requests (true) or POST (false). GET is default. */
- preferGet: true,
- /**
- * Enable POST fallback when GET fails with 405 or blocked.
- * If true, will retry failed GETs as POSTs.
- */
- enablePostFallback: true,
- // ============================================================
- // PAGINATION & RETRY
- // ============================================================
- /** Products per page for pagination */
- perPage: 100,
- /** Maximum pages to fetch (safety limit) */
- maxPages: 200,
- /** Number of retries for failed page fetches */
- maxRetries: 1,
- /** Delay between pages in ms */
- pageDelayMs: 500,
- /** Delay between modes in ms */
- modeDelayMs: 2000,
- // ============================================================
- // HTTP HEADERS
- // ============================================================
- /** Default headers to mimic browser requests */
- defaultHeaders: {
- 'accept': 'application/json, text/plain, */*',
- 'accept-language': 'en-US,en;q=0.9',
- 'apollographql-client-name': 'Marketplace (production)',
- },
- /** User agent string */
- userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- // ============================================================
- // BROWSER LAUNCH OPTIONS
- // ============================================================
- browserArgs: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-blink-features=AutomationControlled',
- ],
- /** Navigation timeout in ms */
- navigationTimeout: 60000,
- /** Initial page load delay in ms */
- pageLoadDelay: 2000,
-};
-/**
- * Get GraphQL hashes object for backward compatibility
- */
-exports.GRAPHQL_HASHES = {
- FilteredProducts: exports.dutchieConfig.filteredProductsHash,
- GetAddressBasedDispensaryData: exports.dutchieConfig.getDispensaryDataHash,
- ConsumerDispensaries: exports.dutchieConfig.consumerDispensariesHash,
-};
-/**
- * Arizona geo centerpoints for discovery scans
- */
-exports.ARIZONA_CENTERPOINTS = [
- { name: 'Phoenix', lat: 33.4484, lng: -112.074 },
- { name: 'Tucson', lat: 32.2226, lng: -110.9747 },
- { name: 'Flagstaff', lat: 35.1983, lng: -111.6513 },
- { name: 'Mesa', lat: 33.4152, lng: -111.8315 },
- { name: 'Scottsdale', lat: 33.4942, lng: -111.9261 },
- { name: 'Tempe', lat: 33.4255, lng: -111.94 },
- { name: 'Yuma', lat: 32.6927, lng: -114.6277 },
- { name: 'Prescott', lat: 34.54, lng: -112.4685 },
- { name: 'Lake Havasu', lat: 34.4839, lng: -114.3224 },
- { name: 'Sierra Vista', lat: 31.5455, lng: -110.2773 },
-];
diff --git a/backend/dist/dutchie-az/db/connection.js b/backend/dist/dutchie-az/db/connection.js
deleted file mode 100644
index e3b32e39..00000000
--- a/backend/dist/dutchie-az/db/connection.js
+++ /dev/null
@@ -1,79 +0,0 @@
-"use strict";
-/**
- * Dutchie AZ Database Connection
- *
- * Isolated database connection for Dutchie Arizona data.
- * Uses a separate database/schema to prevent cross-contamination with main app data.
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.getDutchieAZPool = getDutchieAZPool;
-exports.query = query;
-exports.getClient = getClient;
-exports.closePool = closePool;
-exports.healthCheck = healthCheck;
-const pg_1 = require("pg");
-// Consolidated DB naming:
-// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
-// - Then DUTCHIE_AZ_DATABASE_URL (legacy)
-// - Finally DATABASE_URL (legacy main DB)
-const DUTCHIE_AZ_DATABASE_URL = process.env.CRAWLSY_DATABASE_URL ||
- process.env.DUTCHIE_AZ_DATABASE_URL ||
- process.env.DATABASE_URL ||
- 'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';
-let pool = null;
-/**
- * Get the Dutchie AZ database pool (singleton)
- */
-function getDutchieAZPool() {
- if (!pool) {
- pool = new pg_1.Pool({
- connectionString: DUTCHIE_AZ_DATABASE_URL,
- max: 10,
- idleTimeoutMillis: 30000,
- connectionTimeoutMillis: 5000,
- });
- pool.on('error', (err) => {
- console.error('[DutchieAZ DB] Unexpected error on idle client:', err);
- });
- console.log('[DutchieAZ DB] Pool initialized');
- }
- return pool;
-}
-/**
- * Execute a query on the Dutchie AZ database
- */
-async function query(text, params) {
- const p = getDutchieAZPool();
- const result = await p.query(text, params);
- return { rows: result.rows, rowCount: result.rowCount || 0 };
-}
-/**
- * Get a client from the pool for transaction use
- */
-async function getClient() {
- const p = getDutchieAZPool();
- return p.connect();
-}
-/**
- * Close the pool connection
- */
-async function closePool() {
- if (pool) {
- await pool.end();
- pool = null;
- console.log('[DutchieAZ DB] Pool closed');
- }
-}
-/**
- * Check if the database is accessible
- */
-async function healthCheck() {
- try {
- const result = await query('SELECT 1 as ok');
- return result.rows.length > 0 && result.rows[0].ok === 1;
- }
- catch (error) {
- console.error('[DutchieAZ DB] Health check failed:', error);
- return false;
- }
-}
diff --git a/backend/dist/dutchie-az/db/migrate.js b/backend/dist/dutchie-az/db/migrate.js
deleted file mode 100644
index a4ea4eae..00000000
--- a/backend/dist/dutchie-az/db/migrate.js
+++ /dev/null
@@ -1,30 +0,0 @@
-"use strict";
-/**
- * Dutchie AZ Schema Bootstrap
- *
- * Run this to create/update the dutchie_az tables (dutchie_products, dutchie_product_snapshots, etc.)
- * in the AZ pipeline database. This is separate from the legacy schema.
- *
- * Usage:
- * TS_NODE_TRANSPILE_ONLY=1 npx ts-node src/dutchie-az/db/migrate.ts
- * or (after build)
- * node dist/dutchie-az/db/migrate.js
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-const schema_1 = require("./schema");
-const connection_1 = require("./connection");
-async function main() {
- try {
- console.log('[DutchieAZ] Running schema migration...');
- await (0, schema_1.createSchema)();
- console.log('[DutchieAZ] Schema migration complete.');
- }
- catch (err) {
- console.error('[DutchieAZ] Schema migration failed:', err.message);
- process.exitCode = 1;
- }
- finally {
- await (0, connection_1.closePool)();
- }
-}
-main();
diff --git a/backend/dist/dutchie-az/db/schema.js b/backend/dist/dutchie-az/db/schema.js
deleted file mode 100644
index 493692a3..00000000
--- a/backend/dist/dutchie-az/db/schema.js
+++ /dev/null
@@ -1,405 +0,0 @@
-"use strict";
-/**
- * Dutchie AZ Database Schema
- *
- * Creates all tables for the isolated Dutchie Arizona data pipeline.
- * Run this to initialize the dutchie_az database.
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.createSchema = createSchema;
-exports.dropSchema = dropSchema;
-exports.schemaExists = schemaExists;
-exports.ensureSchema = ensureSchema;
-const connection_1 = require("./connection");
-/**
- * SQL statements to create all tables
- */
-const SCHEMA_SQL = `
--- ============================================================
--- DISPENSARIES TABLE
--- Stores discovered Dutchie dispensaries in Arizona
--- ============================================================
-CREATE TABLE IF NOT EXISTS dispensaries (
- id SERIAL PRIMARY KEY,
- platform VARCHAR(20) NOT NULL DEFAULT 'dutchie',
- name VARCHAR(255) NOT NULL,
- slug VARCHAR(255) NOT NULL,
- city VARCHAR(100) NOT NULL,
- state VARCHAR(10) NOT NULL DEFAULT 'AZ',
- postal_code VARCHAR(20),
- address TEXT,
- latitude DECIMAL(10, 7),
- longitude DECIMAL(10, 7),
- platform_dispensary_id VARCHAR(100),
- is_delivery BOOLEAN DEFAULT false,
- is_pickup BOOLEAN DEFAULT true,
- raw_metadata JSONB,
- last_crawled_at TIMESTAMPTZ,
- product_count INTEGER DEFAULT 0,
- created_at TIMESTAMPTZ DEFAULT NOW(),
- updated_at TIMESTAMPTZ DEFAULT NOW(),
-
- CONSTRAINT uk_dispensaries_platform_slug UNIQUE (platform, slug, city, state)
-);
-
-CREATE INDEX IF NOT EXISTS idx_dispensaries_platform ON dispensaries(platform);
-CREATE INDEX IF NOT EXISTS idx_dispensaries_platform_id ON dispensaries(platform_dispensary_id);
-CREATE INDEX IF NOT EXISTS idx_dispensaries_state ON dispensaries(state);
-CREATE INDEX IF NOT EXISTS idx_dispensaries_city ON dispensaries(city);
-
--- ============================================================
--- DUTCHIE_PRODUCTS TABLE
--- Canonical product identity per store
--- ============================================================
-CREATE TABLE IF NOT EXISTS dutchie_products (
- id SERIAL PRIMARY KEY,
- dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
- platform VARCHAR(20) NOT NULL DEFAULT 'dutchie',
-
- external_product_id VARCHAR(100) NOT NULL,
- platform_dispensary_id VARCHAR(100) NOT NULL,
- c_name VARCHAR(500),
- name VARCHAR(500) NOT NULL,
-
- -- Brand
- brand_name VARCHAR(255),
- brand_id VARCHAR(100),
- brand_logo_url TEXT,
-
- -- Classification
- type VARCHAR(100),
- subcategory VARCHAR(100),
- strain_type VARCHAR(50),
- provider VARCHAR(100),
-
- -- Potency
- thc DECIMAL(10, 4),
- thc_content DECIMAL(10, 4),
- cbd DECIMAL(10, 4),
- cbd_content DECIMAL(10, 4),
- cannabinoids_v2 JSONB,
- effects JSONB,
-
- -- Status / flags
- status VARCHAR(50),
- medical_only BOOLEAN DEFAULT false,
- rec_only BOOLEAN DEFAULT false,
- featured BOOLEAN DEFAULT false,
- coming_soon BOOLEAN DEFAULT false,
- certificate_of_analysis_enabled BOOLEAN DEFAULT false,
-
- is_below_threshold BOOLEAN DEFAULT false,
- is_below_kiosk_threshold BOOLEAN DEFAULT false,
- options_below_threshold BOOLEAN DEFAULT false,
- options_below_kiosk_threshold BOOLEAN DEFAULT false,
-
- -- Derived stock status: 'in_stock', 'out_of_stock', 'unknown'
- stock_status VARCHAR(20) DEFAULT 'unknown',
- total_quantity_available INTEGER DEFAULT 0,
-
- -- Images
- primary_image_url TEXT,
- images JSONB,
-
- -- Misc
- measurements JSONB,
- weight VARCHAR(50),
- past_c_names TEXT[],
-
- created_at_dutchie TIMESTAMPTZ,
- updated_at_dutchie TIMESTAMPTZ,
-
- latest_raw_payload JSONB,
-
- created_at TIMESTAMPTZ DEFAULT NOW(),
- updated_at TIMESTAMPTZ DEFAULT NOW(),
-
- CONSTRAINT uk_dutchie_products UNIQUE (dispensary_id, external_product_id)
-);
-
-CREATE INDEX IF NOT EXISTS idx_dutchie_products_dispensary ON dutchie_products(dispensary_id);
-CREATE INDEX IF NOT EXISTS idx_dutchie_products_external_id ON dutchie_products(external_product_id);
-CREATE INDEX IF NOT EXISTS idx_dutchie_products_platform_disp ON dutchie_products(platform_dispensary_id);
-CREATE INDEX IF NOT EXISTS idx_dutchie_products_brand ON dutchie_products(brand_name);
-CREATE INDEX IF NOT EXISTS idx_dutchie_products_type ON dutchie_products(type);
-CREATE INDEX IF NOT EXISTS idx_dutchie_products_subcategory ON dutchie_products(subcategory);
-CREATE INDEX IF NOT EXISTS idx_dutchie_products_status ON dutchie_products(status);
-CREATE INDEX IF NOT EXISTS idx_dutchie_products_strain ON dutchie_products(strain_type);
-CREATE INDEX IF NOT EXISTS idx_dutchie_products_stock_status ON dutchie_products(stock_status);
-
--- ============================================================
--- DUTCHIE_PRODUCT_SNAPSHOTS TABLE
--- Historical state per crawl, includes options[]
--- ============================================================
-CREATE TABLE IF NOT EXISTS dutchie_product_snapshots (
- id SERIAL PRIMARY KEY,
- dutchie_product_id INTEGER NOT NULL REFERENCES dutchie_products(id) ON DELETE CASCADE,
- dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE,
- platform_dispensary_id VARCHAR(100) NOT NULL,
- external_product_id VARCHAR(100) NOT NULL,
- pricing_type VARCHAR(20) DEFAULT 'unknown',
- crawl_mode VARCHAR(20) DEFAULT 'mode_a', -- 'mode_a' (UI parity) or 'mode_b' (max coverage)
-
- status VARCHAR(50),
- featured BOOLEAN DEFAULT false,
- special BOOLEAN DEFAULT false,
- medical_only BOOLEAN DEFAULT false,
- rec_only BOOLEAN DEFAULT false,
-
- -- Flag indicating if product was present in feed (false = missing_from_feed snapshot)
- is_present_in_feed BOOLEAN DEFAULT true,
-
- -- Derived stock status
- stock_status VARCHAR(20) DEFAULT 'unknown',
-
- -- Price summary (in cents)
- rec_min_price_cents INTEGER,
- rec_max_price_cents INTEGER,
- rec_min_special_price_cents INTEGER,
- med_min_price_cents INTEGER,
- med_max_price_cents INTEGER,
- med_min_special_price_cents INTEGER,
- wholesale_min_price_cents INTEGER,
-
- -- Inventory summary
- total_quantity_available INTEGER,
- total_kiosk_quantity_available INTEGER,
- manual_inventory BOOLEAN DEFAULT false,
- is_below_threshold BOOLEAN DEFAULT false,
- is_below_kiosk_threshold BOOLEAN DEFAULT false,
-
- -- Option-level data (from POSMetaData.children)
- options JSONB,
-
- -- Full raw product node
- raw_payload JSONB NOT NULL,
-
- crawled_at TIMESTAMPTZ NOT NULL,
- created_at TIMESTAMPTZ DEFAULT NOW(),
- updated_at TIMESTAMPTZ DEFAULT NOW()
-);
-
-CREATE INDEX IF NOT EXISTS idx_snapshots_product ON dutchie_product_snapshots(dutchie_product_id);
-CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary ON dutchie_product_snapshots(dispensary_id);
-CREATE INDEX IF NOT EXISTS idx_snapshots_crawled_at ON dutchie_product_snapshots(crawled_at);
-CREATE INDEX IF NOT EXISTS idx_snapshots_platform_disp ON dutchie_product_snapshots(platform_dispensary_id);
-CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON dutchie_product_snapshots(external_product_id);
-CREATE INDEX IF NOT EXISTS idx_snapshots_special ON dutchie_product_snapshots(special) WHERE special = true;
-CREATE INDEX IF NOT EXISTS idx_snapshots_stock_status ON dutchie_product_snapshots(stock_status);
-CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_mode ON dutchie_product_snapshots(crawl_mode);
-
--- ============================================================
--- CRAWL_JOBS TABLE
--- Tracks crawl execution status
--- ============================================================
-CREATE TABLE IF NOT EXISTS crawl_jobs (
- id SERIAL PRIMARY KEY,
- job_type VARCHAR(50) NOT NULL,
- dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL,
- status VARCHAR(20) NOT NULL DEFAULT 'pending',
- started_at TIMESTAMPTZ,
- completed_at TIMESTAMPTZ,
- error_message TEXT,
- products_found INTEGER,
- snapshots_created INTEGER,
- metadata JSONB,
- created_at TIMESTAMPTZ DEFAULT NOW(),
- updated_at TIMESTAMPTZ DEFAULT NOW()
-);
-
-CREATE INDEX IF NOT EXISTS idx_crawl_jobs_type ON crawl_jobs(job_type);
-CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status);
-CREATE INDEX IF NOT EXISTS idx_crawl_jobs_dispensary ON crawl_jobs(dispensary_id);
-CREATE INDEX IF NOT EXISTS idx_crawl_jobs_created ON crawl_jobs(created_at);
-
--- ============================================================
--- JOB_SCHEDULES TABLE
--- Stores schedule configuration for recurring jobs with jitter support
--- Each job has independent timing that "wanders" over time
--- ============================================================
-CREATE TABLE IF NOT EXISTS job_schedules (
- id SERIAL PRIMARY KEY,
- job_name VARCHAR(100) NOT NULL UNIQUE,
- description TEXT,
- enabled BOOLEAN DEFAULT true,
-
- -- Timing configuration (jitter makes times "wander")
- base_interval_minutes INTEGER NOT NULL DEFAULT 240, -- e.g., 4 hours
- jitter_minutes INTEGER NOT NULL DEFAULT 30, -- e.g., ±30 min
-
- -- Last run tracking
- last_run_at TIMESTAMPTZ,
- last_status VARCHAR(20), -- 'success', 'error', 'partial', 'running'
- last_error_message TEXT,
- last_duration_ms INTEGER,
-
- -- Next run (calculated with jitter after each run)
- next_run_at TIMESTAMPTZ,
-
- -- Additional config
- job_config JSONB, -- e.g., { pricingType: 'rec', useBothModes: true }
-
- created_at TIMESTAMPTZ DEFAULT NOW(),
- updated_at TIMESTAMPTZ DEFAULT NOW()
-);
-
-CREATE INDEX IF NOT EXISTS idx_job_schedules_enabled ON job_schedules(enabled);
-CREATE INDEX IF NOT EXISTS idx_job_schedules_next_run ON job_schedules(next_run_at);
-
--- ============================================================
--- JOB_RUN_LOGS TABLE
--- Stores history of job runs for monitoring
--- ============================================================
-CREATE TABLE IF NOT EXISTS job_run_logs (
- id SERIAL PRIMARY KEY,
- schedule_id INTEGER NOT NULL REFERENCES job_schedules(id) ON DELETE CASCADE,
- job_name VARCHAR(100) NOT NULL,
- status VARCHAR(20) NOT NULL, -- 'pending', 'running', 'success', 'error', 'partial'
- started_at TIMESTAMPTZ,
- completed_at TIMESTAMPTZ,
- duration_ms INTEGER,
- error_message TEXT,
-
- -- Results summary
- items_processed INTEGER,
- items_succeeded INTEGER,
- items_failed INTEGER,
-
- metadata JSONB, -- Additional run details
-
- created_at TIMESTAMPTZ DEFAULT NOW()
-);
-
-CREATE INDEX IF NOT EXISTS idx_job_run_logs_schedule ON job_run_logs(schedule_id);
-CREATE INDEX IF NOT EXISTS idx_job_run_logs_job_name ON job_run_logs(job_name);
-CREATE INDEX IF NOT EXISTS idx_job_run_logs_status ON job_run_logs(status);
-CREATE INDEX IF NOT EXISTS idx_job_run_logs_created ON job_run_logs(created_at);
-
--- ============================================================
--- VIEWS FOR EASY QUERYING
--- ============================================================
-
--- Categories derived from products
-CREATE OR REPLACE VIEW v_categories AS
-SELECT
- type,
- subcategory,
- COUNT(DISTINCT id) as product_count,
- COUNT(DISTINCT dispensary_id) as dispensary_count,
- AVG(thc) as avg_thc,
- MIN(thc) as min_thc,
- MAX(thc) as max_thc
-FROM dutchie_products
-WHERE type IS NOT NULL
-GROUP BY type, subcategory
-ORDER BY type, subcategory;
-
--- Brands derived from products
-CREATE OR REPLACE VIEW v_brands AS
-SELECT
- brand_name,
- brand_id,
- MAX(brand_logo_url) as brand_logo_url,
- COUNT(DISTINCT id) as product_count,
- COUNT(DISTINCT dispensary_id) as dispensary_count,
- ARRAY_AGG(DISTINCT type) FILTER (WHERE type IS NOT NULL) as product_types
-FROM dutchie_products
-WHERE brand_name IS NOT NULL
-GROUP BY brand_name, brand_id
-ORDER BY product_count DESC;
-
--- Latest snapshot per product (most recent crawl data)
-CREATE OR REPLACE VIEW v_latest_snapshots AS
-SELECT DISTINCT ON (dutchie_product_id)
- s.*
-FROM dutchie_product_snapshots s
-ORDER BY dutchie_product_id, crawled_at DESC;
-
--- Dashboard stats
-CREATE OR REPLACE VIEW v_dashboard_stats AS
-SELECT
- (SELECT COUNT(*) FROM dispensaries WHERE state = 'AZ') as dispensary_count,
- (SELECT COUNT(*) FROM dutchie_products) as product_count,
- (SELECT COUNT(*) FROM dutchie_product_snapshots WHERE crawled_at > NOW() - INTERVAL '24 hours') as snapshots_24h,
- (SELECT MAX(crawled_at) FROM dutchie_product_snapshots) as last_crawl_time,
- (SELECT COUNT(*) FROM crawl_jobs WHERE status = 'failed' AND created_at > NOW() - INTERVAL '24 hours') as failed_jobs_24h,
- (SELECT COUNT(DISTINCT brand_name) FROM dutchie_products WHERE brand_name IS NOT NULL) as brand_count,
- (SELECT COUNT(DISTINCT (type, subcategory)) FROM dutchie_products WHERE type IS NOT NULL) as category_count;
-`;
-/**
- * Run the schema migration
- */
-async function createSchema() {
- console.log('[DutchieAZ Schema] Creating database schema...');
- const client = await (0, connection_1.getClient)();
- try {
- await client.query('BEGIN');
- // Split into individual statements and execute
- const statements = SCHEMA_SQL
- .split(';')
- .map(s => s.trim())
- .filter(s => s.length > 0 && !s.startsWith('--'));
- for (const statement of statements) {
- if (statement.trim()) {
- await client.query(statement + ';');
- }
- }
- await client.query('COMMIT');
- console.log('[DutchieAZ Schema] Schema created successfully');
- }
- catch (error) {
- await client.query('ROLLBACK');
- console.error('[DutchieAZ Schema] Failed to create schema:', error);
- throw error;
- }
- finally {
- client.release();
- }
-}
-/**
- * Drop all tables (for development/testing)
- */
-async function dropSchema() {
- console.log('[DutchieAZ Schema] Dropping all tables...');
- await (0, connection_1.query)(`
- DROP VIEW IF EXISTS v_dashboard_stats CASCADE;
- DROP VIEW IF EXISTS v_latest_snapshots CASCADE;
- DROP VIEW IF EXISTS v_brands CASCADE;
- DROP VIEW IF EXISTS v_categories CASCADE;
- DROP TABLE IF EXISTS crawl_schedule CASCADE;
- DROP TABLE IF EXISTS crawl_jobs CASCADE;
- DROP TABLE IF EXISTS dutchie_product_snapshots CASCADE;
- DROP TABLE IF EXISTS dutchie_products CASCADE;
- DROP TABLE IF EXISTS dispensaries CASCADE;
- `);
- console.log('[DutchieAZ Schema] All tables dropped');
-}
-/**
- * Check if schema exists
- */
-async function schemaExists() {
- try {
- const result = await (0, connection_1.query)(`
- SELECT EXISTS (
- SELECT FROM information_schema.tables
- WHERE table_name = 'dispensaries'
- ) as exists
- `);
- return result.rows[0]?.exists === true;
- }
- catch (error) {
- return false;
- }
-}
-/**
- * Initialize schema if it doesn't exist
- */
-async function ensureSchema() {
- const exists = await schemaExists();
- if (!exists) {
- await createSchema();
- }
- else {
- console.log('[DutchieAZ Schema] Schema already exists');
- }
-}
diff --git a/backend/dist/dutchie-az/index.js b/backend/dist/dutchie-az/index.js
deleted file mode 100644
index b0887874..00000000
--- a/backend/dist/dutchie-az/index.js
+++ /dev/null
@@ -1,95 +0,0 @@
-"use strict";
-/**
- * Dutchie AZ Data Pipeline
- *
- * Isolated data pipeline for crawling and storing Dutchie Arizona dispensary data.
- * This module is completely separate from the main application database.
- *
- * Features:
- * - Two-mode crawling (Mode A: UI parity, Mode B: MAX COVERAGE)
- * - Derived stockStatus field (in_stock, out_of_stock, unknown)
- * - Full raw payload storage for 100% data preservation
- * - AZDHS dispensary list as canonical source
- */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __exportStar = (this && this.__exportStar) || function(m, exports) {
- for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
-};
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.dutchieAZRouter = exports.getImportStats = exports.importFromJSON = exports.importAZDHSDispensaries = exports.getRunLogs = exports.initializeDefaultSchedules = exports.triggerScheduleNow = exports.deleteSchedule = exports.updateSchedule = exports.createSchedule = exports.getScheduleById = exports.getAllSchedules = exports.crawlSingleDispensary = exports.getSchedulerStatus = exports.triggerImmediateCrawl = exports.stopScheduler = exports.startScheduler = exports.crawlAllArizonaDispensaries = exports.crawlDispensaryProducts = exports.normalizeSnapshot = exports.normalizeProduct = exports.getDispensariesWithPlatformIds = exports.getDispensaryById = exports.getAllDispensaries = exports.resolvePlatformDispensaryIds = exports.discoverAndSaveDispensaries = exports.importFromExistingDispensaries = exports.discoverDispensaries = exports.discoverArizonaDispensaries = exports.fetchAllProductsBothModes = exports.fetchAllProducts = exports.resolveDispensaryId = exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = exports.ensureSchema = exports.schemaExists = exports.dropSchema = exports.createSchema = exports.healthCheck = exports.closePool = exports.getClient = exports.query = exports.getDutchieAZPool = void 0;
-// Types
-__exportStar(require("./types"), exports);
-// Database
-var connection_1 = require("./db/connection");
-Object.defineProperty(exports, "getDutchieAZPool", { enumerable: true, get: function () { return connection_1.getDutchieAZPool; } });
-Object.defineProperty(exports, "query", { enumerable: true, get: function () { return connection_1.query; } });
-Object.defineProperty(exports, "getClient", { enumerable: true, get: function () { return connection_1.getClient; } });
-Object.defineProperty(exports, "closePool", { enumerable: true, get: function () { return connection_1.closePool; } });
-Object.defineProperty(exports, "healthCheck", { enumerable: true, get: function () { return connection_1.healthCheck; } });
-var schema_1 = require("./db/schema");
-Object.defineProperty(exports, "createSchema", { enumerable: true, get: function () { return schema_1.createSchema; } });
-Object.defineProperty(exports, "dropSchema", { enumerable: true, get: function () { return schema_1.dropSchema; } });
-Object.defineProperty(exports, "schemaExists", { enumerable: true, get: function () { return schema_1.schemaExists; } });
-Object.defineProperty(exports, "ensureSchema", { enumerable: true, get: function () { return schema_1.ensureSchema; } });
-// Services - GraphQL Client
-var graphql_client_1 = require("./services/graphql-client");
-Object.defineProperty(exports, "GRAPHQL_HASHES", { enumerable: true, get: function () { return graphql_client_1.GRAPHQL_HASHES; } });
-Object.defineProperty(exports, "ARIZONA_CENTERPOINTS", { enumerable: true, get: function () { return graphql_client_1.ARIZONA_CENTERPOINTS; } });
-Object.defineProperty(exports, "resolveDispensaryId", { enumerable: true, get: function () { return graphql_client_1.resolveDispensaryId; } });
-Object.defineProperty(exports, "fetchAllProducts", { enumerable: true, get: function () { return graphql_client_1.fetchAllProducts; } });
-Object.defineProperty(exports, "fetchAllProductsBothModes", { enumerable: true, get: function () { return graphql_client_1.fetchAllProductsBothModes; } });
-Object.defineProperty(exports, "discoverArizonaDispensaries", { enumerable: true, get: function () { return graphql_client_1.discoverArizonaDispensaries; } });
-// Alias for backward compatibility
-Object.defineProperty(exports, "discoverDispensaries", { enumerable: true, get: function () { return graphql_client_1.discoverArizonaDispensaries; } });
-// Services - Discovery
-var discovery_1 = require("./services/discovery");
-Object.defineProperty(exports, "importFromExistingDispensaries", { enumerable: true, get: function () { return discovery_1.importFromExistingDispensaries; } });
-Object.defineProperty(exports, "discoverAndSaveDispensaries", { enumerable: true, get: function () { return discovery_1.discoverDispensaries; } });
-Object.defineProperty(exports, "resolvePlatformDispensaryIds", { enumerable: true, get: function () { return discovery_1.resolvePlatformDispensaryIds; } });
-Object.defineProperty(exports, "getAllDispensaries", { enumerable: true, get: function () { return discovery_1.getAllDispensaries; } });
-Object.defineProperty(exports, "getDispensaryById", { enumerable: true, get: function () { return discovery_1.getDispensaryById; } });
-Object.defineProperty(exports, "getDispensariesWithPlatformIds", { enumerable: true, get: function () { return discovery_1.getDispensariesWithPlatformIds; } });
-// Services - Product Crawler
-var product_crawler_1 = require("./services/product-crawler");
-Object.defineProperty(exports, "normalizeProduct", { enumerable: true, get: function () { return product_crawler_1.normalizeProduct; } });
-Object.defineProperty(exports, "normalizeSnapshot", { enumerable: true, get: function () { return product_crawler_1.normalizeSnapshot; } });
-Object.defineProperty(exports, "crawlDispensaryProducts", { enumerable: true, get: function () { return product_crawler_1.crawlDispensaryProducts; } });
-Object.defineProperty(exports, "crawlAllArizonaDispensaries", { enumerable: true, get: function () { return product_crawler_1.crawlAllArizonaDispensaries; } });
-// Services - Scheduler
-var scheduler_1 = require("./services/scheduler");
-Object.defineProperty(exports, "startScheduler", { enumerable: true, get: function () { return scheduler_1.startScheduler; } });
-Object.defineProperty(exports, "stopScheduler", { enumerable: true, get: function () { return scheduler_1.stopScheduler; } });
-Object.defineProperty(exports, "triggerImmediateCrawl", { enumerable: true, get: function () { return scheduler_1.triggerImmediateCrawl; } });
-Object.defineProperty(exports, "getSchedulerStatus", { enumerable: true, get: function () { return scheduler_1.getSchedulerStatus; } });
-Object.defineProperty(exports, "crawlSingleDispensary", { enumerable: true, get: function () { return scheduler_1.crawlSingleDispensary; } });
-// Schedule config CRUD
-Object.defineProperty(exports, "getAllSchedules", { enumerable: true, get: function () { return scheduler_1.getAllSchedules; } });
-Object.defineProperty(exports, "getScheduleById", { enumerable: true, get: function () { return scheduler_1.getScheduleById; } });
-Object.defineProperty(exports, "createSchedule", { enumerable: true, get: function () { return scheduler_1.createSchedule; } });
-Object.defineProperty(exports, "updateSchedule", { enumerable: true, get: function () { return scheduler_1.updateSchedule; } });
-Object.defineProperty(exports, "deleteSchedule", { enumerable: true, get: function () { return scheduler_1.deleteSchedule; } });
-Object.defineProperty(exports, "triggerScheduleNow", { enumerable: true, get: function () { return scheduler_1.triggerScheduleNow; } });
-Object.defineProperty(exports, "initializeDefaultSchedules", { enumerable: true, get: function () { return scheduler_1.initializeDefaultSchedules; } });
-// Run logs
-Object.defineProperty(exports, "getRunLogs", { enumerable: true, get: function () { return scheduler_1.getRunLogs; } });
-// Services - AZDHS Import
-var azdhs_import_1 = require("./services/azdhs-import");
-Object.defineProperty(exports, "importAZDHSDispensaries", { enumerable: true, get: function () { return azdhs_import_1.importAZDHSDispensaries; } });
-Object.defineProperty(exports, "importFromJSON", { enumerable: true, get: function () { return azdhs_import_1.importFromJSON; } });
-Object.defineProperty(exports, "getImportStats", { enumerable: true, get: function () { return azdhs_import_1.getImportStats; } });
-// Routes
-var routes_1 = require("./routes");
-Object.defineProperty(exports, "dutchieAZRouter", { enumerable: true, get: function () { return __importDefault(routes_1).default; } });
diff --git a/backend/dist/dutchie-az/routes/index.js b/backend/dist/dutchie-az/routes/index.js
deleted file mode 100644
index 5e4c313a..00000000
--- a/backend/dist/dutchie-az/routes/index.js
+++ /dev/null
@@ -1,1729 +0,0 @@
-"use strict";
-/**
- * Dutchie AZ API Routes
- *
- * Express routes for the Dutchie AZ data pipeline.
- * Provides API endpoints for stores, products, categories, and dashboard.
- */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const connection_1 = require("../db/connection");
-const schema_1 = require("../db/schema");
-const azdhs_import_1 = require("../services/azdhs-import");
-const discovery_1 = require("../services/discovery");
-const product_crawler_1 = require("../services/product-crawler");
-// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
-const DISPENSARY_COLUMNS = `
- id, name, dba_name, slug, city, state, zip, address, latitude, longitude,
- menu_type, menu_url, platform_dispensary_id, website,
- provider_detection_data, created_at, updated_at
-`;
-const scheduler_1 = require("../services/scheduler");
-const router = (0, express_1.Router)();
-// ============================================================
-// DASHBOARD
-// ============================================================
-/**
- * GET /api/dutchie-az/dashboard
- * Dashboard stats overview
- */
-router.get('/dashboard', async (_req, res) => {
- try {
- const { rows } = await (0, connection_1.query)(`SELECT * FROM v_dashboard_stats`);
- const stats = rows[0] || {};
- res.json({
- dispensaryCount: parseInt(stats.dispensary_count || '0', 10),
- productCount: parseInt(stats.product_count || '0', 10),
- snapshotCount24h: parseInt(stats.snapshots_24h || '0', 10),
- lastCrawlTime: stats.last_crawl_time,
- failedJobCount: parseInt(stats.failed_jobs_24h || '0', 10),
- brandCount: parseInt(stats.brand_count || '0', 10),
- categoryCount: parseInt(stats.category_count || '0', 10),
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-// ============================================================
-// DISPENSARIES (STORES)
-// ============================================================
-/**
- * GET /api/dutchie-az/stores
- * List all stores with optional filters
- */
-router.get('/stores', async (req, res) => {
- try {
- const { city, hasPlatformId, limit = '100', offset = '0' } = req.query;
- let whereClause = 'WHERE state = \'AZ\'';
- const params = [];
- let paramIndex = 1;
- if (city) {
- whereClause += ` AND city = $${paramIndex}`;
- params.push(city);
- paramIndex++;
- }
- if (hasPlatformId === 'true') {
- whereClause += ' AND platform_dispensary_id IS NOT NULL';
- }
- else if (hasPlatformId === 'false') {
- whereClause += ' AND platform_dispensary_id IS NULL';
- }
- params.push(parseInt(limit, 10), parseInt(offset, 10));
- const { rows, rowCount } = await (0, connection_1.query)(`
- SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
- ${whereClause}
- ORDER BY name
- LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
- `, params);
- // Get total count
- const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM dispensaries ${whereClause}`, params.slice(0, -2));
- res.json({
- stores: rows,
- total: parseInt(countRows[0]?.total || '0', 10),
- limit: parseInt(limit, 10),
- offset: parseInt(offset, 10),
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/stores/slug/:slug
- * Resolve a store by slug (case-insensitive) or platform_dispensary_id
- */
-router.get('/stores/slug/:slug', async (req, res) => {
- try {
- const { slug } = req.params;
- const normalized = slug.toLowerCase();
- const { rows } = await (0, connection_1.query)(`
- SELECT ${DISPENSARY_COLUMNS}
- FROM dispensaries
- WHERE lower(slug) = $1
- OR lower(platform_dispensary_id) = $1
- LIMIT 1
- `, [normalized]);
- if (!rows || rows.length === 0) {
- return res.status(404).json({ error: 'Store not found' });
- }
- res.json(rows[0]);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/stores/:id
- * Get a single store by ID
- */
-router.get('/stores/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const store = await (0, discovery_1.getDispensaryById)(parseInt(id, 10));
- if (!store) {
- return res.status(404).json({ error: 'Store not found' });
- }
- res.json(store);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/stores/:id/summary
- * Get store summary with product count, categories, and brands
- * This is the main endpoint for the DispensaryDetail panel
- */
-router.get('/stores/:id/summary', async (req, res) => {
- try {
- const { id } = req.params;
- // Get dispensary info
- const { rows: dispensaryRows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [parseInt(id, 10)]);
- if (dispensaryRows.length === 0) {
- return res.status(404).json({ error: 'Store not found' });
- }
- const dispensary = dispensaryRows[0];
- // Get product counts by stock status
- const { rows: countRows } = await (0, connection_1.query)(`
- SELECT
- COUNT(*) as total_products,
- COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count,
- COUNT(*) FILTER (WHERE stock_status = 'out_of_stock') as out_of_stock_count,
- COUNT(*) FILTER (WHERE stock_status = 'unknown') as unknown_count,
- COUNT(*) FILTER (WHERE stock_status = 'missing_from_feed') as missing_count
- FROM dutchie_products
- WHERE dispensary_id = $1
- `, [id]);
- // Get categories with counts for this store
- const { rows: categories } = await (0, connection_1.query)(`
- SELECT
- type,
- subcategory,
- COUNT(*) as product_count
- FROM dutchie_products
- WHERE dispensary_id = $1 AND type IS NOT NULL
- GROUP BY type, subcategory
- ORDER BY type, subcategory
- `, [id]);
- // Get brands with counts for this store
- const { rows: brands } = await (0, connection_1.query)(`
- SELECT
- brand_name,
- COUNT(*) as product_count
- FROM dutchie_products
- WHERE dispensary_id = $1 AND brand_name IS NOT NULL
- GROUP BY brand_name
- ORDER BY product_count DESC
- `, [id]);
- // Get last crawl info
- const { rows: lastCrawl } = await (0, connection_1.query)(`
- SELECT
- id,
- status,
- started_at,
- completed_at,
- products_found,
- products_new,
- products_updated,
- error_message
- FROM dispensary_crawl_jobs
- WHERE dispensary_id = $1
- ORDER BY created_at DESC
- LIMIT 1
- `, [id]);
- const counts = countRows[0] || {};
- res.json({
- dispensary,
- totalProducts: parseInt(counts.total_products || '0', 10),
- inStockCount: parseInt(counts.in_stock_count || '0', 10),
- outOfStockCount: parseInt(counts.out_of_stock_count || '0', 10),
- unknownStockCount: parseInt(counts.unknown_count || '0', 10),
- missingFromFeedCount: parseInt(counts.missing_count || '0', 10),
- categories,
- brands,
- brandCount: brands.length,
- categoryCount: categories.length,
- lastCrawl: lastCrawl[0] || null,
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/stores/:id/products
- * Get paginated products for a store with latest snapshot data
- */
-router.get('/stores/:id/products', async (req, res) => {
- try {
- const { id } = req.params;
- const { stockStatus, type, subcategory, brandName, search, limit = '50', offset = '0', } = req.query;
- let whereClause = 'WHERE p.dispensary_id = $1';
- const params = [parseInt(id, 10)];
- let paramIndex = 2;
- if (stockStatus) {
- whereClause += ` AND p.stock_status = $${paramIndex}`;
- params.push(stockStatus);
- paramIndex++;
- }
- if (type) {
- whereClause += ` AND p.type = $${paramIndex}`;
- params.push(type);
- paramIndex++;
- }
- if (subcategory) {
- whereClause += ` AND p.subcategory = $${paramIndex}`;
- params.push(subcategory);
- paramIndex++;
- }
- if (brandName) {
- whereClause += ` AND p.brand_name ILIKE $${paramIndex}`;
- params.push(`%${brandName}%`);
- paramIndex++;
- }
- if (search) {
- whereClause += ` AND (p.name ILIKE $${paramIndex} OR p.brand_name ILIKE $${paramIndex})`;
- params.push(`%${search}%`);
- paramIndex++;
- }
- params.push(parseInt(limit, 10), parseInt(offset, 10));
- // Get products with their latest snapshot data
- const { rows: products } = await (0, connection_1.query)(`
- SELECT
- p.id,
- p.external_product_id,
- p.name,
- p.brand_name,
- p.type,
- p.subcategory,
- p.strain_type,
- p.stock_status,
- p.created_at,
- p.updated_at,
- p.primary_image_url,
- p.thc_content,
- p.cbd_content,
- -- Latest snapshot data (prices in cents)
- s.rec_min_price_cents,
- s.rec_max_price_cents,
- s.med_min_price_cents,
- s.med_max_price_cents,
- s.rec_min_special_price_cents,
- s.med_min_special_price_cents,
- s.total_quantity_available,
- s.options,
- s.stock_status as snapshot_stock_status,
- s.crawled_at as snapshot_at
- FROM dutchie_products p
- LEFT JOIN LATERAL (
- SELECT * FROM dutchie_product_snapshots
- WHERE dutchie_product_id = p.id
- ORDER BY crawled_at DESC
- LIMIT 1
- ) s ON true
- ${whereClause}
- ORDER BY p.updated_at DESC
- LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
- `, params);
- // Get total count
- const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM dutchie_products p ${whereClause}`, params.slice(0, -2));
- // Transform products for frontend compatibility
- const transformedProducts = products.map((p) => ({
- id: p.id,
- external_id: p.external_product_id,
- name: p.name,
- brand: p.brand_name,
- type: p.type,
- subcategory: p.subcategory,
- strain_type: p.strain_type,
- stock_status: p.snapshot_stock_status || p.stock_status,
- in_stock: (p.snapshot_stock_status || p.stock_status) === 'in_stock',
- // Prices from latest snapshot (convert cents to dollars)
- regular_price: p.rec_min_price_cents ? p.rec_min_price_cents / 100 : null,
- regular_price_max: p.rec_max_price_cents ? p.rec_max_price_cents / 100 : null,
- sale_price: p.rec_min_special_price_cents ? p.rec_min_special_price_cents / 100 : null,
- med_price: p.med_min_price_cents ? p.med_min_price_cents / 100 : null,
- med_price_max: p.med_max_price_cents ? p.med_max_price_cents / 100 : null,
- med_sale_price: p.med_min_special_price_cents ? p.med_min_special_price_cents / 100 : null,
- // Potency from products table
- thc_percentage: p.thc_content,
- cbd_percentage: p.cbd_content,
- // Images from products table
- image_url: p.primary_image_url,
- // Other
- options: p.options,
- total_quantity: p.total_quantity_available,
- // Timestamps
- created_at: p.created_at,
- updated_at: p.updated_at,
- snapshot_at: p.snapshot_at,
- }));
- res.json({
- products: transformedProducts,
- total: parseInt(countRows[0]?.total || '0', 10),
- limit: parseInt(limit, 10),
- offset: parseInt(offset, 10),
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/stores/:id/brands
- * Get brands for a specific store
- */
-router.get('/stores/:id/brands', async (req, res) => {
- try {
- const { id } = req.params;
- const { rows: brands } = await (0, connection_1.query)(`
- SELECT
- brand_name as brand,
- COUNT(*) as product_count
- FROM dutchie_products
- WHERE dispensary_id = $1 AND brand_name IS NOT NULL
- GROUP BY brand_name
- ORDER BY product_count DESC
- `, [parseInt(id, 10)]);
- res.json({ brands });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/stores/:id/categories
- * Get categories for a specific store
- */
-router.get('/stores/:id/categories', async (req, res) => {
- try {
- const { id } = req.params;
- const { rows: categories } = await (0, connection_1.query)(`
- SELECT
- type,
- subcategory,
- COUNT(*) as product_count
- FROM dutchie_products
- WHERE dispensary_id = $1 AND type IS NOT NULL
- GROUP BY type, subcategory
- ORDER BY type, subcategory
- `, [parseInt(id, 10)]);
- res.json({ categories });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-// ============================================================
-// PRODUCTS
-// ============================================================
-/**
- * GET /api/dutchie-az/products
- * List products with filtering on our own DB
- */
-router.get('/products', async (req, res) => {
- try {
- const { storeId, stockStatus, type, subcategory, brandName, search, limit = '50', offset = '0', } = req.query;
- let whereClause = 'WHERE 1=1';
- const params = [];
- let paramIndex = 1;
- if (storeId) {
- whereClause += ` AND dispensary_id = $${paramIndex}`;
- params.push(parseInt(storeId, 10));
- paramIndex++;
- }
- if (stockStatus) {
- whereClause += ` AND stock_status = $${paramIndex}`;
- params.push(stockStatus);
- paramIndex++;
- }
- if (type) {
- whereClause += ` AND type = $${paramIndex}`;
- params.push(type);
- paramIndex++;
- }
- if (subcategory) {
- whereClause += ` AND subcategory = $${paramIndex}`;
- params.push(subcategory);
- paramIndex++;
- }
- if (brandName) {
- whereClause += ` AND brand_name ILIKE $${paramIndex}`;
- params.push(`%${brandName}%`);
- paramIndex++;
- }
- if (search) {
- whereClause += ` AND (name ILIKE $${paramIndex} OR brand_name ILIKE $${paramIndex})`;
- params.push(`%${search}%`);
- paramIndex++;
- }
- params.push(parseInt(limit, 10), parseInt(offset, 10));
- const { rows } = await (0, connection_1.query)(`
- SELECT
- p.*,
- d.name as store_name,
- d.city as store_city
- FROM dutchie_products p
- JOIN dispensaries d ON p.dispensary_id = d.id
- ${whereClause}
- ORDER BY p.updated_at DESC
- LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
- `, params);
- // Get total count
- const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM dutchie_products ${whereClause}`, params.slice(0, -2));
- res.json({
- products: rows,
- total: parseInt(countRows[0]?.total || '0', 10),
- limit: parseInt(limit, 10),
- offset: parseInt(offset, 10),
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/products/:id
- * Get a single product with its latest snapshot
- */
-router.get('/products/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const { rows: productRows } = await (0, connection_1.query)(`
- SELECT
- p.*,
- d.name as store_name,
- d.city as store_city,
- d.slug as store_slug
- FROM dutchie_products p
- JOIN dispensaries d ON p.dispensary_id = d.id
- WHERE p.id = $1
- `, [id]);
- if (productRows.length === 0) {
- return res.status(404).json({ error: 'Product not found' });
- }
- // Get latest snapshot
- const { rows: snapshotRows } = await (0, connection_1.query)(`
- SELECT * FROM dutchie_product_snapshots
- WHERE dutchie_product_id = $1
- ORDER BY crawled_at DESC
- LIMIT 1
- `, [id]);
- res.json({
- product: productRows[0],
- latestSnapshot: snapshotRows[0] || null,
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/products/:id/snapshots
- * Get snapshot history for a product
- */
-router.get('/products/:id/snapshots', async (req, res) => {
- try {
- const { id } = req.params;
- const { limit = '50', offset = '0' } = req.query;
- const { rows } = await (0, connection_1.query)(`
- SELECT * FROM dutchie_product_snapshots
- WHERE dutchie_product_id = $1
- ORDER BY crawled_at DESC
- LIMIT $2 OFFSET $3
- `, [id, parseInt(limit, 10), parseInt(offset, 10)]);
- res.json({ snapshots: rows });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-// ============================================================
-// CATEGORIES
-// ============================================================
-/**
- * GET /api/dutchie-az/categories
- * Get all categories with counts
- */
-router.get('/categories', async (_req, res) => {
- try {
- const { rows } = await (0, connection_1.query)(`SELECT * FROM v_categories`);
- res.json({ categories: rows });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-// ============================================================
-// BRANDS
-// ============================================================
-/**
- * GET /api/dutchie-az/brands
- * Get all brands with counts
- */
-router.get('/brands', async (req, res) => {
- try {
- const { limit = '100', offset = '0' } = req.query;
- const { rows } = await (0, connection_1.query)(`
- SELECT * FROM v_brands
- LIMIT $1 OFFSET $2
- `, [parseInt(limit, 10), parseInt(offset, 10)]);
- res.json({ brands: rows });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-// ============================================================
-// ADMIN ACTIONS
-// ============================================================
-/**
- * POST /api/dutchie-az/admin/init-schema
- * Initialize the database schema
- */
-router.post('/admin/init-schema', async (_req, res) => {
- try {
- await (0, schema_1.ensureSchema)();
- res.json({ success: true, message: 'Schema initialized' });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/import-azdhs
- * Import dispensaries from AZDHS (main database)
- */
-router.post('/admin/import-azdhs', async (_req, res) => {
- try {
- const result = await (0, azdhs_import_1.importAZDHSDispensaries)();
- res.json(result);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/resolve-platform-ids
- * Resolve Dutchie platform IDs for all dispensaries
- */
-router.post('/admin/resolve-platform-ids', async (_req, res) => {
- try {
- const result = await (0, discovery_1.resolvePlatformDispensaryIds)();
- res.json(result);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/crawl-store/:id
- * Crawl a single store
- */
-router.post('/admin/crawl-store/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const { pricingType = 'rec', useBothModes = true } = req.body;
- const dispensary = await (0, discovery_1.getDispensaryById)(parseInt(id, 10));
- if (!dispensary) {
- return res.status(404).json({ error: 'Store not found' });
- }
- const result = await (0, product_crawler_1.crawlDispensaryProducts)(dispensary, pricingType, { useBothModes });
- res.json(result);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/admin/stats
- * Get import and crawl statistics
- */
-router.get('/admin/stats', async (_req, res) => {
- try {
- const importStats = await (0, azdhs_import_1.getImportStats)();
- // Get stock status distribution
- const { rows: stockStats } = await (0, connection_1.query)(`
- SELECT
- stock_status,
- COUNT(*) as count
- FROM dutchie_products
- GROUP BY stock_status
- `);
- // Get recent crawl jobs
- const { rows: recentJobs } = await (0, connection_1.query)(`
- SELECT * FROM dispensary_crawl_jobs
- ORDER BY created_at DESC
- LIMIT 10
- `);
- res.json({
- import: importStats,
- stockDistribution: stockStats,
- recentJobs,
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-// ============================================================
-// SCHEDULER ADMIN
-// ============================================================
-/**
- * GET /api/dutchie-az/admin/scheduler/status
- * Get scheduler status
- */
-router.get('/admin/scheduler/status', async (_req, res) => {
- try {
- const status = (0, scheduler_1.getSchedulerStatus)();
- res.json(status);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/scheduler/start
- * Start the scheduler
- */
-router.post('/admin/scheduler/start', async (_req, res) => {
- try {
- (0, scheduler_1.startScheduler)();
- res.json({ success: true, message: 'Scheduler started' });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/scheduler/stop
- * Stop the scheduler
- */
-router.post('/admin/scheduler/stop', async (_req, res) => {
- try {
- (0, scheduler_1.stopScheduler)();
- res.json({ success: true, message: 'Scheduler stopped' });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/scheduler/trigger
- * Trigger an immediate crawl cycle
- */
-router.post('/admin/scheduler/trigger', async (_req, res) => {
- try {
- const result = await (0, scheduler_1.triggerImmediateCrawl)();
- res.json(result);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/crawl/:id
- * Crawl a single dispensary with job tracking
- */
-router.post('/admin/crawl/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const { pricingType = 'rec', useBothModes = true } = req.body;
- // Fetch the dispensary first
- const dispensary = await (0, discovery_1.getDispensaryById)(parseInt(id, 10));
- if (!dispensary) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- const result = await (0, scheduler_1.crawlSingleDispensary)(dispensary, pricingType, { useBothModes });
- res.json(result);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-const job_queue_1 = require("../services/job-queue");
-/**
- * GET /api/dutchie-az/admin/dutchie-stores
- * Get all Dutchie stores with their crawl status
- */
-router.get('/admin/dutchie-stores', async (_req, res) => {
- try {
- const { rows } = await (0, connection_1.query)(`
- SELECT
- d.id,
- d.name,
- d.dba_name,
- d.city,
- d.state,
- d.menu_type,
- d.platform_dispensary_id,
- d.menu_url,
- d.website,
- d.last_crawl_at,
- d.consecutive_failures,
- d.failed_at,
- (
- SELECT COUNT(*)
- FROM dutchie_products
- WHERE dispensary_id = d.id
- ) as product_count,
- (
- SELECT MAX(crawled_at)
- FROM dutchie_product_snapshots s
- JOIN dutchie_products p ON s.dutchie_product_id = p.id
- WHERE p.dispensary_id = d.id
- ) as last_snapshot_at
- FROM dispensaries d
- WHERE d.menu_type = 'dutchie'
- AND d.state = 'AZ'
- ORDER BY d.name
- `);
- const ready = rows.filter((r) => r.platform_dispensary_id && !r.failed_at);
- const needsPlatformId = rows.filter((r) => !r.platform_dispensary_id && !r.failed_at);
- const failed = rows.filter((r) => r.failed_at);
- res.json({
- total: rows.length,
- ready: ready.length,
- needsPlatformId: needsPlatformId.length,
- failed: failed.length,
- stores: rows.map((r) => ({
- id: r.id,
- name: r.dba_name || r.name,
- city: r.city,
- state: r.state,
- menuType: r.menu_type,
- platformDispensaryId: r.platform_dispensary_id,
- menuUrl: r.menu_url,
- website: r.website,
- lastCrawlAt: r.last_crawl_at,
- productCount: parseInt(r.product_count || '0', 10),
- lastSnapshotAt: r.last_snapshot_at,
- status: r.failed_at
- ? 'failed'
- : r.platform_dispensary_id
- ? 'ready'
- : 'needs_platform_id',
- })),
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/crawl-all
- * Enqueue crawl jobs for ALL ready Dutchie stores
- * This is a convenience endpoint to queue all stores without triggering the scheduler
- */
-router.post('/admin/crawl-all', async (req, res) => {
- try {
- const { pricingType = 'rec', useBothModes = true } = req.body;
- // Get all "ready" dispensaries (menu_type='dutchie' AND platform_dispensary_id IS NOT NULL AND not failed)
- const { rows: rawRows } = await (0, connection_1.query)(`
- SELECT id, name, platform_dispensary_id FROM dispensaries
- WHERE state = 'AZ'
- AND menu_type = 'dutchie'
- AND platform_dispensary_id IS NOT NULL
- AND failed_at IS NULL
- ORDER BY last_crawl_at ASC NULLS FIRST
- `);
- if (rawRows.length === 0) {
- return res.json({
- success: true,
- message: 'No ready dispensaries to crawl. Run menu detection first.',
- enqueued: 0,
- skipped: 0,
- dispensaries: [],
- });
- }
- const dispensaryIds = rawRows.map((r) => r.id);
- // Bulk enqueue jobs (skips dispensaries that already have pending/running jobs)
- const { enqueued, skipped } = await (0, job_queue_1.bulkEnqueueJobs)('dutchie_product_crawl', dispensaryIds, {
- priority: 0,
- metadata: { pricingType, useBothModes },
- });
- // Get current queue stats
- const queueStats = await (0, job_queue_1.getQueueStats)();
- res.json({
- success: true,
- message: `Enqueued ${enqueued} crawl jobs for Dutchie stores`,
- totalReady: rawRows.length,
- enqueued,
- skipped,
- queueStats,
- dispensaries: rawRows.map((r) => ({
- id: r.id,
- name: r.name,
- platformDispensaryId: r.platform_dispensary_id,
- })),
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/admin/jobs
- * Get crawl job history
- */
-router.get('/admin/jobs', async (req, res) => {
- try {
- const { status, dispensaryId, limit = '50', offset = '0' } = req.query;
- let whereClause = 'WHERE 1=1';
- const params = [];
- let paramIndex = 1;
- if (status) {
- whereClause += ` AND status = $${paramIndex}`;
- params.push(status);
- paramIndex++;
- }
- if (dispensaryId) {
- whereClause += ` AND dispensary_id = $${paramIndex}`;
- params.push(parseInt(dispensaryId, 10));
- paramIndex++;
- }
- params.push(parseInt(limit, 10), parseInt(offset, 10));
- const { rows } = await (0, connection_1.query)(`
- SELECT
- cj.*,
- d.name as dispensary_name,
- d.slug as dispensary_slug
- FROM dispensary_crawl_jobs cj
- LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
- ${whereClause}
- ORDER BY cj.created_at DESC
- LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
- `, params);
- const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM dispensary_crawl_jobs ${whereClause}`, params.slice(0, -2));
- res.json({
- jobs: rows,
- total: parseInt(countRows[0]?.total || '0', 10),
- limit: parseInt(limit, 10),
- offset: parseInt(offset, 10),
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-// ============================================================
-// SCHEDULES (CONFIG CRUD)
-// ============================================================
-/**
- * GET /api/dutchie-az/admin/schedules
- * Get all schedule configurations
- */
-router.get('/admin/schedules', async (_req, res) => {
- try {
- const schedules = await (0, scheduler_1.getAllSchedules)();
- res.json({ schedules });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/admin/schedules/:id
- * Get a single schedule by ID
- */
-router.get('/admin/schedules/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const schedule = await (0, scheduler_1.getScheduleById)(parseInt(id, 10));
- if (!schedule) {
- return res.status(404).json({ error: 'Schedule not found' });
- }
- res.json(schedule);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/schedules
- * Create a new schedule
- */
-router.post('/admin/schedules', async (req, res) => {
- try {
- const { jobName, description, enabled = true, baseIntervalMinutes, jitterMinutes, jobConfig, startImmediately = false, } = req.body;
- if (!jobName || typeof baseIntervalMinutes !== 'number' || typeof jitterMinutes !== 'number') {
- return res.status(400).json({
- error: 'jobName, baseIntervalMinutes, and jitterMinutes are required',
- });
- }
- const schedule = await (0, scheduler_1.createSchedule)({
- jobName,
- description,
- enabled,
- baseIntervalMinutes,
- jitterMinutes,
- jobConfig,
- startImmediately,
- });
- res.status(201).json(schedule);
- }
- catch (error) {
- // Handle unique constraint violation
- if (error.code === '23505') {
- return res.status(409).json({ error: `Schedule "${req.body.jobName}" already exists` });
- }
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * PUT /api/dutchie-az/admin/schedules/:id
- * Update a schedule
- */
-router.put('/admin/schedules/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const { description, enabled, baseIntervalMinutes, jitterMinutes, jobConfig } = req.body;
- const schedule = await (0, scheduler_1.updateSchedule)(parseInt(id, 10), {
- description,
- enabled,
- baseIntervalMinutes,
- jitterMinutes,
- jobConfig,
- });
- if (!schedule) {
- return res.status(404).json({ error: 'Schedule not found' });
- }
- res.json(schedule);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * DELETE /api/dutchie-az/admin/schedules/:id
- * Delete a schedule
- */
-router.delete('/admin/schedules/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const deleted = await (0, scheduler_1.deleteSchedule)(parseInt(id, 10));
- if (!deleted) {
- return res.status(404).json({ error: 'Schedule not found' });
- }
- res.json({ success: true, message: 'Schedule deleted' });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/schedules/:id/trigger
- * Trigger immediate execution of a schedule
- */
-router.post('/admin/schedules/:id/trigger', async (req, res) => {
- try {
- const { id } = req.params;
- const result = await (0, scheduler_1.triggerScheduleNow)(parseInt(id, 10));
- if (!result.success) {
- return res.status(400).json({ error: result.message });
- }
- res.json(result);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/schedules/init
- * Initialize default schedules if they don't exist
- */
-router.post('/admin/schedules/init', async (_req, res) => {
- try {
- await (0, scheduler_1.initializeDefaultSchedules)();
- const schedules = await (0, scheduler_1.getAllSchedules)();
- res.json({ success: true, schedules });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/admin/schedules/:id/logs
- * Get run logs for a specific schedule
- */
-router.get('/admin/schedules/:id/logs', async (req, res) => {
- try {
- const { id } = req.params;
- const { limit = '50', offset = '0' } = req.query;
- const result = await (0, scheduler_1.getRunLogs)({
- scheduleId: parseInt(id, 10),
- limit: parseInt(limit, 10),
- offset: parseInt(offset, 10),
- });
- res.json(result);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/admin/run-logs
- * Get all run logs with filtering
- */
-router.get('/admin/run-logs', async (req, res) => {
- try {
- const { scheduleId, jobName, limit = '50', offset = '0' } = req.query;
- const result = await (0, scheduler_1.getRunLogs)({
- scheduleId: scheduleId ? parseInt(scheduleId, 10) : undefined,
- jobName: jobName,
- limit: parseInt(limit, 10),
- offset: parseInt(offset, 10),
- });
- res.json(result);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-// ============================================================
-// DEBUG ROUTES
-// ============================================================
-/**
- * GET /api/dutchie-az/debug/summary
- * Get overall system summary for debugging
- */
-router.get('/debug/summary', async (_req, res) => {
- try {
- // Get table counts
- const { rows: tableCounts } = await (0, connection_1.query)(`
- SELECT
- (SELECT COUNT(*) FROM dispensaries) as dispensary_count,
- (SELECT COUNT(*) FROM dispensaries WHERE platform_dispensary_id IS NOT NULL) as dispensaries_with_platform_id,
- (SELECT COUNT(*) FROM dutchie_products) as product_count,
- (SELECT COUNT(*) FROM dutchie_product_snapshots) as snapshot_count,
- (SELECT COUNT(*) FROM dispensary_crawl_jobs) as job_count,
- (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed') as completed_jobs,
- (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'failed') as failed_jobs
- `);
- // Get stock status distribution
- const { rows: stockDistribution } = await (0, connection_1.query)(`
- SELECT
- stock_status,
- COUNT(*) as count
- FROM dutchie_products
- GROUP BY stock_status
- ORDER BY count DESC
- `);
- // Get products by dispensary
- const { rows: productsByDispensary } = await (0, connection_1.query)(`
- SELECT
- d.id,
- d.name,
- d.slug,
- d.platform_dispensary_id,
- COUNT(p.id) as product_count,
- MAX(p.updated_at) as last_product_update
- FROM dispensaries d
- LEFT JOIN dutchie_products p ON d.id = p.dispensary_id
- WHERE d.state = 'AZ'
- GROUP BY d.id, d.name, d.slug, d.platform_dispensary_id
- ORDER BY product_count DESC
- LIMIT 20
- `);
- // Get recent snapshots
- const { rows: recentSnapshots } = await (0, connection_1.query)(`
- SELECT
- s.id,
- s.dutchie_product_id,
- p.name as product_name,
- d.name as dispensary_name,
- s.crawled_at
- FROM dutchie_product_snapshots s
- JOIN dutchie_products p ON s.dutchie_product_id = p.id
- JOIN dispensaries d ON p.dispensary_id = d.id
- ORDER BY s.crawled_at DESC
- LIMIT 10
- `);
- res.json({
- tableCounts: tableCounts[0],
- stockDistribution,
- productsByDispensary,
- recentSnapshots,
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/debug/store/:id
- * Get detailed debug info for a specific store
- */
-router.get('/debug/store/:id', async (req, res) => {
- try {
- const { id } = req.params;
- // Get dispensary info
- const { rows: dispensaryRows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [parseInt(id, 10)]);
- if (dispensaryRows.length === 0) {
- return res.status(404).json({ error: 'Store not found' });
- }
- const dispensary = dispensaryRows[0];
- // Get product stats
- const { rows: productStats } = await (0, connection_1.query)(`
- SELECT
- COUNT(*) as total_products,
- COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock,
- COUNT(*) FILTER (WHERE stock_status = 'out_of_stock') as out_of_stock,
- COUNT(*) FILTER (WHERE stock_status = 'unknown') as unknown,
- COUNT(*) FILTER (WHERE stock_status = 'missing_from_feed') as missing_from_feed,
- MIN(first_seen_at) as earliest_product,
- MAX(last_seen_at) as latest_product,
- MAX(updated_at) as last_update
- FROM dutchie_products
- WHERE dispensary_id = $1
- `, [id]);
- // Get snapshot stats
- const { rows: snapshotStats } = await (0, connection_1.query)(`
- SELECT
- COUNT(*) as total_snapshots,
- MIN(crawled_at) as earliest_snapshot,
- MAX(crawled_at) as latest_snapshot,
- COUNT(DISTINCT dutchie_product_id) as products_with_snapshots
- FROM dutchie_product_snapshots s
- JOIN dutchie_products p ON s.dutchie_product_id = p.id
- WHERE p.dispensary_id = $1
- `, [id]);
- // Get crawl job history
- const { rows: recentJobs } = await (0, connection_1.query)(`
- SELECT
- id,
- status,
- started_at,
- completed_at,
- products_found,
- products_new,
- products_updated,
- error_message,
- created_at
- FROM dispensary_crawl_jobs
- WHERE dispensary_id = $1
- ORDER BY created_at DESC
- LIMIT 10
- `, [id]);
- // Get sample products (5 in-stock, 5 out-of-stock)
- const { rows: sampleInStock } = await (0, connection_1.query)(`
- SELECT
- p.id,
- p.name,
- p.brand_name,
- p.type,
- p.stock_status,
- p.updated_at
- FROM dutchie_products p
- WHERE p.dispensary_id = $1 AND p.stock_status = 'in_stock'
- ORDER BY p.updated_at DESC
- LIMIT 5
- `, [id]);
- const { rows: sampleOutOfStock } = await (0, connection_1.query)(`
- SELECT
- p.id,
- p.name,
- p.brand_name,
- p.type,
- p.stock_status,
- p.updated_at
- FROM dutchie_products p
- WHERE p.dispensary_id = $1 AND p.stock_status = 'out_of_stock'
- ORDER BY p.updated_at DESC
- LIMIT 5
- `, [id]);
- // Get categories breakdown
- const { rows: categories } = await (0, connection_1.query)(`
- SELECT
- type,
- subcategory,
- COUNT(*) as count
- FROM dutchie_products
- WHERE dispensary_id = $1
- GROUP BY type, subcategory
- ORDER BY count DESC
- `, [id]);
- res.json({
- dispensary,
- productStats: productStats[0],
- snapshotStats: snapshotStats[0],
- recentJobs,
- sampleProducts: {
- inStock: sampleInStock,
- outOfStock: sampleOutOfStock,
- },
- categories,
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-// ============================================================
-// LIVE CRAWLER STATUS ROUTES
-// ============================================================
-const job_queue_2 = require("../services/job-queue");
-/**
- * GET /api/dutchie-az/monitor/active-jobs
- * Get all currently running jobs with real-time status including worker info
- */
-router.get('/monitor/active-jobs', async (_req, res) => {
- try {
- // Get running jobs from job_run_logs (scheduled jobs like "enqueue all")
- const { rows: runningScheduledJobs } = await (0, connection_1.query)(`
- SELECT
- jrl.id,
- jrl.schedule_id,
- jrl.job_name,
- jrl.status,
- jrl.started_at,
- jrl.items_processed,
- jrl.items_succeeded,
- jrl.items_failed,
- jrl.metadata,
- js.description as job_description,
- EXTRACT(EPOCH FROM (NOW() - jrl.started_at)) as duration_seconds
- FROM job_run_logs jrl
- LEFT JOIN job_schedules js ON jrl.schedule_id = js.id
- WHERE jrl.status = 'running'
- ORDER BY jrl.started_at DESC
- `);
- // Get running crawl jobs (individual store crawls with worker info)
- const { rows: runningCrawlJobs } = await (0, connection_1.query)(`
- SELECT
- cj.id,
- cj.job_type,
- cj.dispensary_id,
- d.name as dispensary_name,
- d.city,
- d.platform_dispensary_id,
- cj.status,
- cj.started_at,
- cj.claimed_by as worker_id,
- cj.worker_hostname,
- cj.claimed_at,
- cj.products_found,
- cj.products_upserted,
- cj.snapshots_created,
- cj.current_page,
- cj.total_pages,
- cj.last_heartbeat_at,
- cj.retry_count,
- cj.metadata,
- EXTRACT(EPOCH FROM (NOW() - cj.started_at)) as duration_seconds
- FROM dispensary_crawl_jobs cj
- LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
- WHERE cj.status = 'running'
- ORDER BY cj.started_at DESC
- `);
- // Get queue stats
- const queueStats = await (0, job_queue_2.getQueueStats)();
- // Get active workers
- const activeWorkers = await (0, job_queue_2.getActiveWorkers)();
- // Also get in-memory scrapers if any (from the legacy system)
- let inMemoryScrapers = [];
- try {
- const { activeScrapers } = await Promise.resolve().then(() => __importStar(require('../../routes/scraper-monitor')));
- inMemoryScrapers = Array.from(activeScrapers.values()).map(scraper => ({
- ...scraper,
- source: 'in_memory',
- duration_seconds: (Date.now() - scraper.startTime.getTime()) / 1000,
- }));
- }
- catch {
- // Legacy scraper monitor not available
- }
- res.json({
- scheduledJobs: runningScheduledJobs,
- crawlJobs: runningCrawlJobs,
- inMemoryScrapers,
- activeWorkers,
- queueStats,
- totalActive: runningScheduledJobs.length + runningCrawlJobs.length + inMemoryScrapers.length,
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/monitor/recent-jobs
- * Get recent completed jobs
- */
-router.get('/monitor/recent-jobs', async (req, res) => {
- try {
- const { limit = '50' } = req.query;
- const limitNum = Math.min(parseInt(limit, 10), 200);
- // Recent job run logs
- const { rows: recentJobLogs } = await (0, connection_1.query)(`
- SELECT
- jrl.id,
- jrl.schedule_id,
- jrl.job_name,
- jrl.status,
- jrl.started_at,
- jrl.completed_at,
- jrl.duration_ms,
- jrl.error_message,
- jrl.items_processed,
- jrl.items_succeeded,
- jrl.items_failed,
- jrl.metadata,
- js.description as job_description
- FROM job_run_logs jrl
- LEFT JOIN job_schedules js ON jrl.schedule_id = js.id
- ORDER BY jrl.created_at DESC
- LIMIT $1
- `, [limitNum]);
- // Recent crawl jobs
- const { rows: recentCrawlJobs } = await (0, connection_1.query)(`
- SELECT
- cj.id,
- cj.job_type,
- cj.dispensary_id,
- d.name as dispensary_name,
- d.city,
- cj.status,
- cj.started_at,
- cj.completed_at,
- cj.error_message,
- cj.products_found,
- cj.snapshots_created,
- cj.metadata,
- EXTRACT(EPOCH FROM (COALESCE(cj.completed_at, NOW()) - cj.started_at)) * 1000 as duration_ms
- FROM dispensary_crawl_jobs cj
- LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
- ORDER BY cj.created_at DESC
- LIMIT $1
- `, [limitNum]);
- res.json({
- jobLogs: recentJobLogs,
- crawlJobs: recentCrawlJobs,
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/monitor/errors
- * Get recent job errors
- */
-router.get('/monitor/errors', async (req, res) => {
- try {
- const { limit = '20', hours = '24' } = req.query;
- const limitNum = Math.min(parseInt(limit, 10), 100);
- const hoursNum = Math.min(parseInt(hours, 10), 168);
- // Errors from job_run_logs
- const { rows: jobErrors } = await (0, connection_1.query)(`
- SELECT
- 'job_run_log' as source,
- jrl.id,
- jrl.job_name,
- jrl.status,
- jrl.started_at,
- jrl.completed_at,
- jrl.error_message,
- jrl.items_processed,
- jrl.items_failed,
- jrl.metadata
- FROM job_run_logs jrl
- WHERE jrl.status IN ('error', 'partial')
- AND jrl.created_at > NOW() - INTERVAL '${hoursNum} hours'
- ORDER BY jrl.created_at DESC
- LIMIT $1
- `, [limitNum]);
- // Errors from dispensary_crawl_jobs
- const { rows: crawlErrors } = await (0, connection_1.query)(`
- SELECT
- 'crawl_job' as source,
- cj.id,
- cj.job_type as job_name,
- d.name as dispensary_name,
- cj.status,
- cj.started_at,
- cj.completed_at,
- cj.error_message,
- cj.products_found as items_processed,
- cj.metadata
- FROM dispensary_crawl_jobs cj
- LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
- WHERE cj.status = 'failed'
- AND cj.created_at > NOW() - INTERVAL '${hoursNum} hours'
- ORDER BY cj.created_at DESC
- LIMIT $1
- `, [limitNum]);
- res.json({
- errors: [...jobErrors, ...crawlErrors].sort((a, b) => new Date(b.started_at || b.created_at).getTime() -
- new Date(a.started_at || a.created_at).getTime()).slice(0, limitNum),
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/monitor/summary
- * Get overall monitoring summary
- */
-router.get('/monitor/summary', async (_req, res) => {
- try {
- const { rows: stats } = await (0, connection_1.query)(`
- SELECT
- (SELECT COUNT(*) FROM job_run_logs WHERE status = 'running') as running_scheduled_jobs,
- (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'running') as running_dispensary_crawl_jobs,
- (SELECT COUNT(*) FROM job_run_logs WHERE status = 'success' AND created_at > NOW() - INTERVAL '24 hours') as successful_jobs_24h,
- (SELECT COUNT(*) FROM job_run_logs WHERE status IN ('error', 'partial') AND created_at > NOW() - INTERVAL '24 hours') as failed_jobs_24h,
- (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed' AND created_at > NOW() - INTERVAL '24 hours') as successful_crawls_24h,
- (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'failed' AND created_at > NOW() - INTERVAL '24 hours') as failed_crawls_24h,
- (SELECT SUM(products_found) FROM dispensary_crawl_jobs WHERE status = 'completed' AND created_at > NOW() - INTERVAL '24 hours') as products_found_24h,
- (SELECT SUM(snapshots_created) FROM dispensary_crawl_jobs WHERE status = 'completed' AND created_at > NOW() - INTERVAL '24 hours') as snapshots_created_24h,
- (SELECT MAX(started_at) FROM job_run_logs) as last_job_started,
- (SELECT MAX(completed_at) FROM job_run_logs WHERE status = 'success') as last_job_completed
- `);
- // Get next scheduled runs
- const { rows: nextRuns } = await (0, connection_1.query)(`
- SELECT
- id,
- job_name,
- description,
- enabled,
- next_run_at,
- last_status,
- last_run_at
- FROM job_schedules
- WHERE enabled = true AND next_run_at IS NOT NULL
- ORDER BY next_run_at ASC
- LIMIT 5
- `);
- res.json({
- ...(stats[0] || {}),
- nextRuns,
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-// ============================================================
-// MENU DETECTION ROUTES
-// ============================================================
-const menu_detection_1 = require("../services/menu-detection");
-/**
- * GET /api/dutchie-az/admin/detection/stats
- * Get menu detection statistics
- */
-router.get('/admin/detection/stats', async (_req, res) => {
- try {
- const stats = await (0, menu_detection_1.getDetectionStats)();
- res.json(stats);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/admin/detection/pending
- * Get dispensaries that need menu detection
- */
-router.get('/admin/detection/pending', async (req, res) => {
- try {
- const { state = 'AZ', limit = '100' } = req.query;
- const dispensaries = await (0, menu_detection_1.getDispensariesNeedingDetection)({
- state: state,
- limit: parseInt(limit, 10),
- });
- res.json({ dispensaries, total: dispensaries.length });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/detection/detect/:id
- * Detect menu provider and resolve platform ID for a single dispensary
- */
-router.post('/admin/detection/detect/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const result = await (0, menu_detection_1.detectAndResolveDispensary)(parseInt(id, 10));
- res.json(result);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/detection/detect-all
- * Run bulk menu detection on all dispensaries needing it
- */
-router.post('/admin/detection/detect-all', async (req, res) => {
- try {
- const { state = 'AZ', onlyUnknown = true, onlyMissingPlatformId = false, limit } = req.body;
- const result = await (0, menu_detection_1.runBulkDetection)({
- state,
- onlyUnknown,
- onlyMissingPlatformId,
- limit,
- });
- res.json(result);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/detection/trigger
- * Trigger the menu detection scheduled job immediately
- */
-router.post('/admin/detection/trigger', async (_req, res) => {
- try {
- // Find the menu detection schedule and trigger it
- const schedules = await (0, scheduler_1.getAllSchedules)();
- const menuDetection = schedules.find(s => s.jobName === 'dutchie_az_menu_detection');
- if (!menuDetection) {
- return res.status(404).json({ error: 'Menu detection schedule not found. Run /admin/schedules/init first.' });
- }
- const result = await (0, scheduler_1.triggerScheduleNow)(menuDetection.id);
- res.json(result);
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-// ============================================================
-// FAILED DISPENSARIES ROUTES
-// ============================================================
-/**
- * GET /api/dutchie-az/admin/dispensaries/failed
- * Get all dispensaries flagged as failed (for admin review)
- */
-router.get('/admin/dispensaries/failed', async (_req, res) => {
- try {
- const { rows } = await (0, connection_1.query)(`
- SELECT
- id,
- name,
- city,
- state,
- menu_url,
- menu_type,
- platform_dispensary_id,
- consecutive_failures,
- last_failure_at,
- last_failure_reason,
- failed_at,
- failure_notes,
- last_crawl_at,
- updated_at
- FROM dispensaries
- WHERE failed_at IS NOT NULL
- ORDER BY failed_at DESC
- `);
- res.json({
- failed: rows,
- total: rows.length,
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/admin/dispensaries/at-risk
- * Get dispensaries with high failure counts (but not yet flagged as failed)
- */
-router.get('/admin/dispensaries/at-risk', async (_req, res) => {
- try {
- const { rows } = await (0, connection_1.query)(`
- SELECT
- id,
- name,
- city,
- state,
- menu_url,
- menu_type,
- consecutive_failures,
- last_failure_at,
- last_failure_reason,
- last_crawl_at
- FROM dispensaries
- WHERE consecutive_failures >= 1
- AND failed_at IS NULL
- ORDER BY consecutive_failures DESC, last_failure_at DESC
- `);
- res.json({
- atRisk: rows,
- total: rows.length,
- });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/dispensaries/:id/unfail
- * Restore a failed dispensary - clears failed status and resets for re-detection
- */
-router.post('/admin/dispensaries/:id/unfail', async (req, res) => {
- try {
- const { id } = req.params;
- await (0, connection_1.query)(`
- UPDATE dispensaries
- SET failed_at = NULL,
- consecutive_failures = 0,
- last_failure_at = NULL,
- last_failure_reason = NULL,
- failure_notes = NULL,
- menu_type = NULL,
- platform_dispensary_id = NULL,
- updated_at = NOW()
- WHERE id = $1
- `, [id]);
- res.json({ success: true, message: `Dispensary ${id} restored for re-detection` });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/dutchie-az/admin/dispensaries/:id/reset-failures
- * Reset failure counter for a dispensary (without unflagging)
- */
-router.post('/admin/dispensaries/:id/reset-failures', async (req, res) => {
- try {
- const { id } = req.params;
- await (0, connection_1.query)(`
- UPDATE dispensaries
- SET consecutive_failures = 0,
- last_failure_at = NULL,
- last_failure_reason = NULL,
- updated_at = NOW()
- WHERE id = $1
- `, [id]);
- res.json({ success: true, message: `Failure counter reset for dispensary ${id}` });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/dutchie-az/admin/dispensaries/health-summary
- * Get a summary of dispensary health status
- */
-router.get('/admin/dispensaries/health-summary', async (_req, res) => {
- try {
- const { rows } = await (0, connection_1.query)(`
- SELECT
- COUNT(*) as total,
- COUNT(*) FILTER (WHERE state = 'AZ') as arizona_total,
- COUNT(*) FILTER (WHERE failed_at IS NOT NULL) as failed,
- COUNT(*) FILTER (WHERE consecutive_failures >= 1 AND failed_at IS NULL) as at_risk,
- COUNT(*) FILTER (WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL AND failed_at IS NULL) as ready_to_crawl,
- COUNT(*) FILTER (WHERE menu_type = 'dutchie' AND failed_at IS NULL) as dutchie_detected,
- COUNT(*) FILTER (WHERE (menu_type IS NULL OR menu_type = 'unknown') AND failed_at IS NULL) as needs_detection,
- COUNT(*) FILTER (WHERE menu_type NOT IN ('dutchie', 'unknown') AND menu_type IS NOT NULL AND failed_at IS NULL) as non_dutchie
- FROM dispensaries
- WHERE state = 'AZ'
- `);
- res.json(rows[0] || {});
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-exports.default = router;
diff --git a/backend/dist/dutchie-az/services/azdhs-import.js b/backend/dist/dutchie-az/services/azdhs-import.js
deleted file mode 100644
index bad6cdcf..00000000
--- a/backend/dist/dutchie-az/services/azdhs-import.js
+++ /dev/null
@@ -1,229 +0,0 @@
-"use strict";
-/**
- * AZDHS Import Service
- *
- * Imports Arizona dispensaries from the main database's dispensaries table
- * (which was populated from AZDHS data) into the isolated Dutchie AZ database.
- *
- * This establishes the canonical list of AZ dispensaries to match against Dutchie.
- */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.importAZDHSDispensaries = importAZDHSDispensaries;
-exports.importFromJSON = importFromJSON;
-exports.getImportStats = getImportStats;
-const pg_1 = require("pg");
-const connection_1 = require("../db/connection");
-// Main database connection (source of AZDHS data)
-const MAIN_DATABASE_URL = process.env.DATABASE_URL ||
- 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
-/**
- * Create a temporary connection to the main database
- */
-function getMainDBPool() {
- return new pg_1.Pool({
- connectionString: MAIN_DATABASE_URL,
- max: 5,
- idleTimeoutMillis: 30000,
- connectionTimeoutMillis: 5000,
- });
-}
-/**
- * Fetch all AZ dispensaries from the main database
- */
-async function fetchAZDHSDispensaries() {
- const pool = getMainDBPool();
- try {
- const result = await pool.query(`
- SELECT
- id, azdhs_id, name, company_name, address, city, state, zip,
- latitude, longitude, dba_name, phone, email, website,
- google_rating, google_review_count, slug,
- menu_provider, product_provider,
- created_at, updated_at
- FROM dispensaries
- WHERE state = 'AZ'
- ORDER BY id
- `);
- return result.rows;
- }
- finally {
- await pool.end();
- }
-}
-/**
- * Import a single dispensary into the Dutchie AZ database
- */
-async function importDispensary(disp) {
- const result = await (0, connection_1.query)(`
- INSERT INTO dispensaries (
- platform, name, slug, city, state, postal_code, address,
- latitude, longitude, is_delivery, is_pickup, raw_metadata, updated_at
- ) VALUES (
- $1, $2, $3, $4, $5, $6, $7,
- $8, $9, $10, $11, $12, NOW()
- )
- ON CONFLICT (platform, slug, city, state) DO UPDATE SET
- name = EXCLUDED.name,
- postal_code = EXCLUDED.postal_code,
- address = EXCLUDED.address,
- latitude = EXCLUDED.latitude,
- longitude = EXCLUDED.longitude,
- raw_metadata = EXCLUDED.raw_metadata,
- updated_at = NOW()
- RETURNING id
- `, [
- 'dutchie', // Will be updated when Dutchie match is found
- disp.dba_name || disp.name,
- disp.slug,
- disp.city,
- disp.state,
- disp.zip,
- disp.address,
- disp.latitude,
- disp.longitude,
- false, // is_delivery - unknown
- true, // is_pickup - assume true
- JSON.stringify({
- azdhs_id: disp.azdhs_id,
- main_db_id: disp.id,
- company_name: disp.company_name,
- phone: disp.phone,
- email: disp.email,
- website: disp.website,
- google_rating: disp.google_rating,
- google_review_count: disp.google_review_count,
- menu_provider: disp.menu_provider,
- product_provider: disp.product_provider,
- }),
- ]);
- return result.rows[0].id;
-}
-/**
- * Import all AZDHS dispensaries into the Dutchie AZ database
- */
-async function importAZDHSDispensaries() {
- console.log('[AZDHS Import] Starting import from main database...');
- const result = {
- total: 0,
- imported: 0,
- skipped: 0,
- errors: [],
- };
- try {
- const dispensaries = await fetchAZDHSDispensaries();
- result.total = dispensaries.length;
- console.log(`[AZDHS Import] Found ${dispensaries.length} AZ dispensaries in main DB`);
- for (const disp of dispensaries) {
- try {
- const id = await importDispensary(disp);
- result.imported++;
- console.log(`[AZDHS Import] Imported: ${disp.name} (${disp.city}) -> id=${id}`);
- }
- catch (error) {
- if (error.message.includes('duplicate')) {
- result.skipped++;
- }
- else {
- result.errors.push(`${disp.name}: ${error.message}`);
- }
- }
- }
- }
- catch (error) {
- result.errors.push(`Failed to fetch from main DB: ${error.message}`);
- }
- console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped, ${result.errors.length} errors`);
- return result;
-}
-/**
- * Import dispensaries from JSON file (backup export)
- */
-async function importFromJSON(jsonPath) {
- console.log(`[AZDHS Import] Importing from JSON: ${jsonPath}`);
- const result = {
- total: 0,
- imported: 0,
- skipped: 0,
- errors: [],
- };
- try {
- const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
- const data = await fs.readFile(jsonPath, 'utf-8');
- const dispensaries = JSON.parse(data);
- result.total = dispensaries.length;
- console.log(`[AZDHS Import] Found ${dispensaries.length} dispensaries in JSON file`);
- for (const disp of dispensaries) {
- try {
- const id = await importDispensary(disp);
- result.imported++;
- }
- catch (error) {
- if (error.message.includes('duplicate')) {
- result.skipped++;
- }
- else {
- result.errors.push(`${disp.name}: ${error.message}`);
- }
- }
- }
- }
- catch (error) {
- result.errors.push(`Failed to read JSON file: ${error.message}`);
- }
- console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped`);
- return result;
-}
-/**
- * Get import statistics
- */
-async function getImportStats() {
- const { rows } = await (0, connection_1.query)(`
- SELECT
- COUNT(*) as total,
- COUNT(platform_dispensary_id) as with_platform_id,
- COUNT(*) - COUNT(platform_dispensary_id) as without_platform_id,
- MAX(updated_at) as last_updated
- FROM dispensaries
- WHERE state = 'AZ'
- `);
- const stats = rows[0];
- return {
- totalDispensaries: parseInt(stats.total, 10),
- withPlatformIds: parseInt(stats.with_platform_id, 10),
- withoutPlatformIds: parseInt(stats.without_platform_id, 10),
- lastImportedAt: stats.last_updated,
- };
-}
diff --git a/backend/dist/dutchie-az/services/directory-matcher.js b/backend/dist/dutchie-az/services/directory-matcher.js
deleted file mode 100644
index 1ce11368..00000000
--- a/backend/dist/dutchie-az/services/directory-matcher.js
+++ /dev/null
@@ -1,380 +0,0 @@
-"use strict";
-/**
- * Directory-Based Store Matcher
- *
- * Scrapes provider directory pages (Curaleaf, Sol, etc.) to get store lists,
- * then matches them to existing dispensaries by fuzzy name/city/address matching.
- *
- * This allows us to:
- * 1. Find specific store URLs for directory-style websites
- * 2. Match stores confidently by name+city
- * 3. Mark non-Dutchie providers as not_crawlable until we build crawlers
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.scrapeSolDirectory = scrapeSolDirectory;
-exports.scrapeCuraleafDirectory = scrapeCuraleafDirectory;
-exports.matchDirectoryToDispensaries = matchDirectoryToDispensaries;
-exports.previewDirectoryMatches = previewDirectoryMatches;
-exports.applyHighConfidenceMatches = applyHighConfidenceMatches;
-const connection_1 = require("../db/connection");
-// ============================================================
-// NORMALIZATION FUNCTIONS
-// ============================================================
-/**
- * Normalize a string for comparison:
- * - Lowercase
- * - Remove common suffixes (dispensary, cannabis, etc.)
- * - Remove punctuation
- * - Collapse whitespace
- */
-function normalizeForComparison(str) {
- if (!str)
- return '';
- return str
- .toLowerCase()
- .replace(/\s+(dispensary|cannabis|marijuana|medical|recreational|shop|store|flower|wellness)(\s|$)/gi, ' ')
- .replace(/[^\w\s]/g, ' ') // Remove punctuation
- .replace(/\s+/g, ' ') // Collapse whitespace
- .trim();
-}
-/**
- * Normalize city name for comparison
- */
-function normalizeCity(city) {
- if (!city)
- return '';
- return city
- .toLowerCase()
- .replace(/[^\w\s]/g, '')
- .trim();
-}
-/**
- * Calculate similarity between two strings (0-1)
- * Uses Levenshtein distance normalized by max length
- */
-function stringSimilarity(a, b) {
- if (!a || !b)
- return 0;
- if (a === b)
- return 1;
- const longer = a.length > b.length ? a : b;
- const shorter = a.length > b.length ? b : a;
- if (longer.length === 0)
- return 1;
- const distance = levenshteinDistance(longer, shorter);
- return (longer.length - distance) / longer.length;
-}
-/**
- * Levenshtein distance between two strings
- */
-function levenshteinDistance(a, b) {
- const matrix = [];
- for (let i = 0; i <= b.length; i++) {
- matrix[i] = [i];
- }
- for (let j = 0; j <= a.length; j++) {
- matrix[0][j] = j;
- }
- for (let i = 1; i <= b.length; i++) {
- for (let j = 1; j <= a.length; j++) {
- if (b.charAt(i - 1) === a.charAt(j - 1)) {
- matrix[i][j] = matrix[i - 1][j - 1];
- }
- else {
- matrix[i][j] = Math.min(matrix[i - 1][j - 1] + 1, // substitution
- matrix[i][j - 1] + 1, // insertion
- matrix[i - 1][j] + 1 // deletion
- );
- }
- }
- }
- return matrix[b.length][a.length];
-}
-/**
- * Check if string contains another (with normalization)
- */
-function containsNormalized(haystack, needle) {
- return normalizeForComparison(haystack).includes(normalizeForComparison(needle));
-}
-// ============================================================
-// PROVIDER DIRECTORY SCRAPERS
-// ============================================================
-/**
- * Sol Flower (livewithsol.com) - Static HTML, easy to scrape
- */
-async function scrapeSolDirectory() {
- console.log('[DirectoryMatcher] Scraping Sol Flower directory...');
- try {
- const response = await fetch('https://www.livewithsol.com/locations/', {
- headers: {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- Accept: 'text/html',
- },
- });
- if (!response.ok) {
- throw new Error(`HTTP ${response.status}`);
- }
- const html = await response.text();
- // Extract store entries from HTML
- // Sol's structure: Each location has name, address in specific divs
- const stores = [];
- // Pattern to find location cards
- // Format: NAME with address nearby
- const locationRegex = /]+href="(\/locations\/[^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?(\d+[^<]+(?:Ave|St|Blvd|Dr|Rd|Way)[^<]*)/gi;
- let match;
- while ((match = locationRegex.exec(html)) !== null) {
- const [, path, name, address] = match;
- // Extract city from common Arizona cities
- let city = 'Unknown';
- const cityPatterns = [
- { pattern: /phoenix/i, city: 'Phoenix' },
- { pattern: /scottsdale/i, city: 'Scottsdale' },
- { pattern: /tempe/i, city: 'Tempe' },
- { pattern: /tucson/i, city: 'Tucson' },
- { pattern: /mesa/i, city: 'Mesa' },
- { pattern: /sun city/i, city: 'Sun City' },
- { pattern: /glendale/i, city: 'Glendale' },
- ];
- for (const { pattern, city: cityName } of cityPatterns) {
- if (pattern.test(name) || pattern.test(address)) {
- city = cityName;
- break;
- }
- }
- stores.push({
- name: name.trim(),
- city,
- state: 'AZ',
- address: address.trim(),
- storeUrl: `https://www.livewithsol.com${path}`,
- });
- }
- // If regex didn't work, use known hardcoded values (fallback)
- if (stores.length === 0) {
- console.log('[DirectoryMatcher] Using hardcoded Sol locations');
- return [
- { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
- { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
- { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
- { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
- { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
- { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
- { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
- { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
- { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
- ];
- }
- console.log(`[DirectoryMatcher] Found ${stores.length} Sol Flower locations`);
- return stores;
- }
- catch (error) {
- console.error('[DirectoryMatcher] Error scraping Sol directory:', error.message);
- // Return hardcoded fallback
- return [
- { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
- { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
- { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
- { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
- { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
- { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
- { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
- { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
- { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
- ];
- }
-}
-/**
- * Curaleaf - Has age-gate, so we need hardcoded AZ locations
- * In production, this would use Playwright to bypass age-gate
- */
-async function scrapeCuraleafDirectory() {
- console.log('[DirectoryMatcher] Using hardcoded Curaleaf AZ locations (age-gate blocks simple fetch)...');
- // Hardcoded Arizona Curaleaf locations from public knowledge
- // These would be scraped via Playwright in production
- return [
- { name: 'Curaleaf Phoenix Camelback', city: 'Phoenix', state: 'AZ', address: '4811 E Camelback Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-camelback' },
- { name: 'Curaleaf Phoenix Midtown', city: 'Phoenix', state: 'AZ', address: '1928 E Highland Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-midtown' },
- { name: 'Curaleaf Glendale East', city: 'Glendale', state: 'AZ', address: '5150 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-east' },
- { name: 'Curaleaf Glendale West', city: 'Glendale', state: 'AZ', address: '6501 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-west' },
- { name: 'Curaleaf Gilbert', city: 'Gilbert', state: 'AZ', address: '1736 E Williams Field Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-gilbert' },
- { name: 'Curaleaf Mesa', city: 'Mesa', state: 'AZ', address: '1540 S Power Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-mesa' },
- { name: 'Curaleaf Tempe', city: 'Tempe', state: 'AZ', address: '1815 E Broadway Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tempe' },
- { name: 'Curaleaf Scottsdale', city: 'Scottsdale', state: 'AZ', address: '8904 E Indian Bend Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-scottsdale' },
- { name: 'Curaleaf Tucson Prince', city: 'Tucson', state: 'AZ', address: '3955 W Prince Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-prince' },
- { name: 'Curaleaf Tucson Midvale', city: 'Tucson', state: 'AZ', address: '2936 N Midvale Park Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-midvale' },
- { name: 'Curaleaf Sedona', city: 'Sedona', state: 'AZ', address: '525 AZ-179', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-sedona' },
- { name: 'Curaleaf Youngtown', city: 'Youngtown', state: 'AZ', address: '11125 W Grand Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-youngtown' },
- ];
-}
-/**
- * Match a directory store to an existing dispensary
- */
-function matchStoreToDispensary(store, dispensaries) {
- const normalizedStoreName = normalizeForComparison(store.name);
- const normalizedStoreCity = normalizeCity(store.city);
- let bestMatch = null;
- let bestScore = 0;
- let matchReason = '';
- for (const disp of dispensaries) {
- const normalizedDispName = normalizeForComparison(disp.name);
- const normalizedDispCity = normalizeCity(disp.city || '');
- let score = 0;
- const reasons = [];
- // 1. Name similarity (max 50 points)
- const nameSimilarity = stringSimilarity(normalizedStoreName, normalizedDispName);
- score += nameSimilarity * 50;
- if (nameSimilarity > 0.8)
- reasons.push(`name_match(${(nameSimilarity * 100).toFixed(0)}%)`);
- // 2. City match (25 points for exact, 15 for partial)
- if (normalizedStoreCity && normalizedDispCity) {
- if (normalizedStoreCity === normalizedDispCity) {
- score += 25;
- reasons.push('city_exact');
- }
- else if (normalizedStoreCity.includes(normalizedDispCity) ||
- normalizedDispCity.includes(normalizedStoreCity)) {
- score += 15;
- reasons.push('city_partial');
- }
- }
- // 3. Address contains street name (15 points)
- if (store.address && disp.address) {
- const storeStreet = store.address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
- const dispStreet = disp.address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
- if (storeStreet && dispStreet && stringSimilarity(storeStreet, dispStreet) > 0.7) {
- score += 15;
- reasons.push('address_match');
- }
- }
- // 4. Brand name in dispensary name (10 points)
- const brandName = store.name.split(' ')[0].toLowerCase(); // e.g., "Curaleaf", "Sol"
- if (disp.name.toLowerCase().includes(brandName)) {
- score += 10;
- reasons.push('brand_match');
- }
- if (score > bestScore) {
- bestScore = score;
- bestMatch = disp;
- matchReason = reasons.join(', ');
- }
- }
- // Determine confidence level
- let confidence;
- if (bestScore >= 70) {
- confidence = 'high';
- }
- else if (bestScore >= 50) {
- confidence = 'medium';
- }
- else if (bestScore >= 30) {
- confidence = 'low';
- }
- else {
- confidence = 'none';
- }
- return {
- directoryStore: store,
- dispensaryId: bestMatch?.id || null,
- dispensaryName: bestMatch?.name || null,
- confidence,
- matchReason: matchReason || 'no_match',
- };
-}
-// ============================================================
-// MAIN FUNCTIONS
-// ============================================================
-/**
- * Run directory matching for a provider and update database
- * Only applies high-confidence matches automatically
- */
-async function matchDirectoryToDispensaries(provider, dryRun = true) {
- console.log(`[DirectoryMatcher] Running ${provider} directory matching (dryRun=${dryRun})...`);
- // Get directory stores
- let directoryStores;
- if (provider === 'curaleaf') {
- directoryStores = await scrapeCuraleafDirectory();
- }
- else if (provider === 'sol') {
- directoryStores = await scrapeSolDirectory();
- }
- else {
- throw new Error(`Unknown provider: ${provider}`);
- }
- // Get all AZ dispensaries from database
- const { rows: dispensaries } = await (0, connection_1.query)(`SELECT id, name, city, state, address, menu_type, menu_url, website
- FROM dispensaries
- WHERE state = 'AZ'`);
- console.log(`[DirectoryMatcher] Matching ${directoryStores.length} directory stores against ${dispensaries.length} dispensaries`);
- // Match each directory store
- const results = [];
- for (const store of directoryStores) {
- const match = matchStoreToDispensary(store, dispensaries);
- results.push(match);
- // Only apply high-confidence matches if not dry run
- if (!dryRun && match.confidence === 'high' && match.dispensaryId) {
- await applyDirectoryMatch(match.dispensaryId, provider, store);
- }
- }
- // Count results
- const report = {
- provider,
- totalDirectoryStores: directoryStores.length,
- highConfidenceMatches: results.filter((r) => r.confidence === 'high').length,
- mediumConfidenceMatches: results.filter((r) => r.confidence === 'medium').length,
- lowConfidenceMatches: results.filter((r) => r.confidence === 'low').length,
- unmatched: results.filter((r) => r.confidence === 'none').length,
- results,
- };
- console.log(`[DirectoryMatcher] ${provider} matching complete:`);
- console.log(` - High confidence: ${report.highConfidenceMatches}`);
- console.log(` - Medium confidence: ${report.mediumConfidenceMatches}`);
- console.log(` - Low confidence: ${report.lowConfidenceMatches}`);
- console.log(` - Unmatched: ${report.unmatched}`);
- return report;
-}
-/**
- * Apply a directory match to a dispensary
- */
-async function applyDirectoryMatch(dispensaryId, provider, store) {
- console.log(`[DirectoryMatcher] Applying match: dispensary ${dispensaryId} -> ${store.storeUrl}`);
- await (0, connection_1.query)(`
- UPDATE dispensaries SET
- menu_type = $1,
- menu_url = $2,
- platform_dispensary_id = NULL,
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object(
- 'detected_provider', $1::text,
- 'detection_method', 'directory_match'::text,
- 'detected_at', NOW(),
- 'directory_store_name', $3::text,
- 'directory_store_url', $2::text,
- 'directory_store_city', $4::text,
- 'directory_store_address', $5::text,
- 'not_crawlable', true,
- 'not_crawlable_reason', $6::text
- ),
- updated_at = NOW()
- WHERE id = $7
- `, [
- provider,
- store.storeUrl,
- store.name,
- store.city,
- store.address,
- `${provider} proprietary menu - no crawler available`,
- dispensaryId,
- ]);
-}
-/**
- * Preview matches without applying them
- */
-async function previewDirectoryMatches(provider) {
- return matchDirectoryToDispensaries(provider, true);
-}
-/**
- * Apply high-confidence matches
- */
-async function applyHighConfidenceMatches(provider) {
- return matchDirectoryToDispensaries(provider, false);
-}
diff --git a/backend/dist/dutchie-az/services/discovery.js b/backend/dist/dutchie-az/services/discovery.js
deleted file mode 100644
index 0b09a9f5..00000000
--- a/backend/dist/dutchie-az/services/discovery.js
+++ /dev/null
@@ -1,515 +0,0 @@
-"use strict";
-/**
- * Dutchie AZ Discovery Service
- *
- * Discovers and manages dispensaries from Dutchie for Arizona.
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.importFromExistingDispensaries = importFromExistingDispensaries;
-exports.discoverDispensaries = discoverDispensaries;
-exports.isObjectId = isObjectId;
-exports.extractFromMenuUrl = extractFromMenuUrl;
-exports.extractCNameFromMenuUrl = extractCNameFromMenuUrl;
-exports.resolvePlatformDispensaryIds = resolvePlatformDispensaryIds;
-exports.getAllDispensaries = getAllDispensaries;
-exports.mapDbRowToDispensary = mapDbRowToDispensary;
-exports.getDispensaryById = getDispensaryById;
-exports.getDispensariesWithPlatformIds = getDispensariesWithPlatformIds;
-exports.reResolveDispensaryPlatformId = reResolveDispensaryPlatformId;
-exports.updateMenuUrlAndResolve = updateMenuUrlAndResolve;
-exports.markDispensaryNotCrawlable = markDispensaryNotCrawlable;
-exports.getDispensaryCName = getDispensaryCName;
-const connection_1 = require("../db/connection");
-const graphql_client_1 = require("./graphql-client");
-/**
- * Upsert a dispensary record
- */
-async function upsertDispensary(dispensary) {
- const result = await (0, connection_1.query)(`
- INSERT INTO dispensaries (
- platform, name, slug, city, state, postal_code, address,
- latitude, longitude, platform_dispensary_id,
- is_delivery, is_pickup, raw_metadata, updated_at
- ) VALUES (
- $1, $2, $3, $4, $5, $6, $7,
- $8, $9, $10,
- $11, $12, $13, NOW()
- )
- ON CONFLICT (platform, slug, city, state) DO UPDATE SET
- name = EXCLUDED.name,
- postal_code = EXCLUDED.postal_code,
- address = EXCLUDED.address,
- latitude = EXCLUDED.latitude,
- longitude = EXCLUDED.longitude,
- platform_dispensary_id = COALESCE(EXCLUDED.platform_dispensary_id, dispensaries.platform_dispensary_id),
- is_delivery = EXCLUDED.is_delivery,
- is_pickup = EXCLUDED.is_pickup,
- raw_metadata = EXCLUDED.raw_metadata,
- updated_at = NOW()
- RETURNING id
- `, [
- dispensary.platform || 'dutchie',
- dispensary.name,
- dispensary.slug,
- dispensary.city,
- dispensary.state || 'AZ',
- dispensary.postalCode,
- dispensary.address,
- dispensary.latitude,
- dispensary.longitude,
- dispensary.platformDispensaryId,
- dispensary.isDelivery || false,
- dispensary.isPickup || true,
- dispensary.rawMetadata ? JSON.stringify(dispensary.rawMetadata) : null,
- ]);
- return result.rows[0].id;
-}
-/**
- * Normalize a raw discovery result to Dispensary
- */
-function normalizeDispensary(raw) {
- return {
- platform: 'dutchie',
- name: raw.name || raw.Name || '',
- slug: raw.slug || raw.cName || raw.id || '',
- city: raw.city || raw.address?.city || '',
- state: 'AZ',
- postalCode: raw.postalCode || raw.address?.postalCode || raw.address?.zip,
- address: raw.streetAddress || raw.address?.streetAddress,
- latitude: raw.latitude || raw.location?.lat,
- longitude: raw.longitude || raw.location?.lng,
- platformDispensaryId: raw.dispensaryId || raw.id || null,
- isDelivery: raw.isDelivery || raw.delivery || false,
- isPickup: raw.isPickup || raw.pickup || true,
- rawMetadata: raw,
- };
-}
-/**
- * Import dispensaries from the existing dispensaries table (from AZDHS data)
- * This creates records in the dutchie_az database for AZ dispensaries
- */
-async function importFromExistingDispensaries() {
- console.log('[Discovery] Importing from existing dispensaries table...');
- // This is a workaround - we'll use the dispensaries we already know about
- // and try to resolve their Dutchie IDs
- const knownDispensaries = [
- { name: 'Deeply Rooted', slug: 'AZ-Deeply-Rooted', city: 'Phoenix', state: 'AZ' },
- { name: 'Curaleaf Gilbert', slug: 'curaleaf-gilbert', city: 'Gilbert', state: 'AZ' },
- { name: 'Zen Leaf Prescott', slug: 'AZ-zen-leaf-prescott', city: 'Prescott', state: 'AZ' },
- // Add more known Dutchie stores here
- ];
- let imported = 0;
- for (const disp of knownDispensaries) {
- try {
- const id = await upsertDispensary({
- platform: 'dutchie',
- name: disp.name,
- slug: disp.slug,
- city: disp.city,
- state: disp.state,
- });
- imported++;
- console.log(`[Discovery] Imported: ${disp.name} (id=${id})`);
- }
- catch (error) {
- console.error(`[Discovery] Failed to import ${disp.name}:`, error.message);
- }
- }
- return { imported };
-}
-/**
- * Discover all Arizona Dutchie dispensaries via GraphQL
- */
-async function discoverDispensaries() {
- console.log('[Discovery] Starting Arizona dispensary discovery...');
- const errors = [];
- let discovered = 0;
- try {
- const rawDispensaries = await (0, graphql_client_1.discoverArizonaDispensaries)();
- console.log(`[Discovery] Found ${rawDispensaries.length} dispensaries from GraphQL`);
- for (const raw of rawDispensaries) {
- try {
- const normalized = normalizeDispensary(raw);
- if (normalized.name && normalized.slug && normalized.city) {
- await upsertDispensary(normalized);
- discovered++;
- }
- }
- catch (error) {
- errors.push(`${raw.name || raw.slug}: ${error.message}`);
- }
- }
- }
- catch (error) {
- errors.push(`Discovery failed: ${error.message}`);
- }
- console.log(`[Discovery] Completed: ${discovered} dispensaries, ${errors.length} errors`);
- return { discovered, errors };
-}
-/**
- * Check if a string looks like a MongoDB ObjectId (24 hex chars)
- */
-function isObjectId(value) {
- return /^[a-f0-9]{24}$/i.test(value);
-}
-function extractFromMenuUrl(menuUrl) {
- if (!menuUrl)
- return null;
- try {
- const url = new URL(menuUrl);
- const pathname = url.pathname;
- // Match /api/v2/embedded-menu/.js - this contains the platform_dispensary_id directly
- const apiMatch = pathname.match(/^\/api\/v2\/embedded-menu\/([a-f0-9]{24})\.js$/i);
- if (apiMatch) {
- return { type: 'platformId', value: apiMatch[1] };
- }
- // Match /embedded-menu/ or /dispensary/
- const embeddedMatch = pathname.match(/^\/embedded-menu\/([^/?]+)/);
- if (embeddedMatch) {
- const value = embeddedMatch[1];
- // Check if it's actually an ObjectId (some URLs use ID directly)
- if (isObjectId(value)) {
- return { type: 'platformId', value };
- }
- return { type: 'cName', value };
- }
- const dispensaryMatch = pathname.match(/^\/dispensary\/([^/?]+)/);
- if (dispensaryMatch) {
- const value = dispensaryMatch[1];
- if (isObjectId(value)) {
- return { type: 'platformId', value };
- }
- return { type: 'cName', value };
- }
- return null;
- }
- catch {
- return null;
- }
-}
-/**
- * Extract cName (slug) from a Dutchie menu_url
- * Backward compatible - use extractFromMenuUrl for full info
- */
-function extractCNameFromMenuUrl(menuUrl) {
- const extraction = extractFromMenuUrl(menuUrl);
- return extraction?.value || null;
-}
-/**
- * Resolve platform dispensary IDs for all dispensaries that don't have one
- * CRITICAL: Uses cName extracted from menu_url, NOT the slug column!
- *
- * Uses the new resolveDispensaryIdWithDetails which:
- * 1. Extracts dispensaryId from window.reactEnv in the embedded menu page (preferred)
- * 2. Falls back to GraphQL if reactEnv extraction fails
- * 3. Returns HTTP status so we can mark 403/404 stores as not_crawlable
- */
-async function resolvePlatformDispensaryIds() {
- console.log('[Discovery] Resolving platform dispensary IDs...');
- const { rows: dispensaries } = await (0, connection_1.query)(`
- SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id, crawl_status
- FROM dispensaries
- WHERE menu_type = 'dutchie'
- AND platform_dispensary_id IS NULL
- AND menu_url IS NOT NULL
- AND (crawl_status IS NULL OR crawl_status != 'not_crawlable')
- ORDER BY id
- `);
- let resolved = 0;
- let failed = 0;
- let skipped = 0;
- let notCrawlable = 0;
- for (const dispensary of dispensaries) {
- try {
- // Extract cName from menu_url - this is the CORRECT way to get the Dutchie slug
- const cName = extractCNameFromMenuUrl(dispensary.menu_url);
- if (!cName) {
- console.log(`[Discovery] Skipping ${dispensary.name}: Could not extract cName from menu_url: ${dispensary.menu_url}`);
- skipped++;
- continue;
- }
- console.log(`[Discovery] Resolving ID for: ${dispensary.name} (cName=${cName}, menu_url=${dispensary.menu_url})`);
- // Use the new detailed resolver that extracts from reactEnv first
- const result = await (0, graphql_client_1.resolveDispensaryIdWithDetails)(cName);
- if (result.dispensaryId) {
- // SUCCESS: Store resolved
- await (0, connection_1.query)(`
- UPDATE dispensaries
- SET platform_dispensary_id = $1,
- platform_dispensary_id_resolved_at = NOW(),
- crawl_status = 'ready',
- crawl_status_reason = $2,
- crawl_status_updated_at = NOW(),
- last_tested_menu_url = $3,
- last_http_status = $4,
- updated_at = NOW()
- WHERE id = $5
- `, [
- result.dispensaryId,
- `Resolved from ${result.source || 'page'}`,
- dispensary.menu_url,
- result.httpStatus,
- dispensary.id,
- ]);
- resolved++;
- console.log(`[Discovery] Resolved: ${cName} -> ${result.dispensaryId} (source: ${result.source})`);
- }
- else if (result.httpStatus === 403 || result.httpStatus === 404) {
- // NOT CRAWLABLE: Store removed or not accessible
- await (0, connection_1.query)(`
- UPDATE dispensaries
- SET platform_dispensary_id = NULL,
- crawl_status = 'not_crawlable',
- crawl_status_reason = $1,
- crawl_status_updated_at = NOW(),
- last_tested_menu_url = $2,
- last_http_status = $3,
- updated_at = NOW()
- WHERE id = $4
- `, [
- result.error || `HTTP ${result.httpStatus}: Removed from Dutchie`,
- dispensary.menu_url,
- result.httpStatus,
- dispensary.id,
- ]);
- notCrawlable++;
- console.log(`[Discovery] Marked not crawlable: ${cName} (HTTP ${result.httpStatus})`);
- }
- else {
- // FAILED: Could not resolve but page loaded
- await (0, connection_1.query)(`
- UPDATE dispensaries
- SET crawl_status = 'not_ready',
- crawl_status_reason = $1,
- crawl_status_updated_at = NOW(),
- last_tested_menu_url = $2,
- last_http_status = $3,
- updated_at = NOW()
- WHERE id = $4
- `, [
- result.error || 'Could not extract dispensaryId from page',
- dispensary.menu_url,
- result.httpStatus,
- dispensary.id,
- ]);
- failed++;
- console.log(`[Discovery] Could not resolve: ${cName} - ${result.error}`);
- }
- // Delay between requests
- await new Promise((r) => setTimeout(r, 2000));
- }
- catch (error) {
- failed++;
- console.error(`[Discovery] Error resolving ${dispensary.name}:`, error.message);
- }
- }
- console.log(`[Discovery] Completed: ${resolved} resolved, ${failed} failed, ${skipped} skipped, ${notCrawlable} not crawlable`);
- return { resolved, failed, skipped, notCrawlable };
-}
-/**
- * Get all dispensaries
- */
-// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
-const DISPENSARY_COLUMNS = `
- id, name, slug, city, state, zip, address, latitude, longitude,
- menu_type, menu_url, platform_dispensary_id, website,
- provider_detection_data, created_at, updated_at
-`;
-async function getAllDispensaries() {
- const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE menu_type = 'dutchie' ORDER BY name`);
- return rows.map(mapDbRowToDispensary);
-}
-/**
- * Map snake_case DB row to camelCase Dispensary object
- * CRITICAL: DB returns snake_case (platform_dispensary_id) but TypeScript expects camelCase (platformDispensaryId)
- * This function is exported for use in other modules that query dispensaries directly.
- *
- * NOTE: The consolidated dispensaries table column mappings:
- * - zip → postalCode
- * - menu_type → menuType (keep platform as 'dutchie')
- * - last_crawl_at → lastCrawledAt
- * - platform_dispensary_id → platformDispensaryId
- */
-function mapDbRowToDispensary(row) {
- // Extract website from raw_metadata if available (field may not exist in all environments)
- let rawMetadata = undefined;
- if (row.raw_metadata !== undefined) {
- rawMetadata = typeof row.raw_metadata === 'string'
- ? JSON.parse(row.raw_metadata)
- : row.raw_metadata;
- }
- const website = row.website || rawMetadata?.website || undefined;
- return {
- id: row.id,
- platform: row.platform || 'dutchie', // keep platform as-is, default to 'dutchie'
- name: row.name,
- dbaName: row.dbaName || row.dba_name,
- slug: row.slug,
- city: row.city,
- state: row.state,
- postalCode: row.postalCode || row.zip || row.postal_code,
- latitude: row.latitude ? parseFloat(row.latitude) : undefined,
- longitude: row.longitude ? parseFloat(row.longitude) : undefined,
- address: row.address,
- platformDispensaryId: row.platformDispensaryId || row.platform_dispensary_id, // CRITICAL mapping!
- isDelivery: row.is_delivery,
- isPickup: row.is_pickup,
- rawMetadata: rawMetadata,
- lastCrawledAt: row.lastCrawledAt || row.last_crawl_at, // use last_crawl_at
- productCount: row.product_count,
- createdAt: row.created_at,
- updatedAt: row.updated_at,
- menuType: row.menuType || row.menu_type,
- menuUrl: row.menuUrl || row.menu_url,
- scrapeEnabled: row.scrapeEnabled ?? row.scrape_enabled,
- providerDetectionData: row.provider_detection_data,
- platformDispensaryIdResolvedAt: row.platform_dispensary_id_resolved_at,
- website,
- };
-}
-/**
- * Get dispensary by ID
- * NOTE: Uses SQL aliases to map snake_case → camelCase directly
- */
-async function getDispensaryById(id) {
- const { rows } = await (0, connection_1.query)(`
- SELECT
- id,
- name,
- dba_name AS "dbaName",
- slug,
- city,
- state,
- zip AS "postalCode",
- address,
- latitude,
- longitude,
- menu_type AS "menuType",
- menu_url AS "menuUrl",
- platform_dispensary_id AS "platformDispensaryId",
- website,
- provider_detection_data AS "providerDetectionData",
- created_at,
- updated_at
- FROM dispensaries
- WHERE id = $1
- `, [id]);
- if (!rows[0])
- return null;
- return mapDbRowToDispensary(rows[0]);
-}
-/**
- * Get dispensaries with platform IDs (ready for crawling)
- */
-async function getDispensariesWithPlatformIds() {
- const { rows } = await (0, connection_1.query)(`
- SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
- WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
- ORDER BY name
- `);
- return rows.map(mapDbRowToDispensary);
-}
-/**
- * Re-resolve a single dispensary's platform ID
- * Clears the existing ID and re-resolves from the menu_url cName
- */
-async function reResolveDispensaryPlatformId(dispensaryId) {
- console.log(`[Discovery] Re-resolving platform ID for dispensary ${dispensaryId}...`);
- const dispensary = await getDispensaryById(dispensaryId);
- if (!dispensary) {
- return { success: false, platformId: null, cName: null, error: 'Dispensary not found' };
- }
- const cName = extractCNameFromMenuUrl(dispensary.menuUrl);
- if (!cName) {
- console.log(`[Discovery] Could not extract cName from menu_url: ${dispensary.menuUrl}`);
- return {
- success: false,
- platformId: null,
- cName: null,
- error: `Could not extract cName from menu_url: ${dispensary.menuUrl}`,
- };
- }
- console.log(`[Discovery] Extracted cName: ${cName} from menu_url: ${dispensary.menuUrl}`);
- try {
- const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName);
- if (platformId) {
- await (0, connection_1.query)(`
- UPDATE dispensaries
- SET platform_dispensary_id = $1,
- platform_dispensary_id_resolved_at = NOW(),
- updated_at = NOW()
- WHERE id = $2
- `, [platformId, dispensaryId]);
- console.log(`[Discovery] Resolved: ${cName} -> ${platformId}`);
- return { success: true, platformId, cName };
- }
- else {
- // Clear the invalid platform ID and mark as not crawlable
- await (0, connection_1.query)(`
- UPDATE dispensaries
- SET platform_dispensary_id = NULL,
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- '{"resolution_error": "cName no longer exists on Dutchie", "not_crawlable": true}'::jsonb,
- updated_at = NOW()
- WHERE id = $1
- `, [dispensaryId]);
- console.log(`[Discovery] Could not resolve: ${cName} - marked as not crawlable`);
- return {
- success: false,
- platformId: null,
- cName,
- error: `cName "${cName}" no longer exists on Dutchie`,
- };
- }
- }
- catch (error) {
- console.error(`[Discovery] Error resolving ${cName}:`, error.message);
- return { success: false, platformId: null, cName, error: error.message };
- }
-}
-/**
- * Update menu_url for a dispensary and re-resolve platform ID
- */
-async function updateMenuUrlAndResolve(dispensaryId, newMenuUrl) {
- console.log(`[Discovery] Updating menu_url for dispensary ${dispensaryId} to: ${newMenuUrl}`);
- const cName = extractCNameFromMenuUrl(newMenuUrl);
- if (!cName) {
- return {
- success: false,
- platformId: null,
- cName: null,
- error: `Could not extract cName from new menu_url: ${newMenuUrl}`,
- };
- }
- // Update the menu_url first
- await (0, connection_1.query)(`
- UPDATE dispensaries
- SET menu_url = $1,
- menu_type = 'dutchie',
- platform_dispensary_id = NULL,
- updated_at = NOW()
- WHERE id = $2
- `, [newMenuUrl, dispensaryId]);
- // Now resolve the platform ID with the new cName
- return await reResolveDispensaryPlatformId(dispensaryId);
-}
-/**
- * Mark a dispensary as not crawlable (when resolution fails permanently)
- */
-async function markDispensaryNotCrawlable(dispensaryId, reason) {
- await (0, connection_1.query)(`
- UPDATE dispensaries
- SET platform_dispensary_id = NULL,
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object('not_crawlable', true, 'not_crawlable_reason', $1::text, 'not_crawlable_at', NOW()::text),
- updated_at = NOW()
- WHERE id = $2
- `, [reason, dispensaryId]);
- console.log(`[Discovery] Marked dispensary ${dispensaryId} as not crawlable: ${reason}`);
-}
-/**
- * Get the cName for a dispensary (extracted from menu_url)
- */
-function getDispensaryCName(dispensary) {
- return extractCNameFromMenuUrl(dispensary.menuUrl);
-}
diff --git a/backend/dist/dutchie-az/services/graphql-client.js b/backend/dist/dutchie-az/services/graphql-client.js
deleted file mode 100644
index b19f7146..00000000
--- a/backend/dist/dutchie-az/services/graphql-client.js
+++ /dev/null
@@ -1,538 +0,0 @@
-"use strict";
-/**
- * Dutchie GraphQL Client
- *
- * Uses Puppeteer to establish a session (get CF cookies), then makes
- * SERVER-SIDE fetch calls to api-gw.dutchie.com with those cookies.
- *
- * DUTCHIE FETCH RULES:
- * 1. Server-side only - use axios (never browser fetch with CORS)
- * 2. Use dispensaryFilter.cNameOrID, NOT dispensaryId directly
- * 3. Headers must mimic Chrome: User-Agent, Origin, Referer
- * 4. If 403, extract CF cookies from Puppeteer session and include them
- * 5. Log status codes, error bodies, and product counts
- */
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = void 0;
-exports.resolveDispensaryId = resolveDispensaryId;
-exports.resolveDispensaryIdWithDetails = resolveDispensaryIdWithDetails;
-exports.discoverArizonaDispensaries = discoverArizonaDispensaries;
-exports.fetchAllProducts = fetchAllProducts;
-exports.fetchAllProductsBothModes = fetchAllProductsBothModes;
-const axios_1 = __importDefault(require("axios"));
-const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
-const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
-const dutchie_1 = require("../config/dutchie");
-Object.defineProperty(exports, "GRAPHQL_HASHES", { enumerable: true, get: function () { return dutchie_1.GRAPHQL_HASHES; } });
-Object.defineProperty(exports, "ARIZONA_CENTERPOINTS", { enumerable: true, get: function () { return dutchie_1.ARIZONA_CENTERPOINTS; } });
-puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
-/**
- * Create a session by navigating to the embedded menu page
- * and extracting CF clearance cookies for server-side requests.
- * Also extracts dispensaryId from window.reactEnv if available.
- */
-async function createSession(cName) {
- const browser = await puppeteer_extra_1.default.launch({
- headless: 'new',
- args: dutchie_1.dutchieConfig.browserArgs,
- });
- const page = await browser.newPage();
- const userAgent = dutchie_1.dutchieConfig.userAgent;
- await page.setUserAgent(userAgent);
- await page.setViewport({ width: 1920, height: 1080 });
- await page.evaluateOnNewDocument(() => {
- Object.defineProperty(navigator, 'webdriver', { get: () => false });
- window.chrome = { runtime: {} };
- });
- // Navigate to the embedded menu page for this dispensary
- const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;
- console.log(`[GraphQL Client] Loading ${embeddedMenuUrl} to get CF cookies...`);
- let httpStatus;
- let dispensaryId;
- try {
- const response = await page.goto(embeddedMenuUrl, {
- waitUntil: 'networkidle2',
- timeout: dutchie_1.dutchieConfig.navigationTimeout,
- });
- httpStatus = response?.status();
- await new Promise((r) => setTimeout(r, dutchie_1.dutchieConfig.pageLoadDelay));
- // Try to extract dispensaryId from window.reactEnv
- try {
- dispensaryId = await page.evaluate(() => {
- return window.reactEnv?.dispensaryId || null;
- });
- if (dispensaryId) {
- console.log(`[GraphQL Client] Extracted dispensaryId from reactEnv: ${dispensaryId}`);
- }
- }
- catch (evalError) {
- console.log(`[GraphQL Client] Could not extract dispensaryId from reactEnv: ${evalError.message}`);
- }
- }
- catch (error) {
- console.warn(`[GraphQL Client] Navigation warning: ${error.message}`);
- // Continue anyway - we may have gotten cookies
- }
- // Extract cookies
- const cookies = await page.cookies();
- const cookieString = cookies.map((c) => `${c.name}=${c.value}`).join('; ');
- console.log(`[GraphQL Client] Got ${cookies.length} cookies, HTTP status: ${httpStatus}`);
- if (cookies.length > 0) {
- console.log(`[GraphQL Client] Cookie names: ${cookies.map(c => c.name).join(', ')}`);
- }
- return { cookies: cookieString, userAgent, browser, page, dispensaryId, httpStatus };
-}
-/**
- * Close session (browser)
- */
-async function closeSession(session) {
- await session.browser.close();
-}
-// ============================================================
-// SERVER-SIDE GRAPHQL FETCH USING AXIOS
-// ============================================================
-/**
- * Build headers that mimic a real browser request
- */
-function buildHeaders(session, cName) {
- const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`;
- return {
- 'accept': 'application/json, text/plain, */*',
- 'accept-language': 'en-US,en;q=0.9',
- 'accept-encoding': 'gzip, deflate, br',
- 'content-type': 'application/json',
- 'origin': 'https://dutchie.com',
- 'referer': embeddedMenuUrl,
- 'user-agent': session.userAgent,
- 'apollographql-client-name': 'Marketplace (production)',
- 'sec-ch-ua': '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"',
- 'sec-ch-ua-mobile': '?0',
- 'sec-ch-ua-platform': '"Windows"',
- 'sec-fetch-dest': 'empty',
- 'sec-fetch-mode': 'cors',
- 'sec-fetch-site': 'same-site',
- ...(session.cookies ? { 'cookie': session.cookies } : {}),
- };
-}
-/**
- * Execute GraphQL query server-side using axios
- * Uses cookies from the browser session to bypass CF
- */
-async function executeGraphQL(session, operationName, variables, hash, cName) {
- const endpoint = dutchie_1.dutchieConfig.graphqlEndpoint;
- const headers = buildHeaders(session, cName);
- // Build request body for POST
- const body = {
- operationName,
- variables,
- extensions: {
- persistedQuery: { version: 1, sha256Hash: hash },
- },
- };
- console.log(`[GraphQL Client] POST: ${operationName} -> ${endpoint}`);
- console.log(`[GraphQL Client] Variables: ${JSON.stringify(variables).slice(0, 300)}...`);
- try {
- const response = await axios_1.default.post(endpoint, body, {
- headers,
- timeout: 30000,
- validateStatus: () => true, // Don't throw on non-2xx
- });
- // Log response details
- console.log(`[GraphQL Client] Response status: ${response.status}`);
- if (response.status !== 200) {
- const bodyPreview = typeof response.data === 'string'
- ? response.data.slice(0, 500)
- : JSON.stringify(response.data).slice(0, 500);
- console.error(`[GraphQL Client] HTTP ${response.status}: ${bodyPreview}`);
- throw new Error(`HTTP ${response.status}`);
- }
- // Check for GraphQL errors
- if (response.data?.errors && response.data.errors.length > 0) {
- console.error(`[GraphQL Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`);
- }
- return response.data;
- }
- catch (error) {
- if (axios_1.default.isAxiosError(error)) {
- const axiosError = error;
- console.error(`[GraphQL Client] Axios error: ${axiosError.message}`);
- if (axiosError.response) {
- console.error(`[GraphQL Client] Response status: ${axiosError.response.status}`);
- console.error(`[GraphQL Client] Response data: ${JSON.stringify(axiosError.response.data).slice(0, 500)}`);
- }
- if (axiosError.code) {
- console.error(`[GraphQL Client] Error code: ${axiosError.code}`);
- }
- }
- else {
- console.error(`[GraphQL Client] Error: ${error.message}`);
- }
- throw error;
- }
-}
-/**
- * Resolve a dispensary slug to its internal platform ID.
- *
- * STRATEGY:
- * 1. Navigate to embedded menu page and extract window.reactEnv.dispensaryId (preferred)
- * 2. Fall back to GraphQL GetAddressBasedDispensaryData query if reactEnv fails
- *
- * Returns the dispensaryId (platform_dispensary_id) or null if not found.
- * Throws if page returns 403/404 so caller can mark as not_crawlable.
- */
-async function resolveDispensaryId(slug) {
- const result = await resolveDispensaryIdWithDetails(slug);
- return result.dispensaryId;
-}
-/**
- * Resolve a dispensary slug with full details (HTTP status, source, error).
- * Use this when you need to know WHY resolution failed.
- */
-async function resolveDispensaryIdWithDetails(slug) {
- console.log(`[GraphQL Client] Resolving dispensary ID for slug: ${slug}`);
- const session = await createSession(slug);
- try {
- // Check HTTP status first - if 403/404, the store is not crawlable
- if (session.httpStatus && (session.httpStatus === 403 || session.httpStatus === 404)) {
- console.log(`[GraphQL Client] Page returned HTTP ${session.httpStatus} for ${slug} - not crawlable`);
- return {
- dispensaryId: null,
- httpStatus: session.httpStatus,
- error: `HTTP ${session.httpStatus}: Store removed or not accessible`,
- source: 'reactEnv',
- };
- }
- // PREFERRED: Use dispensaryId from window.reactEnv (extracted during createSession)
- if (session.dispensaryId) {
- console.log(`[GraphQL Client] Resolved ${slug} -> ${session.dispensaryId} (from reactEnv)`);
- return {
- dispensaryId: session.dispensaryId,
- httpStatus: session.httpStatus,
- source: 'reactEnv',
- };
- }
- // FALLBACK: Try GraphQL query
- console.log(`[GraphQL Client] reactEnv.dispensaryId not found for ${slug}, trying GraphQL...`);
- const variables = {
- dispensaryFilter: {
- cNameOrID: slug,
- },
- };
- const result = await executeGraphQL(session, 'GetAddressBasedDispensaryData', variables, dutchie_1.GRAPHQL_HASHES.GetAddressBasedDispensaryData, slug);
- const dispensaryId = result?.data?.dispensaryBySlug?.id ||
- result?.data?.dispensary?.id ||
- result?.data?.getAddressBasedDispensaryData?.dispensary?.id;
- if (dispensaryId) {
- console.log(`[GraphQL Client] Resolved ${slug} -> ${dispensaryId} (from GraphQL)`);
- return {
- dispensaryId,
- httpStatus: session.httpStatus,
- source: 'graphql',
- };
- }
- console.log(`[GraphQL Client] Could not resolve ${slug}, GraphQL response:`, JSON.stringify(result).slice(0, 300));
- return {
- dispensaryId: null,
- httpStatus: session.httpStatus,
- error: 'Could not extract dispensaryId from reactEnv or GraphQL',
- };
- }
- finally {
- await closeSession(session);
- }
-}
-/**
- * Discover Arizona dispensaries via geo-based query
- */
-async function discoverArizonaDispensaries() {
- console.log('[GraphQL Client] Discovering Arizona dispensaries...');
- // Use Phoenix as the default center
- const session = await createSession('AZ-Deeply-Rooted');
- const allDispensaries = [];
- const seenIds = new Set();
- try {
- for (const centerpoint of dutchie_1.ARIZONA_CENTERPOINTS) {
- console.log(`[GraphQL Client] Scanning ${centerpoint.name}...`);
- const variables = {
- dispensariesFilter: {
- latitude: centerpoint.lat,
- longitude: centerpoint.lng,
- distance: 100,
- state: 'AZ',
- },
- };
- try {
- const result = await executeGraphQL(session, 'ConsumerDispensaries', variables, dutchie_1.GRAPHQL_HASHES.ConsumerDispensaries, 'AZ-Deeply-Rooted');
- const dispensaries = result?.data?.consumerDispensaries || [];
- for (const d of dispensaries) {
- const id = d.id || d.dispensaryId;
- if (id && !seenIds.has(id)) {
- seenIds.add(id);
- allDispensaries.push(d);
- }
- }
- console.log(`[GraphQL Client] Found ${dispensaries.length} in ${centerpoint.name} (${allDispensaries.length} total unique)`);
- }
- catch (error) {
- console.warn(`[GraphQL Client] Error scanning ${centerpoint.name}: ${error.message}`);
- }
- // Delay between requests
- await new Promise((r) => setTimeout(r, 1000));
- }
- }
- finally {
- await closeSession(session);
- }
- console.log(`[GraphQL Client] Discovery complete: ${allDispensaries.length} dispensaries`);
- return allDispensaries;
-}
-// ============================================================
-// PRODUCT FILTERING VARIABLES
-// ============================================================
-/**
- * Build filter variables for FilteredProducts query
- *
- * CRITICAL: Uses dispensaryId directly (the MongoDB ObjectId, e.g. "6405ef617056e8014d79101b")
- * NOT dispensaryFilter.cNameOrID!
- *
- * The actual browser request structure is:
- * {
- * "productsFilter": {
- * "dispensaryId": "6405ef617056e8014d79101b",
- * "pricingType": "rec",
- * "Status": "Active", // Mode A only
- * "strainTypes": [],
- * "subcategories": [],
- * "types": [],
- * "useCache": true,
- * ...
- * },
- * "page": 0,
- * "perPage": 100
- * }
- *
- * Mode A = UI parity (Status: "Active")
- * Mode B = MAX COVERAGE (no Status filter)
- */
-function buildFilterVariables(platformDispensaryId, pricingType, crawlMode, page, perPage) {
- const isModeA = crawlMode === 'mode_a';
- // Per CLAUDE.md Rule #11: Use simple productsFilter with dispensaryId directly
- // Do NOT use dispensaryFilter.cNameOrID - that's outdated
- const productsFilter = {
- dispensaryId: platformDispensaryId,
- pricingType: pricingType,
- };
- // Mode A: Only active products (UI parity) - Status: "Active"
- // Mode B: MAX COVERAGE (OOS/inactive) - omit Status or set to null
- if (isModeA) {
- productsFilter.Status = 'Active';
- }
- // Mode B: No Status filter = returns all products including OOS/inactive
- return {
- productsFilter,
- page,
- perPage,
- };
-}
-// ============================================================
-// PRODUCT FETCHING WITH PAGINATION
-// ============================================================
/**
 * Fetch all products for a single crawl mode, paginating until the menu is
 * exhausted, the page cap is reached, or an unrecoverable error occurs.
 *
 * Partial results collected before a failure are still returned.
 *
 * @param session GraphQL session (created via createSession for this cName)
 * @param platformDispensaryId Dutchie platform dispensary ID
 * @param cName Store cName used for request routing and log messages
 * @param pricingType Menu pricing type (e.g. 'rec' or 'med')
 * @param crawlMode 'mode_a' (active only) or 'mode_b' (max coverage)
 * @returns {Promise<{products: Array, totalCount: number, crawlMode: string}>}
 */
async function fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode) {
    // Pagination and retry tuning come from the shared dutchie config.
    const perPage = dutchie_1.dutchieConfig.perPage;
    const maxPages = dutchie_1.dutchieConfig.maxPages;
    const maxRetries = dutchie_1.dutchieConfig.maxRetries;
    const pageDelayMs = dutchie_1.dutchieConfig.pageDelayMs;
    const allProducts = [];
    let pageNum = 0;
    let totalCount = 0;
    let consecutiveEmptyPages = 0;
    console.log(`[GraphQL Client] Fetching products for ${cName} (platformId: ${platformDispensaryId}, ${pricingType}, ${crawlMode})...`);
    while (pageNum < maxPages) {
        const variables = buildFilterVariables(platformDispensaryId, pricingType, crawlMode, pageNum, perPage);
        let result = null;
        let lastError = null;
        // Retry each page up to maxRetries extra times with linear backoff
        // (1s, 2s, 3s, ...). lastError is cleared on success.
        for (let attempt = 0; attempt <= maxRetries; attempt++) {
            try {
                result = await executeGraphQL(session, 'FilteredProducts', variables, dutchie_1.GRAPHQL_HASHES.FilteredProducts, cName);
                lastError = null;
                break;
            }
            catch (error) {
                lastError = error;
                console.warn(`[GraphQL Client] Page ${pageNum} attempt ${attempt + 1} failed: ${error.message}`);
                if (attempt < maxRetries) {
                    await new Promise((r) => setTimeout(r, 1000 * (attempt + 1)));
                }
            }
        }
        // A page that never succeeded aborts the crawl; earlier pages are kept.
        if (lastError) {
            console.error(`[GraphQL Client] Page ${pageNum} failed after ${maxRetries + 1} attempts`);
            break;
        }
        // GraphQL-level errors (200 response with an errors array) also abort.
        if (result?.errors) {
            console.error('[GraphQL Client] GraphQL errors:', JSON.stringify(result.errors));
            break;
        }
        // Log response shape on the first page only, to surface schema drift.
        if (pageNum === 0) {
            console.log(`[GraphQL Client] Response keys: ${Object.keys(result || {}).join(', ')}`);
            if (result?.data) {
                console.log(`[GraphQL Client] data keys: ${Object.keys(result.data || {}).join(', ')}`);
            }
            if (!result?.data?.filteredProducts) {
                console.log(`[GraphQL Client] WARNING: No filteredProducts in response!`);
                console.log(`[GraphQL Client] Full response: ${JSON.stringify(result).slice(0, 1000)}`);
            }
        }
        const products = result?.data?.filteredProducts?.products || [];
        const queryInfo = result?.data?.filteredProducts?.queryInfo;
        // Server-reported total; only updated when present and non-zero.
        if (queryInfo?.totalCount) {
            totalCount = queryInfo.totalCount;
        }
        console.log(`[GraphQL Client] Page ${pageNum}: ${products.length} products (total so far: ${allProducts.length + products.length}/${totalCount})`);
        if (products.length === 0) {
            consecutiveEmptyPages++;
            if (consecutiveEmptyPages >= 2) {
                console.log('[GraphQL Client] Multiple empty pages, stopping pagination');
                break;
            }
        }
        else {
            consecutiveEmptyPages = 0;
            allProducts.push(...products);
        }
        // An incomplete page must be the last page. NOTE(review): this also
        // fires on the FIRST empty page (0 < perPage), so the
        // consecutiveEmptyPages >= 2 guard above appears unreachable.
        if (products.length < perPage) {
            console.log(`[GraphQL Client] Incomplete page (${products.length} < ${perPage}), stopping`);
            break;
        }
        pageNum++;
        // Polite inter-page delay.
        await new Promise((r) => setTimeout(r, pageDelayMs));
    }
    console.log(`[GraphQL Client] Fetched ${allProducts.length} total products (${crawlMode})`);
    // Fall back to the collected count when the server never reported one.
    return { products: allProducts, totalCount: totalCount || allProducts.length, crawlMode };
}
-// ============================================================
-// LEGACY SINGLE-MODE INTERFACE
-// ============================================================
/**
 * Fetch all products for a dispensary using a single crawl mode.
 *
 * Opens a dedicated session for the store's cName and always closes it,
 * even when the fetch throws.
 *
 * @param platformDispensaryId Dutchie platform dispensary ID
 * @param pricingType Menu pricing type (defaults to 'rec')
 * @param options {crawlMode?: string, cName: string} — cName is required
 * @throws when options.cName is missing (prevents reusing another store's session)
 */
async function fetchAllProducts(platformDispensaryId, pricingType = 'rec', options = {}) {
    const { crawlMode = 'mode_a', cName } = options;
    // cName is REQUIRED - a default fallback could silently use the wrong
    // store's session, so we fail loudly instead.
    if (!cName) {
        throw new Error('[GraphQL Client] cName is required for fetchAllProducts - cannot use another store\'s session');
    }
    const session = await createSession(cName);
    try {
        return await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode);
    }
    finally {
        await closeSession(session);
    }
}
-// ============================================================
-// MODE A+B MERGING
-// ============================================================
/**
 * Merge the POSMetaData.children arrays of a Mode A and a Mode B product.
 *
 * Children are keyed by the first available identifier (canonicalID,
 * canonicalSKU, canonicalPackageId, then option). Mode A children win on
 * key collisions; keyless children are dropped entirely.
 *
 * @returns {Array} merged children, Mode A entries first
 */
function mergeProductOptions(modeAProduct, modeBProduct) {
    const childrenOf = (product) => product.POSMetaData?.children || [];
    const keyOf = (child) => child.canonicalID || child.canonicalSKU || child.canonicalPackageId || child.option || '';
    const merged = new Map();
    // Seed with Mode A children; a later Mode A child with the same key
    // replaces an earlier one (matching the original last-write-wins order).
    for (const child of childrenOf(modeAProduct)) {
        const key = keyOf(child);
        if (key) {
            merged.set(key, child);
        }
    }
    // Mode B only contributes children whose key Mode A did not provide.
    for (const child of childrenOf(modeBProduct)) {
        const key = keyOf(child);
        if (key && !merged.has(key)) {
            merged.set(key, child);
        }
    }
    return [...merged.values()];
}
/**
 * Overlay a Mode B product onto its Mode A counterpart.
 *
 * The Mode A record provides every top-level field; only the
 * POSMetaData.children array is combined (via mergeProductOptions).
 * When no Mode B record exists, the Mode A product is returned as-is.
 */
function mergeProducts(modeAProduct, modeBProduct) {
    if (!modeBProduct) {
        return modeAProduct;
    }
    const children = mergeProductOptions(modeAProduct, modeBProduct);
    const POSMetaData = { ...modeAProduct.POSMetaData, children };
    return { ...modeAProduct, POSMetaData };
}
-// ============================================================
-// MAIN EXPORT: TWO-MODE CRAWL
-// ============================================================
/**
 * Fetch products using BOTH crawl modes with a SINGLE session.
 *
 * Runs Mode A (active only / UI parity), waits modeDelayMs, then runs
 * Mode B (max coverage), and merges the two result sets by product _id.
 * Mode A data wins on conflicts; Mode B contributes products and
 * POSMetaData children that Mode A did not return.
 *
 * @param platformDispensaryId Dutchie platform dispensary ID
 * @param pricingType Menu pricing type (defaults to 'rec')
 * @param options {cName: string} — required; session is created for that store
 * @returns {Promise<{modeA, modeB, merged}>} per-mode results plus the merge
 * @throws when options.cName is missing
 */
async function fetchAllProductsBothModes(platformDispensaryId, pricingType = 'rec', options = {}) {
    // cName is now REQUIRED - no default fallback to avoid using wrong store's session
    const cName = options.cName;
    if (!cName) {
        throw new Error('[GraphQL Client] cName is required for fetchAllProductsBothModes - cannot use another store\'s session');
    }
    console.log(`[GraphQL Client] Running two-mode crawl for ${cName} (${pricingType})...`);
    console.log(`[GraphQL Client] Platform ID: ${platformDispensaryId}, cName: ${cName}`);
    const session = await createSession(cName);
    try {
        // Mode A (UI parity)
        const modeAResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_a');
        // Delay between modes
        await new Promise((r) => setTimeout(r, dutchie_1.dutchieConfig.modeDelayMs));
        // Mode B (MAX COVERAGE)
        const modeBResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_b');
        // Index Mode B products by _id for O(1) lookup during the merge.
        const modeBMap = new Map();
        for (const product of modeBResult.products) {
            modeBMap.set(product._id, product);
        }
        const productMap = new Map();
        // Add Mode A products, merging with the Mode B counterpart if it exists.
        for (const product of modeAResult.products) {
            const modeBProduct = modeBMap.get(product._id);
            const mergedProduct = mergeProducts(product, modeBProduct);
            productMap.set(product._id, mergedProduct);
        }
        // Add Mode B-only products (e.g. items Mode A's Status filter excluded).
        for (const product of modeBResult.products) {
            if (!productMap.has(product._id)) {
                productMap.set(product._id, product);
            }
        }
        const mergedProducts = Array.from(productMap.values());
        console.log(`[GraphQL Client] Merged: ${mergedProducts.length} unique products`);
        console.log(`[GraphQL Client] Mode A: ${modeAResult.products.length}, Mode B: ${modeBResult.products.length}`);
        return {
            modeA: { products: modeAResult.products, totalCount: modeAResult.totalCount },
            modeB: { products: modeBResult.products, totalCount: modeBResult.totalCount },
            merged: { products: mergedProducts, totalCount: mergedProducts.length },
        };
    }
    finally {
        // Always release the session, even when a mode fails.
        await closeSession(session);
    }
}
diff --git a/backend/dist/dutchie-az/services/job-queue.js b/backend/dist/dutchie-az/services/job-queue.js
deleted file mode 100644
index dca167a7..00000000
--- a/backend/dist/dutchie-az/services/job-queue.js
+++ /dev/null
@@ -1,414 +0,0 @@
-"use strict";
-/**
- * Job Queue Service
- *
- * DB-backed job queue with claiming/locking for distributed workers.
- * Ensures only one worker processes a given store at a time.
- */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.getWorkerId = getWorkerId;
-exports.getWorkerHostname = getWorkerHostname;
-exports.enqueueJob = enqueueJob;
-exports.bulkEnqueueJobs = bulkEnqueueJobs;
-exports.claimNextJob = claimNextJob;
-exports.updateJobProgress = updateJobProgress;
-exports.heartbeat = heartbeat;
-exports.completeJob = completeJob;
-exports.failJob = failJob;
-exports.getQueueStats = getQueueStats;
-exports.getActiveWorkers = getActiveWorkers;
-exports.getRunningJobs = getRunningJobs;
-exports.recoverStaleJobs = recoverStaleJobs;
-exports.cleanupOldJobs = cleanupOldJobs;
-const connection_1 = require("../db/connection");
-const uuid_1 = require("uuid");
-const os = __importStar(require("os"));
-// ============================================================
-// WORKER IDENTITY
-// ============================================================
// Worker identity is computed once and cached for the process lifetime.
let _workerId = null;
/**
 * Get or create a unique worker ID for this process.
 *
 * Prefers POD_NAME (set via fieldRef in Kubernetes) for readability;
 * otherwise builds "<hostname>-<pid>-<uuid8>" once and caches it.
 */
function getWorkerId() {
    if (_workerId) {
        return _workerId;
    }
    const podName = process.env.POD_NAME;
    if (podName) {
        _workerId = podName;
    }
    else {
        const suffix = (0, uuid_1.v4)().slice(0, 8);
        _workerId = `${os.hostname()}-${process.pid}-${suffix}`;
    }
    return _workerId;
}
/**
 * Hostname used for worker tracking: POD_NAME when set (Kubernetes),
 * otherwise the OS hostname.
 */
function getWorkerHostname() {
    const podName = process.env.POD_NAME;
    return podName ? podName : os.hostname();
}
-// ============================================================
-// JOB ENQUEUEING
-// ============================================================
/**
 * Enqueue a new job for processing.
 *
 * @param options {jobType, dispensaryId?, priority?, metadata?, maxRetries?}
 * @returns the new job id, or null when a pending/running job already
 *   exists for the same dispensary.
 *
 * NOTE(review): the duplicate check below is check-then-insert, not atomic;
 * two concurrent callers could both pass it and insert duplicates. Confirm
 * whether this is acceptable or should be backed by a partial unique index.
 */
async function enqueueJob(options) {
    const { jobType, dispensaryId, priority = 0, metadata, maxRetries = 3, } = options;
    // Check if there's already a pending/running job for this dispensary
    // (only when the job is dispensary-scoped).
    if (dispensaryId) {
        const { rows: existing } = await (0, connection_1.query)(`SELECT id FROM dispensary_crawl_jobs
     WHERE dispensary_id = $1 AND status IN ('pending', 'running')
     LIMIT 1`, [dispensaryId]);
        if (existing.length > 0) {
            console.log(`[JobQueue] Skipping enqueue - job already exists for dispensary ${dispensaryId}`);
            return null;
        }
    }
    // Metadata is stored as a JSON string; dispensary_id may be null for
    // jobs that are not tied to a single dispensary.
    const { rows } = await (0, connection_1.query)(`INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
     VALUES ($1, $2, 'pending', $3, $4, $5, NOW())
     RETURNING id`, [jobType, dispensaryId || null, priority, maxRetries, metadata ? JSON.stringify(metadata) : null]);
    const jobId = rows[0].id;
    console.log(`[JobQueue] Enqueued job ${jobId} (type=${jobType}, dispensary=${dispensaryId})`);
    return jobId;
}
/**
 * Bulk enqueue jobs for multiple dispensaries.
 *
 * Skips dispensaries that already have a pending/running job.
 *
 * @param jobType Job type stored on each row
 * @param dispensaryIds Dispensary ids to enqueue (may contain duplicates)
 * @param options {priority?, metadata?} applied to every enqueued job
 * @returns {{enqueued: number, skipped: number}} — skipped counts input ids
 *   (including duplicates) that were NOT enqueued, so enqueued + skipped
 *   always equals dispensaryIds.length.
 */
async function bulkEnqueueJobs(jobType, dispensaryIds, options = {}) {
    const { priority = 0, metadata } = options;
    // Get dispensaries that already have pending/running jobs
    const { rows: existing } = await (0, connection_1.query)(`SELECT DISTINCT dispensary_id FROM dispensary_crawl_jobs
     WHERE dispensary_id = ANY($1) AND status IN ('pending', 'running')`, [dispensaryIds]);
    const existingSet = new Set(existing.map((r) => r.dispensary_id));
    // Filter out dispensaries with existing jobs
    const toEnqueue = dispensaryIds.filter(id => !existingSet.has(id));
    // BUGFIX: report skipped relative to the input list. The previous code
    // returned existingSet.size here but dispensaryIds.length on the early
    // return below — inconsistent, and existingSet.size undercounts when
    // dispensaryIds contains duplicates.
    const skipped = dispensaryIds.length - toEnqueue.length;
    if (toEnqueue.length === 0) {
        return { enqueued: 0, skipped };
    }
    // Bulk insert - each row needs 4 params: job_type, dispensary_id, priority, metadata
    const metadataJson = metadata ? JSON.stringify(metadata) : null;
    const values = toEnqueue.map((_, i) => {
        const offset = i * 4;
        return `($${offset + 1}, $${offset + 2}, 'pending', $${offset + 3}, 3, $${offset + 4}, NOW())`;
    }).join(', ');
    const params = [];
    toEnqueue.forEach(dispensaryId => {
        params.push(jobType, dispensaryId, priority, metadataJson);
    });
    await (0, connection_1.query)(`INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
     VALUES ${values}`, params);
    console.log(`[JobQueue] Bulk enqueued ${toEnqueue.length} jobs, skipped ${skipped}`);
    return { enqueued: toEnqueue.length, skipped };
}
-// ============================================================
-// JOB CLAIMING (with locking)
-// ============================================================
/**
 * Claim the next available job from the queue.
 *
 * The claim is an UPDATE whose target row comes from a
 * SELECT ... FOR UPDATE SKIP LOCKED subquery, so concurrent workers never
 * double-claim: a row locked by another transaction is skipped rather than
 * waited on. Jobs are ordered by priority (desc) then created_at (asc).
 *
 * @param options {workerId, jobTypes?, lockDurationMinutes?}
 * @returns the claimed job mapped via mapDbRowToJob, or null when no
 *   pending job is available.
 */
async function claimNextJob(options) {
    const { workerId, jobTypes, lockDurationMinutes = 30 } = options;
    const hostname = getWorkerHostname();
    const client = await (0, connection_1.getClient)();
    try {
        await client.query('BEGIN');
        // Build job type filter — appended as $4 only when jobTypes is given.
        let typeFilter = '';
        const params = [workerId, hostname, lockDurationMinutes];
        let paramIndex = 4;
        if (jobTypes && jobTypes.length > 0) {
            typeFilter = `AND job_type = ANY($${paramIndex})`;
            params.push(jobTypes);
            paramIndex++;
        }
        // Claim the next pending job using FOR UPDATE SKIP LOCKED.
        // ($3 || ' minutes')::INTERVAL builds the lock duration from the
        // numeric parameter via text concatenation.
        const { rows } = await client.query(`UPDATE dispensary_crawl_jobs
     SET
       status = 'running',
       claimed_by = $1,
       claimed_at = NOW(),
       worker_id = $1,
       worker_hostname = $2,
       started_at = NOW(),
       locked_until = NOW() + ($3 || ' minutes')::INTERVAL,
       last_heartbeat_at = NOW(),
       updated_at = NOW()
     WHERE id = (
       SELECT id FROM dispensary_crawl_jobs
       WHERE status = 'pending'
       ${typeFilter}
       ORDER BY priority DESC, created_at ASC
       FOR UPDATE SKIP LOCKED
       LIMIT 1
     )
     RETURNING *`, params);
        await client.query('COMMIT');
        // Empty result set simply means the queue had nothing claimable.
        if (rows.length === 0) {
            return null;
        }
        const job = mapDbRowToJob(rows[0]);
        console.log(`[JobQueue] Worker ${workerId} claimed job ${job.id} (type=${job.jobType}, dispensary=${job.dispensaryId})`);
        return job;
    }
    catch (error) {
        // Roll back the claim transaction before propagating the error.
        await client.query('ROLLBACK');
        throw error;
    }
    finally {
        // Always return the client to the pool.
        client.release();
    }
}
-// ============================================================
-// JOB PROGRESS & COMPLETION
-// ============================================================
/**
 * Update job progress counters (for live monitoring).
 *
 * Only the fields present on `progress` are written; every call also
 * refreshes last_heartbeat_at and updated_at.
 *
 * @param jobId Job row id
 * @param progress {productsFound?, productsUpserted?, snapshotsCreated?,
 *   currentPage?, totalPages?}
 */
async function updateJobProgress(jobId, progress) {
    // [progress key, db column] pairs in a fixed order so the generated SQL
    // (and its parameter numbering) is deterministic.
    const FIELD_COLUMNS = [
        ['productsFound', 'products_found'],
        ['productsUpserted', 'products_upserted'],
        ['snapshotsCreated', 'snapshots_created'],
        ['currentPage', 'current_page'],
        ['totalPages', 'total_pages'],
    ];
    const updates = ['last_heartbeat_at = NOW()', 'updated_at = NOW()'];
    const params = [];
    let paramIndex = 1;
    for (const [field, column] of FIELD_COLUMNS) {
        if (progress[field] !== undefined) {
            updates.push(`${column} = $${paramIndex++}`);
            params.push(progress[field]);
        }
    }
    // The job id is always the final parameter.
    params.push(jobId);
    await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params);
}
/**
 * Keep a running job alive: refresh its heartbeat and extend the lock by
 * 30 minutes. A no-op unless the job is still in 'running' status.
 */
async function heartbeat(jobId) {
    const sql = `UPDATE dispensary_crawl_jobs
     SET last_heartbeat_at = NOW(), locked_until = NOW() + INTERVAL '30 minutes'
     WHERE id = $1 AND status = 'running'`;
    await (0, connection_1.query)(sql, [jobId]);
}
/**
 * Mark a job as completed and record its final counters.
 *
 * COALESCE keeps any previously reported progress value when the caller
 * omits a counter (passes undefined/null).
 *
 * @param jobId Job row id
 * @param result {productsFound?, productsUpserted?, snapshotsCreated?}
 */
async function completeJob(jobId, result) {
    const { productsFound, productsUpserted, snapshotsCreated } = result;
    const sql = `UPDATE dispensary_crawl_jobs
     SET
       status = 'completed',
       completed_at = NOW(),
       products_found = COALESCE($2, products_found),
       products_upserted = COALESCE($3, products_upserted),
       snapshots_created = COALESCE($4, snapshots_created),
       updated_at = NOW()
     WHERE id = $1`;
    await (0, connection_1.query)(sql, [jobId, productsFound, productsUpserted, snapshotsCreated]);
    console.log(`[JobQueue] Job ${jobId} completed`);
}
/**
 * Mark a job as failed.
 *
 * If retries remain, the job is reset to 'pending' (claim/worker fields
 * cleared, retry_count incremented) so another worker can pick it up.
 * Otherwise it is marked 'failed' permanently.
 *
 * @param jobId Job row id
 * @param errorMessage Stored in error_message either way
 * @returns true when the job was re-queued for retry; false when it failed
 *   permanently or the job id was not found.
 */
async function failJob(jobId, errorMessage) {
    // Check if we should retry
    const { rows } = await (0, connection_1.query)(`SELECT retry_count, max_retries FROM dispensary_crawl_jobs WHERE id = $1`, [jobId]);
    if (rows.length === 0)
        return false;
    const { retry_count, max_retries } = rows[0];
    if (retry_count < max_retries) {
        // Re-queue for retry: clear every claim/worker field so the job looks
        // freshly enqueued apart from retry_count and error_message.
        await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
     SET
       status = 'pending',
       retry_count = retry_count + 1,
       claimed_by = NULL,
       claimed_at = NULL,
       worker_id = NULL,
       worker_hostname = NULL,
       started_at = NULL,
       locked_until = NULL,
       last_heartbeat_at = NULL,
       error_message = $2,
       updated_at = NOW()
     WHERE id = $1`, [jobId, errorMessage]);
        console.log(`[JobQueue] Job ${jobId} failed, re-queued for retry (${retry_count + 1}/${max_retries})`);
        return true; // Will retry
    }
    else {
        // Mark as failed permanently
        await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
     SET
       status = 'failed',
       completed_at = NOW(),
       error_message = $2,
       updated_at = NOW()
     WHERE id = $1`, [jobId, errorMessage]);
        console.log(`[JobQueue] Job ${jobId} failed permanently after ${retry_count} retries`);
        return false; // No more retries
    }
}
-// ============================================================
-// QUEUE MONITORING
-// ============================================================
/**
 * Queue-level statistics sourced from the v_queue_stats view.
 * Missing/empty view output yields all-zero counts and a null duration.
 */
async function getQueueStats() {
    const { rows } = await (0, connection_1.query)(`SELECT * FROM v_queue_stats`);
    const stats = rows[0] || {};
    // View columns arrive as strings; coerce with a shared helper.
    const toInt = (value) => parseInt(value || '0', 10);
    return {
        pending: toInt(stats.pending_jobs),
        running: toInt(stats.running_jobs),
        completed1h: toInt(stats.completed_1h),
        failed1h: toInt(stats.failed_1h),
        activeWorkers: toInt(stats.active_workers),
        avgDurationSeconds: stats.avg_duration_seconds ? parseFloat(stats.avg_duration_seconds) : null,
    };
}
/**
 * List active workers from the v_active_workers view, mapped to
 * camelCase objects with numeric counters and Date timestamps.
 */
async function getActiveWorkers() {
    const { rows } = await (0, connection_1.query)(`SELECT * FROM v_active_workers`);
    const toInt = (value) => parseInt(value || '0', 10);
    return rows.map((row) => ({
        workerId: row.worker_id,
        hostname: row.worker_hostname,
        currentJobs: toInt(row.current_jobs),
        totalProductsFound: toInt(row.total_products_found),
        totalProductsUpserted: toInt(row.total_products_upserted),
        totalSnapshots: toInt(row.total_snapshots),
        firstClaimedAt: new Date(row.first_claimed_at),
        lastHeartbeat: row.last_heartbeat ? new Date(row.last_heartbeat) : null,
    }));
}
/**
 * All currently running jobs, newest first, with the dispensary's name and
 * city joined in when available.
 */
async function getRunningJobs() {
    const sql = `SELECT cj.*, d.name as dispensary_name, d.city
     FROM dispensary_crawl_jobs cj
     LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
     WHERE cj.status = 'running'
     ORDER BY cj.started_at DESC`;
    const { rows } = await (0, connection_1.query)(sql);
    return rows.map((row) => mapDbRowToJob(row));
}
/**
 * Recover jobs whose worker died without completing them.
 *
 * Any 'running' job whose heartbeat is older than staleMinutes is reset to
 * 'pending' (claim fields cleared, retry_count incremented) — but only if
 * it still has retries left; exhausted jobs are left alone.
 *
 * @param staleMinutes Heartbeat age threshold (default 15)
 * @returns number of jobs recovered
 */
async function recoverStaleJobs(staleMinutes = 15) {
    const { rowCount } = await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
     SET
       status = 'pending',
       claimed_by = NULL,
       claimed_at = NULL,
       worker_id = NULL,
       worker_hostname = NULL,
       started_at = NULL,
       locked_until = NULL,
       error_message = 'Recovered from stale worker',
       retry_count = retry_count + 1,
       updated_at = NOW()
     WHERE status = 'running'
       AND last_heartbeat_at < NOW() - ($1 || ' minutes')::INTERVAL
       AND retry_count < max_retries`, [staleMinutes]);
    const recovered = rowCount || 0;
    if (recovered > 0) {
        console.log(`[JobQueue] Recovered ${recovered} stale jobs`);
    }
    return recovered;
}
/**
 * Delete completed/failed jobs older than the given number of days.
 *
 * @param olderThanDays Retention window (default 7)
 * @returns number of rows deleted
 */
async function cleanupOldJobs(olderThanDays = 7) {
    const { rowCount } = await (0, connection_1.query)(`DELETE FROM dispensary_crawl_jobs
     WHERE status IN ('completed', 'failed')
       AND completed_at < NOW() - ($1 || ' days')::INTERVAL`, [olderThanDays]);
    const removed = rowCount || 0;
    if (removed > 0) {
        console.log(`[JobQueue] Cleaned up ${removed} old jobs`);
    }
    return removed;
}
-// ============================================================
-// HELPERS
-// ============================================================
/**
 * Map a dispensary_crawl_jobs row (snake_case DB columns) to a job object
 * (camelCase). Nullable timestamps become Date or null; counters default
 * to 0; max_retries defaults to 3. Join columns from getRunningJobs
 * (dispensary_name, city) are copied over only when present.
 */
function mapDbRowToJob(row) {
    const asDate = (value) => (value ? new Date(value) : null);
    const job = {
        id: row.id,
        jobType: row.job_type,
        dispensaryId: row.dispensary_id,
        status: row.status,
        priority: row.priority || 0,
        retryCount: row.retry_count || 0,
        maxRetries: row.max_retries || 3,
        claimedBy: row.claimed_by,
        claimedAt: asDate(row.claimed_at),
        workerHostname: row.worker_hostname,
        startedAt: asDate(row.started_at),
        completedAt: asDate(row.completed_at),
        errorMessage: row.error_message,
        productsFound: row.products_found || 0,
        productsUpserted: row.products_upserted || 0,
        snapshotsCreated: row.snapshots_created || 0,
        currentPage: row.current_page || 0,
        totalPages: row.total_pages,
        lastHeartbeatAt: asDate(row.last_heartbeat_at),
        metadata: row.metadata,
        createdAt: new Date(row.created_at),
    };
    if (row.dispensary_name) {
        job.dispensaryName = row.dispensary_name;
    }
    if (row.city) {
        job.city = row.city;
    }
    return job;
}
diff --git a/backend/dist/dutchie-az/services/menu-detection.js b/backend/dist/dutchie-az/services/menu-detection.js
deleted file mode 100644
index 4a91bf93..00000000
--- a/backend/dist/dutchie-az/services/menu-detection.js
+++ /dev/null
@@ -1,909 +0,0 @@
-"use strict";
-/**
- * Menu Detection Service
- *
- * Detects menu provider (dutchie, treez, jane, etc.) from dispensary menu_url
- * and resolves platform_dispensary_id for dutchie stores.
- *
- * This service:
- * 1. Iterates dispensaries with unknown/missing menu_type or platform_dispensary_id
- * 2. Detects provider from menu_url patterns
- * 3. For dutchie: extracts cName and resolves platform_dispensary_id via GraphQL
- * 4. Logs results to job_run_logs
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.crawlWebsiteForMenuLinks = crawlWebsiteForMenuLinks;
-exports.detectProviderFromUrl = detectProviderFromUrl;
-exports.detectAndResolveDispensary = detectAndResolveDispensary;
-exports.runBulkDetection = runBulkDetection;
-exports.executeMenuDetectionJob = executeMenuDetectionJob;
-exports.getDetectionStats = getDetectionStats;
-exports.getDispensariesNeedingDetection = getDispensariesNeedingDetection;
-const connection_1 = require("../db/connection");
-const discovery_1 = require("./discovery");
-const graphql_client_1 = require("./graphql-client");
// Explicit column list for the dispensaries table (avoids SELECT * issues
// with schema differences between environments).
// NOTE(review): presumably must stay in sync with the columns read by
// discovery_1.mapDbRowToDispensary — verify when adding/removing columns.
const DISPENSARY_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  provider_detection_data, created_at, updated_at
`;
// ============================================================
// PROVIDER DETECTION PATTERNS
// ============================================================
// Ordered table of menu providers and the URL regexes that identify them.
// Order matters: detectProviderFromUrl returns the first provider whose
// pattern matches, so dutchie (including white-label domains) is tried first.
const PROVIDER_URL_PATTERNS = [
    // We detect provider based on the actual menu link we find, not just the site domain.
    {
        provider: 'dutchie',
        patterns: [
            /dutchie\.com/i,
            /\/embedded-menu\//i,
            /\/dispensary\/[A-Z]{2}-/i, // e.g., /dispensary/AZ-store-name
            /dutchie-plus/i,
            /curaleaf\.com/i, // Curaleaf uses Dutchie platform
            /livewithsol\.com/i, // Sol Flower uses Dutchie platform
        ],
    },
    {
        provider: 'treez',
        patterns: [
            /treez\.io/i,
            /shop\.treez/i,
            /treez-ecommerce/i,
        ],
    },
    {
        provider: 'jane',
        patterns: [
            /jane\.co/i,
            /iheartjane\.com/i,
            /embed\.iheartjane/i,
        ],
    },
    {
        provider: 'weedmaps',
        patterns: [
            /weedmaps\.com/i,
            /menu\.weedmaps/i,
        ],
    },
    {
        provider: 'leafly',
        patterns: [
            /leafly\.com/i,
            /order\.leafly/i,
        ],
    },
    {
        provider: 'meadow',
        patterns: [
            /getmeadow\.com/i,
            /meadow\.co/i,
        ],
    },
    {
        provider: 'blaze',
        patterns: [
            /blaze\.me/i,
            /blazepos\.com/i,
        ],
    },
    {
        provider: 'flowhub',
        patterns: [
            /flowhub\.com/i,
            /flowhub\.co/i,
        ],
    },
    {
        provider: 'dispense',
        patterns: [
            /dispense\.io/i,
            /dispenseapp\.com/i,
        ],
    },
];
/**
 * Link patterns that suggest a menu or ordering page.
 * Used by crawlWebsiteForMenuLinks to pick which same-domain links (menu/
 * order/shop paths) or known provider domains are worth following.
 */
const MENU_LINK_PATTERNS = [
    /\/menu/i,
    /\/order/i,
    /\/shop/i,
    /\/products/i,
    /\/dispensary/i,
    /\/store/i,
    /curaleaf\.com/i,
    /dutchie\.com/i,
    /treez\.io/i,
    /jane\.co/i,
    /iheartjane\.com/i,
    /weedmaps\.com/i,
    /leafly\.com/i,
    /getmeadow\.com/i,
    /blaze\.me/i,
    /flowhub\.com/i,
    /dispense\.io/i,
];
/**
 * True when the URL points at a Curaleaf store/dispensary page, i.e.
 * curaleaf.com/stores/... or curaleaf.com/dispensary/... — false for
 * null/undefined/empty input.
 */
function isCuraleafUrl(url) {
    return Boolean(url) && /curaleaf\.com\/(stores|dispensary)\//i.test(url);
}
/**
 * Fetch a page and extract candidate links from its HTML.
 *
 * Returns { links, error? }. Special case: when the HTML embeds a Dutchie
 * reactEnv dispensaryId (24-char hex), returns a single synthetic token
 * "dutchie-reactenv:<id>" instead of real URLs — callers decode it.
 *
 * @param url Absolute URL to fetch
 * @param timeout Abort the request after this many ms (default 10s)
 */
async function fetchPageLinks(url, timeout = 10000) {
    try {
        // Abort the fetch when the timeout elapses.
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), timeout);
        // Use Googlebot User-Agent to bypass age gates on dispensary websites
        const response = await fetch(url, {
            signal: controller.signal,
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            },
            redirect: 'follow',
        });
        clearTimeout(timeoutId);
        if (!response.ok) {
            return { links: [], error: `HTTP ${response.status}` };
        }
        const html = await response.text();
        // Quick check: if the page contains reactEnv.dispensaryId, treat it as Dutchie.
        // Use direct match for dispensaryId - the [^}]* pattern fails with nested braces in JSON
        const reactEnvMatch = /"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i.exec(html);
        if (reactEnvMatch && reactEnvMatch[1]) {
            return { links: [`dutchie-reactenv:${reactEnvMatch[1]}`] };
        }
        // Extract all href attributes from anchor tags (any href=, in fact).
        const linkRegex = /href=["']([^"']+)["']/gi;
        const links = [];
        let match;
        while ((match = linkRegex.exec(html)) !== null) {
            const href = match[1];
            // Convert relative URLs to absolute against the fetched page's URL.
            try {
                const absoluteUrl = new URL(href, url).href;
                links.push(absoluteUrl);
            }
            catch {
                // Skip invalid URLs
            }
        }
        // Also look for iframe src attributes (common for embedded menus).
        // NOTE(review): this regex matches ANY src= attribute (img, script,
        // iframe alike); only provider-matching URLs are kept, which filters
        // the noise.
        const iframeRegex = /src=["']([^"']+)["']/gi;
        while ((match = iframeRegex.exec(html)) !== null) {
            const src = match[1];
            try {
                const absoluteUrl = new URL(src, url).href;
                // Only add if it matches a known provider pattern.
                for (const { patterns } of PROVIDER_URL_PATTERNS) {
                    if (patterns.some(p => p.test(absoluteUrl))) {
                        links.push(absoluteUrl);
                        break;
                    }
                }
            }
            catch {
                // Skip invalid URLs
            }
        }
        return { links: [...new Set(links)] }; // Deduplicate
    }
    catch (error) {
        // AbortError means our own timeout fired.
        if (error.name === 'AbortError') {
            return { links: [], error: 'Timeout' };
        }
        return { links: [], error: error.message };
    }
}
/**
 * Crawl a dispensary's website to find menu provider links.
 *
 * Strategy:
 * 1. Fetch the homepage and extract all links
 * 2. Check the homepage HTML for an embedded Dutchie reactEnv dispensaryId
 * 3. Check for reactEnv tokens returned by fetchPageLinks
 * 4. Look for links that match known provider patterns (dutchie, treez, etc.)
 * 5. Otherwise collect menu/order/shop links and follow up to 3 of them
 * 6. Check each followed page's links for provider patterns
 *
 * @param websiteUrl Dispensary website URL
 * @returns {{menuUrl, provider, foundLinks, crawledPages,
 *   platformDispensaryId?, error?}}
 */
async function crawlWebsiteForMenuLinks(websiteUrl) {
    console.log(`[WebsiteCrawl] Crawling ${websiteUrl} for menu links...`);
    const result = {
        menuUrl: null,
        provider: 'unknown',
        foundLinks: [],
        crawledPages: [],
    };
    // Normalize URL. NOTE(review): a bare domain without a scheme throws in
    // the URL constructor and is reported as invalid here — the https
    // fallback below only triggers for non-http schemes. Confirm intent.
    let baseUrl;
    try {
        baseUrl = new URL(websiteUrl);
        if (!baseUrl.protocol.startsWith('http')) {
            baseUrl = new URL(`https://${websiteUrl}`);
        }
    }
    catch {
        result.error = 'Invalid website URL';
        return result;
    }
    // Step 1: Fetch the homepage and collect its links.
    const homepage = baseUrl.href;
    result.crawledPages.push(homepage);
    const { links: homepageLinks, error: homepageError } = await fetchPageLinks(homepage);
    if (homepageError) {
        result.error = `Failed to fetch homepage: ${homepageError}`;
        return result;
    }
    result.foundLinks = homepageLinks;
    // Step 2: Try to extract reactEnv.dispensaryId (embedded Dutchie menu)
    // from the homepage HTML. NOTE(review): this fetches the homepage a
    // second time (fetchPageLinks above already fetched it).
    try {
        // Use Googlebot User-Agent to bypass age gates on dispensary websites
        const resp = await fetch(homepage, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            },
            redirect: 'follow',
        });
        if (resp.ok) {
            const html = await resp.text();
            // Look for dispensaryId directly - the [^}]* pattern fails with nested braces
            const reactEnvMatch = /"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i.exec(html);
            if (reactEnvMatch && reactEnvMatch[1]) {
                result.provider = 'dutchie';
                result.menuUrl = homepage;
                result.platformDispensaryId = reactEnvMatch[1];
                console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${reactEnvMatch[1]} on homepage ${homepage}`);
                return result;
            }
        }
    }
    catch (err) {
        // Non-fatal: fall through to link-based detection.
        console.log(`[WebsiteCrawl] reactEnv check failed for ${homepage}: ${err.message}`);
    }
    // Step 3: Check for reactEnv tokens from fetchPageLinks (encoded as
    // "dutchie-reactenv:<id>").
    for (const link of homepageLinks) {
        const reactEnvToken = /^dutchie-reactenv:(.+)$/.exec(link);
        if (reactEnvToken) {
            result.menuUrl = homepage;
            result.provider = 'dutchie';
            result.platformDispensaryId = reactEnvToken[1];
            console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${reactEnvToken[1]} on ${homepage}`);
            return result;
        }
    }
    // Step 4: Check for direct provider matches in homepage links.
    for (const link of homepageLinks) {
        for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
            if (patterns.some(p => p.test(link))) {
                console.log(`[WebsiteCrawl] Found ${provider} link on homepage: ${link}`);
                result.menuUrl = link;
                result.provider = provider;
                return result;
            }
        }
    }
    // Step 5: Find menu/order/shop links worth following.
    const menuLinks = homepageLinks.filter(link => {
        // Must be same domain or a known provider domain
        try {
            const linkUrl = new URL(link);
            const isSameDomain = linkUrl.hostname === baseUrl.hostname ||
                linkUrl.hostname.endsWith(`.${baseUrl.hostname}`);
            const isProviderDomain = PROVIDER_URL_PATTERNS.some(({ patterns }) => patterns.some(p => p.test(link)));
            const isMenuPath = MENU_LINK_PATTERNS.some(p => p.test(link));
            return (isSameDomain && isMenuPath) || isProviderDomain;
        }
        catch {
            return false;
        }
    });
    console.log(`[WebsiteCrawl] Found ${menuLinks.length} potential menu links to follow`);
    // Step 6: Follow menu links (limit to 3 to avoid excessive crawling).
    for (const menuLink of menuLinks.slice(0, 3)) {
        // Skip if we've already crawled this page
        if (result.crawledPages.includes(menuLink))
            continue;
        // Check if this link itself is a provider URL
        for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
            if (patterns.some(p => p.test(menuLink))) {
                console.log(`[WebsiteCrawl] Menu link is a ${provider} URL: ${menuLink}`);
                result.menuUrl = menuLink;
                result.provider = provider;
                return result;
            }
        }
        result.crawledPages.push(menuLink);
        // Rate limit between page fetches.
        await new Promise(r => setTimeout(r, 500));
        const { links: pageLinks, error: pageError } = await fetchPageLinks(menuLink);
        if (pageError) {
            console.log(`[WebsiteCrawl] Failed to fetch ${menuLink}: ${pageError}`);
            continue;
        }
        result.foundLinks.push(...pageLinks);
        // Check for provider matches among the followed page's links.
        for (const link of pageLinks) {
            for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
                if (patterns.some(p => p.test(link))) {
                    console.log(`[WebsiteCrawl] Found ${provider} link on ${menuLink}: ${link}`);
                    result.menuUrl = link;
                    result.provider = provider;
                    return result;
                }
            }
        }
    }
    // Nothing matched: provider stays 'unknown', menuUrl stays null.
    console.log(`[WebsiteCrawl] No menu provider found on ${websiteUrl}`);
    return result;
}
-// ============================================================
-// CORE DETECTION FUNCTIONS
-// ============================================================
-/**
- * Detect menu provider from a URL
- */
-function detectProviderFromUrl(menuUrl) {
- if (!menuUrl)
- return 'unknown';
- for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
- for (const pattern of patterns) {
- if (pattern.test(menuUrl)) {
- return provider;
- }
- }
- }
- // Check if it's a custom website (has a domain but doesn't match known providers)
- try {
- const url = new URL(menuUrl);
- if (url.hostname && !url.hostname.includes('localhost')) {
- return 'custom';
- }
- }
- catch {
- // Invalid URL
- }
- return 'unknown';
-}
-/**
- * Detect provider and resolve platform ID for a single dispensary
- */
-async function detectAndResolveDispensary(dispensaryId) {
- console.log(`[MenuDetection] Processing dispensary ${dispensaryId}...`);
- // Get dispensary record
- const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [dispensaryId]);
- if (rows.length === 0) {
- return {
- dispensaryId,
- dispensaryName: 'Unknown',
- previousMenuType: null,
- detectedProvider: 'unknown',
- cName: null,
- platformDispensaryId: null,
- success: false,
- error: 'Dispensary not found',
- };
- }
- const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]);
- let menuUrl = dispensary.menuUrl;
- const previousMenuType = dispensary.menuType || null;
- const website = dispensary.website;
- // If menu_url is null or empty, try to discover it by crawling the dispensary website
- if (!menuUrl || menuUrl.trim() === '') {
- console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`);
- // Check if website is available
- if (!website || website.trim() === '') {
- console.log(`[MenuDetection] ${dispensary.name}: No website available - marking as not crawlable`);
- await (0, connection_1.query)(`
- UPDATE dispensaries SET
- menu_type = 'unknown',
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object(
- 'detected_provider', 'unknown'::text,
- 'detection_method', 'no_data'::text,
- 'detected_at', NOW(),
- 'resolution_error', 'No menu_url and no website available'::text,
- 'not_crawlable', true,
- 'website_crawl_attempted', false
- ),
- updated_at = NOW()
- WHERE id = $1
- `, [dispensaryId]);
- return {
- dispensaryId,
- dispensaryName: dispensary.name,
- previousMenuType,
- detectedProvider: 'unknown',
- cName: null,
- platformDispensaryId: null,
- success: true,
- error: 'No menu_url and no website available - marked as not crawlable',
- };
- }
- // Crawl the website to find menu provider links
- console.log(`[MenuDetection] ${dispensary.name}: Crawling website ${website} for menu links...`);
- const crawlResult = await crawlWebsiteForMenuLinks(website);
- if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') {
- // SUCCESS: Found a menu URL from website crawl!
- console.log(`[MenuDetection] ${dispensary.name}: Found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`);
- menuUrl = crawlResult.menuUrl;
- // Update the dispensary with the discovered menu_url
- await (0, connection_1.query)(`
- UPDATE dispensaries SET
- menu_url = $1,
- menu_type = $2,
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object(
- 'detected_provider', $2::text,
- 'detection_method', 'website_crawl'::text,
- 'detected_at', NOW(),
- 'website_crawled', $3::text,
- 'website_crawl_pages', $4::jsonb,
- 'not_crawlable', false
- ),
- updated_at = NOW()
- WHERE id = $5
- `, [
- crawlResult.menuUrl,
- crawlResult.provider,
- website,
- JSON.stringify(crawlResult.crawledPages),
- dispensaryId
- ]);
- // Continue with full detection flow using the discovered menu_url
- }
- else {
- // Website crawl failed to find a menu provider
- const errorReason = crawlResult.error || 'No menu provider links found on website';
- console.log(`[MenuDetection] ${dispensary.name}: Website crawl failed - ${errorReason}`);
- await (0, connection_1.query)(`
- UPDATE dispensaries SET
- menu_type = 'unknown',
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object(
- 'detected_provider', 'unknown'::text,
- 'detection_method', 'website_crawl'::text,
- 'detected_at', NOW(),
- 'website_crawled', $1::text,
- 'website_crawl_pages', $2::jsonb,
- 'resolution_error', $3::text,
- 'not_crawlable', true
- ),
- updated_at = NOW()
- WHERE id = $4
- `, [
- website,
- JSON.stringify(crawlResult.crawledPages),
- errorReason,
- dispensaryId
- ]);
- return {
- dispensaryId,
- dispensaryName: dispensary.name,
- previousMenuType,
- detectedProvider: 'unknown',
- cName: null,
- platformDispensaryId: null,
- success: true,
- error: `Website crawl failed: ${errorReason}`,
- };
- }
- }
- // Detect provider from URL
- const detectedProvider = detectProviderFromUrl(menuUrl);
- console.log(`[MenuDetection] ${dispensary.name}: Detected provider = ${detectedProvider} from URL: ${menuUrl}`);
- // Initialize result
- const result = {
- dispensaryId,
- dispensaryName: dispensary.name,
- previousMenuType,
- detectedProvider,
- cName: null,
- platformDispensaryId: null,
- success: false,
- };
- // If not dutchie, just update menu_type (non-dutchie providers)
- // Note: curaleaf.com and livewithsol.com are detected directly as 'dutchie' via PROVIDER_URL_PATTERNS
- if (detectedProvider !== 'dutchie') {
- await (0, connection_1.query)(`
- UPDATE dispensaries SET
- menu_type = $1,
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object(
- 'detected_provider', $1::text,
- 'detection_method', 'url_pattern'::text,
- 'detected_at', NOW(),
- 'not_crawlable', false
- ),
- updated_at = NOW()
- WHERE id = $2
- `, [detectedProvider, dispensaryId]);
- result.success = true;
- console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}`);
- return result;
- }
- // For dutchie: extract cName or platformId from menu_url
- const extraction = (0, discovery_1.extractFromMenuUrl)(menuUrl);
- if (!extraction) {
- result.error = `Could not extract cName or platformId from menu_url: ${menuUrl}`;
- await (0, connection_1.query)(`
- UPDATE dispensaries SET
- menu_type = 'dutchie',
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object(
- 'detected_provider', 'dutchie'::text,
- 'detection_method', 'url_pattern'::text,
- 'detected_at', NOW(),
- 'resolution_error', $1::text,
- 'not_crawlable', true
- ),
- updated_at = NOW()
- WHERE id = $2
- `, [result.error, dispensaryId]);
- console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
- return result;
- }
- // If URL contains platform_dispensary_id directly (e.g., /api/v2/embedded-menu/.js), skip GraphQL resolution
- if (extraction.type === 'platformId') {
- const platformId = extraction.value;
- result.platformDispensaryId = platformId;
- result.success = true;
- await (0, connection_1.query)(`
- UPDATE dispensaries SET
- menu_type = 'dutchie',
- platform_dispensary_id = $1,
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object(
- 'detected_provider', 'dutchie'::text,
- 'detection_method', 'url_direct_platform_id'::text,
- 'detected_at', NOW(),
- 'platform_id_source', 'url_embedded'::text,
- 'platform_id_resolved', true,
- 'platform_id_resolved_at', NOW(),
- 'resolution_error', NULL::text,
- 'not_crawlable', false
- ),
- updated_at = NOW()
- WHERE id = $2
- `, [platformId, dispensaryId]);
- console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`);
- return result;
- }
- // Otherwise, we have a cName that needs GraphQL resolution
- const cName = extraction.value;
- result.cName = cName;
- // Resolve platform_dispensary_id from cName
- console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`);
- try {
- const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName);
- if (platformId) {
- result.platformDispensaryId = platformId;
- result.success = true;
- await (0, connection_1.query)(`
- UPDATE dispensaries SET
- menu_type = 'dutchie',
- platform_dispensary_id = $1,
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object(
- 'detected_provider', 'dutchie'::text,
- 'detection_method', 'url_pattern'::text,
- 'detected_at', NOW(),
- 'cname_extracted', $2::text,
- 'platform_id_resolved', true,
- 'platform_id_resolved_at', NOW(),
- 'resolution_error', NULL::text,
- 'not_crawlable', false
- ),
- updated_at = NOW()
- WHERE id = $3
- `, [platformId, cName, dispensaryId]);
- console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`);
- }
- else {
- // cName resolution failed - try crawling website as fallback
- console.log(`[MenuDetection] ${dispensary.name}: cName "${cName}" not found on Dutchie, trying website crawl fallback...`);
- if (website && website.trim() !== '') {
- const fallbackCrawl = await crawlWebsiteForMenuLinks(website);
- if (fallbackCrawl.menuUrl && fallbackCrawl.provider === 'dutchie') {
- // Found Dutchie menu via website crawl!
- console.log(`[MenuDetection] ${dispensary.name}: Found Dutchie menu via website crawl: ${fallbackCrawl.menuUrl}`);
- // Extract from the new menu URL
- const newExtraction = (0, discovery_1.extractFromMenuUrl)(fallbackCrawl.menuUrl);
- if (newExtraction) {
- let fallbackPlatformId = null;
- if (newExtraction.type === 'platformId') {
- fallbackPlatformId = newExtraction.value;
- }
- else {
- // Try to resolve the new cName
- fallbackPlatformId = await (0, graphql_client_1.resolveDispensaryId)(newExtraction.value);
- }
- if (fallbackPlatformId) {
- result.platformDispensaryId = fallbackPlatformId;
- result.success = true;
- result.cName = newExtraction.value;
- await (0, connection_1.query)(`
- UPDATE dispensaries SET
- menu_type = 'dutchie',
- menu_url = $1,
- platform_dispensary_id = $2,
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object(
- 'detected_provider', 'dutchie'::text,
- 'detection_method', 'website_crawl_fallback'::text,
- 'detected_at', NOW(),
- 'original_cname', $3::text,
- 'fallback_cname', $4::text,
- 'website_crawled', $5::text,
- 'platform_id_resolved', true,
- 'platform_id_resolved_at', NOW(),
- 'not_crawlable', false
- ),
- updated_at = NOW()
- WHERE id = $6
- `, [fallbackCrawl.menuUrl, fallbackPlatformId, cName, newExtraction.value, website, dispensaryId]);
- console.log(`[MenuDetection] ${dispensary.name}: Resolved via website crawl, platform ID = ${fallbackPlatformId}`);
- return result;
- }
- }
- }
- }
- // Website crawl fallback didn't work either
- result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`;
- await (0, connection_1.query)(`
- UPDATE dispensaries SET
- menu_type = 'dutchie',
- platform_dispensary_id = NULL,
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object(
- 'detected_provider', 'dutchie'::text,
- 'detection_method', 'url_pattern'::text,
- 'detected_at', NOW(),
- 'cname_extracted', $1::text,
- 'platform_id_resolved', false,
- 'resolution_error', $2::text,
- 'website_crawl_attempted', true,
- 'not_crawlable', true
- ),
- updated_at = NOW()
- WHERE id = $3
- `, [cName, result.error, dispensaryId]);
- console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`);
- }
- }
- catch (error) {
- result.error = `Resolution failed: ${error.message}`;
- await (0, connection_1.query)(`
- UPDATE dispensaries SET
- menu_type = 'dutchie',
- provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
- jsonb_build_object(
- 'detected_provider', 'dutchie'::text,
- 'detection_method', 'url_pattern'::text,
- 'detected_at', NOW(),
- 'cname_extracted', $1::text,
- 'platform_id_resolved', false,
- 'resolution_error', $2::text,
- 'not_crawlable', true
- ),
- updated_at = NOW()
- WHERE id = $3
- `, [cName, result.error, dispensaryId]);
- console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`);
- }
- return result;
-}
-/**
- * Run bulk detection on all dispensaries with unknown/missing menu_type or platform_dispensary_id
- * Also includes dispensaries with no menu_url but with a website (for website crawl discovery)
- */
-async function runBulkDetection(options = {}) {
- const { state, onlyUnknown = true, onlyMissingPlatformId = false, includeWebsiteCrawl = true, includeDutchieMissingPlatformId = true, limit, } = options;
- console.log('[MenuDetection] Starting bulk detection...');
- // Build query to find dispensaries needing detection
- // Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable)
- // Optionally includes dutchie stores missing platform ID
- let whereClause = `WHERE (
- menu_url IS NOT NULL
- ${includeWebsiteCrawl ? `OR (
- menu_url IS NULL
- AND website IS NOT NULL
- AND website != ''
- AND (provider_detection_data IS NULL OR NOT (provider_detection_data->>'not_crawlable')::boolean)
- )` : ''}
- ${includeDutchieMissingPlatformId ? `OR (
- menu_type = 'dutchie' AND platform_dispensary_id IS NULL
- )` : ''}
- )`;
- const params = [];
- let paramIndex = 1;
- if (state) {
- whereClause += ` AND state = $${paramIndex++}`;
- params.push(state);
- }
- // Handle filters for unknown and/or missing platform IDs
- if (onlyUnknown && onlyMissingPlatformId) {
- whereClause += ` AND (
- (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
- OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)
- )`;
- }
- else if (onlyUnknown) {
- whereClause += ` AND (
- (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')
- ${includeDutchieMissingPlatformId ? `OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)` : ''}
- )`;
- }
- else if (onlyMissingPlatformId) {
- whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
- }
- else if (includeDutchieMissingPlatformId) {
- // Always attempt to resolve dutchie stores missing platform IDs
- whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`;
- }
- let query_str = `
- SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
- ${whereClause}
- ORDER BY name
- `;
- if (limit) {
- query_str += ` LIMIT $${paramIndex}`;
- params.push(limit);
- }
- const { rows: dispensaries } = await (0, connection_1.query)(query_str, params);
- console.log(`[MenuDetection] Found ${dispensaries.length} dispensaries to process (includeWebsiteCrawl=${includeWebsiteCrawl})`);
- const result = {
- totalProcessed: 0,
- totalSucceeded: 0,
- totalFailed: 0,
- totalSkipped: 0,
- results: [],
- errors: [],
- };
- for (const row of dispensaries) {
- result.totalProcessed++;
- try {
- const detectionResult = await detectAndResolveDispensary(row.id);
- result.results.push(detectionResult);
- if (detectionResult.success) {
- result.totalSucceeded++;
- }
- else {
- result.totalFailed++;
- if (detectionResult.error) {
- result.errors.push(`${detectionResult.dispensaryName}: ${detectionResult.error}`);
- }
- }
- // Rate limit between requests
- await new Promise(r => setTimeout(r, 1000));
- }
- catch (error) {
- result.totalFailed++;
- result.errors.push(`${row.name || row.id}: ${error.message}`);
- }
- }
- console.log(`[MenuDetection] Bulk detection complete: ${result.totalSucceeded} succeeded, ${result.totalFailed} failed`);
- return result;
-}
-// ============================================================
-// SCHEDULED JOB EXECUTOR
-// ============================================================
-/**
- * Execute the menu detection job (called by scheduler)
- */
-async function executeMenuDetectionJob(config = {}) {
- const state = config.state || 'AZ';
- const onlyUnknown = config.onlyUnknown !== false;
- // Default to true - always try to resolve platform IDs for dutchie stores
- const onlyMissingPlatformId = config.onlyMissingPlatformId !== false;
- const includeDutchieMissingPlatformId = config.includeDutchieMissingPlatformId !== false;
- console.log(`[MenuDetection] Executing scheduled job for state=${state}...`);
- try {
- const result = await runBulkDetection({
- state,
- onlyUnknown,
- onlyMissingPlatformId,
- includeDutchieMissingPlatformId,
- });
- const status = result.totalFailed === 0 ? 'success' :
- result.totalSucceeded === 0 ? 'error' : 'partial';
- return {
- status,
- itemsProcessed: result.totalProcessed,
- itemsSucceeded: result.totalSucceeded,
- itemsFailed: result.totalFailed,
- errorMessage: result.errors.length > 0 ? result.errors.slice(0, 5).join('; ') : undefined,
- metadata: {
- state,
- onlyUnknown,
- onlyMissingPlatformId,
- providerCounts: countByProvider(result.results),
- },
- };
- }
- catch (error) {
- return {
- status: 'error',
- itemsProcessed: 0,
- itemsSucceeded: 0,
- itemsFailed: 0,
- errorMessage: error.message,
- };
- }
-}
-/**
- * Count results by detected provider
- */
-function countByProvider(results) {
- const counts = {};
- for (const r of results) {
- counts[r.detectedProvider] = (counts[r.detectedProvider] || 0) + 1;
- }
- return counts;
-}
-// ============================================================
-// UTILITY FUNCTIONS
-// ============================================================
-/**
- * Get detection stats for dashboard
- */
-async function getDetectionStats() {
- const { rows } = await (0, connection_1.query)(`
- SELECT
- COUNT(*) as total,
- COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != '' AND menu_type != 'unknown') as with_menu_type,
- COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
- COUNT(*) FILTER (WHERE menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')) as needs_detection
- FROM dispensaries
- WHERE state = 'AZ'
- `);
- const stats = rows[0] || {};
- // Get provider breakdown
- const { rows: providerRows } = await (0, connection_1.query)(`
- SELECT menu_type, COUNT(*) as count
- FROM dispensaries
- WHERE state = 'AZ' AND menu_type IS NOT NULL AND menu_type != ''
- GROUP BY menu_type
- ORDER BY count DESC
- `);
- const byProvider = {};
- for (const row of providerRows) {
- byProvider[row.menu_type] = parseInt(row.count, 10);
- }
- return {
- totalDispensaries: parseInt(stats.total || '0', 10),
- withMenuType: parseInt(stats.with_menu_type || '0', 10),
- withPlatformId: parseInt(stats.with_platform_id || '0', 10),
- needsDetection: parseInt(stats.needs_detection || '0', 10),
- byProvider,
- };
-}
-/**
- * Get dispensaries needing detection
- * Includes dispensaries with website but no menu_url for website crawl discovery
- */
-async function getDispensariesNeedingDetection(options = {}) {
- const { state = 'AZ', limit = 100, includeWebsiteCrawl = true } = options;
- const { rows } = await (0, connection_1.query)(`
- SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
- WHERE state = $1
- AND (
- (menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown'
- OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)))
- ${includeWebsiteCrawl ? `OR (
- menu_url IS NULL
- AND website IS NOT NULL
- AND website != ''
- AND (provider_detection_data IS NULL OR NOT (provider_detection_data->>'not_crawlable')::boolean)
- )` : ''}
- )
- ORDER BY name
- LIMIT $2
- `, [state, limit]);
- return rows.map(discovery_1.mapDbRowToDispensary);
-}
diff --git a/backend/dist/dutchie-az/services/product-crawler.js b/backend/dist/dutchie-az/services/product-crawler.js
deleted file mode 100644
index b831835d..00000000
--- a/backend/dist/dutchie-az/services/product-crawler.js
+++ /dev/null
@@ -1,843 +0,0 @@
-"use strict";
-/**
- * Dutchie AZ Product Crawler Service
- *
- * Crawls products from Dutchie dispensaries and stores them in the dutchie_az database.
- * Handles normalization from GraphQL response to database entities.
- *
- * IMPORTANT: Uses chunked batch processing per CLAUDE.md Rule #15 to avoid OOM.
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.normalizeProduct = normalizeProduct;
-exports.normalizeSnapshot = normalizeSnapshot;
-exports.crawlDispensaryProducts = crawlDispensaryProducts;
-exports.crawlAllArizonaDispensaries = crawlAllArizonaDispensaries;
-const connection_1 = require("../db/connection");
-const graphql_client_1 = require("./graphql-client");
-const discovery_1 = require("./discovery");
-const types_1 = require("../types");
-const image_storage_1 = require("../../utils/image-storage");
-// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
-const DISPENSARY_COLUMNS = `
- id, name, slug, city, state, zip, address, latitude, longitude,
- menu_type, menu_url, platform_dispensary_id, website,
- provider_detection_data, created_at, updated_at
-`;
-// ============================================================
-// BATCH PROCESSING CONFIGURATION
-// ============================================================
-/** Chunk size for batch DB writes (per CLAUDE.md Rule #15) */
-const BATCH_CHUNK_SIZE = 100;
-// ============================================================
-// NORMALIZATION FUNCTIONS
-// ============================================================
-/**
- * Convert price to cents
- */
-function toCents(price) {
- if (price === undefined || price === null)
- return undefined;
- return Math.round(price * 100);
-}
-/**
- * Get min value from array of numbers
- */
-function getMin(arr) {
- if (!arr || arr.length === 0)
- return undefined;
- return Math.min(...arr.filter((n) => n !== null && n !== undefined));
-}
-/**
- * Get max value from array of numbers
- */
-function getMax(arr) {
- if (!arr || arr.length === 0)
- return undefined;
- return Math.max(...arr.filter((n) => n !== null && n !== undefined));
-}
-/**
- * Normalize a value to boolean
- * Handles Dutchie API returning {} or [] or other non-boolean values
- * that would cause "invalid input syntax for type boolean" errors
- */
-function normBool(v, defaultVal = false) {
- if (v === true)
- return true;
- if (v === false)
- return false;
- // Log unexpected object/array values once for debugging
- if (v !== null && v !== undefined && typeof v === 'object') {
- console.warn(`[normBool] Unexpected object value, coercing to ${defaultVal}:`, JSON.stringify(v));
- }
- return defaultVal;
-}
-/**
- * Normalize a value to Date or undefined
- * Handles Dutchie API returning {} or [] or other non-date values
- * that would cause "invalid input syntax for type timestamp" errors
- */
-function normDate(v) {
- if (!v)
- return undefined;
- // Reject objects/arrays that aren't dates
- if (typeof v === 'object' && !(v instanceof Date)) {
- console.warn(`[normDate] Unexpected object value, ignoring:`, JSON.stringify(v));
- return undefined;
- }
- // Try parsing
- const d = new Date(v);
- if (isNaN(d.getTime())) {
- console.warn(`[normDate] Invalid date value, ignoring:`, v);
- return undefined;
- }
- return d;
-}
-/**
- * Extract cName (Dutchie slug) from menuUrl or dispensary slug
- * Handles URL formats:
- * - https://dutchie.com/embedded-menu/AZ-Deeply-Rooted -> AZ-Deeply-Rooted
- * - https://dutchie.com/dispensary/sol-flower-dispensary-mcclintock -> sol-flower-dispensary-mcclintock
- * Falls back to dispensary.slug if menuUrl extraction fails
- */
-function extractCName(dispensary) {
- if (dispensary.menuUrl) {
- try {
- const url = new URL(dispensary.menuUrl);
- // Extract last path segment: /embedded-menu/X or /dispensary/X
- const segments = url.pathname.split('/').filter(Boolean);
- if (segments.length >= 2) {
- const cName = segments[segments.length - 1];
- if (cName) {
- console.log(`[ProductCrawler] Extracted cName "${cName}" from menuUrl`);
- return cName;
- }
- }
- }
- catch (e) {
- console.warn(`[ProductCrawler] Failed to parse menuUrl: ${dispensary.menuUrl}`);
- }
- }
- // Fallback to slug
- console.log(`[ProductCrawler] Using dispensary slug "${dispensary.slug}" as cName`);
- return dispensary.slug;
-}
-/**
- * Normalize a POSMetaData.children entry to DutchieProductOptionSnapshot
- */
-function normalizeOption(child) {
- return {
- optionId: child.canonicalID || child.canonicalPackageId || child.canonicalSKU || child.option || 'unknown',
- canonicalId: child.canonicalID,
- canonicalPackageId: child.canonicalPackageId,
- canonicalSKU: child.canonicalSKU,
- canonicalName: child.canonicalName,
- canonicalCategory: child.canonicalCategory,
- canonicalCategoryId: child.canonicalCategoryId,
- canonicalBrandId: child.canonicalBrandId,
- canonicalBrandName: child.canonicalBrandName,
- canonicalStrainId: child.canonicalStrainId,
- canonicalVendorId: child.canonicalVendorId,
- optionLabel: child.option,
- packageQuantity: child.packageQuantity,
- recEquivalent: child.recEquivalent,
- standardEquivalent: child.standardEquivalent,
- priceCents: toCents(child.price),
- recPriceCents: toCents(child.recPrice),
- medPriceCents: toCents(child.medPrice),
- quantity: child.quantity,
- quantityAvailable: child.quantityAvailable,
- kioskQuantityAvailable: child.kioskQuantityAvailable,
- activeBatchTags: child.activeBatchTags,
- canonicalImgUrl: child.canonicalImgUrl,
- canonicalLabResultUrl: child.canonicalLabResultUrl,
- canonicalEffectivePotencyMg: child.canonicalEffectivePotencyMg,
- rawChildPayload: child,
- };
-}
-/**
- * Normalize a raw Dutchie product to DutchieProduct (canonical identity)
- */
-function normalizeProduct(raw, dispensaryId, platformDispensaryId) {
- return {
- dispensaryId,
- platform: 'dutchie',
- externalProductId: raw._id || raw.id || '',
- platformDispensaryId,
- cName: raw.cName,
- name: raw.Name,
- // Brand
- brandName: raw.brandName || raw.brand?.name,
- brandId: raw.brandId || raw.brand?.id,
- brandLogoUrl: raw.brandLogo || raw.brand?.imageUrl,
- // Classification
- type: raw.type,
- subcategory: raw.subcategory,
- strainType: raw.strainType,
- provider: raw.provider,
- // Potency
- thc: raw.THC,
- thcContent: raw.THCContent?.range?.[0],
- cbd: raw.CBD,
- cbdContent: raw.CBDContent?.range?.[0],
- cannabinoidsV2: raw.cannabinoidsV2,
- effects: raw.effects,
- // Status / flags
- status: raw.Status,
- medicalOnly: normBool(raw.medicalOnly, false),
- recOnly: normBool(raw.recOnly, false),
- featured: normBool(raw.featured, false),
- comingSoon: normBool(raw.comingSoon, false),
- certificateOfAnalysisEnabled: normBool(raw.certificateOfAnalysisEnabled, false),
- isBelowThreshold: normBool(raw.isBelowThreshold, false),
- isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false),
- optionsBelowThreshold: normBool(raw.optionsBelowThreshold, false),
- optionsBelowKioskThreshold: normBool(raw.optionsBelowKioskThreshold, false),
- // Derived stock status
- stockStatus: (0, types_1.deriveStockStatus)(raw),
- totalQuantityAvailable: (0, types_1.calculateTotalQuantity)(raw),
- // Images
- primaryImageUrl: raw.Image || raw.images?.[0]?.url,
- images: raw.images,
- // Misc
- measurements: raw.measurements,
- weight: typeof raw.weight === 'number' ? String(raw.weight) : raw.weight,
- pastCNames: raw.pastCNames,
- createdAtDutchie: normDate(raw.createdAt),
- updatedAtDutchie: normDate(raw.updatedAt),
- latestRawPayload: raw,
- };
-}
-/**
- * Normalize a raw Dutchie product to DutchieProductSnapshot (time-series data)
- */
-function normalizeSnapshot(raw, dutchieProductId, dispensaryId, platformDispensaryId, pricingType, crawlMode = 'mode_a') {
- const children = raw.POSMetaData?.children || [];
- const options = children.map(normalizeOption);
- // Aggregate prices from various sources
- const recPrices = raw.recPrices || [];
- const medPrices = raw.medicalPrices || [];
- const recSpecialPrices = raw.recSpecialPrices || [];
- const medSpecialPrices = raw.medicalSpecialPrices || [];
- const wholesalePrices = raw.wholesalePrices || [];
- // Also consider child prices
- const childRecPrices = children.map((c) => c.recPrice).filter((p) => p !== undefined);
- const childMedPrices = children.map((c) => c.medPrice).filter((p) => p !== undefined);
- const childPrices = children.map((c) => c.price).filter((p) => p !== undefined);
- // Aggregate inventory - use calculateTotalQuantity for proper null handling
- const totalQty = (0, types_1.calculateTotalQuantity)(raw);
- const hasAnyKioskQty = children.some(c => typeof c.kioskQuantityAvailable === 'number');
- const totalKioskQty = hasAnyKioskQty
- ? children.reduce((sum, c) => sum + (c.kioskQuantityAvailable || 0), 0)
- : null;
- // Determine if on special
- const isOnSpecial = raw.special === true ||
- (raw.specialData?.saleSpecials && raw.specialData.saleSpecials.length > 0) ||
- (recSpecialPrices.length > 0 && recSpecialPrices[0] !== null) ||
- (medSpecialPrices.length > 0 && medSpecialPrices[0] !== null);
- return {
- dutchieProductId,
- dispensaryId,
- platformDispensaryId,
- externalProductId: raw._id || raw.id || '',
- pricingType,
- crawlMode,
- status: raw.Status,
- featured: normBool(raw.featured, false),
- special: normBool(isOnSpecial, false),
- medicalOnly: normBool(raw.medicalOnly, false),
- recOnly: normBool(raw.recOnly, false),
- // Product was present in feed
- isPresentInFeed: true,
- // Derived stock status
- stockStatus: (0, types_1.deriveStockStatus)(raw),
- // Price summary
- recMinPriceCents: toCents(getMin([...recPrices, ...childRecPrices, ...childPrices])),
- recMaxPriceCents: toCents(getMax([...recPrices, ...childRecPrices, ...childPrices])),
- recMinSpecialPriceCents: toCents(getMin(recSpecialPrices)),
- medMinPriceCents: toCents(getMin([...medPrices, ...childMedPrices])),
- medMaxPriceCents: toCents(getMax([...medPrices, ...childMedPrices])),
- medMinSpecialPriceCents: toCents(getMin(medSpecialPrices)),
- wholesaleMinPriceCents: toCents(getMin(wholesalePrices)),
- // Inventory summary - null = unknown, 0 = all OOS
- totalQuantityAvailable: totalQty,
- totalKioskQuantityAvailable: totalKioskQty,
- manualInventory: normBool(raw.manualInventory, false),
- isBelowThreshold: normBool(raw.isBelowThreshold, false),
- isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false),
- options,
- rawPayload: raw,
- crawledAt: new Date(),
- };
-}
-// ============================================================
-// DATABASE OPERATIONS
-// ============================================================
-/**
- * Upsert a DutchieProduct record
- */
-async function upsertProduct(product) {
- const result = await (0, connection_1.query)(`
- INSERT INTO dutchie_products (
- dispensary_id, platform, external_product_id, platform_dispensary_id,
- c_name, name, brand_name, brand_id, brand_logo_url,
- type, subcategory, strain_type, provider,
- thc, thc_content, cbd, cbd_content, cannabinoids_v2, effects,
- status, medical_only, rec_only, featured, coming_soon, certificate_of_analysis_enabled,
- is_below_threshold, is_below_kiosk_threshold, options_below_threshold, options_below_kiosk_threshold,
- stock_status, total_quantity_available,
- primary_image_url, images, measurements, weight, past_c_names,
- created_at_dutchie, updated_at_dutchie, latest_raw_payload, updated_at
- ) VALUES (
- $1, $2, $3, $4,
- $5, $6, $7, $8, $9,
- $10, $11, $12, $13,
- $14, $15, $16, $17, $18, $19,
- $20, $21, $22, $23, $24, $25,
- $26, $27, $28, $29,
- $30, $31,
- $32, $33, $34, $35, $36,
- $37, $38, $39, NOW()
- )
- ON CONFLICT (dispensary_id, external_product_id) DO UPDATE SET
- c_name = EXCLUDED.c_name,
- name = EXCLUDED.name,
- brand_name = EXCLUDED.brand_name,
- brand_id = EXCLUDED.brand_id,
- brand_logo_url = EXCLUDED.brand_logo_url,
- type = EXCLUDED.type,
- subcategory = EXCLUDED.subcategory,
- strain_type = EXCLUDED.strain_type,
- provider = EXCLUDED.provider,
- thc = EXCLUDED.thc,
- thc_content = EXCLUDED.thc_content,
- cbd = EXCLUDED.cbd,
- cbd_content = EXCLUDED.cbd_content,
- cannabinoids_v2 = EXCLUDED.cannabinoids_v2,
- effects = EXCLUDED.effects,
- status = EXCLUDED.status,
- medical_only = EXCLUDED.medical_only,
- rec_only = EXCLUDED.rec_only,
- featured = EXCLUDED.featured,
- coming_soon = EXCLUDED.coming_soon,
- certificate_of_analysis_enabled = EXCLUDED.certificate_of_analysis_enabled,
- is_below_threshold = EXCLUDED.is_below_threshold,
- is_below_kiosk_threshold = EXCLUDED.is_below_kiosk_threshold,
- options_below_threshold = EXCLUDED.options_below_threshold,
- options_below_kiosk_threshold = EXCLUDED.options_below_kiosk_threshold,
- stock_status = EXCLUDED.stock_status,
- total_quantity_available = EXCLUDED.total_quantity_available,
- primary_image_url = EXCLUDED.primary_image_url,
- images = EXCLUDED.images,
- measurements = EXCLUDED.measurements,
- weight = EXCLUDED.weight,
- past_c_names = EXCLUDED.past_c_names,
- created_at_dutchie = EXCLUDED.created_at_dutchie,
- updated_at_dutchie = EXCLUDED.updated_at_dutchie,
- latest_raw_payload = EXCLUDED.latest_raw_payload,
- updated_at = NOW()
- RETURNING id
- `, [
- product.dispensaryId,
- product.platform,
- product.externalProductId,
- product.platformDispensaryId,
- product.cName,
- product.name,
- product.brandName,
- product.brandId,
- product.brandLogoUrl,
- product.type,
- product.subcategory,
- product.strainType,
- product.provider,
- product.thc,
- product.thcContent,
- product.cbd,
- product.cbdContent,
- product.cannabinoidsV2 ? JSON.stringify(product.cannabinoidsV2) : null,
- product.effects ? JSON.stringify(product.effects) : null,
- product.status,
- product.medicalOnly,
- product.recOnly,
- product.featured,
- product.comingSoon,
- product.certificateOfAnalysisEnabled,
- product.isBelowThreshold,
- product.isBelowKioskThreshold,
- product.optionsBelowThreshold,
- product.optionsBelowKioskThreshold,
- product.stockStatus,
- product.totalQuantityAvailable,
- product.primaryImageUrl,
- product.images ? JSON.stringify(product.images) : null,
- product.measurements ? JSON.stringify(product.measurements) : null,
- product.weight,
- product.pastCNames,
- product.createdAtDutchie,
- product.updatedAtDutchie,
- product.latestRawPayload ? JSON.stringify(product.latestRawPayload) : null,
- ]);
- return result.rows[0].id;
-}
-/**
- * Download product image and update local image URLs
- * Skips download if local image already exists for this product+URL combo
- */
-async function downloadAndUpdateProductImage(productId, dispensaryId, externalProductId, primaryImageUrl) {
- if (!primaryImageUrl) {
- return { downloaded: false, error: 'No image URL' };
- }
- try {
- // Check if we already have this image locally
- const exists = await (0, image_storage_1.imageExists)(dispensaryId, externalProductId, primaryImageUrl);
- if (exists) {
- return { downloaded: false };
- }
- // Download and process the image
- const result = await (0, image_storage_1.downloadProductImage)(primaryImageUrl, dispensaryId, externalProductId);
- if (!result.success || !result.urls) {
- return { downloaded: false, error: result.error };
- }
- // Update the product record with local image URLs
- await (0, connection_1.query)(`
- UPDATE dutchie_products
- SET
- local_image_url = $1,
- local_image_thumb_url = $2,
- local_image_medium_url = $3,
- original_image_url = COALESCE(original_image_url, primary_image_url),
- updated_at = NOW()
- WHERE id = $4
- `, [result.urls.full, result.urls.thumb, result.urls.medium, productId]);
- return { downloaded: true };
- }
- catch (error) {
- return { downloaded: false, error: error.message };
- }
-}
-/**
- * Insert a snapshot record
- */
-async function insertSnapshot(snapshot) {
- const result = await (0, connection_1.query)(`
- INSERT INTO dutchie_product_snapshots (
- dutchie_product_id, dispensary_id, platform_dispensary_id, external_product_id,
- pricing_type, crawl_mode, status, featured, special, medical_only, rec_only,
- is_present_in_feed, stock_status,
- rec_min_price_cents, rec_max_price_cents, rec_min_special_price_cents,
- med_min_price_cents, med_max_price_cents, med_min_special_price_cents,
- wholesale_min_price_cents,
- total_quantity_available, total_kiosk_quantity_available, manual_inventory,
- is_below_threshold, is_below_kiosk_threshold,
- options, raw_payload, crawled_at
- ) VALUES (
- $1, $2, $3, $4,
- $5, $6, $7, $8, $9, $10, $11,
- $12, $13,
- $14, $15, $16,
- $17, $18, $19,
- $20,
- $21, $22, $23,
- $24, $25,
- $26, $27, $28
- )
- RETURNING id
- `, [
- snapshot.dutchieProductId,
- snapshot.dispensaryId,
- snapshot.platformDispensaryId,
- snapshot.externalProductId,
- snapshot.pricingType,
- snapshot.crawlMode,
- snapshot.status,
- snapshot.featured,
- snapshot.special,
- snapshot.medicalOnly,
- snapshot.recOnly,
- snapshot.isPresentInFeed ?? true,
- snapshot.stockStatus,
- snapshot.recMinPriceCents,
- snapshot.recMaxPriceCents,
- snapshot.recMinSpecialPriceCents,
- snapshot.medMinPriceCents,
- snapshot.medMaxPriceCents,
- snapshot.medMinSpecialPriceCents,
- snapshot.wholesaleMinPriceCents,
- snapshot.totalQuantityAvailable,
- snapshot.totalKioskQuantityAvailable,
- snapshot.manualInventory,
- snapshot.isBelowThreshold,
- snapshot.isBelowKioskThreshold,
- JSON.stringify(snapshot.options || []),
- JSON.stringify(snapshot.rawPayload || {}),
- snapshot.crawledAt,
- ]);
- return result.rows[0].id;
-}
-// ============================================================
-// BATCH DATABASE OPERATIONS (per CLAUDE.md Rule #15)
-// ============================================================
-/**
- * Helper to chunk an array into smaller arrays
- */
-function chunkArray(array, size) {
- const chunks = [];
- for (let i = 0; i < array.length; i += size) {
- chunks.push(array.slice(i, i + size));
- }
- return chunks;
-}
-/**
- * Batch upsert products - processes in chunks to avoid OOM
- * Returns a Map of externalProductId -> database id
- */
-async function batchUpsertProducts(products) {
- const productIdMap = new Map();
- const chunks = chunkArray(products, BATCH_CHUNK_SIZE);
- console.log(`[ProductCrawler] Batch upserting ${products.length} products in ${chunks.length} chunks of ${BATCH_CHUNK_SIZE}...`);
- for (let i = 0; i < chunks.length; i++) {
- const chunk = chunks[i];
- // Process each product in the chunk
- for (const product of chunk) {
- try {
- const id = await upsertProduct(product);
- if (product.externalProductId) {
- productIdMap.set(product.externalProductId, id);
- }
- }
- catch (error) {
- console.error(`[ProductCrawler] Error upserting product ${product.externalProductId}:`, error.message);
- }
- }
- // Log progress
- if ((i + 1) % 5 === 0 || i === chunks.length - 1) {
- console.log(`[ProductCrawler] Upserted chunk ${i + 1}/${chunks.length} (${productIdMap.size} products so far)`);
- }
- }
- return productIdMap;
-}
-/**
- * Batch insert snapshots - processes in chunks to avoid OOM
- */
-async function batchInsertSnapshots(snapshots) {
- const chunks = chunkArray(snapshots, BATCH_CHUNK_SIZE);
- let inserted = 0;
- console.log(`[ProductCrawler] Batch inserting ${snapshots.length} snapshots in ${chunks.length} chunks of ${BATCH_CHUNK_SIZE}...`);
- for (let i = 0; i < chunks.length; i++) {
- const chunk = chunks[i];
- // Process each snapshot in the chunk
- for (const snapshot of chunk) {
- try {
- await insertSnapshot(snapshot);
- inserted++;
- }
- catch (error) {
- console.error(`[ProductCrawler] Error inserting snapshot for ${snapshot.externalProductId}:`, error.message);
- }
- }
- // Log progress
- if ((i + 1) % 5 === 0 || i === chunks.length - 1) {
- console.log(`[ProductCrawler] Inserted snapshot chunk ${i + 1}/${chunks.length} (${inserted} snapshots so far)`);
- }
- }
- return inserted;
-}
-/**
- * Update dispensary last_crawled_at and product_count
- */
-async function updateDispensaryCrawlStats(dispensaryId, productCount) {
- // Update last_crawl_at to track when we last crawled
- // Skip product_count as that column may not exist
- await (0, connection_1.query)(`
- UPDATE dispensaries
- SET last_crawl_at = NOW(), updated_at = NOW()
- WHERE id = $1
- `, [dispensaryId]);
-}
-/**
- * Mark products as missing from feed
- * Creates a snapshot with isPresentInFeed=false and stockStatus='missing_from_feed'
- * for products that were NOT in the UNION of Mode A and Mode B product lists
- *
- * IMPORTANT: Uses UNION of both modes to avoid false positives
- * If the union is empty (possible outage), we skip marking to avoid data corruption
- */
-async function markMissingProducts(dispensaryId, platformDispensaryId, modeAProductIds, modeBProductIds, pricingType) {
- // Build UNION of Mode A + Mode B product IDs
- const unionProductIds = new Set([...Array.from(modeAProductIds), ...Array.from(modeBProductIds)]);
- // OUTAGE DETECTION: If union is empty, something went wrong - don't mark anything as missing
- if (unionProductIds.size === 0) {
- console.warn('[ProductCrawler] OUTAGE DETECTED: Both Mode A and Mode B returned 0 products. Skipping missing product marking.');
- return 0;
- }
- // Get all existing products for this dispensary that were not in the UNION
- const { rows: missingProducts } = await (0, connection_1.query)(`
- SELECT id, external_product_id, name
- FROM dutchie_products
- WHERE dispensary_id = $1
- AND external_product_id NOT IN (SELECT unnest($2::text[]))
- `, [dispensaryId, Array.from(unionProductIds)]);
- if (missingProducts.length === 0) {
- return 0;
- }
- console.log(`[ProductCrawler] Marking ${missingProducts.length} products as missing from feed (union of ${modeAProductIds.size} Mode A + ${modeBProductIds.size} Mode B = ${unionProductIds.size} unique)...`);
- const crawledAt = new Date();
- // Build all missing snapshots first (per CLAUDE.md Rule #15 - batch writes)
- const missingSnapshots = missingProducts.map(product => ({
- dutchieProductId: product.id,
- dispensaryId,
- platformDispensaryId,
- externalProductId: product.external_product_id,
- pricingType,
- crawlMode: 'mode_a', // Use mode_a for missing snapshots (convention)
- status: undefined,
- featured: false,
- special: false,
- medicalOnly: false,
- recOnly: false,
- isPresentInFeed: false,
- stockStatus: 'missing_from_feed',
- totalQuantityAvailable: undefined, // null = unknown, not 0
- manualInventory: false,
- isBelowThreshold: false,
- isBelowKioskThreshold: false,
- options: [],
- rawPayload: { _missingFromFeed: true, lastKnownName: product.name },
- crawledAt,
- }));
- // Batch insert missing snapshots
- const snapshotsInserted = await batchInsertSnapshots(missingSnapshots);
- // Batch update product stock status in chunks
- const productIds = missingProducts.map(p => p.id);
- const productChunks = chunkArray(productIds, BATCH_CHUNK_SIZE);
- console.log(`[ProductCrawler] Updating ${productIds.length} product statuses in ${productChunks.length} chunks...`);
- for (const chunk of productChunks) {
- await (0, connection_1.query)(`
- UPDATE dutchie_products
- SET stock_status = 'missing_from_feed', total_quantity_available = NULL, updated_at = NOW()
- WHERE id = ANY($1::int[])
- `, [chunk]);
- }
- console.log(`[ProductCrawler] Marked ${snapshotsInserted} products as missing from feed`);
- return snapshotsInserted;
-}
-/**
- * Process a batch of products from a single crawl mode
- * IMPORTANT: Stores ALL products, never filters before DB
- * Uses chunked batch processing per CLAUDE.md Rule #15 to avoid OOM
- * Returns the set of external product IDs that were processed
- */
-async function processProducts(products, dispensary, pricingType, crawlMode, options = {}) {
- const { downloadImages = true } = options;
- const productIds = new Set();
- let imagesDownloaded = 0;
- let imageErrors = 0;
- console.log(`[ProductCrawler] Processing ${products.length} products using chunked batch processing...`);
- // Step 1: Normalize all products and collect IDs
- const normalizedProducts = [];
- const rawByExternalId = new Map();
- for (const raw of products) {
- const externalId = raw._id || raw.id || '';
- productIds.add(externalId);
- rawByExternalId.set(externalId, raw);
- const normalized = normalizeProduct(raw, dispensary.id, dispensary.platformDispensaryId);
- normalizedProducts.push(normalized);
- }
- // Step 2: Batch upsert products (chunked)
- const productIdMap = await batchUpsertProducts(normalizedProducts);
- const upserted = productIdMap.size;
- // Step 3: Create and batch insert snapshots (chunked)
- // IMPORTANT: Do this BEFORE image downloads to ensure snapshots are created even if images fail
- const snapshots = [];
- for (const [externalId, productId] of Array.from(productIdMap.entries())) {
- const raw = rawByExternalId.get(externalId);
- if (raw) {
- const snapshot = normalizeSnapshot(raw, productId, dispensary.id, dispensary.platformDispensaryId, pricingType, crawlMode);
- snapshots.push(snapshot);
- }
- }
- const snapshotsInserted = await batchInsertSnapshots(snapshots);
- // Step 4: Download images in chunks (if enabled)
- // This is done AFTER snapshots to ensure core data is saved even if image downloads fail
- if (downloadImages) {
- const imageChunks = chunkArray(Array.from(productIdMap.entries()), BATCH_CHUNK_SIZE);
- console.log(`[ProductCrawler] Downloading images in ${imageChunks.length} chunks...`);
- for (let i = 0; i < imageChunks.length; i++) {
- const chunk = imageChunks[i];
- for (const [externalId, productId] of chunk) {
- const normalized = normalizedProducts.find(p => p.externalProductId === externalId);
- if (normalized?.primaryImageUrl) {
- try {
- const imageResult = await downloadAndUpdateProductImage(productId, dispensary.id, externalId, normalized.primaryImageUrl);
- if (imageResult.downloaded) {
- imagesDownloaded++;
- }
- else if (imageResult.error && imageResult.error !== 'No image URL') {
- imageErrors++;
- }
- }
- catch (error) {
- imageErrors++;
- }
- }
- }
- if ((i + 1) % 5 === 0 || i === imageChunks.length - 1) {
- console.log(`[ProductCrawler] Image download chunk ${i + 1}/${imageChunks.length} (${imagesDownloaded} downloaded, ${imageErrors} errors)`);
- }
- }
- }
- // Clear references to help GC
- normalizedProducts.length = 0;
- rawByExternalId.clear();
- return { upserted, snapshots: snapshotsInserted, productIds, imagesDownloaded, imageErrors };
-}
-async function crawlDispensaryProducts(dispensary, pricingType = 'rec', options = {}) {
- const { useBothModes = true, downloadImages = true, onProgress } = options;
- const startTime = Date.now();
- if (!dispensary.platformDispensaryId) {
- return {
- success: false,
- dispensaryId: dispensary.id,
- productsFound: 0,
- productsFetched: 0,
- productsUpserted: 0,
- snapshotsCreated: 0,
- errorMessage: 'Missing platformDispensaryId',
- durationMs: Date.now() - startTime,
- };
- }
- try {
- console.log(`[ProductCrawler] Crawling ${dispensary.name} (${dispensary.platformDispensaryId})...`);
- let totalUpserted = 0;
- let totalSnapshots = 0;
- let totalImagesDownloaded = 0;
- let totalImageErrors = 0;
- let modeAProducts = 0;
- let modeBProducts = 0;
- let missingMarked = 0;
- // Track product IDs separately for each mode (needed for missing product detection)
- const modeAProductIds = new Set();
- const modeBProductIds = new Set();
- // Extract cName for this specific dispensary (used for Puppeteer session & headers)
- const cName = extractCName(dispensary);
- console.log(`[ProductCrawler] Using cName="${cName}" for dispensary ${dispensary.name}`);
- if (useBothModes) {
- // Run two-mode crawl for maximum coverage
- const bothResults = await (0, graphql_client_1.fetchAllProductsBothModes)(dispensary.platformDispensaryId, pricingType, { cName });
- modeAProducts = bothResults.modeA.products.length;
- modeBProducts = bothResults.modeB.products.length;
- console.log(`[ProductCrawler] Two-mode crawl: Mode A=${modeAProducts}, Mode B=${modeBProducts}, Merged=${bothResults.merged.products.length}`);
- // Collect Mode A product IDs
- for (const p of bothResults.modeA.products) {
- modeAProductIds.add(p._id);
- }
- // Collect Mode B product IDs
- for (const p of bothResults.modeB.products) {
- modeBProductIds.add(p._id);
- }
- // Process MERGED products (includes options from both modes)
- if (bothResults.merged.products.length > 0) {
- const mergedResult = await processProducts(bothResults.merged.products, dispensary, pricingType, 'mode_a', // Use mode_a for merged products (convention)
- { downloadImages });
- totalUpserted = mergedResult.upserted;
- totalSnapshots = mergedResult.snapshots;
- totalImagesDownloaded = mergedResult.imagesDownloaded;
- totalImageErrors = mergedResult.imageErrors;
- // Report progress
- if (onProgress) {
- await onProgress({
- productsFound: bothResults.merged.products.length,
- productsUpserted: totalUpserted,
- snapshotsCreated: totalSnapshots,
- currentPage: 1,
- totalPages: 1,
- });
- }
- }
- }
- else {
- // Single mode crawl (Mode A only)
- const { products, crawlMode } = await (0, graphql_client_1.fetchAllProducts)(dispensary.platformDispensaryId, pricingType, { crawlMode: 'mode_a', cName });
- modeAProducts = products.length;
- // Collect Mode A product IDs
- for (const p of products) {
- modeAProductIds.add(p._id);
- }
- const result = await processProducts(products, dispensary, pricingType, crawlMode, { downloadImages });
- totalUpserted = result.upserted;
- totalSnapshots = result.snapshots;
- totalImagesDownloaded = result.imagesDownloaded;
- totalImageErrors = result.imageErrors;
- // Report progress
- if (onProgress) {
- await onProgress({
- productsFound: products.length,
- productsUpserted: totalUpserted,
- snapshotsCreated: totalSnapshots,
- currentPage: 1,
- totalPages: 1,
- });
- }
- }
- // Mark products as missing using UNION of Mode A + Mode B
- // The function handles outage detection (empty union = skip marking)
- missingMarked = await markMissingProducts(dispensary.id, dispensary.platformDispensaryId, modeAProductIds, modeBProductIds, pricingType);
- totalSnapshots += missingMarked;
- // Update dispensary stats
- await updateDispensaryCrawlStats(dispensary.id, totalUpserted);
- console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing, ${totalImagesDownloaded} images downloaded`);
- const totalProductsFound = modeAProducts + modeBProducts;
- return {
- success: true,
- dispensaryId: dispensary.id,
- productsFound: totalProductsFound,
- productsFetched: totalProductsFound,
- productsUpserted: totalUpserted,
- snapshotsCreated: totalSnapshots,
- modeAProducts,
- modeBProducts,
- missingProductsMarked: missingMarked,
- imagesDownloaded: totalImagesDownloaded,
- imageErrors: totalImageErrors,
- durationMs: Date.now() - startTime,
- };
- }
- catch (error) {
- console.error(`[ProductCrawler] Failed to crawl ${dispensary.name}:`, error.message);
- return {
- success: false,
- dispensaryId: dispensary.id,
- productsFound: 0,
- productsFetched: 0,
- productsUpserted: 0,
- snapshotsCreated: 0,
- errorMessage: error.message,
- durationMs: Date.now() - startTime,
- };
- }
-}
-/**
- * Crawl all Arizona dispensaries
- */
-async function crawlAllArizonaDispensaries(pricingType = 'rec') {
- const results = [];
- // Get all AZ dispensaries with platform IDs
- const { rows: rawRows } = await (0, connection_1.query)(`
- SELECT ${DISPENSARY_COLUMNS} FROM dispensaries
- WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
- ORDER BY id
- `);
- const dispensaries = rawRows.map(discovery_1.mapDbRowToDispensary);
- console.log(`[ProductCrawler] Starting crawl of ${dispensaries.length} dispensaries...`);
- for (const dispensary of dispensaries) {
- const result = await crawlDispensaryProducts(dispensary, pricingType);
- results.push(result);
- // Delay between dispensaries
- await new Promise((r) => setTimeout(r, 2000));
- }
- const successful = results.filter((r) => r.success).length;
- const totalProducts = results.reduce((sum, r) => sum + r.productsUpserted, 0);
- const totalSnapshots = results.reduce((sum, r) => sum + r.snapshotsCreated, 0);
- console.log(`[ProductCrawler] Completed: ${successful}/${dispensaries.length} stores, ${totalProducts} products, ${totalSnapshots} snapshots`);
- return results;
-}
diff --git a/backend/dist/dutchie-az/services/scheduler.js b/backend/dist/dutchie-az/services/scheduler.js
deleted file mode 100644
index 2911df96..00000000
--- a/backend/dist/dutchie-az/services/scheduler.js
+++ /dev/null
@@ -1,595 +0,0 @@
-"use strict";
-/**
- * Dutchie AZ Scheduler Service
- *
- * Handles scheduled crawling with JITTER - no fixed intervals!
- * Each job re-schedules itself with a NEW random offset after each run.
- * This makes timing "wander" around the clock, avoiding detectable patterns.
- *
- * Jitter Logic:
- * nextRunAt = lastRunAt + baseIntervalMinutes + random(-jitterMinutes, +jitterMinutes)
- *
- * Example: 4-hour base with ±30min jitter = runs anywhere from 3h30m to 4h30m apart
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.crawlSingleDispensary = void 0;
-exports.getAllSchedules = getAllSchedules;
-exports.getScheduleById = getScheduleById;
-exports.createSchedule = createSchedule;
-exports.updateSchedule = updateSchedule;
-exports.deleteSchedule = deleteSchedule;
-exports.getRunLogs = getRunLogs;
-exports.startScheduler = startScheduler;
-exports.stopScheduler = stopScheduler;
-exports.getSchedulerStatus = getSchedulerStatus;
-exports.triggerScheduleNow = triggerScheduleNow;
-exports.initializeDefaultSchedules = initializeDefaultSchedules;
-exports.triggerImmediateCrawl = triggerImmediateCrawl;
-const connection_1 = require("../db/connection");
-const menu_detection_1 = require("./menu-detection");
-const job_queue_1 = require("./job-queue");
-// Scheduler poll interval (how often we check for due jobs)
-const SCHEDULER_POLL_INTERVAL_MS = 60 * 1000; // 1 minute
-// Track running state
-let isSchedulerRunning = false;
-let schedulerInterval = null;
-// ============================================================
-// JITTER CALCULATION
-// ============================================================
-/**
- * Generate a random jitter value in minutes
- * Returns a value between -jitterMinutes and +jitterMinutes
- */
-function getRandomJitterMinutes(jitterMinutes) {
- // random() returns [0, 1), we want [-jitter, +jitter]
- return (Math.random() * 2 - 1) * jitterMinutes;
-}
-/**
- * Calculate next run time with jitter
- * nextRunAt = baseTime + baseIntervalMinutes + random(-jitter, +jitter)
- */
-function calculateNextRunAt(baseTime, baseIntervalMinutes, jitterMinutes) {
- const jitter = getRandomJitterMinutes(jitterMinutes);
- const totalMinutes = baseIntervalMinutes + jitter;
- const totalMs = totalMinutes * 60 * 1000;
- return new Date(baseTime.getTime() + totalMs);
-}
-// ============================================================
-// DATABASE OPERATIONS
-// ============================================================
-/**
- * Get all job schedules
- */
-async function getAllSchedules() {
- const { rows } = await (0, connection_1.query)(`
- SELECT
- id, job_name, description, enabled,
- base_interval_minutes, jitter_minutes,
- last_run_at, last_status, last_error_message, last_duration_ms,
- next_run_at, job_config, created_at, updated_at
- FROM job_schedules
- ORDER BY job_name
- `);
- return rows.map(row => ({
- id: row.id,
- jobName: row.job_name,
- description: row.description,
- enabled: row.enabled,
- baseIntervalMinutes: row.base_interval_minutes,
- jitterMinutes: row.jitter_minutes,
- lastRunAt: row.last_run_at,
- lastStatus: row.last_status,
- lastErrorMessage: row.last_error_message,
- lastDurationMs: row.last_duration_ms,
- nextRunAt: row.next_run_at,
- jobConfig: row.job_config,
- createdAt: row.created_at,
- updatedAt: row.updated_at,
- }));
-}
-/**
- * Get a single schedule by ID
- */
-async function getScheduleById(id) {
- const { rows } = await (0, connection_1.query)(`SELECT * FROM job_schedules WHERE id = $1`, [id]);
- if (rows.length === 0)
- return null;
- const row = rows[0];
- return {
- id: row.id,
- jobName: row.job_name,
- description: row.description,
- enabled: row.enabled,
- baseIntervalMinutes: row.base_interval_minutes,
- jitterMinutes: row.jitter_minutes,
- lastRunAt: row.last_run_at,
- lastStatus: row.last_status,
- lastErrorMessage: row.last_error_message,
- lastDurationMs: row.last_duration_ms,
- nextRunAt: row.next_run_at,
- jobConfig: row.job_config,
- createdAt: row.created_at,
- updatedAt: row.updated_at,
- };
-}
-/**
- * Create a new schedule
- */
-async function createSchedule(schedule) {
- // Calculate initial nextRunAt
- const nextRunAt = schedule.startImmediately
- ? new Date() // Start immediately
- : calculateNextRunAt(new Date(), schedule.baseIntervalMinutes, schedule.jitterMinutes);
- const { rows } = await (0, connection_1.query)(`
- INSERT INTO job_schedules (
- job_name, description, enabled,
- base_interval_minutes, jitter_minutes,
- next_run_at, job_config
- ) VALUES ($1, $2, $3, $4, $5, $6, $7)
- RETURNING *
- `, [
- schedule.jobName,
- schedule.description || null,
- schedule.enabled ?? true,
- schedule.baseIntervalMinutes,
- schedule.jitterMinutes,
- nextRunAt,
- schedule.jobConfig ? JSON.stringify(schedule.jobConfig) : null,
- ]);
- const row = rows[0];
- console.log(`[Scheduler] Created schedule "${schedule.jobName}" - next run at ${nextRunAt.toISOString()}`);
- return {
- id: row.id,
- jobName: row.job_name,
- description: row.description,
- enabled: row.enabled,
- baseIntervalMinutes: row.base_interval_minutes,
- jitterMinutes: row.jitter_minutes,
- lastRunAt: row.last_run_at,
- lastStatus: row.last_status,
- lastErrorMessage: row.last_error_message,
- lastDurationMs: row.last_duration_ms,
- nextRunAt: row.next_run_at,
- jobConfig: row.job_config,
- createdAt: row.created_at,
- updatedAt: row.updated_at,
- };
-}
-/**
- * Update a schedule
- */
-async function updateSchedule(id, updates) {
- const setClauses = [];
- const params = [];
- let paramIndex = 1;
- if (updates.description !== undefined) {
- setClauses.push(`description = $${paramIndex++}`);
- params.push(updates.description);
- }
- if (updates.enabled !== undefined) {
- setClauses.push(`enabled = $${paramIndex++}`);
- params.push(updates.enabled);
- }
- if (updates.baseIntervalMinutes !== undefined) {
- setClauses.push(`base_interval_minutes = $${paramIndex++}`);
- params.push(updates.baseIntervalMinutes);
- }
- if (updates.jitterMinutes !== undefined) {
- setClauses.push(`jitter_minutes = $${paramIndex++}`);
- params.push(updates.jitterMinutes);
- }
- if (updates.jobConfig !== undefined) {
- setClauses.push(`job_config = $${paramIndex++}`);
- params.push(JSON.stringify(updates.jobConfig));
- }
- if (setClauses.length === 0) {
- return getScheduleById(id);
- }
- setClauses.push(`updated_at = NOW()`);
- params.push(id);
- const { rows } = await (0, connection_1.query)(`UPDATE job_schedules SET ${setClauses.join(', ')} WHERE id = $${paramIndex} RETURNING *`, params);
- if (rows.length === 0)
- return null;
- const row = rows[0];
- return {
- id: row.id,
- jobName: row.job_name,
- description: row.description,
- enabled: row.enabled,
- baseIntervalMinutes: row.base_interval_minutes,
- jitterMinutes: row.jitter_minutes,
- lastRunAt: row.last_run_at,
- lastStatus: row.last_status,
- lastErrorMessage: row.last_error_message,
- lastDurationMs: row.last_duration_ms,
- nextRunAt: row.next_run_at,
- jobConfig: row.job_config,
- createdAt: row.created_at,
- updatedAt: row.updated_at,
- };
-}
-/**
- * Delete a schedule
- */
-async function deleteSchedule(id) {
- const result = await (0, connection_1.query)(`DELETE FROM job_schedules WHERE id = $1`, [id]);
- return (result.rowCount || 0) > 0;
-}
-/**
- * Mark a schedule as running
- */
-async function markScheduleRunning(id) {
- await (0, connection_1.query)(`UPDATE job_schedules SET last_status = 'running', updated_at = NOW() WHERE id = $1`, [id]);
-}
-/**
- * Update schedule after job completion with NEW jittered next_run_at
- */
-async function updateScheduleAfterRun(id, status, durationMs, errorMessage) {
- // Get current schedule to calculate new nextRunAt
- const schedule = await getScheduleById(id);
- if (!schedule)
- return;
- const now = new Date();
- const newNextRunAt = calculateNextRunAt(now, schedule.baseIntervalMinutes, schedule.jitterMinutes);
- console.log(`[Scheduler] Schedule "${schedule.jobName}" completed (${status}). Next run: ${newNextRunAt.toISOString()}`);
- await (0, connection_1.query)(`
- UPDATE job_schedules SET
- last_run_at = $2,
- last_status = $3,
- last_error_message = $4,
- last_duration_ms = $5,
- next_run_at = $6,
- updated_at = NOW()
- WHERE id = $1
- `, [id, now, status, errorMessage || null, durationMs, newNextRunAt]);
-}
-/**
- * Create a job run log entry
- */
-async function createRunLog(scheduleId, jobName, status) {
- const { rows } = await (0, connection_1.query)(`
- INSERT INTO job_run_logs (schedule_id, job_name, status, started_at)
- VALUES ($1, $2, $3, NOW())
- RETURNING id
- `, [scheduleId, jobName, status]);
- return rows[0].id;
-}
-/**
- * Update a job run log entry
- */
-async function updateRunLog(runLogId, status, results) {
- await (0, connection_1.query)(`
- UPDATE job_run_logs SET
- status = $2,
- completed_at = NOW(),
- duration_ms = $3,
- error_message = $4,
- items_processed = $5,
- items_succeeded = $6,
- items_failed = $7,
- metadata = $8
- WHERE id = $1
- `, [
- runLogId,
- status,
- results.durationMs,
- results.errorMessage || null,
- results.itemsProcessed || 0,
- results.itemsSucceeded || 0,
- results.itemsFailed || 0,
- results.metadata ? JSON.stringify(results.metadata) : null,
- ]);
-}
-/**
- * Get job run logs
- */
-async function getRunLogs(options) {
- const { scheduleId, jobName, limit = 50, offset = 0 } = options;
- let whereClause = 'WHERE 1=1';
- const params = [];
- let paramIndex = 1;
- if (scheduleId) {
- whereClause += ` AND schedule_id = $${paramIndex++}`;
- params.push(scheduleId);
- }
- if (jobName) {
- whereClause += ` AND job_name = $${paramIndex++}`;
- params.push(jobName);
- }
- params.push(limit, offset);
- const { rows } = await (0, connection_1.query)(`
- SELECT * FROM job_run_logs
- ${whereClause}
- ORDER BY created_at DESC
- LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
- `, params);
- const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM job_run_logs ${whereClause}`, params.slice(0, -2));
- return {
- logs: rows,
- total: parseInt(countRows[0]?.total || '0', 10),
- };
-}
-// ============================================================
-// JOB EXECUTION
-// ============================================================
-/**
- * Execute a job based on its name
- */
-async function executeJob(schedule) {
- const config = schedule.jobConfig || {};
- switch (schedule.jobName) {
- case 'dutchie_az_product_crawl':
- return executeProductCrawl(config);
- case 'dutchie_az_discovery':
- return executeDiscovery(config);
- case 'dutchie_az_menu_detection':
- return (0, menu_detection_1.executeMenuDetectionJob)(config);
- default:
- throw new Error(`Unknown job type: ${schedule.jobName}`);
- }
-}
-/**
- * Execute the AZ Dutchie product crawl job
- *
- * NEW BEHAVIOR: Instead of running crawls directly, this now ENQUEUES jobs
- * into the crawl_jobs queue. Workers (running as separate replicas) will
- * pick up and process these jobs.
- *
- * This allows:
- * - Multiple workers to process jobs in parallel
- * - No double-crawls (DB-level locking per dispensary)
- * - Better scalability (add more worker replicas)
- * - Live monitoring of individual job progress
- */
-async function executeProductCrawl(config) {
- const pricingType = config.pricingType || 'rec';
- const useBothModes = config.useBothModes !== false;
- // Get all "ready" dispensaries (menu_type='dutchie' AND platform_dispensary_id IS NOT NULL AND not failed)
- // Note: Menu detection is handled separately by the dutchie_az_menu_detection schedule
- const { rows: rawRows } = await (0, connection_1.query)(`
- SELECT id FROM dispensaries
- WHERE state = 'AZ'
- AND menu_type = 'dutchie'
- AND platform_dispensary_id IS NOT NULL
- AND failed_at IS NULL
- ORDER BY last_crawl_at ASC NULLS FIRST
- `);
- const dispensaryIds = rawRows.map((r) => r.id);
- if (dispensaryIds.length === 0) {
- return {
- status: 'success',
- itemsProcessed: 0,
- itemsSucceeded: 0,
- itemsFailed: 0,
- metadata: { message: 'No ready dispensaries to crawl. Run menu detection to discover more.' },
- };
- }
- console.log(`[Scheduler] Enqueueing crawl jobs for ${dispensaryIds.length} dispensaries...`);
- // Bulk enqueue jobs (skips dispensaries that already have pending/running jobs)
- const { enqueued, skipped } = await (0, job_queue_1.bulkEnqueueJobs)('dutchie_product_crawl', dispensaryIds, {
- priority: 0,
- metadata: { pricingType, useBothModes },
- });
- console.log(`[Scheduler] Enqueued ${enqueued} jobs, skipped ${skipped} (already queued)`);
- // Get current queue stats
- const queueStats = await (0, job_queue_1.getQueueStats)();
- return {
- status: 'success',
- itemsProcessed: dispensaryIds.length,
- itemsSucceeded: enqueued,
- itemsFailed: 0, // Enqueue itself doesn't fail
- metadata: {
- enqueued,
- skipped,
- queueStats,
- pricingType,
- useBothModes,
- message: `Enqueued ${enqueued} jobs. Workers will process them. Check /scraper-monitor for progress.`,
- },
- };
-}
-/**
- * Execute the AZ Dutchie discovery job (placeholder)
- */
-async function executeDiscovery(_config) {
- // Placeholder - implement discovery logic
- return {
- status: 'success',
- itemsProcessed: 0,
- itemsSucceeded: 0,
- itemsFailed: 0,
- metadata: { message: 'Discovery not yet implemented' },
- };
-}
-// ============================================================
-// SCHEDULER RUNNER
-// ============================================================
-/**
- * Check for due jobs and run them
- */
-async function checkAndRunDueJobs() {
- try {
- // Get enabled schedules where nextRunAt <= now
- const { rows } = await (0, connection_1.query)(`
- SELECT * FROM job_schedules
- WHERE enabled = true
- AND next_run_at IS NOT NULL
- AND next_run_at <= NOW()
- AND (last_status IS NULL OR last_status != 'running')
- ORDER BY next_run_at ASC
- `);
- if (rows.length === 0)
- return;
- console.log(`[Scheduler] Found ${rows.length} due job(s)`);
- for (const row of rows) {
- const schedule = {
- id: row.id,
- jobName: row.job_name,
- description: row.description,
- enabled: row.enabled,
- baseIntervalMinutes: row.base_interval_minutes,
- jitterMinutes: row.jitter_minutes,
- lastRunAt: row.last_run_at,
- lastStatus: row.last_status,
- lastErrorMessage: row.last_error_message,
- lastDurationMs: row.last_duration_ms,
- nextRunAt: row.next_run_at,
- jobConfig: row.job_config,
- createdAt: row.created_at,
- updatedAt: row.updated_at,
- };
- await runScheduledJob(schedule);
- }
- }
- catch (error) {
- console.error('[Scheduler] Error checking for due jobs:', error);
- }
-}
-/**
- * Run a single scheduled job
- */
-async function runScheduledJob(schedule) {
- const startTime = Date.now();
- console.log(`[Scheduler] Starting job "${schedule.jobName}"...`);
- // Mark as running
- await markScheduleRunning(schedule.id);
- // Create run log entry
- const runLogId = await createRunLog(schedule.id, schedule.jobName, 'running');
- try {
- // Execute the job
- const result = await executeJob(schedule);
- const durationMs = Date.now() - startTime;
- // Determine final status (exclude 'running' and null)
- const finalStatus = result.status === 'running' || result.status === null
- ? 'success'
- : result.status;
- // Update run log
- await updateRunLog(runLogId, finalStatus, {
- durationMs,
- errorMessage: result.errorMessage,
- itemsProcessed: result.itemsProcessed,
- itemsSucceeded: result.itemsSucceeded,
- itemsFailed: result.itemsFailed,
- metadata: result.metadata,
- });
- // Update schedule with NEW jittered next_run_at
- await updateScheduleAfterRun(schedule.id, result.status, durationMs, result.errorMessage);
- console.log(`[Scheduler] Job "${schedule.jobName}" completed in ${Math.round(durationMs / 1000)}s (${result.status})`);
- }
- catch (error) {
- const durationMs = Date.now() - startTime;
- console.error(`[Scheduler] Job "${schedule.jobName}" failed:`, error.message);
- // Update run log with error
- await updateRunLog(runLogId, 'error', {
- durationMs,
- errorMessage: error.message,
- itemsProcessed: 0,
- itemsSucceeded: 0,
- itemsFailed: 0,
- });
- // Update schedule with NEW jittered next_run_at
- await updateScheduleAfterRun(schedule.id, 'error', durationMs, error.message);
- }
-}
-// ============================================================
-// PUBLIC API
-// ============================================================
-/**
- * Start the scheduler
- */
-function startScheduler() {
- if (isSchedulerRunning) {
- console.log('[Scheduler] Scheduler is already running');
- return;
- }
- isSchedulerRunning = true;
- console.log(`[Scheduler] Starting scheduler (polling every ${SCHEDULER_POLL_INTERVAL_MS / 1000}s)...`);
- // Immediately check for due jobs
- checkAndRunDueJobs();
- // Set up interval to check for due jobs
- schedulerInterval = setInterval(checkAndRunDueJobs, SCHEDULER_POLL_INTERVAL_MS);
-}
-/**
- * Stop the scheduler
- */
-function stopScheduler() {
- if (!isSchedulerRunning) {
- console.log('[Scheduler] Scheduler is not running');
- return;
- }
- isSchedulerRunning = false;
- if (schedulerInterval) {
- clearInterval(schedulerInterval);
- schedulerInterval = null;
- }
- console.log('[Scheduler] Scheduler stopped');
-}
-/**
- * Get scheduler status
- */
-function getSchedulerStatus() {
- return {
- running: isSchedulerRunning,
- pollIntervalMs: SCHEDULER_POLL_INTERVAL_MS,
- };
-}
-/**
- * Trigger immediate execution of a schedule
- */
-async function triggerScheduleNow(scheduleId) {
- const schedule = await getScheduleById(scheduleId);
- if (!schedule) {
- return { success: false, message: 'Schedule not found' };
- }
- if (schedule.lastStatus === 'running') {
- return { success: false, message: 'Job is already running' };
- }
- // Run the job
- await runScheduledJob(schedule);
- return { success: true, message: 'Job triggered successfully' };
-}
-/**
- * Initialize default schedules if they don't exist
- */
-async function initializeDefaultSchedules() {
- const schedules = await getAllSchedules();
- // Check if product crawl schedule exists
- const productCrawlExists = schedules.some(s => s.jobName === 'dutchie_az_product_crawl');
- if (!productCrawlExists) {
- await createSchedule({
- jobName: 'dutchie_az_product_crawl',
- description: 'Crawl all AZ Dutchie dispensary products',
- enabled: true,
- baseIntervalMinutes: 240, // 4 hours
- jitterMinutes: 30, // ±30 minutes
- jobConfig: { pricingType: 'rec', useBothModes: true },
- startImmediately: false,
- });
- console.log('[Scheduler] Created default product crawl schedule');
- }
- // Check if menu detection schedule exists
- const menuDetectionExists = schedules.some(s => s.jobName === 'dutchie_az_menu_detection');
- if (!menuDetectionExists) {
- await createSchedule({
- jobName: 'dutchie_az_menu_detection',
- description: 'Detect menu providers and resolve platform IDs for AZ dispensaries',
- enabled: true,
- baseIntervalMinutes: 1440, // 24 hours
- jitterMinutes: 60, // ±1 hour
- jobConfig: { state: 'AZ', onlyUnknown: true },
- startImmediately: false,
- });
- console.log('[Scheduler] Created default menu detection schedule');
- }
-}
-// Re-export for backward compatibility
-var product_crawler_1 = require("./product-crawler");
-Object.defineProperty(exports, "crawlSingleDispensary", { enumerable: true, get: function () { return product_crawler_1.crawlDispensaryProducts; } });
-async function triggerImmediateCrawl() {
- const schedules = await getAllSchedules();
- const productCrawl = schedules.find(s => s.jobName === 'dutchie_az_product_crawl');
- if (productCrawl) {
- return triggerScheduleNow(productCrawl.id);
- }
- return { success: false, message: 'Product crawl schedule not found' };
-}
diff --git a/backend/dist/dutchie-az/services/worker.js b/backend/dist/dutchie-az/services/worker.js
deleted file mode 100644
index 43f0fbf6..00000000
--- a/backend/dist/dutchie-az/services/worker.js
+++ /dev/null
@@ -1,440 +0,0 @@
-"use strict";
-/**
- * Worker Service
- *
- * Polls the job queue and processes crawl jobs.
- * Each worker instance runs independently, claiming jobs atomically.
- */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.startWorker = startWorker;
-exports.stopWorker = stopWorker;
-exports.getWorkerStatus = getWorkerStatus;
-const job_queue_1 = require("./job-queue");
-const product_crawler_1 = require("./product-crawler");
-const discovery_1 = require("./discovery");
-const connection_1 = require("../db/connection");
-// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
-// NOTE: failed_at is included for worker compatibility checks
-const DISPENSARY_COLUMNS = `
- id, name, slug, city, state, zip, address, latitude, longitude,
- menu_type, menu_url, platform_dispensary_id, website,
- provider_detection_data, created_at, updated_at, failed_at
-`;
-// ============================================================
-// WORKER CONFIG
-// ============================================================
-const POLL_INTERVAL_MS = 5000; // Check for jobs every 5 seconds
-const HEARTBEAT_INTERVAL_MS = 60000; // Send heartbeat every 60 seconds
-const STALE_CHECK_INTERVAL_MS = 300000; // Check for stale jobs every 5 minutes
-const SHUTDOWN_GRACE_PERIOD_MS = 30000; // Wait 30s for job to complete on shutdown
-// ============================================================
-// WORKER STATE
-// ============================================================
-let isRunning = false;
-let currentJob = null;
-let pollTimer = null;
-let heartbeatTimer = null;
-let staleCheckTimer = null;
-let shutdownPromise = null;
-// ============================================================
-// WORKER LIFECYCLE
-// ============================================================
-/**
- * Start the worker
- */
-async function startWorker() {
- if (isRunning) {
- console.log('[Worker] Already running');
- return;
- }
- const workerId = (0, job_queue_1.getWorkerId)();
- const hostname = (0, job_queue_1.getWorkerHostname)();
- console.log(`[Worker] Starting worker ${workerId} on ${hostname}`);
- isRunning = true;
- // Set up graceful shutdown
- setupShutdownHandlers();
- // Start polling for jobs
- pollTimer = setInterval(pollForJobs, POLL_INTERVAL_MS);
- // Start stale job recovery (only one worker should do this, but it's idempotent)
- staleCheckTimer = setInterval(async () => {
- try {
- await (0, job_queue_1.recoverStaleJobs)(15);
- }
- catch (error) {
- console.error('[Worker] Error recovering stale jobs:', error);
- }
- }, STALE_CHECK_INTERVAL_MS);
- // Immediately poll for a job
- await pollForJobs();
- console.log(`[Worker] Worker ${workerId} started, polling every ${POLL_INTERVAL_MS}ms`);
-}
-/**
- * Stop the worker gracefully
- */
-async function stopWorker() {
- if (!isRunning)
- return;
- console.log('[Worker] Stopping worker...');
- isRunning = false;
- // Clear timers
- if (pollTimer) {
- clearInterval(pollTimer);
- pollTimer = null;
- }
- if (heartbeatTimer) {
- clearInterval(heartbeatTimer);
- heartbeatTimer = null;
- }
- if (staleCheckTimer) {
- clearInterval(staleCheckTimer);
- staleCheckTimer = null;
- }
- // Wait for current job to complete
- if (currentJob) {
- console.log(`[Worker] Waiting for job ${currentJob.id} to complete...`);
- const startWait = Date.now();
- while (currentJob && Date.now() - startWait < SHUTDOWN_GRACE_PERIOD_MS) {
- await new Promise(r => setTimeout(r, 1000));
- }
- if (currentJob) {
- console.log(`[Worker] Job ${currentJob.id} did not complete in time, marking for retry`);
- await (0, job_queue_1.failJob)(currentJob.id, 'Worker shutdown');
- }
- }
- console.log('[Worker] Worker stopped');
-}
-/**
- * Get worker status
- */
-function getWorkerStatus() {
- return {
- isRunning,
- workerId: (0, job_queue_1.getWorkerId)(),
- hostname: (0, job_queue_1.getWorkerHostname)(),
- currentJob,
- };
-}
-// ============================================================
-// JOB PROCESSING
-// ============================================================
-/**
- * Poll for and process the next available job
- */
-async function pollForJobs() {
- if (!isRunning || currentJob) {
- return; // Already processing a job
- }
- try {
- const workerId = (0, job_queue_1.getWorkerId)();
- // Try to claim a job
- const job = await (0, job_queue_1.claimNextJob)({
- workerId,
- jobTypes: ['dutchie_product_crawl', 'menu_detection', 'menu_detection_single'],
- lockDurationMinutes: 30,
- });
- if (!job) {
- return; // No jobs available
- }
- currentJob = job;
- console.log(`[Worker] Processing job ${job.id} (type=${job.jobType}, dispensary=${job.dispensaryId})`);
- // Start heartbeat for this job
- heartbeatTimer = setInterval(async () => {
- if (currentJob) {
- try {
- await (0, job_queue_1.heartbeat)(currentJob.id);
- }
- catch (error) {
- console.error('[Worker] Heartbeat error:', error);
- }
- }
- }, HEARTBEAT_INTERVAL_MS);
- // Process the job
- await processJob(job);
- }
- catch (error) {
- console.error('[Worker] Error polling for jobs:', error);
- if (currentJob) {
- try {
- await (0, job_queue_1.failJob)(currentJob.id, error.message);
- }
- catch (failError) {
- console.error('[Worker] Error failing job:', failError);
- }
- }
- }
- finally {
- // Clear heartbeat timer
- if (heartbeatTimer) {
- clearInterval(heartbeatTimer);
- heartbeatTimer = null;
- }
- currentJob = null;
- }
-}
-/**
- * Process a single job
- */
-async function processJob(job) {
- try {
- switch (job.jobType) {
- case 'dutchie_product_crawl':
- await processProductCrawlJob(job);
- break;
- case 'menu_detection':
- await processMenuDetectionJob(job);
- break;
- case 'menu_detection_single':
- await processSingleDetectionJob(job);
- break;
- default:
- throw new Error(`Unknown job type: ${job.jobType}`);
- }
- }
- catch (error) {
- console.error(`[Worker] Job ${job.id} failed:`, error);
- await (0, job_queue_1.failJob)(job.id, error.message);
- }
-}
-// Maximum consecutive failures before flagging a dispensary
-const MAX_CONSECUTIVE_FAILURES = 3;
-/**
- * Record a successful crawl - resets failure counter
- */
-async function recordCrawlSuccess(dispensaryId) {
- await (0, connection_1.query)(`UPDATE dispensaries
- SET consecutive_failures = 0,
- last_crawl_at = NOW(),
- updated_at = NOW()
- WHERE id = $1`, [dispensaryId]);
-}
-/**
- * Record a crawl failure - increments counter and may flag dispensary
- * Returns true if dispensary was flagged as failed
- */
-async function recordCrawlFailure(dispensaryId, errorMessage) {
- // Increment failure counter
- const { rows } = await (0, connection_1.query)(`UPDATE dispensaries
- SET consecutive_failures = consecutive_failures + 1,
- last_failure_at = NOW(),
- last_failure_reason = $2,
- updated_at = NOW()
- WHERE id = $1
- RETURNING consecutive_failures`, [dispensaryId, errorMessage]);
- const failures = rows[0]?.consecutive_failures || 0;
- // If we've hit the threshold, flag the dispensary as failed
- if (failures >= MAX_CONSECUTIVE_FAILURES) {
- await (0, connection_1.query)(`UPDATE dispensaries
- SET failed_at = NOW(),
- menu_type = NULL,
- platform_dispensary_id = NULL,
- failure_notes = $2,
- updated_at = NOW()
- WHERE id = $1`, [dispensaryId, `Auto-flagged after ${failures} consecutive failures. Last error: ${errorMessage}`]);
- console.log(`[Worker] Dispensary ${dispensaryId} flagged as FAILED after ${failures} consecutive failures`);
- return true;
- }
- console.log(`[Worker] Dispensary ${dispensaryId} failure recorded (${failures}/${MAX_CONSECUTIVE_FAILURES})`);
- return false;
-}
-/**
- * Process a product crawl job for a single dispensary
- */
-async function processProductCrawlJob(job) {
- if (!job.dispensaryId) {
- throw new Error('Product crawl job requires dispensary_id');
- }
- // Get dispensary details
- const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [job.dispensaryId]);
- if (rows.length === 0) {
- throw new Error(`Dispensary ${job.dispensaryId} not found`);
- }
- const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]);
- // Check if dispensary is already flagged as failed
- if (rows[0].failed_at) {
- console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already flagged as failed`);
- await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
- return;
- }
- if (!dispensary.platformDispensaryId) {
- // Record failure and potentially flag
- await recordCrawlFailure(job.dispensaryId, 'Missing platform_dispensary_id');
- throw new Error(`Dispensary ${job.dispensaryId} has no platform_dispensary_id`);
- }
- // Get crawl options from job metadata
- const pricingType = job.metadata?.pricingType || 'rec';
- const useBothModes = job.metadata?.useBothModes !== false;
- try {
- // Crawl the dispensary
- const result = await (0, product_crawler_1.crawlDispensaryProducts)(dispensary, pricingType, {
- useBothModes,
- onProgress: async (progress) => {
- // Update progress for live monitoring
- await (0, job_queue_1.updateJobProgress)(job.id, {
- productsFound: progress.productsFound,
- productsUpserted: progress.productsUpserted,
- snapshotsCreated: progress.snapshotsCreated,
- currentPage: progress.currentPage,
- totalPages: progress.totalPages,
- });
- },
- });
- if (result.success) {
- // Success! Reset failure counter
- await recordCrawlSuccess(job.dispensaryId);
- await (0, job_queue_1.completeJob)(job.id, {
- productsFound: result.productsFetched,
- productsUpserted: result.productsUpserted,
- snapshotsCreated: result.snapshotsCreated,
- });
- }
- else {
- // Crawl returned failure - record it
- const wasFlagged = await recordCrawlFailure(job.dispensaryId, result.errorMessage || 'Crawl failed');
- if (wasFlagged) {
- // Don't throw - the dispensary is now flagged, job is "complete"
- await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
- }
- else {
- throw new Error(result.errorMessage || 'Crawl failed');
- }
- }
- }
- catch (error) {
- // Record the failure
- const wasFlagged = await recordCrawlFailure(job.dispensaryId, error.message);
- if (wasFlagged) {
- // Dispensary is now flagged - complete the job rather than fail it
- await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
- }
- else {
- throw error;
- }
- }
-}
-/**
- * Process a menu detection job (bulk)
- */
-async function processMenuDetectionJob(job) {
- const { executeMenuDetectionJob } = await Promise.resolve().then(() => __importStar(require('./menu-detection')));
- const config = job.metadata || {};
- const result = await executeMenuDetectionJob(config);
- if (result.status === 'error') {
- throw new Error(result.errorMessage || 'Menu detection failed');
- }
- await (0, job_queue_1.completeJob)(job.id, {
- productsFound: result.itemsProcessed,
- productsUpserted: result.itemsSucceeded,
- });
-}
-/**
- * Process a single dispensary menu detection job
- * This is the parallelizable version - each worker can detect one dispensary at a time
- */
-async function processSingleDetectionJob(job) {
- if (!job.dispensaryId) {
- throw new Error('Single detection job requires dispensary_id');
- }
- const { detectAndResolveDispensary } = await Promise.resolve().then(() => __importStar(require('./menu-detection')));
- // Get dispensary details
- const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [job.dispensaryId]);
- if (rows.length === 0) {
- throw new Error(`Dispensary ${job.dispensaryId} not found`);
- }
- const dispensary = rows[0];
- // Skip if already detected or failed
- if (dispensary.failed_at) {
- console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already flagged as failed`);
- await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
- return;
- }
- if (dispensary.menu_type && dispensary.menu_type !== 'unknown') {
- console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already detected as ${dispensary.menu_type}`);
- await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 1 });
- return;
- }
- console.log(`[Worker] Detecting menu for dispensary ${job.dispensaryId} (${dispensary.name})...`);
- try {
- const result = await detectAndResolveDispensary(job.dispensaryId);
- if (result.success) {
- console.log(`[Worker] Dispensary ${job.dispensaryId}: detected ${result.detectedProvider}, platformId=${result.platformDispensaryId || 'none'}`);
- await (0, job_queue_1.completeJob)(job.id, {
- productsFound: 1,
- productsUpserted: result.platformDispensaryId ? 1 : 0,
- });
- }
- else {
- // Detection failed - record failure
- await recordCrawlFailure(job.dispensaryId, result.error || 'Detection failed');
- throw new Error(result.error || 'Detection failed');
- }
- }
- catch (error) {
- // Record the failure
- const wasFlagged = await recordCrawlFailure(job.dispensaryId, error.message);
- if (wasFlagged) {
- // Dispensary is now flagged - complete the job rather than fail it
- await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 });
- }
- else {
- throw error;
- }
- }
-}
-// ============================================================
-// SHUTDOWN HANDLING
-// ============================================================
-function setupShutdownHandlers() {
- const shutdown = async (signal) => {
- if (shutdownPromise)
- return shutdownPromise;
- console.log(`\n[Worker] Received ${signal}, shutting down...`);
- shutdownPromise = stopWorker();
- await shutdownPromise;
- process.exit(0);
- };
- process.on('SIGTERM', () => shutdown('SIGTERM'));
- process.on('SIGINT', () => shutdown('SIGINT'));
-}
-// ============================================================
-// STANDALONE WORKER ENTRY POINT
-// ============================================================
-if (require.main === module) {
- // Run as standalone worker
- startWorker().catch((error) => {
- console.error('[Worker] Fatal error:', error);
- process.exit(1);
- });
-}
diff --git a/backend/dist/dutchie-az/types/index.js b/backend/dist/dutchie-az/types/index.js
deleted file mode 100644
index 098e21a3..00000000
--- a/backend/dist/dutchie-az/types/index.js
+++ /dev/null
@@ -1,96 +0,0 @@
-"use strict";
-/**
- * Dutchie AZ Data Types
- *
- * Complete TypeScript interfaces for the isolated Dutchie Arizona data pipeline.
- * These types map directly to Dutchie's GraphQL FilteredProducts response.
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.getOptionQuantity = getOptionQuantity;
-exports.deriveOptionStockStatus = deriveOptionStockStatus;
-exports.deriveStockStatus = deriveStockStatus;
-exports.calculateTotalQuantity = calculateTotalQuantity;
-exports.calculateTotalKioskQuantity = calculateTotalKioskQuantity;
-/**
- * Get available quantity for a single option
- * Priority: quantityAvailable > kioskQuantityAvailable > quantity
- */
-function getOptionQuantity(child) {
- if (typeof child.quantityAvailable === 'number')
- return child.quantityAvailable;
- if (typeof child.kioskQuantityAvailable === 'number')
- return child.kioskQuantityAvailable;
- if (typeof child.quantity === 'number')
- return child.quantity;
- return null; // No quantity data available
-}
-/**
- * Derive stock status for a single option
- * Returns: 'in_stock' if qty > 0, 'out_of_stock' if qty === 0, 'unknown' if no data
- */
-function deriveOptionStockStatus(child) {
- const qty = getOptionQuantity(child);
- if (qty === null)
- return 'unknown';
- return qty > 0 ? 'in_stock' : 'out_of_stock';
-}
-/**
- * Derive product-level stock status from POSMetaData.children
- *
- * Logic per spec:
- * - If ANY child is "in_stock" → product is "in_stock"
- * - Else if ALL children are "out_of_stock" → product is "out_of_stock"
- * - Else → product is "unknown"
- *
- * IMPORTANT: Threshold flags (isBelowThreshold, etc.) do NOT override stock status.
- * They only indicate "low stock" - if qty > 0, status stays "in_stock".
- */
-function deriveStockStatus(product) {
- const children = product.POSMetaData?.children;
- // No children data - unknown
- if (!children || children.length === 0) {
- return 'unknown';
- }
- // Get stock status for each option
- const optionStatuses = children.map(deriveOptionStockStatus);
- // If ANY option is in_stock → product is in_stock
- if (optionStatuses.some(status => status === 'in_stock')) {
- return 'in_stock';
- }
- // If ALL options are out_of_stock → product is out_of_stock
- if (optionStatuses.every(status => status === 'out_of_stock')) {
- return 'out_of_stock';
- }
- // Otherwise (mix of out_of_stock and unknown) → unknown
- return 'unknown';
-}
-/**
- * Calculate total quantity available across all options
- * Returns null if no children data (unknown inventory), 0 if children exist but all have 0 qty
- */
-function calculateTotalQuantity(product) {
- const children = product.POSMetaData?.children;
- // No children = unknown inventory, return null (NOT 0)
- if (!children || children.length === 0)
- return null;
- // Check if any child has quantity data
- const hasAnyQtyData = children.some(child => getOptionQuantity(child) !== null);
- if (!hasAnyQtyData)
- return null; // All children lack qty data = unknown
- return children.reduce((sum, child) => {
- const qty = getOptionQuantity(child);
- return sum + (qty ?? 0);
- }, 0);
-}
-/**
- * Calculate total kiosk quantity available across all options
- */
-function calculateTotalKioskQuantity(product) {
- const children = product.POSMetaData?.children;
- if (!children || children.length === 0)
- return null;
- const hasAnyKioskQty = children.some(child => typeof child.kioskQuantityAvailable === 'number');
- if (!hasAnyKioskQty)
- return null;
- return children.reduce((sum, child) => sum + (child.kioskQuantityAvailable ?? 0), 0);
-}
diff --git a/backend/dist/index.js b/backend/dist/index.js
deleted file mode 100644
index 2ac40a57..00000000
--- a/backend/dist/index.js
+++ /dev/null
@@ -1,119 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = __importDefault(require("express"));
-const cors_1 = __importDefault(require("cors"));
-const dotenv_1 = __importDefault(require("dotenv"));
-const minio_1 = require("./utils/minio");
-const image_storage_1 = require("./utils/image-storage");
-const logger_1 = require("./services/logger");
-const proxyTestQueue_1 = require("./services/proxyTestQueue");
-dotenv_1.default.config();
-const app = (0, express_1.default)();
-const PORT = process.env.PORT || 3010;
-app.use((0, cors_1.default)());
-app.use(express_1.default.json());
-// Serve static images when MinIO is not configured
-const LOCAL_IMAGES_PATH = process.env.LOCAL_IMAGES_PATH || '/app/public/images';
-app.use('/images', express_1.default.static(LOCAL_IMAGES_PATH));
-// Serve static downloads (plugin files, etc.)
-const LOCAL_DOWNLOADS_PATH = process.env.LOCAL_DOWNLOADS_PATH || '/app/public/downloads';
-app.use('/downloads', express_1.default.static(LOCAL_DOWNLOADS_PATH));
-app.get('/health', (req, res) => {
- res.json({ status: 'ok', timestamp: new Date().toISOString() });
-});
-// Endpoint to check server's outbound IP (for proxy whitelist setup)
-app.get('/outbound-ip', async (req, res) => {
- try {
- const axios = require('axios');
- const response = await axios.get('https://api.ipify.org?format=json', { timeout: 10000 });
- res.json({ outbound_ip: response.data.ip });
- }
- catch (error) {
- res.status(500).json({ error: error.message });
- }
-});
-const auth_1 = __importDefault(require("./routes/auth"));
-const dashboard_1 = __importDefault(require("./routes/dashboard"));
-const stores_1 = __importDefault(require("./routes/stores"));
-const dispensaries_1 = __importDefault(require("./routes/dispensaries"));
-const changes_1 = __importDefault(require("./routes/changes"));
-const categories_1 = __importDefault(require("./routes/categories"));
-const products_1 = __importDefault(require("./routes/products"));
-const campaigns_1 = __importDefault(require("./routes/campaigns"));
-const analytics_1 = __importDefault(require("./routes/analytics"));
-const settings_1 = __importDefault(require("./routes/settings"));
-const proxies_1 = __importDefault(require("./routes/proxies"));
-const logs_1 = __importDefault(require("./routes/logs"));
-const scraper_monitor_1 = __importDefault(require("./routes/scraper-monitor"));
-const api_tokens_1 = __importDefault(require("./routes/api-tokens"));
-const api_permissions_1 = __importDefault(require("./routes/api-permissions"));
-const parallel_scrape_1 = __importDefault(require("./routes/parallel-scrape"));
-const schedule_1 = __importDefault(require("./routes/schedule"));
-const crawler_sandbox_1 = __importDefault(require("./routes/crawler-sandbox"));
-const version_1 = __importDefault(require("./routes/version"));
-const public_api_1 = __importDefault(require("./routes/public-api"));
-const dutchie_az_1 = require("./dutchie-az");
-const apiTokenTracker_1 = require("./middleware/apiTokenTracker");
-const crawl_scheduler_1 = require("./services/crawl-scheduler");
-const wordpressPermissions_1 = require("./middleware/wordpressPermissions");
-// Apply WordPress permissions validation first (sets req.apiToken)
-app.use(wordpressPermissions_1.validateWordPressPermissions);
-// Apply API tracking middleware globally
-app.use(apiTokenTracker_1.trackApiUsage);
-app.use(apiTokenTracker_1.checkRateLimit);
-app.use('/api/auth', auth_1.default);
-app.use('/api/dashboard', dashboard_1.default);
-app.use('/api/stores', stores_1.default);
-app.use('/api/dispensaries', dispensaries_1.default);
-app.use('/api/changes', changes_1.default);
-app.use('/api/categories', categories_1.default);
-app.use('/api/products', products_1.default);
-app.use('/api/campaigns', campaigns_1.default);
-app.use('/api/analytics', analytics_1.default);
-app.use('/api/settings', settings_1.default);
-app.use('/api/proxies', proxies_1.default);
-app.use('/api/logs', logs_1.default);
-app.use('/api/scraper-monitor', scraper_monitor_1.default);
-app.use('/api/api-tokens', api_tokens_1.default);
-app.use('/api/api-permissions', api_permissions_1.default);
-app.use('/api/parallel-scrape', parallel_scrape_1.default);
-app.use('/api/schedule', schedule_1.default);
-app.use('/api/crawler-sandbox', crawler_sandbox_1.default);
-app.use('/api/version', version_1.default);
-// Vendor-agnostic AZ data pipeline routes (new public surface)
-app.use('/api/az', dutchie_az_1.dutchieAZRouter);
-// Legacy alias (kept temporarily for backward compatibility)
-app.use('/api/dutchie-az', dutchie_az_1.dutchieAZRouter);
-// Public API v1 - External consumer endpoints (WordPress, etc.)
-// Uses dutchie_az data pipeline with per-dispensary API key auth
-app.use('/api/v1', public_api_1.default);
-async function startServer() {
- try {
- logger_1.logger.info('system', 'Starting server...');
- await (0, minio_1.initializeMinio)();
- await (0, image_storage_1.initializeImageStorage)();
- logger_1.logger.info('system', (0, minio_1.isMinioEnabled)() ? 'MinIO storage initialized' : 'Local filesystem storage initialized');
- // Clean up any orphaned proxy test jobs from previous server runs
- await (0, proxyTestQueue_1.cleanupOrphanedJobs)();
- // Start the crawl scheduler (checks every minute for jobs to run)
- (0, crawl_scheduler_1.startCrawlScheduler)();
- logger_1.logger.info('system', 'Crawl scheduler started');
- // Start the Dutchie AZ scheduler (enqueues jobs for workers)
- await (0, dutchie_az_1.initializeDefaultSchedules)();
- (0, dutchie_az_1.startScheduler)();
- logger_1.logger.info('system', 'Dutchie AZ scheduler started');
- app.listen(PORT, () => {
- logger_1.logger.info('system', `Server running on port ${PORT}`);
- console.log(`🚀 Server running on port ${PORT}`);
- });
- }
- catch (error) {
- logger_1.logger.error('system', `Failed to start server: ${error}`);
- console.error('Failed to start server:', error);
- process.exit(1);
- }
-}
-startServer();
diff --git a/backend/dist/middleware/apiTokenTracker.js b/backend/dist/middleware/apiTokenTracker.js
deleted file mode 100644
index 013da933..00000000
--- a/backend/dist/middleware/apiTokenTracker.js
+++ /dev/null
@@ -1,94 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.trackApiUsage = trackApiUsage;
-exports.checkRateLimit = checkRateLimit;
-const migrate_1 = require("../db/migrate");
-async function trackApiUsage(req, res, next) {
- // Only track if authenticated via API token
- if (!req.apiToken) {
- return next();
- }
- const startTime = Date.now();
- req.startTime = startTime;
- // Get request size
- const requestSize = req.headers['content-length']
- ? parseInt(req.headers['content-length'])
- : 0;
- // Capture original res.json to measure response
- const originalJson = res.json.bind(res);
- let responseSize = 0;
- res.json = function (body) {
- responseSize = JSON.stringify(body).length;
- return originalJson(body);
- };
- // Track after response is sent
- res.on('finish', async () => {
- const responseTime = Date.now() - startTime;
- try {
- await migrate_1.pool.query(`
- INSERT INTO api_token_usage (
- token_id,
- endpoint,
- method,
- status_code,
- response_time_ms,
- request_size,
- response_size,
- ip_address,
- user_agent
- )
- VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
- `, [
- req.apiToken.id,
- req.path,
- req.method,
- res.statusCode,
- responseTime,
- requestSize,
- responseSize,
- req.ip,
- req.headers['user-agent'] || null
- ]);
- // Update last_used_at
- await migrate_1.pool.query('UPDATE api_tokens SET last_used_at = CURRENT_TIMESTAMP WHERE id = $1', [req.apiToken.id]);
- }
- catch (error) {
- console.error('Error tracking API usage:', error);
- }
- });
- next();
-}
-// Rate limiting check
-async function checkRateLimit(req, res, next) {
- if (!req.apiToken) {
- return next();
- }
- const { id, rate_limit } = req.apiToken;
- try {
- // Count requests in the last minute
- const result = await migrate_1.pool.query(`
- SELECT COUNT(*) as request_count
- FROM api_token_usage
- WHERE token_id = $1
- AND created_at > NOW() - INTERVAL '1 minute'
- `, [id]);
- const requestCount = parseInt(result.rows[0].request_count);
- if (requestCount >= rate_limit) {
- return res.status(429).json({
- error: 'Rate limit exceeded',
- limit: rate_limit,
- current: requestCount,
- retry_after: 60
- });
- }
- // Add rate limit headers
- res.setHeader('X-RateLimit-Limit', rate_limit.toString());
- res.setHeader('X-RateLimit-Remaining', (rate_limit - requestCount).toString());
- res.setHeader('X-RateLimit-Reset', new Date(Date.now() + 60000).toISOString());
- next();
- }
- catch (error) {
- console.error('Error checking rate limit:', error);
- next();
- }
-}
diff --git a/backend/dist/middleware/wordpressPermissions.js b/backend/dist/middleware/wordpressPermissions.js
deleted file mode 100644
index c4e13c55..00000000
--- a/backend/dist/middleware/wordpressPermissions.js
+++ /dev/null
@@ -1,163 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.validateWordPressPermissions = validateWordPressPermissions;
-const migrate_1 = require("../db/migrate");
-const ipaddr_js_1 = __importDefault(require("ipaddr.js"));
-/**
- * Validates if an IP address matches any of the allowed IP patterns
- * Supports CIDR notation and wildcards
- */
-function isIpAllowed(clientIp, allowedIps) {
- try {
- const clientAddr = ipaddr_js_1.default.process(clientIp);
- for (const allowedIp of allowedIps) {
- const trimmed = allowedIp.trim();
- if (!trimmed)
- continue;
- // Check for CIDR notation
- if (trimmed.includes('/')) {
- try {
- const [subnet, bits] = trimmed.split('/');
- const range = ipaddr_js_1.default.parseCIDR(trimmed);
- if (clientAddr.match(range)) {
- return true;
- }
- }
- catch (e) {
- console.warn(`Invalid CIDR notation: ${trimmed}`);
- continue;
- }
- }
- else {
- // Exact match
- try {
- const allowedAddr = ipaddr_js_1.default.process(trimmed);
- if (clientAddr.toString() === allowedAddr.toString()) {
- return true;
- }
- }
- catch (e) {
- console.warn(`Invalid IP address: ${trimmed}`);
- continue;
- }
- }
- }
- return false;
- }
- catch (error) {
- console.error('Error processing client IP:', error);
- return false;
- }
-}
-/**
- * Validates if a domain matches any of the allowed domain patterns
- * Supports wildcard subdomains (*.example.com)
- */
-function isDomainAllowed(origin, allowedDomains) {
- try {
- // Extract domain from origin URL
- const url = new URL(origin);
- const domain = url.hostname;
- for (const allowedDomain of allowedDomains) {
- const trimmed = allowedDomain.trim();
- if (!trimmed)
- continue;
- // Wildcard subdomain support
- if (trimmed.startsWith('*.')) {
- const baseDomain = trimmed.substring(2);
- if (domain === baseDomain || domain.endsWith('.' + baseDomain)) {
- return true;
- }
- }
- else {
- // Exact match
- if (domain === trimmed) {
- return true;
- }
- }
- }
- return false;
- }
- catch (error) {
- console.error('Error processing domain:', error);
- return false;
- }
-}
-/**
- * WordPress API Permissions Middleware
- * Validates API access based on WordPress permissions table
- */
-async function validateWordPressPermissions(req, res, next) {
- // Get API key from header
- const apiKey = req.headers['x-api-key'];
- // If no API key provided, skip WordPress validation
- if (!apiKey) {
- return next();
- }
- try {
- // Query WordPress permissions table
- const result = await migrate_1.pool.query(`
- SELECT id, user_name, api_key, allowed_ips, allowed_domains, is_active
- FROM wp_dutchie_api_permissions
- WHERE api_key = $1 AND is_active = 1
- `, [apiKey]);
- if (result.rows.length === 0) {
- return res.status(401).json({
- error: 'Invalid API key'
- });
- }
- const permission = result.rows[0];
- // Get client IP
- const clientIp = req.headers['x-forwarded-for']?.split(',')[0].trim() ||
- req.headers['x-real-ip'] ||
- req.ip ||
- req.connection.remoteAddress ||
- '';
- // Validate IP if configured
- if (permission.allowed_ips) {
- const allowedIps = permission.allowed_ips.split('\n').filter((ip) => ip.trim());
- if (allowedIps.length > 0 && !isIpAllowed(clientIp, allowedIps)) {
- return res.status(403).json({
- error: 'IP address not allowed',
- client_ip: clientIp
- });
- }
- }
- // Validate domain if configured
- const origin = req.get('origin') || req.get('referer') || '';
- if (permission.allowed_domains && origin) {
- const allowedDomains = permission.allowed_domains.split('\n').filter((d) => d.trim());
- if (allowedDomains.length > 0 && !isDomainAllowed(origin, allowedDomains)) {
- return res.status(403).json({
- error: 'Domain not allowed',
- origin: origin
- });
- }
- }
- // Update last_used_at timestamp (async, don't wait)
- migrate_1.pool.query(`
- UPDATE wp_dutchie_api_permissions
- SET last_used_at = CURRENT_TIMESTAMP
- WHERE id = $1
- `, [permission.id]).catch((err) => {
- console.error('Error updating last_used_at:', err);
- });
- // Set apiToken on request for tracking middleware
- // Default rate limit of 100 requests/minute for WordPress permissions
- req.apiToken = {
- id: permission.id,
- name: permission.user_name,
- rate_limit: 100
- };
- next();
- }
- catch (error) {
- console.error('WordPress permissions validation error:', error);
- return res.status(500).json({
- error: 'Internal server error during API validation'
- });
- }
-}
diff --git a/backend/dist/migrations-runner/009_image_sizes.js b/backend/dist/migrations-runner/009_image_sizes.js
deleted file mode 100644
index 30858a3d..00000000
--- a/backend/dist/migrations-runner/009_image_sizes.js
+++ /dev/null
@@ -1,32 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const migrate_1 = require("../db/migrate");
-(async () => {
- try {
- console.log('🔄 Running image sizes migration...');
- // Add thumbnail and medium paths
- await migrate_1.pool.query(`
- ALTER TABLE products
- ADD COLUMN IF NOT EXISTS thumbnail_path TEXT,
- ADD COLUMN IF NOT EXISTS medium_path TEXT
- `);
- console.log('✅ Added thumbnail_path and medium_path columns');
- // Rename local_image_path to full_path
- await migrate_1.pool.query(`
- ALTER TABLE products
- RENAME COLUMN local_image_path TO full_path
- `);
- console.log('✅ Renamed local_image_path to full_path');
- // Add index
- await migrate_1.pool.query(`
- CREATE INDEX IF NOT EXISTS idx_products_images ON products(full_path, thumbnail_path, medium_path)
- `);
- console.log('✅ Created image index');
- console.log('✅ Migration complete!');
- process.exit(0);
- }
- catch (error) {
- console.error('❌ Migration failed:', error);
- process.exit(1);
- }
-})();
diff --git a/backend/dist/routes/analytics.js b/backend/dist/routes/analytics.js
deleted file mode 100644
index b14eed37..00000000
--- a/backend/dist/routes/analytics.js
+++ /dev/null
@@ -1,121 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Get analytics overview
-router.get('/overview', async (req, res) => {
- try {
- const { days = 30 } = req.query;
- // Total clicks
- const clicksResult = await migrate_1.pool.query(`
- SELECT COUNT(*) as total_clicks
- FROM clicks
- WHERE clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
- `);
- // Unique products clicked
- const uniqueProductsResult = await migrate_1.pool.query(`
- SELECT COUNT(DISTINCT product_id) as unique_products
- FROM clicks
- WHERE clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
- `);
- // Clicks by day
- const clicksByDayResult = await migrate_1.pool.query(`
- SELECT DATE(clicked_at) as date, COUNT(*) as clicks
- FROM clicks
- WHERE clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
- GROUP BY DATE(clicked_at)
- ORDER BY date DESC
- `);
- // Top products
- const topProductsResult = await migrate_1.pool.query(`
- SELECT p.id, p.name, p.price, COUNT(c.id) as click_count
- FROM clicks c
- JOIN products p ON c.product_id = p.id
- WHERE c.clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
- GROUP BY p.id, p.name, p.price
- ORDER BY click_count DESC
- LIMIT 10
- `);
- res.json({
- overview: {
- total_clicks: parseInt(clicksResult.rows[0].total_clicks),
- unique_products: parseInt(uniqueProductsResult.rows[0].unique_products)
- },
- clicks_by_day: clicksByDayResult.rows,
- top_products: topProductsResult.rows
- });
- }
- catch (error) {
- console.error('Error fetching analytics:', error);
- res.status(500).json({ error: 'Failed to fetch analytics' });
- }
-});
-// Get product analytics
-router.get('/products/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const { days = 30 } = req.query;
- // Total clicks for this product
- const totalResult = await migrate_1.pool.query(`
- SELECT COUNT(*) as total_clicks
- FROM clicks
- WHERE product_id = $1
- AND clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
- `, [id]);
- // Clicks by day
- const byDayResult = await migrate_1.pool.query(`
- SELECT DATE(clicked_at) as date, COUNT(*) as clicks
- FROM clicks
- WHERE product_id = $1
- AND clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
- GROUP BY DATE(clicked_at)
- ORDER BY date DESC
- `, [id]);
- res.json({
- product_id: parseInt(id),
- total_clicks: parseInt(totalResult.rows[0].total_clicks),
- clicks_by_day: byDayResult.rows
- });
- }
- catch (error) {
- console.error('Error fetching product analytics:', error);
- res.status(500).json({ error: 'Failed to fetch product analytics' });
- }
-});
-// Get campaign analytics
-router.get('/campaigns/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const { days = 30 } = req.query;
- // Total clicks for this campaign
- const totalResult = await migrate_1.pool.query(`
- SELECT COUNT(*) as total_clicks
- FROM clicks
- WHERE campaign_id = $1
- AND clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
- `, [id]);
- // Clicks by product in this campaign
- const byProductResult = await migrate_1.pool.query(`
- SELECT p.id, p.name, COUNT(c.id) as clicks
- FROM clicks c
- JOIN products p ON c.product_id = p.id
- WHERE c.campaign_id = $1
- AND c.clicked_at >= NOW() - INTERVAL '${parseInt(days)} days'
- GROUP BY p.id, p.name
- ORDER BY clicks DESC
- `, [id]);
- res.json({
- campaign_id: parseInt(id),
- total_clicks: parseInt(totalResult.rows[0].total_clicks),
- clicks_by_product: byProductResult.rows
- });
- }
- catch (error) {
- console.error('Error fetching campaign analytics:', error);
- res.status(500).json({ error: 'Failed to fetch campaign analytics' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/api-permissions.js b/backend/dist/routes/api-permissions.js
deleted file mode 100644
index 8123a646..00000000
--- a/backend/dist/routes/api-permissions.js
+++ /dev/null
@@ -1,174 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const crypto_1 = __importDefault(require("crypto"));
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Generate secure random API key (64-character hex)
-function generateApiKey() {
- return crypto_1.default.randomBytes(32).toString('hex');
-}
-// Get all API permissions
-router.get('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const result = await migrate_1.pool.query(`
- SELECT *
- FROM wp_dutchie_api_permissions
- ORDER BY created_at DESC
- `);
- res.json({ permissions: result.rows });
- }
- catch (error) {
- console.error('Error fetching API permissions:', error);
- res.status(500).json({ error: 'Failed to fetch API permissions' });
- }
-});
-// Get all dispensaries for dropdown (must be before /:id to avoid route conflict)
-router.get('/dispensaries', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const result = await migrate_1.pool.query(`
- SELECT id, name
- FROM dispensaries
- ORDER BY name
- `);
- res.json({ dispensaries: result.rows });
- }
- catch (error) {
- console.error('Error fetching dispensaries:', error);
- res.status(500).json({ error: 'Failed to fetch dispensaries' });
- }
-});
-// Get single API permission
-router.get('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query(`
- SELECT *
- FROM wp_dutchie_api_permissions
- WHERE id = $1
- `, [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Permission not found' });
- }
- res.json({ permission: result.rows[0] });
- }
- catch (error) {
- console.error('Error fetching API permission:', error);
- res.status(500).json({ error: 'Failed to fetch API permission' });
- }
-});
-// Create new API permission
-router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- // Support both store_id (existing) and dispensary_id (for compatibility)
- const { user_name, allowed_ips, allowed_domains, store_id, dispensary_id } = req.body;
- const storeIdToUse = store_id || dispensary_id;
- if (!user_name) {
- return res.status(400).json({ error: 'User name is required' });
- }
- if (!storeIdToUse) {
- return res.status(400).json({ error: 'Store/Dispensary is required' });
- }
- // Get dispensary name for display
- const dispensaryResult = await migrate_1.pool.query('SELECT name FROM dispensaries WHERE id = $1', [storeIdToUse]);
- if (dispensaryResult.rows.length === 0) {
- return res.status(400).json({ error: 'Invalid store/dispensary ID' });
- }
- const storeName = dispensaryResult.rows[0].name;
- const apiKey = generateApiKey();
- const result = await migrate_1.pool.query(`
- INSERT INTO wp_dutchie_api_permissions (
- user_name,
- api_key,
- allowed_ips,
- allowed_domains,
- is_active,
- store_id,
- store_name
- )
- VALUES ($1, $2, $3, $4, 1, $5, $6)
- RETURNING *
- `, [
- user_name,
- apiKey,
- allowed_ips || null,
- allowed_domains || null,
- storeIdToUse,
- storeName
- ]);
- res.status(201).json({
- permission: result.rows[0],
- message: 'API permission created successfully. Save the API key securely - it cannot be retrieved later.'
- });
- }
- catch (error) {
- console.error('Error creating API permission:', error);
- res.status(500).json({ error: 'Failed to create API permission' });
- }
-});
-// Update API permission
-router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const { user_name, allowed_ips, allowed_domains, is_active } = req.body;
- const result = await migrate_1.pool.query(`
- UPDATE wp_dutchie_api_permissions
- SET
- user_name = COALESCE($1, user_name),
- allowed_ips = COALESCE($2, allowed_ips),
- allowed_domains = COALESCE($3, allowed_domains),
- is_active = COALESCE($4, is_active)
- WHERE id = $5
- RETURNING *
- `, [user_name, allowed_ips, allowed_domains, is_active, id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Permission not found' });
- }
- res.json({ permission: result.rows[0] });
- }
- catch (error) {
- console.error('Error updating API permission:', error);
- res.status(500).json({ error: 'Failed to update API permission' });
- }
-});
-// Toggle permission active status
-router.patch('/:id/toggle', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query(`
- UPDATE wp_dutchie_api_permissions
- SET is_active = NOT is_active
- WHERE id = $1
- RETURNING *
- `, [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Permission not found' });
- }
- res.json({ permission: result.rows[0] });
- }
- catch (error) {
- console.error('Error toggling API permission:', error);
- res.status(500).json({ error: 'Failed to toggle API permission' });
- }
-});
-// Delete API permission
-router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query('DELETE FROM wp_dutchie_api_permissions WHERE id = $1 RETURNING *', [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Permission not found' });
- }
- res.json({ message: 'API permission deleted successfully' });
- }
- catch (error) {
- console.error('Error deleting API permission:', error);
- res.status(500).json({ error: 'Failed to delete API permission' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/api-tokens.js b/backend/dist/routes/api-tokens.js
deleted file mode 100644
index 39139e9c..00000000
--- a/backend/dist/routes/api-tokens.js
+++ /dev/null
@@ -1,265 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const crypto_1 = __importDefault(require("crypto"));
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Generate secure random token
-function generateToken() {
- return crypto_1.default.randomBytes(32).toString('hex');
-}
-// Get all API tokens
-router.get('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const result = await migrate_1.pool.query(`
- SELECT
- t.*,
- u.email as created_by_email,
- (
- SELECT COUNT(*)
- FROM api_token_usage
- WHERE token_id = t.id
- AND created_at > NOW() - INTERVAL '24 hours'
- ) as requests_24h,
- (
- SELECT COUNT(*)
- FROM api_token_usage
- WHERE token_id = t.id
- AND created_at > NOW() - INTERVAL '7 days'
- ) as requests_7d,
- (
- SELECT COUNT(*)
- FROM api_token_usage
- WHERE token_id = t.id
- ) as total_requests
- FROM api_tokens t
- LEFT JOIN users u ON t.user_id = u.id
- ORDER BY t.created_at DESC
- `);
- res.json({ tokens: result.rows });
- }
- catch (error) {
- console.error('Error fetching API tokens:', error);
- res.status(500).json({ error: 'Failed to fetch API tokens' });
- }
-});
-// Get single API token
-router.get('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query(`
- SELECT
- t.*,
- u.email as created_by_email
- FROM api_tokens t
- LEFT JOIN users u ON t.user_id = u.id
- WHERE t.id = $1
- `, [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Token not found' });
- }
- res.json({ token: result.rows[0] });
- }
- catch (error) {
- console.error('Error fetching API token:', error);
- res.status(500).json({ error: 'Failed to fetch API token' });
- }
-});
-// Create new API token
-router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { name, description, rate_limit, allowed_endpoints, expires_at } = req.body;
- const userId = req.user.userId;
- if (!name) {
- return res.status(400).json({ error: 'Name is required' });
- }
- const token = generateToken();
- const result = await migrate_1.pool.query(`
- INSERT INTO api_tokens (
- name,
- token,
- description,
- user_id,
- rate_limit,
- allowed_endpoints,
- expires_at
- )
- VALUES ($1, $2, $3, $4, $5, $6, $7)
- RETURNING *
- `, [
- name,
- token,
- description || null,
- userId,
- rate_limit || 100,
- allowed_endpoints || null,
- expires_at || null
- ]);
- res.status(201).json({
- token: result.rows[0],
- message: 'API token created successfully. Save this token securely - it cannot be retrieved later.'
- });
- }
- catch (error) {
- console.error('Error creating API token:', error);
- res.status(500).json({ error: 'Failed to create API token' });
- }
-});
-// Update API token
-router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const { name, description, active, rate_limit, allowed_endpoints, expires_at } = req.body;
- const result = await migrate_1.pool.query(`
- UPDATE api_tokens
- SET
- name = COALESCE($1, name),
- description = COALESCE($2, description),
- active = COALESCE($3, active),
- rate_limit = COALESCE($4, rate_limit),
- allowed_endpoints = COALESCE($5, allowed_endpoints),
- expires_at = COALESCE($6, expires_at)
- WHERE id = $7
- RETURNING *
- `, [name, description, active, rate_limit, allowed_endpoints, expires_at, id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Token not found' });
- }
- res.json({ token: result.rows[0] });
- }
- catch (error) {
- console.error('Error updating API token:', error);
- res.status(500).json({ error: 'Failed to update API token' });
- }
-});
-// Delete API token
-router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query('DELETE FROM api_tokens WHERE id = $1 RETURNING *', [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Token not found' });
- }
- res.json({ message: 'API token deleted successfully' });
- }
- catch (error) {
- console.error('Error deleting API token:', error);
- res.status(500).json({ error: 'Failed to delete API token' });
- }
-});
-// Get token usage statistics
-router.get('/:id/usage', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const { days = 7 } = req.query;
- // Get hourly usage for the past N days
- const hourlyUsage = await migrate_1.pool.query(`
- SELECT
- DATE_TRUNC('hour', created_at) as hour,
- COUNT(*) as requests,
- AVG(response_time_ms) as avg_response_time,
- SUM(CASE WHEN status_code >= 200 AND status_code < 300 THEN 1 ELSE 0 END) as successful_requests,
- SUM(CASE WHEN status_code >= 400 THEN 1 ELSE 0 END) as failed_requests
- FROM api_token_usage
- WHERE token_id = $1
- AND created_at > NOW() - INTERVAL '${parseInt(days)} days'
- GROUP BY hour
- ORDER BY hour DESC
- `, [id]);
- // Get endpoint usage
- const endpointUsage = await migrate_1.pool.query(`
- SELECT
- endpoint,
- method,
- COUNT(*) as requests,
- AVG(response_time_ms) as avg_response_time
- FROM api_token_usage
- WHERE token_id = $1
- AND created_at > NOW() - INTERVAL '${parseInt(days)} days'
- GROUP BY endpoint, method
- ORDER BY requests DESC
- LIMIT 20
- `, [id]);
- // Get recent requests
- const recentRequests = await migrate_1.pool.query(`
- SELECT
- endpoint,
- method,
- status_code,
- response_time_ms,
- ip_address,
- created_at
- FROM api_token_usage
- WHERE token_id = $1
- ORDER BY created_at DESC
- LIMIT 100
- `, [id]);
- res.json({
- hourly_usage: hourlyUsage.rows,
- endpoint_usage: endpointUsage.rows,
- recent_requests: recentRequests.rows
- });
- }
- catch (error) {
- console.error('Error fetching token usage:', error);
- res.status(500).json({ error: 'Failed to fetch token usage' });
- }
-});
-// Get overall API usage statistics
-router.get('/stats/overview', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { days = 7 } = req.query;
- const stats = await migrate_1.pool.query(`
- SELECT
- COUNT(DISTINCT token_id) as active_tokens,
- COUNT(*) as total_requests,
- AVG(response_time_ms) as avg_response_time,
- SUM(CASE WHEN status_code >= 200 AND status_code < 300 THEN 1 ELSE 0 END) as successful_requests,
- SUM(CASE WHEN status_code >= 400 THEN 1 ELSE 0 END) as failed_requests
- FROM api_token_usage
- WHERE created_at > NOW() - INTERVAL '${parseInt(days)} days'
- `);
- // Top tokens by usage
- const topTokens = await migrate_1.pool.query(`
- SELECT
- t.id,
- t.name,
- COUNT(u.id) as requests,
- AVG(u.response_time_ms) as avg_response_time
- FROM api_tokens t
- LEFT JOIN api_token_usage u ON t.id = u.token_id
- WHERE u.created_at > NOW() - INTERVAL '${parseInt(days)} days'
- GROUP BY t.id, t.name
- ORDER BY requests DESC
- LIMIT 10
- `);
- // Most used endpoints
- const topEndpoints = await migrate_1.pool.query(`
- SELECT
- endpoint,
- method,
- COUNT(*) as requests,
- AVG(response_time_ms) as avg_response_time
- FROM api_token_usage
- WHERE created_at > NOW() - INTERVAL '${parseInt(days)} days'
- GROUP BY endpoint, method
- ORDER BY requests DESC
- LIMIT 10
- `);
- res.json({
- overview: stats.rows[0],
- top_tokens: topTokens.rows,
- top_endpoints: topEndpoints.rows
- });
- }
- catch (error) {
- console.error('Error fetching API stats:', error);
- res.status(500).json({ error: 'Failed to fetch API stats' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/auth.js b/backend/dist/routes/auth.js
deleted file mode 100644
index 8c495798..00000000
--- a/backend/dist/routes/auth.js
+++ /dev/null
@@ -1,43 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const router = (0, express_1.Router)();
-// Login
-router.post('/login', async (req, res) => {
- try {
- const { email, password } = req.body;
- if (!email || !password) {
- return res.status(400).json({ error: 'Email and password required' });
- }
- const user = await (0, middleware_1.authenticateUser)(email, password);
- if (!user) {
- return res.status(401).json({ error: 'Invalid credentials' });
- }
- const token = (0, middleware_1.generateToken)(user);
- res.json({
- token,
- user: {
- id: user.id,
- email: user.email,
- role: user.role
- }
- });
- }
- catch (error) {
- console.error('Login error:', error);
- res.status(500).json({ error: 'Internal server error' });
- }
-});
-// Get current user
-router.get('/me', middleware_1.authMiddleware, async (req, res) => {
- res.json({
- user: req.user
- });
-});
-// Refresh token
-router.post('/refresh', middleware_1.authMiddleware, async (req, res) => {
- const token = (0, middleware_1.generateToken)(req.user);
- res.json({ token });
-});
-exports.default = router;
diff --git a/backend/dist/routes/campaigns.js b/backend/dist/routes/campaigns.js
deleted file mode 100644
index e96ee8a9..00000000
--- a/backend/dist/routes/campaigns.js
+++ /dev/null
@@ -1,163 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Get all campaigns
-router.get('/', async (req, res) => {
- try {
- const result = await migrate_1.pool.query(`
- SELECT c.*, COUNT(cp.product_id) as product_count
- FROM campaigns c
- LEFT JOIN campaign_products cp ON c.id = cp.campaign_id
- GROUP BY c.id
- ORDER BY c.created_at DESC
- `);
- res.json({ campaigns: result.rows });
- }
- catch (error) {
- console.error('Error fetching campaigns:', error);
- res.status(500).json({ error: 'Failed to fetch campaigns' });
- }
-});
-// Get single campaign with products
-router.get('/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const campaignResult = await migrate_1.pool.query(`
- SELECT * FROM campaigns WHERE id = $1
- `, [id]);
- if (campaignResult.rows.length === 0) {
- return res.status(404).json({ error: 'Campaign not found' });
- }
- const productsResult = await migrate_1.pool.query(`
- SELECT p.*, cp.display_order
- FROM products p
- JOIN campaign_products cp ON p.id = cp.product_id
- WHERE cp.campaign_id = $1
- ORDER BY cp.display_order
- `, [id]);
- res.json({
- campaign: campaignResult.rows[0],
- products: productsResult.rows
- });
- }
- catch (error) {
- console.error('Error fetching campaign:', error);
- res.status(500).json({ error: 'Failed to fetch campaign' });
- }
-});
-// Create campaign
-router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { name, slug, description, display_style, active, start_date, end_date } = req.body;
- if (!name || !slug) {
- return res.status(400).json({ error: 'Name and slug required' });
- }
- const result = await migrate_1.pool.query(`
- INSERT INTO campaigns (name, slug, description, display_style, active, start_date, end_date)
- VALUES ($1, $2, $3, $4, $5, $6, $7)
- RETURNING *
- `, [name, slug, description, display_style || 'grid', active !== false, start_date, end_date]);
- res.status(201).json({ campaign: result.rows[0] });
- }
- catch (error) {
- console.error('Error creating campaign:', error);
- if (error.code === '23505') {
- return res.status(409).json({ error: 'Campaign slug already exists' });
- }
- res.status(500).json({ error: 'Failed to create campaign' });
- }
-});
-// Update campaign
-router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const { name, slug, description, display_style, active, start_date, end_date } = req.body;
- const result = await migrate_1.pool.query(`
- UPDATE campaigns
- SET name = COALESCE($1, name),
- slug = COALESCE($2, slug),
- description = COALESCE($3, description),
- display_style = COALESCE($4, display_style),
- active = COALESCE($5, active),
- start_date = COALESCE($6, start_date),
- end_date = COALESCE($7, end_date),
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $8
- RETURNING *
- `, [name, slug, description, display_style, active, start_date, end_date, id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Campaign not found' });
- }
- res.json({ campaign: result.rows[0] });
- }
- catch (error) {
- console.error('Error updating campaign:', error);
- if (error.code === '23505') {
- return res.status(409).json({ error: 'Campaign slug already exists' });
- }
- res.status(500).json({ error: 'Failed to update campaign' });
- }
-});
-// Delete campaign
-router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query(`
- DELETE FROM campaigns WHERE id = $1 RETURNING id
- `, [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Campaign not found' });
- }
- res.json({ message: 'Campaign deleted successfully' });
- }
- catch (error) {
- console.error('Error deleting campaign:', error);
- res.status(500).json({ error: 'Failed to delete campaign' });
- }
-});
-// Add product to campaign
-router.post('/:id/products', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const { product_id, display_order } = req.body;
- if (!product_id) {
- return res.status(400).json({ error: 'Product ID required' });
- }
- const result = await migrate_1.pool.query(`
- INSERT INTO campaign_products (campaign_id, product_id, display_order)
- VALUES ($1, $2, $3)
- ON CONFLICT (campaign_id, product_id)
- DO UPDATE SET display_order = $3
- RETURNING *
- `, [id, product_id, display_order || 0]);
- res.status(201).json({ campaign_product: result.rows[0] });
- }
- catch (error) {
- console.error('Error adding product to campaign:', error);
- res.status(500).json({ error: 'Failed to add product to campaign' });
- }
-});
-// Remove product from campaign
-router.delete('/:id/products/:product_id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id, product_id } = req.params;
- const result = await migrate_1.pool.query(`
- DELETE FROM campaign_products
- WHERE campaign_id = $1 AND product_id = $2
- RETURNING *
- `, [id, product_id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Product not in campaign' });
- }
- res.json({ message: 'Product removed from campaign' });
- }
- catch (error) {
- console.error('Error removing product from campaign:', error);
- res.status(500).json({ error: 'Failed to remove product from campaign' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/categories.js b/backend/dist/routes/categories.js
deleted file mode 100644
index e04ca1e7..00000000
--- a/backend/dist/routes/categories.js
+++ /dev/null
@@ -1,84 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Get categories (flat list)
-router.get('/', async (req, res) => {
- try {
- const { store_id } = req.query;
- let query = `
- SELECT
- c.*,
- COUNT(DISTINCT p.id) as product_count,
- pc.name as parent_name
- FROM categories c
- LEFT JOIN products p ON c.id = p.category_id
- LEFT JOIN categories pc ON c.parent_id = pc.id
- `;
- const params = [];
- if (store_id) {
- query += ' WHERE c.store_id = $1';
- params.push(store_id);
- }
- query += `
- GROUP BY c.id, pc.name
- ORDER BY c.display_order, c.name
- `;
- const result = await migrate_1.pool.query(query, params);
- res.json({ categories: result.rows });
- }
- catch (error) {
- console.error('Error fetching categories:', error);
- res.status(500).json({ error: 'Failed to fetch categories' });
- }
-});
-// Get category tree (hierarchical)
-router.get('/tree', async (req, res) => {
- try {
- const { store_id } = req.query;
- if (!store_id) {
- return res.status(400).json({ error: 'store_id is required' });
- }
- // Get all categories for the store
- const result = await migrate_1.pool.query(`
- SELECT
- c.*,
- COUNT(DISTINCT p.id) as product_count
- FROM categories c
- LEFT JOIN products p ON c.id = p.category_id AND p.in_stock = true
- WHERE c.store_id = $1
- GROUP BY c.id
- ORDER BY c.display_order, c.name
- `, [store_id]);
- // Build tree structure
- const categories = result.rows;
- const categoryMap = new Map();
- const tree = [];
- // First pass: create map
- categories.forEach((cat) => {
- categoryMap.set(cat.id, { ...cat, children: [] });
- });
- // Second pass: build tree
- categories.forEach((cat) => {
- const node = categoryMap.get(cat.id);
- if (cat.parent_id) {
- const parent = categoryMap.get(cat.parent_id);
- if (parent) {
- parent.children.push(node);
- }
- }
- else {
- tree.push(node);
- }
- });
- res.json({ tree });
- }
- catch (error) {
- console.error('Error fetching category tree:', error);
- res.status(500).json({ error: 'Failed to fetch category tree' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/changes.js b/backend/dist/routes/changes.js
deleted file mode 100644
index 0af6afd6..00000000
--- a/backend/dist/routes/changes.js
+++ /dev/null
@@ -1,152 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Get all changes with optional status filter
-router.get('/', async (req, res) => {
- try {
- const { status } = req.query;
- let query = `
- SELECT
- dc.id,
- dc.dispensary_id,
- dc.field_name,
- dc.old_value,
- dc.new_value,
- dc.source,
- dc.confidence_score,
- dc.change_notes,
- dc.status,
- dc.requires_recrawl,
- dc.created_at,
- dc.reviewed_at,
- dc.reviewed_by,
- dc.rejection_reason,
- d.name as dispensary_name,
- d.slug as dispensary_slug,
- d.city,
- d.state
- FROM dispensary_changes dc
- JOIN dispensaries d ON dc.dispensary_id = d.id
- `;
- const params = [];
- if (status) {
- query += ` WHERE dc.status = $1`;
- params.push(status);
- }
- query += ` ORDER BY dc.created_at DESC`;
- const result = await migrate_1.pool.query(query, params);
- res.json({ changes: result.rows });
- }
- catch (error) {
- console.error('Error fetching changes:', error);
- res.status(500).json({ error: 'Failed to fetch changes' });
- }
-});
-// Get changes statistics (for alert banner)
-router.get('/stats', async (req, res) => {
- try {
- const result = await migrate_1.pool.query(`
- SELECT
- COUNT(*) FILTER (WHERE status = 'pending') as pending_count,
- COUNT(*) FILTER (WHERE status = 'pending' AND requires_recrawl = TRUE) as pending_recrawl_count,
- COUNT(*) FILTER (WHERE status = 'approved') as approved_count,
- COUNT(*) FILTER (WHERE status = 'rejected') as rejected_count
- FROM dispensary_changes
- `);
- res.json(result.rows[0]);
- }
- catch (error) {
- console.error('Error fetching change stats:', error);
- res.status(500).json({ error: 'Failed to fetch change stats' });
- }
-});
-// Approve a change and apply it to the dispensary
-router.post('/:id/approve', async (req, res) => {
- const client = await migrate_1.pool.connect();
- try {
- await client.query('BEGIN');
- const { id } = req.params;
- const userId = req.user?.id; // From auth middleware
- // Get the change record
- const changeResult = await client.query(`
- SELECT * FROM dispensary_changes WHERE id = $1 AND status = 'pending'
- `, [id]);
- if (changeResult.rows.length === 0) {
- await client.query('ROLLBACK');
- return res.status(404).json({ error: 'Pending change not found' });
- }
- const change = changeResult.rows[0];
- // Apply the change to the dispensary table
- const updateQuery = `
- UPDATE dispensaries
- SET ${change.field_name} = $1, updated_at = CURRENT_TIMESTAMP
- WHERE id = $2
- RETURNING *
- `;
- const dispensaryResult = await client.query(updateQuery, [
- change.new_value,
- change.dispensary_id
- ]);
- if (dispensaryResult.rows.length === 0) {
- await client.query('ROLLBACK');
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- // Mark the change as approved
- await client.query(`
- UPDATE dispensary_changes
- SET
- status = 'approved',
- reviewed_at = CURRENT_TIMESTAMP,
- reviewed_by = $1
- WHERE id = $2
- `, [userId, id]);
- await client.query('COMMIT');
- res.json({
- message: 'Change approved and applied',
- dispensary: dispensaryResult.rows[0],
- requires_recrawl: change.requires_recrawl
- });
- }
- catch (error) {
- await client.query('ROLLBACK');
- console.error('Error approving change:', error);
- res.status(500).json({ error: 'Failed to approve change' });
- }
- finally {
- client.release();
- }
-});
-// Reject a change with optional reason
-router.post('/:id/reject', async (req, res) => {
- try {
- const { id } = req.params;
- const { reason } = req.body;
- const userId = req.user?.id; // From auth middleware
- const result = await migrate_1.pool.query(`
- UPDATE dispensary_changes
- SET
- status = 'rejected',
- reviewed_at = CURRENT_TIMESTAMP,
- reviewed_by = $1,
- rejection_reason = $2
- WHERE id = $3 AND status = 'pending'
- RETURNING *
- `, [userId, reason, id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Pending change not found' });
- }
- res.json({
- message: 'Change rejected',
- change: result.rows[0]
- });
- }
- catch (error) {
- console.error('Error rejecting change:', error);
- res.status(500).json({ error: 'Failed to reject change' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/crawler-sandbox.js b/backend/dist/routes/crawler-sandbox.js
deleted file mode 100644
index b7d2870f..00000000
--- a/backend/dist/routes/crawler-sandbox.js
+++ /dev/null
@@ -1,497 +0,0 @@
-"use strict";
-/**
- * Crawler Sandbox API Routes
- *
- * Endpoints for managing sandbox crawls, templates, and provider detection
- */
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = __importDefault(require("express"));
-const migrate_1 = require("../db/migrate");
-const middleware_1 = require("../auth/middleware");
-const logger_1 = require("../services/logger");
-const crawler_jobs_1 = require("../services/crawler-jobs");
-const router = express_1.default.Router();
-// Apply auth middleware to all routes
-router.use(middleware_1.authMiddleware);
-// ========================================
-// Sandbox Entries
-// ========================================
-/**
- * GET /api/crawler-sandbox
- * List sandbox entries with optional filters
- */
-router.get('/', async (req, res) => {
- try {
- const { status, dispensaryId, limit = 50, offset = 0 } = req.query;
- let query = `
- SELECT cs.*, d.name as dispensary_name, d.website, d.menu_provider, d.crawler_status
- FROM crawler_sandboxes cs
- JOIN dispensaries d ON d.id = cs.dispensary_id
- WHERE 1=1
- `;
- const params = [];
- let paramIndex = 1;
- if (status) {
- query += ` AND cs.status = $${paramIndex}`;
- params.push(status);
- paramIndex++;
- }
- if (dispensaryId) {
- query += ` AND cs.dispensary_id = $${paramIndex}`;
- params.push(Number(dispensaryId));
- paramIndex++;
- }
- query += ` ORDER BY cs.created_at DESC LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`;
- params.push(Number(limit), Number(offset));
- const result = await migrate_1.pool.query(query, params);
- // Get total count
- const countResult = await migrate_1.pool.query(`SELECT COUNT(*) FROM crawler_sandboxes cs WHERE 1=1
- ${status ? 'AND cs.status = $1' : ''}
- ${dispensaryId ? `AND cs.dispensary_id = $${status ? 2 : 1}` : ''}`, status && dispensaryId ? [status, dispensaryId] : status ? [status] : dispensaryId ? [dispensaryId] : []);
- res.json({
- sandboxes: result.rows,
- total: parseInt(countResult.rows[0].count),
- limit: Number(limit),
- offset: Number(offset),
- });
- }
- catch (error) {
- logger_1.logger.error('api', `Get sandboxes error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/crawler-sandbox/:id
- * Get a single sandbox entry with full details
- */
-router.get('/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query(`SELECT cs.*, d.name as dispensary_name, d.website, d.menu_url,
- d.menu_provider, d.menu_provider_confidence, d.crawler_mode, d.crawler_status
- FROM crawler_sandboxes cs
- JOIN dispensaries d ON d.id = cs.dispensary_id
- WHERE cs.id = $1`, [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Sandbox entry not found' });
- }
- // Get related jobs
- const jobs = await migrate_1.pool.query(`SELECT * FROM sandbox_crawl_jobs
- WHERE sandbox_id = $1 OR dispensary_id = $2
- ORDER BY created_at DESC
- LIMIT 10`, [id, result.rows[0].dispensary_id]);
- res.json({
- sandbox: result.rows[0],
- jobs: jobs.rows,
- });
- }
- catch (error) {
- logger_1.logger.error('api', `Get sandbox error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/crawler-sandbox/:id/analyze
- * Trigger re-analysis of a sandbox entry
- */
-router.post('/:id/analyze', (0, middleware_1.requireRole)('admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const sandbox = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [id]);
- if (sandbox.rows.length === 0) {
- return res.status(404).json({ error: 'Sandbox entry not found' });
- }
- // Queue a new sandbox job
- const job = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
- VALUES ($1, $2, 'deep_crawl', 'pending', 20)
- RETURNING id`, [sandbox.rows[0].dispensary_id, id]);
- // Update sandbox status
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'pending', updated_at = NOW() WHERE id = $1`, [id]);
- res.json({
- message: 'Analysis job queued',
- jobId: job.rows[0].id,
- });
- }
- catch (error) {
- logger_1.logger.error('api', `Analyze sandbox error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/crawler-sandbox/:id/move-to-production
- * Move a sandbox entry to production (for Dutchie dispensaries)
- */
-router.post('/:id/move-to-production', (0, middleware_1.requireRole)('admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const sandbox = await migrate_1.pool.query(`SELECT cs.*, d.menu_provider
- FROM crawler_sandboxes cs
- JOIN dispensaries d ON d.id = cs.dispensary_id
- WHERE cs.id = $1`, [id]);
- if (sandbox.rows.length === 0) {
- return res.status(404).json({ error: 'Sandbox entry not found' });
- }
- // Can only move to production if provider is dutchie
- if (sandbox.rows[0].menu_provider !== 'dutchie') {
- return res.status(400).json({
- error: 'Only Dutchie dispensaries can be moved to production currently',
- });
- }
- // Update dispensary to production mode
- await migrate_1.pool.query(`UPDATE dispensaries
- SET crawler_mode = 'production', crawler_status = 'idle', updated_at = NOW()
- WHERE id = $1`, [sandbox.rows[0].dispensary_id]);
- // Mark sandbox as moved
- await migrate_1.pool.query(`UPDATE crawler_sandboxes
- SET status = 'moved_to_production', updated_at = NOW()
- WHERE id = $1`, [id]);
- res.json({ message: 'Dispensary moved to production' });
- }
- catch (error) {
- logger_1.logger.error('api', `Move to production error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * PATCH /api/crawler-sandbox/:id
- * Update sandbox entry (e.g., add human review notes)
- */
-router.patch('/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const { human_review_notes, status, suspected_menu_provider } = req.body;
- const updates = [];
- const params = [];
- let paramIndex = 1;
- if (human_review_notes !== undefined) {
- updates.push(`human_review_notes = $${paramIndex}`);
- params.push(human_review_notes);
- paramIndex++;
- }
- if (status) {
- updates.push(`status = $${paramIndex}`);
- params.push(status);
- paramIndex++;
- }
- if (suspected_menu_provider !== undefined) {
- updates.push(`suspected_menu_provider = $${paramIndex}`);
- params.push(suspected_menu_provider);
- paramIndex++;
- }
- if (updates.length === 0) {
- return res.status(400).json({ error: 'No updates provided' });
- }
- updates.push('updated_at = NOW()');
- if (human_review_notes !== undefined) {
- updates.push('reviewed_at = NOW()');
- }
- params.push(id);
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params);
- res.json({ message: 'Sandbox updated' });
- }
- catch (error) {
- logger_1.logger.error('api', `Update sandbox error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-// ========================================
-// Templates
-// ========================================
-/**
- * GET /api/crawler-sandbox/templates
- * List all crawler templates
- */
-router.get('/templates/list', async (req, res) => {
- try {
- const result = await migrate_1.pool.query(`SELECT * FROM crawler_templates ORDER BY provider, is_default_for_provider DESC, name`);
- res.json({ templates: result.rows });
- }
- catch (error) {
- logger_1.logger.error('api', `Get templates error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * GET /api/crawler-sandbox/templates/:id
- * Get a single template
- */
-router.get('/templates/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Template not found' });
- }
- res.json({ template: result.rows[0] });
- }
- catch (error) {
- logger_1.logger.error('api', `Get template error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/crawler-sandbox/templates
- * Create a new template
- */
-router.post('/templates', (0, middleware_1.requireRole)('admin'), async (req, res) => {
- try {
- const { provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, } = req.body;
- if (!provider || !name) {
- return res.status(400).json({ error: 'provider and name are required' });
- }
- const result = await migrate_1.pool.query(`INSERT INTO crawler_templates
- (provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, created_by)
- VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
- RETURNING *`, [
- provider,
- name,
- JSON.stringify(selector_config || {}),
- JSON.stringify(navigation_config || {}),
- JSON.stringify(transform_config || {}),
- JSON.stringify(validation_rules || {}),
- notes,
- req.user?.email || 'system',
- ]);
- res.status(201).json({ template: result.rows[0] });
- }
- catch (error) {
- logger_1.logger.error('api', `Create template error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * PUT /api/crawler-sandbox/templates/:id
- * Update a template
- */
-router.put('/templates/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const { is_active, is_default_for_provider, selector_config, navigation_config, transform_config, validation_rules, notes, } = req.body;
- const updates = [];
- const params = [];
- let paramIndex = 1;
- if (is_active !== undefined) {
- updates.push(`is_active = $${paramIndex}`);
- params.push(is_active);
- paramIndex++;
- }
- if (is_default_for_provider !== undefined) {
- updates.push(`is_default_for_provider = $${paramIndex}`);
- params.push(is_default_for_provider);
- paramIndex++;
- }
- if (selector_config !== undefined) {
- updates.push(`selector_config = $${paramIndex}`);
- params.push(JSON.stringify(selector_config));
- paramIndex++;
- }
- if (navigation_config !== undefined) {
- updates.push(`navigation_config = $${paramIndex}`);
- params.push(JSON.stringify(navigation_config));
- paramIndex++;
- }
- if (transform_config !== undefined) {
- updates.push(`transform_config = $${paramIndex}`);
- params.push(JSON.stringify(transform_config));
- paramIndex++;
- }
- if (validation_rules !== undefined) {
- updates.push(`validation_rules = $${paramIndex}`);
- params.push(JSON.stringify(validation_rules));
- paramIndex++;
- }
- if (notes !== undefined) {
- updates.push(`notes = $${paramIndex}`);
- params.push(notes);
- paramIndex++;
- }
- if (updates.length === 0) {
- return res.status(400).json({ error: 'No updates provided' });
- }
- updates.push('updated_at = NOW()');
- params.push(id);
- await migrate_1.pool.query(`UPDATE crawler_templates SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params);
- const result = await migrate_1.pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]);
- res.json({ template: result.rows[0] });
- }
- catch (error) {
- logger_1.logger.error('api', `Update template error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-// ========================================
-// Jobs
-// ========================================
-/**
- * GET /api/crawler-sandbox/jobs
- * List sandbox crawl jobs
- */
-router.get('/jobs/list', async (req, res) => {
- try {
- const { status, dispensaryId, limit = 50 } = req.query;
- let query = `
- SELECT sj.*, d.name as dispensary_name
- FROM sandbox_crawl_jobs sj
- JOIN dispensaries d ON d.id = sj.dispensary_id
- WHERE 1=1
- `;
- const params = [];
- let paramIndex = 1;
- if (status) {
- query += ` AND sj.status = $${paramIndex}`;
- params.push(status);
- paramIndex++;
- }
- if (dispensaryId) {
- query += ` AND sj.dispensary_id = $${paramIndex}`;
- params.push(Number(dispensaryId));
- paramIndex++;
- }
- query += ` ORDER BY sj.created_at DESC LIMIT $${paramIndex}`;
- params.push(Number(limit));
- const result = await migrate_1.pool.query(query, params);
- res.json({ jobs: result.rows });
- }
- catch (error) {
- logger_1.logger.error('api', `Get jobs error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/crawler-sandbox/jobs/detect/:dispensaryId
- * Trigger provider detection for a dispensary
- */
-router.post('/jobs/detect/:dispensaryId', (0, middleware_1.requireRole)('admin'), async (req, res) => {
- try {
- const { dispensaryId } = req.params;
- // Create detection job
- const job = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
- VALUES ($1, 'detection', 'pending', 30)
- RETURNING id`, [dispensaryId]);
- // Update dispensary status
- await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`, [dispensaryId]);
- res.json({
- message: 'Detection job queued',
- jobId: job.rows[0].id,
- });
- }
- catch (error) {
- logger_1.logger.error('api', `Queue detection error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-/**
- * POST /api/crawler-sandbox/jobs/run/:id
- * Immediately run a sandbox job
- */
-router.post('/jobs/run/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const job = await migrate_1.pool.query('SELECT * FROM sandbox_crawl_jobs WHERE id = $1', [id]);
- if (job.rows.length === 0) {
- return res.status(404).json({ error: 'Job not found' });
- }
- const jobData = job.rows[0];
- // Run the job immediately
- let result;
- if (jobData.job_type === 'detection') {
- result = await (0, crawler_jobs_1.runDetectMenuProviderJob)(jobData.dispensary_id);
- }
- else {
- result = await (0, crawler_jobs_1.runSandboxCrawlJob)(jobData.dispensary_id, jobData.sandbox_id);
- }
- // Update job status
- await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
- SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
- WHERE id = $4`, [
- result.success ? 'completed' : 'failed',
- JSON.stringify(result.data || {}),
- result.success ? null : result.message,
- id,
- ]);
- res.json(result);
- }
- catch (error) {
- logger_1.logger.error('api', `Run job error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-// ========================================
-// Stats
-// ========================================
-/**
- * GET /api/crawler-sandbox/stats
- * Get sandbox/crawler statistics
- */
-router.get('/stats/overview', async (req, res) => {
- try {
- // Dispensary provider stats
- const providerStats = await migrate_1.pool.query(`
- SELECT
- menu_provider,
- COUNT(*) as count,
- AVG(menu_provider_confidence)::integer as avg_confidence
- FROM dispensaries
- WHERE menu_provider IS NOT NULL
- GROUP BY menu_provider
- ORDER BY count DESC
- `);
- // Mode stats
- const modeStats = await migrate_1.pool.query(`
- SELECT
- crawler_mode,
- COUNT(*) as count
- FROM dispensaries
- GROUP BY crawler_mode
- `);
- // Status stats
- const statusStats = await migrate_1.pool.query(`
- SELECT
- crawler_status,
- COUNT(*) as count
- FROM dispensaries
- GROUP BY crawler_status
- ORDER BY count DESC
- `);
- // Sandbox stats
- const sandboxStats = await migrate_1.pool.query(`
- SELECT
- status,
- COUNT(*) as count
- FROM crawler_sandboxes
- GROUP BY status
- `);
- // Job stats
- const jobStats = await migrate_1.pool.query(`
- SELECT
- status,
- job_type,
- COUNT(*) as count
- FROM sandbox_crawl_jobs
- GROUP BY status, job_type
- `);
- // Recent activity
- const recentActivity = await migrate_1.pool.query(`
- SELECT 'sandbox' as type, id, dispensary_id, status, created_at
- FROM crawler_sandboxes
- ORDER BY created_at DESC
- LIMIT 5
- `);
- res.json({
- providers: providerStats.rows,
- modes: modeStats.rows,
- statuses: statusStats.rows,
- sandbox: sandboxStats.rows,
- jobs: jobStats.rows,
- recentActivity: recentActivity.rows,
- });
- }
- catch (error) {
- logger_1.logger.error('api', `Get stats error: ${error.message}`);
- res.status(500).json({ error: error.message });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/dashboard.js b/backend/dist/routes/dashboard.js
deleted file mode 100644
index 2fbaeab3..00000000
--- a/backend/dist/routes/dashboard.js
+++ /dev/null
@@ -1,116 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const connection_1 = require("../dutchie-az/db/connection");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Get dashboard stats - uses consolidated dutchie-az DB
-router.get('/stats', async (req, res) => {
- try {
- // Store stats from dispensaries table in consolidated DB
- const dispensariesResult = await (0, connection_1.query)(`
- SELECT
- COUNT(*) as total,
- COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != 'unknown') as active,
- COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id,
- COUNT(*) FILTER (WHERE menu_url IS NOT NULL) as with_menu_url,
- MIN(last_crawled_at) as oldest_crawl,
- MAX(last_crawled_at) as latest_crawl
- FROM dispensaries
- `);
- // Product stats from dutchie_products table
- const productsResult = await (0, connection_1.query)(`
- SELECT
- COUNT(*) as total,
- COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock,
- COUNT(*) FILTER (WHERE primary_image_url IS NOT NULL) as with_images,
- COUNT(DISTINCT brand_name) FILTER (WHERE brand_name IS NOT NULL AND brand_name != '') as unique_brands,
- COUNT(DISTINCT dispensary_id) as dispensaries_with_products
- FROM dutchie_products
- `);
- // Brand stats from dutchie_products
- const brandResult = await (0, connection_1.query)(`
- SELECT COUNT(DISTINCT brand_name) as total
- FROM dutchie_products
- WHERE brand_name IS NOT NULL AND brand_name != ''
- `);
- // Recent products added (last 24 hours)
- const recentProductsResult = await (0, connection_1.query)(`
- SELECT COUNT(*) as new_products_24h
- FROM dutchie_products
- WHERE created_at >= NOW() - INTERVAL '24 hours'
- `);
- // Combine results
- const storeStats = dispensariesResult.rows[0];
- const productStats = productsResult.rows[0];
- res.json({
- stores: {
- total: parseInt(storeStats.total) || 0,
- active: parseInt(storeStats.active) || 0,
- with_menu_url: parseInt(storeStats.with_menu_url) || 0,
- with_platform_id: parseInt(storeStats.with_platform_id) || 0,
- oldest_crawl: storeStats.oldest_crawl,
- latest_crawl: storeStats.latest_crawl
- },
- products: {
- total: parseInt(productStats.total) || 0,
- in_stock: parseInt(productStats.in_stock) || 0,
- with_images: parseInt(productStats.with_images) || 0,
- unique_brands: parseInt(productStats.unique_brands) || 0,
- dispensaries_with_products: parseInt(productStats.dispensaries_with_products) || 0
- },
- brands: {
- total: parseInt(brandResult.rows[0].total) || 0
- },
- campaigns: { total: 0, active: 0 }, // Legacy - no longer used
- clicks: { clicks_24h: 0 }, // Legacy - no longer used
- recent: recentProductsResult.rows[0]
- });
- }
- catch (error) {
- console.error('Error fetching dashboard stats:', error);
- res.status(500).json({ error: 'Failed to fetch dashboard stats' });
- }
-});
-// Get recent activity - from consolidated dutchie-az DB
-router.get('/activity', async (req, res) => {
- try {
- const { limit = 20 } = req.query;
- // Recent crawls from dispensaries (with product counts from dutchie_products)
- const scrapesResult = await (0, connection_1.query)(`
- SELECT
- d.name,
- d.last_crawled_at as last_scraped_at,
- d.product_count
- FROM dispensaries d
- WHERE d.last_crawled_at IS NOT NULL
- ORDER BY d.last_crawled_at DESC
- LIMIT $1
- `, [limit]);
- // Recent products from dutchie_products
- const productsResult = await (0, connection_1.query)(`
- SELECT
- p.name,
- 0 as price,
- p.brand_name as brand,
- p.thc as thc_percentage,
- p.cbd as cbd_percentage,
- d.name as store_name,
- p.created_at as first_seen_at
- FROM dutchie_products p
- JOIN dispensaries d ON p.dispensary_id = d.id
- ORDER BY p.created_at DESC
- LIMIT $1
- `, [limit]);
- res.json({
- recent_scrapes: scrapesResult.rows,
- recent_products: productsResult.rows
- });
- }
- catch (error) {
- console.error('Error fetching dashboard activity:', error);
- res.status(500).json({ error: 'Failed to fetch dashboard activity' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/dispensaries.js b/backend/dist/routes/dispensaries.js
deleted file mode 100644
index cbb08c75..00000000
--- a/backend/dist/routes/dispensaries.js
+++ /dev/null
@@ -1,437 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Valid menu_type values
-const VALID_MENU_TYPES = ['dutchie', 'treez', 'jane', 'weedmaps', 'leafly', 'meadow', 'blaze', 'flowhub', 'dispense', 'cova', 'other', 'unknown'];
-// Get all dispensaries
-router.get('/', async (req, res) => {
- try {
- const { menu_type } = req.query;
- let query = `
- SELECT
- id,
- azdhs_id,
- name,
- company_name,
- slug,
- address,
- city,
- state,
- zip,
- phone,
- email,
- website,
- dba_name,
- google_rating,
- google_review_count,
- status_line,
- azdhs_url,
- latitude,
- longitude,
- menu_url,
- menu_type,
- menu_provider,
- menu_provider_confidence,
- scraper_template,
- last_menu_scrape,
- menu_scrape_status,
- platform_dispensary_id,
- created_at,
- updated_at
- FROM dispensaries
- `;
- const params = [];
- // Filter by menu_type if provided
- if (menu_type) {
- query += ` WHERE menu_type = $1`;
- params.push(menu_type);
- }
- query += ` ORDER BY name`;
- const result = await migrate_1.pool.query(query, params);
- res.json({ dispensaries: result.rows });
- }
- catch (error) {
- console.error('Error fetching dispensaries:', error);
- res.status(500).json({ error: 'Failed to fetch dispensaries' });
- }
-});
-// Get menu type stats
-router.get('/stats/menu-types', async (req, res) => {
- try {
- const result = await migrate_1.pool.query(`
- SELECT menu_type, COUNT(*) as count
- FROM dispensaries
- GROUP BY menu_type
- ORDER BY count DESC
- `);
- res.json({ menu_types: result.rows, valid_types: VALID_MENU_TYPES });
- }
- catch (error) {
- console.error('Error fetching menu type stats:', error);
- res.status(500).json({ error: 'Failed to fetch menu type stats' });
- }
-});
-// Get single dispensary by slug
-router.get('/:slug', async (req, res) => {
- try {
- const { slug } = req.params;
- const result = await migrate_1.pool.query(`
- SELECT
- id,
- azdhs_id,
- name,
- company_name,
- slug,
- address,
- city,
- state,
- zip,
- phone,
- email,
- website,
- dba_name,
- google_rating,
- google_review_count,
- status_line,
- azdhs_url,
- latitude,
- longitude,
- menu_url,
- menu_type,
- menu_provider,
- menu_provider_confidence,
- scraper_template,
- scraper_config,
- last_menu_scrape,
- menu_scrape_status,
- platform_dispensary_id,
- created_at,
- updated_at
- FROM dispensaries
- WHERE slug = $1
- `, [slug]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- res.json(result.rows[0]);
- }
- catch (error) {
- console.error('Error fetching dispensary:', error);
- res.status(500).json({ error: 'Failed to fetch dispensary' });
- }
-});
-// Update dispensary
-router.put('/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const { dba_name, website, phone, email, google_rating, google_review_count, menu_url, menu_type, scraper_template, scraper_config, menu_scrape_status } = req.body;
- // Validate menu_type if provided
- if (menu_type !== undefined && menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
- return res.status(400).json({
- error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')}`
- });
- }
- const result = await migrate_1.pool.query(`
- UPDATE dispensaries
- SET
- dba_name = COALESCE($1, dba_name),
- website = COALESCE($2, website),
- phone = COALESCE($3, phone),
- email = COALESCE($4, email),
- google_rating = COALESCE($5, google_rating),
- google_review_count = COALESCE($6, google_review_count),
- menu_url = COALESCE($7, menu_url),
- menu_type = COALESCE($8, menu_type),
- scraper_template = COALESCE($9, scraper_template),
- scraper_config = COALESCE($10, scraper_config),
- menu_scrape_status = COALESCE($11, menu_scrape_status),
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $12
- RETURNING *
- `, [
- dba_name,
- website,
- phone,
- email,
- google_rating,
- google_review_count,
- menu_url,
- menu_type,
- scraper_template,
- scraper_config,
- menu_scrape_status,
- id
- ]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- res.json(result.rows[0]);
- }
- catch (error) {
- console.error('Error updating dispensary:', error);
- res.status(500).json({ error: 'Failed to update dispensary' });
- }
-});
-// Get products for a dispensary by slug
-router.get('/:slug/products', async (req, res) => {
- try {
- const { slug } = req.params;
- const { category } = req.query;
- // First get the dispensary ID from slug
- const dispensaryResult = await migrate_1.pool.query(`
- SELECT id FROM dispensaries WHERE slug = $1
- `, [slug]);
- if (dispensaryResult.rows.length === 0) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- const dispensaryId = dispensaryResult.rows[0].id;
- // Build query for products
- let query = `
- SELECT
- p.id,
- p.name,
- p.brand,
- p.variant,
- p.slug,
- p.description,
- p.regular_price,
- p.sale_price,
- p.thc_percentage,
- p.cbd_percentage,
- p.strain_type,
- p.terpenes,
- p.effects,
- p.flavors,
- p.image_url,
- p.dutchie_url,
- p.in_stock,
- p.created_at,
- p.updated_at
- FROM products p
- WHERE p.dispensary_id = $1
- `;
- const params = [dispensaryId];
- if (category) {
- query += ` AND p.category = $2`;
- params.push(category);
- }
- query += ` ORDER BY p.created_at DESC`;
- const result = await migrate_1.pool.query(query, params);
- res.json({ products: result.rows });
- }
- catch (error) {
- console.error('Error fetching dispensary products:', error);
- res.status(500).json({ error: 'Failed to fetch products' });
- }
-});
-// Get unique brands for a dispensary by slug
-router.get('/:slug/brands', async (req, res) => {
- try {
- const { slug } = req.params;
- const { search } = req.query;
- // First get the dispensary ID from slug
- const dispensaryResult = await migrate_1.pool.query(`
- SELECT id FROM dispensaries WHERE slug = $1
- `, [slug]);
- if (dispensaryResult.rows.length === 0) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- const dispensaryId = dispensaryResult.rows[0].id;
- // Build query with optional search filter
- let query = `
- SELECT DISTINCT
- brand,
- COUNT(*) as product_count
- FROM products
- WHERE dispensary_id = $1 AND brand IS NOT NULL
- `;
- const params = [dispensaryId];
- // Add search filter if provided
- if (search) {
- query += ` AND brand ILIKE $2`;
- params.push(`%${search}%`);
- }
- query += ` GROUP BY brand ORDER BY product_count DESC, brand ASC`;
- const result = await migrate_1.pool.query(query, params);
- res.json({ brands: result.rows });
- }
- catch (error) {
- console.error('Error fetching dispensary brands:', error);
- res.status(500).json({ error: 'Failed to fetch brands' });
- }
-});
-// Get products with discounts/specials for a dispensary by slug
-router.get('/:slug/specials', async (req, res) => {
- try {
- const { slug } = req.params;
- const { search } = req.query;
- // First get the dispensary ID from slug
- const dispensaryResult = await migrate_1.pool.query(`
- SELECT id FROM dispensaries WHERE slug = $1
- `, [slug]);
- if (dispensaryResult.rows.length === 0) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- const dispensaryId = dispensaryResult.rows[0].id;
- // Build query to get products with discounts
- let query = `
- SELECT
- p.id,
- p.name,
- p.brand,
- p.variant,
- p.slug,
- p.description,
- p.regular_price,
- p.sale_price,
- p.discount_type,
- p.discount_value,
- p.thc_percentage,
- p.cbd_percentage,
- p.strain_type,
- p.terpenes,
- p.effects,
- p.flavors,
- p.image_url,
- p.dutchie_url,
- p.in_stock,
- p.created_at,
- p.updated_at
- FROM products p
- WHERE p.dispensary_id = $1
- AND p.discount_type IS NOT NULL
- AND p.discount_value IS NOT NULL
- `;
- const params = [dispensaryId];
- // Add search filter if provided
- if (search) {
- query += ` AND (p.name ILIKE $2 OR p.brand ILIKE $2 OR p.description ILIKE $2)`;
- params.push(`%${search}%`);
- }
- query += ` ORDER BY p.created_at DESC`;
- const result = await migrate_1.pool.query(query, params);
- res.json({ specials: result.rows });
- }
- catch (error) {
- console.error('Error fetching dispensary specials:', error);
- res.status(500).json({ error: 'Failed to fetch specials' });
- }
-});
-// Trigger scraping for a dispensary
-router.post('/:slug/scrape', async (req, res) => {
- try {
- const { slug } = req.params;
- const { type } = req.body; // 'products' | 'brands' | 'specials' | 'all'
- if (!['products', 'brands', 'specials', 'all'].includes(type)) {
- return res.status(400).json({ error: 'Invalid type. Must be: products, brands, specials, or all' });
- }
- // Get the dispensary
- const dispensaryResult = await migrate_1.pool.query(`
- SELECT id, name, slug, website, menu_url, scraper_template, scraper_config
- FROM dispensaries
- WHERE slug = $1
- `, [slug]);
- if (dispensaryResult.rows.length === 0) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- const dispensary = dispensaryResult.rows[0];
- if (!dispensary.menu_url && !dispensary.website) {
- return res.status(400).json({ error: 'Dispensary has no menu URL or website configured' });
- }
- // Update last_menu_scrape time and status
- await migrate_1.pool.query(`
- UPDATE dispensaries
- SET
- last_menu_scrape = CURRENT_TIMESTAMP,
- menu_scrape_status = 'pending',
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $1
- `, [dispensary.id]);
- // Log the scrape request
- console.log(`[SCRAPE REQUEST] Dispensary: ${dispensary.name} (${slug}), Type: ${type}`);
- console.log(` Menu URL: ${dispensary.menu_url || dispensary.website}`);
- console.log(` Template: ${dispensary.scraper_template || 'N/A'}`);
- // TODO: Actually trigger the scraper here
- // For now, this is a placeholder that updates the status
- // You can integrate with your existing scraper infrastructure
- res.json({
- success: true,
- message: `Scraping queued for ${dispensary.name}`,
- type,
- dispensary: {
- id: dispensary.id,
- name: dispensary.name,
- slug: dispensary.slug
- }
- });
- }
- catch (error) {
- console.error('Error triggering scrape:', error);
- res.status(500).json({ error: 'Failed to trigger scraping' });
- }
-});
-// Update menu_type for a dispensary (dedicated endpoint)
-router.patch('/:id/menu-type', async (req, res) => {
- try {
- const { id } = req.params;
- const { menu_type } = req.body;
- // Validate menu_type
- if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
- return res.status(400).json({
- error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)`
- });
- }
- const result = await migrate_1.pool.query(`
- UPDATE dispensaries
- SET menu_type = $1, updated_at = CURRENT_TIMESTAMP
- WHERE id = $2
- RETURNING id, name, slug, menu_type, menu_provider, menu_url
- `, [menu_type || null, id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- res.json({
- success: true,
- dispensary: result.rows[0]
- });
- }
- catch (error) {
- console.error('Error updating menu_type:', error);
- res.status(500).json({ error: 'Failed to update menu_type' });
- }
-});
-// Bulk update menu_type for multiple dispensaries
-router.post('/bulk/menu-type', async (req, res) => {
- try {
- const { dispensary_ids, menu_type } = req.body;
- if (!Array.isArray(dispensary_ids) || dispensary_ids.length === 0) {
- return res.status(400).json({ error: 'dispensary_ids must be a non-empty array' });
- }
- // Validate menu_type
- if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
- return res.status(400).json({
- error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)`
- });
- }
- const result = await migrate_1.pool.query(`
- UPDATE dispensaries
- SET menu_type = $1, updated_at = CURRENT_TIMESTAMP
- WHERE id = ANY($2::int[])
- RETURNING id, name, slug, menu_type
- `, [menu_type || null, dispensary_ids]);
- res.json({
- success: true,
- updated_count: result.rowCount,
- dispensaries: result.rows
- });
- }
- catch (error) {
- console.error('Error bulk updating menu_type:', error);
- res.status(500).json({ error: 'Failed to bulk update menu_type' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/logs.js b/backend/dist/routes/logs.js
deleted file mode 100644
index b26654c6..00000000
--- a/backend/dist/routes/logs.js
+++ /dev/null
@@ -1,29 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const logger_1 = require("../services/logger");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-router.get('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { limit = '100', level, category } = req.query;
- const logs = logger_1.logger.getLogs(parseInt(limit), level, category);
- res.json({ logs });
- }
- catch (error) {
- console.error('Error fetching logs:', error);
- res.status(500).json({ error: 'Failed to fetch logs' });
- }
-});
-router.delete('/', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
- try {
- logger_1.logger.clear();
- res.json({ message: 'Logs cleared' });
- }
- catch (error) {
- console.error('Error clearing logs:', error);
- res.status(500).json({ error: 'Failed to clear logs' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/parallel-scrape.js b/backend/dist/routes/parallel-scrape.js
deleted file mode 100644
index 5384c256..00000000
--- a/backend/dist/routes/parallel-scrape.js
+++ /dev/null
@@ -1,182 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const migrate_1 = require("../db/migrate");
-const proxy_1 = require("../services/proxy");
-const middleware_1 = require("../auth/middleware");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
-// In-memory job tracking
-const activeJobs = new Map();
-// Get job status
-router.get('/status/:jobId', (req, res) => {
- const job = activeJobs.get(req.params.jobId);
- if (!job) {
- return res.status(404).json({ error: 'Job not found' });
- }
- res.json(job);
-});
-// List active jobs
-router.get('/jobs', (req, res) => {
- const jobs = Array.from(activeJobs.values());
- res.json({ jobs });
-});
-// Start parallel scrape
-router.post('/start', async (req, res) => {
- const { storeName = 'Deeply Rooted', workers = 15, useProxies = true } = req.body;
- try {
- // Find the store
- const storeResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${storeName}%`]);
- if (storeResult.rows.length === 0) {
- return res.status(404).json({ error: `Store not found: ${storeName}` });
- }
- const store = storeResult.rows[0];
- // Get categories
- const categoriesResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [store.id]);
- if (categoriesResult.rows.length === 0) {
- return res.status(404).json({ error: 'No categories found for this store' });
- }
- const categories = categoriesResult.rows;
- // Create job
- const jobId = `scrape-${Date.now()}`;
- const job = {
- id: jobId,
- storeName: store.name,
- status: 'running',
- workers,
- startedAt: new Date(),
- results: []
- };
- activeJobs.set(jobId, job);
- // Start scraping in background
- runParallelScrape(job, store, categories, workers, useProxies).catch(err => {
- console.error('Parallel scrape error:', err);
- job.status = 'failed';
- });
- res.json({
- message: 'Parallel scrape started',
- jobId,
- store: store.name,
- categories: categories.length,
- workers
- });
- }
- catch (error) {
- console.error('Failed to start parallel scrape:', error);
- res.status(500).json({ error: error.message });
- }
-});
-async function runParallelScrape(job, store, categories, numWorkers, useProxies) {
- const puppeteer = require('puppeteer-extra');
- const StealthPlugin = require('puppeteer-extra-plugin-stealth');
- puppeteer.use(StealthPlugin());
- // Expand categories for multiple passes
- const expandedCategories = [];
- const passes = Math.ceil(numWorkers / Math.max(categories.length, 1));
- for (let i = 0; i < passes; i++) {
- expandedCategories.push(...categories);
- }
- const categoryIndex = { current: 0 };
- const worker = async (workerId) => {
- while (categoryIndex.current < expandedCategories.length) {
- const idx = categoryIndex.current++;
- const category = expandedCategories[idx];
- if (!category)
- break;
- const result = await scrapeCategory(puppeteer, workerId, category, useProxies);
- job.results.push({
- category: category.name,
- success: result.success,
- products: result.products,
- error: result.error
- });
- // Delay between requests
- await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
- }
- };
- // Start workers with staggered starts
- const workers = [];
- for (let i = 0; i < numWorkers; i++) {
- workers.push(worker(i + 1));
- await new Promise(resolve => setTimeout(resolve, 500));
- }
- await Promise.all(workers);
- job.status = 'completed';
- job.completedAt = new Date();
- // Clean up job after 1 hour
- setTimeout(() => activeJobs.delete(job.id), 60 * 60 * 1000);
-}
-async function scrapeCategory(puppeteer, workerId, category, useProxies) {
- let browser = null;
- let proxyId = null;
- try {
- let proxy = null;
- if (useProxies) {
- proxy = await (0, proxy_1.getActiveProxy)();
- }
- const args = [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-accelerated-2d-canvas',
- '--disable-gpu',
- '--window-size=1920,1080',
- ];
- if (proxy) {
- proxyId = proxy.id;
- if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
- args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
- }
- else {
- args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
- }
- }
- browser = await puppeteer.launch({
- headless: 'new',
- args,
- executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium',
- });
- const page = await browser.newPage();
- await page.setUserAgent(FIREFOX_USER_AGENT);
- await page.setViewport({ width: 1920, height: 1080 });
- if (proxy?.username && proxy?.password) {
- await page.authenticate({
- username: proxy.username,
- password: proxy.password,
- });
- }
- console.log(`[Worker ${workerId}] Scraping: ${category.name} (${category.url})`);
- const response = await page.goto(category.url, {
- waitUntil: 'networkidle2',
- timeout: 60000,
- });
- if (!response || !response.ok()) {
- throw new Error(`Failed to load page: ${response?.status()}`);
- }
- await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
- timeout: 30000,
- }).catch(() => { });
- const products = await page.evaluate(() => {
- // Try data-testid first, then fall back to product links
- const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
- if (listItems.length > 0)
- return listItems.length;
- return document.querySelectorAll('a[href*="/product/"]').length;
- });
- console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
- await browser.close();
- return { success: true, products };
- }
- catch (error) {
- console.error(`[Worker ${workerId}] Error:`, error.message);
- if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
- (0, proxy_1.putProxyInTimeout)(proxyId, error.message);
- }
- if (browser) {
- await browser.close().catch(() => { });
- }
- return { success: false, products: 0, error: error.message };
- }
-}
-exports.default = router;
diff --git a/backend/dist/routes/products.js b/backend/dist/routes/products.js
deleted file mode 100644
index 3cab78b3..00000000
--- a/backend/dist/routes/products.js
+++ /dev/null
@@ -1,341 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const minio_1 = require("../utils/minio");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Freshness threshold: data older than this is considered stale
-const STALE_THRESHOLD_HOURS = 4;
-function calculateFreshness(lastCrawlAt) {
- if (!lastCrawlAt) {
- return {
- last_crawl_at: null,
- is_stale: true,
- freshness: 'Never crawled',
- hours_since_crawl: null
- };
- }
- const now = new Date();
- const diffMs = now.getTime() - lastCrawlAt.getTime();
- const diffHours = diffMs / (1000 * 60 * 60);
- const isStale = diffHours > STALE_THRESHOLD_HOURS;
- let freshnessText;
- if (diffHours < 1) {
- const mins = Math.round(diffHours * 60);
- freshnessText = `Last crawled ${mins} minute${mins !== 1 ? 's' : ''} ago`;
- }
- else if (diffHours < 24) {
- const hrs = Math.round(diffHours);
- freshnessText = `Last crawled ${hrs} hour${hrs !== 1 ? 's' : ''} ago`;
- }
- else {
- const days = Math.round(diffHours / 24);
- freshnessText = `Last crawled ${days} day${days !== 1 ? 's' : ''} ago`;
- }
- if (isStale) {
- freshnessText += ' (STALE)';
- }
- return {
- last_crawl_at: lastCrawlAt.toISOString(),
- is_stale: isStale,
- freshness: freshnessText,
- hours_since_crawl: Math.round(diffHours * 10) / 10
- };
-}
-// Helper function to filter fields from object
-function selectFields(obj, fields) {
- if (!fields || fields.length === 0)
- return obj;
- const result = {};
- fields.forEach(field => {
- if (obj.hasOwnProperty(field)) {
- result[field] = obj[field];
- }
- });
- return result;
-}
-// Get all products with filters, sorting, and field selection
-router.get('/', async (req, res) => {
- try {
- const { store_id, category_id, in_stock, search, brand, min_price, max_price, min_thc, max_thc, strain_type, sort_by = 'last_seen_at', sort_order = 'desc', limit = 50, offset = 0, fields } = req.query;
- // Validate sort field to prevent SQL injection
- const allowedSortFields = [
- 'id', 'name', 'brand', 'price', 'thc_percentage',
- 'cbd_percentage', 'last_seen_at', 'created_at'
- ];
- const sortField = allowedSortFields.includes(sort_by)
- ? sort_by
- : 'last_seen_at';
- const sortDirection = sort_order.toLowerCase() === 'asc' ? 'ASC' : 'DESC';
- let query = `
- SELECT p.*, s.name as store_name, c.name as category_name
- FROM products p
- LEFT JOIN stores s ON p.store_id = s.id
- LEFT JOIN categories c ON p.category_id = c.id
- WHERE 1=1
- `;
- const params = [];
- let paramCount = 1;
- // Store filter
- if (store_id) {
- query += ` AND p.store_id = $${paramCount}`;
- params.push(store_id);
- paramCount++;
- }
- // Category filter
- if (category_id) {
- query += ` AND p.category_id = $${paramCount}`;
- params.push(category_id);
- paramCount++;
- }
- // Stock filter
- if (in_stock !== undefined) {
- query += ` AND p.in_stock = $${paramCount}`;
- params.push(in_stock === 'true');
- paramCount++;
- }
- // Search filter
- if (search) {
- query += ` AND (p.name ILIKE $${paramCount} OR p.brand ILIKE $${paramCount} OR p.description ILIKE $${paramCount})`;
- params.push(`%${search}%`);
- paramCount++;
- }
- // Brand filter
- if (brand) {
- query += ` AND p.brand ILIKE $${paramCount}`;
- params.push(`%${brand}%`);
- paramCount++;
- }
- // Price range filter
- if (min_price) {
- query += ` AND p.price >= $${paramCount}`;
- params.push(parseFloat(min_price));
- paramCount++;
- }
- if (max_price) {
- query += ` AND p.price <= $${paramCount}`;
- params.push(parseFloat(max_price));
- paramCount++;
- }
- // THC range filter
- if (min_thc) {
- query += ` AND p.thc_percentage >= $${paramCount}`;
- params.push(parseFloat(min_thc));
- paramCount++;
- }
- if (max_thc) {
- query += ` AND p.thc_percentage <= $${paramCount}`;
- params.push(parseFloat(max_thc));
- paramCount++;
- }
- // Strain type filter
- if (strain_type) {
- query += ` AND p.strain_type = $${paramCount}`;
- params.push(strain_type);
- paramCount++;
- }
- // Sorting
- query += ` ORDER BY p.${sortField} ${sortDirection} LIMIT $${paramCount} OFFSET $${paramCount + 1}`;
- params.push(limit, offset);
- const result = await migrate_1.pool.query(query, params);
- // Add image URLs
- let products = result.rows.map((p) => ({
- ...p,
- image_url_full: p.local_image_path ? (0, minio_1.getImageUrl)(p.local_image_path) : p.image_url,
- thumbnail_url: p.thumbnail_path ? (0, minio_1.getImageUrl)(p.thumbnail_path) : null,
- medium_url: p.medium_path ? (0, minio_1.getImageUrl)(p.medium_path) : null,
- }));
- // Field selection
- if (fields) {
- const selectedFields = fields.split(',').map(f => f.trim());
- products = products.map((p) => selectFields(p, selectedFields));
- }
- // Get total count (reuse same filters)
- let countQuery = `SELECT COUNT(*) FROM products p WHERE 1=1`;
- const countParams = [];
- let countParamCount = 1;
- if (store_id) {
- countQuery += ` AND p.store_id = $${countParamCount}`;
- countParams.push(store_id);
- countParamCount++;
- }
- if (category_id) {
- countQuery += ` AND p.category_id = $${countParamCount}`;
- countParams.push(category_id);
- countParamCount++;
- }
- if (in_stock !== undefined) {
- countQuery += ` AND p.in_stock = $${countParamCount}`;
- countParams.push(in_stock === 'true');
- countParamCount++;
- }
- if (search) {
- countQuery += ` AND (p.name ILIKE $${countParamCount} OR p.brand ILIKE $${countParamCount} OR p.description ILIKE $${countParamCount})`;
- countParams.push(`%${search}%`);
- countParamCount++;
- }
- if (brand) {
- countQuery += ` AND p.brand ILIKE $${countParamCount}`;
- countParams.push(`%${brand}%`);
- countParamCount++;
- }
- if (min_price) {
- countQuery += ` AND p.price >= $${countParamCount}`;
- countParams.push(parseFloat(min_price));
- countParamCount++;
- }
- if (max_price) {
- countQuery += ` AND p.price <= $${countParamCount}`;
- countParams.push(parseFloat(max_price));
- countParamCount++;
- }
- if (min_thc) {
- countQuery += ` AND p.thc_percentage >= $${countParamCount}`;
- countParams.push(parseFloat(min_thc));
- countParamCount++;
- }
- if (max_thc) {
- countQuery += ` AND p.thc_percentage <= $${countParamCount}`;
- countParams.push(parseFloat(max_thc));
- countParamCount++;
- }
- if (strain_type) {
- countQuery += ` AND p.strain_type = $${countParamCount}`;
- countParams.push(strain_type);
- countParamCount++;
- }
- const countResult = await migrate_1.pool.query(countQuery, countParams);
- // Get freshness info if store_id is specified
- let freshnessInfo = null;
- let storeInfo = null;
- if (store_id) {
- const storeResult = await migrate_1.pool.query('SELECT id, name, last_scraped_at FROM stores WHERE id = $1', [store_id]);
- if (storeResult.rows.length > 0) {
- const store = storeResult.rows[0];
- storeInfo = { id: store.id, name: store.name };
- freshnessInfo = calculateFreshness(store.last_scraped_at);
- }
- }
- res.json({
- products,
- total: parseInt(countResult.rows[0].count),
- limit: parseInt(limit),
- offset: parseInt(offset),
- // Add freshness metadata when store_id is provided
- ...(freshnessInfo && {
- store: storeInfo,
- last_crawl_at: freshnessInfo.last_crawl_at,
- is_stale: freshnessInfo.is_stale,
- freshness: freshnessInfo.freshness,
- hours_since_crawl: freshnessInfo.hours_since_crawl
- }),
- filters: {
- store_id,
- category_id,
- in_stock,
- search,
- brand,
- min_price,
- max_price,
- min_thc,
- max_thc,
- strain_type,
- sort_by: sortField,
- sort_order: sortDirection
- }
- });
- }
- catch (error) {
- console.error('Error fetching products:', error);
- res.status(500).json({ error: 'Failed to fetch products' });
- }
-});
-// Get single product with optional field selection
-router.get('/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const { fields } = req.query;
- const result = await migrate_1.pool.query(`
- SELECT p.*, s.name as store_name, c.name as category_name
- FROM products p
- LEFT JOIN stores s ON p.store_id = s.id
- LEFT JOIN categories c ON p.category_id = c.id
- WHERE p.id = $1
- `, [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Product not found' });
- }
- let product = result.rows[0];
- product.image_url_full = product.local_image_path
- ? (0, minio_1.getImageUrl)(product.local_image_path)
- : product.image_url;
- product.thumbnail_url = product.thumbnail_path ? (0, minio_1.getImageUrl)(product.thumbnail_path) : null;
- product.medium_url = product.medium_path ? (0, minio_1.getImageUrl)(product.medium_path) : null;
- // Field selection
- if (fields) {
- const selectedFields = fields.split(',').map(f => f.trim());
- product = selectFields(product, selectedFields);
- }
- res.json({ product });
- }
- catch (error) {
- console.error('Error fetching product:', error);
- res.status(500).json({ error: 'Failed to fetch product' });
- }
-});
-// Get available brands (for filter dropdowns)
-router.get('/meta/brands', async (req, res) => {
- try {
- const { store_id } = req.query;
- let query = `
- SELECT DISTINCT brand
- FROM products
- WHERE brand IS NOT NULL AND brand != ''
- `;
- const params = [];
- if (store_id) {
- query += ' AND store_id = $1';
- params.push(store_id);
- }
- query += ' ORDER BY brand';
- const result = await migrate_1.pool.query(query, params);
- const brands = result.rows.map((row) => row.brand);
- res.json({ brands });
- }
- catch (error) {
- console.error('Error fetching brands:', error);
- res.status(500).json({ error: 'Failed to fetch brands' });
- }
-});
-// Get price range (for filter sliders)
-router.get('/meta/price-range', async (req, res) => {
- try {
- const { store_id } = req.query;
- let query = `
- SELECT
- MIN(price) as min_price,
- MAX(price) as max_price,
- AVG(price) as avg_price
- FROM products
- WHERE price IS NOT NULL
- `;
- const params = [];
- if (store_id) {
- query += ' AND store_id = $1';
- params.push(store_id);
- }
- const result = await migrate_1.pool.query(query, params);
- res.json({
- min_price: parseFloat(result.rows[0].min_price) || 0,
- max_price: parseFloat(result.rows[0].max_price) || 0,
- avg_price: parseFloat(result.rows[0].avg_price) || 0
- });
- }
- catch (error) {
- console.error('Error fetching price range:', error);
- res.status(500).json({ error: 'Failed to fetch price range' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/proxies.js b/backend/dist/routes/proxies.js
deleted file mode 100644
index 24d2d1d2..00000000
--- a/backend/dist/routes/proxies.js
+++ /dev/null
@@ -1,262 +0,0 @@
-"use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const proxy_1 = require("../services/proxy");
-const proxyTestQueue_1 = require("../services/proxyTestQueue");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Get all proxies
-router.get('/', async (req, res) => {
- try {
- const result = await migrate_1.pool.query(`
- SELECT id, host, port, protocol, active, is_anonymous,
- last_tested_at, test_result, response_time_ms, created_at,
- city, state, country, country_code, location_updated_at
- FROM proxies
- ORDER BY created_at DESC
- `);
- res.json({ proxies: result.rows });
- }
- catch (error) {
- console.error('Error fetching proxies:', error);
- res.status(500).json({ error: 'Failed to fetch proxies' });
- }
-});
-// Get active proxy test job (must be before /:id route)
-router.get('/test-job', async (req, res) => {
- try {
- const job = await (0, proxyTestQueue_1.getActiveProxyTestJob)();
- res.json({ job });
- }
- catch (error) {
- console.error('Error fetching active job:', error);
- res.status(500).json({ error: 'Failed to fetch active job' });
- }
-});
-// Get proxy test job status (must be before /:id route)
-router.get('/test-job/:jobId', async (req, res) => {
- try {
- const { jobId } = req.params;
- const job = await (0, proxyTestQueue_1.getProxyTestJob)(parseInt(jobId));
- if (!job) {
- return res.status(404).json({ error: 'Job not found' });
- }
- res.json({ job });
- }
- catch (error) {
- console.error('Error fetching job status:', error);
- res.status(500).json({ error: 'Failed to fetch job status' });
- }
-});
-// Get single proxy
-router.get('/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query(`
- SELECT id, host, port, protocol, username, active, is_anonymous,
- last_tested_at, test_result, response_time_ms, created_at
- FROM proxies
- WHERE id = $1
- `, [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Proxy not found' });
- }
- res.json({ proxy: result.rows[0] });
- }
- catch (error) {
- console.error('Error fetching proxy:', error);
- res.status(500).json({ error: 'Failed to fetch proxy' });
- }
-});
-// Add single proxy
-router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { host, port, protocol, username, password } = req.body;
- if (!host || !port || !protocol) {
- return res.status(400).json({ error: 'Host, port, and protocol required' });
- }
- // Test and add proxy
- const proxyId = await (0, proxy_1.addProxy)(host, port, protocol, username, password);
- const result = await migrate_1.pool.query(`
- SELECT * FROM proxies WHERE id = $1
- `, [proxyId]);
- res.status(201).json({ proxy: result.rows[0] });
- }
- catch (error) {
- console.error('Error adding proxy:', error);
- res.status(400).json({ error: error.message || 'Failed to add proxy' });
- }
-});
-// Add multiple proxies
-router.post('/bulk', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { proxies } = req.body;
- if (!proxies || !Array.isArray(proxies)) {
- return res.status(400).json({ error: 'Proxies array required' });
- }
- const result = await (0, proxy_1.addProxiesFromList)(proxies);
- res.status(201).json(result);
- }
- catch (error) {
- console.error('Error adding proxies:', error);
- res.status(500).json({ error: 'Failed to add proxies' });
- }
-});
-// Test single proxy
-router.post('/:id/test', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const proxyResult = await migrate_1.pool.query(`
- SELECT host, port, protocol, username, password
- FROM proxies
- WHERE id = $1
- `, [id]);
- if (proxyResult.rows.length === 0) {
- return res.status(404).json({ error: 'Proxy not found' });
- }
- const proxy = proxyResult.rows[0];
- const testResult = await (0, proxy_1.testProxy)(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
- // Update proxy with test results
- await migrate_1.pool.query(`
- UPDATE proxies
- SET last_tested_at = CURRENT_TIMESTAMP,
- test_result = $1,
- response_time_ms = $2,
- is_anonymous = $3,
- active = $4
- WHERE id = $5
- `, [
- testResult.success ? 'success' : 'failed',
- testResult.responseTimeMs,
- testResult.isAnonymous,
- testResult.success,
- id
- ]);
- res.json({ test_result: testResult });
- }
- catch (error) {
- console.error('Error testing proxy:', error);
- res.status(500).json({ error: 'Failed to test proxy' });
- }
-});
-// Start proxy test job
-router.post('/test-all', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const jobId = await (0, proxyTestQueue_1.createProxyTestJob)();
- res.json({ jobId, message: 'Proxy test job started' });
- }
- catch (error) {
- console.error('Error starting proxy test job:', error);
- res.status(500).json({ error: 'Failed to start proxy test job' });
- }
-});
-// Cancel proxy test job
-router.post('/test-job/:jobId/cancel', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { jobId } = req.params;
- const cancelled = await (0, proxyTestQueue_1.cancelProxyTestJob)(parseInt(jobId));
- if (!cancelled) {
- return res.status(404).json({ error: 'Job not found or already completed' });
- }
- res.json({ message: 'Job cancelled successfully' });
- }
- catch (error) {
- console.error('Error cancelling job:', error);
- res.status(500).json({ error: 'Failed to cancel job' });
- }
-});
-// Update proxy
-router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const { host, port, protocol, username, password, active } = req.body;
- const result = await migrate_1.pool.query(`
- UPDATE proxies
- SET host = COALESCE($1, host),
- port = COALESCE($2, port),
- protocol = COALESCE($3, protocol),
- username = COALESCE($4, username),
- password = COALESCE($5, password),
- active = COALESCE($6, active),
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $7
- RETURNING *
- `, [host, port, protocol, username, password, active, id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Proxy not found' });
- }
- res.json({ proxy: result.rows[0] });
- }
- catch (error) {
- console.error('Error updating proxy:', error);
- res.status(500).json({ error: 'Failed to update proxy' });
- }
-});
-// Delete proxy
-router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query(`
- DELETE FROM proxies WHERE id = $1 RETURNING id
- `, [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Proxy not found' });
- }
- res.json({ message: 'Proxy deleted successfully' });
- }
- catch (error) {
- console.error('Error deleting proxy:', error);
- res.status(500).json({ error: 'Failed to delete proxy' });
- }
-});
-// Update all proxy locations
-router.post('/update-locations', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { updateAllProxyLocations } = await Promise.resolve().then(() => __importStar(require('../services/geolocation')));
- // Run in background
- updateAllProxyLocations().catch(err => {
- console.error('❌ Location update failed:', err);
- });
- res.json({ message: 'Location update job started' });
- }
- catch (error) {
- console.error('Error starting location update:', error);
- res.status(500).json({ error: 'Failed to start location update' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/public-api.js b/backend/dist/routes/public-api.js
deleted file mode 100644
index 88b78aa6..00000000
--- a/backend/dist/routes/public-api.js
+++ /dev/null
@@ -1,668 +0,0 @@
-"use strict";
-/**
- * Public API Routes for External Consumers (WordPress, etc.)
- *
- * These routes use the dutchie_az data pipeline and are protected by API key auth.
- * Designed for Deeply Rooted and other WordPress sites consuming menu data.
- */
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const migrate_1 = require("../db/migrate");
-const connection_1 = require("../dutchie-az/db/connection");
-const ipaddr_js_1 = __importDefault(require("ipaddr.js"));
-const router = (0, express_1.Router)();
-// ============================================================
-// MIDDLEWARE
-// ============================================================
-/**
- * Validates if an IP address matches any of the allowed IP patterns
- */
-function isIpAllowed(clientIp, allowedIps) {
- try {
- const clientAddr = ipaddr_js_1.default.process(clientIp);
- for (const allowedIp of allowedIps) {
- const trimmed = allowedIp.trim();
- if (!trimmed)
- continue;
- if (trimmed.includes('/')) {
- try {
- const range = ipaddr_js_1.default.parseCIDR(trimmed);
- if (clientAddr.match(range)) {
- return true;
- }
- }
- catch (e) {
- console.warn(`Invalid CIDR notation: ${trimmed}`);
- continue;
- }
- }
- else {
- try {
- const allowedAddr = ipaddr_js_1.default.process(trimmed);
- if (clientAddr.toString() === allowedAddr.toString()) {
- return true;
- }
- }
- catch (e) {
- console.warn(`Invalid IP address: ${trimmed}`);
- continue;
- }
- }
- }
- return false;
- }
- catch (error) {
- console.error('Error processing client IP:', error);
- return false;
- }
-}
-/**
- * Validates if a domain matches any of the allowed domain patterns
- */
-function isDomainAllowed(origin, allowedDomains) {
- try {
- const url = new URL(origin);
- const domain = url.hostname;
- for (const allowedDomain of allowedDomains) {
- const trimmed = allowedDomain.trim();
- if (!trimmed)
- continue;
- if (trimmed.startsWith('*.')) {
- const baseDomain = trimmed.substring(2);
- if (domain === baseDomain || domain.endsWith('.' + baseDomain)) {
- return true;
- }
- }
- else {
- if (domain === trimmed) {
- return true;
- }
- }
- }
- return false;
- }
- catch (error) {
- console.error('Error processing domain:', error);
- return false;
- }
-}
-/**
- * Middleware to validate API key and resolve dispensary -> dutchie_az store mapping
- */
-async function validatePublicApiKey(req, res, next) {
- const apiKey = req.headers['x-api-key'];
- if (!apiKey) {
- return res.status(401).json({
- error: 'Missing API key',
- message: 'Provide your API key in the X-API-Key header'
- });
- }
- try {
- // Query WordPress permissions table with store info
- const result = await migrate_1.pool.query(`
- SELECT
- p.id,
- p.user_name,
- p.api_key,
- p.allowed_ips,
- p.allowed_domains,
- p.is_active,
- p.store_id,
- p.store_name
- FROM wp_dutchie_api_permissions p
- WHERE p.api_key = $1 AND p.is_active = 1
- `, [apiKey]);
- if (result.rows.length === 0) {
- return res.status(401).json({
- error: 'Invalid API key'
- });
- }
- const permission = result.rows[0];
- // Validate IP if configured
- const clientIp = req.headers['x-forwarded-for']?.split(',')[0].trim() ||
- req.headers['x-real-ip'] ||
- req.ip ||
- req.connection.remoteAddress ||
- '';
- if (permission.allowed_ips) {
- const allowedIps = permission.allowed_ips.split('\n').filter((ip) => ip.trim());
- if (allowedIps.length > 0 && !isIpAllowed(clientIp, allowedIps)) {
- return res.status(403).json({
- error: 'IP address not allowed',
- client_ip: clientIp
- });
- }
- }
- // Validate domain if configured
- const origin = req.get('origin') || req.get('referer') || '';
- if (permission.allowed_domains && origin) {
- const allowedDomains = permission.allowed_domains.split('\n').filter((d) => d.trim());
- if (allowedDomains.length > 0 && !isDomainAllowed(origin, allowedDomains)) {
- return res.status(403).json({
- error: 'Domain not allowed',
- origin: origin
- });
- }
- }
- // Resolve the dutchie_az store for this store
- // Match by store name (from main DB) to dutchie_az.dispensaries.name
- const storeResult = await (0, connection_1.query)(`
- SELECT id FROM dispensaries
- WHERE LOWER(TRIM(name)) = LOWER(TRIM($1))
- OR LOWER(TRIM(name)) LIKE LOWER(TRIM($1)) || '%'
- OR LOWER(TRIM($1)) LIKE LOWER(TRIM(name)) || '%'
- ORDER BY
- CASE WHEN LOWER(TRIM(name)) = LOWER(TRIM($1)) THEN 0 ELSE 1 END,
- id
- LIMIT 1
- `, [permission.store_name]);
- if (storeResult.rows.length > 0) {
- permission.dutchie_az_store_id = storeResult.rows[0].id;
- }
- // Update last_used_at timestamp (async, don't wait)
- migrate_1.pool.query(`
- UPDATE wp_dutchie_api_permissions
- SET last_used_at = CURRENT_TIMESTAMP
- WHERE id = $1
- `, [permission.id]).catch((err) => {
- console.error('Error updating last_used_at:', err);
- });
- req.apiPermission = permission;
- next();
- }
- catch (error) {
- console.error('Public API validation error:', error);
- return res.status(500).json({
- error: 'Internal server error during API validation'
- });
- }
-}
-// Apply middleware to all routes
-router.use(validatePublicApiKey);
-// ============================================================
-// PRODUCT ENDPOINTS
-// ============================================================
-/**
- * GET /api/v1/products
- * Get products for the authenticated dispensary
- *
- * Query params:
- * - category: Filter by product type (e.g., 'flower', 'edible')
- * - brand: Filter by brand name
- * - in_stock_only: Only return in-stock products (default: false)
- * - limit: Max products to return (default: 100, max: 500)
- * - offset: Pagination offset (default: 0)
- */
-router.get('/products', async (req, res) => {
- try {
- const permission = req.apiPermission;
- // Check if we have a dutchie_az store mapping
- if (!permission.dutchie_az_store_id) {
- return res.status(503).json({
- error: 'No menu data available',
- message: `Menu data for ${permission.store_name} is not yet available. The dispensary may not be set up in the new data pipeline.`,
- dispensary_name: permission.store_name
- });
- }
- const { category, brand, in_stock_only = 'false', limit = '100', offset = '0' } = req.query;
- // Build query
- let whereClause = 'WHERE p.dispensary_id = $1';
- const params = [permission.dutchie_az_store_id];
- let paramIndex = 2;
- // Filter by stock status if requested
- if (in_stock_only === 'true' || in_stock_only === '1') {
- whereClause += ` AND p.stock_status = 'in_stock'`;
- }
- // Filter by category (maps to 'type' in dutchie_az)
- if (category) {
- whereClause += ` AND LOWER(p.type) = LOWER($${paramIndex})`;
- params.push(category);
- paramIndex++;
- }
- // Filter by brand
- if (brand) {
- whereClause += ` AND LOWER(p.brand_name) LIKE LOWER($${paramIndex})`;
- params.push(`%${brand}%`);
- paramIndex++;
- }
- // Enforce limits
- const limitNum = Math.min(parseInt(limit, 10) || 100, 500);
- const offsetNum = parseInt(offset, 10) || 0;
- params.push(limitNum, offsetNum);
- // Query products with latest snapshot data
- const { rows: products } = await (0, connection_1.query)(`
- SELECT
- p.id,
- p.external_product_id as dutchie_id,
- p.name,
- p.brand_name as brand,
- p.type as category,
- p.subcategory,
- p.strain_type,
- p.stock_status,
- p.thc,
- p.cbd,
- p.primary_image_url as image_url,
- p.images,
- p.effects,
- p.created_at,
- p.updated_at,
- -- Latest snapshot data for pricing
- s.rec_min_price_cents,
- s.rec_max_price_cents,
- s.rec_min_special_price_cents,
- s.med_min_price_cents,
- s.med_max_price_cents,
- s.med_min_special_price_cents,
- s.total_quantity_available,
- s.options,
- s.special,
- s.crawled_at as snapshot_at
- FROM dutchie_products p
- LEFT JOIN LATERAL (
- SELECT * FROM dutchie_product_snapshots
- WHERE dutchie_product_id = p.id
- ORDER BY crawled_at DESC
- LIMIT 1
- ) s ON true
- ${whereClause}
- ORDER BY p.name ASC
- LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
- `, params);
- // Get total count for pagination
- const { rows: countRows } = await (0, connection_1.query)(`
- SELECT COUNT(*) as total FROM dutchie_products p ${whereClause}
- `, params.slice(0, -2));
- // Transform products to backward-compatible format
- const transformedProducts = products.map((p) => {
- // Extract first image URL from images array
- let imageUrl = p.image_url;
- if (!imageUrl && p.images && Array.isArray(p.images) && p.images.length > 0) {
- const firstImage = p.images[0];
- imageUrl = typeof firstImage === 'string' ? firstImage : firstImage?.url;
- }
- // Convert prices from cents to dollars
- const regularPrice = p.rec_min_price_cents
- ? (p.rec_min_price_cents / 100).toFixed(2)
- : null;
- const salePrice = p.rec_min_special_price_cents
- ? (p.rec_min_special_price_cents / 100).toFixed(2)
- : null;
- return {
- id: p.id,
- dutchie_id: p.dutchie_id,
- name: p.name,
- brand: p.brand || null,
- category: p.category || null,
- subcategory: p.subcategory || null,
- strain_type: p.strain_type || null,
- description: null, // Not stored in dutchie_products, would need snapshot
- regular_price: regularPrice,
- sale_price: salePrice,
- thc_percentage: p.thc ? parseFloat(p.thc) : null,
- cbd_percentage: p.cbd ? parseFloat(p.cbd) : null,
- image_url: imageUrl || null,
- in_stock: p.stock_status === 'in_stock',
- on_special: p.special || false,
- effects: p.effects || [],
- options: p.options || [],
- quantity_available: p.total_quantity_available || 0,
- created_at: p.created_at,
- updated_at: p.updated_at,
- snapshot_at: p.snapshot_at
- };
- });
- res.json({
- success: true,
- dispensary: permission.store_name,
- products: transformedProducts,
- pagination: {
- total: parseInt(countRows[0]?.total || '0', 10),
- limit: limitNum,
- offset: offsetNum,
- has_more: offsetNum + products.length < parseInt(countRows[0]?.total || '0', 10)
- }
- });
- }
- catch (error) {
- console.error('Public API products error:', error);
- res.status(500).json({
- error: 'Failed to fetch products',
- message: error.message
- });
- }
-});
-/**
- * GET /api/v1/products/:id
- * Get a single product by ID
- */
-router.get('/products/:id', async (req, res) => {
- try {
- const permission = req.apiPermission;
- const { id } = req.params;
- if (!permission.dutchie_az_store_id) {
- return res.status(503).json({
- error: 'No menu data available',
- message: `Menu data for ${permission.store_name} is not yet available.`
- });
- }
- // Get product with latest snapshot
- const { rows: products } = await (0, connection_1.query)(`
- SELECT
- p.*,
- s.rec_min_price_cents,
- s.rec_max_price_cents,
- s.rec_min_special_price_cents,
- s.med_min_price_cents,
- s.med_max_price_cents,
- s.total_quantity_available,
- s.options,
- s.special,
- s.crawled_at as snapshot_at
- FROM dutchie_products p
- LEFT JOIN LATERAL (
- SELECT * FROM dutchie_product_snapshots
- WHERE dutchie_product_id = p.id
- ORDER BY crawled_at DESC
- LIMIT 1
- ) s ON true
- WHERE p.id = $1 AND p.dispensary_id = $2
- `, [id, permission.dutchie_az_store_id]);
- if (products.length === 0) {
- return res.status(404).json({
- error: 'Product not found'
- });
- }
- const p = products[0];
- // Extract first image URL
- let imageUrl = p.primary_image_url;
- if (!imageUrl && p.images && Array.isArray(p.images) && p.images.length > 0) {
- const firstImage = p.images[0];
- imageUrl = typeof firstImage === 'string' ? firstImage : firstImage?.url;
- }
- res.json({
- success: true,
- product: {
- id: p.id,
- dutchie_id: p.external_product_id,
- name: p.name,
- brand: p.brand_name || null,
- category: p.type || null,
- subcategory: p.subcategory || null,
- strain_type: p.strain_type || null,
- regular_price: p.rec_min_price_cents ? (p.rec_min_price_cents / 100).toFixed(2) : null,
- sale_price: p.rec_min_special_price_cents ? (p.rec_min_special_price_cents / 100).toFixed(2) : null,
- thc_percentage: p.thc ? parseFloat(p.thc) : null,
- cbd_percentage: p.cbd ? parseFloat(p.cbd) : null,
- image_url: imageUrl || null,
- images: p.images || [],
- in_stock: p.stock_status === 'in_stock',
- on_special: p.special || false,
- effects: p.effects || [],
- options: p.options || [],
- quantity_available: p.total_quantity_available || 0,
- created_at: p.created_at,
- updated_at: p.updated_at,
- snapshot_at: p.snapshot_at
- }
- });
- }
- catch (error) {
- console.error('Public API product detail error:', error);
- res.status(500).json({
- error: 'Failed to fetch product',
- message: error.message
- });
- }
-});
-/**
- * GET /api/v1/categories
- * Get all categories for the authenticated dispensary
- */
-router.get('/categories', async (req, res) => {
- try {
- const permission = req.apiPermission;
- if (!permission.dutchie_az_store_id) {
- return res.status(503).json({
- error: 'No menu data available',
- message: `Menu data for ${permission.store_name} is not yet available.`
- });
- }
- const { rows: categories } = await (0, connection_1.query)(`
- SELECT
- type as category,
- subcategory,
- COUNT(*) as product_count,
- COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count
- FROM dutchie_products
- WHERE dispensary_id = $1 AND type IS NOT NULL
- GROUP BY type, subcategory
- ORDER BY type, subcategory
- `, [permission.dutchie_az_store_id]);
- res.json({
- success: true,
- dispensary: permission.store_name,
- categories
- });
- }
- catch (error) {
- console.error('Public API categories error:', error);
- res.status(500).json({
- error: 'Failed to fetch categories',
- message: error.message
- });
- }
-});
-/**
- * GET /api/v1/brands
- * Get all brands for the authenticated dispensary
- */
-router.get('/brands', async (req, res) => {
- try {
- const permission = req.apiPermission;
- if (!permission.dutchie_az_store_id) {
- return res.status(503).json({
- error: 'No menu data available',
- message: `Menu data for ${permission.store_name} is not yet available.`
- });
- }
- const { rows: brands } = await (0, connection_1.query)(`
- SELECT
- brand_name as brand,
- COUNT(*) as product_count,
- COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count
- FROM dutchie_products
- WHERE dispensary_id = $1 AND brand_name IS NOT NULL
- GROUP BY brand_name
- ORDER BY product_count DESC
- `, [permission.dutchie_az_store_id]);
- res.json({
- success: true,
- dispensary: permission.store_name,
- brands
- });
- }
- catch (error) {
- console.error('Public API brands error:', error);
- res.status(500).json({
- error: 'Failed to fetch brands',
- message: error.message
- });
- }
-});
-/**
- * GET /api/v1/specials
- * Get products on special/sale for the authenticated dispensary
- */
-router.get('/specials', async (req, res) => {
- try {
- const permission = req.apiPermission;
- if (!permission.dutchie_az_store_id) {
- return res.status(503).json({
- error: 'No menu data available',
- message: `Menu data for ${permission.store_name} is not yet available.`
- });
- }
- const { limit = '100', offset = '0' } = req.query;
- const limitNum = Math.min(parseInt(limit, 10) || 100, 500);
- const offsetNum = parseInt(offset, 10) || 0;
- // Get products with special pricing from latest snapshot
- const { rows: products } = await (0, connection_1.query)(`
- SELECT
- p.id,
- p.external_product_id as dutchie_id,
- p.name,
- p.brand_name as brand,
- p.type as category,
- p.subcategory,
- p.strain_type,
- p.stock_status,
- p.primary_image_url as image_url,
- s.rec_min_price_cents,
- s.rec_min_special_price_cents,
- s.special,
- s.options,
- p.updated_at,
- s.crawled_at as snapshot_at
- FROM dutchie_products p
- INNER JOIN LATERAL (
- SELECT * FROM dutchie_product_snapshots
- WHERE dutchie_product_id = p.id
- ORDER BY crawled_at DESC
- LIMIT 1
- ) s ON true
- WHERE p.dispensary_id = $1
- AND s.special = true
- AND p.stock_status = 'in_stock'
- ORDER BY p.name ASC
- LIMIT $2 OFFSET $3
- `, [permission.dutchie_az_store_id, limitNum, offsetNum]);
- // Get total count
- const { rows: countRows } = await (0, connection_1.query)(`
- SELECT COUNT(*) as total
- FROM dutchie_products p
- INNER JOIN LATERAL (
- SELECT special FROM dutchie_product_snapshots
- WHERE dutchie_product_id = p.id
- ORDER BY crawled_at DESC
- LIMIT 1
- ) s ON true
- WHERE p.dispensary_id = $1
- AND s.special = true
- AND p.stock_status = 'in_stock'
- `, [permission.dutchie_az_store_id]);
- const transformedProducts = products.map((p) => ({
- id: p.id,
- dutchie_id: p.dutchie_id,
- name: p.name,
- brand: p.brand || null,
- category: p.category || null,
- strain_type: p.strain_type || null,
- regular_price: p.rec_min_price_cents ? (p.rec_min_price_cents / 100).toFixed(2) : null,
- sale_price: p.rec_min_special_price_cents ? (p.rec_min_special_price_cents / 100).toFixed(2) : null,
- image_url: p.image_url || null,
- in_stock: p.stock_status === 'in_stock',
- options: p.options || [],
- updated_at: p.updated_at,
- snapshot_at: p.snapshot_at
- }));
- res.json({
- success: true,
- dispensary: permission.store_name,
- specials: transformedProducts,
- pagination: {
- total: parseInt(countRows[0]?.total || '0', 10),
- limit: limitNum,
- offset: offsetNum,
- has_more: offsetNum + products.length < parseInt(countRows[0]?.total || '0', 10)
- }
- });
- }
- catch (error) {
- console.error('Public API specials error:', error);
- res.status(500).json({
- error: 'Failed to fetch specials',
- message: error.message
- });
- }
-});
-/**
- * GET /api/v1/menu
- * Get complete menu summary for the authenticated dispensary
- */
-router.get('/menu', async (req, res) => {
- try {
- const permission = req.apiPermission;
- if (!permission.dutchie_az_store_id) {
- return res.status(503).json({
- error: 'No menu data available',
- message: `Menu data for ${permission.store_name} is not yet available.`
- });
- }
- // Get counts by category
- const { rows: categoryCounts } = await (0, connection_1.query)(`
- SELECT
- type as category,
- COUNT(*) as total,
- COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock
- FROM dutchie_products
- WHERE dispensary_id = $1 AND type IS NOT NULL
- GROUP BY type
- ORDER BY total DESC
- `, [permission.dutchie_az_store_id]);
- // Get overall stats
- const { rows: stats } = await (0, connection_1.query)(`
- SELECT
- COUNT(*) as total_products,
- COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count,
- COUNT(DISTINCT brand_name) as brand_count,
- COUNT(DISTINCT type) as category_count,
- MAX(updated_at) as last_updated
- FROM dutchie_products
- WHERE dispensary_id = $1
- `, [permission.dutchie_az_store_id]);
- // Get specials count
- const { rows: specialsCount } = await (0, connection_1.query)(`
- SELECT COUNT(*) as count
- FROM dutchie_products p
- INNER JOIN LATERAL (
- SELECT special FROM dutchie_product_snapshots
- WHERE dutchie_product_id = p.id
- ORDER BY crawled_at DESC
- LIMIT 1
- ) s ON true
- WHERE p.dispensary_id = $1
- AND s.special = true
- AND p.stock_status = 'in_stock'
- `, [permission.dutchie_az_store_id]);
- const summary = stats[0] || {};
- res.json({
- success: true,
- dispensary: permission.store_name,
- menu: {
- total_products: parseInt(summary.total_products || '0', 10),
- in_stock_count: parseInt(summary.in_stock_count || '0', 10),
- brand_count: parseInt(summary.brand_count || '0', 10),
- category_count: parseInt(summary.category_count || '0', 10),
- specials_count: parseInt(specialsCount[0]?.count || '0', 10),
- last_updated: summary.last_updated,
- categories: categoryCounts.map((c) => ({
- name: c.category,
- total: parseInt(c.total, 10),
- in_stock: parseInt(c.in_stock, 10)
- }))
- }
- });
- }
- catch (error) {
- console.error('Public API menu error:', error);
- res.status(500).json({
- error: 'Failed to fetch menu summary',
- message: error.message
- });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/schedule.js b/backend/dist/routes/schedule.js
deleted file mode 100644
index 1bad705c..00000000
--- a/backend/dist/routes/schedule.js
+++ /dev/null
@@ -1,887 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const crawl_scheduler_1 = require("../services/crawl-scheduler");
-const store_crawl_orchestrator_1 = require("../services/store-crawl-orchestrator");
-const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
-const migrate_1 = require("../db/migrate");
-const graphql_client_1 = require("../dutchie-az/services/graphql-client");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// ============================================
-// Global Schedule Endpoints
-// ============================================
-/**
- * GET /api/schedule/global
- * Get global schedule settings
- */
-router.get('/global', async (req, res) => {
- try {
- const schedules = await (0, crawl_scheduler_1.getGlobalSchedule)();
- res.json({ schedules });
- }
- catch (error) {
- console.error('Error fetching global schedule:', error);
- res.status(500).json({ error: 'Failed to fetch global schedule' });
- }
-});
-/**
- * PUT /api/schedule/global/:type
- * Update global schedule setting
- */
-router.put('/global/:type', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { type } = req.params;
- const { enabled, interval_hours, run_time } = req.body;
- if (type !== 'global_interval' && type !== 'daily_special') {
- return res.status(400).json({ error: 'Invalid schedule type' });
- }
- const schedule = await (0, crawl_scheduler_1.updateGlobalSchedule)(type, {
- enabled,
- interval_hours,
- run_time
- });
- // Restart scheduler to apply changes
- await (0, crawl_scheduler_1.restartCrawlScheduler)();
- res.json({ schedule, message: 'Schedule updated and scheduler restarted' });
- }
- catch (error) {
- console.error('Error updating global schedule:', error);
- res.status(500).json({ error: 'Failed to update global schedule' });
- }
-});
-// ============================================
-// Store Schedule Endpoints
-// ============================================
-/**
- * GET /api/schedule/stores
- * Get all store schedule statuses
- */
-router.get('/stores', async (req, res) => {
- try {
- const stores = await (0, crawl_scheduler_1.getStoreScheduleStatuses)();
- res.json({ stores });
- }
- catch (error) {
- console.error('Error fetching store schedules:', error);
- res.status(500).json({ error: 'Failed to fetch store schedules' });
- }
-});
-/**
- * GET /api/schedule/stores/:storeId
- * Get schedule for a specific store
- */
-router.get('/stores/:storeId', async (req, res) => {
- try {
- const storeId = parseInt(req.params.storeId);
- if (isNaN(storeId)) {
- return res.status(400).json({ error: 'Invalid store ID' });
- }
- const schedule = await (0, crawl_scheduler_1.getStoreSchedule)(storeId);
- res.json({ schedule });
- }
- catch (error) {
- console.error('Error fetching store schedule:', error);
- res.status(500).json({ error: 'Failed to fetch store schedule' });
- }
-});
-/**
- * PUT /api/schedule/stores/:storeId
- * Update schedule for a specific store
- */
-router.put('/stores/:storeId', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const storeId = parseInt(req.params.storeId);
- if (isNaN(storeId)) {
- return res.status(400).json({ error: 'Invalid store ID' });
- }
- const { enabled, interval_hours, daily_special_enabled, daily_special_time, priority } = req.body;
- const schedule = await (0, crawl_scheduler_1.updateStoreSchedule)(storeId, {
- enabled,
- interval_hours,
- daily_special_enabled,
- daily_special_time,
- priority
- });
- res.json({ schedule });
- }
- catch (error) {
- console.error('Error updating store schedule:', error);
- res.status(500).json({ error: 'Failed to update store schedule' });
- }
-});
-// ============================================
-// Job Queue Endpoints
-// ============================================
-/**
- * GET /api/schedule/jobs
- * Get recent jobs
- */
-router.get('/jobs', async (req, res) => {
- try {
- const limit = parseInt(req.query.limit) || 50;
- const jobs = await (0, crawl_scheduler_1.getAllRecentJobs)(Math.min(limit, 200));
- res.json({ jobs });
- }
- catch (error) {
- console.error('Error fetching jobs:', error);
- res.status(500).json({ error: 'Failed to fetch jobs' });
- }
-});
-/**
- * GET /api/schedule/jobs/store/:storeId
- * Get recent jobs for a specific store
- */
-router.get('/jobs/store/:storeId', async (req, res) => {
- try {
- const storeId = parseInt(req.params.storeId);
- if (isNaN(storeId)) {
- return res.status(400).json({ error: 'Invalid store ID' });
- }
- const limit = parseInt(req.query.limit) || 10;
- const jobs = await (0, crawl_scheduler_1.getRecentJobs)(storeId, Math.min(limit, 100));
- res.json({ jobs });
- }
- catch (error) {
- console.error('Error fetching store jobs:', error);
- res.status(500).json({ error: 'Failed to fetch store jobs' });
- }
-});
-/**
- * POST /api/schedule/jobs/:jobId/cancel
- * Cancel a pending job
- */
-router.post('/jobs/:jobId/cancel', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const jobId = parseInt(req.params.jobId);
- if (isNaN(jobId)) {
- return res.status(400).json({ error: 'Invalid job ID' });
- }
- const cancelled = await (0, crawl_scheduler_1.cancelJob)(jobId);
- if (cancelled) {
- res.json({ success: true, message: 'Job cancelled' });
- }
- else {
- res.status(400).json({ error: 'Job could not be cancelled (may not be pending)' });
- }
- }
- catch (error) {
- console.error('Error cancelling job:', error);
- res.status(500).json({ error: 'Failed to cancel job' });
- }
-});
-// ============================================
-// Manual Trigger Endpoints
-// ============================================
-/**
- * POST /api/schedule/trigger/store/:storeId
- * Manually trigger orchestrated crawl for a specific store
- * Uses the intelligent orchestrator which:
- * - Checks provider detection status
- * - Runs detection if needed
- * - Queues appropriate crawl type (production/sandbox)
- */
-router.post('/trigger/store/:storeId', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const storeId = parseInt(req.params.storeId);
- if (isNaN(storeId)) {
- return res.status(400).json({ error: 'Invalid store ID' });
- }
- // Use the orchestrator instead of simple triggerManualCrawl
- const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId);
- res.json({
- result,
- message: result.summary,
- success: result.status === 'success' || result.status === 'sandbox_only',
- });
- }
- catch (error) {
- console.error('Error triggering orchestrated crawl:', error);
- res.status(500).json({ error: 'Failed to trigger crawl' });
- }
-});
-/**
- * POST /api/schedule/trigger/store/:storeId/legacy
- * Legacy: Simple job queue trigger (no orchestration)
- */
-router.post('/trigger/store/:storeId/legacy', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const storeId = parseInt(req.params.storeId);
- if (isNaN(storeId)) {
- return res.status(400).json({ error: 'Invalid store ID' });
- }
- const job = await (0, crawl_scheduler_1.triggerManualCrawl)(storeId);
- res.json({ job, message: 'Crawl job created' });
- }
- catch (error) {
- console.error('Error triggering manual crawl:', error);
- res.status(500).json({ error: 'Failed to trigger crawl' });
- }
-});
-/**
- * POST /api/schedule/trigger/all
- * Manually trigger crawls for all stores
- */
-router.post('/trigger/all', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const jobsCreated = await (0, crawl_scheduler_1.triggerAllStoresCrawl)();
- res.json({ jobs_created: jobsCreated, message: `Created ${jobsCreated} crawl jobs` });
- }
- catch (error) {
- console.error('Error triggering all crawls:', error);
- res.status(500).json({ error: 'Failed to trigger crawls' });
- }
-});
-/**
- * POST /api/schedule/restart
- * Restart the scheduler
- */
-router.post('/restart', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- await (0, crawl_scheduler_1.restartCrawlScheduler)();
- res.json({ message: 'Scheduler restarted', mode: (0, crawl_scheduler_1.getSchedulerMode)() });
- }
- catch (error) {
- console.error('Error restarting scheduler:', error);
- res.status(500).json({ error: 'Failed to restart scheduler' });
- }
-});
-// ============================================
-// Scheduler Mode Endpoints
-// ============================================
-/**
- * GET /api/schedule/mode
- * Get current scheduler mode
- */
-router.get('/mode', async (req, res) => {
- try {
- const mode = (0, crawl_scheduler_1.getSchedulerMode)();
- res.json({ mode });
- }
- catch (error) {
- console.error('Error getting scheduler mode:', error);
- res.status(500).json({ error: 'Failed to get scheduler mode' });
- }
-});
-/**
- * PUT /api/schedule/mode
- * Set scheduler mode (legacy or orchestrator)
- */
-router.put('/mode', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { mode } = req.body;
- if (mode !== 'legacy' && mode !== 'orchestrator') {
- return res.status(400).json({ error: 'Invalid mode. Must be "legacy" or "orchestrator"' });
- }
- (0, crawl_scheduler_1.setSchedulerMode)(mode);
- // Restart scheduler with new mode
- await (0, crawl_scheduler_1.restartCrawlScheduler)();
- res.json({ mode, message: `Scheduler mode set to ${mode} and restarted` });
- }
- catch (error) {
- console.error('Error setting scheduler mode:', error);
- res.status(500).json({ error: 'Failed to set scheduler mode' });
- }
-});
-/**
- * GET /api/schedule/due
- * Get stores that are due for orchestration
- */
-router.get('/due', async (req, res) => {
- try {
- const limit = parseInt(req.query.limit) || 10;
- const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(Math.min(limit, 50));
- res.json({ stores_due: storeIds, count: storeIds.length });
- }
- catch (error) {
- console.error('Error getting stores due for orchestration:', error);
- res.status(500).json({ error: 'Failed to get stores due' });
- }
-});
-// ============================================
-// Dispensary Schedule Endpoints (NEW - dispensary-centric)
-// ============================================
-/**
- * GET /api/schedule/dispensaries
- * Get all dispensary schedule statuses with optional filters
- * Query params:
- * - state: filter by state (e.g., 'AZ')
- * - search: search by name or slug
- */
-router.get('/dispensaries', async (req, res) => {
- try {
- const { state, search } = req.query;
- // Build dynamic query with optional filters
- const conditions = [];
- const params = [];
- let paramIndex = 1;
- if (state) {
- conditions.push(`d.state = $${paramIndex}`);
- params.push(state);
- paramIndex++;
- }
- if (search) {
- conditions.push(`(d.name ILIKE $${paramIndex} OR d.slug ILIKE $${paramIndex} OR d.dba_name ILIKE $${paramIndex})`);
- params.push(`%${search}%`);
- paramIndex++;
- }
- const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
- const query = `
- SELECT
- d.id AS dispensary_id,
- COALESCE(d.dba_name, d.name) AS dispensary_name,
- d.slug AS dispensary_slug,
- d.city,
- d.state,
- d.menu_url,
- d.menu_type,
- d.platform_dispensary_id,
- d.scrape_enabled,
- d.last_crawl_at,
- d.crawl_status,
- d.product_crawler_mode,
- d.product_provider,
- cs.interval_minutes,
- cs.is_active,
- cs.priority,
- cs.last_run_at,
- cs.next_run_at,
- cs.last_status AS schedule_last_status,
- cs.last_error AS schedule_last_error,
- cs.consecutive_failures,
- j.id AS latest_job_id,
- j.status AS latest_job_status,
- j.job_type AS latest_job_type,
- j.started_at AS latest_job_started,
- j.completed_at AS latest_job_completed,
- j.products_found AS latest_products_found,
- j.products_new AS latest_products_created,
- j.products_updated AS latest_products_updated,
- j.error_message AS latest_job_error,
- CASE
- WHEN d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL THEN true
- ELSE false
- END AS can_crawl,
- CASE
- WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'menu_type not detected'
- WHEN d.menu_type != 'dutchie' THEN 'not dutchie platform'
- WHEN d.platform_dispensary_id IS NULL THEN 'platform ID not resolved'
- WHEN d.scrape_enabled = false THEN 'scraping disabled'
- ELSE 'ready'
- END AS schedule_status_reason
- FROM public.dispensaries d
- LEFT JOIN public.dispensary_crawl_schedule cs ON cs.dispensary_id = d.id
- LEFT JOIN LATERAL (
- SELECT *
- FROM public.dispensary_crawl_jobs dj
- WHERE dj.dispensary_id = d.id
- ORDER BY dj.created_at DESC
- LIMIT 1
- ) j ON true
- ${whereClause}
- ORDER BY cs.priority DESC NULLS LAST, COALESCE(d.dba_name, d.name)
- `;
- const result = await migrate_1.pool.query(query, params);
- res.json({ dispensaries: result.rows });
- }
- catch (error) {
- console.error('Error fetching dispensary schedules:', error);
- res.status(500).json({ error: 'Failed to fetch dispensary schedules' });
- }
-});
-/**
- * GET /api/schedule/dispensaries/:id
- * Get schedule for a specific dispensary
- */
-router.get('/dispensaries/:id', async (req, res) => {
- try {
- const dispensaryId = parseInt(req.params.id);
- if (isNaN(dispensaryId)) {
- return res.status(400).json({ error: 'Invalid dispensary ID' });
- }
- const result = await migrate_1.pool.query(`
- SELECT * FROM dispensary_crawl_status
- WHERE dispensary_id = $1
- `, [dispensaryId]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- res.json({ schedule: result.rows[0] });
- }
- catch (error) {
- console.error('Error fetching dispensary schedule:', error);
- res.status(500).json({ error: 'Failed to fetch dispensary schedule' });
- }
-});
-/**
- * PUT /api/schedule/dispensaries/:id
- * Update schedule for a specific dispensary
- */
-router.put('/dispensaries/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const dispensaryId = parseInt(req.params.id);
- if (isNaN(dispensaryId)) {
- return res.status(400).json({ error: 'Invalid dispensary ID' });
- }
- const { is_active, interval_minutes, priority } = req.body;
- // Upsert schedule
- const result = await migrate_1.pool.query(`
- INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
- VALUES ($1, COALESCE($2, TRUE), COALESCE($3, 240), COALESCE($4, 0))
- ON CONFLICT (dispensary_id) DO UPDATE SET
- is_active = COALESCE($2, dispensary_crawl_schedule.is_active),
- interval_minutes = COALESCE($3, dispensary_crawl_schedule.interval_minutes),
- priority = COALESCE($4, dispensary_crawl_schedule.priority),
- updated_at = NOW()
- RETURNING *
- `, [dispensaryId, is_active, interval_minutes, priority]);
- res.json({ schedule: result.rows[0] });
- }
- catch (error) {
- console.error('Error updating dispensary schedule:', error);
- res.status(500).json({ error: 'Failed to update dispensary schedule' });
- }
-});
-/**
- * GET /api/schedule/dispensary-jobs
- * Get recent dispensary crawl jobs
- */
-router.get('/dispensary-jobs', async (req, res) => {
- try {
- const limit = parseInt(req.query.limit) || 50;
- const result = await migrate_1.pool.query(`
- SELECT dcj.*, d.name as dispensary_name
- FROM dispensary_crawl_jobs dcj
- JOIN dispensaries d ON d.id = dcj.dispensary_id
- ORDER BY dcj.created_at DESC
- LIMIT $1
- `, [Math.min(limit, 200)]);
- res.json({ jobs: result.rows });
- }
- catch (error) {
- console.error('Error fetching dispensary jobs:', error);
- res.status(500).json({ error: 'Failed to fetch dispensary jobs' });
- }
-});
-/**
- * GET /api/schedule/dispensary-jobs/:dispensaryId
- * Get recent jobs for a specific dispensary
- */
-router.get('/dispensary-jobs/:dispensaryId', async (req, res) => {
- try {
- const dispensaryId = parseInt(req.params.dispensaryId);
- if (isNaN(dispensaryId)) {
- return res.status(400).json({ error: 'Invalid dispensary ID' });
- }
- const limit = parseInt(req.query.limit) || 10;
- const result = await migrate_1.pool.query(`
- SELECT dcj.*, d.name as dispensary_name
- FROM dispensary_crawl_jobs dcj
- JOIN dispensaries d ON d.id = dcj.dispensary_id
- WHERE dcj.dispensary_id = $1
- ORDER BY dcj.created_at DESC
- LIMIT $2
- `, [dispensaryId, Math.min(limit, 100)]);
- res.json({ jobs: result.rows });
- }
- catch (error) {
- console.error('Error fetching dispensary jobs:', error);
- res.status(500).json({ error: 'Failed to fetch dispensary jobs' });
- }
-});
-/**
- * POST /api/schedule/trigger/dispensary/:id
- * Trigger orchestrator for a specific dispensary (Run Now button)
- */
-router.post('/trigger/dispensary/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const dispensaryId = parseInt(req.params.id);
- if (isNaN(dispensaryId)) {
- return res.status(400).json({ error: 'Invalid dispensary ID' });
- }
- // Run the dispensary orchestrator
- const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(dispensaryId);
- res.json({
- result,
- message: result.summary,
- success: result.status === 'success' || result.status === 'sandbox_only' || result.status === 'detection_only',
- });
- }
- catch (error) {
- console.error('Error triggering dispensary orchestrator:', error);
- res.status(500).json({ error: 'Failed to trigger orchestrator' });
- }
-});
-/**
- * POST /api/schedule/trigger/dispensaries/batch
- * Trigger orchestrator for multiple dispensaries
- */
-router.post('/trigger/dispensaries/batch', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { dispensary_ids, concurrency } = req.body;
- if (!Array.isArray(dispensary_ids) || dispensary_ids.length === 0) {
- return res.status(400).json({ error: 'dispensary_ids must be a non-empty array' });
- }
- const results = await (0, dispensary_orchestrator_1.runBatchDispensaryOrchestrator)(dispensary_ids, concurrency || 3);
- const summary = {
- total: results.length,
- success: results.filter(r => r.status === 'success').length,
- sandbox_only: results.filter(r => r.status === 'sandbox_only').length,
- detection_only: results.filter(r => r.status === 'detection_only').length,
- error: results.filter(r => r.status === 'error').length,
- };
- res.json({ results, summary });
- }
- catch (error) {
- console.error('Error triggering batch orchestrator:', error);
- res.status(500).json({ error: 'Failed to trigger batch orchestrator' });
- }
-});
-/**
- * GET /api/schedule/dispensary-due
- * Get dispensaries that are due for orchestration
- */
-router.get('/dispensary-due', async (req, res) => {
- try {
- const limit = parseInt(req.query.limit) || 10;
- const dispensaryIds = await (0, dispensary_orchestrator_1.getDispensariesDueForOrchestration)(Math.min(limit, 50));
- // Get details for the due dispensaries
- if (dispensaryIds.length > 0) {
- const details = await migrate_1.pool.query(`
- SELECT d.id, d.name, d.product_provider, d.product_crawler_mode,
- dcs.next_run_at, dcs.last_status, dcs.priority
- FROM dispensaries d
- LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
- WHERE d.id = ANY($1)
- ORDER BY COALESCE(dcs.priority, 0) DESC, dcs.last_run_at ASC NULLS FIRST
- `, [dispensaryIds]);
- res.json({ dispensaries_due: details.rows, count: dispensaryIds.length });
- }
- else {
- res.json({ dispensaries_due: [], count: 0 });
- }
- }
- catch (error) {
- console.error('Error getting dispensaries due for orchestration:', error);
- res.status(500).json({ error: 'Failed to get dispensaries due' });
- }
-});
-/**
- * POST /api/schedule/dispensaries/bootstrap
- * Ensure all dispensaries have schedule entries
- */
-router.post('/dispensaries/bootstrap', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { interval_minutes } = req.body;
- const result = await (0, dispensary_orchestrator_1.ensureAllDispensariesHaveSchedules)(interval_minutes || 240);
- res.json({
- message: `Created ${result.created} new schedules, ${result.existing} already existed`,
- created: result.created,
- existing: result.existing,
- });
- }
- catch (error) {
- console.error('Error bootstrapping dispensary schedules:', error);
- res.status(500).json({ error: 'Failed to bootstrap schedules' });
- }
-});
-// ============================================
-// Platform ID & Menu Type Detection Endpoints
-// ============================================
-/**
- * POST /api/schedule/dispensaries/:id/resolve-platform-id
- * Resolve the Dutchie platform_dispensary_id from menu_url slug
- */
-router.post('/dispensaries/:id/resolve-platform-id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const dispensaryId = parseInt(req.params.id);
- if (isNaN(dispensaryId)) {
- return res.status(400).json({ error: 'Invalid dispensary ID' });
- }
- // Get dispensary info
- const dispensaryResult = await migrate_1.pool.query(`
- SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id
- FROM dispensaries WHERE id = $1
- `, [dispensaryId]);
- if (dispensaryResult.rows.length === 0) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- const dispensary = dispensaryResult.rows[0];
- // Check if already resolved
- if (dispensary.platform_dispensary_id) {
- return res.json({
- success: true,
- message: 'Platform ID already resolved',
- platform_dispensary_id: dispensary.platform_dispensary_id,
- already_resolved: true
- });
- }
- // Extract slug from menu_url for Dutchie URLs
- let slugToResolve = dispensary.slug;
- if (dispensary.menu_url) {
- // Match embedded-menu or dispensary URLs
- const match = dispensary.menu_url.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i);
- if (match) {
- slugToResolve = match[1];
- }
- }
- if (!slugToResolve) {
- return res.status(400).json({
- error: 'No slug available to resolve platform ID',
- menu_url: dispensary.menu_url
- });
- }
- console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`);
- // Resolve platform ID using GraphQL client
- const platformId = await (0, graphql_client_1.resolveDispensaryId)(slugToResolve);
- if (!platformId) {
- return res.status(404).json({
- error: 'Could not resolve platform ID',
- slug_tried: slugToResolve,
- message: 'The dispensary might not be on Dutchie or the slug is incorrect'
- });
- }
- // Update the dispensary with resolved platform ID
- await migrate_1.pool.query(`
- UPDATE dispensaries
- SET platform_dispensary_id = $1,
- menu_type = COALESCE(menu_type, 'dutchie'),
- updated_at = NOW()
- WHERE id = $2
- `, [platformId, dispensaryId]);
- res.json({
- success: true,
- platform_dispensary_id: platformId,
- slug_resolved: slugToResolve,
- message: `Platform ID resolved: ${platformId}`
- });
- }
- catch (error) {
- console.error('Error resolving platform ID:', error);
- res.status(500).json({ error: 'Failed to resolve platform ID', details: error.message });
- }
-});
-/**
- * POST /api/schedule/dispensaries/:id/detect-menu-type
- * Detect menu type from menu_url
- */
-router.post('/dispensaries/:id/detect-menu-type', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const dispensaryId = parseInt(req.params.id);
- if (isNaN(dispensaryId)) {
- return res.status(400).json({ error: 'Invalid dispensary ID' });
- }
- // Get dispensary info
- const dispensaryResult = await migrate_1.pool.query(`
- SELECT id, name, menu_url, website FROM dispensaries WHERE id = $1
- `, [dispensaryId]);
- if (dispensaryResult.rows.length === 0) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- const dispensary = dispensaryResult.rows[0];
- const urlToCheck = dispensary.menu_url || dispensary.website;
- if (!urlToCheck) {
- return res.status(400).json({ error: 'No menu_url or website to detect from' });
- }
- // Detect menu type from URL patterns
- let detectedType = 'unknown';
- if (urlToCheck.includes('dutchie.com') || urlToCheck.includes('embedded-menu')) {
- detectedType = 'dutchie';
- }
- else if (urlToCheck.includes('iheartjane.com') || urlToCheck.includes('jane.co')) {
- detectedType = 'jane';
- }
- else if (urlToCheck.includes('weedmaps.com')) {
- detectedType = 'weedmaps';
- }
- else if (urlToCheck.includes('leafly.com')) {
- detectedType = 'leafly';
- }
- else if (urlToCheck.includes('treez.io') || urlToCheck.includes('treez.co')) {
- detectedType = 'treez';
- }
- else if (urlToCheck.includes('meadow.com')) {
- detectedType = 'meadow';
- }
- else if (urlToCheck.includes('blaze.me') || urlToCheck.includes('blazepay')) {
- detectedType = 'blaze';
- }
- else if (urlToCheck.includes('flowhub.com')) {
- detectedType = 'flowhub';
- }
- else if (urlToCheck.includes('dispense.app')) {
- detectedType = 'dispense';
- }
- else if (urlToCheck.includes('covasoft.com')) {
- detectedType = 'cova';
- }
- // Update menu_type
- await migrate_1.pool.query(`
- UPDATE dispensaries
- SET menu_type = $1, updated_at = NOW()
- WHERE id = $2
- `, [detectedType, dispensaryId]);
- res.json({
- success: true,
- menu_type: detectedType,
- url_checked: urlToCheck,
- message: `Menu type detected: ${detectedType}`
- });
- }
- catch (error) {
- console.error('Error detecting menu type:', error);
- res.status(500).json({ error: 'Failed to detect menu type' });
- }
-});
-/**
- * POST /api/schedule/dispensaries/:id/refresh-detection
- * Combined: detect menu_type AND resolve platform_dispensary_id if dutchie
- */
-router.post('/dispensaries/:id/refresh-detection', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const dispensaryId = parseInt(req.params.id);
- if (isNaN(dispensaryId)) {
- return res.status(400).json({ error: 'Invalid dispensary ID' });
- }
- // Get dispensary info
- const dispensaryResult = await migrate_1.pool.query(`
- SELECT id, name, slug, menu_url, website FROM dispensaries WHERE id = $1
- `, [dispensaryId]);
- if (dispensaryResult.rows.length === 0) {
- return res.status(404).json({ error: 'Dispensary not found' });
- }
- const dispensary = dispensaryResult.rows[0];
- const urlToCheck = dispensary.menu_url || dispensary.website;
- if (!urlToCheck) {
- return res.status(400).json({ error: 'No menu_url or website to detect from' });
- }
- // Detect menu type from URL patterns
- let detectedType = 'unknown';
- if (urlToCheck.includes('dutchie.com') || urlToCheck.includes('embedded-menu')) {
- detectedType = 'dutchie';
- }
- else if (urlToCheck.includes('iheartjane.com') || urlToCheck.includes('jane.co')) {
- detectedType = 'jane';
- }
- else if (urlToCheck.includes('weedmaps.com')) {
- detectedType = 'weedmaps';
- }
- else if (urlToCheck.includes('leafly.com')) {
- detectedType = 'leafly';
- }
- else if (urlToCheck.includes('treez.io') || urlToCheck.includes('treez.co')) {
- detectedType = 'treez';
- }
- else if (urlToCheck.includes('meadow.com')) {
- detectedType = 'meadow';
- }
- else if (urlToCheck.includes('blaze.me') || urlToCheck.includes('blazepay')) {
- detectedType = 'blaze';
- }
- else if (urlToCheck.includes('flowhub.com')) {
- detectedType = 'flowhub';
- }
- else if (urlToCheck.includes('dispense.app')) {
- detectedType = 'dispense';
- }
- else if (urlToCheck.includes('covasoft.com')) {
- detectedType = 'cova';
- }
- // Update menu_type first
- await migrate_1.pool.query(`
- UPDATE dispensaries SET menu_type = $1, updated_at = NOW() WHERE id = $2
- `, [detectedType, dispensaryId]);
- let platformId = null;
- // If dutchie, also try to resolve platform ID
- if (detectedType === 'dutchie') {
- let slugToResolve = dispensary.slug;
- const match = urlToCheck.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i);
- if (match) {
- slugToResolve = match[1];
- }
- if (slugToResolve) {
- try {
- console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`);
- platformId = await (0, graphql_client_1.resolveDispensaryId)(slugToResolve);
- if (platformId) {
- await migrate_1.pool.query(`
- UPDATE dispensaries SET platform_dispensary_id = $1, updated_at = NOW() WHERE id = $2
- `, [platformId, dispensaryId]);
- }
- }
- catch (err) {
- console.warn(`[Schedule] Failed to resolve platform ID: ${err.message}`);
- }
- }
- }
- res.json({
- success: true,
- menu_type: detectedType,
- platform_dispensary_id: platformId,
- url_checked: urlToCheck,
- can_crawl: detectedType === 'dutchie' && !!platformId
- });
- }
- catch (error) {
- console.error('Error refreshing detection:', error);
- res.status(500).json({ error: 'Failed to refresh detection' });
- }
-});
-/**
- * PUT /api/schedule/dispensaries/:id/toggle-active
- * Enable or disable schedule for a dispensary
- */
-router.put('/dispensaries/:id/toggle-active', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const dispensaryId = parseInt(req.params.id);
- if (isNaN(dispensaryId)) {
- return res.status(400).json({ error: 'Invalid dispensary ID' });
- }
- const { is_active } = req.body;
- // Upsert schedule with new is_active value
- const result = await migrate_1.pool.query(`
- INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
- VALUES ($1, $2, 240, 0)
- ON CONFLICT (dispensary_id) DO UPDATE SET
- is_active = $2,
- updated_at = NOW()
- RETURNING *
- `, [dispensaryId, is_active]);
- res.json({
- success: true,
- schedule: result.rows[0],
- message: is_active ? 'Schedule enabled' : 'Schedule disabled'
- });
- }
- catch (error) {
- console.error('Error toggling schedule active status:', error);
- res.status(500).json({ error: 'Failed to toggle schedule' });
- }
-});
-/**
- * DELETE /api/schedule/dispensaries/:id/schedule
- * Delete schedule for a dispensary
- */
-router.delete('/dispensaries/:id/schedule', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const dispensaryId = parseInt(req.params.id);
- if (isNaN(dispensaryId)) {
- return res.status(400).json({ error: 'Invalid dispensary ID' });
- }
- const result = await migrate_1.pool.query(`
- DELETE FROM dispensary_crawl_schedule WHERE dispensary_id = $1 RETURNING id
- `, [dispensaryId]);
- const deleted = (result.rowCount ?? 0) > 0;
- res.json({
- success: true,
- deleted,
- message: deleted ? 'Schedule deleted' : 'No schedule to delete'
- });
- }
- catch (error) {
- console.error('Error deleting schedule:', error);
- res.status(500).json({ error: 'Failed to delete schedule' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/scraper-monitor.js b/backend/dist/routes/scraper-monitor.js
deleted file mode 100644
index 62bd924b..00000000
--- a/backend/dist/routes/scraper-monitor.js
+++ /dev/null
@@ -1,349 +0,0 @@
-"use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.activeScrapers = void 0;
-exports.registerScraper = registerScraper;
-exports.updateScraperStats = updateScraperStats;
-exports.completeScraper = completeScraper;
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-exports.activeScrapers = new Map();
-// Get all active scrapers
-router.get('/active', async (req, res) => {
- try {
- const scrapers = Array.from(exports.activeScrapers.values()).map(scraper => ({
- ...scraper,
- duration: Date.now() - scraper.startTime.getTime(),
- isStale: Date.now() - scraper.lastUpdate.getTime() > 60000 // 1 minute
- }));
- res.json({ scrapers });
- }
- catch (error) {
- console.error('Error fetching active scrapers:', error);
- res.status(500).json({ error: 'Failed to fetch active scrapers' });
- }
-});
-// Get scraper by ID
-router.get('/active/:id', async (req, res) => {
- try {
- const { id } = req.params;
- const scraper = exports.activeScrapers.get(id);
- if (!scraper) {
- return res.status(404).json({ error: 'Scraper not found' });
- }
- res.json({
- scraper: {
- ...scraper,
- duration: Date.now() - scraper.startTime.getTime(),
- isStale: Date.now() - scraper.lastUpdate.getTime() > 60000
- }
- });
- }
- catch (error) {
- console.error('Error fetching scraper:', error);
- res.status(500).json({ error: 'Failed to fetch scraper' });
- }
-});
-// Get scraper history (last 50 completed scrapes)
-router.get('/history', async (req, res) => {
- try {
- const { limit = 50, dispensary_id } = req.query;
- let query = `
- SELECT
- d.id as dispensary_id,
- COALESCE(d.dba_name, d.name) as dispensary_name,
- d.city,
- d.state,
- dcj.id as job_id,
- dcj.job_type,
- dcj.status,
- dcj.products_found,
- dcj.products_new,
- dcj.products_updated,
- dcj.in_stock_count,
- dcj.out_of_stock_count,
- dcj.duration_ms,
- dcj.completed_at as last_scraped_at,
- dcj.error_message,
- (
- SELECT COUNT(*)
- FROM products p
- WHERE p.dispensary_id = d.id
- AND p.last_seen_at >= NOW() - INTERVAL '7 days'
- ) as product_count
- FROM dispensary_crawl_jobs dcj
- JOIN dispensaries d ON d.id = dcj.dispensary_id
- WHERE dcj.completed_at IS NOT NULL
- `;
- const params = [];
- let paramCount = 1;
- if (dispensary_id) {
- query += ` AND d.id = $${paramCount}`;
- params.push(dispensary_id);
- paramCount++;
- }
- query += ` ORDER BY dcj.completed_at DESC LIMIT $${paramCount}`;
- params.push(limit);
- const result = await migrate_1.pool.query(query, params);
- res.json({ history: result.rows });
- }
- catch (error) {
- console.error('Error fetching scraper history:', error);
- res.status(500).json({ error: 'Failed to fetch scraper history' });
- }
-});
-// Helper function to register a scraper
-function registerScraper(id, storeId, storeName, categoryId, categoryName) {
- exports.activeScrapers.set(id, {
- id,
- storeId,
- storeName,
- categoryId,
- categoryName,
- startTime: new Date(),
- lastUpdate: new Date(),
- status: 'running',
- stats: {
- requestsTotal: 0,
- requestsSuccess: 0,
- itemsSaved: 0,
- itemsDropped: 0,
- errorsCount: 0
- }
- });
-}
-// Helper function to update scraper stats
-function updateScraperStats(id, stats, currentActivity) {
- const scraper = exports.activeScrapers.get(id);
- if (scraper) {
- scraper.stats = { ...scraper.stats, ...stats };
- scraper.lastUpdate = new Date();
- if (currentActivity) {
- scraper.currentActivity = currentActivity;
- }
- }
-}
-// Helper function to mark scraper as completed
-function completeScraper(id, error) {
- const scraper = exports.activeScrapers.get(id);
- if (scraper) {
- scraper.status = error ? 'error' : 'completed';
- scraper.lastUpdate = new Date();
- // Remove after 5 minutes
- setTimeout(() => {
- exports.activeScrapers.delete(id);
- }, 5 * 60 * 1000);
- }
-}
-// Dispensary crawl jobs endpoints
-router.get('/jobs/stats', async (req, res) => {
- try {
- const { dispensary_id } = req.query;
- let whereClause = '';
- const params = [];
- if (dispensary_id) {
- whereClause = 'WHERE dispensary_id = $1';
- params.push(dispensary_id);
- }
- const result = await migrate_1.pool.query(`
- SELECT
- status,
- COUNT(*) as count,
- SUM(products_found) as total_products_found,
- SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved
- FROM dispensary_crawl_jobs
- ${whereClause}
- GROUP BY status
- `, params);
- const stats = {
- pending: 0,
- in_progress: 0,
- completed: 0,
- failed: 0,
- total_products_found: 0,
- total_products_saved: 0
- };
- result.rows.forEach((row) => {
- stats[row.status] = parseInt(row.count);
- if (row.status === 'completed') {
- stats.total_products_found += parseInt(row.total_products_found || '0');
- stats.total_products_saved += parseInt(row.total_products_saved || '0');
- }
- });
- res.json(stats);
- }
- catch (error) {
- console.error('Error fetching job stats:', error);
- res.status(500).json({ error: 'Failed to fetch job stats' });
- }
-});
-router.get('/jobs/active', async (req, res) => {
- try {
- const { dispensary_id } = req.query;
- let whereClause = "WHERE dcj.status = 'in_progress'";
- const params = [];
- let paramCount = 1;
- if (dispensary_id) {
- whereClause += ` AND dcj.dispensary_id = $${paramCount}`;
- params.push(dispensary_id);
- paramCount++;
- }
- const result = await migrate_1.pool.query(`
- SELECT
- dcj.id,
- dcj.dispensary_id,
- COALESCE(d.dba_name, d.name) as dispensary_name,
- dcj.job_type,
- dcj.status,
- dcj.worker_id,
- dcj.started_at,
- dcj.products_found,
- COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
- EXTRACT(EPOCH FROM (NOW() - dcj.started_at)) as duration_seconds
- FROM dispensary_crawl_jobs dcj
- JOIN dispensaries d ON d.id = dcj.dispensary_id
- ${whereClause}
- ORDER BY dcj.started_at DESC
- `, params);
- res.json({ jobs: result.rows });
- }
- catch (error) {
- console.error('Error fetching active jobs:', error);
- res.status(500).json({ error: 'Failed to fetch active jobs' });
- }
-});
-router.get('/jobs/recent', async (req, res) => {
- try {
- const { limit = 50, dispensary_id, status } = req.query;
- let whereClause = '';
- const params = [];
- let paramCount = 1;
- const conditions = [];
- if (dispensary_id) {
- conditions.push(`dcj.dispensary_id = $${paramCount}`);
- params.push(dispensary_id);
- paramCount++;
- }
- if (status) {
- conditions.push(`dcj.status = $${paramCount}`);
- params.push(status);
- paramCount++;
- }
- if (conditions.length > 0) {
- whereClause = 'WHERE ' + conditions.join(' AND ');
- }
- params.push(limit);
- const result = await migrate_1.pool.query(`
- SELECT
- dcj.id,
- dcj.dispensary_id,
- COALESCE(d.dba_name, d.name) as dispensary_name,
- dcj.job_type,
- dcj.status,
- dcj.worker_id,
- dcj.started_at,
- dcj.completed_at,
- dcj.products_found,
- COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
- dcj.error_message,
- EXTRACT(EPOCH FROM (COALESCE(dcj.completed_at, NOW()) - dcj.started_at)) as duration_seconds
- FROM dispensary_crawl_jobs dcj
- JOIN dispensaries d ON d.id = dcj.dispensary_id
- ${whereClause}
- ORDER BY dcj.created_at DESC
- LIMIT $${paramCount}
- `, params);
- res.json({ jobs: result.rows });
- }
- catch (error) {
- console.error('Error fetching recent jobs:', error);
- res.status(500).json({ error: 'Failed to fetch recent jobs' });
- }
-});
-router.get('/jobs/workers', async (req, res) => {
- try {
- const { dispensary_id } = req.query;
- let whereClause = "WHERE status = 'in_progress' AND worker_id IS NOT NULL";
- const params = [];
- if (dispensary_id) {
- whereClause += ` AND dispensary_id = $1`;
- params.push(dispensary_id);
- }
- const result = await migrate_1.pool.query(`
- SELECT
- worker_id,
- COUNT(*) as active_jobs,
- SUM(products_found) as total_products_found,
- SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved,
- MIN(started_at) as earliest_start,
- MAX(started_at) as latest_start
- FROM dispensary_crawl_jobs
- ${whereClause}
- GROUP BY worker_id
- ORDER BY worker_id
- `, params);
- res.json({ workers: result.rows });
- }
- catch (error) {
- console.error('Error fetching worker stats:', error);
- res.status(500).json({ error: 'Failed to fetch worker stats' });
- }
-});
-router.get('/jobs/worker-logs/:workerId', async (req, res) => {
- try {
- const { workerId } = req.params;
- const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
- const path = await Promise.resolve().then(() => __importStar(require('path')));
- const logPath = path.join('/tmp', `worker-${workerId}.log`);
- try {
- const logs = await fs.readFile(logPath, 'utf-8');
- const lines = logs.split('\n');
- // Return last 100 lines
- const recentLogs = lines.slice(-100).join('\n');
- res.json({ logs: recentLogs });
- }
- catch (fileError) {
- res.json({ logs: 'No logs available for this worker yet.' });
- }
- }
- catch (error) {
- console.error('Failed to get worker logs:', error);
- res.status(500).json({ error: 'Failed to get worker logs' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/settings.js b/backend/dist/routes/settings.js
deleted file mode 100644
index efcf4b64..00000000
--- a/backend/dist/routes/settings.js
+++ /dev/null
@@ -1,118 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const scheduler_1 = require("../services/scheduler");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Get all settings
-router.get('/', async (req, res) => {
- try {
- const result = await migrate_1.pool.query(`
- SELECT key, value, description, updated_at
- FROM settings
- ORDER BY key
- `);
- res.json({ settings: result.rows });
- }
- catch (error) {
- console.error('Error fetching settings:', error);
- res.status(500).json({ error: 'Failed to fetch settings' });
- }
-});
-// Get single setting
-router.get('/:key', async (req, res) => {
- try {
- const { key } = req.params;
- const result = await migrate_1.pool.query(`
- SELECT key, value, description, updated_at
- FROM settings
- WHERE key = $1
- `, [key]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Setting not found' });
- }
- res.json({ setting: result.rows[0] });
- }
- catch (error) {
- console.error('Error fetching setting:', error);
- res.status(500).json({ error: 'Failed to fetch setting' });
- }
-});
-// Update setting
-router.put('/:key', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { key } = req.params;
- const { value } = req.body;
- if (value === undefined) {
- return res.status(400).json({ error: 'Value required' });
- }
- const result = await migrate_1.pool.query(`
- UPDATE settings
- SET value = $1, updated_at = CURRENT_TIMESTAMP
- WHERE key = $2
- RETURNING *
- `, [value, key]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Setting not found' });
- }
- // Restart scheduler if scrape settings changed
- if (key === 'scrape_interval_hours' || key === 'scrape_specials_time') {
- console.log('Restarting scheduler due to setting change...');
- await (0, scheduler_1.restartScheduler)();
- }
- res.json({ setting: result.rows[0] });
- }
- catch (error) {
- console.error('Error updating setting:', error);
- res.status(500).json({ error: 'Failed to update setting' });
- }
-});
-// Update multiple settings at once
-router.put('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { settings } = req.body;
- if (!settings || !Array.isArray(settings)) {
- return res.status(400).json({ error: 'Settings array required' });
- }
- const client = await migrate_1.pool.connect();
- try {
- await client.query('BEGIN');
- const updated = [];
- let needsSchedulerRestart = false;
- for (const setting of settings) {
- const result = await client.query(`
- UPDATE settings
- SET value = $1, updated_at = CURRENT_TIMESTAMP
- WHERE key = $2
- RETURNING *
- `, [setting.value, setting.key]);
- if (result.rows.length > 0) {
- updated.push(result.rows[0]);
- if (setting.key === 'scrape_interval_hours' || setting.key === 'scrape_specials_time') {
- needsSchedulerRestart = true;
- }
- }
- }
- await client.query('COMMIT');
- if (needsSchedulerRestart) {
- console.log('Restarting scheduler due to setting changes...');
- await (0, scheduler_1.restartScheduler)();
- }
- res.json({ settings: updated });
- }
- catch (error) {
- await client.query('ROLLBACK');
- throw error;
- }
- finally {
- client.release();
- }
- }
- catch (error) {
- console.error('Error updating settings:', error);
- res.status(500).json({ error: 'Failed to update settings' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/stores.js b/backend/dist/routes/stores.js
deleted file mode 100644
index 406ca032..00000000
--- a/backend/dist/routes/stores.js
+++ /dev/null
@@ -1,412 +0,0 @@
-"use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const middleware_1 = require("../auth/middleware");
-const migrate_1 = require("../db/migrate");
-const scraper_v2_1 = require("../scraper-v2");
-const router = (0, express_1.Router)();
-router.use(middleware_1.authMiddleware);
-// Get all stores
-router.get('/', async (req, res) => {
- try {
- const result = await migrate_1.pool.query(`
- SELECT
- s.*,
- COUNT(DISTINCT p.id) as product_count,
- COUNT(DISTINCT c.id) as category_count
- FROM stores s
- LEFT JOIN products p ON s.id = p.store_id
- LEFT JOIN categories c ON s.id = c.store_id
- GROUP BY s.id
- ORDER BY s.name
- `);
- res.json({ stores: result.rows });
- }
- catch (error) {
- console.error('Error fetching stores:', error);
- res.status(500).json({ error: 'Failed to fetch stores' });
- }
-});
-// Freshness threshold in hours
-const STALE_THRESHOLD_HOURS = 4;
-function calculateFreshness(lastScrapedAt) {
- if (!lastScrapedAt) {
- return {
- last_scraped_at: null,
- is_stale: true,
- freshness: 'Never scraped',
- hours_since_scrape: null
- };
- }
- const now = new Date();
- const diffMs = now.getTime() - lastScrapedAt.getTime();
- const diffHours = diffMs / (1000 * 60 * 60);
- const isStale = diffHours > STALE_THRESHOLD_HOURS;
- let freshnessText;
- if (diffHours < 1) {
- const mins = Math.round(diffHours * 60);
- freshnessText = `${mins} minute${mins !== 1 ? 's' : ''} ago`;
- }
- else if (diffHours < 24) {
- const hrs = Math.round(diffHours);
- freshnessText = `${hrs} hour${hrs !== 1 ? 's' : ''} ago`;
- }
- else {
- const days = Math.round(diffHours / 24);
- freshnessText = `${days} day${days !== 1 ? 's' : ''} ago`;
- }
- return {
- last_scraped_at: lastScrapedAt.toISOString(),
- is_stale: isStale,
- freshness: freshnessText,
- hours_since_scrape: Math.round(diffHours * 10) / 10
- };
-}
-function detectProvider(dutchieUrl) {
- if (!dutchieUrl)
- return 'unknown';
- if (dutchieUrl.includes('dutchie.com'))
- return 'Dutchie';
- if (dutchieUrl.includes('iheartjane.com') || dutchieUrl.includes('jane.co'))
- return 'Jane';
- if (dutchieUrl.includes('treez.io'))
- return 'Treez';
- if (dutchieUrl.includes('weedmaps.com'))
- return 'Weedmaps';
- if (dutchieUrl.includes('leafly.com'))
- return 'Leafly';
- return 'Custom';
-}
-// Get single store with full details
-router.get('/:id', async (req, res) => {
- try {
- const { id } = req.params;
- // Get store with counts and linked dispensary
- const result = await migrate_1.pool.query(`
- SELECT
- s.*,
- d.id as dispensary_id,
- d.name as dispensary_name,
- d.slug as dispensary_slug,
- d.state as dispensary_state,
- d.city as dispensary_city,
- d.address as dispensary_address,
- d.menu_provider as dispensary_menu_provider,
- COUNT(DISTINCT p.id) as product_count,
- COUNT(DISTINCT c.id) as category_count,
- COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = true) as in_stock_count,
- COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = false) as out_of_stock_count
- FROM stores s
- LEFT JOIN dispensaries d ON s.dispensary_id = d.id
- LEFT JOIN products p ON s.id = p.store_id
- LEFT JOIN categories c ON s.id = c.store_id
- WHERE s.id = $1
- GROUP BY s.id, d.id, d.name, d.slug, d.state, d.city, d.address, d.menu_provider
- `, [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Store not found' });
- }
- const store = result.rows[0];
- // Get recent crawl jobs for this store
- const jobsResult = await migrate_1.pool.query(`
- SELECT
- id, status, job_type, trigger_type,
- started_at, completed_at,
- products_found, products_new, products_updated,
- in_stock_count, out_of_stock_count,
- error_message
- FROM crawl_jobs
- WHERE store_id = $1
- ORDER BY created_at DESC
- LIMIT 10
- `, [id]);
- // Get schedule info if exists
- const scheduleResult = await migrate_1.pool.query(`
- SELECT
- enabled, interval_hours, next_run_at, last_run_at
- FROM store_crawl_schedule
- WHERE store_id = $1
- `, [id]);
- // Calculate freshness
- const freshness = calculateFreshness(store.last_scraped_at);
- // Detect provider from URL
- const provider = detectProvider(store.dutchie_url);
- // Build response
- const response = {
- ...store,
- provider,
- freshness: freshness.freshness,
- is_stale: freshness.is_stale,
- hours_since_scrape: freshness.hours_since_scrape,
- linked_dispensary: store.dispensary_id ? {
- id: store.dispensary_id,
- name: store.dispensary_name,
- slug: store.dispensary_slug,
- state: store.dispensary_state,
- city: store.dispensary_city,
- address: store.dispensary_address,
- menu_provider: store.dispensary_menu_provider
- } : null,
- schedule: scheduleResult.rows[0] || null,
- recent_jobs: jobsResult.rows
- };
- // Remove redundant dispensary fields from root
- delete response.dispensary_name;
- delete response.dispensary_slug;
- delete response.dispensary_state;
- delete response.dispensary_city;
- delete response.dispensary_address;
- delete response.dispensary_menu_provider;
- res.json(response);
- }
- catch (error) {
- console.error('Error fetching store:', error);
- res.status(500).json({ error: 'Failed to fetch store' });
- }
-});
-// Get store brands
-router.get('/:id/brands', async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query(`
- SELECT name
- FROM brands
- WHERE store_id = $1
- ORDER BY name
- `, [id]);
- const brands = result.rows.map((row) => row.name);
- res.json({ brands });
- }
- catch (error) {
- console.error('Error fetching store brands:', error);
- res.status(500).json({ error: 'Failed to fetch store brands' });
- }
-});
-// Get store specials
-router.get('/:id/specials', async (req, res) => {
- try {
- const { id } = req.params;
- const { date } = req.query;
- // Use provided date or today's date
- const queryDate = date || new Date().toISOString().split('T')[0];
- const result = await migrate_1.pool.query(`
- SELECT
- s.*,
- p.name as product_name,
- p.image_url as product_image
- FROM specials s
- LEFT JOIN products p ON s.product_id = p.id
- WHERE s.store_id = $1 AND s.valid_date = $2
- ORDER BY s.name
- `, [id, queryDate]);
- res.json({ specials: result.rows, date: queryDate });
- }
- catch (error) {
- console.error('Error fetching store specials:', error);
- res.status(500).json({ error: 'Failed to fetch store specials' });
- }
-});
-// Create store
-router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { name, slug, dutchie_url, active, scrape_enabled } = req.body;
- const result = await migrate_1.pool.query(`
- INSERT INTO stores (name, slug, dutchie_url, active, scrape_enabled)
- VALUES ($1, $2, $3, $4, $5)
- RETURNING *
- `, [name, slug, dutchie_url, active ?? true, scrape_enabled ?? true]);
- res.status(201).json(result.rows[0]);
- }
- catch (error) {
- console.error('Error creating store:', error);
- res.status(500).json({ error: 'Failed to create store' });
- }
-});
-// Update store
-router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const { name, slug, dutchie_url, active, scrape_enabled } = req.body;
- const result = await migrate_1.pool.query(`
- UPDATE stores
- SET name = COALESCE($1, name),
- slug = COALESCE($2, slug),
- dutchie_url = COALESCE($3, dutchie_url),
- active = COALESCE($4, active),
- scrape_enabled = COALESCE($5, scrape_enabled),
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $6
- RETURNING *
- `, [name, slug, dutchie_url, active, scrape_enabled, id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Store not found' });
- }
- res.json(result.rows[0]);
- }
- catch (error) {
- console.error('Error updating store:', error);
- res.status(500).json({ error: 'Failed to update store' });
- }
-});
-// Delete store
-router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => {
- try {
- const { id } = req.params;
- const result = await migrate_1.pool.query('DELETE FROM stores WHERE id = $1 RETURNING *', [id]);
- if (result.rows.length === 0) {
- return res.status(404).json({ error: 'Store not found' });
- }
- res.json({ message: 'Store deleted successfully' });
- }
- catch (error) {
- console.error('Error deleting store:', error);
- res.status(500).json({ error: 'Failed to delete store' });
- }
-});
-// Trigger scrape for a store
-router.post('/:id/scrape', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const { parallel = 3, userAgent } = req.body; // Default to 3 parallel scrapers
- const storeResult = await migrate_1.pool.query('SELECT id FROM stores WHERE id = $1', [id]);
- if (storeResult.rows.length === 0) {
- return res.status(404).json({ error: 'Store not found' });
- }
- (0, scraper_v2_1.scrapeStore)(parseInt(id), parseInt(parallel), userAgent).catch(err => {
- console.error('Background scrape error:', err);
- });
- res.json({
- message: 'Scrape started',
- parallel: parseInt(parallel),
- userAgent: userAgent || 'random'
- });
- }
- catch (error) {
- console.error('Error triggering scrape:', error);
- res.status(500).json({ error: 'Failed to trigger scrape' });
- }
-});
-// Download missing images for a store
-router.post('/:id/download-images', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const storeResult = await migrate_1.pool.query('SELECT id, name FROM stores WHERE id = $1', [id]);
- if (storeResult.rows.length === 0) {
- return res.status(404).json({ error: 'Store not found' });
- }
- const store = storeResult.rows[0];
- const productsResult = await migrate_1.pool.query(`
- SELECT id, name, image_url
- FROM products
- WHERE store_id = $1
- AND image_url IS NOT NULL
- AND local_image_path IS NULL
- `, [id]);
- (async () => {
- const { uploadImageFromUrl } = await Promise.resolve().then(() => __importStar(require('../utils/minio')));
- let downloaded = 0;
- for (const product of productsResult.rows) {
- try {
- console.log(`📸 Downloading image for: ${product.name}`);
- const localPath = await uploadImageFromUrl(product.image_url, product.id);
- await migrate_1.pool.query(`
- UPDATE products
- SET local_image_path = $1
- WHERE id = $2
- `, [localPath, product.id]);
- downloaded++;
- }
- catch (error) {
- console.error(`Failed to download image for ${product.name}:`, error);
- }
- }
- console.log(`✅ Downloaded ${downloaded} of ${productsResult.rows.length} missing images for ${store.name}`);
- })().catch(err => console.error('Background image download error:', err));
- res.json({
- message: 'Image download started',
- total_missing: productsResult.rows.length
- });
- }
- catch (error) {
- console.error('Error triggering image download:', error);
- res.status(500).json({ error: 'Failed to trigger image download' });
- }
-});
-// Discover categories for a store
-router.post('/:id/discover-categories', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- const storeResult = await migrate_1.pool.query('SELECT id FROM stores WHERE id = $1', [id]);
- if (storeResult.rows.length === 0) {
- return res.status(404).json({ error: 'Store not found' });
- }
- (0, scraper_v2_1.discoverCategories)(parseInt(id)).catch(err => {
- console.error('Background category discovery error:', err);
- });
- res.json({ message: 'Category discovery started' });
- }
- catch (error) {
- console.error('Error triggering category discovery:', error);
- res.status(500).json({ error: 'Failed to trigger category discovery' });
- }
-});
-// Debug scraper
-router.post('/:id/debug-scrape', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => {
- try {
- const { id } = req.params;
- console.log('Debug scrape triggered for store:', id);
- const categoryResult = await migrate_1.pool.query(`
- SELECT c.dutchie_url, c.name
- FROM categories c
- WHERE c.store_id = $1 AND c.slug = 'edibles'
- LIMIT 1
- `, [id]);
- if (categoryResult.rows.length === 0) {
- return res.status(404).json({ error: 'Edibles category not found' });
- }
- console.log('Found category:', categoryResult.rows[0]);
- const { debugDutchiePage } = await Promise.resolve().then(() => __importStar(require('../services/scraper-debug')));
- debugDutchiePage(categoryResult.rows[0].dutchie_url).catch(err => {
- console.error('Debug error:', err);
- });
- res.json({ message: 'Debug started, check logs', url: categoryResult.rows[0].dutchie_url });
- }
- catch (error) {
- console.error('Debug endpoint error:', error);
- res.status(500).json({ error: 'Failed to debug' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/routes/version.js b/backend/dist/routes/version.js
deleted file mode 100644
index c3f353ea..00000000
--- a/backend/dist/routes/version.js
+++ /dev/null
@@ -1,24 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const express_1 = require("express");
-const router = (0, express_1.Router)();
-/**
- * GET /api/version
- * Returns build version information for display in admin UI
- */
-router.get('/', async (req, res) => {
- try {
- const versionInfo = {
- build_version: process.env.APP_BUILD_VERSION || 'dev',
- git_sha: process.env.APP_GIT_SHA || 'local',
- build_time: process.env.APP_BUILD_TIME || new Date().toISOString(),
- image_tag: process.env.CONTAINER_IMAGE_TAG || 'local',
- };
- res.json(versionInfo);
- }
- catch (error) {
- console.error('Error fetching version info:', error);
- res.status(500).json({ error: 'Failed to fetch version info' });
- }
-});
-exports.default = router;
diff --git a/backend/dist/scraper-v2/downloader.js b/backend/dist/scraper-v2/downloader.js
deleted file mode 100644
index 2855a60b..00000000
--- a/backend/dist/scraper-v2/downloader.js
+++ /dev/null
@@ -1,502 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.Downloader = void 0;
-const puppeteer_1 = __importDefault(require("puppeteer"));
-const axios_1 = __importDefault(require("axios"));
-const types_1 = require("./types");
-const logger_1 = require("../services/logger");
-// Fingerprint profiles for randomization
-const SCREEN_RESOLUTIONS = [
- { width: 1920, height: 1080 },
- { width: 1366, height: 768 },
- { width: 1536, height: 864 },
- { width: 1440, height: 900 },
- { width: 1280, height: 720 },
- { width: 2560, height: 1440 },
- { width: 1680, height: 1050 },
- { width: 1600, height: 900 },
-];
-const TIMEZONES = [
- 'America/New_York',
- 'America/Chicago',
- 'America/Denver',
- 'America/Los_Angeles',
- 'America/Phoenix',
-];
-const LANGUAGES = [
- ['en-US', 'en'],
- ['en-US', 'en', 'es'],
- ['en-US'],
-];
-const PLATFORMS = [
- 'Win32',
- 'MacIntel',
- 'Linux x86_64',
-];
-const WEBGL_VENDORS = [
- 'Google Inc. (NVIDIA)',
- 'Google Inc. (Intel)',
- 'Google Inc. (AMD)',
- 'Intel Inc.',
- 'NVIDIA Corporation',
-];
-const WEBGL_RENDERERS = [
- 'ANGLE (NVIDIA GeForce GTX 1080 Direct3D11 vs_5_0 ps_5_0)',
- 'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)',
- 'ANGLE (AMD Radeon RX 580 Series Direct3D11 vs_5_0 ps_5_0)',
- 'Intel Iris OpenGL Engine',
- 'NVIDIA GeForce RTX 3070/PCIe/SSE2',
- 'AMD Radeon Pro 5500M OpenGL Engine',
-];
-function generateRandomFingerprint() {
- return {
- screen: SCREEN_RESOLUTIONS[Math.floor(Math.random() * SCREEN_RESOLUTIONS.length)],
- timezone: TIMEZONES[Math.floor(Math.random() * TIMEZONES.length)],
- languages: LANGUAGES[Math.floor(Math.random() * LANGUAGES.length)],
- platform: PLATFORMS[Math.floor(Math.random() * PLATFORMS.length)],
- hardwareConcurrency: [4, 8, 12, 16][Math.floor(Math.random() * 4)],
- deviceMemory: [4, 8, 16, 32][Math.floor(Math.random() * 4)],
- webglVendor: WEBGL_VENDORS[Math.floor(Math.random() * WEBGL_VENDORS.length)],
- webglRenderer: WEBGL_RENDERERS[Math.floor(Math.random() * WEBGL_RENDERERS.length)],
- };
-}
-class Downloader {
- browser = null;
- page = null;
- pageInUse = false;
- currentFingerprint = generateRandomFingerprint();
- needsNewFingerprint = false;
- /**
- * Force new fingerprint on next browser creation
- */
- rotateFingerprint() {
- this.needsNewFingerprint = true;
- logger_1.logger.info('scraper', '🔄 Fingerprint rotation scheduled');
- }
- /**
- * Initialize browser instance with fingerprint
- */
- async getBrowser(forceNew = false) {
- // Create new browser if needed for fingerprint rotation
- if (forceNew || this.needsNewFingerprint) {
- await this.close();
- this.currentFingerprint = generateRandomFingerprint();
- this.needsNewFingerprint = false;
- logger_1.logger.info('scraper', `🎭 New fingerprint: ${this.currentFingerprint.screen.width}x${this.currentFingerprint.screen.height}, ${this.currentFingerprint.timezone}, ${this.currentFingerprint.platform}`);
- }
- if (!this.browser || !this.browser.isConnected()) {
- const { screen } = this.currentFingerprint;
- const launchOptions = {
- headless: 'new',
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-blink-features=AutomationControlled',
- `--window-size=${screen.width},${screen.height}`,
- '--disable-web-security',
- '--disable-features=IsolateOrigins,site-per-process',
- '--disable-infobars',
- '--disable-extensions',
- ]
- };
- this.browser = await puppeteer_1.default.launch(launchOptions);
- logger_1.logger.info('scraper', 'Browser instance created');
- }
- return this.browser;
- }
- /**
- * Get or create a page instance with current fingerprint
- */
- async getPage(forceNew = false) {
- if (!this.page || this.page.isClosed() || forceNew) {
- const browser = await this.getBrowser(forceNew);
- this.page = await browser.newPage();
- const { screen } = this.currentFingerprint;
- await this.page.setViewport({
- width: screen.width,
- height: screen.height,
- deviceScaleFactor: 1,
- });
- // Apply fingerprint
- await this.applyFingerprint(this.page);
- logger_1.logger.debug('scraper', 'New page created with fingerprint');
- }
- return this.page;
- }
- /**
- * Apply full fingerprint to page
- */
- async applyFingerprint(page) {
- const fp = this.currentFingerprint;
- await page.evaluateOnNewDocument((fingerprint) => {
- // Hide webdriver
- Object.defineProperty(navigator, 'webdriver', {
- get: () => false,
- });
- // Spoof platform
- Object.defineProperty(navigator, 'platform', {
- get: () => fingerprint.platform,
- });
- // Spoof languages
- Object.defineProperty(navigator, 'languages', {
- get: () => fingerprint.languages,
- });
- // Spoof hardware concurrency
- Object.defineProperty(navigator, 'hardwareConcurrency', {
- get: () => fingerprint.hardwareConcurrency,
- });
- // Spoof device memory
- Object.defineProperty(navigator, 'deviceMemory', {
- get: () => fingerprint.deviceMemory,
- });
- // Spoof plugins (realistic count)
- Object.defineProperty(navigator, 'plugins', {
- get: () => {
- const plugins = [];
- for (let i = 0; i < 5; i++) {
- plugins.push({
- name: `Plugin ${i}`,
- filename: `plugin${i}.dll`,
- description: `Description ${i}`,
- });
- }
- plugins.length = 5;
- return plugins;
- },
- });
- // Chrome object
- window.chrome = {
- runtime: {},
- loadTimes: () => ({}),
- csi: () => ({}),
- app: {},
- };
- // Permissions
- const originalQuery = window.navigator.permissions.query;
- window.navigator.permissions.query = (parameters) => parameters.name === 'notifications'
- ? Promise.resolve({ state: 'denied' })
- : originalQuery(parameters);
- // WebGL fingerprint spoofing
- const getParameterProxyHandler = {
- apply: function (target, thisArg, argumentsList) {
- const param = argumentsList[0];
- // UNMASKED_VENDOR_WEBGL
- if (param === 37445) {
- return fingerprint.webglVendor;
- }
- // UNMASKED_RENDERER_WEBGL
- if (param === 37446) {
- return fingerprint.webglRenderer;
- }
- return Reflect.apply(target, thisArg, argumentsList);
- }
- };
- // Override WebGL
- const originalGetContext = HTMLCanvasElement.prototype.getContext;
- HTMLCanvasElement.prototype.getContext = function (type, ...args) {
- const context = originalGetContext.call(this, type, ...args);
- if (context && (type === 'webgl' || type === 'webgl2' || type === 'experimental-webgl')) {
- const glContext = context;
- const originalGetParameter = glContext.getParameter.bind(glContext);
- glContext.getParameter = new Proxy(originalGetParameter, getParameterProxyHandler);
- }
- return context;
- };
- // Canvas fingerprint noise
- const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
- HTMLCanvasElement.prototype.toDataURL = function (type) {
- const context = this.getContext('2d');
- if (context) {
- const imageData = context.getImageData(0, 0, this.width, this.height);
- for (let i = 0; i < imageData.data.length; i += 4) {
- // Add tiny noise to RGB values
- imageData.data[i] = imageData.data[i] ^ (Math.random() > 0.5 ? 1 : 0);
- }
- context.putImageData(imageData, 0, 0);
- }
- return originalToDataURL.call(this, type);
- };
- // Screen dimensions
- Object.defineProperty(window.screen, 'width', { get: () => fingerprint.screen.width });
- Object.defineProperty(window.screen, 'height', { get: () => fingerprint.screen.height });
- Object.defineProperty(window.screen, 'availWidth', { get: () => fingerprint.screen.width });
- Object.defineProperty(window.screen, 'availHeight', { get: () => fingerprint.screen.height - 40 });
- Object.defineProperty(window, 'innerWidth', { get: () => fingerprint.screen.width });
- Object.defineProperty(window, 'innerHeight', { get: () => fingerprint.screen.height - 140 });
- Object.defineProperty(window, 'outerWidth', { get: () => fingerprint.screen.width });
- Object.defineProperty(window, 'outerHeight', { get: () => fingerprint.screen.height });
- }, fp);
- // Set timezone via CDP
- const client = await page.target().createCDPSession();
- await client.send('Emulation.setTimezoneOverride', { timezoneId: fp.timezone });
- }
- /**
- * Apply stealth mode to page (legacy - now uses applyFingerprint)
- */
- async makePageStealthy(page) {
- // Now handled by applyFingerprint
- await this.applyFingerprint(page);
- }
- /**
- * Configure proxy for browser
- */
- getProxyArgs(proxy) {
- if (proxy.protocol === 'socks5') {
- return [`--proxy-server=socks5://${proxy.host}:${proxy.port}`];
- }
- else if (proxy.protocol === 'http' || proxy.protocol === 'https') {
- return [`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`];
- }
- return [];
- }
- /**
- * HTTP-based fetch (lightweight, fast)
- */
- async httpFetch(request) {
- try {
- const config = {
- timeout: 30000,
- headers: {
- 'User-Agent': request.metadata.userAgent || 'Mozilla/5.0',
- ...request.metadata.headers
- },
- validateStatus: () => true // Don't throw on any status
- };
- // Add proxy if available
- if (request.metadata.proxy) {
- const proxy = request.metadata.proxy;
- config.proxy = {
- host: proxy.host,
- port: proxy.port,
- protocol: proxy.protocol
- };
- if (proxy.username && proxy.password) {
- config.proxy.auth = {
- username: proxy.username,
- password: proxy.password
- };
- }
- }
- const response = await axios_1.default.get(request.url, config);
- return {
- url: request.url,
- statusCode: response.status,
- content: response.data,
- metadata: {
- headers: response.headers,
- method: 'http'
- },
- request
- };
- }
- catch (error) {
- const scraperError = new Error(error.message);
- if (error.code === 'ETIMEDOUT' || error.code === 'ECONNABORTED') {
- scraperError.type = types_1.ErrorType.TIMEOUT;
- }
- else if (error.code === 'ECONNREFUSED' || error.code === 'ENOTFOUND') {
- scraperError.type = types_1.ErrorType.NETWORK_ERROR;
- }
- else {
- scraperError.type = types_1.ErrorType.UNKNOWN;
- }
- scraperError.retryable = true;
- scraperError.request = request;
- throw scraperError;
- }
- }
- /**
- * Browser-based fetch (for JS-heavy sites)
- */
- async browserFetch(request) {
- // Wait if page is in use
- while (this.pageInUse) {
- await new Promise(resolve => setTimeout(resolve, 100));
- }
- this.pageInUse = true;
- try {
- const page = await this.getPage();
- // Apply stealth mode if required
- if (request.metadata.requiresStealth) {
- await this.makePageStealthy(page);
- }
- // Set user agent
- if (request.metadata.userAgent) {
- await page.setUserAgent(request.metadata.userAgent);
- }
- // Navigate to page - use networkidle2 for SPAs like Dutchie
- // Increased timeout to 90s - Dutchie pages can take 30-40s to fully load
- const navigationPromise = page.goto(request.url, {
- waitUntil: 'networkidle2',
- timeout: 90000
- });
- const response = await navigationPromise;
- if (!response) {
- throw new Error('Navigation failed - no response');
- }
- // Wait for React to render product content
- // Try to wait for products, but don't fail if they don't appear (empty category)
- try {
- await page.waitForSelector('[data-testid="product-list-item"], [data-testid="empty-state"]', {
- timeout: 10000
- });
- }
- catch {
- // Products might not exist in this category - continue anyway
- logger_1.logger.debug('scraper', 'No products found within timeout - continuing');
- }
- // Additional wait for any lazy-loaded content
- await page.waitForTimeout(2000);
- // Check for lazy-loaded content
- await this.autoScroll(page);
- // Get page content
- const content = await page.content();
- const statusCode = response.status();
- return {
- url: request.url,
- statusCode,
- content,
- metadata: {
- method: 'browser',
- finalUrl: page.url()
- },
- request
- };
- }
- catch (error) {
- const scraperError = new Error(error.message);
- if (error.message.includes('timeout') || error.message.includes('Navigation timeout')) {
- scraperError.type = types_1.ErrorType.TIMEOUT;
- }
- else if (error.message.includes('net::')) {
- scraperError.type = types_1.ErrorType.NETWORK_ERROR;
- }
- else if (error.message.includes('404')) {
- scraperError.type = types_1.ErrorType.NOT_FOUND;
- }
- else {
- scraperError.type = types_1.ErrorType.UNKNOWN;
- }
- scraperError.retryable = scraperError.type !== types_1.ErrorType.NOT_FOUND;
- scraperError.request = request;
- throw scraperError;
- }
- finally {
- this.pageInUse = false;
- }
- }
- /**
- * Auto-scroll to load lazy content
- */
- async autoScroll(page) {
- try {
- await page.evaluate(async () => {
- await new Promise((resolve) => {
- let totalHeight = 0;
- const distance = 500;
- const maxScrolls = 20; // Prevent infinite scrolling
- let scrollCount = 0;
- const timer = setInterval(() => {
- // @ts-ignore - runs in browser context
- const scrollHeight = document.body.scrollHeight;
- // @ts-ignore - runs in browser context
- window.scrollBy(0, distance);
- totalHeight += distance;
- scrollCount++;
- if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) {
- clearInterval(timer);
- // Scroll back to top
- // @ts-ignore - runs in browser context
- window.scrollTo(0, 0);
- resolve();
- }
- }, 200);
- });
- });
- // Wait for any lazy-loaded content
- await page.waitForTimeout(1000);
- }
- catch (error) {
- logger_1.logger.warn('scraper', `Auto-scroll failed: ${error}`);
- }
- }
- /**
- * Main fetch method - tries HTTP first, falls back to browser
- */
- async fetch(request) {
- const startTime = Date.now();
- try {
- // Force browser mode if required
- if (request.metadata.requiresBrowser) {
- logger_1.logger.debug('scraper', `Browser fetch: ${request.url}`);
- const response = await this.browserFetch(request);
- logger_1.logger.debug('scraper', `Fetch completed in ${Date.now() - startTime}ms`);
- return response;
- }
- // Try HTTP first (faster)
- try {
- logger_1.logger.debug('scraper', `HTTP fetch: ${request.url}`);
- const response = await this.httpFetch(request);
- // Check if we got a meaningful response
- if (response.statusCode && response.statusCode >= 200 && response.statusCode < 300) {
- logger_1.logger.debug('scraper', `HTTP fetch succeeded in ${Date.now() - startTime}ms`);
- return response;
- }
- // Fall through to browser mode for non-2xx responses
- logger_1.logger.debug('scraper', `HTTP got ${response.statusCode || 'unknown'}, trying browser`);
- }
- catch (httpError) {
- logger_1.logger.debug('scraper', `HTTP failed, falling back to browser: ${httpError}`);
- }
- // Fall back to browser
- request.metadata.requiresBrowser = true;
- const response = await this.browserFetch(request);
- logger_1.logger.debug('scraper', `Browser fetch completed in ${Date.now() - startTime}ms`);
- return response;
- }
- catch (error) {
- logger_1.logger.error('scraper', `Fetch failed after ${Date.now() - startTime}ms: ${error}`);
- throw error;
- }
- }
- /**
- * Evaluate JavaScript in the current page context
- */
- async evaluate(fn) {
- if (!this.page || this.page.isClosed()) {
- throw new Error('No active page for evaluation');
- }
- return await this.page.evaluate(fn);
- }
- /**
- * Get the current page (for custom operations)
- */
- async getCurrentPage() {
- return this.page;
- }
- /**
- * Close the browser
- */
- async close() {
- if (this.page && !this.page.isClosed()) {
- await this.page.close();
- this.page = null;
- }
- if (this.browser && this.browser.isConnected()) {
- await this.browser.close();
- this.browser = null;
- logger_1.logger.info('scraper', 'Browser closed');
- }
- }
- /**
- * Clean up resources
- */
- async cleanup() {
- await this.close();
- }
-}
-exports.Downloader = Downloader;
diff --git a/backend/dist/scraper-v2/engine.js b/backend/dist/scraper-v2/engine.js
deleted file mode 100644
index e7cf36bf..00000000
--- a/backend/dist/scraper-v2/engine.js
+++ /dev/null
@@ -1,693 +0,0 @@
-"use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.DutchieSpider = exports.ScraperEngine = void 0;
-const scheduler_1 = require("./scheduler");
-const downloader_1 = require("./downloader");
-const middlewares_1 = require("./middlewares");
-const pipelines_1 = require("./pipelines");
-const logger_1 = require("../services/logger");
-const migrate_1 = require("../db/migrate");
-/**
- * Main Scraper Engine - orchestrates the entire scraping process
- */
-class ScraperEngine {
- scheduler;
- downloader;
- middlewareEngine;
- pipelineEngine;
- stats;
- isRunning = false;
- concurrency = 1; // Conservative default
- constructor(concurrency = 1) {
- this.scheduler = new scheduler_1.RequestScheduler();
- this.downloader = new downloader_1.Downloader();
- this.middlewareEngine = new middlewares_1.MiddlewareEngine();
- this.pipelineEngine = new pipelines_1.PipelineEngine();
- this.concurrency = concurrency;
- // Initialize stats
- this.stats = {
- requestsTotal: 0,
- requestsSuccess: 0,
- requestsFailed: 0,
- itemsScraped: 0,
- itemsSaved: 0,
- itemsDropped: 0,
- errorsCount: 0,
- startTime: new Date()
- };
- // Setup middlewares
- this.setupMiddlewares();
- // Setup pipelines
- this.setupPipelines();
- }
- /**
- * Setup middleware chain
- */
- setupMiddlewares() {
- this.middlewareEngine.use(new middlewares_1.UserAgentMiddleware());
- this.middlewareEngine.use(new middlewares_1.ProxyMiddleware());
- this.middlewareEngine.use(new middlewares_1.RateLimitMiddleware());
- this.middlewareEngine.use(new middlewares_1.RetryMiddleware());
- this.middlewareEngine.use(new middlewares_1.BotDetectionMiddleware());
- this.middlewareEngine.use(new middlewares_1.StealthMiddleware());
- }
- /**
- * Setup pipeline chain
- */
- setupPipelines() {
- this.pipelineEngine.use(new pipelines_1.ValidationPipeline());
- this.pipelineEngine.use(new pipelines_1.SanitizationPipeline());
- this.pipelineEngine.use(new pipelines_1.DeduplicationPipeline());
- this.pipelineEngine.use(new pipelines_1.ImagePipeline());
- this.pipelineEngine.use(new pipelines_1.StatsPipeline());
- this.pipelineEngine.use(new pipelines_1.DatabasePipeline());
- }
- /**
- * Add a request to the queue
- */
- enqueue(request) {
- this.scheduler.enqueue(request);
- }
- /**
- * Start the scraping engine
- */
- async start() {
- if (this.isRunning) {
- logger_1.logger.warn('scraper', 'Engine is already running');
- return;
- }
- this.isRunning = true;
- this.stats.startTime = new Date();
- logger_1.logger.info('scraper', `🚀 Starting scraper engine (concurrency: ${this.concurrency})`);
- // Process queue
- await this.processQueue();
- this.isRunning = false;
- this.stats.endTime = new Date();
- this.stats.duration = this.stats.endTime.getTime() - this.stats.startTime.getTime();
- logger_1.logger.info('scraper', `✅ Scraper engine finished`);
- this.logStats();
- // Cleanup
- await this.downloader.cleanup();
- }
- /**
- * Process the request queue
- */
- async processQueue() {
- while (!this.scheduler.isEmpty() && this.isRunning) {
- const request = this.scheduler.dequeue();
- if (!request) {
- // Wait a bit and check again
- await new Promise(resolve => setTimeout(resolve, 100));
- continue;
- }
- try {
- await this.processRequest(request);
- }
- catch (error) {
- logger_1.logger.error('scraper', `Failed to process request: ${error}`);
- }
- }
- }
- /**
- * Process a single request
- */
- async processRequest(request) {
- this.stats.requestsTotal++;
- try {
- logger_1.logger.debug('scraper', `Processing: ${request.url}`);
- // Apply request middlewares
- const processedRequest = await this.middlewareEngine.processRequest(request);
- // Download
- let response = await this.downloader.fetch(processedRequest);
- // Apply response middlewares
- response = await this.middlewareEngine.processResponse(response);
- // Parse response using callback
- const parseResult = await request.callback(response);
- // Process items through pipeline
- if (parseResult.items && parseResult.items.length > 0) {
- for (const item of parseResult.items) {
- await this.processItem(item, 'default');
- }
- }
- // Enqueue follow-up requests
- if (parseResult.requests && parseResult.requests.length > 0) {
- for (const followUpRequest of parseResult.requests) {
- this.scheduler.enqueue(followUpRequest);
- }
- }
- this.stats.requestsSuccess++;
- this.scheduler.markComplete(request);
- }
- catch (error) {
- this.stats.requestsFailed++;
- this.stats.errorsCount++;
- logger_1.logger.error('scraper', `Request failed: ${request.url} - ${error.message}`);
- // Apply error middlewares
- const handledError = await this.middlewareEngine.processError(error, request);
- // If error is null, it was handled (e.g., retry)
- if (handledError === null) {
- this.scheduler.requeueForRetry(request);
- }
- else {
- this.scheduler.markComplete(request);
- // Call error handler if provided
- if (request.errorHandler) {
- await request.errorHandler(error, request);
- }
- }
- }
- }
- /**
- * Process an item through pipelines
- */
- async processItem(item, spider) {
- this.stats.itemsScraped++;
- try {
- const processedItem = await this.pipelineEngine.processItem(item, spider);
- if (processedItem) {
- this.stats.itemsSaved++;
- }
- else {
- this.stats.itemsDropped++;
- }
- }
- catch (error) {
- logger_1.logger.error('scraper', `Failed to process item: ${error}`);
- this.stats.itemsDropped++;
- }
- }
- /**
- * Log statistics
- */
- logStats() {
- logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
- logger_1.logger.info('scraper', '📊 Scraper Statistics');
- logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
- logger_1.logger.info('scraper', ` Requests: ${this.stats.requestsSuccess}/${this.stats.requestsTotal} successful`);
- logger_1.logger.info('scraper', ` Items: ${this.stats.itemsSaved} saved, ${this.stats.itemsDropped} dropped`);
- logger_1.logger.info('scraper', ` Errors: ${this.stats.errorsCount}`);
- logger_1.logger.info('scraper', ` Duration: ${Math.round((this.stats.duration || 0) / 1000)}s`);
- // Get stats from StatsPipeline
- const statsPipeline = this.pipelineEngine.getPipeline('StatsPipeline');
- if (statsPipeline) {
- const itemStats = statsPipeline.getStats();
- logger_1.logger.info('scraper', ` Items with images: ${itemStats.withImages}/${itemStats.total}`);
- logger_1.logger.info('scraper', ` Items with THC: ${itemStats.withThc}/${itemStats.total}`);
- logger_1.logger.info('scraper', ` Items with descriptions: ${itemStats.withDescription}/${itemStats.total}`);
- }
- logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
- }
- /**
- * Stop the engine
- */
- stop() {
- this.isRunning = false;
- logger_1.logger.info('scraper', 'Stopping scraper engine...');
- }
- /**
- * Get current stats
- */
- getStats() {
- return { ...this.stats };
- }
- /**
- * Get queue stats
- */
- getQueueStats() {
- return this.scheduler.getStats();
- }
-}
-exports.ScraperEngine = ScraperEngine;
-/**
- * Spider for scraping Dutchie categories
- */
-class DutchieSpider {
- engine;
- constructor(engine) {
- this.engine = engine;
- }
- /**
- * Scrape a category
- */
- async scrapeCategory(storeId, categoryId) {
- logger_1.logger.info('scraper', `Starting category scrape: store=${storeId}, category=${categoryId}`);
- const scraperId = `scraper-${storeId}-${categoryId}-${Date.now()}`;
- let registerScraper, updateScraperStats, completeScraper;
- try {
- // Import monitoring functions
- const monitor = await Promise.resolve().then(() => __importStar(require('../routes/scraper-monitor')));
- registerScraper = monitor.registerScraper;
- updateScraperStats = monitor.updateScraperStats;
- completeScraper = monitor.completeScraper;
- }
- catch (e) {
- // Monitoring not available
- }
- try {
- // Get category info
- const categoryResult = await migrate_1.pool.query(`
- SELECT c.*, s.slug as store_slug, s.name as store_name
- FROM categories c
- JOIN stores s ON c.store_id = s.id
- WHERE c.id = $1
- `, [categoryId]);
- if (categoryResult.rows.length === 0) {
- throw new Error('Category not found');
- }
- const category = categoryResult.rows[0];
- logger_1.logger.info('scraper', `Category: ${category.name} (${category.dutchie_url})`);
- // Register with monitoring system
- if (registerScraper) {
- registerScraper(scraperId, storeId, category.store_name, categoryId, category.name);
- }
- // Mark products as out of stock before scraping
- await migrate_1.pool.query(`
- UPDATE products
- SET in_stock = false
- WHERE store_id = $1 AND category_id = $2
- `, [storeId, categoryId]);
- if (updateScraperStats) {
- updateScraperStats(scraperId, {}, 'Marking products as out of stock');
- }
- // Enqueue category page request
- this.engine.enqueue({
- url: category.dutchie_url,
- priority: 100,
- maxRetries: 3,
- metadata: {
- requiresBrowser: true,
- storeId,
- categoryId,
- categorySlug: category.slug,
- storeSlug: category.store_slug
- },
- callback: this.parseCategoryPage.bind(this)
- });
- // Start the engine
- if (updateScraperStats) {
- updateScraperStats(scraperId, {}, 'Scraping category page');
- }
- await this.engine.start();
- // Update stats from engine
- const engineStats = this.engine.getStats();
- if (updateScraperStats) {
- updateScraperStats(scraperId, {
- requestsTotal: engineStats.requestsTotal,
- requestsSuccess: engineStats.requestsSuccess,
- itemsSaved: engineStats.itemsSaved,
- itemsDropped: engineStats.itemsDropped,
- errorsCount: engineStats.errorsCount
- }, 'Finalizing');
- }
- // Update category last_scraped_at
- await migrate_1.pool.query(`
- UPDATE categories
- SET last_scraped_at = CURRENT_TIMESTAMP
- WHERE id = $1
- `, [categoryId]);
- logger_1.logger.info('scraper', `✅ Category scrape completed: ${category.name}`);
- if (completeScraper) {
- completeScraper(scraperId);
- }
- }
- catch (error) {
- logger_1.logger.error('scraper', `Category scrape failed: ${error}`);
- if (completeScraper) {
- completeScraper(scraperId, String(error));
- }
- throw error;
- }
- }
- /**
- * Parse category page (product listing)
- */
- async parseCategoryPage(response) {
- const page = await this.engine['downloader'].getCurrentPage();
- if (!page) {
- throw new Error('No active page');
- }
- logger_1.logger.info('scraper', 'Parsing category page...');
- // Extract product cards
- const productCards = await page.evaluate(() => {
- // @ts-ignore - runs in browser context
- const cards = document.querySelectorAll('[data-testid="product-list-item"]');
- const items = [];
- cards.forEach((card) => {
- try {
- const allText = card.textContent || '';
- // Extract name
- let name = '';
- const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4'];
- for (const sel of nameSelectors) {
- const el = card.querySelector(sel);
- if (el?.textContent?.trim()) {
- name = el.textContent.trim().split('\n')[0].trim();
- break;
- }
- }
- if (!name || name.length < 2)
- return;
- // Extract price
- let price = null;
- let originalPrice = null;
- const priceMatches = allText.match(/\$(\d+\.?\d*)/g);
- if (priceMatches && priceMatches.length > 0) {
- price = parseFloat(priceMatches[0].replace('$', ''));
- if (priceMatches.length > 1) {
- originalPrice = parseFloat(priceMatches[1].replace('$', ''));
- }
- }
- // Extract link
- const linkEl = card.querySelector('a[href*="/product/"]');
- let href = linkEl?.getAttribute('href') || '';
- if (href && href.startsWith('/')) {
- // @ts-ignore - runs in browser context
- href = window.location.origin + href;
- }
- // Extract image URL from product card
- let imageUrl = null;
- const imgSelectors = [
- 'img[src*="images.dutchie.com"]',
- 'img[src*="dutchie"]',
- 'img[data-testid*="product"]',
- 'img[class*="product"]',
- 'img[class*="Product"]',
- 'picture img',
- 'img'
- ];
- for (const sel of imgSelectors) {
- const img = card.querySelector(sel);
- if (img) {
- const src = img.getAttribute('src') || img.getAttribute('data-src') || '';
- if (src && (src.includes('dutchie.com') || src.includes('images.'))) {
- imageUrl = src;
- break;
- }
- }
- }
- items.push({ name, price, originalPrice, href, imageUrl });
- }
- catch (err) {
- console.error('Error parsing product card:', err);
- }
- });
- return items;
- });
- logger_1.logger.info('scraper', `Found ${productCards.length} products on listing page`);
- // Create follow-up requests for each product
- const requests = productCards.map((card, index) => ({
- url: card.href,
- priority: 50,
- maxRetries: 3,
- metadata: {
- ...response.request.metadata,
- productName: card.name,
- productPrice: card.price,
- productOriginalPrice: card.originalPrice,
- productImageUrl: card.imageUrl, // Pass image from category page
- requiresBrowser: true
- },
- callback: this.parseProductPage.bind(this)
- }));
- return { items: [], requests };
- }
- /**
- * Parse individual product page
- */
- async parseProductPage(response) {
- const page = await this.engine['downloader'].getCurrentPage();
- if (!page) {
- throw new Error('No active page');
- }
- const productName = response.request.metadata.productName;
- logger_1.logger.debug('scraper', `Parsing product: ${productName}`);
- // Extract product details
- const details = await page.evaluate(() => {
- // @ts-ignore - runs in browser context
- const allText = document.body.textContent || '';
- // Extract image - expanded selectors for better coverage
- let fullSizeImage = null;
- const mainImageSelectors = [
- 'img[src*="images.dutchie.com"]',
- 'img[src*="dutchie"]',
- 'img[class*="ProductImage"]',
- 'img[class*="product-image"]',
- 'img[class*="Product"]',
- '[class*="ImageGallery"] img',
- '[data-testid*="product"] img',
- '[data-testid*="image"] img',
- 'picture img',
- 'main img'
- ];
- for (const sel of mainImageSelectors) {
- // @ts-ignore - runs in browser context
- const img = document.querySelector(sel);
- const src = img?.src || img?.getAttribute('data-src') || '';
- if (src && (src.includes('dutchie.com') || src.includes('images.'))) {
- fullSizeImage = src;
- break;
- }
- }
- // Extract description
- let description = '';
- const descSelectors = [
- '[class*="description"]',
- '[class*="Description"]',
- '[data-testid*="description"]',
- 'p[class*="product"]'
- ];
- for (const sel of descSelectors) {
- // @ts-ignore - runs in browser context
- const el = document.querySelector(sel);
- if (el?.textContent?.trim() && el.textContent.length > 20) {
- description = el.textContent.trim();
- break;
- }
- }
- // Extract THC/CBD
- let thc = null;
- const thcPatterns = [
- /THC[:\s]*(\d+\.?\d*)\s*%/i,
- /Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i,
- /(\d+\.?\d*)\s*%\s+THC/i
- ];
- for (const pattern of thcPatterns) {
- const match = allText.match(pattern);
- if (match) {
- thc = parseFloat(match[1]);
- break;
- }
- }
- let cbd = null;
- const cbdPatterns = [
- /CBD[:\s]*(\d+\.?\d*)\s*%/i,
- /Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i,
- /(\d+\.?\d*)\s*%\s+CBD/i
- ];
- for (const pattern of cbdPatterns) {
- const match = allText.match(pattern);
- if (match) {
- cbd = parseFloat(match[1]);
- break;
- }
- }
- // Extract strain type
- let strainType = null;
- if (allText.match(/\bindica\b/i))
- strainType = 'Indica';
- else if (allText.match(/\bsativa\b/i))
- strainType = 'Sativa';
- else if (allText.match(/\bhybrid\b/i))
- strainType = 'Hybrid';
- // Extract brand
- let brand = null;
- const brandSelectors = [
- '[class*="brand"]',
- '[class*="Brand"]',
- '[data-testid*="brand"]'
- ];
- for (const sel of brandSelectors) {
- // @ts-ignore - runs in browser context
- const el = document.querySelector(sel);
- if (el?.textContent?.trim()) {
- brand = el.textContent.trim();
- break;
- }
- }
- // Extract metadata
- const terpenes = [];
- const terpeneNames = ['Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool', 'Humulene'];
- terpeneNames.forEach(terp => {
- if (allText.match(new RegExp(`\\b${terp}\\b`, 'i'))) {
- terpenes.push(terp);
- }
- });
- const effects = [];
- const effectNames = ['Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative', 'Energetic'];
- effectNames.forEach(effect => {
- if (allText.match(new RegExp(`\\b${effect}\\b`, 'i'))) {
- effects.push(effect);
- }
- });
- return {
- fullSizeImage,
- description,
- thc,
- cbd,
- strainType,
- brand,
- terpenes,
- effects
- };
- });
- // Create product item
- // Use image from product page, fallback to category page image
- const imageUrl = details.fullSizeImage || response.request.metadata.productImageUrl || undefined;
- const product = {
- dutchieProductId: `${response.request.metadata.storeSlug}-${response.request.metadata.categorySlug}-${Date.now()}-${Math.random()}`,
- name: productName || 'Unknown Product',
- description: details.description,
- price: response.request.metadata.productPrice,
- originalPrice: response.request.metadata.productOriginalPrice,
- thcPercentage: details.thc || undefined,
- cbdPercentage: details.cbd || undefined,
- strainType: details.strainType || undefined,
- brand: details.brand || undefined,
- imageUrl: imageUrl,
- dutchieUrl: response.url,
- metadata: {
- terpenes: details.terpenes,
- effects: details.effects
- },
- storeId: response.request.metadata.storeId,
- categoryId: response.request.metadata.categoryId
- };
- return { items: [product], requests: [] };
- }
- /**
- * Scrape entire store
- */
- async scrapeStore(storeId, parallel = 3) {
- logger_1.logger.info('scraper', `🏪 Starting store scrape: ${storeId} (${parallel} parallel scrapers)`);
- try {
- // Check if categories exist, if not, discover them first
- const categoryCountResult = await migrate_1.pool.query(`
- SELECT COUNT(*) as count
- FROM categories
- WHERE store_id = $1
- `, [storeId]);
- if (parseInt(categoryCountResult.rows[0].count) === 0) {
- logger_1.logger.info('scraper', 'No categories found - running discovery first');
- const { discoverCategories } = await Promise.resolve().then(() => __importStar(require('./index')));
- await discoverCategories(storeId);
- }
- // Get all leaf categories (no children)
- const categoriesResult = await migrate_1.pool.query(`
- SELECT c.id, c.name
- FROM categories c
- WHERE c.store_id = $1
- AND c.scrape_enabled = true
- AND NOT EXISTS (
- SELECT 1 FROM categories child
- WHERE child.parent_id = c.id
- )
- ORDER BY c.name
- `, [storeId]);
- const categories = categoriesResult.rows;
- logger_1.logger.info('scraper', `Found ${categories.length} categories to scrape`);
- if (parallel === 1) {
- // Sequential scraping (original behavior)
- for (const category of categories) {
- try {
- await this.scrapeCategory(storeId, category.id);
- await new Promise(resolve => setTimeout(resolve, 3000));
- }
- catch (error) {
- logger_1.logger.error('scraper', `Failed to scrape category ${category.name}: ${error}`);
- }
- }
- }
- else {
- // Parallel scraping with concurrency limit
- const results = await this.scrapeMultipleCategoriesParallel(storeId, categories, parallel);
- const successful = results.filter(r => r.status === 'fulfilled').length;
- const failed = results.filter(r => r.status === 'rejected').length;
- logger_1.logger.info('scraper', `Parallel scrape results: ${successful} successful, ${failed} failed`);
- }
- // Update store last_scraped_at
- await migrate_1.pool.query(`
- UPDATE stores
- SET last_scraped_at = CURRENT_TIMESTAMP
- WHERE id = $1
- `, [storeId]);
- logger_1.logger.info('scraper', `🎉 Store scrape completed: ${storeId}`);
- }
- catch (error) {
- logger_1.logger.error('scraper', `Store scrape failed: ${error}`);
- throw error;
- }
- }
- /**
- * Scrape multiple categories in parallel with concurrency limit
- */
- async scrapeMultipleCategoriesParallel(storeId, categories, concurrency) {
- const results = [];
- // Process categories in batches
- for (let i = 0; i < categories.length; i += concurrency) {
- const batch = categories.slice(i, i + concurrency);
- logger_1.logger.info('scraper', `Scraping batch ${Math.floor(i / concurrency) + 1}: ${batch.map(c => c.name).join(', ')}`);
- const batchPromises = batch.map(category => {
- // Create a new spider instance for each category
- const engine = new ScraperEngine(1); // 1 concurrent request per spider
- const spider = new DutchieSpider(engine);
- return spider.scrapeCategory(storeId, category.id)
- .catch(error => {
- logger_1.logger.error('scraper', `Category ${category.name} failed: ${error}`);
- throw error;
- });
- });
- const batchResults = await Promise.allSettled(batchPromises);
- results.push(...batchResults);
- // Delay between batches to avoid overwhelming the server
- if (i + concurrency < categories.length) {
- logger_1.logger.info('scraper', 'Waiting 5s before next batch...');
- await new Promise(resolve => setTimeout(resolve, 5000));
- }
- }
- return results;
- }
-}
-exports.DutchieSpider = DutchieSpider;
diff --git a/backend/dist/scraper-v2/index.js b/backend/dist/scraper-v2/index.js
deleted file mode 100644
index 57669863..00000000
--- a/backend/dist/scraper-v2/index.js
+++ /dev/null
@@ -1,115 +0,0 @@
-"use strict";
-/**
- * Scraper V2 - Scrapy-inspired web scraping framework
- *
- * IMPORTANT: For Dutchie stores, DO NOT USE scrapeStore() from this module.
- * Dutchie crawling must go through the dutchie-az GraphQL pipeline:
- * src/dutchie-az/services/product-crawler.ts
- *
- * This scraper-v2 module uses DOM-based extraction which is unreliable
- * for Dutchie. The new dutchie-az pipeline uses GraphQL directly.
- *
- * Architecture:
- * - Engine: Main orchestrator
- * - Scheduler: Priority queue with deduplication
- * - Downloader: HTTP + Browser hybrid fetcher
- * - Middlewares: Request/response processing chain
- * - Pipelines: Item processing and persistence
- * - Navigation: Category discovery
- */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __exportStar = (this && this.__exportStar) || function(m, exports) {
- for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.StatsPipeline = exports.DatabasePipeline = exports.ImagePipeline = exports.DeduplicationPipeline = exports.SanitizationPipeline = exports.ValidationPipeline = exports.PipelineEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = exports.MiddlewareEngine = exports.NavigationDiscovery = exports.Downloader = exports.RequestScheduler = exports.DutchieSpider = exports.ScraperEngine = void 0;
-exports.scrapeCategory = scrapeCategory;
-exports.scrapeStore = scrapeStore;
-exports.discoverCategories = discoverCategories;
-var engine_1 = require("./engine");
-Object.defineProperty(exports, "ScraperEngine", { enumerable: true, get: function () { return engine_1.ScraperEngine; } });
-Object.defineProperty(exports, "DutchieSpider", { enumerable: true, get: function () { return engine_1.DutchieSpider; } });
-var scheduler_1 = require("./scheduler");
-Object.defineProperty(exports, "RequestScheduler", { enumerable: true, get: function () { return scheduler_1.RequestScheduler; } });
-var downloader_1 = require("./downloader");
-Object.defineProperty(exports, "Downloader", { enumerable: true, get: function () { return downloader_1.Downloader; } });
-var navigation_1 = require("./navigation");
-Object.defineProperty(exports, "NavigationDiscovery", { enumerable: true, get: function () { return navigation_1.NavigationDiscovery; } });
-var middlewares_1 = require("./middlewares");
-Object.defineProperty(exports, "MiddlewareEngine", { enumerable: true, get: function () { return middlewares_1.MiddlewareEngine; } });
-Object.defineProperty(exports, "UserAgentMiddleware", { enumerable: true, get: function () { return middlewares_1.UserAgentMiddleware; } });
-Object.defineProperty(exports, "ProxyMiddleware", { enumerable: true, get: function () { return middlewares_1.ProxyMiddleware; } });
-Object.defineProperty(exports, "RateLimitMiddleware", { enumerable: true, get: function () { return middlewares_1.RateLimitMiddleware; } });
-Object.defineProperty(exports, "RetryMiddleware", { enumerable: true, get: function () { return middlewares_1.RetryMiddleware; } });
-Object.defineProperty(exports, "BotDetectionMiddleware", { enumerable: true, get: function () { return middlewares_1.BotDetectionMiddleware; } });
-Object.defineProperty(exports, "StealthMiddleware", { enumerable: true, get: function () { return middlewares_1.StealthMiddleware; } });
-var pipelines_1 = require("./pipelines");
-Object.defineProperty(exports, "PipelineEngine", { enumerable: true, get: function () { return pipelines_1.PipelineEngine; } });
-Object.defineProperty(exports, "ValidationPipeline", { enumerable: true, get: function () { return pipelines_1.ValidationPipeline; } });
-Object.defineProperty(exports, "SanitizationPipeline", { enumerable: true, get: function () { return pipelines_1.SanitizationPipeline; } });
-Object.defineProperty(exports, "DeduplicationPipeline", { enumerable: true, get: function () { return pipelines_1.DeduplicationPipeline; } });
-Object.defineProperty(exports, "ImagePipeline", { enumerable: true, get: function () { return pipelines_1.ImagePipeline; } });
-Object.defineProperty(exports, "DatabasePipeline", { enumerable: true, get: function () { return pipelines_1.DatabasePipeline; } });
-Object.defineProperty(exports, "StatsPipeline", { enumerable: true, get: function () { return pipelines_1.StatsPipeline; } });
-__exportStar(require("./types"), exports);
-// Main API functions
-const engine_2 = require("./engine");
-const navigation_2 = require("./navigation");
-const downloader_2 = require("./downloader");
-const logger_1 = require("../services/logger");
-/**
- * Scrape a single category
- */
-async function scrapeCategory(storeId, categoryId) {
- const engine = new engine_2.ScraperEngine(1);
- const spider = new engine_2.DutchieSpider(engine);
- try {
- await spider.scrapeCategory(storeId, categoryId);
- }
- catch (error) {
- logger_1.logger.error('scraper', `scrapeCategory failed: ${error}`);
- throw error;
- }
-}
-/**
- * Scrape an entire store
- */
-async function scrapeStore(storeId, parallel = 3, _userAgent) {
- const engine = new engine_2.ScraperEngine(1);
- const spider = new engine_2.DutchieSpider(engine);
- try {
- await spider.scrapeStore(storeId, parallel);
- }
- catch (error) {
- logger_1.logger.error('scraper', `scrapeStore failed: ${error}`);
- throw error;
- }
-}
-/**
- * Discover categories for a store
- */
-async function discoverCategories(storeId) {
- const downloader = new downloader_2.Downloader();
- const discovery = new navigation_2.NavigationDiscovery(downloader);
- try {
- // Discover categories (uses your existing Dutchie category structure)
- await discovery.discoverCategories(storeId);
- }
- catch (error) {
- logger_1.logger.error('scraper', `discoverCategories failed: ${error}`);
- throw error;
- }
- finally {
- await downloader.cleanup();
- }
-}
diff --git a/backend/dist/scraper-v2/middlewares.js b/backend/dist/scraper-v2/middlewares.js
deleted file mode 100644
index 5d10ef79..00000000
--- a/backend/dist/scraper-v2/middlewares.js
+++ /dev/null
@@ -1,351 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0;
-const types_1 = require("./types");
-const logger_1 = require("../services/logger");
-const proxy_1 = require("../services/proxy");
-// Diverse, realistic user agents - updated for 2024/2025
-const USER_AGENTS = [
- // Chrome on Windows (most common)
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
- // Chrome on Mac
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
- // Chrome on Linux
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
- // Firefox
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0',
- // Safari
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
- // Edge
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
-];
-function getRandomUserAgent() {
- return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
-}
-function sleep(ms) {
- return new Promise(resolve => setTimeout(resolve, ms));
-}
-/**
- * User Agent Rotation Middleware - rotates UA on each request for better evasion
- */
-class UserAgentMiddleware {
- name = 'UserAgentMiddleware';
- priority = 100;
- lastUserAgent = null;
- async processRequest(request) {
- // Always rotate UA on retries or bot detection
- const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
- if (!request.metadata.userAgent || forceRotation) {
- // Get a different UA than the last one used
- let newUA = getRandomUserAgent();
- let attempts = 0;
- while (newUA === this.lastUserAgent && attempts < 5) {
- newUA = getRandomUserAgent();
- attempts++;
- }
- request.metadata.userAgent = newUA;
- this.lastUserAgent = newUA;
- if (forceRotation) {
- logger_1.logger.debug('scraper', `🔄 Rotated User-Agent: ${newUA.substring(0, 50)}...`);
- }
- }
- return request;
- }
-}
-exports.UserAgentMiddleware = UserAgentMiddleware;
-// Domains that should skip proxy (datacenter IPs are blocked)
-const PROXY_SKIP_DOMAINS = [
- 'dutchie.com',
-];
-function shouldSkipProxy(url) {
- try {
- const urlObj = new URL(url);
- return PROXY_SKIP_DOMAINS.some(domain => urlObj.hostname.includes(domain));
- }
- catch {
- return false;
- }
-}
-/**
- * Proxy Rotation Middleware - uses the central proxy service with timeout handling
- */
-class ProxyMiddleware {
- name = 'ProxyMiddleware';
- priority = 90;
- currentProxyId = null;
- async processRequest(request) {
- // Skip proxy for domains that block datacenter IPs
- if (shouldSkipProxy(request.url)) {
- logger_1.logger.info('scraper', `⏭️ Skipping proxy for ${new URL(request.url).hostname} (datacenter IPs blocked)`);
- return request;
- }
- // Always try to use a proxy from the central proxy service
- // The service handles bot detection timeouts automatically
- const forceRotation = request.retryCount > 0 || request.metadata.botDetected;
- if (!request.metadata.proxy || forceRotation) {
- // Get proxy from central service - it handles timeouts automatically
- const proxy = await (0, proxy_1.getActiveProxy)();
- if (proxy) {
- request.metadata.proxy = {
- host: proxy.host,
- port: proxy.port,
- protocol: proxy.protocol,
- username: proxy.username,
- password: proxy.password,
- };
- request.metadata.proxyId = proxy.id;
- this.currentProxyId = proxy.id;
- const reason = forceRotation ? 'rotation' : 'initial';
- logger_1.logger.info('scraper', `🔄 Using proxy (${reason}): ${proxy.protocol}://${proxy.host}:${proxy.port}`);
- }
- else {
- logger_1.logger.warn('scraper', '⚠️ No proxy available - running without proxy');
- }
- }
- return request;
- }
- async processResponse(response) {
- // If bot detection was triggered, put the proxy in timeout
- if (response.request.metadata.botDetected && response.request.metadata.proxyId) {
- (0, proxy_1.putProxyInTimeout)(response.request.metadata.proxyId, 'Bot detection triggered');
- logger_1.logger.info('scraper', `🚫 Proxy ${response.request.metadata.proxyId} put in timeout due to bot detection`);
- }
- return response;
- }
- async processError(error, request) {
- // If bot detection error, put proxy in timeout
- if ((0, proxy_1.isBotDetectionError)(error.message) && request.metadata.proxyId) {
- (0, proxy_1.putProxyInTimeout)(request.metadata.proxyId, error.message);
- logger_1.logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`);
- }
- return error;
- }
-}
-exports.ProxyMiddleware = ProxyMiddleware;
-/**
- * Rate Limiting Middleware with Adaptive Delays
- */
-class RateLimitMiddleware {
- name = 'RateLimitMiddleware';
- priority = 80;
- requestTimes = [];
- errorCount = 0;
- baseDelay = 2000; // 2 seconds base delay
- maxDelay = 30000; // 30 seconds max
- async processRequest(request) {
- await this.waitForNextRequest();
- return request;
- }
- async processResponse(response) {
- // Record success - gradually reduce error count
- this.errorCount = Math.max(0, this.errorCount - 1);
- return response;
- }
- async processError(error) {
- // Record error - increase delay
- this.errorCount++;
- return error;
- }
- async waitForNextRequest() {
- // Calculate adaptive delay based on error count
- const errorMultiplier = Math.pow(1.5, Math.min(this.errorCount, 5));
- const adaptiveDelay = Math.min(this.baseDelay * errorMultiplier, this.maxDelay);
- // Add random jitter (±20%)
- const jitter = (Math.random() - 0.5) * 0.4 * adaptiveDelay;
- const delay = adaptiveDelay + jitter;
- const now = Date.now();
- const lastRequest = this.requestTimes[this.requestTimes.length - 1] || 0;
- const timeSinceLast = now - lastRequest;
- if (timeSinceLast < delay) {
- const waitTime = delay - timeSinceLast;
- logger_1.logger.debug('scraper', `Rate limiting: waiting ${Math.round(waitTime)}ms`);
- await sleep(waitTime);
- }
- this.requestTimes.push(Date.now());
- this.cleanup();
- }
- cleanup() {
- // Keep only last minute of requests
- const cutoff = Date.now() - 60000;
- this.requestTimes = this.requestTimes.filter(t => t > cutoff);
- }
- setBaseDelay(ms) {
- this.baseDelay = ms;
- }
-}
-exports.RateLimitMiddleware = RateLimitMiddleware;
-/**
- * Retry Middleware with Exponential Backoff
- */
-class RetryMiddleware {
- name = 'RetryMiddleware';
- priority = 70;
- isRetryable(error) {
- const retryableErrors = [
- types_1.ErrorType.NETWORK_ERROR,
- types_1.ErrorType.TIMEOUT,
- types_1.ErrorType.SERVER_ERROR
- ];
- if ('type' in error) {
- return retryableErrors.includes(error.type);
- }
- // Check error message for common retryable patterns
- const message = error.message.toLowerCase();
- return (message.includes('timeout') ||
- message.includes('network') ||
- message.includes('econnreset') ||
- message.includes('econnrefused') ||
- message.includes('500') ||
- message.includes('502') ||
- message.includes('503'));
- }
- async processError(error, request) {
- if (!this.isRetryable(error)) {
- logger_1.logger.warn('scraper', `Non-retryable error for ${request.url}: ${error.message}`);
- return error;
- }
- if (request.retryCount < request.maxRetries) {
- // Calculate backoff delay
- const backoffDelay = Math.min(1000 * Math.pow(2, request.retryCount), 30000);
- logger_1.logger.info('scraper', `Retry ${request.retryCount + 1}/${request.maxRetries} for ${request.url} after ${backoffDelay}ms`);
- await sleep(backoffDelay);
- // Return null to indicate retry should happen
- return null;
- }
- logger_1.logger.error('scraper', `Max retries exceeded for ${request.url}`);
- return error;
- }
-}
-exports.RetryMiddleware = RetryMiddleware;
-/**
- * Bot Detection Middleware - detects bot blocking and triggers fingerprint rotation
- */
-class BotDetectionMiddleware {
- name = 'BotDetectionMiddleware';
- priority = 60;
- detectedCount = 0;
- DETECTION_THRESHOLD = 3;
- // Export for use by other middlewares
- static shouldRotateFingerprint = false;
- async processResponse(response) {
- const content = typeof response.content === 'string'
- ? response.content
- : JSON.stringify(response.content);
- // Check for bot detection indicators
- const botIndicators = [
- /captcha/i,
- /cloudflare/i,
- /access denied/i,
- /you have been blocked/i,
- /unusual traffic/i,
- /robot/i,
- /verify.*human/i,
- /security check/i,
- /please wait/i,
- /checking your browser/i,
- /ray id/i
- ];
- const detected = botIndicators.some(pattern => pattern.test(content));
- if (detected) {
- this.detectedCount++;
- BotDetectionMiddleware.shouldRotateFingerprint = true;
- logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`);
- logger_1.logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request');
- // Mark the request for rotation on retry
- response.request.metadata.botDetected = true;
- response.request.metadata.needsNewBrowser = true;
- if (this.detectedCount >= this.DETECTION_THRESHOLD) {
- const error = new Error('Bot detection threshold reached - rotating fingerprint');
- error.type = types_1.ErrorType.BOT_DETECTION;
- error.retryable = true;
- error.request = response.request;
- throw error;
- }
- }
- else {
- // Gradually decrease detection count on successful requests
- this.detectedCount = Math.max(0, this.detectedCount - 0.5);
- BotDetectionMiddleware.shouldRotateFingerprint = false;
- }
- return response;
- }
- async processError(error, request) {
- // If bot detection error, flag for rotation and allow retry
- if ('type' in error && error.type === types_1.ErrorType.BOT_DETECTION) {
- request.metadata.botDetected = true;
- request.metadata.needsNewBrowser = true;
- logger_1.logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry');
- // Add delay before retry to avoid rate limiting
- await sleep(5000 + Math.random() * 5000);
- return null; // Return null to trigger retry
- }
- return error;
- }
-}
-exports.BotDetectionMiddleware = BotDetectionMiddleware;
-/**
- * Stealth Mode Middleware
- */
-class StealthMiddleware {
- name = 'StealthMiddleware';
- priority = 95;
- async processRequest(request) {
- // Flag that this request needs stealth mode
- request.metadata.requiresStealth = true;
- return request;
- }
-}
-exports.StealthMiddleware = StealthMiddleware;
-/**
- * Middleware Engine to orchestrate all middlewares
- */
-class MiddlewareEngine {
- middlewares = [];
- use(middleware) {
- this.middlewares.push(middleware);
- // Sort by priority (higher first)
- this.middlewares.sort((a, b) => b.priority - a.priority);
- }
- async processRequest(request) {
- let current = request;
- for (const middleware of this.middlewares) {
- if (middleware.processRequest) {
- current = await middleware.processRequest(current);
- }
- }
- return current;
- }
- async processResponse(response) {
- let current = response;
- for (const middleware of this.middlewares) {
- if (middleware.processResponse) {
- current = await middleware.processResponse(current);
- }
- }
- return current;
- }
- async processError(error, request) {
- let currentError = error;
- for (const middleware of this.middlewares) {
- if (middleware.processError && currentError) {
- currentError = await middleware.processError(currentError, request);
- if (currentError === null) {
- // Middleware handled the error (e.g., retry)
- break;
- }
- }
- }
- return currentError;
- }
-}
-exports.MiddlewareEngine = MiddlewareEngine;
diff --git a/backend/dist/scraper-v2/navigation.js b/backend/dist/scraper-v2/navigation.js
deleted file mode 100644
index f7a7a66a..00000000
--- a/backend/dist/scraper-v2/navigation.js
+++ /dev/null
@@ -1,278 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.NavigationDiscovery = void 0;
-const migrate_1 = require("../db/migrate");
-const logger_1 = require("../services/logger");
-/**
- * Navigation Discovery - finds and builds category structure
- */
-class NavigationDiscovery {
- downloader;
- constructor(downloader) {
- this.downloader = downloader;
- }
- /**
- * Discover categories from a store's main page
- */
- async discoverCategories(storeId) {
- logger_1.logger.info('categories', `Starting category discovery for store ${storeId}`);
- try {
- // Get store info
- const storeResult = await migrate_1.pool.query(`
- SELECT id, name, slug, dutchie_url
- FROM stores
- WHERE id = $1
- `, [storeId]);
- if (storeResult.rows.length === 0) {
- throw new Error('Store not found');
- }
- const store = storeResult.rows[0];
- const baseUrl = store.dutchie_url;
- // Create request to fetch the main page
- const request = {
- url: baseUrl,
- priority: 100,
- retryCount: 0,
- maxRetries: 3,
- metadata: {
- requiresBrowser: true,
- requiresStealth: true
- },
- callback: async () => ({ items: [], requests: [] })
- };
- // Fetch the page
- const response = await this.downloader.fetch(request);
- // Extract navigation links
- const page = await this.downloader.getCurrentPage();
- if (!page) {
- throw new Error('No active page for navigation extraction');
- }
- const links = await this.extractNavigationLinks(page, baseUrl);
- logger_1.logger.info('categories', `Found ${links.length} navigation links`);
- // Check if it's a Dutchie menu
- const isDutchie = await this.isDutchieMenu(page);
- if (isDutchie) {
- logger_1.logger.info('categories', 'Detected Dutchie menu - using predefined structure');
- await this.createDutchieCategories(storeId, store, links);
- }
- else {
- logger_1.logger.info('categories', 'Custom menu detected - extracting from navigation');
- await this.createCustomCategories(storeId, store, links);
- }
- logger_1.logger.info('categories', `✅ Category discovery completed for ${store.name}`);
- }
- catch (error) {
- logger_1.logger.error('categories', `Category discovery failed: ${error}`);
- throw error;
- }
- }
- /**
- * Extract navigation links from page
- */
- async extractNavigationLinks(page, baseUrl) {
- return await page.evaluate((base) => {
- const links = [];
- // Look for navigation elements
- const navSelectors = [
- 'nav a',
- '[role="navigation"] a',
- '[class*="nav"] a',
- '[class*="menu"] a',
- '[class*="category"] a',
- 'header a'
- ];
- const foundLinks = new Set();
- for (const selector of navSelectors) {
- // @ts-ignore - runs in browser context
- const elements = document.querySelectorAll(selector);
- elements.forEach((el) => {
- const text = el.textContent?.trim();
- let href = el.href || el.getAttribute('href');
- if (!text || !href || text.length < 2)
- return;
- // Normalize href
- if (href.startsWith('/')) {
- // @ts-ignore - runs in browser context
- const url = new URL(base);
- href = `${url.origin}${href}`;
- }
- // Skip external links and anchors
- if (!href.includes(base) || href.includes('#'))
- return;
- // Skip duplicates
- const linkKey = `${text}:${href}`;
- if (foundLinks.has(linkKey))
- return;
- foundLinks.add(linkKey);
- // Determine if it's likely a category
- const categoryKeywords = [
- 'flower', 'pre-roll', 'vape', 'edible', 'concentrate',
- 'topical', 'accessory', 'brand', 'special', 'shop',
- 'indica', 'sativa', 'hybrid', 'cbd', 'thc'
- ];
- const isCategory = categoryKeywords.some(kw => text.toLowerCase().includes(kw) ||
- href.toLowerCase().includes(kw));
- links.push({
- text,
- href,
- isCategory
- });
- });
- }
- return links;
- }, baseUrl);
- }
- /**
- * Check if it's a Dutchie menu
- */
- async isDutchieMenu(page) {
- return await page.evaluate(() => {
- // Check for Dutchie markers
- // @ts-ignore - runs in browser context
- if (window.reactEnv) {
- // @ts-ignore - runs in browser context
- const env = window.reactEnv;
- if (env.adminUrl?.includes('dutchie.com') ||
- env.apiUrl?.includes('dutchie.com') ||
- env.consumerUrl?.includes('dutchie.com')) {
- return true;
- }
- }
- // @ts-ignore - runs in browser context
- const htmlContent = document.documentElement.innerHTML;
- return (htmlContent.includes('admin.dutchie.com') ||
- htmlContent.includes('api.dutchie.com') ||
- htmlContent.includes('embedded-menu') ||
- htmlContent.includes('window.reactEnv'));
- });
- }
- /**
- * Create categories for Dutchie menus (predefined structure)
- * Uses your existing Dutchie category structure
- */
- async createDutchieCategories(storeId, store, discoveredLinks) {
- const client = await migrate_1.pool.connect();
- try {
- await client.query('BEGIN');
- logger_1.logger.info('categories', `Creating predefined Dutchie category structure`);
- const baseUrl = store.dutchie_url;
- // Your existing Dutchie categories structure
- const DUTCHIE_CATEGORIES = [
- { name: 'Shop', slug: 'shop', parentSlug: undefined },
- { name: 'Flower', slug: 'flower', parentSlug: 'shop' },
- { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
- { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
- { name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
- { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
- { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
- { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' },
- { name: 'Brands', slug: 'brands', parentSlug: undefined },
- { name: 'Specials', slug: 'specials', parentSlug: undefined }
- ];
- for (const category of DUTCHIE_CATEGORIES) {
- let categoryUrl;
- if (category.parentSlug) {
- // Subcategory: /embedded-menu/{slug}/shop/flower
- categoryUrl = `${baseUrl}/${category.parentSlug}/${category.slug}`;
- }
- else {
- // Top-level: /embedded-menu/{slug}/shop
- categoryUrl = `${baseUrl}/${category.slug}`;
- }
- const path = category.parentSlug ? `${category.parentSlug}/${category.slug}` : category.slug;
- if (!category.parentSlug) {
- // Create parent category
- await client.query(`
- INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
- VALUES ($1, $2, $3, $4, $5, true, NULL)
- ON CONFLICT (store_id, slug)
- DO UPDATE SET name = $2, dutchie_url = $4, path = $5
- RETURNING id
- `, [storeId, category.name, category.slug, categoryUrl, path]);
- logger_1.logger.info('categories', `📁 ${category.name}`);
- }
- else {
- // Create subcategory
- const parentResult = await client.query(`
- SELECT id FROM categories
- WHERE store_id = $1 AND slug = $2
- `, [storeId, category.parentSlug]);
- if (parentResult.rows.length > 0) {
- const parentId = parentResult.rows[0].id;
- await client.query(`
- INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
- VALUES ($1, $2, $3, $4, $5, true, $6)
- ON CONFLICT (store_id, slug)
- DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6
- `, [storeId, category.name, category.slug, categoryUrl, path, parentId]);
- logger_1.logger.info('categories', ` └── ${category.name}`);
- }
- }
- }
- await client.query('COMMIT');
- logger_1.logger.info('categories', `✅ Created ${DUTCHIE_CATEGORIES.length} Dutchie categories successfully`);
- }
- catch (error) {
- await client.query('ROLLBACK');
- logger_1.logger.error('categories', `Failed to create Dutchie categories: ${error}`);
- throw error;
- }
- finally {
- client.release();
- }
- }
- /**
- * Create categories from discovered links (custom menus)
- */
- async createCustomCategories(storeId, store, links) {
- const client = await migrate_1.pool.connect();
- try {
- await client.query('BEGIN');
- // Filter to likely category links
- const categoryLinks = links.filter(link => link.isCategory);
- let displayOrder = 0;
- for (const link of categoryLinks) {
- // Generate slug from text
- const slug = link.text
- .toLowerCase()
- .replace(/[^a-z0-9]+/g, '-')
- .replace(/^-|-$/g, '');
- // Determine path from URL
- const url = new URL(link.href);
- const path = url.pathname.replace(/^\//, '');
- await client.query(`
- INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, display_order)
- VALUES ($1, $2, $3, $4, $5, true, $6)
- ON CONFLICT (store_id, slug)
- DO UPDATE SET name = $2, dutchie_url = $4, path = $5, display_order = $6
- `, [storeId, link.text, slug, link.href, path, displayOrder++]);
- logger_1.logger.info('categories', `📁 ${link.text} -> ${link.href}`);
- }
- await client.query('COMMIT');
- logger_1.logger.info('categories', `✅ Created ${categoryLinks.length} custom categories`);
- }
- catch (error) {
- await client.query('ROLLBACK');
- throw error;
- }
- finally {
- client.release();
- }
- }
- /**
- * Update display_order column in categories table
- */
- async ensureDisplayOrderColumn() {
- try {
- await migrate_1.pool.query(`
- ALTER TABLE categories
- ADD COLUMN IF NOT EXISTS display_order INTEGER DEFAULT 0
- `);
- logger_1.logger.info('categories', 'Ensured display_order column exists');
- }
- catch (error) {
- logger_1.logger.warn('categories', `Could not add display_order column: ${error}`);
- }
- }
-}
-exports.NavigationDiscovery = NavigationDiscovery;
diff --git a/backend/dist/scraper-v2/pipelines.js b/backend/dist/scraper-v2/pipelines.js
deleted file mode 100644
index ce5c74ff..00000000
--- a/backend/dist/scraper-v2/pipelines.js
+++ /dev/null
@@ -1,459 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.PipelineEngine = exports.StatsPipeline = exports.DatabasePipeline = exports.ImagePipeline = exports.DeduplicationPipeline = exports.SanitizationPipeline = exports.ValidationPipeline = void 0;
-const logger_1 = require("../services/logger");
-const migrate_1 = require("../db/migrate");
-const minio_1 = require("../utils/minio");
-const product_normalizer_1 = require("../utils/product-normalizer");
-/**
- * Validation Pipeline - ensures data quality
- */
-class ValidationPipeline {
- name = 'ValidationPipeline';
- priority = 100;
- async process(item, spider) {
- // Required fields
- if (!item.name || item.name.trim().length < 2) {
- logger_1.logger.warn('pipeline', `Dropping product: invalid name`);
- return null;
- }
- if (!item.dutchieUrl) {
- logger_1.logger.warn('pipeline', `Dropping product ${item.name}: no URL`);
- return null;
- }
- // Validate numeric fields
- if (item.price !== undefined && (item.price < 0 || item.price > 10000)) {
- logger_1.logger.warn('pipeline', `Invalid price for ${item.name}: ${item.price}`);
- item.price = undefined;
- }
- if (item.thcPercentage !== undefined && (item.thcPercentage < 0 || item.thcPercentage > 100)) {
- logger_1.logger.warn('pipeline', `Invalid THC for ${item.name}: ${item.thcPercentage}`);
- item.thcPercentage = undefined;
- }
- if (item.cbdPercentage !== undefined && (item.cbdPercentage < 0 || item.cbdPercentage > 100)) {
- logger_1.logger.warn('pipeline', `Invalid CBD for ${item.name}: ${item.cbdPercentage}`);
- item.cbdPercentage = undefined;
- }
- return item;
- }
-}
-exports.ValidationPipeline = ValidationPipeline;
-/**
- * Sanitization Pipeline - cleans and normalizes data
- */
-class SanitizationPipeline {
- name = 'SanitizationPipeline';
- priority = 90;
- async process(item, spider) {
- // Truncate long strings
- if (item.name) {
- item.name = item.name.substring(0, 500).trim();
- }
- if (item.description) {
- item.description = item.description.substring(0, 5000).trim();
- }
- if (item.brand) {
- item.brand = item.brand.substring(0, 255).trim();
- }
- if (item.weight) {
- item.weight = item.weight.substring(0, 100).trim();
- }
- // Normalize strain type
- if (item.strainType) {
- const normalized = item.strainType.toLowerCase();
- if (normalized.includes('indica')) {
- item.strainType = 'Indica';
- }
- else if (normalized.includes('sativa')) {
- item.strainType = 'Sativa';
- }
- else if (normalized.includes('hybrid')) {
- item.strainType = 'Hybrid';
- }
- else {
- item.strainType = undefined;
- }
- }
- // Clean up metadata
- if (item.metadata) {
- // Remove empty arrays
- Object.keys(item.metadata).forEach(key => {
- if (Array.isArray(item.metadata[key]) && item.metadata[key].length === 0) {
- delete item.metadata[key];
- }
- });
- }
- return item;
- }
-}
-exports.SanitizationPipeline = SanitizationPipeline;
-/**
- * Deduplication Pipeline - prevents duplicate items
- */
-class DeduplicationPipeline {
- name = 'DeduplicationPipeline';
- priority = 80;
- seen = new Set();
- async process(item, spider) {
- const fingerprint = `${item.dutchieProductId}`;
- if (this.seen.has(fingerprint)) {
- logger_1.logger.debug('pipeline', `Duplicate product detected: ${item.name}`);
- return null;
- }
- this.seen.add(fingerprint);
- return item;
- }
- clear() {
- this.seen.clear();
- }
-}
-exports.DeduplicationPipeline = DeduplicationPipeline;
-/**
- * Image Processing Pipeline - handles image downloads
- */
-class ImagePipeline {
- name = 'ImagePipeline';
- priority = 70;
- extractImageId(url) {
- try {
- const match = url.match(/images\.dutchie\.com\/([a-f0-9]+)/i);
- return match ? match[1] : null;
- }
- catch (e) {
- return null;
- }
- }
- getFullSizeImageUrl(imageUrl) {
- const imageId = this.extractImageId(imageUrl);
- if (!imageId)
- return imageUrl;
- return `https://images.dutchie.com/${imageId}?auto=format&fit=max&q=95&w=2000&h=2000`;
- }
- async process(item, spider) {
- if (item.imageUrl) {
- // Convert to full-size URL
- item.imageUrl = this.getFullSizeImageUrl(item.imageUrl);
- }
- return item;
- }
-}
-exports.ImagePipeline = ImagePipeline;
-/**
- * Generate a URL-safe slug from a product name
- */
-function generateSlug(name) {
- return name
- .toLowerCase()
- .replace(/[^a-z0-9]+/g, '-')
- .replace(/^-+|-+$/g, '')
- .substring(0, 400);
-}
-/**
- * Database Pipeline - saves items to database with improved matching
- *
- * MATCHING PRIORITY:
- * 1. external_id (dutchie_product_id) - exact match
- * 2. normalized name + brand + category - strong match
- * 3. normalized name + category - weak match (same product, different/missing brand)
- *
- * ALWAYS creates a snapshot after upsert for historical tracking.
- */
-class DatabasePipeline {
- name = 'DatabasePipeline';
- priority = 10; // Low priority - runs last
- crawlId = null;
- setCrawlId(id) {
- this.crawlId = id;
- }
- async process(item, spider) {
- const client = await migrate_1.pool.connect();
- try {
- // Extract store and category from metadata (set by spider)
- const storeId = item.storeId;
- const categoryId = item.categoryId;
- const dispensaryId = item.dispensaryId;
- const categoryName = item.categoryName;
- // Generate normalized values for matching
- const nameNormalized = (0, product_normalizer_1.normalizeProductName)(item.name);
- const brandNormalized = (0, product_normalizer_1.normalizeBrandName)(item.brand);
- const slug = generateSlug(item.name);
- const externalId = item.dutchieProductId || null;
- if (!storeId || !categoryId) {
- logger_1.logger.error('pipeline', `Missing storeId or categoryId for ${item.name}`);
- return null;
- }
- let productId = null;
- let localImagePath = null;
- let isNewProduct = false;
- // STEP 1: Try to match by external_id (most reliable)
- if (externalId) {
- const extMatch = await client.query(`
- SELECT id, image_url, local_image_path
- FROM products
- WHERE store_id = $1 AND (external_id = $2 OR dutchie_product_id = $2)
- `, [storeId, externalId]);
- if (extMatch.rows.length > 0) {
- productId = extMatch.rows[0].id;
- localImagePath = extMatch.rows[0].local_image_path;
- logger_1.logger.debug('pipeline', `Matched by external_id: ${item.name}`);
- }
- }
- // STEP 2: Try to match by normalized name + brand + category
- if (!productId) {
- const normMatch = await client.query(`
- SELECT id, image_url, local_image_path
- FROM products
- WHERE store_id = $1
- AND name_normalized = $2
- AND brand_normalized = $3
- AND category_id = $4
- `, [storeId, nameNormalized, brandNormalized, categoryId]);
- if (normMatch.rows.length > 0) {
- productId = normMatch.rows[0].id;
- localImagePath = normMatch.rows[0].local_image_path;
- logger_1.logger.debug('pipeline', `Matched by normalized name+brand+category: ${item.name}`);
- }
- }
- // STEP 3: Fallback to normalized name + category only (weaker match)
- if (!productId) {
- const weakMatch = await client.query(`
- SELECT id, image_url, local_image_path
- FROM products
- WHERE store_id = $1
- AND name_normalized = $2
- AND category_id = $3
- LIMIT 1
- `, [storeId, nameNormalized, categoryId]);
- if (weakMatch.rows.length === 1) {
- productId = weakMatch.rows[0].id;
- localImagePath = weakMatch.rows[0].local_image_path;
- logger_1.logger.debug('pipeline', `Matched by normalized name+category: ${item.name}`);
- }
- }
- // STEP 4: Final fallback - exact name match (legacy compatibility)
- if (!productId) {
- const exactMatch = await client.query(`
- SELECT id, image_url, local_image_path
- FROM products
- WHERE store_id = $1 AND name = $2 AND category_id = $3
- `, [storeId, item.name, categoryId]);
- if (exactMatch.rows.length > 0) {
- productId = exactMatch.rows[0].id;
- localImagePath = exactMatch.rows[0].local_image_path;
- logger_1.logger.debug('pipeline', `Matched by exact name: ${item.name}`);
- }
- }
- // UPDATE or INSERT
- if (productId) {
- // Update existing product
- await client.query(`
- UPDATE products
- SET name = $1, description = $2, price = $3,
- strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
- brand = $7, weight = $8, image_url = COALESCE($9, image_url), dutchie_url = $10,
- in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
- updated_at = CURRENT_TIMESTAMP, dispensary_id = $13, slug = COALESCE(slug, $14),
- name_normalized = $15, brand_normalized = $16,
- external_id = COALESCE(external_id, $17), source_platform = COALESCE(source_platform, 'dutchie')
- WHERE id = $12
- `, [
- item.name, item.description, item.price,
- item.strainType, item.thcPercentage, item.cbdPercentage,
- item.brand, item.weight, item.imageUrl, item.dutchieUrl,
- JSON.stringify(item.metadata || {}), productId, dispensaryId, slug,
- nameNormalized, brandNormalized, externalId
- ]);
- logger_1.logger.debug('pipeline', `Updated product: ${item.name}`);
- }
- else {
- // Insert new product
- isNewProduct = true;
- const insertResult = await client.query(`
- INSERT INTO products (
- store_id, category_id, dispensary_id, dutchie_product_id, external_id,
- slug, name, name_normalized, description,
- price, strain_type, thc_percentage, cbd_percentage,
- brand, brand_normalized, weight, image_url, dutchie_url, in_stock, metadata,
- source_platform
- ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, true, $19, 'dutchie')
- RETURNING id
- `, [
- storeId, categoryId, dispensaryId, externalId, externalId,
- slug, item.name, nameNormalized, item.description,
- item.price, item.strainType, item.thcPercentage, item.cbdPercentage,
- item.brand, brandNormalized, item.weight, item.imageUrl, item.dutchieUrl,
- JSON.stringify(item.metadata || {})
- ]);
- productId = insertResult.rows[0].id;
- logger_1.logger.debug('pipeline', `Inserted NEW product: ${item.name}`);
- }
- // ALWAYS create a snapshot for historical tracking
- await this.createSnapshot(client, {
- productId: productId,
- dispensaryId,
- externalId,
- slug,
- item,
- categoryName
- });
- // Download image if needed (only for new products or missing local image)
- if (item.imageUrl && !localImagePath && productId) {
- try {
- const storeResult = await client.query('SELECT slug FROM stores WHERE id = $1', [storeId]);
- const storeSlug = storeResult.rows[0]?.slug || undefined;
- const imageSizes = await (0, minio_1.uploadImageFromUrl)(item.imageUrl, productId, storeSlug);
- localImagePath = imageSizes.thumbnail;
- await client.query(`
- UPDATE products SET local_image_path = $1 WHERE id = $2
- `, [imageSizes.thumbnail, productId]);
- logger_1.logger.debug('pipeline', `Downloaded image for: ${item.name}`);
- }
- catch (error) {
- logger_1.logger.error('pipeline', `Failed to download image for ${item.name}: ${error}`);
- }
- }
- // Attach metadata for stats tracking
- item.isNewProduct = isNewProduct;
- item.productId = productId;
- return item;
- }
- catch (error) {
- logger_1.logger.error('pipeline', `Failed to save product ${item.name}: ${error}`);
- return null;
- }
- finally {
- client.release();
- }
- }
- /**
- * Create a snapshot record for historical tracking
- */
- async createSnapshot(client, params) {
- try {
- // Only create snapshots if the table exists (graceful degradation)
- const tableExists = await client.query(`
- SELECT EXISTS (
- SELECT FROM information_schema.tables
- WHERE table_name = 'product_snapshots'
- )
- `);
- if (!tableExists.rows[0].exists) {
- return; // Snapshot table not yet created
- }
- const crawlId = this.crawlId || crypto.randomUUID();
- const { productId, dispensaryId, externalId, slug, item, categoryName } = params;
- await client.query(`
- INSERT INTO product_snapshots (
- crawl_id, dispensary_id, external_product_id, product_slug,
- name, brand, category, price, original_price, sale_price,
- discount_type, discount_value, availability_status, stock_quantity,
- thc_percentage, cbd_percentage, strain_type, weight, variant,
- description, image_url, effects, terpenes, captured_at
- ) VALUES (
- $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, NOW()
- )
- `, [
- crawlId,
- dispensaryId,
- externalId,
- slug,
- item.name,
- item.brand || null,
- categoryName || null,
- item.price || null,
- item.originalPrice || null,
- item.metadata?.salePrice || null,
- item.metadata?.discountType || null,
- item.metadata?.discountValue || null,
- 'in_stock', // availability_status - if we scraped it, it's in stock
- item.metadata?.stockQuantity || null,
- item.thcPercentage || null,
- item.cbdPercentage || null,
- item.strainType || null,
- item.weight || null,
- item.metadata?.variant || null,
- item.description || null,
- item.imageUrl || null,
- item.metadata?.effects || null,
- item.metadata?.terpenes || null
- ]);
- }
- catch (error) {
- // Don't fail the whole pipeline if snapshot creation fails
- logger_1.logger.warn('pipeline', `Failed to create snapshot for ${params.item.name}: ${error}`);
- }
- }
-}
-exports.DatabasePipeline = DatabasePipeline;
-/**
- * Stats Pipeline - tracks statistics
- */
-class StatsPipeline {
- name = 'StatsPipeline';
- priority = 50;
- stats = {
- total: 0,
- withImages: 0,
- withThc: 0,
- withCbd: 0,
- withDescription: 0
- };
- async process(item, spider) {
- this.stats.total++;
- if (item.imageUrl)
- this.stats.withImages++;
- if (item.thcPercentage)
- this.stats.withThc++;
- if (item.cbdPercentage)
- this.stats.withCbd++;
- if (item.description)
- this.stats.withDescription++;
- return item;
- }
- getStats() {
- return { ...this.stats };
- }
- clear() {
- this.stats = {
- total: 0,
- withImages: 0,
- withThc: 0,
- withCbd: 0,
- withDescription: 0
- };
- }
-}
-exports.StatsPipeline = StatsPipeline;
-/**
- * Pipeline Engine - orchestrates all pipelines
- */
-class PipelineEngine {
- pipelines = [];
- use(pipeline) {
- this.pipelines.push(pipeline);
- // Sort by priority (higher first)
- this.pipelines.sort((a, b) => b.priority - a.priority);
- }
- async processItem(item, spider) {
- let current = item;
- for (const pipeline of this.pipelines) {
- try {
- current = await pipeline.process(current, spider);
- if (!current) {
- // Item was filtered out
- logger_1.logger.debug('pipeline', `Item filtered by ${pipeline.name}`);
- return null;
- }
- }
- catch (error) {
- logger_1.logger.error('pipeline', `Error in ${pipeline.name}: ${error}`);
- // Continue with other pipelines
- }
- }
- return current;
- }
- getPipeline(name) {
- return this.pipelines.find(p => p.name === name);
- }
-}
-exports.PipelineEngine = PipelineEngine;
diff --git a/backend/dist/scraper-v2/scheduler.js b/backend/dist/scraper-v2/scheduler.js
deleted file mode 100644
index cb911427..00000000
--- a/backend/dist/scraper-v2/scheduler.js
+++ /dev/null
@@ -1,136 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.RequestScheduler = void 0;
-const logger_1 = require("../services/logger");
-const crypto_1 = __importDefault(require("crypto"));
-class RequestScheduler {
- queue = [];
- inProgress = new Set();
- seen = new Set();
- deduplicationEnabled = true;
- constructor(deduplicationEnabled = true) {
- this.deduplicationEnabled = deduplicationEnabled;
- }
- /**
- * Generate fingerprint for request deduplication
- */
- generateFingerprint(request) {
- if (request.fingerprint) {
- return request.fingerprint;
- }
- // Generate fingerprint based on URL and relevant metadata
- const data = {
- url: request.url,
- method: request.metadata?.method || 'GET',
- body: request.metadata?.body
- };
- return crypto_1.default.createHash('md5').update(JSON.stringify(data)).digest('hex');
- }
- /**
- * Add a request to the queue
- */
- enqueue(partialRequest) {
- if (!partialRequest.url) {
- logger_1.logger.warn('scraper', 'Cannot enqueue request without URL');
- return false;
- }
- const fingerprint = this.generateFingerprint(partialRequest);
- // Check for duplicates
- if (this.deduplicationEnabled && this.seen.has(fingerprint)) {
- logger_1.logger.debug('scraper', `Request already seen: ${partialRequest.url}`);
- return false;
- }
- // Create full request with defaults
- const request = {
- url: partialRequest.url,
- priority: partialRequest.priority ?? 0,
- retryCount: partialRequest.retryCount ?? 0,
- maxRetries: partialRequest.maxRetries ?? 3,
- metadata: partialRequest.metadata || {},
- callback: partialRequest.callback,
- errorHandler: partialRequest.errorHandler,
- fingerprint
- };
- this.queue.push(request);
- this.seen.add(fingerprint);
- // Sort by priority (higher priority first)
- this.queue.sort((a, b) => b.priority - a.priority);
- logger_1.logger.debug('scraper', `Enqueued: ${request.url} (priority: ${request.priority})`);
- return true;
- }
- /**
- * Get the next request from the queue
- */
- dequeue() {
- const request = this.queue.shift();
- if (request) {
- this.inProgress.add(request.fingerprint);
- }
- return request || null;
- }
- /**
- * Mark a request as complete
- */
- markComplete(request) {
- if (request.fingerprint) {
- this.inProgress.delete(request.fingerprint);
- }
- }
- /**
- * Requeue a failed request (for retry)
- */
- requeueForRetry(request) {
- if (request.fingerprint) {
- this.inProgress.delete(request.fingerprint);
- this.seen.delete(request.fingerprint);
- }
- request.retryCount++;
- if (request.retryCount > request.maxRetries) {
- logger_1.logger.warn('scraper', `Max retries exceeded for: ${request.url}`);
- return false;
- }
- // Decrease priority for retried requests
- request.priority = Math.max(0, request.priority - 1);
- return this.enqueue(request);
- }
- /**
- * Get queue stats
- */
- getStats() {
- return {
- pending: this.queue.length,
- inProgress: this.inProgress.size,
- total: this.seen.size
- };
- }
- /**
- * Check if queue is empty
- */
- isEmpty() {
- return this.queue.length === 0 && this.inProgress.size === 0;
- }
- /**
- * Clear all queues
- */
- clear() {
- this.queue = [];
- this.inProgress.clear();
- this.seen.clear();
- }
- /**
- * Get pending requests count
- */
- getPendingCount() {
- return this.queue.length;
- }
- /**
- * Get in-progress count
- */
- getInProgressCount() {
- return this.inProgress.size;
- }
-}
-exports.RequestScheduler = RequestScheduler;
diff --git a/backend/dist/scraper-v2/types.js b/backend/dist/scraper-v2/types.js
deleted file mode 100644
index 740be005..00000000
--- a/backend/dist/scraper-v2/types.js
+++ /dev/null
@@ -1,13 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.ErrorType = void 0;
-var ErrorType;
-(function (ErrorType) {
- ErrorType["NETWORK_ERROR"] = "NETWORK_ERROR";
- ErrorType["TIMEOUT"] = "TIMEOUT";
- ErrorType["PARSE_ERROR"] = "PARSE_ERROR";
- ErrorType["BOT_DETECTION"] = "BOT_DETECTION";
- ErrorType["NOT_FOUND"] = "NOT_FOUND";
- ErrorType["SERVER_ERROR"] = "SERVER_ERROR";
- ErrorType["UNKNOWN"] = "UNKNOWN";
-})(ErrorType || (exports.ErrorType = ErrorType = {}));
diff --git a/backend/dist/scrapers/dutchie-graphql-direct.js b/backend/dist/scrapers/dutchie-graphql-direct.js
deleted file mode 100644
index d8710717..00000000
--- a/backend/dist/scrapers/dutchie-graphql-direct.js
+++ /dev/null
@@ -1,360 +0,0 @@
-"use strict";
-// ============================================================================
-// DEPRECATED: This scraper writes to the LEGACY products table.
-// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
-//
-// New pipeline location: src/dutchie-az/services/product-crawler.ts
-// - Uses fetch-based GraphQL (no Puppeteer needed)
-// - Writes to isolated dutchie_az_* tables with snapshot model
-// - Tracks stockStatus, isPresentInFeed, missing_from_feed
-// ============================================================================
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.fetchAllDutchieProducts = fetchAllDutchieProducts;
-exports.upsertProductsDirect = upsertProductsDirect;
-exports.scrapeAllDutchieProducts = scrapeAllDutchieProducts;
-/**
- * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
- * This scraper writes to the legacy products table, not the new dutchie_az tables.
- *
- * Makes direct GraphQL requests from within the browser context to:
- * 1. Bypass Cloudflare (using browser session)
- * 2. Fetch ALL products including out-of-stock (Status: null)
- * 3. Paginate through complete menu
- */
-const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
-const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
-const dutchie_graphql_1 = require("./dutchie-graphql");
-puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
-// GraphQL persisted query hashes
-const GRAPHQL_HASHES = {
- FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
- GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
-};
-/**
- * Fetch all products via in-page GraphQL requests
- * This includes both in-stock and out-of-stock items
- */
-async function fetchAllDutchieProducts(menuUrl, options = {}) {
- const { headless = 'new', timeout = 90000, perPage = 100, includeOutOfStock = true, } = options;
- let browser;
- try {
- browser = await puppeteer_extra_1.default.launch({
- headless,
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-blink-features=AutomationControlled',
- ],
- });
- const page = await browser.newPage();
- // Stealth configuration
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
- await page.setViewport({ width: 1920, height: 1080 });
- await page.evaluateOnNewDocument(() => {
- Object.defineProperty(navigator, 'webdriver', { get: () => false });
- window.chrome = { runtime: {} };
- });
- // Navigate to menu page to establish session
- console.log('[DutchieGraphQL] Loading menu page to establish session...');
- await page.goto(menuUrl, {
- waitUntil: 'networkidle2',
- timeout,
- });
- // Get dispensary ID from page
- const dispensaryId = await page.evaluate(() => {
- const env = window.reactEnv;
- return env?.dispensaryId || env?.retailerId || '';
- });
- if (!dispensaryId) {
- throw new Error('Could not determine dispensaryId from page');
- }
- console.log(`[DutchieGraphQL] Dispensary ID: ${dispensaryId}`);
- // Fetch all products via in-page GraphQL requests
- const allProducts = [];
- let page_num = 0;
- let hasMore = true;
- while (hasMore) {
- console.log(`[DutchieGraphQL] Fetching page ${page_num} (perPage=${perPage})...`);
- const result = await page.evaluate(async (dispensaryId, page_num, perPage, includeOutOfStock, hash) => {
- const variables = {
- includeEnterpriseSpecials: false,
- productsFilter: {
- dispensaryId,
- pricingType: 'rec',
- Status: includeOutOfStock ? null : 'Active', // null = include out-of-stock
- types: [],
- useCache: false, // Don't cache to get fresh data
- isDefaultSort: true,
- sortBy: 'popularSortIdx',
- sortDirection: 1,
- bypassOnlineThresholds: true,
- isKioskMenu: false,
- removeProductsBelowOptionThresholds: false,
- },
- page: page_num,
- perPage,
- };
- const qs = new URLSearchParams({
- operationName: 'FilteredProducts',
- variables: JSON.stringify(variables),
- extensions: JSON.stringify({
- persistedQuery: { version: 1, sha256Hash: hash },
- }),
- });
- const response = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
- method: 'GET',
- headers: {
- 'content-type': 'application/json',
- 'apollographql-client-name': 'Marketplace (production)',
- },
- credentials: 'include', // Include cookies/session
- });
- if (!response.ok) {
- throw new Error(`HTTP ${response.status}`);
- }
- return response.json();
- }, dispensaryId, page_num, perPage, includeOutOfStock, GRAPHQL_HASHES.FilteredProducts);
- if (result.errors) {
- console.error('[DutchieGraphQL] GraphQL errors:', result.errors);
- break;
- }
- const products = result?.data?.filteredProducts?.products || [];
- console.log(`[DutchieGraphQL] Page ${page_num}: ${products.length} products`);
- if (products.length === 0) {
- hasMore = false;
- }
- else {
- allProducts.push(...products);
- page_num++;
- // Safety limit
- if (page_num > 50) {
- console.log('[DutchieGraphQL] Reached page limit, stopping');
- hasMore = false;
- }
- }
- }
- // Count active vs inactive
- const activeCount = allProducts.filter((p) => p.Status === 'Active').length;
- const inactiveCount = allProducts.filter((p) => p.Status !== 'Active').length;
- console.log(`[DutchieGraphQL] Total: ${allProducts.length} products (${activeCount} active, ${inactiveCount} inactive)`);
- return {
- products: allProducts,
- dispensaryId,
- totalProducts: allProducts.length,
- activeCount,
- inactiveCount,
- };
- }
- finally {
- if (browser) {
- await browser.close();
- }
- }
-}
-/**
- * Upsert products to database
- */
-async function upsertProductsDirect(pool, storeId, products) {
- const client = await pool.connect();
- let inserted = 0;
- let updated = 0;
- try {
- await client.query('BEGIN');
- for (const product of products) {
- const result = await client.query(`
- INSERT INTO products (
- store_id, external_id, slug, name, enterprise_product_id,
- brand, brand_external_id, brand_logo_url,
- subcategory, strain_type, canonical_category,
- price, rec_price, med_price, rec_special_price, med_special_price,
- is_on_special, special_name, discount_percent, special_data,
- sku, inventory_quantity, inventory_available, is_below_threshold, status,
- thc_percentage, cbd_percentage, cannabinoids,
- weight_mg, net_weight_value, net_weight_unit, options, raw_options,
- image_url, additional_images,
- is_featured, medical_only, rec_only,
- source_created_at, source_updated_at,
- description, raw_data,
- dutchie_url, last_seen_at, updated_at
- )
- VALUES (
- $1, $2, $3, $4, $5,
- $6, $7, $8,
- $9, $10, $11,
- $12, $13, $14, $15, $16,
- $17, $18, $19, $20,
- $21, $22, $23, $24, $25,
- $26, $27, $28,
- $29, $30, $31, $32, $33,
- $34, $35,
- $36, $37, $38,
- $39, $40,
- $41, $42,
- '', NOW(), NOW()
- )
- ON CONFLICT (store_id, slug) DO UPDATE SET
- name = EXCLUDED.name,
- enterprise_product_id = EXCLUDED.enterprise_product_id,
- brand = EXCLUDED.brand,
- brand_external_id = EXCLUDED.brand_external_id,
- brand_logo_url = EXCLUDED.brand_logo_url,
- subcategory = EXCLUDED.subcategory,
- strain_type = EXCLUDED.strain_type,
- canonical_category = EXCLUDED.canonical_category,
- price = EXCLUDED.price,
- rec_price = EXCLUDED.rec_price,
- med_price = EXCLUDED.med_price,
- rec_special_price = EXCLUDED.rec_special_price,
- med_special_price = EXCLUDED.med_special_price,
- is_on_special = EXCLUDED.is_on_special,
- special_name = EXCLUDED.special_name,
- discount_percent = EXCLUDED.discount_percent,
- special_data = EXCLUDED.special_data,
- sku = EXCLUDED.sku,
- inventory_quantity = EXCLUDED.inventory_quantity,
- inventory_available = EXCLUDED.inventory_available,
- is_below_threshold = EXCLUDED.is_below_threshold,
- status = EXCLUDED.status,
- thc_percentage = EXCLUDED.thc_percentage,
- cbd_percentage = EXCLUDED.cbd_percentage,
- cannabinoids = EXCLUDED.cannabinoids,
- weight_mg = EXCLUDED.weight_mg,
- net_weight_value = EXCLUDED.net_weight_value,
- net_weight_unit = EXCLUDED.net_weight_unit,
- options = EXCLUDED.options,
- raw_options = EXCLUDED.raw_options,
- image_url = EXCLUDED.image_url,
- additional_images = EXCLUDED.additional_images,
- is_featured = EXCLUDED.is_featured,
- medical_only = EXCLUDED.medical_only,
- rec_only = EXCLUDED.rec_only,
- source_created_at = EXCLUDED.source_created_at,
- source_updated_at = EXCLUDED.source_updated_at,
- description = EXCLUDED.description,
- raw_data = EXCLUDED.raw_data,
- last_seen_at = NOW(),
- updated_at = NOW()
- RETURNING (xmax = 0) AS was_inserted
- `, [
- storeId,
- product.external_id,
- product.slug,
- product.name,
- product.enterprise_product_id,
- product.brand,
- product.brand_external_id,
- product.brand_logo_url,
- product.subcategory,
- product.strain_type,
- product.canonical_category,
- product.price,
- product.rec_price,
- product.med_price,
- product.rec_special_price,
- product.med_special_price,
- product.is_on_special,
- product.special_name,
- product.discount_percent,
- product.special_data ? JSON.stringify(product.special_data) : null,
- product.sku,
- product.inventory_quantity,
- product.inventory_available,
- product.is_below_threshold,
- product.status,
- product.thc_percentage,
- product.cbd_percentage,
- product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
- product.weight_mg,
- product.net_weight_value,
- product.net_weight_unit,
- product.options,
- product.raw_options,
- product.image_url,
- product.additional_images,
- product.is_featured,
- product.medical_only,
- product.rec_only,
- product.source_created_at,
- product.source_updated_at,
- product.description,
- product.raw_data ? JSON.stringify(product.raw_data) : null,
- ]);
- if (result.rows[0]?.was_inserted) {
- inserted++;
- }
- else {
- updated++;
- }
- }
- await client.query('COMMIT');
- return { inserted, updated };
- }
- catch (error) {
- await client.query('ROLLBACK');
- throw error;
- }
- finally {
- client.release();
- }
-}
-/**
- * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
- * This function is disabled and will throw an error if called.
- * Main entry point - scrape all products including out-of-stock
- */
-async function scrapeAllDutchieProducts(pool, storeId, menuUrl) {
- // DEPRECATED: Throw error to prevent accidental use
- throw new Error('DEPRECATED: scrapeAllDutchieProducts() is deprecated. ' +
- 'Use src/dutchie-az/services/product-crawler.ts instead. ' +
- 'This scraper writes to the legacy products table.');
- // Original code below is unreachable but kept for reference
- try {
- console.log(`[DutchieGraphQL] Scraping ALL products (including out-of-stock): ${menuUrl}`);
- // Fetch all products via direct GraphQL
- const { products, totalProducts, activeCount, inactiveCount } = await fetchAllDutchieProducts(menuUrl, {
- includeOutOfStock: true,
- perPage: 100,
- });
- if (products.length === 0) {
- return {
- success: false,
- totalProducts: 0,
- activeCount: 0,
- inactiveCount: 0,
- inserted: 0,
- updated: 0,
- error: 'No products returned from GraphQL',
- };
- }
- // Normalize products
- const normalized = products.map(dutchie_graphql_1.normalizeDutchieProduct);
- // Upsert to database
- const { inserted, updated } = await upsertProductsDirect(pool, storeId, normalized);
- console.log(`[DutchieGraphQL] Complete: ${totalProducts} products (${activeCount} active, ${inactiveCount} inactive)`);
- console.log(`[DutchieGraphQL] Database: ${inserted} inserted, ${updated} updated`);
- return {
- success: true,
- totalProducts,
- activeCount,
- inactiveCount,
- inserted,
- updated,
- };
- }
- catch (error) {
- console.error(`[DutchieGraphQL] Error:`, error.message);
- return {
- success: false,
- totalProducts: 0,
- activeCount: 0,
- inactiveCount: 0,
- inserted: 0,
- updated: 0,
- error: error.message,
- };
- }
-}
diff --git a/backend/dist/scrapers/dutchie-graphql.js b/backend/dist/scrapers/dutchie-graphql.js
deleted file mode 100644
index d1dab343..00000000
--- a/backend/dist/scrapers/dutchie-graphql.js
+++ /dev/null
@@ -1,446 +0,0 @@
-"use strict";
-// ============================================================================
-// DEPRECATED: This scraper writes to the LEGACY products table.
-// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
-//
-// New pipeline location: src/dutchie-az/services/product-crawler.ts
-// - Uses fetch-based GraphQL (no Puppeteer needed)
-// - Writes to isolated dutchie_az_* tables with snapshot model
-// - Tracks stockStatus, isPresentInFeed, missing_from_feed
-//
-// The normalizer functions in this file (normalizeDutchieProduct) may still
-// be imported for reference, but do NOT call scrapeDutchieMenu() or upsertProducts().
-// ============================================================================
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.normalizeDutchieProduct = normalizeDutchieProduct;
-exports.fetchDutchieMenuViaPuppeteer = fetchDutchieMenuViaPuppeteer;
-exports.upsertProducts = upsertProducts;
-exports.scrapeDutchieMenu = scrapeDutchieMenu;
-/**
- * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
- * This scraper writes to the legacy products table, not the new dutchie_az tables.
- *
- * Fetches product data via Puppeteer interception of Dutchie's GraphQL API.
- * This bypasses Cloudflare by using a real browser to load the menu page.
- *
- * GraphQL Operations:
- * - FilteredProducts: Returns paginated product list with full details
- * - GetAddressBasedDispensaryData: Resolves dispensary cName to dispensaryId
- */
-const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
-const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
-puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
-// =====================================================
-// NORMALIZER: Dutchie GraphQL → DB Schema
-// =====================================================
-function normalizeDutchieProduct(product) {
- // Extract first special if exists
- const saleSpecial = product.specialData?.saleSpecials?.[0];
- // Calculate inventory from POSMetaData children
- const children = product.POSMetaData?.children || [];
- const totalQuantity = children.reduce((sum, c) => sum + (c.quantity || 0), 0);
- const availableQuantity = children.reduce((sum, c) => sum + (c.quantityAvailable || 0), 0);
- // Parse timestamps
- let sourceCreatedAt;
- if (product.createdAt) {
- // createdAt is a timestamp string like "1729044510543"
- const ts = parseInt(product.createdAt, 10);
- if (!isNaN(ts)) {
- sourceCreatedAt = new Date(ts);
- }
- }
- let sourceUpdatedAt;
- if (product.updatedAt) {
- sourceUpdatedAt = new Date(product.updatedAt);
- }
- return {
- // Identity
- external_id: product._id || product.id,
- slug: product.cName,
- name: product.Name,
- enterprise_product_id: product.enterpriseProductId,
- // Brand
- brand: product.brandName || product.brand?.name,
- brand_external_id: product.brandId || product.brand?.id,
- brand_logo_url: product.brandLogo || product.brand?.imageUrl,
- // Category
- subcategory: product.subcategory,
- strain_type: product.strainType,
- canonical_category: product.POSMetaData?.canonicalCategory,
- // Pricing
- price: product.Prices?.[0],
- rec_price: product.recPrices?.[0],
- med_price: product.medicalPrices?.[0],
- rec_special_price: product.recSpecialPrices?.[0],
- med_special_price: product.medicalSpecialPrices?.[0],
- // Specials
- is_on_special: product.special === true,
- special_name: saleSpecial?.specialName,
- discount_percent: saleSpecial?.percentDiscount ? saleSpecial.discount : undefined,
- special_data: product.specialData,
- // Inventory
- sku: product.POSMetaData?.canonicalSKU,
- inventory_quantity: totalQuantity || undefined,
- inventory_available: availableQuantity || undefined,
- is_below_threshold: product.isBelowThreshold === true,
- status: product.Status,
- // Potency
- thc_percentage: product.THCContent?.range?.[0],
- cbd_percentage: product.CBDContent?.range?.[0],
- cannabinoids: product.cannabinoidsV2,
- // Weight/Options
- weight_mg: product.weight,
- net_weight_value: product.measurements?.netWeight?.values?.[0],
- net_weight_unit: product.measurements?.netWeight?.unit,
- options: product.Options,
- raw_options: product.rawOptions,
- // Images
- image_url: product.Image,
- additional_images: product.images?.length ? product.images : undefined,
- // Flags
- is_featured: product.featured === true,
- medical_only: product.medicalOnly === true,
- rec_only: product.recOnly === true,
- // Timestamps
- source_created_at: sourceCreatedAt,
- source_updated_at: sourceUpdatedAt,
- // Description
- description: typeof product.description === 'string' ? product.description : undefined,
- // Raw
- raw_data: product,
- };
-}
-async function fetchDutchieMenuViaPuppeteer(menuUrl, options = {}) {
- const { headless = 'new', timeout = 90000, maxScrolls = 30, // Increased for full menu capture
- } = options;
- let browser;
- const capturedProducts = [];
- let dispensaryId = '';
- try {
- browser = await puppeteer_extra_1.default.launch({
- headless,
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-blink-features=AutomationControlled',
- ],
- });
- const page = await browser.newPage();
- // Stealth configuration
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
- await page.setViewport({ width: 1920, height: 1080 });
- await page.evaluateOnNewDocument(() => {
- Object.defineProperty(navigator, 'webdriver', { get: () => false });
- window.chrome = { runtime: {} };
- });
- // Track seen product IDs to avoid duplicates
- const seenIds = new Set();
- // Intercept GraphQL responses
- page.on('response', async (response) => {
- const url = response.url();
- if (!url.includes('graphql'))
- return;
- try {
- const contentType = response.headers()['content-type'] || '';
- if (!contentType.includes('application/json'))
- return;
- const data = await response.json();
- // Capture dispensary ID
- if (data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId) {
- dispensaryId = data.data.getAddressBasedDispensaryData.dispensaryData.dispensaryId;
- }
- // Capture products from FilteredProducts
- if (data?.data?.filteredProducts?.products) {
- const products = data.data.filteredProducts.products;
- for (const product of products) {
- if (!seenIds.has(product._id)) {
- seenIds.add(product._id);
- capturedProducts.push(product);
- }
- }
- }
- }
- catch {
- // Ignore parse errors
- }
- });
- // Navigate to menu
- console.log('[DutchieGraphQL] Loading menu page...');
- await page.goto(menuUrl, {
- waitUntil: 'networkidle2',
- timeout,
- });
- // Get dispensary ID from window.reactEnv if not captured
- if (!dispensaryId) {
- dispensaryId = await page.evaluate(() => {
- const env = window.reactEnv;
- return env?.dispensaryId || env?.retailerId || '';
- });
- }
- // Helper function to scroll through a page until no more products load
- async function scrollToLoadAll(maxScrollAttempts = maxScrolls) {
- let scrollCount = 0;
- let previousCount = 0;
- let noNewProductsCount = 0;
- while (scrollCount < maxScrollAttempts && noNewProductsCount < 3) {
- await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
- await new Promise((r) => setTimeout(r, 1500));
- const currentCount = seenIds.size;
- if (currentCount === previousCount) {
- noNewProductsCount++;
- }
- else {
- noNewProductsCount = 0;
- }
- previousCount = currentCount;
- scrollCount++;
- }
- }
- // First, scroll through the main page (all products)
- console.log('[DutchieGraphQL] Scrolling main page...');
- await scrollToLoadAll();
- console.log(`[DutchieGraphQL] After main page: ${seenIds.size} products`);
- // Get category links from the navigation
- const categoryLinks = await page.evaluate(() => {
- const links = [];
- // Look for category navigation links
- const navLinks = document.querySelectorAll('a[href*="/products/"]');
- navLinks.forEach((link) => {
- const href = link.href;
- if (href && !links.includes(href)) {
- links.push(href);
- }
- });
- return links;
- });
- console.log(`[DutchieGraphQL] Found ${categoryLinks.length} category links`);
- // Visit each category page to capture all products
- for (const categoryUrl of categoryLinks) {
- try {
- console.log(`[DutchieGraphQL] Visiting category: ${categoryUrl.split('/').pop()}`);
- await page.goto(categoryUrl, {
- waitUntil: 'networkidle2',
- timeout: 30000,
- });
- await scrollToLoadAll(15); // Fewer scrolls per category
- console.log(`[DutchieGraphQL] Total products: ${seenIds.size}`);
- }
- catch (e) {
- console.log(`[DutchieGraphQL] Category error: ${e.message}`);
- }
- }
- // Wait for any final responses
- await new Promise((r) => setTimeout(r, 2000));
- return {
- products: capturedProducts,
- dispensaryId,
- menuUrl,
- };
- }
- finally {
- if (browser) {
- await browser.close();
- }
- }
-}
-// =====================================================
-// DATABASE OPERATIONS
-// =====================================================
-async function upsertProducts(pool, storeId, products) {
- const client = await pool.connect();
- let inserted = 0;
- let updated = 0;
- try {
- await client.query('BEGIN');
- for (const product of products) {
- // Upsert product
- const result = await client.query(`
- INSERT INTO products (
- store_id, external_id, slug, name, enterprise_product_id,
- brand, brand_external_id, brand_logo_url,
- subcategory, strain_type, canonical_category,
- price, rec_price, med_price, rec_special_price, med_special_price,
- is_on_special, special_name, discount_percent, special_data,
- sku, inventory_quantity, inventory_available, is_below_threshold, status,
- thc_percentage, cbd_percentage, cannabinoids,
- weight_mg, net_weight_value, net_weight_unit, options, raw_options,
- image_url, additional_images,
- is_featured, medical_only, rec_only,
- source_created_at, source_updated_at,
- description, raw_data,
- dutchie_url, last_seen_at, updated_at
- )
- VALUES (
- $1, $2, $3, $4, $5,
- $6, $7, $8,
- $9, $10, $11,
- $12, $13, $14, $15, $16,
- $17, $18, $19, $20,
- $21, $22, $23, $24, $25,
- $26, $27, $28,
- $29, $30, $31, $32, $33,
- $34, $35,
- $36, $37, $38,
- $39, $40,
- $41, $42,
- '', NOW(), NOW()
- )
- ON CONFLICT (store_id, slug) DO UPDATE SET
- name = EXCLUDED.name,
- enterprise_product_id = EXCLUDED.enterprise_product_id,
- brand = EXCLUDED.brand,
- brand_external_id = EXCLUDED.brand_external_id,
- brand_logo_url = EXCLUDED.brand_logo_url,
- subcategory = EXCLUDED.subcategory,
- strain_type = EXCLUDED.strain_type,
- canonical_category = EXCLUDED.canonical_category,
- price = EXCLUDED.price,
- rec_price = EXCLUDED.rec_price,
- med_price = EXCLUDED.med_price,
- rec_special_price = EXCLUDED.rec_special_price,
- med_special_price = EXCLUDED.med_special_price,
- is_on_special = EXCLUDED.is_on_special,
- special_name = EXCLUDED.special_name,
- discount_percent = EXCLUDED.discount_percent,
- special_data = EXCLUDED.special_data,
- sku = EXCLUDED.sku,
- inventory_quantity = EXCLUDED.inventory_quantity,
- inventory_available = EXCLUDED.inventory_available,
- is_below_threshold = EXCLUDED.is_below_threshold,
- status = EXCLUDED.status,
- thc_percentage = EXCLUDED.thc_percentage,
- cbd_percentage = EXCLUDED.cbd_percentage,
- cannabinoids = EXCLUDED.cannabinoids,
- weight_mg = EXCLUDED.weight_mg,
- net_weight_value = EXCLUDED.net_weight_value,
- net_weight_unit = EXCLUDED.net_weight_unit,
- options = EXCLUDED.options,
- raw_options = EXCLUDED.raw_options,
- image_url = EXCLUDED.image_url,
- additional_images = EXCLUDED.additional_images,
- is_featured = EXCLUDED.is_featured,
- medical_only = EXCLUDED.medical_only,
- rec_only = EXCLUDED.rec_only,
- source_created_at = EXCLUDED.source_created_at,
- source_updated_at = EXCLUDED.source_updated_at,
- description = EXCLUDED.description,
- raw_data = EXCLUDED.raw_data,
- last_seen_at = NOW(),
- updated_at = NOW()
- RETURNING (xmax = 0) AS was_inserted
- `, [
- storeId,
- product.external_id,
- product.slug,
- product.name,
- product.enterprise_product_id,
- product.brand,
- product.brand_external_id,
- product.brand_logo_url,
- product.subcategory,
- product.strain_type,
- product.canonical_category,
- product.price,
- product.rec_price,
- product.med_price,
- product.rec_special_price,
- product.med_special_price,
- product.is_on_special,
- product.special_name,
- product.discount_percent,
- product.special_data ? JSON.stringify(product.special_data) : null,
- product.sku,
- product.inventory_quantity,
- product.inventory_available,
- product.is_below_threshold,
- product.status,
- product.thc_percentage,
- product.cbd_percentage,
- product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
- product.weight_mg,
- product.net_weight_value,
- product.net_weight_unit,
- product.options,
- product.raw_options,
- product.image_url,
- product.additional_images,
- product.is_featured,
- product.medical_only,
- product.rec_only,
- product.source_created_at,
- product.source_updated_at,
- product.description,
- product.raw_data ? JSON.stringify(product.raw_data) : null,
- ]);
- if (result.rows[0]?.was_inserted) {
- inserted++;
- }
- else {
- updated++;
- }
- }
- await client.query('COMMIT');
- return { inserted, updated };
- }
- catch (error) {
- await client.query('ROLLBACK');
- throw error;
- }
- finally {
- client.release();
- }
-}
-// =====================================================
-// MAIN ENTRY POINT
-// =====================================================
-/**
- * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
- * This function is disabled and will throw an error if called.
- */
-async function scrapeDutchieMenu(pool, storeId, menuUrl) {
- // DEPRECATED: Throw error to prevent accidental use
- throw new Error('DEPRECATED: scrapeDutchieMenu() is deprecated. ' +
- 'Use src/dutchie-az/services/product-crawler.ts instead. ' +
- 'This scraper writes to the legacy products table.');
- // Original code below is unreachable but kept for reference
- try {
- console.log(`[DutchieGraphQL] Scraping: ${menuUrl}`);
- // Fetch products via Puppeteer
- const { products, dispensaryId } = await fetchDutchieMenuViaPuppeteer(menuUrl);
- console.log(`[DutchieGraphQL] Captured ${products.length} products, dispensaryId: ${dispensaryId}`);
- if (products.length === 0) {
- return {
- success: false,
- productsFound: 0,
- inserted: 0,
- updated: 0,
- error: 'No products captured from GraphQL responses',
- };
- }
- // Normalize products
- const normalized = products.map(normalizeDutchieProduct);
- // Upsert to database
- const { inserted, updated } = await upsertProducts(pool, storeId, normalized);
- console.log(`[DutchieGraphQL] Upsert complete: ${inserted} inserted, ${updated} updated`);
- return {
- success: true,
- productsFound: products.length,
- inserted,
- updated,
- };
- }
- catch (error) {
- console.error(`[DutchieGraphQL] Error:`, error.message);
- return {
- success: false,
- productsFound: 0,
- inserted: 0,
- updated: 0,
- error: error.message,
- };
- }
-}
diff --git a/backend/dist/scrapers/templates/dutchie.js b/backend/dist/scrapers/templates/dutchie.js
deleted file mode 100644
index 54f1f96d..00000000
--- a/backend/dist/scrapers/templates/dutchie.js
+++ /dev/null
@@ -1,85 +0,0 @@
-"use strict";
-// ============================================================================
-// DEPRECATED: Dutchie now crawled via GraphQL only (see dutchie-az pipeline)
-// DO NOT USE - This HTML scraper is unreliable and targets the legacy products table.
-// All Dutchie crawling must go through: src/dutchie-az/services/product-crawler.ts
-// ============================================================================
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.dutchieTemplate = void 0;
-exports.getTemplateForUrl = getTemplateForUrl;
-const logger_1 = require("../../services/logger");
-/**
- * @deprecated DEPRECATED - Dutchie HTML scraping is no longer supported.
- * Use the dutchie-az GraphQL pipeline instead: src/dutchie-az/services/product-crawler.ts
- * This template relied on unstable DOM selectors and wrote to legacy tables.
- */
-exports.dutchieTemplate = {
- name: 'Dutchie Marketplace',
- urlPattern: /dutchie\.com\/dispensary\//,
- buildCategoryUrl: (baseUrl, category) => {
- // Remove trailing slash
- const base = baseUrl.replace(/\/$/, '');
- // Convert category name to URL-friendly slug
- const categorySlug = category.toLowerCase().replace(/\s+/g, '-');
- return `${base}/products/${categorySlug}`;
- },
- extractProducts: async (page) => {
- const products = [];
- try {
- // Wait for product cards to load
- await page.waitForSelector('a[data-testid="card-link"]', { timeout: 10000 }).catch(() => {
- logger_1.logger.warn('scraper', 'No product cards found with data-testid="card-link"');
- });
- // Get all product card links
- const productCards = await page.locator('a[href*="/product/"][data-testid="card-link"]').all();
- logger_1.logger.info('scraper', `Found ${productCards.length} Dutchie product cards`);
- for (const card of productCards) {
- try {
- // Extract all data at once using evaluate for speed
- const cardData = await card.evaluate((el) => {
- const href = el.getAttribute('href') || '';
- const img = el.querySelector('img');
- const imageUrl = img ? img.getAttribute('src') || '' : '';
- // Get all text nodes in order
- const textElements = Array.from(el.querySelectorAll('*'))
- .filter(el => el.textContent && el.children.length === 0)
- .map(el => (el.textContent || '').trim())
- .filter(text => text.length > 0);
- const name = textElements[0] || '';
- const brand = textElements[1] || '';
- // Look for price
- const priceMatch = el.textContent?.match(/\$(\d+(?:\.\d{2})?)/);
- const price = priceMatch ? parseFloat(priceMatch[1]) : undefined;
- return { href, imageUrl, name, brand, price };
- });
- if (cardData.name && cardData.href) {
- products.push({
- name: cardData.name,
- brand: cardData.brand || undefined,
- product_url: cardData.href.startsWith('http') ? cardData.href : `https://dutchie.com${cardData.href}`,
- image_url: cardData.imageUrl || undefined,
- price: cardData.price,
- in_stock: true,
- });
- }
- }
- catch (err) {
- logger_1.logger.warn('scraper', `Error extracting Dutchie product card: ${err}`);
- }
- }
- }
- catch (err) {
- logger_1.logger.error('scraper', `Error in Dutchie product extraction: ${err}`);
- }
- return products;
- },
-};
-/**
- * Get the appropriate scraper template based on URL
- */
-function getTemplateForUrl(url) {
- if (exports.dutchieTemplate.urlPattern.test(url)) {
- return exports.dutchieTemplate;
- }
- return null;
-}
diff --git a/backend/dist/scripts/backfill-store-dispensary.js b/backend/dist/scripts/backfill-store-dispensary.js
deleted file mode 100644
index 4a9ea57a..00000000
--- a/backend/dist/scripts/backfill-store-dispensary.js
+++ /dev/null
@@ -1,287 +0,0 @@
-#!/usr/bin/env npx tsx
-"use strict";
-/**
- * Backfill Store-Dispensary Mapping
- *
- * Links existing stores (scheduler) to dispensaries (master AZDHS directory)
- * by matching on name, city, and zip code.
- *
- * Usage:
- * npx tsx src/scripts/backfill-store-dispensary.ts # Preview matches
- * npx tsx src/scripts/backfill-store-dispensary.ts --apply # Apply matches
- * npx tsx src/scripts/backfill-store-dispensary.ts --verbose # Show all match details
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-const migrate_1 = require("../db/migrate");
-const logger_1 = require("../services/logger");
-const args = process.argv.slice(2);
-const flags = {
- apply: args.includes('--apply'),
- verbose: args.includes('--verbose'),
- help: args.includes('--help') || args.includes('-h'),
-};
-/**
- * Normalize a store/dispensary name for comparison
- * Removes common suffixes, punctuation, and extra whitespace
- */
-function normalizeName(name) {
- return name
- .toLowerCase()
- .replace(/\s*[-–—]\s*/g, ' ') // Normalize dashes to spaces
- .replace(/\s*(dispensary|cannabis|marijuana|weed|shop|store|llc|inc)\s*/gi, ' ')
- .replace(/['']/g, "'") // Normalize apostrophes
- .replace(/[^\w\s']/g, '') // Remove other punctuation
- .replace(/\s+/g, ' ') // Collapse whitespace
- .trim();
-}
-/**
- * Simple Levenshtein distance for fuzzy matching
- */
-function levenshteinDistance(a, b) {
- const matrix = [];
- for (let i = 0; i <= b.length; i++) {
- matrix[i] = [i];
- }
- for (let j = 0; j <= a.length; j++) {
- matrix[0][j] = j;
- }
- for (let i = 1; i <= b.length; i++) {
- for (let j = 1; j <= a.length; j++) {
- if (b.charAt(i - 1) === a.charAt(j - 1)) {
- matrix[i][j] = matrix[i - 1][j - 1];
- }
- else {
- matrix[i][j] = Math.min(matrix[i - 1][j - 1] + 1, // substitution
- matrix[i][j - 1] + 1, // insertion
- matrix[i - 1][j] + 1 // deletion
- );
- }
- }
- }
- return matrix[b.length][a.length];
-}
-/**
- * Calculate similarity score (0-100)
- */
-function similarityScore(a, b) {
- const maxLen = Math.max(a.length, b.length);
- if (maxLen === 0)
- return 100;
- const distance = levenshteinDistance(a, b);
- return Math.round((1 - distance / maxLen) * 100);
-}
-/**
- * Find the best dispensary match for a store
- */
-function findBestMatch(store, dispensaries) {
- const normalizedStoreName = normalizeName(store.name);
- const storeSlug = store.slug.toLowerCase();
- let bestMatch = {
- store,
- dispensary: null,
- matchType: 'none',
- score: 0,
- };
- for (const disp of dispensaries) {
- const normalizedDispName = normalizeName(disp.name);
- const normalizedCompanyName = disp.company_name ? normalizeName(disp.company_name) : '';
- const dispSlug = disp.slug.toLowerCase();
- // 1. Exact name match (case-insensitive)
- if (store.name.toLowerCase() === disp.name.toLowerCase()) {
- return {
- store,
- dispensary: disp,
- matchType: 'exact_name',
- score: 100,
- };
- }
- // 2. Normalized name match
- if (normalizedStoreName === normalizedDispName) {
- return {
- store,
- dispensary: disp,
- matchType: 'normalized_name',
- score: 95,
- };
- }
- // 3. Store name matches company name
- if (normalizedCompanyName && normalizedStoreName === normalizedCompanyName) {
- return {
- store,
- dispensary: disp,
- matchType: 'company_name',
- score: 90,
- };
- }
- // 4. Slug match
- if (storeSlug === dispSlug) {
- return {
- store,
- dispensary: disp,
- matchType: 'slug',
- score: 85,
- };
- }
- // 5. Fuzzy matching (only if score > 70)
- const nameScore = similarityScore(normalizedStoreName, normalizedDispName);
- const companyScore = normalizedCompanyName
- ? similarityScore(normalizedStoreName, normalizedCompanyName)
- : 0;
- const fuzzyScore = Math.max(nameScore, companyScore);
- if (fuzzyScore > bestMatch.score && fuzzyScore >= 70) {
- bestMatch = {
- store,
- dispensary: disp,
- matchType: 'fuzzy',
- score: fuzzyScore,
- };
- }
- }
- return bestMatch;
-}
-async function main() {
- if (flags.help) {
- console.log(`
-Backfill Store-Dispensary Mapping
-
-Links existing stores (scheduler) to dispensaries (master AZDHS directory)
-by matching on name, company name, or slug similarity.
-
-USAGE:
- npx tsx src/scripts/backfill-store-dispensary.ts [OPTIONS]
-
-OPTIONS:
- --apply Apply the mappings to the database (default: preview only)
- --verbose Show detailed match information for all stores
- --help, -h Show this help message
-
-EXAMPLES:
- # Preview what would be matched
- npx tsx src/scripts/backfill-store-dispensary.ts
-
- # Apply the mappings
- npx tsx src/scripts/backfill-store-dispensary.ts --apply
-
- # Show verbose output
- npx tsx src/scripts/backfill-store-dispensary.ts --verbose
-`);
- process.exit(0);
- }
- console.log('\n📦 Backfill Store-Dispensary Mapping');
- console.log('=====================================\n');
- try {
- // Fetch all stores without a dispensary_id
- const storesResult = await migrate_1.pool.query(`
- SELECT id, name, slug, dispensary_id
- FROM stores
- WHERE dispensary_id IS NULL
- ORDER BY name
- `);
- const unmappedStores = storesResult.rows;
- // Fetch all already-mapped stores for context
- const mappedResult = await migrate_1.pool.query(`
- SELECT id, name, slug, dispensary_id
- FROM stores
- WHERE dispensary_id IS NOT NULL
- ORDER BY name
- `);
- const mappedStores = mappedResult.rows;
- // Fetch all dispensaries
- const dispResult = await migrate_1.pool.query(`
- SELECT id, name, company_name, city, address, slug
- FROM dispensaries
- ORDER BY name
- `);
- const dispensaries = dispResult.rows;
- console.log(`📊 Current Status:`);
- console.log(` Stores without dispensary_id: ${unmappedStores.length}`);
- console.log(` Stores already mapped: ${mappedStores.length}`);
- console.log(` Total dispensaries: ${dispensaries.length}\n`);
- if (unmappedStores.length === 0) {
- console.log('✅ All stores are already mapped to dispensaries!\n');
- await migrate_1.pool.end();
- process.exit(0);
- }
- // Find matches for each unmapped store
- const matches = [];
- const noMatches = [];
- for (const store of unmappedStores) {
- const match = findBestMatch(store, dispensaries);
- if (match.dispensary) {
- matches.push(match);
- }
- else {
- noMatches.push(store);
- }
- }
- // Sort matches by score (highest first)
- matches.sort((a, b) => b.score - a.score);
- // Display results
- console.log(`\n🔗 Matches Found: ${matches.length}`);
- console.log('----------------------------------\n');
- if (matches.length > 0) {
- // Group by match type
- const byType = {};
- for (const m of matches) {
- if (!byType[m.matchType])
- byType[m.matchType] = [];
- byType[m.matchType].push(m);
- }
- const typeLabels = {
- exact_name: '✅ Exact Name Match',
- normalized_name: '✅ Normalized Name Match',
- company_name: '🏢 Company Name Match',
- slug: '🔗 Slug Match',
- fuzzy: '🔍 Fuzzy Match',
- };
- for (const [type, results] of Object.entries(byType)) {
- console.log(`${typeLabels[type]} (${results.length}):`);
- for (const r of results) {
- const dispInfo = r.dispensary;
- console.log(` • "${r.store.name}" → "${dispInfo.name}" (${dispInfo.city}) [${r.score}%]`);
- }
- console.log('');
- }
- }
- if (noMatches.length > 0) {
- console.log(`\n❌ No Match Found: ${noMatches.length}`);
- console.log('----------------------------------\n');
- for (const store of noMatches) {
- console.log(` • "${store.name}" (slug: ${store.slug})`);
- }
- console.log('');
- }
- // Apply if requested
- if (flags.apply && matches.length > 0) {
- console.log('\n🔧 Applying mappings...\n');
- let updated = 0;
- for (const match of matches) {
- if (!match.dispensary)
- continue;
- await migrate_1.pool.query('UPDATE stores SET dispensary_id = $1 WHERE id = $2', [match.dispensary.id, match.store.id]);
- updated++;
- if (flags.verbose) {
- console.log(` ✓ Linked store ${match.store.id} to dispensary ${match.dispensary.id}`);
- }
- }
- console.log(`\n✅ Updated ${updated} stores with dispensary mappings\n`);
- logger_1.logger.info('system', `Backfill complete: linked ${updated} stores to dispensaries`);
- }
- else if (matches.length > 0 && !flags.apply) {
- console.log('\n💡 Run with --apply to update the database\n');
- }
- // Summary
- console.log('📈 Summary:');
- console.log(` Would match: ${matches.length} stores`);
- console.log(` No match: ${noMatches.length} stores`);
- console.log(` Match rate: ${Math.round((matches.length / unmappedStores.length) * 100)}%\n`);
- }
- catch (error) {
- console.error('Error:', error);
- process.exit(1);
- }
- finally {
- await migrate_1.pool.end();
- }
-}
-main().catch(console.error);
diff --git a/backend/dist/scripts/bootstrap-discovery.js b/backend/dist/scripts/bootstrap-discovery.js
deleted file mode 100644
index eac151f4..00000000
--- a/backend/dist/scripts/bootstrap-discovery.js
+++ /dev/null
@@ -1,332 +0,0 @@
-#!/usr/bin/env npx tsx
-"use strict";
-/**
- * Bootstrap Discovery Script
- *
- * One-time (but reusable) bootstrap command that:
- * 1. Ensures every Dispensary has a dispensary_crawl_schedule entry (4h default)
- * 2. Optionally runs RunDispensaryOrchestrator for each dispensary
- *
- * Usage:
- * npx tsx src/scripts/bootstrap-discovery.ts # Create schedules only
- * npx tsx src/scripts/bootstrap-discovery.ts --run # Create schedules + run orchestrator
- * npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10 # Run for first 10 dispensaries
- * npx tsx src/scripts/bootstrap-discovery.ts --dry-run # Preview what would happen
- * npx tsx src/scripts/bootstrap-discovery.ts --status # Show current status only
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-const migrate_1 = require("../db/migrate");
-const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
-// Parse command line args
-const args = process.argv.slice(2);
-const flags = {
- run: args.includes('--run'),
- dryRun: args.includes('--dry-run'),
- status: args.includes('--status'),
- help: args.includes('--help') || args.includes('-h'),
- limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '0'),
- concurrency: parseInt(args.find(a => a.startsWith('--concurrency='))?.split('=')[1] || '3'),
- interval: parseInt(args.find(a => a.startsWith('--interval='))?.split('=')[1] || '240'),
- detectionOnly: args.includes('--detection-only'),
- productionOnly: args.includes('--production-only'),
- sandboxOnly: args.includes('--sandbox-only'),
-};
-async function showHelp() {
- console.log(`
-Bootstrap Discovery - Initialize Dispensary Crawl System
-
-USAGE:
- npx tsx src/scripts/bootstrap-discovery.ts [OPTIONS]
-
-OPTIONS:
- --run After creating schedules, run the orchestrator for each dispensary
- --dry-run Show what would happen without making changes
- --status Show current status and exit
- --limit=N Limit how many dispensaries to process (0 = all, default: 0)
- --concurrency=N How many dispensaries to process in parallel (default: 3)
- --interval=M Default interval in minutes for new schedules (default: 240 = 4 hours)
- --detection-only Only run detection, don't crawl
- --production-only Only run dispensaries in production mode
- --sandbox-only Only run dispensaries in sandbox mode
- --help, -h Show this help message
-
-EXAMPLES:
- # Create schedule entries for all dispensaries (no crawling)
- npx tsx src/scripts/bootstrap-discovery.ts
-
- # Create schedules and run orchestrator for all dispensaries
- npx tsx src/scripts/bootstrap-discovery.ts --run
-
- # Run orchestrator for first 10 dispensaries
- npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10
-
- # Run with higher concurrency
- npx tsx src/scripts/bootstrap-discovery.ts --run --concurrency=5
-
- # Show current status
- npx tsx src/scripts/bootstrap-discovery.ts --status
-
-WHAT IT DOES:
- 1. Creates dispensary_crawl_schedule entries for all dispensaries that don't have one
- 2. If --run: For each dispensary, runs the orchestrator which:
- a. Checks if provider detection is needed (null/unknown/stale/low confidence)
- b. Runs detection if needed
- c. If Dutchie + production mode: runs production crawl
- d. Otherwise: runs sandbox crawl
- 3. Updates schedule status and job records
-`);
-}
-async function showStatus() {
- console.log('\n📊 Current Dispensary Crawl Status\n');
- console.log('═'.repeat(70));
- // Get dispensary counts by provider
- const providerStats = await migrate_1.pool.query(`
- SELECT
- COALESCE(product_provider, 'undetected') as provider,
- COUNT(*) as count,
- COUNT(*) FILTER (WHERE product_crawler_mode = 'production') as production,
- COUNT(*) FILTER (WHERE product_crawler_mode = 'sandbox') as sandbox,
- COUNT(*) FILTER (WHERE product_crawler_mode IS NULL) as no_mode
- FROM dispensaries
- GROUP BY COALESCE(product_provider, 'undetected')
- ORDER BY count DESC
- `);
- console.log('\nProvider Distribution:');
- console.log('-'.repeat(60));
- console.log('Provider'.padEnd(20) +
- 'Total'.padStart(8) +
- 'Production'.padStart(12) +
- 'Sandbox'.padStart(10) +
- 'No Mode'.padStart(10));
- console.log('-'.repeat(60));
- for (const row of providerStats.rows) {
- console.log(row.provider.padEnd(20) +
- row.count.toString().padStart(8) +
- row.production.toString().padStart(12) +
- row.sandbox.toString().padStart(10) +
- row.no_mode.toString().padStart(10));
- }
- // Get schedule stats
- const scheduleStats = await migrate_1.pool.query(`
- SELECT
- COUNT(DISTINCT d.id) as total_dispensaries,
- COUNT(DISTINCT dcs.id) as with_schedule,
- COUNT(DISTINCT d.id) - COUNT(DISTINCT dcs.id) as without_schedule,
- COUNT(*) FILTER (WHERE dcs.is_active = TRUE) as active_schedules,
- COUNT(*) FILTER (WHERE dcs.last_status = 'success') as last_success,
- COUNT(*) FILTER (WHERE dcs.last_status = 'error') as last_error,
- COUNT(*) FILTER (WHERE dcs.last_status = 'sandbox_only') as last_sandbox,
- COUNT(*) FILTER (WHERE dcs.last_status = 'detection_only') as last_detection,
- COUNT(*) FILTER (WHERE dcs.next_run_at <= NOW()) as due_now,
- AVG(dcs.interval_minutes)::INTEGER as avg_interval
- FROM dispensaries d
- LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
- `);
- const s = scheduleStats.rows[0];
- console.log('\n\nSchedule Status:');
- console.log('-'.repeat(60));
- console.log(` Total Dispensaries: ${s.total_dispensaries}`);
- console.log(` With Schedule: ${s.with_schedule}`);
- console.log(` Without Schedule: ${s.without_schedule}`);
- console.log(` Active Schedules: ${s.active_schedules || 0}`);
- console.log(` Average Interval: ${s.avg_interval || 240} minutes`);
- console.log('\n Last Run Status:');
- console.log(` - Success: ${s.last_success || 0}`);
- console.log(` - Error: ${s.last_error || 0}`);
- console.log(` - Sandbox Only: ${s.last_sandbox || 0}`);
- console.log(` - Detection Only: ${s.last_detection || 0}`);
- console.log(` - Due Now: ${s.due_now || 0}`);
- // Get recent job stats
- const jobStats = await migrate_1.pool.query(`
- SELECT
- COUNT(*) as total,
- COUNT(*) FILTER (WHERE status = 'completed') as completed,
- COUNT(*) FILTER (WHERE status = 'failed') as failed,
- COUNT(*) FILTER (WHERE status = 'running') as running,
- COUNT(*) FILTER (WHERE status = 'pending') as pending,
- COUNT(*) FILTER (WHERE detection_ran = TRUE) as with_detection,
- COUNT(*) FILTER (WHERE crawl_ran = TRUE) as with_crawl,
- COUNT(*) FILTER (WHERE crawl_type = 'production') as production_crawls,
- COUNT(*) FILTER (WHERE crawl_type = 'sandbox') as sandbox_crawls,
- SUM(products_found) as total_products_found
- FROM dispensary_crawl_jobs
- WHERE created_at > NOW() - INTERVAL '24 hours'
- `);
- const j = jobStats.rows[0];
- console.log('\n\nJobs (Last 24 Hours):');
- console.log('-'.repeat(60));
- console.log(` Total Jobs: ${j.total || 0}`);
- console.log(` Completed: ${j.completed || 0}`);
- console.log(` Failed: ${j.failed || 0}`);
- console.log(` Running: ${j.running || 0}`);
- console.log(` Pending: ${j.pending || 0}`);
- console.log(` With Detection: ${j.with_detection || 0}`);
- console.log(` With Crawl: ${j.with_crawl || 0}`);
- console.log(` - Production: ${j.production_crawls || 0}`);
- console.log(` - Sandbox: ${j.sandbox_crawls || 0}`);
- console.log(` Products Found: ${j.total_products_found || 0}`);
- console.log('\n' + '═'.repeat(70) + '\n');
-}
-async function createSchedules() {
- console.log('\n📅 Creating Dispensary Schedules...\n');
- if (flags.dryRun) {
- // Count how many would be created
- const result = await migrate_1.pool.query(`
- SELECT COUNT(*) as count
- FROM dispensaries d
- WHERE NOT EXISTS (
- SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
- )
- `);
- const wouldCreate = parseInt(result.rows[0].count);
- console.log(` Would create ${wouldCreate} new schedule entries (${flags.interval} minute interval)`);
- return { created: wouldCreate, existing: 0 };
- }
- const result = await (0, dispensary_orchestrator_1.ensureAllDispensariesHaveSchedules)(flags.interval);
- console.log(` ✓ Created ${result.created} new schedule entries`);
- console.log(` ✓ ${result.existing} dispensaries already had schedules`);
- return result;
-}
-async function getDispensariesToProcess() {
- // Build query based on filters
- let whereClause = 'TRUE';
- if (flags.productionOnly) {
- whereClause += ` AND d.product_crawler_mode = 'production'`;
- }
- else if (flags.sandboxOnly) {
- whereClause += ` AND d.product_crawler_mode = 'sandbox'`;
- }
- if (flags.detectionOnly) {
- whereClause += ` AND (d.product_provider IS NULL OR d.product_provider = 'unknown' OR d.product_confidence < 50)`;
- }
- const limitClause = flags.limit > 0 ? `LIMIT ${flags.limit}` : '';
- const query = `
- SELECT d.id, d.name, d.product_provider, d.product_crawler_mode
- FROM dispensaries d
- LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
- WHERE ${whereClause}
- ORDER BY
- COALESCE(dcs.priority, 0) DESC,
- dcs.last_run_at ASC NULLS FIRST,
- d.id ASC
- ${limitClause}
- `;
- const result = await migrate_1.pool.query(query);
- return result.rows.map(row => row.id);
-}
-async function runOrchestrator() {
- console.log('\n🚀 Running Dispensary Orchestrator...\n');
- const dispensaryIds = await getDispensariesToProcess();
- if (dispensaryIds.length === 0) {
- console.log(' No dispensaries to process.');
- return;
- }
- console.log(` Found ${dispensaryIds.length} dispensaries to process`);
- console.log(` Concurrency: ${flags.concurrency}`);
- if (flags.dryRun) {
- console.log('\n Would process these dispensaries:');
- const details = await migrate_1.pool.query(`SELECT id, name, product_provider, product_crawler_mode
- FROM dispensaries WHERE id = ANY($1) ORDER BY id`, [dispensaryIds]);
- for (const row of details.rows.slice(0, 20)) {
- console.log(` - [${row.id}] ${row.name} (${row.product_provider || 'undetected'}, ${row.product_crawler_mode || 'no mode'})`);
- }
- if (details.rows.length > 20) {
- console.log(` ... and ${details.rows.length - 20} more`);
- }
- return;
- }
- console.log('\n Starting batch processing...\n');
- const results = await (0, dispensary_orchestrator_1.runBatchDispensaryOrchestrator)(dispensaryIds, flags.concurrency);
- // Summarize results
- const summary = {
- total: results.length,
- success: results.filter(r => r.status === 'success').length,
- sandboxOnly: results.filter(r => r.status === 'sandbox_only').length,
- detectionOnly: results.filter(r => r.status === 'detection_only').length,
- error: results.filter(r => r.status === 'error').length,
- detectionsRan: results.filter(r => r.detectionRan).length,
- crawlsRan: results.filter(r => r.crawlRan).length,
- productionCrawls: results.filter(r => r.crawlType === 'production').length,
- sandboxCrawls: results.filter(r => r.crawlType === 'sandbox').length,
- totalProducts: results.reduce((sum, r) => sum + (r.productsFound || 0), 0),
- totalDuration: results.reduce((sum, r) => sum + r.durationMs, 0),
- };
- console.log('\n' + '═'.repeat(70));
- console.log(' Orchestrator Results');
- console.log('═'.repeat(70));
- console.log(`
- Total Processed: ${summary.total}
-
- Status:
- - Success: ${summary.success}
- - Sandbox Only: ${summary.sandboxOnly}
- - Detection Only: ${summary.detectionOnly}
- - Error: ${summary.error}
-
- Operations:
- - Detections Ran: ${summary.detectionsRan}
- - Crawls Ran: ${summary.crawlsRan}
- - Production: ${summary.productionCrawls}
- - Sandbox: ${summary.sandboxCrawls}
-
- Results:
- - Products Found: ${summary.totalProducts}
- - Total Duration: ${(summary.totalDuration / 1000).toFixed(1)}s
- - Avg per Dispensary: ${(summary.totalDuration / summary.total / 1000).toFixed(1)}s
-`);
- console.log('═'.repeat(70) + '\n');
- // Show errors if any
- const errors = results.filter(r => r.status === 'error');
- if (errors.length > 0) {
- console.log('\n⚠️ Errors encountered:');
- for (const err of errors.slice(0, 10)) {
- console.log(` - [${err.dispensaryId}] ${err.dispensaryName}: ${err.error}`);
- }
- if (errors.length > 10) {
- console.log(` ... and ${errors.length - 10} more errors`);
- }
- }
-}
-async function main() {
- if (flags.help) {
- await showHelp();
- process.exit(0);
- }
- console.log('\n' + '═'.repeat(70));
- console.log(' Dispensary Crawl Bootstrap Discovery');
- console.log('═'.repeat(70));
- if (flags.dryRun) {
- console.log('\n🔍 DRY RUN MODE - No changes will be made');
- }
- try {
- // Always show status first
- await showStatus();
- if (flags.status) {
- // Status-only mode, we're done
- await migrate_1.pool.end();
- process.exit(0);
- }
- // Step 1: Create schedule entries
- await createSchedules();
- // Step 2: Optionally run orchestrator
- if (flags.run) {
- await runOrchestrator();
- }
- else {
- console.log('\n💡 Tip: Use --run to also run the orchestrator for each dispensary');
- }
- // Show final status
- if (!flags.dryRun) {
- await showStatus();
- }
- }
- catch (error) {
- console.error('\n❌ Fatal error:', error.message);
- console.error(error.stack);
- process.exit(1);
- }
- finally {
- await migrate_1.pool.end();
- }
-}
-main();
diff --git a/backend/dist/scripts/bootstrap-stores-for-dispensaries.js b/backend/dist/scripts/bootstrap-stores-for-dispensaries.js
deleted file mode 100644
index d05098a5..00000000
--- a/backend/dist/scripts/bootstrap-stores-for-dispensaries.js
+++ /dev/null
@@ -1,65 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const pg_1 = require("pg");
-const pool = new pg_1.Pool({ connectionString: process.env.DATABASE_URL });
-/**
- * Creates `stores` table records for all dispensaries that:
- * 1. Have menu_type = 'dutchie' AND platform_dispensary_id (ready for GraphQL crawl)
- * 2. Don't already have a linked stores record
- *
- * The stores table is required by the scraper engine (scrapeStore function)
- */
-async function bootstrapStores() {
- console.log('=== Bootstrapping stores for Dutchie dispensaries ===\n');
- // Find all dutchie dispensaries without linked stores
- const result = await pool.query(`
- SELECT d.id, d.name, d.slug, d.menu_type, d.platform_dispensary_id, d.menu_url
- FROM dispensaries d
- LEFT JOIN stores s ON s.dispensary_id = d.id
- WHERE d.menu_type = 'dutchie'
- AND d.platform_dispensary_id IS NOT NULL
- AND s.id IS NULL
- ORDER BY d.id
- `);
- console.log(`Found ${result.rows.length} dispensaries needing store records\n`);
- let created = 0;
- let errors = 0;
- for (const d of result.rows) {
- try {
- // Insert store record linking to dispensary
- // Note: stores table only has basic fields: name, slug, dispensary_id, dutchie_url
- // The platform_dispensary_id for GraphQL crawling lives in the dispensaries table
- const insertResult = await pool.query(`
- INSERT INTO stores (
- name,
- slug,
- dispensary_id,
- active,
- scrape_enabled,
- created_at,
- updated_at
- ) VALUES ($1, $2, $3, true, true, NOW(), NOW())
- RETURNING id
- `, [
- d.name,
- d.slug || d.name.toLowerCase().replace(/[^a-z0-9]+/g, '-'),
- d.id
- ]);
- console.log(`[CREATED] Store ${insertResult.rows[0].id} for dispensary ${d.id}: ${d.name}`);
- created++;
- }
- catch (e) {
- console.error(`[ERROR] Dispensary ${d.id} (${d.name}): ${e.message}`);
- errors++;
- }
- }
- console.log('\n=== Bootstrap Summary ===');
- console.log(`Created: ${created}`);
- console.log(`Errors: ${errors}`);
- console.log(`Total needing stores: ${result.rows.length}`);
- await pool.end();
-}
-bootstrapStores().catch(e => {
- console.error('Fatal error:', e.message);
- process.exit(1);
-});
diff --git a/backend/dist/scripts/capture-dutchie-schema.js b/backend/dist/scripts/capture-dutchie-schema.js
deleted file mode 100644
index a0960547..00000000
--- a/backend/dist/scripts/capture-dutchie-schema.js
+++ /dev/null
@@ -1,236 +0,0 @@
-"use strict";
-/**
- * Capture Dutchie GraphQL response structure via Puppeteer interception
- * This script navigates to a Dutchie menu page and captures the GraphQL responses
- * to understand the exact product data structure
- */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
-const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
-const fs = __importStar(require("fs"));
-puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
-async function captureSchema(menuUrl) {
- let browser;
- const capturedResponses = [];
- try {
- console.log('='.repeat(80));
- console.log('DUTCHIE GRAPHQL SCHEMA CAPTURE');
- console.log('='.repeat(80));
- console.log(`\nTarget URL: ${menuUrl}\n`);
- browser = await puppeteer_extra_1.default.launch({
- headless: 'new',
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-blink-features=AutomationControlled',
- ]
- });
- const page = await browser.newPage();
- // Use a realistic user agent
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
- // Set viewport to desktop size
- await page.setViewport({ width: 1920, height: 1080 });
- // Hide webdriver flag
- await page.evaluateOnNewDocument(() => {
- Object.defineProperty(navigator, 'webdriver', { get: () => false });
- window.chrome = { runtime: {} };
- });
- // Intercept all GraphQL responses
- page.on('response', async (response) => {
- const url = response.url();
- // Only capture GraphQL responses
- if (!url.includes('graphql'))
- return;
- try {
- const contentType = response.headers()['content-type'] || '';
- if (!contentType.includes('application/json'))
- return;
- const data = await response.json();
- // Extract operation name from URL if possible
- const urlParams = new URLSearchParams(url.split('?')[1] || '');
- const operationName = urlParams.get('operationName') || 'Unknown';
- capturedResponses.push({
- operationName,
- url: url.substring(0, 200),
- data,
- timestamp: new Date()
- });
- console.log(`📡 Captured: ${operationName}`);
- // Check for product data
- if (data?.data?.filteredProducts?.products) {
- const products = data.data.filteredProducts.products;
- console.log(` Found ${products.length} products`);
- }
- }
- catch (e) {
- // Ignore parse errors
- }
- });
- console.log('Navigating to page...');
- await page.goto(menuUrl, {
- waitUntil: 'networkidle2',
- timeout: 90000
- });
- // Check if it's a Dutchie menu
- const isDutchie = await page.evaluate(() => {
- return typeof window.reactEnv !== 'undefined';
- });
- if (isDutchie) {
- console.log('✅ Dutchie menu detected\n');
- // Get environment info
- const reactEnv = await page.evaluate(() => window.reactEnv);
- console.log('Dutchie Environment:');
- console.log(` dispensaryId: ${reactEnv?.dispensaryId}`);
- console.log(` retailerId: ${reactEnv?.retailerId}`);
- console.log(` chainId: ${reactEnv?.chainId}`);
- }
- // Scroll to trigger lazy loading
- console.log('\nScrolling to load more products...');
- await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
- await new Promise(r => setTimeout(r, 3000));
- // Click on a category to trigger more loads
- const categoryLinks = await page.$$('a[href*="/products/"]');
- if (categoryLinks.length > 0) {
- console.log(`Found ${categoryLinks.length} category links, clicking first one...`);
- try {
- await categoryLinks[0].click();
- await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 });
- }
- catch (e) {
- console.log('Category navigation failed, continuing...');
- }
- }
- // Wait a bit more for any final responses
- await new Promise(r => setTimeout(r, 2000));
- console.log(`\n${'='.repeat(80)}`);
- console.log(`CAPTURED ${capturedResponses.length} GRAPHQL RESPONSES`);
- console.log('='.repeat(80));
- // Find product data
- let productSchema = null;
- let sampleProduct = null;
- for (const resp of capturedResponses) {
- console.log(`\n${resp.operationName}:`);
- console.log(` URL: ${resp.url.substring(0, 100)}...`);
- if (resp.data?.data?.filteredProducts?.products) {
- const products = resp.data.data.filteredProducts.products;
- console.log(` ✅ Contains ${products.length} products`);
- if (products.length > 0 && !sampleProduct) {
- sampleProduct = products[0];
- productSchema = extractSchema(products[0]);
- }
- }
- // Show top-level data keys
- if (resp.data?.data) {
- console.log(` Data keys: ${Object.keys(resp.data.data).join(', ')}`);
- }
- }
- // Output the product schema
- if (productSchema) {
- console.log('\n' + '='.repeat(80));
- console.log('PRODUCT SCHEMA (from first product):');
- console.log('='.repeat(80));
- console.log(JSON.stringify(productSchema, null, 2));
- console.log('\n' + '='.repeat(80));
- console.log('SAMPLE PRODUCT:');
- console.log('='.repeat(80));
- console.log(JSON.stringify(sampleProduct, null, 2));
- // Save to file
- const outputData = {
- capturedAt: new Date().toISOString(),
- menuUrl,
- schema: productSchema,
- sampleProduct,
- allResponses: capturedResponses.map(r => ({
- operationName: r.operationName,
- dataKeys: r.data?.data ? Object.keys(r.data.data) : [],
- productCount: r.data?.data?.filteredProducts?.products?.length || 0
- }))
- };
- const outputPath = '/tmp/dutchie-schema-capture.json';
- fs.writeFileSync(outputPath, JSON.stringify(outputData, null, 2));
- console.log(`\nSaved capture to: ${outputPath}`);
- }
- else {
- console.log('\n❌ No product data captured');
- // Debug: show all responses
- console.log('\nAll captured responses:');
- for (const resp of capturedResponses) {
- console.log(`\n${resp.operationName}:`);
- console.log(JSON.stringify(resp.data, null, 2).substring(0, 500));
- }
- }
- }
- catch (error) {
- console.error('Error:', error.message);
- }
- finally {
- if (browser) {
- await browser.close();
- }
- }
-}
-/**
- * Extract schema from an object (field names + types)
- */
-function extractSchema(obj, prefix = '') {
- if (obj === null)
- return { type: 'null' };
- if (obj === undefined)
- return { type: 'undefined' };
- if (Array.isArray(obj)) {
- if (obj.length === 0)
- return { type: 'array', items: 'unknown' };
- return {
- type: 'array',
- items: extractSchema(obj[0], prefix + '[]')
- };
- }
- if (typeof obj === 'object') {
- const schema = { type: 'object', properties: {} };
- for (const [key, value] of Object.entries(obj)) {
- schema.properties[key] = extractSchema(value, prefix ? `${prefix}.${key}` : key);
- }
- return schema;
- }
- return { type: typeof obj, example: String(obj).substring(0, 100) };
-}
-// Run
-const url = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
-captureSchema(url).catch(console.error);
diff --git a/backend/dist/scripts/check-store-linking.js b/backend/dist/scripts/check-store-linking.js
deleted file mode 100644
index bbdd2e41..00000000
--- a/backend/dist/scripts/check-store-linking.js
+++ /dev/null
@@ -1,31 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const pg_1 = require("pg");
-const pool = new pg_1.Pool({ connectionString: process.env.DATABASE_URL });
-async function check() {
- // Check which dispensaries have linked stores
- const result = await pool.query(`
- SELECT d.id as disp_id, d.name, d.menu_type, d.platform_dispensary_id,
- s.id as store_id, s.name as store_name
- FROM dispensaries d
- LEFT JOIN stores s ON s.dispensary_id = d.id
- WHERE d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL
- LIMIT 15
- `);
- console.log('Dispensaries with linked stores:');
- result.rows.forEach(r => {
- console.log(` [${r.disp_id}] ${r.name} -> store ${r.store_id || 'NONE'} (${r.store_name || 'NOT LINKED'})`);
- });
- // Count how many have linked stores
- const countResult = await pool.query(`
- SELECT
- COUNT(*) FILTER (WHERE s.id IS NOT NULL) as with_store,
- COUNT(*) FILTER (WHERE s.id IS NULL) as without_store
- FROM dispensaries d
- LEFT JOIN stores s ON s.dispensary_id = d.id
- WHERE d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL
- `);
- console.log('\nSummary:', countResult.rows[0]);
- await pool.end();
-}
-check();
diff --git a/backend/dist/scripts/crawl-all-dutchie.js b/backend/dist/scripts/crawl-all-dutchie.js
deleted file mode 100644
index 96378479..00000000
--- a/backend/dist/scripts/crawl-all-dutchie.js
+++ /dev/null
@@ -1,56 +0,0 @@
-"use strict";
-/**
- * Seed crawl: trigger dutchie crawls for all dispensaries with menu_type='dutchie'
- * and a resolved platform_dispensary_id. This uses the AZ orchestrator endpoint logic.
- *
- * Usage (local):
- * node dist/scripts/crawl-all-dutchie.js
- *
- * Requires:
- * - DATABASE_URL/CRAWLSY_DATABASE_URL pointing to the consolidated DB
- * - Dispensaries table populated with menu_type and platform_dispensary_id
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-const connection_1 = require("../dutchie-az/db/connection");
-const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
-async function main() {
- const { rows } = await (0, connection_1.query)(`
- SELECT id, name, slug, platform_dispensary_id
- FROM dispensaries
- WHERE menu_type = 'dutchie'
- AND platform_dispensary_id IS NOT NULL
- ORDER BY id
- `);
- if (!rows.length) {
- console.log('No dutchie dispensaries with resolved platform_dispensary_id found.');
- process.exit(0);
- }
- console.log(`Found ${rows.length} dutchie dispensaries with resolved IDs. Triggering crawls...`);
- let success = 0;
- let failed = 0;
- for (const row of rows) {
- try {
- console.log(`Crawling ${row.id} (${row.name})...`);
- const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(row.id);
- const ok = result.status === 'success' ||
- result.status === 'sandbox_only' ||
- result.status === 'detection_only';
- if (ok) {
- success++;
- }
- else {
- failed++;
- console.warn(`Crawl returned status ${result.status} for ${row.id} (${row.name})`);
- }
- }
- catch (err) {
- failed++;
- console.error(`Failed crawl for ${row.id} (${row.name}): ${err.message}`);
- }
- }
- console.log(`Completed. Success: ${success}, Failed: ${failed}`);
-}
-main().catch((err) => {
- console.error('Fatal:', err);
- process.exit(1);
-});
diff --git a/backend/dist/scripts/crawl-five-sequential.js b/backend/dist/scripts/crawl-five-sequential.js
deleted file mode 100644
index db5c0f4c..00000000
--- a/backend/dist/scripts/crawl-five-sequential.js
+++ /dev/null
@@ -1,44 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator");
-// All 57 dutchie stores with platform_dispensary_id (as of 2024-12)
-const ALL_DISPENSARY_IDS = [
- 72, 74, 75, 76, 77, 78, 81, 82, 85, 87, 91, 92, 97, 101, 106, 108, 110, 112,
- 115, 120, 123, 125, 128, 131, 135, 139, 140, 143, 144, 145, 152, 153, 161,
- 168, 176, 177, 180, 181, 189, 195, 196, 199, 200, 201, 205, 206, 207, 213,
- 214, 224, 225, 227, 232, 235, 248, 252, 281
-];
-const BATCH_SIZE = 5;
-async function run() {
- const totalBatches = Math.ceil(ALL_DISPENSARY_IDS.length / BATCH_SIZE);
- console.log(`Starting crawl of ${ALL_DISPENSARY_IDS.length} stores in ${totalBatches} batches of ${BATCH_SIZE}...`);
- let successCount = 0;
- let errorCount = 0;
- for (let i = 0; i < ALL_DISPENSARY_IDS.length; i += BATCH_SIZE) {
- const batch = ALL_DISPENSARY_IDS.slice(i, i + BATCH_SIZE);
- const batchNum = Math.floor(i / BATCH_SIZE) + 1;
- console.log(`\n========== BATCH ${batchNum}/${totalBatches} (IDs: ${batch.join(', ')}) ==========`);
- for (const id of batch) {
- console.log(`\n--- Crawling dispensary ${id} ---`);
- try {
- const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(id);
- console.log(` Status: ${result.status}`);
- console.log(` Summary: ${result.summary}`);
- if (result.productsFound) {
- console.log(` Products: ${result.productsFound} found, ${result.productsNew} new, ${result.productsUpdated} updated`);
- }
- successCount++;
- }
- catch (e) {
- console.log(` ERROR: ${e.message}`);
- errorCount++;
- }
- }
- console.log(`\n--- Batch ${batchNum} complete. Progress: ${Math.min(i + BATCH_SIZE, ALL_DISPENSARY_IDS.length)}/${ALL_DISPENSARY_IDS.length} ---`);
- }
- console.log('\n========================================');
- console.log(`=== ALL CRAWLS COMPLETE ===`);
- console.log(`Success: ${successCount}, Errors: ${errorCount}`);
- console.log('========================================');
-}
-run().catch(e => console.log('Fatal:', e.message));
diff --git a/backend/dist/scripts/detect-all.js b/backend/dist/scripts/detect-all.js
deleted file mode 100644
index 0d014f89..00000000
--- a/backend/dist/scripts/detect-all.js
+++ /dev/null
@@ -1,111 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const pg_1 = require("pg");
-const pool = new pg_1.Pool({ connectionString: process.env.DATABASE_URL });
-// Simple fetch with timeout
-async function fetchWithTimeout(url, timeout = 10000) {
- const controller = new AbortController();
- const id = setTimeout(() => controller.abort(), timeout);
- try {
- const resp = await fetch(url, {
- signal: controller.signal,
- headers: {
- 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- },
- redirect: 'follow',
- });
- clearTimeout(id);
- return await resp.text();
- }
- catch (e) {
- clearTimeout(id);
- throw e;
- }
-}
-// Check for dutchie patterns in HTML
-function detectDutchie(html) {
- // Check for reactEnv.dispensaryId (Curaleaf/Sol pattern)
- const reactEnvMatch = html.match(/"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i);
- if (reactEnvMatch) {
- return { provider: 'dutchie', platformId: reactEnvMatch[1] };
- }
- // Check for Dutchie embedded-menu script (Trulieve pattern)
- // Look for: embedded-menu/5eaf48fc972e6200b1303b97.js
- const embedMatch = html.match(/embedded-menu\/([a-f0-9]{24})(?:\.js)?/i);
- if (embedMatch) {
- return { provider: 'dutchie', platformId: embedMatch[1] };
- }
- // Check for dutchie.com links
- const dutchieLink = html.match(/https?:\/\/(?:www\.)?dutchie\.com\/(?:dispensary|embedded-menu|stores)\/([a-zA-Z0-9-]+)/i);
- if (dutchieLink) {
- return { provider: 'dutchie', menuUrl: dutchieLink[0] };
- }
- // Check for jane
- if (html.includes('iheartjane.com') || html.includes('jane.co')) {
- const janeMatch = html.match(/https?:\/\/(?:www\.)?(?:iheartjane\.com|jane\.co)\/[^"\s]+/i);
- return { provider: 'jane', menuUrl: janeMatch?.[0] };
- }
- // Check for treez
- if (html.includes('.treez.io')) {
- const treezMatch = html.match(/https?:\/\/[a-zA-Z0-9-]+\.treez\.io[^"\s]*/i);
- return { provider: 'treez', menuUrl: treezMatch?.[0] };
- }
- // Check for leafly
- if (html.includes('leafly.com/dispensary')) {
- return { provider: 'leafly' };
- }
- return { provider: 'unknown' };
-}
-async function main() {
- const { rows: stores } = await pool.query(`
- SELECT id, name, website
- FROM dispensaries
- WHERE platform_dispensary_id IS NULL
- AND website IS NOT NULL
- AND website NOT LIKE '%example%'
- ORDER BY id
- LIMIT 150
- `);
- console.log('Checking ' + stores.length + ' stores...\n');
- let dutchieCount = 0;
- let otherCount = 0;
- let errorCount = 0;
- for (const store of stores) {
- try {
- const html = await fetchWithTimeout(store.website);
- const result = detectDutchie(html);
- if (result.provider === 'dutchie') {
- if (result.platformId) {
- await pool.query('UPDATE dispensaries SET menu_type = $1, platform_dispensary_id = $2, updated_at = NOW() WHERE id = $3', ['dutchie', result.platformId, store.id]);
- console.log('[' + store.id + '] ' + store.name + ' => DUTCHIE (ID: ' + result.platformId + ')');
- dutchieCount++;
- }
- else if (result.menuUrl) {
- await pool.query('UPDATE dispensaries SET menu_type = $1, menu_url = $2, updated_at = NOW() WHERE id = $3', ['dutchie', result.menuUrl, store.id]);
- console.log('[' + store.id + '] ' + store.name + ' => DUTCHIE (URL: ' + result.menuUrl.slice(0, 60) + ')');
- dutchieCount++;
- }
- }
- else if (result.provider !== 'unknown') {
- await pool.query('UPDATE dispensaries SET menu_type = $1, menu_url = COALESCE($2, menu_url), updated_at = NOW() WHERE id = $3', [result.provider, result.menuUrl, store.id]);
- console.log('[' + store.id + '] ' + store.name + ' => ' + result.provider.toUpperCase());
- otherCount++;
- }
- else {
- console.log('[' + store.id + '] ' + store.name + ' => no menu found');
- }
- }
- catch (err) {
- const errMsg = err.name === 'AbortError' ? 'timeout' : err.message?.slice(0, 40) || 'error';
- console.log('[' + store.id + '] ' + store.name + ' => ERROR: ' + errMsg);
- errorCount++;
- }
- }
- console.log('\n=== Summary ===');
- console.log('Dutchie detected: ' + dutchieCount);
- console.log('Other providers: ' + otherCount);
- console.log('Errors: ' + errorCount);
- await pool.end();
-}
-main().catch(console.error);
diff --git a/backend/dist/scripts/export-dispensaries.js b/backend/dist/scripts/export-dispensaries.js
deleted file mode 100644
index 13f2c868..00000000
--- a/backend/dist/scripts/export-dispensaries.js
+++ /dev/null
@@ -1,18 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const pg_1 = require("pg");
-const pool = new pg_1.Pool({ connectionString: process.env.DATABASE_URL });
-async function exportDispensaries() {
- const { rows } = await pool.query(`
- SELECT id, name, dba_name, company_name, slug,
- address, city, state, zip, latitude, longitude,
- website, menu_type, menu_url, platform_dispensary_id,
- created_at, updated_at
- FROM dispensaries
- WHERE menu_type IS NOT NULL
- ORDER BY id
- `);
- console.log(JSON.stringify(rows, null, 2));
- await pool.end();
-}
-exportDispensaries();
diff --git a/backend/dist/scripts/extract-platform-ids.js b/backend/dist/scripts/extract-platform-ids.js
deleted file mode 100644
index 06bbcad0..00000000
--- a/backend/dist/scripts/extract-platform-ids.js
+++ /dev/null
@@ -1,240 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const playwright_1 = require("playwright");
-const pg_1 = require("pg");
-const pool = new pg_1.Pool({
- connectionString: process.env.DATABASE_URL
-});
-async function extractPlatformId(browser, dispensary) {
- let capturedId = null;
- const context = await browser.newContext();
- const page = await context.newPage();
- // Intercept network requests to find retailer IDs
- page.on('request', (request) => {
- const url = request.url();
- if (url.includes('dutchie') || url.includes('plus.dutchie') || url.includes('api.dutchie')) {
- // Check URL for retailer ID
- const urlMatch = url.match(/[\/=]([a-f0-9]{24})(?:[\/\?&]|$)/i);
- if (urlMatch && !capturedId) {
- capturedId = urlMatch[1];
- console.log(` Captured from URL: ${capturedId}`);
- }
- const postData = request.postData();
- if (postData) {
- // Look for retailerId in GraphQL variables
- const match = postData.match(/["']?retailerId["']?\s*:\s*["']([a-f0-9]{24})["']/i);
- if (match && !capturedId) {
- capturedId = match[1];
- console.log(` Captured retailerId: ${capturedId}`);
- }
- // Also look for dispensaryId
- const dispMatch = postData.match(/["']?dispensaryId["']?\s*:\s*["']([a-f0-9]{24})["']/i);
- if (dispMatch && !capturedId) {
- capturedId = dispMatch[1];
- console.log(` Captured dispensaryId: ${capturedId}`);
- }
- }
- }
- });
- try {
- console.log(`\nLoading ${dispensary.name}: ${dispensary.website}`);
- await page.goto(dispensary.website, { waitUntil: 'domcontentloaded', timeout: 30000 });
- // Wait for initial load
- await page.waitForTimeout(2000);
- // Check page content for retailerId
- const content = await page.content();
- // Try various patterns in page content
- const patterns = [
- /["']retailerId["']\s*:\s*["']([a-f0-9]{24})["']/i,
- /dispensaryId["']\s*:\s*["']([a-f0-9]{24})["']/i,
- /retailer["']?\s*:\s*["']([a-f0-9]{24})["']/i,
- /dutchie\.com\/embedded-menu\/([a-f0-9]{24})/i,
- /dutchie\.com\/dispensary\/([a-f0-9]{24})/i,
- /plus\.dutchie\.com\/plus\/([a-f0-9]{24})/i,
- /retailerId=([a-f0-9]{24})/i,
- ];
- for (const pattern of patterns) {
- const match = content.match(pattern);
- if (match && !capturedId) {
- capturedId = match[1];
- console.log(` Found in content: ${capturedId}`);
- break;
- }
- }
- // Check __NEXT_DATA__ if present
- if (!capturedId) {
- const nextData = await page.evaluate(() => {
- const el = document.getElementById('__NEXT_DATA__');
- return el?.textContent || null;
- });
- if (nextData) {
- for (const pattern of patterns) {
- const match = nextData.match(pattern);
- if (match) {
- capturedId = match[1];
- console.log(` Found in __NEXT_DATA__: ${capturedId}`);
- break;
- }
- }
- }
- }
- // Look for iframes that might contain dutchie embed
- if (!capturedId) {
- const iframes = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
- });
- for (const src of iframes) {
- if (src.includes('dutchie')) {
- const match = src.match(/([a-f0-9]{24})/i);
- if (match) {
- capturedId = match[1];
- console.log(` Found in iframe: ${capturedId}`);
- break;
- }
- }
- }
- }
- // If still not found, try clicking on "Shop" or "Menu" links
- if (!capturedId) {
- const menuSelectors = [
- 'a:has-text("Shop")',
- 'a:has-text("Menu")',
- 'a:has-text("Order")',
- 'a[href*="menu"]',
- 'a[href*="shop"]',
- 'a[href*="order"]',
- 'button:has-text("Shop")',
- 'button:has-text("Menu")',
- ];
- for (const selector of menuSelectors) {
- try {
- const element = page.locator(selector).first();
- const isVisible = await element.isVisible({ timeout: 500 });
- if (isVisible) {
- const href = await element.getAttribute('href');
- // If it's an internal link, click it
- if (href && !href.startsWith('http')) {
- console.log(` Clicking ${selector}...`);
- await element.click();
- await page.waitForTimeout(3000);
- // Check new page content
- const newContent = await page.content();
- for (const pattern of patterns) {
- const match = newContent.match(pattern);
- if (match && !capturedId) {
- capturedId = match[1];
- console.log(` Found after navigation: ${capturedId}`);
- break;
- }
- }
- // Check iframes on new page
- if (!capturedId) {
- const newIframes = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
- });
- for (const src of newIframes) {
- if (src.includes('dutchie')) {
- const match = src.match(/([a-f0-9]{24})/i);
- if (match) {
- capturedId = match[1];
- console.log(` Found in iframe after nav: ${capturedId}`);
- break;
- }
- }
- }
- }
- if (capturedId)
- break;
- }
- }
- }
- catch (e) {
- // Continue to next selector
- }
- }
- }
- // If still not found, wait longer for async dutchie widget to load
- if (!capturedId) {
- console.log(` Waiting for async content...`);
- await page.waitForTimeout(5000);
- // Check for dutchie script tags
- const scripts = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('script')).map(s => s.src || s.innerHTML?.substring(0, 500));
- });
- for (const script of scripts) {
- if (script && script.includes('dutchie')) {
- for (const pattern of patterns) {
- const match = script.match(pattern);
- if (match && !capturedId) {
- capturedId = match[1];
- console.log(` Found in script: ${capturedId}`);
- break;
- }
- }
- if (capturedId)
- break;
- }
- }
- // Final check of iframes after wait
- if (!capturedId) {
- const finalIframes = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
- });
- for (const src of finalIframes) {
- if (src.includes('dutchie')) {
- const match = src.match(/([a-f0-9]{24})/i);
- if (match) {
- capturedId = match[1];
- console.log(` Found in iframe (delayed): ${capturedId}`);
- break;
- }
- }
- }
- }
- }
- }
- catch (e) {
- console.log(` Error: ${e.message.substring(0, 80)}`);
- }
- finally {
- await context.close();
- }
- return capturedId;
-}
-async function main() {
- // Get dispensaries missing platform IDs
- const result = await pool.query(`
- SELECT id, name, website
- FROM dispensaries
- WHERE state = 'AZ'
- AND menu_type = 'dutchie'
- AND (platform_dispensary_id IS NULL OR platform_dispensary_id = '')
- AND website IS NOT NULL AND website != ''
- ORDER BY name
- `);
- console.log(`Found ${result.rows.length} dispensaries to process\n`);
- const browser = await playwright_1.chromium.launch({ headless: true });
- const results = [];
- for (const dispensary of result.rows) {
- const platformId = await extractPlatformId(browser, dispensary);
- results.push({ id: dispensary.id, name: dispensary.name, platformId });
- if (platformId) {
- // Update database
- await pool.query('UPDATE dispensaries SET platform_dispensary_id = $1 WHERE id = $2', [platformId, dispensary.id]);
- console.log(` Updated database with ${platformId}`);
- }
- }
- await browser.close();
- console.log('\n=== SUMMARY ===');
- const found = results.filter(r => r.platformId);
- const notFound = results.filter(r => !r.platformId);
- console.log(`\nFound (${found.length}):`);
- found.forEach(r => console.log(` ${r.id}: ${r.name} -> ${r.platformId}`));
- console.log(`\nNot Found (${notFound.length}):`);
- notFound.forEach(r => console.log(` ${r.id}: ${r.name}`));
- await pool.end();
-}
-main().catch(e => {
- console.error('Error:', e);
- process.exit(1);
-});
diff --git a/backend/dist/scripts/import-dispensaries.js b/backend/dist/scripts/import-dispensaries.js
deleted file mode 100644
index c4cc3a4f..00000000
--- a/backend/dist/scripts/import-dispensaries.js
+++ /dev/null
@@ -1,108 +0,0 @@
-"use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-const pg_1 = require("pg");
-const fs = __importStar(require("fs"));
-const pool = new pg_1.Pool({ connectionString: process.env.DATABASE_URL });
-async function importDispensaries(filePath) {
- const data = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
- console.log(`Importing ${data.length} dispensaries...`);
- let inserted = 0;
- let updated = 0;
- let errors = 0;
- for (const d of data) {
- try {
- // Check if dispensary exists by name and city
- const { rows: existing } = await pool.query(`SELECT id FROM dispensaries WHERE name = $1 AND city = $2`, [d.name, d.city]);
- if (existing.length > 0) {
- // Update existing
- await pool.query(`
- UPDATE dispensaries SET
- dba_name = COALESCE($1, dba_name),
- company_name = COALESCE($2, company_name),
- slug = COALESCE($3, slug),
- address = COALESCE($4, address),
- state = COALESCE($5, state),
- zip = COALESCE($6, zip),
- latitude = COALESCE($7, latitude),
- longitude = COALESCE($8, longitude),
- website = COALESCE($9, website),
- menu_type = COALESCE($10, menu_type),
- menu_url = COALESCE($11, menu_url),
- platform_dispensary_id = COALESCE($12, platform_dispensary_id),
- updated_at = NOW()
- WHERE id = $13
- `, [
- d.dba_name, d.company_name, d.slug,
- d.address, d.state, d.zip,
- d.latitude, d.longitude, d.website,
- d.menu_type, d.menu_url, d.platform_dispensary_id,
- existing[0].id
- ]);
- console.log(`Updated: [${existing[0].id}] ${d.name} (${d.city})`);
- updated++;
- }
- else {
- // Insert new
- const { rows: newRow } = await pool.query(`
- INSERT INTO dispensaries (
- name, dba_name, company_name, slug,
- address, city, state, zip, latitude, longitude,
- website, menu_type, menu_url, platform_dispensary_id,
- created_at, updated_at
- ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, NOW(), NOW())
- RETURNING id
- `, [
- d.name, d.dba_name, d.company_name, d.slug,
- d.address, d.city, d.state, d.zip, d.latitude, d.longitude,
- d.website, d.menu_type, d.menu_url, d.platform_dispensary_id
- ]);
- console.log(`Inserted: [${newRow[0].id}] ${d.name} (${d.city})`);
- inserted++;
- }
- }
- catch (err) {
- console.error(`Error for ${d.name}: ${err.message}`);
- errors++;
- }
- }
- console.log(`\n=== Import Summary ===`);
- console.log(`Inserted: ${inserted}`);
- console.log(`Updated: ${updated}`);
- console.log(`Errors: ${errors}`);
- await pool.end();
-}
-const filePath = process.argv[2] || '/tmp/dispensaries-export.json';
-importDispensaries(filePath).catch(console.error);
diff --git a/backend/dist/scripts/jars-az-extractor.js b/backend/dist/scripts/jars-az-extractor.js
deleted file mode 100644
index 2df24136..00000000
--- a/backend/dist/scripts/jars-az-extractor.js
+++ /dev/null
@@ -1,118 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const playwright_1 = require("playwright");
-async function extractJarsAzStoreIds() {
- const browser = await playwright_1.chromium.launch({ headless: true });
- const page = await browser.newPage();
- const results = [];
- const capturedIds = [];
- const allRequests = [];
- // Intercept network requests to find Dutchie Plus API calls
- page.on('request', (request) => {
- const url = request.url();
- allRequests.push(url.substring(0, 100));
- if (url.includes('dutchie') || url.includes('graphql')) {
- const postData = request.postData();
- console.log('Dutchie request to:', url.substring(0, 80));
- if (postData) {
- // Look for retailerId in GraphQL variables
- const match = postData.match(/"retailerId"\s*:\s*"([a-f0-9-]{36})"/i);
- if (match) {
- const id = match[1];
- if (capturedIds.indexOf(id) === -1) {
- capturedIds.push(id);
- console.log('Captured retailerId from request:', id);
- }
- }
- }
- }
- });
- try {
- // Just load one page first and thoroughly debug it
- console.log('Loading Mesa store with full network debugging...');
- await page.goto('https://jarscannabis.com/shop/mesa-az/', {
- waitUntil: 'networkidle',
- timeout: 60000
- });
- console.log('\nWaiting 5 seconds for dynamic content...');
- await page.waitForTimeout(5000);
- // Get page title and content
- const title = await page.title();
- console.log('Page title:', title);
- const content = await page.content();
- console.log('Page content length:', content.length);
- // Save screenshot
- await page.screenshot({ path: '/tmp/jars-mesa-debug.png', fullPage: true });
- console.log('Screenshot saved to /tmp/jars-mesa-debug.png');
- // Look for all UUIDs in content
- const uuidPattern = /[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}/gi;
- const uuids = content.match(uuidPattern);
- if (uuids) {
- const uniqueUuids = [...new Set(uuids)];
- console.log('\n=== All UUIDs found on page ===');
- uniqueUuids.forEach(u => console.log(u));
- }
- // Look for all iframes
- const iframes = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('iframe')).map(f => ({
- src: f.src,
- id: f.id,
- name: f.name,
- className: f.className
- }));
- });
- console.log('\n=== Iframes ===');
- console.log(JSON.stringify(iframes, null, 2));
- // Look for any elements with dutchie
- const dutchieElements = await page.evaluate(() => {
- const elements = document.body.innerHTML.match(/dutchie[^<>]*\"/gi) || [];
- return elements.slice(0, 20);
- });
- console.log('\n=== Dutchie mentions ===');
- dutchieElements.forEach(e => console.log(e));
- // Look for script src containing dutchie
- const scripts = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('script[src]'))
- .map(s => s.getAttribute('src'))
- .filter(src => src && (src.includes('dutchie') || src.includes('embed')));
- });
- console.log('\n=== Relevant scripts ===');
- scripts.forEach(s => console.log(s));
- // Look for __NEXT_DATA__
- const nextData = await page.evaluate(() => {
- const el = document.getElementById('__NEXT_DATA__');
- return el ? el.textContent : null;
- });
- if (nextData) {
- console.log('\n=== __NEXT_DATA__ found ===');
- const data = JSON.parse(nextData);
- // Look for retailer in various places
- const propsStr = JSON.stringify(data, null, 2);
- // Find all UUID patterns in the props
- const propsUuids = propsStr.match(/[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}/gi);
- if (propsUuids) {
- console.log('UUIDs in __NEXT_DATA__:', [...new Set(propsUuids)]);
- }
- }
- else {
- console.log('\nNo __NEXT_DATA__ found');
- }
- // Look for specific Dutchie embed patterns
- const embedPatterns = content.match(/https:\/\/[^"'\s]*dutchie[^"'\s]*/gi);
- if (embedPatterns) {
- console.log('\n=== Dutchie embed URLs ===');
- [...new Set(embedPatterns)].forEach(u => console.log(u));
- }
- console.log('\n=== Network requests summary ===');
- console.log('Total requests:', allRequests.length);
- const dutchieRequests = allRequests.filter(r => r.includes('dutchie'));
- console.log('Dutchie requests:', dutchieRequests.length);
- dutchieRequests.forEach(r => console.log(r));
- console.log('\n=== CAPTURED IDS ===');
- console.log(capturedIds);
- }
- finally {
- await browser.close();
- }
-}
-extractJarsAzStoreIds().catch(e => console.error('Error:', e.message));
diff --git a/backend/dist/scripts/jars-az-finder.js b/backend/dist/scripts/jars-az-finder.js
deleted file mode 100644
index 625d2405..00000000
--- a/backend/dist/scripts/jars-az-finder.js
+++ /dev/null
@@ -1,177 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-const playwright_1 = require("playwright");
-async function findJarsAzStores() {
- const browser = await playwright_1.chromium.launch({ headless: true });
- const page = await browser.newPage();
- const capturedRetailerIds = [];
- const allApiCalls = [];
- // Intercept ALL requests to find retailer IDs
- page.on('request', (request) => {
- const url = request.url();
- // Log Buddy API calls
- if (url.includes('buddyapi') || url.includes('dutchie') || url.includes('graphql')) {
- allApiCalls.push(url);
- const postData = request.postData();
- if (postData) {
- // Look for retailerId in various formats
- const match = postData.match(/retailerId['":\s]+([a-f0-9-]{36})/i);
- if (match) {
- capturedRetailerIds.push({ url, retailerId: match[1] });
- }
- }
- // Also check URL params
- const urlMatch = url.match(/retailerId=([a-f0-9-]{36})/i);
- if (urlMatch) {
- capturedRetailerIds.push({ url, retailerId: urlMatch[1] });
- }
- }
- });
- try {
- // First, let's try to find the actual Arizona menu URLs
- console.log('Loading JARS find-a-dispensary page...');
- await page.goto('https://jarscannabis.com/find-a-dispensary', {
- waitUntil: 'networkidle',
- timeout: 30000
- });
- await page.waitForTimeout(3000);
- // Take screenshot
- await page.screenshot({ path: '/tmp/jars-find-dispensary.png', fullPage: true });
- console.log('Screenshot saved to /tmp/jars-find-dispensary.png');
- // Try to find state selector and click Arizona
- console.log('\nLooking for state selector...');
- // Try various ways to select Arizona
- const stateSelectors = [
- 'select[name*="state"]',
- '[class*="state"] select',
- 'select option[value="AZ"]',
- 'button:has-text("Arizona")',
- 'a:has-text("Arizona")',
- '[data-state="AZ"]',
- 'div:has-text("Arizona")',
- ];
- for (const selector of stateSelectors) {
- try {
- const element = page.locator(selector).first();
- const isVisible = await element.isVisible({ timeout: 1000 });
- if (isVisible) {
- console.log(`Found element with selector: ${selector}`);
- await element.click();
- await page.waitForTimeout(2000);
- }
- }
- catch (e) {
- // Continue to next selector
- }
- }
- // Get all links on the page
- const links = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('a')).map(a => ({
- href: a.href,
- text: a.textContent?.trim()
- })).filter(l => l.href.includes('/shop') || l.href.includes('menu') || l.href.includes('arizona') || l.href.includes('-az'));
- });
- console.log('\n=== Shop/Menu Links Found ===');
- links.forEach(l => console.log(`${l.text}: ${l.href}`));
- // Look for __NEXT_DATA__ which might have location data
- const nextData = await page.evaluate(() => {
- const el = document.getElementById('__NEXT_DATA__');
- return el?.textContent || null;
- });
- if (nextData) {
- console.log('\n=== Analyzing __NEXT_DATA__ ===');
- const data = JSON.parse(nextData);
- const dataStr = JSON.stringify(data);
- // Look for Arizona references
- if (dataStr.includes('Arizona') || dataStr.includes('AZ')) {
- console.log('Found Arizona references in __NEXT_DATA__');
- // Extract all objects that might be Arizona stores
- const findArizonaStores = (obj, path = '') => {
- const results = [];
- if (!obj || typeof obj !== 'object')
- return results;
- if (Array.isArray(obj)) {
- obj.forEach((item, i) => {
- results.push(...findArizonaStores(item, `${path}[${i}]`));
- });
- }
- else {
- // Check if this object looks like an AZ store
- if (obj.state === 'AZ' || obj.state === 'Arizona' ||
- obj.stateCode === 'AZ' || obj.region === 'Arizona' ||
- (obj.city && ['Mesa', 'Phoenix', 'Peoria', 'Payson', 'Globe', 'Safford', 'Somerton', 'Prescott Valley'].includes(obj.city))) {
- results.push({ path, data: obj });
- }
- for (const key of Object.keys(obj)) {
- results.push(...findArizonaStores(obj[key], `${path}.${key}`));
- }
- }
- return results;
- };
- const azStores = findArizonaStores(data);
- console.log(`Found ${azStores.length} Arizona store objects`);
- azStores.forEach(s => {
- console.log('\n---');
- console.log('Path:', s.path);
- console.log(JSON.stringify(s.data, null, 2));
- });
- }
- // Also look for retailer IDs
- const retailerMatches = dataStr.match(/"retailerId"\s*:\s*"([a-f0-9-]{36})"/gi);
- if (retailerMatches) {
- console.log('\n=== RetailerIds in __NEXT_DATA__ ===');
- const uniqueIds = [...new Set(retailerMatches.map(m => {
- const match = m.match(/([a-f0-9-]{36})/i);
- return match ? match[1] : null;
- }).filter(Boolean))];
- uniqueIds.forEach(id => console.log(id));
- }
- }
- // Try loading a known store URL pattern
- const testUrls = [
- 'https://jarscannabis.com/arizona/',
- 'https://jarscannabis.com/az/',
- 'https://jarscannabis.com/stores/arizona/',
- 'https://jarscannabis.com/locations/arizona/',
- 'https://jarscannabis.com/shop/arizona/',
- 'https://az.jarscannabis.com/',
- ];
- console.log('\n=== Testing Arizona URLs ===');
- for (const testUrl of testUrls) {
- try {
- const response = await page.goto(testUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
- const status = response?.status();
- console.log(`${testUrl}: ${status}`);
- if (status === 200) {
- const title = await page.title();
- console.log(` Title: ${title}`);
- // If we found a working page, extract store links
- const storeLinks = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('a')).map(a => ({
- href: a.href,
- text: a.textContent?.trim()
- })).filter(l => l.href.includes('shop') || l.href.includes('menu'));
- });
- if (storeLinks.length > 0) {
- console.log(' Store links:');
- storeLinks.forEach(l => console.log(` ${l.text}: ${l.href}`));
- }
- }
- }
- catch (e) {
- console.log(`${testUrl}: Error - ${e.message.substring(0, 50)}`);
- }
- }
- console.log('\n=== Captured Retailer IDs from API calls ===');
- const uniqueRetailerIds = [...new Map(capturedRetailerIds.map(r => [r.retailerId, r])).values()];
- uniqueRetailerIds.forEach(r => {
- console.log(`${r.retailerId} (from: ${r.url.substring(0, 60)}...)`);
- });
- console.log('\n=== All API calls ===');
- allApiCalls.forEach(url => console.log(url.substring(0, 100)));
- }
- finally {
- await browser.close();
- }
-}
-findJarsAzStores().catch(e => console.error('Error:', e.message));
diff --git a/backend/dist/scripts/parallel-scrape.js b/backend/dist/scripts/parallel-scrape.js
deleted file mode 100644
index a13dff89..00000000
--- a/backend/dist/scripts/parallel-scrape.js
+++ /dev/null
@@ -1,181 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-const migrate_1 = require("../db/migrate");
-const proxy_1 = require("../services/proxy");
-const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
-const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
-puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
-const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0';
-const NUM_WORKERS = parseInt(process.argv[2] || '15');
-const DISPENSARY_NAME = process.argv[3] || 'Deeply Rooted';
-const USE_PROXIES = process.argv[4] !== 'no-proxy';
-async function getStore(name) {
- const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${name}%`]);
- return result.rows[0] || null;
-}
-async function getCategories(storeId) {
- const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [storeId]);
- return result.rows;
-}
-async function scrapeWithProxy(workerId, store, category) {
- let browser = null;
- let proxyId = null;
- try {
- // Get a proxy (if enabled)
- let proxy = null;
- if (USE_PROXIES) {
- proxy = await (0, proxy_1.getActiveProxy)();
- if (proxy) {
- proxyId = proxy.id;
- console.log(`[Worker ${workerId}] Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
- }
- else {
- console.log(`[Worker ${workerId}] No proxy available, using direct connection`);
- }
- }
- else {
- console.log(`[Worker ${workerId}] Direct connection (proxies disabled)`);
- }
- // Build browser args
- const args = [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-accelerated-2d-canvas',
- '--disable-gpu',
- '--window-size=1920,1080',
- ];
- if (proxy) {
- if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') {
- args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
- }
- else {
- args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
- }
- }
- browser = await puppeteer_extra_1.default.launch({
- headless: true,
- args,
- executablePath: process.env.PUPPETEER_EXECUTABLE_PATH,
- });
- const page = await browser.newPage();
- await page.setUserAgent(FIREFOX_USER_AGENT);
- await page.setViewport({ width: 1920, height: 1080 });
- // Handle proxy auth if needed
- if (proxy?.username && proxy?.password) {
- await page.authenticate({
- username: proxy.username,
- password: proxy.password,
- });
- }
- console.log(`[Worker ${workerId}] Scraping category: ${category.name} (${category.url})`);
- // Navigate to the category page
- const response = await page.goto(category.url, {
- waitUntil: 'networkidle2',
- timeout: 60000,
- });
- if (!response || !response.ok()) {
- throw new Error(`Failed to load page: ${response?.status()}`);
- }
- // Wait for products to load
- await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
- timeout: 30000,
- }).catch(() => {
- console.log(`[Worker ${workerId}] No products found on page`);
- });
- // Extract products
- const products = await page.evaluate(() => {
- // Try data-testid first, then fall back to product links
- const listItems = document.querySelectorAll('[data-testid="product-list-item"]');
- if (listItems.length > 0)
- return listItems.length;
- return document.querySelectorAll('a[href*="/product/"]').length;
- });
- console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`);
- await browser.close();
- return { success: true, products };
- }
- catch (error) {
- console.error(`[Worker ${workerId}] Error:`, error.message);
- // Check for bot detection
- if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) {
- (0, proxy_1.putProxyInTimeout)(proxyId, error.message);
- }
- if (browser) {
- await browser.close().catch(() => { });
- }
- return { success: false, products: 0, error: error.message };
- }
-}
-async function worker(workerId, store, categories, categoryIndex) {
- while (categoryIndex.current < categories.length) {
- const idx = categoryIndex.current++;
- const category = categories[idx];
- if (!category)
- break;
- console.log(`[Worker ${workerId}] Starting category ${idx + 1}/${categories.length}: ${category.name}`);
- const result = await scrapeWithProxy(workerId, store, category);
- if (result.success) {
- console.log(`[Worker ${workerId}] Completed ${category.name}: ${result.products} products`);
- }
- else {
- console.log(`[Worker ${workerId}] Failed ${category.name}: ${result.error}`);
- }
- // Small delay between requests
- await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000));
- }
- console.log(`[Worker ${workerId}] Finished all assigned work`);
-}
-async function main() {
- console.log(`\n${'='.repeat(60)}`);
- console.log(`Parallel Scraper - ${NUM_WORKERS} workers`);
- console.log(`Target: ${DISPENSARY_NAME}`);
- console.log(`User Agent: Firefox`);
- console.log(`Proxies: ${USE_PROXIES ? 'Enabled' : 'Disabled'}`);
- console.log(`${'='.repeat(60)}\n`);
- // Find the store
- const store = await getStore(DISPENSARY_NAME);
- if (!store) {
- console.error(`Store not found: ${DISPENSARY_NAME}`);
- process.exit(1);
- }
- console.log(`Found store: ${store.name} (ID: ${store.id})`);
- // Get categories
- const categories = await getCategories(store.id);
- if (categories.length === 0) {
- console.error('No categories found for this store');
- process.exit(1);
- }
- console.log(`Found ${categories.length} categories to scrape`);
- console.log(`Categories: ${categories.map(c => c.name).join(', ')}\n`);
- // Check proxies
- const proxyResult = await migrate_1.pool.query('SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE active = true) as active FROM proxies');
- console.log(`Proxies: ${proxyResult.rows[0].active} active / ${proxyResult.rows[0].total} total\n`);
- // Shared index for work distribution
- const categoryIndex = { current: 0 };
- // For a store with few categories, we'll run multiple passes
- // Expand the work by duplicating categories for parallel workers
- const expandedCategories = [];
- const passes = Math.ceil(NUM_WORKERS / Math.max(categories.length, 1));
- for (let i = 0; i < passes; i++) {
- expandedCategories.push(...categories);
- }
- console.log(`Running ${NUM_WORKERS} workers across ${expandedCategories.length} category scrapes\n`);
- // Start workers
- const workers = [];
- for (let i = 0; i < NUM_WORKERS; i++) {
- workers.push(worker(i + 1, store, expandedCategories, categoryIndex));
- // Stagger worker starts
- await new Promise(resolve => setTimeout(resolve, 500));
- }
- // Wait for all workers
- await Promise.all(workers);
- console.log(`\n${'='.repeat(60)}`);
- console.log('All workers completed!');
- console.log(`${'='.repeat(60)}\n`);
- await migrate_1.pool.end();
-}
-main().catch(console.error);
diff --git a/backend/dist/scripts/platform-id-extractor.js b/backend/dist/scripts/platform-id-extractor.js
deleted file mode 100644
index 9584d975..00000000
--- a/backend/dist/scripts/platform-id-extractor.js
+++ /dev/null
@@ -1,301 +0,0 @@
-"use strict";
-/**
- * Platform ID Extractor - Standalone script for extracting Dutchie platform IDs
- *
- * This script visits dispensary websites to capture their Dutchie retailerId
- * by intercepting network requests to the Dutchie GraphQL API.
- *
- * It does NOT use the main orchestrator - it's a standalone browser-based tool.
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-const playwright_1 = require("playwright");
-const pg_1 = require("pg");
-const pool = new pg_1.Pool({
- connectionString: process.env.DATABASE_URL
-});
-async function extractPlatformId(browser, dispensary) {
- let capturedId = null;
- let captureSource = null;
- let errorMsg = null;
- const context = await browser.newContext({
- userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
- });
- const page = await context.newPage();
- // Patterns to match retailer IDs in various formats
- const idPatterns = [
- /["']retailerId["']\s*:\s*["']([a-f0-9]{24})["']/i,
- /["']dispensaryId["']\s*:\s*["']([a-f0-9]{24})["']/i,
- /retailer["']?\s*:\s*["']([a-f0-9]{24})["']/i,
- /dutchie\.com\/embedded-menu\/([a-f0-9]{24})/i,
- /dutchie\.com\/dispensary\/([a-f0-9]{24})/i,
- /plus\.dutchie\.com\/plus\/([a-f0-9]{24})/i,
- /retailerId=([a-f0-9]{24})/i,
- /\/([a-f0-9]{24})(?:\/|\?|$)/i, // Generic ID in URL path
- ];
- // Intercept network requests
- page.on('request', (request) => {
- if (capturedId)
- return;
- const url = request.url();
- if (url.includes('dutchie') || url.includes('api.dutchie')) {
- // Check URL for retailer ID
- for (const pattern of idPatterns) {
- const match = url.match(pattern);
- if (match && match[1] && match[1].length === 24) {
- capturedId = match[1];
- captureSource = 'request_url';
- break;
- }
- }
- // Check POST data
- const postData = request.postData();
- if (postData && !capturedId) {
- for (const pattern of idPatterns) {
- const match = postData.match(pattern);
- if (match && match[1] && match[1].length === 24) {
- capturedId = match[1];
- captureSource = 'request_body';
- break;
- }
- }
- }
- }
- });
- try {
- console.log(`\n[${dispensary.id}] ${dispensary.name}: ${dispensary.website}`);
- // Load main page
- await page.goto(dispensary.website, {
- waitUntil: 'domcontentloaded',
- timeout: 25000
- });
- await page.waitForTimeout(2000);
- // Check page content
- if (!capturedId) {
- const content = await page.content();
- for (const pattern of idPatterns) {
- const match = content.match(pattern);
- if (match && match[1] && match[1].length === 24) {
- capturedId = match[1];
- captureSource = 'page_content';
- break;
- }
- }
- }
- // Check __NEXT_DATA__
- if (!capturedId) {
- const nextData = await page.evaluate(() => {
- const el = document.getElementById('__NEXT_DATA__');
- return el?.textContent || null;
- });
- if (nextData) {
- for (const pattern of idPatterns) {
- const match = nextData.match(pattern);
- if (match && match[1] && match[1].length === 24) {
- capturedId = match[1];
- captureSource = '__NEXT_DATA__';
- break;
- }
- }
- }
- }
- // Check iframes
- if (!capturedId) {
- const iframes = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
- });
- for (const src of iframes) {
- if (src.includes('dutchie')) {
- const match = src.match(/([a-f0-9]{24})/i);
- if (match) {
- capturedId = match[1];
- captureSource = 'iframe_src';
- break;
- }
- }
- }
- }
- // Check scripts
- if (!capturedId) {
- const scripts = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('script'))
- .map(s => s.src || s.innerHTML?.substring(0, 1000))
- .filter(Boolean);
- });
- for (const script of scripts) {
- if (script && (script.includes('dutchie') || script.includes('retailerId'))) {
- for (const pattern of idPatterns) {
- const match = script.match(pattern);
- if (match && match[1] && match[1].length === 24) {
- capturedId = match[1];
- captureSource = 'script';
- break;
- }
- }
- if (capturedId)
- break;
- }
- }
- }
- // Try navigating to menu/shop page
- if (!capturedId) {
- const menuLink = await page.evaluate(() => {
- const links = Array.from(document.querySelectorAll('a'));
- for (const link of links) {
- const href = link.href?.toLowerCase() || '';
- const text = link.textContent?.toLowerCase() || '';
- if (href.includes('menu') || href.includes('shop') || href.includes('order') ||
- text.includes('menu') || text.includes('shop') || text.includes('order')) {
- return link.href;
- }
- }
- return null;
- });
- if (menuLink && !menuLink.startsWith('javascript:')) {
- try {
- console.log(` -> Following menu link: ${menuLink.substring(0, 60)}...`);
- await page.goto(menuLink, { waitUntil: 'domcontentloaded', timeout: 20000 });
- await page.waitForTimeout(3000);
- // Recheck all sources on new page
- const newContent = await page.content();
- for (const pattern of idPatterns) {
- const match = newContent.match(pattern);
- if (match && match[1] && match[1].length === 24) {
- capturedId = match[1];
- captureSource = 'menu_page_content';
- break;
- }
- }
- // Check iframes on new page
- if (!capturedId) {
- const newIframes = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
- });
- for (const src of newIframes) {
- if (src.includes('dutchie')) {
- const match = src.match(/([a-f0-9]{24})/i);
- if (match) {
- capturedId = match[1];
- captureSource = 'menu_page_iframe';
- break;
- }
- }
- }
- }
- }
- catch (navError) {
- // Menu navigation failed, continue
- }
- }
- }
- // Final wait for async content
- if (!capturedId) {
- await page.waitForTimeout(3000);
- // Final iframe check
- const finalIframes = await page.evaluate(() => {
- return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
- });
- for (const src of finalIframes) {
- if (src.includes('dutchie')) {
- const match = src.match(/([a-f0-9]{24})/i);
- if (match) {
- capturedId = match[1];
- captureSource = 'delayed_iframe';
- break;
- }
- }
- }
- }
- if (capturedId) {
- console.log(` ✓ Found: ${capturedId} (${captureSource})`);
- }
- else {
- console.log(` ✗ Not found`);
- }
- }
- catch (e) {
- errorMsg = e.message.substring(0, 100);
- console.log(` ✗ Error: ${errorMsg}`);
- }
- finally {
- await context.close();
- }
- return {
- id: dispensary.id,
- name: dispensary.name,
- website: dispensary.website,
- platformId: capturedId,
- source: captureSource,
- error: errorMsg
- };
-}
-async function main() {
- // Get specific dispensary ID from command line, or process all missing
- const targetId = process.argv[2] ? parseInt(process.argv[2], 10) : null;
- let query;
- let params = [];
- if (targetId) {
- query = `
- SELECT id, name, website
- FROM dispensaries
- WHERE id = $1
- AND website IS NOT NULL AND website != ''
- `;
- params = [targetId];
- }
- else {
- query = `
- SELECT id, name, website
- FROM dispensaries
- WHERE state = 'AZ'
- AND menu_type = 'dutchie'
- AND (platform_dispensary_id IS NULL OR platform_dispensary_id = '')
- AND website IS NOT NULL AND website != ''
- ORDER BY name
- `;
- }
- const result = await pool.query(query, params);
- if (result.rows.length === 0) {
- console.log('No dispensaries to process');
- await pool.end();
- return;
- }
- console.log(`\n=== Platform ID Extractor ===`);
- console.log(`Processing ${result.rows.length} dispensaries...\n`);
- const browser = await playwright_1.chromium.launch({
- headless: true,
- args: ['--no-sandbox', '--disable-setuid-sandbox']
- });
- const results = [];
- for (const dispensary of result.rows) {
- const extractionResult = await extractPlatformId(browser, dispensary);
- results.push(extractionResult);
- // Update database immediately if found
- if (extractionResult.platformId) {
- await pool.query('UPDATE dispensaries SET platform_dispensary_id = $1 WHERE id = $2', [extractionResult.platformId, extractionResult.id]);
- }
- }
- await browser.close();
- // Summary
- console.log('\n' + '='.repeat(60));
- console.log('SUMMARY');
- console.log('='.repeat(60));
- const found = results.filter(r => r.platformId);
- const notFound = results.filter(r => !r.platformId);
- console.log(`\nFound: ${found.length}/${results.length}`);
- if (found.length > 0) {
- console.log('\nSuccessful extractions:');
- found.forEach(r => console.log(` [${r.id}] ${r.name} -> ${r.platformId} (${r.source})`));
- }
- if (notFound.length > 0) {
- console.log(`\nNot found: ${notFound.length}`);
- notFound.forEach(r => {
- const reason = r.error || 'No Dutchie ID detected';
- console.log(` [${r.id}] ${r.name}: ${reason}`);
- });
- }
- await pool.end();
-}
-main().catch(e => {
- console.error('Fatal error:', e);
- process.exit(1);
-});
diff --git a/backend/dist/scripts/queue-dispensaries.js b/backend/dist/scripts/queue-dispensaries.js
deleted file mode 100644
index 4dc7f5b8..00000000
--- a/backend/dist/scripts/queue-dispensaries.js
+++ /dev/null
@@ -1,344 +0,0 @@
-#!/usr/bin/env npx tsx
-"use strict";
-/**
- * Queue Dispensaries Script
- *
- * Orchestrates the multi-provider crawler system:
- * 1. Queue dispensaries that need provider detection
- * 2. Queue Dutchie dispensaries for production crawl
- * 3. Queue sandbox dispensaries for learning crawls
- *
- * Usage:
- * npx tsx src/scripts/queue-dispensaries.ts [--detection] [--production] [--sandbox] [--all]
- * npx tsx src/scripts/queue-dispensaries.ts --dry-run
- * npx tsx src/scripts/queue-dispensaries.ts --process # Process queued jobs
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-const migrate_1 = require("../db/migrate");
-const crawler_jobs_1 = require("../services/crawler-jobs");
-// Parse command line args
-const args = process.argv.slice(2);
-const flags = {
- detection: args.includes('--detection') || args.includes('--all'),
- production: args.includes('--production') || args.includes('--all'),
- sandbox: args.includes('--sandbox') || args.includes('--all'),
- dryRun: args.includes('--dry-run'),
- process: args.includes('--process'),
- help: args.includes('--help') || args.includes('-h'),
- limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
-};
-// If no specific flags, default to all
-if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
- flags.detection = true;
- flags.production = true;
- flags.sandbox = true;
-}
-async function showHelp() {
- console.log(`
-Queue Dispensaries - Multi-Provider Crawler Orchestration
-
-USAGE:
- npx tsx src/scripts/queue-dispensaries.ts [OPTIONS]
-
-OPTIONS:
- --detection Queue dispensaries that need provider detection
- --production Queue Dutchie production crawls
- --sandbox Queue sandbox/learning crawls
- --all Queue all job types (default if no specific flag)
- --process Process queued jobs instead of just queuing
- --dry-run Show what would be queued without making changes
- --limit=N Maximum dispensaries to queue per type (default: 10)
- --help, -h Show this help message
-
-EXAMPLES:
- # Queue all dispensaries for appropriate jobs
- npx tsx src/scripts/queue-dispensaries.ts
-
- # Only queue detection jobs
- npx tsx src/scripts/queue-dispensaries.ts --detection --limit=20
-
- # Dry run to see what would be queued
- npx tsx src/scripts/queue-dispensaries.ts --dry-run
-
- # Process sandbox jobs
- npx tsx src/scripts/queue-dispensaries.ts --process
-`);
-}
-async function queueDetectionJobs() {
- console.log('\n📡 Queueing Detection Jobs...');
- // Find dispensaries that need provider detection:
- // - menu_provider is null OR
- // - menu_provider_confidence < 70 AND
- // - crawler_status is idle (not already queued/running)
- // - has a website URL
- const query = `
- SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence
- FROM dispensaries
- WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
- AND crawler_status = 'idle'
- AND (menu_provider IS NULL OR menu_provider_confidence < 70)
- ORDER BY
- CASE WHEN menu_provider IS NULL THEN 0 ELSE 1 END,
- menu_provider_confidence ASC
- LIMIT $1
- `;
- const result = await migrate_1.pool.query(query, [flags.limit]);
- if (flags.dryRun) {
- console.log(` Would queue ${result.rows.length} dispensaries for detection:`);
- for (const row of result.rows) {
- console.log(` - [${row.id}] ${row.name} (current: ${row.menu_provider || 'unknown'}, confidence: ${row.menu_provider_confidence}%)`);
- }
- return result.rows.length;
- }
- let queued = 0;
- for (const dispensary of result.rows) {
- try {
- // Update status to queued
- await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`, [dispensary.id]);
- // Create sandbox job for detection
- await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
- VALUES ($1, 'detection', 'pending', 10)`, [dispensary.id]);
- console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
- queued++;
- }
- catch (error) {
- console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
- }
- }
- return queued;
-}
-async function queueProductionCrawls() {
- console.log('\n🏭 Queueing Production Dutchie Crawls...');
- // Find Dutchie dispensaries ready for production crawl:
- // - menu_provider = 'dutchie'
- // - crawler_mode = 'production'
- // - crawler_status is idle
- // - last_menu_scrape is old or null
- const query = `
- SELECT d.id, d.name, d.last_menu_scrape, d.menu_url
- FROM dispensaries d
- WHERE d.menu_provider = 'dutchie'
- AND d.crawler_mode = 'production'
- AND d.crawler_status = 'idle'
- AND (d.last_menu_scrape IS NULL OR d.last_menu_scrape < NOW() - INTERVAL '4 hours')
- ORDER BY
- CASE WHEN d.last_menu_scrape IS NULL THEN 0 ELSE 1 END,
- d.last_menu_scrape ASC
- LIMIT $1
- `;
- const result = await migrate_1.pool.query(query, [flags.limit]);
- if (flags.dryRun) {
- console.log(` Would queue ${result.rows.length} Dutchie dispensaries for production crawl:`);
- for (const row of result.rows) {
- const lastScrape = row.last_menu_scrape ? new Date(row.last_menu_scrape).toISOString() : 'never';
- console.log(` - [${row.id}] ${row.name} (last scrape: ${lastScrape})`);
- }
- return result.rows.length;
- }
- let queued = 0;
- for (const dispensary of result.rows) {
- try {
- // Update status to queued
- await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`, [dispensary.id]);
- // Create crawl job in the main crawl_jobs table (production queue)
- await migrate_1.pool.query(`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
- SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
- jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries')
- FROM stores s
- JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
- WHERE d.id = $1
- LIMIT 1`, [dispensary.id]);
- console.log(` ✓ Queued production crawl: [${dispensary.id}] ${dispensary.name}`);
- queued++;
- }
- catch (error) {
- console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
- }
- }
- return queued;
-}
-async function queueSandboxCrawls() {
- console.log('\n🧪 Queueing Sandbox Crawls...');
- // Find sandbox dispensaries needing crawls:
- // - crawler_mode = 'sandbox'
- // - crawler_status in (idle, error_needs_review)
- // - No recent sandbox job
- const query = `
- SELECT d.id, d.name, d.menu_provider, d.crawler_status, d.website
- FROM dispensaries d
- WHERE d.crawler_mode = 'sandbox'
- AND d.crawler_status IN ('idle', 'error_needs_review')
- AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
- AND NOT EXISTS (
- SELECT 1 FROM sandbox_crawl_jobs sj
- WHERE sj.dispensary_id = d.id
- AND sj.status IN ('pending', 'running')
- )
- ORDER BY d.updated_at ASC
- LIMIT $1
- `;
- const result = await migrate_1.pool.query(query, [flags.limit]);
- if (flags.dryRun) {
- console.log(` Would queue ${result.rows.length} dispensaries for sandbox crawl:`);
- for (const row of result.rows) {
- console.log(` - [${row.id}] ${row.name} (provider: ${row.menu_provider || 'unknown'}, status: ${row.crawler_status})`);
- }
- return result.rows.length;
- }
- let queued = 0;
- for (const dispensary of result.rows) {
- try {
- // Update status
- await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`, [dispensary.id]);
- // Create sandbox job
- await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
- VALUES ($1, 'deep_crawl', 'pending', 5)`, [dispensary.id]);
- console.log(` ✓ Queued sandbox crawl: [${dispensary.id}] ${dispensary.name}`);
- queued++;
- }
- catch (error) {
- console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
- }
- }
- return queued;
-}
-async function processJobs() {
- console.log('\n⚙️ Processing Queued Jobs...\n');
- // Process sandbox jobs (detection + sandbox crawls)
- const sandboxJobs = await migrate_1.pool.query(`SELECT * FROM sandbox_crawl_jobs
- WHERE status = 'pending'
- ORDER BY priority DESC, scheduled_at ASC
- LIMIT $1`, [flags.limit]);
- console.log(`Found ${sandboxJobs.rows.length} pending sandbox jobs\n`);
- for (const job of sandboxJobs.rows) {
- console.log(`Processing job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}...`);
- try {
- // Mark as running
- await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW() WHERE id = $1`, [job.id]);
- let result;
- if (job.job_type === 'detection') {
- result = await (0, crawler_jobs_1.runDetectMenuProviderJob)(job.dispensary_id);
- }
- else {
- result = await (0, crawler_jobs_1.runSandboxCrawlJob)(job.dispensary_id, job.sandbox_id);
- }
- // Update job status
- await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
- SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
- WHERE id = $4`, [
- result.success ? 'completed' : 'failed',
- JSON.stringify(result.data || {}),
- result.success ? null : result.message,
- job.id,
- ]);
- console.log(` ${result.success ? '✓' : '✗'} ${result.message}\n`);
- }
- catch (error) {
- await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]);
- console.log(` ✗ Error: ${error.message}\n`);
- }
- }
-}
-async function showStats() {
- console.log('\n📊 Current Stats:');
- // Dispensary stats
- const stats = await migrate_1.pool.query(`
- SELECT
- COUNT(*) as total,
- COUNT(*) FILTER (WHERE menu_provider IS NULL) as no_provider,
- COUNT(*) FILTER (WHERE menu_provider = 'dutchie') as dutchie,
- COUNT(*) FILTER (WHERE menu_provider NOT IN ('dutchie', 'unknown') AND menu_provider IS NOT NULL) as other_providers,
- COUNT(*) FILTER (WHERE menu_provider = 'unknown') as unknown,
- COUNT(*) FILTER (WHERE crawler_mode = 'production') as production_mode,
- COUNT(*) FILTER (WHERE crawler_mode = 'sandbox') as sandbox_mode,
- COUNT(*) FILTER (WHERE crawler_status = 'idle') as idle,
- COUNT(*) FILTER (WHERE crawler_status LIKE 'queued%') as queued,
- COUNT(*) FILTER (WHERE crawler_status = 'running') as running,
- COUNT(*) FILTER (WHERE crawler_status = 'ok') as ok,
- COUNT(*) FILTER (WHERE crawler_status = 'error_needs_review') as needs_review
- FROM dispensaries
- `);
- const s = stats.rows[0];
- console.log(`
- Dispensaries: ${s.total}
- - No provider detected: ${s.no_provider}
- - Dutchie: ${s.dutchie}
- - Other providers: ${s.other_providers}
- - Unknown: ${s.unknown}
-
- Crawler Mode:
- - Production: ${s.production_mode}
- - Sandbox: ${s.sandbox_mode}
-
- Status:
- - Idle: ${s.idle}
- - Queued: ${s.queued}
- - Running: ${s.running}
- - OK: ${s.ok}
- - Needs Review: ${s.needs_review}
-`);
- // Job stats
- const jobStats = await migrate_1.pool.query(`
- SELECT
- COUNT(*) FILTER (WHERE status = 'pending') as pending,
- COUNT(*) FILTER (WHERE status = 'running') as running,
- COUNT(*) FILTER (WHERE status = 'completed') as completed,
- COUNT(*) FILTER (WHERE status = 'failed') as failed
- FROM sandbox_crawl_jobs
- `);
- const j = jobStats.rows[0];
- console.log(` Sandbox Jobs:
- - Pending: ${j.pending}
- - Running: ${j.running}
- - Completed: ${j.completed}
- - Failed: ${j.failed}
-`);
-}
-async function main() {
- if (flags.help) {
- await showHelp();
- process.exit(0);
- }
- console.log('═══════════════════════════════════════════════════════');
- console.log(' Multi-Provider Crawler Queue Manager');
- console.log('═══════════════════════════════════════════════════════');
- if (flags.dryRun) {
- console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
- }
- try {
- // Show current stats first
- await showStats();
- if (flags.process) {
- // Process mode - run jobs instead of queuing
- await processJobs();
- }
- else {
- // Queuing mode
- let totalQueued = 0;
- if (flags.detection) {
- totalQueued += await queueDetectionJobs();
- }
- if (flags.production) {
- totalQueued += await queueProductionCrawls();
- }
- if (flags.sandbox) {
- totalQueued += await queueSandboxCrawls();
- }
- console.log('\n═══════════════════════════════════════════════════════');
- console.log(` Total dispensaries queued: ${totalQueued}`);
- console.log('═══════════════════════════════════════════════════════\n');
- }
- // Show updated stats
- if (!flags.dryRun) {
- await showStats();
- }
- }
- catch (error) {
- console.error('Fatal error:', error);
- process.exit(1);
- }
- finally {
- await migrate_1.pool.end();
- }
-}
-main();
diff --git a/backend/dist/scripts/queue-intelligence.js b/backend/dist/scripts/queue-intelligence.js
deleted file mode 100644
index 7a07f115..00000000
--- a/backend/dist/scripts/queue-intelligence.js
+++ /dev/null
@@ -1,473 +0,0 @@
-#!/usr/bin/env npx tsx
-"use strict";
-/**
- * Queue Intelligence Script
- *
- * Orchestrates the multi-category intelligence crawler system:
- * 1. Queue dispensaries that need provider detection (all 4 categories)
- * 2. Queue per-category production crawls (Dutchie products only for now)
- * 3. Queue per-category sandbox crawls (all providers)
- *
- * Each category (product, specials, brand, metadata) is handled independently.
- * A failure in one category does NOT affect other categories.
- *
- * Usage:
- * npx tsx src/scripts/queue-intelligence.ts [--detection] [--production] [--sandbox] [--all]
- * npx tsx src/scripts/queue-intelligence.ts --category=product --sandbox
- * npx tsx src/scripts/queue-intelligence.ts --process --category=product
- * npx tsx src/scripts/queue-intelligence.ts --dry-run
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-const migrate_1 = require("../db/migrate");
-const intelligence_detector_1 = require("../services/intelligence-detector");
-const category_crawler_jobs_1 = require("../services/category-crawler-jobs");
-// Parse command line args
-const args = process.argv.slice(2);
-const flags = {
- detection: args.includes('--detection') || args.includes('--all'),
- production: args.includes('--production') || args.includes('--all'),
- sandbox: args.includes('--sandbox') || args.includes('--all'),
- dryRun: args.includes('--dry-run'),
- process: args.includes('--process'),
- help: args.includes('--help') || args.includes('-h'),
- limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'),
- category: args.find(a => a.startsWith('--category='))?.split('=')[1],
- dispensary: parseInt(args.find(a => a.startsWith('--dispensary='))?.split('=')[1] || '0'),
-};
-// If no specific flags, default to all
-if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) {
- flags.detection = true;
- flags.production = true;
- flags.sandbox = true;
-}
-const CATEGORIES = ['product', 'specials', 'brand', 'metadata'];
-async function showHelp() {
- console.log(`
-Queue Intelligence - Multi-Category Crawler Orchestration
-
-USAGE:
- npx tsx src/scripts/queue-intelligence.ts [OPTIONS]
-
-OPTIONS:
- --detection Queue dispensaries that need multi-category detection
- --production Queue per-category production crawls
- --sandbox Queue per-category sandbox crawls
- --all Queue all job types (default if no specific flag)
- --process Process queued jobs instead of just queuing
- --category=CATEGORY Filter to specific category (product|specials|brand|metadata)
- --dispensary=ID Process only a specific dispensary
- --dry-run Show what would be queued without making changes
- --limit=N Maximum dispensaries to queue per type (default: 10)
- --help, -h Show this help message
-
-CATEGORIES:
- product - Product/menu data (Dutchie=production, others=sandbox)
- specials - Deals and specials (all sandbox for now)
- brand - Brand intelligence (all sandbox for now)
- metadata - Categories/taxonomy (all sandbox for now)
-
-EXAMPLES:
- # Queue all dispensaries for appropriate jobs
- npx tsx src/scripts/queue-intelligence.ts
-
- # Only queue product detection jobs
- npx tsx src/scripts/queue-intelligence.ts --detection --category=product
-
- # Process sandbox jobs for specials category
- npx tsx src/scripts/queue-intelligence.ts --process --category=specials --limit=5
-
- # Run full detection for a specific dispensary
- npx tsx src/scripts/queue-intelligence.ts --process --detection --dispensary=123
-
- # Dry run to see what would be queued
- npx tsx src/scripts/queue-intelligence.ts --dry-run
-`);
-}
-async function queueMultiCategoryDetection() {
- console.log('\n📡 Queueing Multi-Category Detection Jobs...');
- // Find dispensaries that need provider detection for any category:
- // - Any *_provider is null OR
- // - Any *_confidence < 70
- // - has a website URL
- const query = `
- SELECT id, name, website, menu_url,
- product_provider, product_confidence, product_crawler_mode,
- specials_provider, specials_confidence, specials_crawler_mode,
- brand_provider, brand_confidence, brand_crawler_mode,
- metadata_provider, metadata_confidence, metadata_crawler_mode
- FROM dispensaries
- WHERE (website IS NOT NULL OR menu_url IS NOT NULL)
- AND (
- product_provider IS NULL OR product_confidence < 70 OR
- specials_provider IS NULL OR specials_confidence < 70 OR
- brand_provider IS NULL OR brand_confidence < 70 OR
- metadata_provider IS NULL OR metadata_confidence < 70
- )
- ORDER BY
- CASE WHEN product_provider IS NULL THEN 0 ELSE 1 END,
- product_confidence ASC
- LIMIT $1
- `;
- const result = await migrate_1.pool.query(query, [flags.limit]);
- if (flags.dryRun) {
- console.log(` Would queue ${result.rows.length} dispensaries for multi-category detection:`);
- for (const row of result.rows) {
- const needsDetection = [];
- if (!row.product_provider || row.product_confidence < 70)
- needsDetection.push('product');
- if (!row.specials_provider || row.specials_confidence < 70)
- needsDetection.push('specials');
- if (!row.brand_provider || row.brand_confidence < 70)
- needsDetection.push('brand');
- if (!row.metadata_provider || row.metadata_confidence < 70)
- needsDetection.push('metadata');
- console.log(` - [${row.id}] ${row.name} (needs: ${needsDetection.join(', ')})`);
- }
- return result.rows.length;
- }
- let queued = 0;
- for (const dispensary of result.rows) {
- try {
- // Create detection jobs for each category that needs it
- for (const category of CATEGORIES) {
- const provider = dispensary[`${category}_provider`];
- const confidence = dispensary[`${category}_confidence`];
- if (!provider || confidence < 70) {
- await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, category, job_type, status, priority)
- VALUES ($1, $2, 'detection', 'pending', 10)
- ON CONFLICT DO NOTHING`, [dispensary.id, category]);
- }
- }
- console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`);
- queued++;
- }
- catch (error) {
- console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
- }
- }
- return queued;
-}
-async function queueCategoryProductionCrawls(category) {
- const categories = category ? [category] : CATEGORIES;
- let totalQueued = 0;
- for (const cat of categories) {
- console.log(`\n🏭 Queueing Production ${cat.toUpperCase()} Crawls...`);
- // For now, only products have production-ready crawlers (Dutchie only)
- if (cat !== 'product') {
- console.log(` ⏭️ No production crawler for ${cat} yet - skipping`);
- continue;
- }
- // Find dispensaries ready for production crawl
- const query = `
- SELECT id, name, ${cat}_provider as provider, last_${cat}_scan_at as last_scan
- FROM dispensaries
- WHERE ${cat}_provider = 'dutchie'
- AND ${cat}_crawler_mode = 'production'
- AND ${cat}_confidence >= 70
- AND (last_${cat}_scan_at IS NULL OR last_${cat}_scan_at < NOW() - INTERVAL '4 hours')
- ORDER BY
- CASE WHEN last_${cat}_scan_at IS NULL THEN 0 ELSE 1 END,
- last_${cat}_scan_at ASC
- LIMIT $1
- `;
- const result = await migrate_1.pool.query(query, [flags.limit]);
- if (flags.dryRun) {
- console.log(` Would queue ${result.rows.length} dispensaries for ${cat} production crawl:`);
- for (const row of result.rows) {
- const lastScan = row.last_scan ? new Date(row.last_scan).toISOString() : 'never';
- console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, last: ${lastScan})`);
- }
- totalQueued += result.rows.length;
- continue;
- }
- for (const dispensary of result.rows) {
- try {
- // For products, use the existing crawl_jobs table for production
- await migrate_1.pool.query(`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata)
- SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50,
- jsonb_build_object('dispensary_id', $1, 'category', $2, 'source', 'queue-intelligence')
- FROM stores s
- JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%')
- WHERE d.id = $1
- LIMIT 1`, [dispensary.id, cat]);
- console.log(` ✓ Queued ${cat} production: [${dispensary.id}] ${dispensary.name}`);
- totalQueued++;
- }
- catch (error) {
- console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
- }
- }
- }
- return totalQueued;
-}
-async function queueCategorySandboxCrawls(category) {
- const categories = category ? [category] : CATEGORIES;
- let totalQueued = 0;
- for (const cat of categories) {
- console.log(`\n🧪 Queueing Sandbox ${cat.toUpperCase()} Crawls...`);
- // Find dispensaries in sandbox mode for this category
- const query = `
- SELECT d.id, d.name, d.${cat}_provider as provider, d.${cat}_confidence as confidence,
- d.website, d.menu_url
- FROM dispensaries d
- WHERE d.${cat}_crawler_mode = 'sandbox'
- AND d.${cat}_provider IS NOT NULL
- AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL)
- AND NOT EXISTS (
- SELECT 1 FROM sandbox_crawl_jobs sj
- WHERE sj.dispensary_id = d.id
- AND sj.category = $1
- AND sj.status IN ('pending', 'running')
- )
- ORDER BY d.${cat}_confidence DESC, d.updated_at ASC
- LIMIT $2
- `;
- const result = await migrate_1.pool.query(query, [cat, flags.limit]);
- if (flags.dryRun) {
- console.log(` Would queue ${result.rows.length} dispensaries for ${cat} sandbox crawl:`);
- for (const row of result.rows) {
- console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, confidence: ${row.confidence}%)`);
- }
- totalQueued += result.rows.length;
- continue;
- }
- for (const dispensary of result.rows) {
- try {
- // Create sandbox entry if needed
- const sandboxResult = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, mode, status)
- VALUES ($1, $2, $3, 'template_learning', 'pending')
- ON CONFLICT (dispensary_id, category) WHERE status NOT IN ('moved_to_production', 'failed')
- DO UPDATE SET updated_at = NOW()
- RETURNING id`, [dispensary.id, cat, dispensary.provider]);
- const sandboxId = sandboxResult.rows[0]?.id;
- // Create sandbox job
- await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, job_type, status, priority)
- VALUES ($1, $2, $3, 'crawl', 'pending', 5)`, [dispensary.id, sandboxId, cat]);
- console.log(` ✓ Queued ${cat} sandbox: [${dispensary.id}] ${dispensary.name} (${dispensary.provider})`);
- totalQueued++;
- }
- catch (error) {
- console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`);
- }
- }
- }
- return totalQueued;
-}
-async function processDetectionJobs() {
- console.log('\n🔍 Processing Detection Jobs...');
- // Get pending detection jobs
- const jobs = await migrate_1.pool.query(`SELECT DISTINCT dispensary_id
- FROM sandbox_crawl_jobs
- WHERE job_type = 'detection' AND status = 'pending'
- ${flags.category ? `AND category = $2` : ''}
- ${flags.dispensary ? `AND dispensary_id = $${flags.category ? '3' : '2'}` : ''}
- LIMIT $1`, flags.category
- ? (flags.dispensary ? [flags.limit, flags.category, flags.dispensary] : [flags.limit, flags.category])
- : (flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit]));
- for (const job of jobs.rows) {
- console.log(`\nProcessing detection for dispensary ${job.dispensary_id}...`);
- try {
- // Get dispensary info
- const dispResult = await migrate_1.pool.query('SELECT id, name, website, menu_url FROM dispensaries WHERE id = $1', [job.dispensary_id]);
- const dispensary = dispResult.rows[0];
- if (!dispensary) {
- console.log(` ✗ Dispensary not found`);
- continue;
- }
- const websiteUrl = dispensary.website || dispensary.menu_url;
- if (!websiteUrl) {
- console.log(` ✗ No website URL`);
- continue;
- }
- // Mark jobs as running
- await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW()
- WHERE dispensary_id = $1 AND job_type = 'detection' AND status = 'pending'`, [job.dispensary_id]);
- // Run multi-category detection
- console.log(` Detecting providers for ${dispensary.name}...`);
- const detection = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl, { timeout: 45000 });
- // Update all categories
- await (0, intelligence_detector_1.updateAllCategoryProviders)(job.dispensary_id, detection);
- // Mark jobs as completed
- await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'completed', completed_at = NOW(),
- result_summary = $1
- WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`, [JSON.stringify({
- product: { provider: detection.product.provider, confidence: detection.product.confidence },
- specials: { provider: detection.specials.provider, confidence: detection.specials.confidence },
- brand: { provider: detection.brand.provider, confidence: detection.brand.confidence },
- metadata: { provider: detection.metadata.provider, confidence: detection.metadata.confidence },
- }), job.dispensary_id]);
- console.log(` ✓ Detection complete:`);
- console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
- console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
- console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
- console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
- }
- catch (error) {
- console.log(` ✗ Error: ${error.message}`);
- await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1
- WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`, [error.message, job.dispensary_id]);
- }
- }
-}
-async function processCrawlJobs() {
- const categories = flags.category ? [flags.category] : CATEGORIES;
- for (const cat of categories) {
- console.log(`\n⚙️ Processing ${cat.toUpperCase()} Crawl Jobs...\n`);
- // Process sandbox jobs for this category
- if (flags.sandbox || !flags.production) {
- await (0, category_crawler_jobs_1.processCategorySandboxJobs)(cat, flags.limit);
- }
- // Process production jobs for this category
- if (flags.production && cat === 'product') {
- // Get pending production crawls
- const prodJobs = await migrate_1.pool.query(`SELECT d.id
- FROM dispensaries d
- WHERE d.product_provider = 'dutchie'
- AND d.product_crawler_mode = 'production'
- AND d.product_confidence >= 70
- ${flags.dispensary ? 'AND d.id = $2' : ''}
- LIMIT $1`, flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit]);
- for (const job of prodJobs.rows) {
- console.log(`Processing production ${cat} crawl for dispensary ${job.id}...`);
- const result = await (0, category_crawler_jobs_1.runCrawlProductsJob)(job.id);
- console.log(` ${result.success ? '✓' : '✗'} ${result.message}`);
- }
- }
- }
-}
-async function processSpecificDispensary() {
- if (!flags.dispensary)
- return;
- console.log(`\n🎯 Processing Dispensary ${flags.dispensary}...\n`);
- const dispResult = await migrate_1.pool.query('SELECT * FROM dispensaries WHERE id = $1', [flags.dispensary]);
- if (dispResult.rows.length === 0) {
- console.log('Dispensary not found');
- return;
- }
- const dispensary = dispResult.rows[0];
- console.log(`Name: ${dispensary.name}`);
- console.log(`Website: ${dispensary.website || dispensary.menu_url || 'none'}`);
- console.log('');
- if (flags.detection) {
- console.log('Running multi-category detection...');
- const websiteUrl = dispensary.website || dispensary.menu_url;
- if (websiteUrl) {
- const detection = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
- await (0, intelligence_detector_1.updateAllCategoryProviders)(flags.dispensary, detection);
- console.log('Detection results:');
- console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`);
- console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`);
- console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`);
- console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`);
- }
- }
- if (flags.production) {
- console.log('\nRunning production crawls...');
- const results = await (0, category_crawler_jobs_1.runAllCategoryProductionCrawls)(flags.dispensary);
- console.log(` ${results.summary}`);
- }
- if (flags.sandbox) {
- console.log('\nRunning sandbox crawls...');
- const results = await (0, category_crawler_jobs_1.runAllCategorySandboxCrawls)(flags.dispensary);
- console.log(` ${results.summary}`);
- }
-}
-async function showStats() {
- console.log('\n📊 Multi-Category Intelligence Stats:');
- // Per-category stats
- for (const cat of CATEGORIES) {
- const stats = await migrate_1.pool.query(`
- SELECT
- COUNT(*) as total,
- COUNT(*) FILTER (WHERE ${cat}_provider IS NULL) as no_provider,
- COUNT(*) FILTER (WHERE ${cat}_provider = 'dutchie') as dutchie,
- COUNT(*) FILTER (WHERE ${cat}_provider = 'treez') as treez,
- COUNT(*) FILTER (WHERE ${cat}_provider NOT IN ('dutchie', 'treez', 'unknown') AND ${cat}_provider IS NOT NULL) as other,
- COUNT(*) FILTER (WHERE ${cat}_provider = 'unknown') as unknown,
- COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'production') as production,
- COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'sandbox') as sandbox,
- AVG(${cat}_confidence) as avg_confidence
- FROM dispensaries
- `);
- const s = stats.rows[0];
- console.log(`
- ${cat.toUpperCase()}:
- Providers: Dutchie=${s.dutchie}, Treez=${s.treez}, Other=${s.other}, Unknown=${s.unknown}, None=${s.no_provider}
- Modes: Production=${s.production}, Sandbox=${s.sandbox}
- Avg Confidence: ${Math.round(s.avg_confidence || 0)}%`);
- }
- // Job stats per category
- console.log('\n Sandbox Jobs by Category:');
- const jobStats = await migrate_1.pool.query(`
- SELECT
- category,
- COUNT(*) FILTER (WHERE status = 'pending') as pending,
- COUNT(*) FILTER (WHERE status = 'running') as running,
- COUNT(*) FILTER (WHERE status = 'completed') as completed,
- COUNT(*) FILTER (WHERE status = 'failed') as failed
- FROM sandbox_crawl_jobs
- GROUP BY category
- ORDER BY category
- `);
- for (const row of jobStats.rows) {
- console.log(` ${row.category}: pending=${row.pending}, running=${row.running}, completed=${row.completed}, failed=${row.failed}`);
- }
-}
-async function main() {
- if (flags.help) {
- await showHelp();
- process.exit(0);
- }
- console.log('═══════════════════════════════════════════════════════');
- console.log(' Multi-Category Intelligence Queue Manager');
- console.log('═══════════════════════════════════════════════════════');
- if (flags.dryRun) {
- console.log('\n🔍 DRY RUN MODE - No changes will be made\n');
- }
- if (flags.category) {
- console.log(`\n📌 Filtering to category: ${flags.category}\n`);
- }
- try {
- // Show current stats first
- await showStats();
- // If specific dispensary specified, process it directly
- if (flags.dispensary && flags.process) {
- await processSpecificDispensary();
- }
- else if (flags.process) {
- // Process mode - run jobs
- if (flags.detection) {
- await processDetectionJobs();
- }
- await processCrawlJobs();
- }
- else {
- // Queuing mode
- let totalQueued = 0;
- if (flags.detection) {
- totalQueued += await queueMultiCategoryDetection();
- }
- if (flags.production) {
- totalQueued += await queueCategoryProductionCrawls(flags.category);
- }
- if (flags.sandbox) {
- totalQueued += await queueCategorySandboxCrawls(flags.category);
- }
- console.log('\n═══════════════════════════════════════════════════════');
- console.log(` Total queued: ${totalQueued}`);
- console.log('═══════════════════════════════════════════════════════\n');
- }
- // Show updated stats
- if (!flags.dryRun) {
- await showStats();
- }
- }
- catch (error) {
- console.error('Fatal error:', error);
- process.exit(1);
- }
- finally {
- await migrate_1.pool.end();
- }
-}
-main();
diff --git a/backend/dist/scripts/run-dutchie-scrape.js b/backend/dist/scripts/run-dutchie-scrape.js
deleted file mode 100644
index c2c8ca98..00000000
--- a/backend/dist/scripts/run-dutchie-scrape.js
+++ /dev/null
@@ -1,125 +0,0 @@
-"use strict";
-/**
- * Run Dutchie GraphQL Scrape
- *
- * This script demonstrates the full pipeline:
- * 1. Puppeteer navigates to Dutchie menu
- * 2. GraphQL responses are intercepted
- * 3. Products are normalized to our schema
- * 4. Products are upserted to database
- * 5. Derived views (brands, categories, specials) are automatically updated
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-const pg_1 = require("pg");
-const dutchie_graphql_1 = require("../scrapers/dutchie-graphql");
-const DATABASE_URL = process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
-async function main() {
- const pool = new pg_1.Pool({ connectionString: DATABASE_URL });
- try {
- console.log('='.repeat(80));
- console.log('DUTCHIE GRAPHQL SCRAPER - FULL PIPELINE TEST');
- console.log('='.repeat(80));
- console.log(`Database: ${DATABASE_URL.replace(/:[^:@]+@/, ':***@')}`);
- // Configuration
- const storeId = 1; // Deeply Rooted
- const menuUrl = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
- console.log(`\nStore ID: ${storeId}`);
- console.log(`Menu URL: ${menuUrl}`);
- console.log('\n' + '-'.repeat(80));
- // Run the scrape
- console.log('\n🚀 Starting scrape...\n');
- const result = await (0, dutchie_graphql_1.scrapeDutchieMenu)(pool, storeId, menuUrl);
- console.log('\n' + '-'.repeat(80));
- console.log('📊 SCRAPE RESULTS:');
- console.log('-'.repeat(80));
- console.log(` Success: ${result.success}`);
- console.log(` Products Found: ${result.productsFound}`);
- console.log(` Inserted: ${result.inserted}`);
- console.log(` Updated: ${result.updated}`);
- if (result.error) {
- console.log(` Error: ${result.error}`);
- }
- // Query derived views to show the result
- if (result.success) {
- console.log('\n' + '-'.repeat(80));
- console.log('📈 DERIVED DATA (from products table):');
- console.log('-'.repeat(80));
- // Brands
- const brandsResult = await pool.query(`
- SELECT brand_name, product_count, min_price, max_price
- FROM derived_brands
- WHERE store_id = $1
- ORDER BY product_count DESC
- LIMIT 5
- `, [storeId]);
- console.log('\nTop 5 Brands:');
- brandsResult.rows.forEach(row => {
- console.log(` - ${row.brand_name}: ${row.product_count} products ($${row.min_price} - $${row.max_price})`);
- });
- // Specials
- const specialsResult = await pool.query(`
- SELECT name, brand, rec_price, rec_special_price, discount_percent
- FROM current_specials
- WHERE store_id = $1
- LIMIT 5
- `, [storeId]);
- console.log('\nTop 5 Specials:');
- if (specialsResult.rows.length === 0) {
- console.log(' (No specials found - is_on_special may not be populated yet)');
- }
- else {
- specialsResult.rows.forEach(row => {
- console.log(` - ${row.name} (${row.brand}): $${row.rec_price} → $${row.rec_special_price} (${row.discount_percent}% off)`);
- });
- }
- // Categories
- const categoriesResult = await pool.query(`
- SELECT category_name, product_count
- FROM derived_categories
- WHERE store_id = $1
- ORDER BY product_count DESC
- LIMIT 5
- `, [storeId]);
- console.log('\nTop 5 Categories:');
- if (categoriesResult.rows.length === 0) {
- console.log(' (No categories found - subcategory may not be populated yet)');
- }
- else {
- categoriesResult.rows.forEach(row => {
- console.log(` - ${row.category_name}: ${row.product_count} products`);
- });
- }
- // Sample product
- const sampleResult = await pool.query(`
- SELECT name, brand, subcategory, rec_price, rec_special_price, is_on_special, thc_percentage, status
- FROM products
- WHERE store_id = $1 AND subcategory IS NOT NULL
- ORDER BY updated_at DESC
- LIMIT 1
- `, [storeId]);
- if (sampleResult.rows.length > 0) {
- const sample = sampleResult.rows[0];
- console.log('\nSample Product (with new fields):');
- console.log(` Name: ${sample.name}`);
- console.log(` Brand: ${sample.brand}`);
- console.log(` Category: ${sample.subcategory}`);
- console.log(` Price: $${sample.rec_price}`);
- console.log(` Sale Price: ${sample.rec_special_price ? `$${sample.rec_special_price}` : 'N/A'}`);
- console.log(` On Special: ${sample.is_on_special}`);
- console.log(` THC: ${sample.thc_percentage}%`);
- console.log(` Status: ${sample.status}`);
- }
- }
- console.log('\n' + '='.repeat(80));
- console.log('✅ SCRAPE COMPLETE');
- console.log('='.repeat(80));
- }
- catch (error) {
- console.error('\n❌ Error:', error.message);
- throw error;
- }
- finally {
- await pool.end();
- }
-}
-main().catch(console.error);
diff --git a/backend/dist/scripts/scrape-all-active.js b/backend/dist/scripts/scrape-all-active.js
deleted file mode 100644
index fb55b0d6..00000000
--- a/backend/dist/scripts/scrape-all-active.js
+++ /dev/null
@@ -1,279 +0,0 @@
-"use strict";
-/**
- * Scrape ALL active products via direct GraphQL pagination
- * This is more reliable than category navigation
- */
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
-const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
-const pg_1 = require("pg");
-const dutchie_graphql_1 = require("../scrapers/dutchie-graphql");
-puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
-const DATABASE_URL = process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
-const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
-async function scrapeAllProducts(menuUrl, storeId) {
- const pool = new pg_1.Pool({ connectionString: DATABASE_URL });
- const browser = await puppeteer_extra_1.default.launch({
- headless: 'new',
- args: ['--no-sandbox', '--disable-setuid-sandbox'],
- });
- try {
- const page = await browser.newPage();
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
- console.log('Loading menu to establish session...');
- await page.goto(menuUrl, {
- waitUntil: 'networkidle2',
- timeout: 60000,
- });
- await new Promise((r) => setTimeout(r, 3000));
- const dispensaryId = await page.evaluate(() => window.reactEnv?.dispensaryId);
- console.log('Dispensary ID:', dispensaryId);
- // Paginate through all products
- const allProducts = [];
- let pageNum = 0;
- const perPage = 100;
- console.log('\nFetching all products via paginated GraphQL...');
- while (true) {
- const result = await page.evaluate(async (dispId, hash, page, perPage) => {
- const variables = {
- includeEnterpriseSpecials: false,
- productsFilter: {
- dispensaryId: dispId,
- pricingType: 'rec',
- Status: 'Active',
- types: [],
- useCache: false,
- isDefaultSort: true,
- sortBy: 'popularSortIdx',
- sortDirection: 1,
- bypassOnlineThresholds: true,
- isKioskMenu: false,
- removeProductsBelowOptionThresholds: false,
- },
- page,
- perPage,
- };
- const qs = new URLSearchParams({
- operationName: 'FilteredProducts',
- variables: JSON.stringify(variables),
- extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
- });
- const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
- method: 'GET',
- headers: {
- 'content-type': 'application/json',
- 'apollographql-client-name': 'Marketplace (production)',
- },
- credentials: 'include',
- });
- const json = await resp.json();
- return {
- products: json?.data?.filteredProducts?.products || [],
- totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount,
- };
- }, dispensaryId, GRAPHQL_HASH, pageNum, perPage);
- if (result.products.length === 0) {
- break;
- }
- allProducts.push(...result.products);
- console.log(`Page ${pageNum}: ${result.products.length} products (total so far: ${allProducts.length}/${result.totalCount})`);
- pageNum++;
- // Safety limit
- if (pageNum > 50) {
- console.log('Reached page limit');
- break;
- }
- }
- console.log(`\nTotal products fetched: ${allProducts.length}`);
- // Normalize and upsert
- console.log('\nNormalizing and upserting to database...');
- const normalized = allProducts.map(dutchie_graphql_1.normalizeDutchieProduct);
- const client = await pool.connect();
- let inserted = 0;
- let updated = 0;
- try {
- await client.query('BEGIN');
- for (const product of normalized) {
- const result = await client.query(`
- INSERT INTO products (
- store_id, external_id, slug, name, enterprise_product_id,
- brand, brand_external_id, brand_logo_url,
- subcategory, strain_type, canonical_category,
- price, rec_price, med_price, rec_special_price, med_special_price,
- is_on_special, special_name, discount_percent, special_data,
- sku, inventory_quantity, inventory_available, is_below_threshold, status,
- thc_percentage, cbd_percentage, cannabinoids,
- weight_mg, net_weight_value, net_weight_unit, options, raw_options,
- image_url, additional_images,
- is_featured, medical_only, rec_only,
- source_created_at, source_updated_at,
- description, raw_data,
- dutchie_url, last_seen_at, updated_at
- )
- VALUES (
- $1, $2, $3, $4, $5,
- $6, $7, $8,
- $9, $10, $11,
- $12, $13, $14, $15, $16,
- $17, $18, $19, $20,
- $21, $22, $23, $24, $25,
- $26, $27, $28,
- $29, $30, $31, $32, $33,
- $34, $35,
- $36, $37, $38,
- $39, $40,
- $41, $42,
- '', NOW(), NOW()
- )
- ON CONFLICT (store_id, slug) DO UPDATE SET
- name = EXCLUDED.name,
- enterprise_product_id = EXCLUDED.enterprise_product_id,
- brand = EXCLUDED.brand,
- brand_external_id = EXCLUDED.brand_external_id,
- brand_logo_url = EXCLUDED.brand_logo_url,
- subcategory = EXCLUDED.subcategory,
- strain_type = EXCLUDED.strain_type,
- canonical_category = EXCLUDED.canonical_category,
- price = EXCLUDED.price,
- rec_price = EXCLUDED.rec_price,
- med_price = EXCLUDED.med_price,
- rec_special_price = EXCLUDED.rec_special_price,
- med_special_price = EXCLUDED.med_special_price,
- is_on_special = EXCLUDED.is_on_special,
- special_name = EXCLUDED.special_name,
- discount_percent = EXCLUDED.discount_percent,
- special_data = EXCLUDED.special_data,
- sku = EXCLUDED.sku,
- inventory_quantity = EXCLUDED.inventory_quantity,
- inventory_available = EXCLUDED.inventory_available,
- is_below_threshold = EXCLUDED.is_below_threshold,
- status = EXCLUDED.status,
- thc_percentage = EXCLUDED.thc_percentage,
- cbd_percentage = EXCLUDED.cbd_percentage,
- cannabinoids = EXCLUDED.cannabinoids,
- weight_mg = EXCLUDED.weight_mg,
- net_weight_value = EXCLUDED.net_weight_value,
- net_weight_unit = EXCLUDED.net_weight_unit,
- options = EXCLUDED.options,
- raw_options = EXCLUDED.raw_options,
- image_url = EXCLUDED.image_url,
- additional_images = EXCLUDED.additional_images,
- is_featured = EXCLUDED.is_featured,
- medical_only = EXCLUDED.medical_only,
- rec_only = EXCLUDED.rec_only,
- source_created_at = EXCLUDED.source_created_at,
- source_updated_at = EXCLUDED.source_updated_at,
- description = EXCLUDED.description,
- raw_data = EXCLUDED.raw_data,
- last_seen_at = NOW(),
- updated_at = NOW()
- RETURNING (xmax = 0) AS was_inserted
- `, [
- storeId,
- product.external_id,
- product.slug,
- product.name,
- product.enterprise_product_id,
- product.brand,
- product.brand_external_id,
- product.brand_logo_url,
- product.subcategory,
- product.strain_type,
- product.canonical_category,
- product.price,
- product.rec_price,
- product.med_price,
- product.rec_special_price,
- product.med_special_price,
- product.is_on_special,
- product.special_name,
- product.discount_percent,
- product.special_data ? JSON.stringify(product.special_data) : null,
- product.sku,
- product.inventory_quantity,
- product.inventory_available,
- product.is_below_threshold,
- product.status,
- product.thc_percentage,
- product.cbd_percentage,
- product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
- product.weight_mg,
- product.net_weight_value,
- product.net_weight_unit,
- product.options,
- product.raw_options,
- product.image_url,
- product.additional_images,
- product.is_featured,
- product.medical_only,
- product.rec_only,
- product.source_created_at,
- product.source_updated_at,
- product.description,
- product.raw_data ? JSON.stringify(product.raw_data) : null,
- ]);
- if (result.rows[0]?.was_inserted) {
- inserted++;
- }
- else {
- updated++;
- }
- }
- await client.query('COMMIT');
- }
- catch (error) {
- await client.query('ROLLBACK');
- throw error;
- }
- finally {
- client.release();
- }
- console.log(`\nDatabase: ${inserted} inserted, ${updated} updated`);
- // Show summary stats
- const stats = await pool.query(`
- SELECT
- COUNT(*) as total,
- COUNT(*) FILTER (WHERE is_on_special) as specials,
- COUNT(DISTINCT brand) as brands,
- COUNT(DISTINCT subcategory) as categories
- FROM products WHERE store_id = $1
- `, [storeId]);
- console.log('\nStore summary:');
- console.log(` Total products: ${stats.rows[0].total}`);
- console.log(` On special: ${stats.rows[0].specials}`);
- console.log(` Unique brands: ${stats.rows[0].brands}`);
- console.log(` Categories: ${stats.rows[0].categories}`);
- return {
- success: true,
- totalProducts: allProducts.length,
- inserted,
- updated,
- };
- }
- finally {
- await browser.close();
- await pool.end();
- }
-}
-// Run
-const menuUrl = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
-const storeId = parseInt(process.argv[3] || '1', 10);
-console.log('='.repeat(60));
-console.log('DUTCHIE GRAPHQL FULL SCRAPE');
-console.log('='.repeat(60));
-console.log(`Menu URL: ${menuUrl}`);
-console.log(`Store ID: ${storeId}`);
-console.log('');
-scrapeAllProducts(menuUrl, storeId)
- .then((result) => {
- console.log('\n' + '='.repeat(60));
- console.log('COMPLETE');
- console.log(JSON.stringify(result, null, 2));
-})
- .catch((error) => {
- console.error('Error:', error.message);
- process.exit(1);
-});
diff --git a/backend/dist/scripts/test-dutchie-e2e.js b/backend/dist/scripts/test-dutchie-e2e.js
deleted file mode 100644
index 63bb215a..00000000
--- a/backend/dist/scripts/test-dutchie-e2e.js
+++ /dev/null
@@ -1,169 +0,0 @@
-"use strict";
-/**
- * Test script: End-to-end Dutchie GraphQL → DB → Dashboard flow
- *
- * This demonstrates the complete data pipeline:
- * 1. Fetch one product from Dutchie GraphQL via Puppeteer
- * 2. Normalize it to our schema
- * 3. Show the mapping
- */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-const dutchie_graphql_1 = require("../scrapers/dutchie-graphql");
-const fs = __importStar(require("fs"));
-// Load the captured sample product from schema capture
-const capturedData = JSON.parse(fs.readFileSync('/tmp/dutchie-schema-capture.json', 'utf-8'));
-const sampleProduct = capturedData.sampleProduct;
-console.log('='.repeat(80));
-console.log('DUTCHIE GRAPHQL → DATABASE MAPPING DEMONSTRATION');
-console.log('='.repeat(80));
-console.log('\n📥 RAW DUTCHIE GRAPHQL PRODUCT:');
-console.log('-'.repeat(80));
-// Show key fields from raw product
-const keyRawFields = {
- '_id': sampleProduct._id,
- 'Name': sampleProduct.Name,
- 'cName': sampleProduct.cName,
- 'brandName': sampleProduct.brandName,
- 'brand.id': sampleProduct.brand?.id,
- 'type': sampleProduct.type,
- 'subcategory': sampleProduct.subcategory,
- 'strainType': sampleProduct.strainType,
- 'Prices': sampleProduct.Prices,
- 'recPrices': sampleProduct.recPrices,
- 'recSpecialPrices': sampleProduct.recSpecialPrices,
- 'special': sampleProduct.special,
- 'specialData.saleSpecials[0].specialName': sampleProduct.specialData?.saleSpecials?.[0]?.specialName,
- 'specialData.saleSpecials[0].discount': sampleProduct.specialData?.saleSpecials?.[0]?.discount,
- 'THCContent.range[0]': sampleProduct.THCContent?.range?.[0],
- 'CBDContent.range[0]': sampleProduct.CBDContent?.range?.[0],
- 'Status': sampleProduct.Status,
- 'Image': sampleProduct.Image,
- 'POSMetaData.canonicalSKU': sampleProduct.POSMetaData?.canonicalSKU,
- 'POSMetaData.children[0].quantity': sampleProduct.POSMetaData?.children?.[0]?.quantity,
- 'POSMetaData.children[0].quantityAvailable': sampleProduct.POSMetaData?.children?.[0]?.quantityAvailable,
-};
-Object.entries(keyRawFields).forEach(([key, value]) => {
- console.log(` ${key}: ${JSON.stringify(value)}`);
-});
-console.log('\n📤 NORMALIZED DATABASE ROW:');
-console.log('-'.repeat(80));
-// Normalize the product
-const normalized = (0, dutchie_graphql_1.normalizeDutchieProduct)(sampleProduct);
-// Show the normalized result (excluding raw_data for readability)
-const { raw_data, cannabinoids, special_data, ...displayFields } = normalized;
-Object.entries(displayFields).forEach(([key, value]) => {
- if (value !== undefined && value !== null) {
- console.log(` ${key}: ${JSON.stringify(value)}`);
- }
-});
-console.log('\n🔗 FIELD MAPPING:');
-console.log('-'.repeat(80));
-const fieldMappings = [
- ['_id / id', 'external_id', sampleProduct._id, normalized.external_id],
- ['Name', 'name', sampleProduct.Name, normalized.name],
- ['cName', 'slug', sampleProduct.cName, normalized.slug],
- ['brandName', 'brand', sampleProduct.brandName, normalized.brand],
- ['brand.id', 'brand_external_id', sampleProduct.brand?.id, normalized.brand_external_id],
- ['subcategory', 'subcategory', sampleProduct.subcategory, normalized.subcategory],
- ['strainType', 'strain_type', sampleProduct.strainType, normalized.strain_type],
- ['recPrices[0]', 'rec_price', sampleProduct.recPrices?.[0], normalized.rec_price],
- ['recSpecialPrices[0]', 'rec_special_price', sampleProduct.recSpecialPrices?.[0], normalized.rec_special_price],
- ['special', 'is_on_special', sampleProduct.special, normalized.is_on_special],
- ['specialData...specialName', 'special_name', sampleProduct.specialData?.saleSpecials?.[0]?.specialName?.substring(0, 40) + '...', normalized.special_name?.substring(0, 40) + '...'],
- ['THCContent.range[0]', 'thc_percentage', sampleProduct.THCContent?.range?.[0], normalized.thc_percentage],
- ['CBDContent.range[0]', 'cbd_percentage', sampleProduct.CBDContent?.range?.[0], normalized.cbd_percentage],
- ['Status', 'status', sampleProduct.Status, normalized.status],
- ['Image', 'image_url', sampleProduct.Image?.substring(0, 50) + '...', normalized.image_url?.substring(0, 50) + '...'],
- ['POSMetaData.canonicalSKU', 'sku', sampleProduct.POSMetaData?.canonicalSKU, normalized.sku],
-];
-console.log(' GraphQL Field → DB Column | Value');
-console.log(' ' + '-'.repeat(75));
-fieldMappings.forEach(([gqlField, dbCol, gqlVal, dbVal]) => {
- const gqlStr = String(gqlField).padEnd(30);
- const dbStr = String(dbCol).padEnd(20);
- console.log(` ${gqlStr} → ${dbStr} | ${JSON.stringify(dbVal)}`);
-});
-console.log('\n📊 SQL INSERT STATEMENT:');
-console.log('-'.repeat(80));
-// Generate example SQL
-const sqlExample = `
-INSERT INTO products (
- store_id, external_id, slug, name,
- brand, brand_external_id,
- subcategory, strain_type,
- rec_price, rec_special_price,
- is_on_special, special_name, discount_percent,
- thc_percentage, cbd_percentage,
- status, image_url, sku
-) VALUES (
- 1, -- store_id (Deeply Rooted)
- '${normalized.external_id}', -- external_id
- '${normalized.slug}', -- slug
- '${normalized.name}', -- name
- '${normalized.brand}', -- brand
- '${normalized.brand_external_id}', -- brand_external_id
- '${normalized.subcategory}', -- subcategory
- '${normalized.strain_type}', -- strain_type
- ${normalized.rec_price}, -- rec_price
- ${normalized.rec_special_price}, -- rec_special_price
- ${normalized.is_on_special}, -- is_on_special
- '${normalized.special_name?.substring(0, 50)}...', -- special_name
- ${normalized.discount_percent || 'NULL'}, -- discount_percent
- ${normalized.thc_percentage}, -- thc_percentage
- ${normalized.cbd_percentage}, -- cbd_percentage
- '${normalized.status}', -- status
- '${normalized.image_url}', -- image_url
- '${normalized.sku}' -- sku
-)
-ON CONFLICT (store_id, slug) DO UPDATE SET ...;
-`;
-console.log(sqlExample);
-console.log('\n✅ SUMMARY:');
-console.log('-'.repeat(80));
-console.log(` Product: ${normalized.name}`);
-console.log(` Brand: ${normalized.brand}`);
-console.log(` Category: ${normalized.subcategory}`);
-console.log(` Price: $${normalized.rec_price} → $${normalized.rec_special_price} (${normalized.discount_percent}% off)`);
-console.log(` THC: ${normalized.thc_percentage}%`);
-console.log(` Status: ${normalized.status}`);
-console.log(` On Special: ${normalized.is_on_special}`);
-console.log(` SKU: ${normalized.sku}`);
-console.log('\n🎯 DERIVED VIEWS (computed from products table):');
-console.log('-'.repeat(80));
-console.log(' - current_specials: Products where is_on_special = true');
-console.log(' - derived_brands: Aggregated by brand name with counts/prices');
-console.log(' - derived_categories: Aggregated by subcategory');
-console.log('\nAll views are computed from the single products table - no separate tables needed!');
diff --git a/backend/dist/scripts/test-dutchie-graphql.js b/backend/dist/scripts/test-dutchie-graphql.js
deleted file mode 100644
index 8cf8962f..00000000
--- a/backend/dist/scripts/test-dutchie-graphql.js
+++ /dev/null
@@ -1,179 +0,0 @@
-"use strict";
-/**
- * Test script to validate Dutchie GraphQL API access and capture response structure
- */
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-// @ts-ignore - node-fetch type declaration not installed
-const node_fetch_1 = __importDefault(require("node-fetch"));
-const GRAPHQL_HASHES = {
- ConsumerDispensaries: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
- GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
- FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
- MenuFiltersV2: '2f0b3233b8a2426b391649ca3f0f7a5d43b9aefd683f6286d7261a2517e3568e',
- FilteredSpecials: '0dfb85a4fc138c55a076d4d11bf6d1a25f7cbd511428e1cf5a5b863b3eb23f25',
-};
-async function fetchProducts(dispensaryId, page = 0, perPage = 25) {
- const session = 'crawlsy-session-' + Date.now();
- const variables = {
- includeEnterpriseSpecials: false,
- productsFilter: {
- dispensaryId,
- pricingType: 'rec',
- Status: null, // null to include all (in-stock and out-of-stock)
- types: [],
- useCache: true,
- isDefaultSort: true,
- sortBy: 'popularSortIdx',
- sortDirection: 1,
- bypassOnlineThresholds: true,
- isKioskMenu: false,
- removeProductsBelowOptionThresholds: false
- },
- page,
- perPage
- };
- const qs = new URLSearchParams({
- operationName: 'FilteredProducts',
- variables: JSON.stringify(variables),
- extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.FilteredProducts } })
- });
- const res = await (0, node_fetch_1.default)(`https://dutchie.com/api-3/graphql?${qs.toString()}`, {
- headers: {
- 'x-dutchie-session': session,
- 'apollographql-client-name': 'Marketplace (production)',
- 'content-type': 'application/json',
- 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
- }
- });
- if (!res.ok) {
- const text = await res.text();
- console.error('HTTP Status:', res.status);
- console.error('Response:', text.substring(0, 500));
- throw new Error(`HTTP ${res.status}: ${text.substring(0, 200)}`);
- }
- return res.json();
-}
-async function resolveDispensaryId(cName) {
- const session = 'crawlsy-session-' + Date.now();
- const variables = { input: { dispensaryId: cName } };
- const qs = new URLSearchParams({
- operationName: 'GetAddressBasedDispensaryData',
- variables: JSON.stringify(variables),
- extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.GetAddressBasedDispensaryData } })
- });
- const res = await (0, node_fetch_1.default)(`https://dutchie.com/graphql?${qs.toString()}`, {
- headers: {
- 'x-dutchie-session': session,
- 'apollographql-client-name': 'Marketplace (production)',
- 'content-type': 'application/json',
- 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
- }
- });
- if (!res.ok) {
- console.error('Failed to resolve dispensary ID:', res.status);
- return null;
- }
- const data = await res.json();
- return data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId || null;
-}
-function enumerateFields(obj, prefix = '') {
- const fields = [];
- for (const [key, value] of Object.entries(obj)) {
- const path = prefix ? `${prefix}.${key}` : key;
- if (value === null) {
- fields.push(`${path}: null`);
- }
- else if (Array.isArray(value)) {
- fields.push(`${path}: Array[${value.length}]`);
- if (value.length > 0 && typeof value[0] === 'object') {
- const subFields = enumerateFields(value[0], `${path}[0]`);
- fields.push(...subFields);
- }
- }
- else if (typeof value === 'object') {
- fields.push(`${path}: Object`);
- const subFields = enumerateFields(value, path);
- fields.push(...subFields);
- }
- else {
- const typeStr = typeof value;
- const preview = String(value).substring(0, 50);
- fields.push(`${path}: ${typeStr} = "${preview}"`);
- }
- }
- return fields;
-}
-async function main() {
- console.log('='.repeat(80));
- console.log('DUTCHIE GRAPHQL API TEST');
- console.log('='.repeat(80));
- const cName = 'AZ-Deeply-Rooted';
- // Step 1: Resolve dispensary ID
- console.log(`\n1. Resolving dispensary ID for "${cName}"...`);
- const dispensaryId = await resolveDispensaryId(cName);
- const finalDispensaryId = dispensaryId || '6405ef617056e8014d79101b'; // Fallback to known ID
- if (!dispensaryId) {
- console.log(' Failed to resolve via API, using hardcoded ID: 6405ef617056e8014d79101b');
- }
- console.log(` Final ID: ${finalDispensaryId}`);
- // Step 2: Fetch first page of products
- console.log('\n2. Fetching products (page 0, perPage 5)...');
- const result = await fetchProducts(finalDispensaryId, 0, 5);
- if (result.errors) {
- console.error('\nGraphQL Errors:');
- console.error(JSON.stringify(result.errors, null, 2));
- return;
- }
- const products = result?.data?.filteredProducts?.products || [];
- console.log(` Found ${products.length} products in this page`);
- if (products.length === 0) {
- console.log('No products returned. Full response:');
- console.log(JSON.stringify(result, null, 2));
- return;
- }
- // Step 3: Enumerate all fields from first product
- console.log('\n3. PRODUCT FIELD STRUCTURE (from first product):');
- console.log('-'.repeat(80));
- const product = products[0];
- const fields = enumerateFields(product);
- fields.forEach(f => console.log(` ${f}`));
- // Step 4: Show full sample product JSON
- console.log('\n4. FULL SAMPLE PRODUCT JSON:');
- console.log('-'.repeat(80));
- console.log(JSON.stringify(product, null, 2));
- // Step 5: Summary of key fields for schema design
- console.log('\n5. KEY FIELDS FOR SCHEMA DESIGN:');
- console.log('-'.repeat(80));
- const keyFields = [
- { field: 'id', value: product.id },
- { field: 'name', value: product.name },
- { field: 'slug', value: product.slug },
- { field: 'brand', value: product.brand },
- { field: 'brandId', value: product.brandId },
- { field: 'type', value: product.type },
- { field: 'category', value: product.category },
- { field: 'subcategory', value: product.subcategory },
- { field: 'strainType', value: product.strainType },
- { field: 'THCContent', value: product.THCContent },
- { field: 'CBDContent', value: product.CBDContent },
- { field: 'description', value: product.description?.substring(0, 100) + '...' },
- { field: 'image', value: product.image },
- { field: 'options.length', value: product.options?.length },
- { field: 'pricing', value: product.pricing },
- { field: 'terpenes.length', value: product.terpenes?.length },
- { field: 'effects.length', value: product.effects?.length },
- ];
- keyFields.forEach(({ field, value }) => {
- console.log(` ${field}: ${JSON.stringify(value)}`);
- });
- // Step 6: Show an option (variant) if available
- if (product.options && product.options.length > 0) {
- console.log('\n6. SAMPLE OPTION/VARIANT:');
- console.log('-'.repeat(80));
- console.log(JSON.stringify(product.options[0], null, 2));
- }
-}
-main().catch(console.error);
diff --git a/backend/dist/scripts/test-jane-scraper.js b/backend/dist/scripts/test-jane-scraper.js
deleted file mode 100644
index 3477a724..00000000
--- a/backend/dist/scripts/test-jane-scraper.js
+++ /dev/null
@@ -1,255 +0,0 @@
-"use strict";
-/**
- * Test script for iHeartJane menu scraping via Playwright
- * Intercepts API/Algolia calls made by the browser
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-const playwright_1 = require("playwright");
-async function scrapeJaneMenu(urlOrStoreId) {
- // Handle either a full URL or just a store ID
- const menuUrl = urlOrStoreId.startsWith('http')
- ? urlOrStoreId
- : `https://www.iheartjane.com/embed/stores/${urlOrStoreId}/menu`;
- console.log(`Starting Playwright scrape for iHeartJane: ${menuUrl}`);
- const browser = await playwright_1.chromium.launch({
- headless: true,
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-blink-features=AutomationControlled'
- ]
- });
- const context = await browser.newContext({
- userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- viewport: { width: 1920, height: 1080 },
- locale: 'en-US',
- timezoneId: 'America/Chicago'
- });
- // Add stealth scripts to avoid detection
- await context.addInitScript(() => {
- Object.defineProperty(navigator, 'webdriver', { get: () => false });
- window.chrome = { runtime: {} };
- });
- const page = await context.newPage();
- const products = [];
- const apiResponses = [];
- const capturedCredentials = {};
- // Intercept ALL network requests to capture API/Algolia data and credentials
- page.on('request', (request) => {
- const url = request.url();
- const headers = request.headers();
- // Capture Algolia credentials from request headers
- if (url.includes('algolia')) {
- const appId = headers['x-algolia-application-id'];
- const apiKey = headers['x-algolia-api-key'];
- if (appId && apiKey) {
- capturedCredentials.algolia = { appId, apiKey };
- console.log(`Captured Algolia credentials: App=${appId}, Key=${apiKey.substring(0, 10)}...`);
- }
- }
- });
- page.on('response', async (response) => {
- const url = response.url();
- // Capture Algolia search results
- if (url.includes('algolia.net') || url.includes('algolianet.com')) {
- try {
- const data = await response.json();
- if (data.results && data.results[0] && data.results[0].hits) {
- console.log(`Captured ${data.results[0].hits.length} products from Algolia`);
- apiResponses.push({ type: 'algolia', data: data.results[0] });
- }
- }
- catch (e) {
- // Not JSON or error parsing
- }
- }
- // Capture Jane API responses
- if (url.includes('api.iheartjane.com') && url.includes('products')) {
- try {
- const data = await response.json();
- console.log(`Captured Jane API response: ${url}`);
- apiResponses.push({ type: 'jane-api', url, data });
- }
- catch (e) {
- // Not JSON or error parsing
- }
- }
- });
- try {
- console.log(`Navigating to: ${menuUrl}`);
- await page.goto(menuUrl, {
- waitUntil: 'domcontentloaded',
- timeout: 60000
- });
- // Wait for page to settle
- await page.waitForTimeout(2000);
- // Handle age gate - use Playwright locator with force click
- console.log('Looking for age gate...');
- try {
- let clicked = false;
- // Method 1: Use Playwright locator with exact text match
- try {
- const yesButton = page.locator('button:has-text("Yes")').first();
- await yesButton.waitFor({ state: 'visible', timeout: 5000 });
- await yesButton.click({ force: true });
- clicked = true;
- console.log('Clicked age gate via Playwright locator');
- await page.waitForTimeout(5000);
- }
- catch (e) {
- console.log('Playwright locator failed:', e.message);
- }
- // Method 2: Try clicking by visible bounding box
- if (!clicked) {
- try {
- const box = await page.locator('button:has-text("Yes")').first().boundingBox();
- if (box) {
- await page.mouse.click(box.x + box.width / 2, box.y + box.height / 2);
- clicked = true;
- console.log(`Clicked age gate at coordinates: ${box.x + box.width / 2}, ${box.y + box.height / 2}`);
- await page.waitForTimeout(5000);
- }
- }
- catch (e) {
- console.log('Bounding box click failed');
- }
- }
- // Method 3: Try JavaScript click
- if (!clicked) {
- const jsClickResult = await page.evaluate(() => {
- const buttons = Array.from(document.querySelectorAll('button'));
- for (const btn of buttons) {
- if (btn.textContent?.includes('Yes')) {
- btn.click();
- return { success: true, buttonText: btn.textContent };
- }
- }
- return { success: false };
- });
- if (jsClickResult.success) {
- clicked = true;
- console.log(`Clicked via JS: ${jsClickResult.buttonText}`);
- await page.waitForTimeout(5000);
- }
- }
- // Method 4: Click element containing "Yes" with dispatchEvent
- if (!clicked) {
- const dispatchResult = await page.evaluate(() => {
- const buttons = Array.from(document.querySelectorAll('button'));
- for (const btn of buttons) {
- if (btn.textContent?.includes('Yes')) {
- btn.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
- return true;
- }
- }
- return false;
- });
- if (dispatchResult) {
- clicked = true;
- console.log('Clicked via dispatchEvent');
- await page.waitForTimeout(5000);
- }
- }
- // Log button info for debugging
- const buttonInfo = await page.evaluate(() => {
- const buttons = Array.from(document.querySelectorAll('button'));
- return buttons.map(b => ({
- text: b.textContent?.trim(),
- visible: b.offsetParent !== null,
- rect: b.getBoundingClientRect()
- }));
- });
- console.log('Buttons found:', JSON.stringify(buttonInfo, null, 2));
- }
- catch (e) {
- console.log('Age gate handling error:', e);
- }
- // Wait for content to load after age gate
- await page.waitForTimeout(3000);
- // Try to scroll to trigger more product loads
- console.log('Scrolling to load more products...');
- for (let i = 0; i < 3; i++) {
- await page.evaluate(() => window.scrollBy(0, 1000));
- await page.waitForTimeout(1000);
- }
- // Extract products from the page DOM as backup
- const domProducts = await page.evaluate(() => {
- const items = [];
- // Try various selectors that Jane might use
- const productCards = document.querySelectorAll('[data-testid*="product"], [class*="ProductCard"], [class*="product-card"], .product-tile');
- productCards.forEach((card) => {
- const name = card.querySelector('[class*="name"], [class*="title"], h3, h4')?.textContent?.trim();
- const brand = card.querySelector('[class*="brand"]')?.textContent?.trim();
- const price = card.querySelector('[class*="price"]')?.textContent?.trim();
- const image = card.querySelector('img')?.getAttribute('src');
- if (name) {
- items.push({ name, brand, price, image, source: 'dom' });
- }
- });
- return items;
- });
- console.log(`Extracted ${domProducts.length} products from DOM`);
- // Check for __NEXT_DATA__ or similar embedded data
- const embeddedData = await page.evaluate(() => {
- // Check for Next.js data
- const nextData = document.getElementById('__NEXT_DATA__');
- if (nextData) {
- return { type: 'next', data: JSON.parse(nextData.textContent || '{}') };
- }
- // Check for any window-level product data
- const win = window;
- if (win.__INITIAL_STATE__)
- return { type: 'initial_state', data: win.__INITIAL_STATE__ };
- if (win.__PRELOADED_STATE__)
- return { type: 'preloaded', data: win.__PRELOADED_STATE__ };
- if (win.products)
- return { type: 'products', data: win.products };
- return null;
- });
- if (embeddedData) {
- console.log(`Found embedded data: ${embeddedData.type}`);
- apiResponses.push(embeddedData);
- }
- // Take a screenshot for debugging
- const screenshotPath = `/tmp/jane-scrape-${Date.now()}.png`;
- await page.screenshot({ path: screenshotPath, fullPage: true });
- console.log(`Screenshot saved to ${screenshotPath}`);
- // Process captured API responses
- console.log('\n=== API Responses Summary ===');
- for (const resp of apiResponses) {
- console.log(`Type: ${resp.type}`);
- if (resp.type === 'algolia' && resp.data.hits) {
- console.log(` Hits: ${resp.data.hits.length}`);
- console.log(` Total: ${resp.data.nbHits}`);
- if (resp.data.hits[0]) {
- console.log(` Sample product:`, JSON.stringify(resp.data.hits[0], null, 2).substring(0, 1000));
- }
- }
- }
- console.log('\n=== DOM Products Sample ===');
- console.log(JSON.stringify(domProducts.slice(0, 3), null, 2));
- console.log('\n=== Captured Credentials ===');
- console.log(JSON.stringify(capturedCredentials, null, 2));
- return {
- apiResponses,
- domProducts,
- embeddedData,
- capturedCredentials
- };
- }
- finally {
- await browser.close();
- }
-}
-// Main execution
-const urlOrStoreId = process.argv[2] || 'https://iheartjane.com/aly2djS2yXoTGnR0/DBeqE6HSSwijog9l'; // Default to The Flower Shop Az
-scrapeJaneMenu(urlOrStoreId)
- .then((result) => {
- console.log('\n=== Scrape Complete ===');
- console.log(`Total API responses captured: ${result.apiResponses.length}`);
- console.log(`Total DOM products: ${result.domProducts.length}`);
-})
- .catch((err) => {
- console.error('Scrape failed:', err);
- process.exit(1);
-});
diff --git a/backend/dist/scripts/test-status-filter.js b/backend/dist/scripts/test-status-filter.js
deleted file mode 100644
index 86a663c0..00000000
--- a/backend/dist/scripts/test-status-filter.js
+++ /dev/null
@@ -1,84 +0,0 @@
-"use strict";
-/**
- * Test different Status filter values in Dutchie GraphQL
- */
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
-const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
-puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
-const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
-async function main() {
- const browser = await puppeteer_extra_1.default.launch({
- headless: 'new',
- args: ['--no-sandbox', '--disable-setuid-sandbox'],
- });
- const page = await browser.newPage();
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36');
- console.log('Loading menu...');
- await page.goto('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted', {
- waitUntil: 'networkidle2',
- timeout: 60000,
- });
- await new Promise((r) => setTimeout(r, 3000));
- const dispensaryId = await page.evaluate(() => window.reactEnv?.dispensaryId);
- console.log('Dispensary ID:', dispensaryId);
- // Test different status values
- const testCases = [
- { label: 'Active', status: 'Active', includeStatus: true },
- { label: 'Inactive', status: 'Inactive', includeStatus: true },
- { label: 'null', status: null, includeStatus: true },
- { label: 'omitted', status: null, includeStatus: false },
- ];
- for (const testCase of testCases) {
- const result = await page.evaluate(async (dispId, hash, status, includeStatus) => {
- const filter = {
- dispensaryId: dispId,
- pricingType: 'rec',
- types: [],
- useCache: false,
- isDefaultSort: true,
- sortBy: 'popularSortIdx',
- sortDirection: 1,
- bypassOnlineThresholds: true,
- isKioskMenu: false,
- removeProductsBelowOptionThresholds: false,
- };
- if (includeStatus) {
- filter.Status = status;
- }
- const variables = {
- includeEnterpriseSpecials: false,
- productsFilter: filter,
- page: 0,
- perPage: 100,
- };
- const qs = new URLSearchParams({
- operationName: 'FilteredProducts',
- variables: JSON.stringify(variables),
- extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
- });
- const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
- method: 'GET',
- headers: {
- 'content-type': 'application/json',
- 'apollographql-client-name': 'Marketplace (production)',
- },
- credentials: 'include',
- });
- const json = await resp.json();
- const products = json?.data?.filteredProducts?.products || [];
- return {
- count: products.length,
- totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount,
- sampleStatus: products[0]?.Status,
- statuses: [...new Set(products.map((p) => p.Status))],
- };
- }, dispensaryId, GRAPHQL_HASH, testCase.status, testCase.includeStatus);
- console.log(`Status ${testCase.label}: Products=${result.count}, Total=${result.totalCount}, Statuses=${JSON.stringify(result.statuses)}`);
- }
- await browser.close();
-}
-main().catch(console.error);
diff --git a/backend/dist/services/availability.js b/backend/dist/services/availability.js
deleted file mode 100644
index 001c5917..00000000
--- a/backend/dist/services/availability.js
+++ /dev/null
@@ -1,201 +0,0 @@
-"use strict";
-/**
- * Availability Service
- *
- * Normalizes product availability from various menu providers and tracks
- * state transitions for inventory analytics.
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.normalizeAvailability = normalizeAvailability;
-exports.extractAvailabilityHints = extractAvailabilityHints;
-exports.hintsToAvailability = hintsToAvailability;
-exports.aggregateAvailability = aggregateAvailability;
-// Threshold for considering stock as "limited"
-const LIMITED_THRESHOLD = 5;
-/**
- * Normalize availability from a Dutchie product
- *
- * Dutchie products can have various availability indicators:
- * - potencyAmount.quantity: explicit stock count
- * - status: sometimes includes stock status
- * - variants[].quantity: stock per variant
- * - isInStock / inStock: boolean flags
- */
-function normalizeAvailability(dutchieProduct) {
- const raw = {};
- // Collect raw availability data for debugging
- if (dutchieProduct.potencyAmount?.quantity !== undefined) {
- raw.potencyQuantity = dutchieProduct.potencyAmount.quantity;
- }
- if (dutchieProduct.status !== undefined) {
- raw.status = dutchieProduct.status;
- }
- if (dutchieProduct.isInStock !== undefined) {
- raw.isInStock = dutchieProduct.isInStock;
- }
- if (dutchieProduct.inStock !== undefined) {
- raw.inStock = dutchieProduct.inStock;
- }
- if (dutchieProduct.variants?.length) {
- const variantQuantities = dutchieProduct.variants
- .filter((v) => v.quantity !== undefined)
- .map((v) => ({ option: v.option, quantity: v.quantity }));
- if (variantQuantities.length) {
- raw.variantQuantities = variantQuantities;
- }
- }
- // Try to extract quantity
- let quantity = null;
- // Check potencyAmount.quantity first (most reliable for Dutchie)
- if (typeof dutchieProduct.potencyAmount?.quantity === 'number') {
- quantity = dutchieProduct.potencyAmount.quantity;
- }
- // Sum variant quantities if available
- else if (dutchieProduct.variants?.length) {
- const totalVariantQty = dutchieProduct.variants.reduce((sum, v) => {
- return sum + (typeof v.quantity === 'number' ? v.quantity : 0);
- }, 0);
- if (totalVariantQty > 0) {
- quantity = totalVariantQty;
- }
- }
- // Determine status
- let status = 'unknown';
- // Explicit boolean flags take precedence
- if (dutchieProduct.isInStock === false || dutchieProduct.inStock === false) {
- status = 'out_of_stock';
- }
- else if (dutchieProduct.isInStock === true || dutchieProduct.inStock === true) {
- status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
- }
- // Check status string
- else if (typeof dutchieProduct.status === 'string') {
- const statusLower = dutchieProduct.status.toLowerCase();
- if (statusLower.includes('out') || statusLower.includes('unavailable')) {
- status = 'out_of_stock';
- }
- else if (statusLower.includes('limited') || statusLower.includes('low')) {
- status = 'limited';
- }
- else if (statusLower.includes('in') || statusLower.includes('available')) {
- status = 'in_stock';
- }
- }
- // Infer from quantity
- else if (quantity !== null) {
- if (quantity === 0) {
- status = 'out_of_stock';
- }
- else if (quantity <= LIMITED_THRESHOLD) {
- status = 'limited';
- }
- else {
- status = 'in_stock';
- }
- }
- return { status, quantity, raw };
-}
-/**
- * Extract availability hints from page content or product card HTML
- *
- * Used for sandbox provider scraping where we don't have structured data
- */
-function extractAvailabilityHints(pageContent, productElement) {
- const hints = {};
- const content = (productElement || pageContent).toLowerCase();
- // Check for out-of-stock indicators
- const oosPatterns = [
- 'out of stock',
- 'out-of-stock',
- 'sold out',
- 'soldout',
- 'unavailable',
- 'not available',
- 'coming soon',
- 'notify me'
- ];
- hints.hasOutOfStockBadge = oosPatterns.some(p => content.includes(p));
- // Check for limited stock indicators
- const limitedPatterns = [
- 'limited stock',
- 'limited quantity',
- 'low stock',
- 'only \\d+ left',
- 'few remaining',
- 'almost gone',
- 'selling fast'
- ];
- hints.hasLimitedBadge = limitedPatterns.some(p => {
- if (p.includes('\\d')) {
- return new RegExp(p, 'i').test(content);
- }
- return content.includes(p);
- });
- // Check for in-stock indicators
- const inStockPatterns = [
- 'in stock',
- 'in-stock',
- 'add to cart',
- 'add to bag',
- 'buy now',
- 'available'
- ];
- hints.hasInStockBadge = inStockPatterns.some(p => content.includes(p));
- // Try to extract quantity text
- const qtyMatch = content.match(/(\d+)\s*(left|remaining|in stock|available)/i);
- if (qtyMatch) {
- hints.quantityText = qtyMatch[0];
- }
- // Look for explicit stock text
- const stockTextMatch = content.match(/(out of stock|in stock|low stock|limited|sold out)[^<]*/i);
- if (stockTextMatch) {
- hints.stockText = stockTextMatch[0].trim();
- }
- return hints;
-}
-/**
- * Convert availability hints to normalized availability
- */
-function hintsToAvailability(hints) {
- let status = 'unknown';
- let quantity = null;
- // Extract quantity if present
- if (hints.quantityText) {
- const match = hints.quantityText.match(/(\d+)/);
- if (match) {
- quantity = parseInt(match[1], 10);
- }
- }
- // Determine status from hints
- if (hints.hasOutOfStockBadge) {
- status = 'out_of_stock';
- }
- else if (hints.hasLimitedBadge) {
- status = 'limited';
- }
- else if (hints.hasInStockBadge) {
- status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
- }
- return {
- status,
- quantity,
- raw: hints
- };
-}
-function aggregateAvailability(products) {
- const counts = {
- in_stock: 0,
- out_of_stock: 0,
- limited: 0,
- unknown: 0,
- changed: 0
- };
- for (const product of products) {
- const status = product.availability_status || 'unknown';
- counts[status]++;
- if (product.previous_status && product.previous_status !== status) {
- counts.changed++;
- }
- }
- return counts;
-}
diff --git a/backend/dist/services/category-crawler-jobs.js b/backend/dist/services/category-crawler-jobs.js
deleted file mode 100644
index b6f0d5d9..00000000
--- a/backend/dist/services/category-crawler-jobs.js
+++ /dev/null
@@ -1,1107 +0,0 @@
-"use strict";
-/**
- * Category-Specific Crawler Jobs
- *
- * Handles crawl jobs for each intelligence category independently:
- * - CrawlProductsJob - Production product crawling (Dutchie only)
- * - CrawlSpecialsJob - Production specials crawling
- * - CrawlBrandIntelligenceJob - Production brand intelligence crawling
- * - CrawlMetadataJob - Production metadata crawling
- * - SandboxProductsJob - Sandbox product crawling (all providers)
- * - SandboxSpecialsJob - Sandbox specials crawling
- * - SandboxBrandJob - Sandbox brand crawling
- * - SandboxMetadataJob - Sandbox metadata crawling
- */
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.runCrawlProductsJob = runCrawlProductsJob;
-exports.runCrawlSpecialsJob = runCrawlSpecialsJob;
-exports.runCrawlBrandIntelligenceJob = runCrawlBrandIntelligenceJob;
-exports.runCrawlMetadataJob = runCrawlMetadataJob;
-exports.runSandboxProductsJob = runSandboxProductsJob;
-exports.runSandboxSpecialsJob = runSandboxSpecialsJob;
-exports.runSandboxBrandJob = runSandboxBrandJob;
-exports.runSandboxMetadataJob = runSandboxMetadataJob;
-exports.processCategorySandboxJobs = processCategorySandboxJobs;
-exports.runAllCategoryProductionCrawls = runAllCategoryProductionCrawls;
-exports.runAllCategorySandboxCrawls = runAllCategorySandboxCrawls;
-const migrate_1 = require("../db/migrate");
-const crawler_logger_1 = require("./crawler-logger");
-// Note: scrapeStore from scraper-v2 is NOT used for Dutchie - we use GraphQL API directly
-const product_crawler_1 = require("../dutchie-az/services/product-crawler");
-const puppeteer_1 = __importDefault(require("puppeteer"));
-const WORKER_ID = `crawler-${process.pid}-${Date.now()}`;
-// ========================================
-// Helper Functions
-// ========================================
-async function getDispensaryWithCategories(dispensaryId) {
- const result = await migrate_1.pool.query(`SELECT id, name, website, menu_url, menu_type, platform_dispensary_id,
- product_provider, product_confidence, product_crawler_mode, last_product_scan_at,
- specials_provider, specials_confidence, specials_crawler_mode, last_specials_scan_at,
- brand_provider, brand_confidence, brand_crawler_mode, last_brand_scan_at,
- metadata_provider, metadata_confidence, metadata_crawler_mode, last_metadata_scan_at,
- crawler_status, scraper_template
- FROM dispensaries WHERE id = $1`, [dispensaryId]);
- return result.rows[0] || null;
-}
-async function updateCategoryScanTime(dispensaryId, category) {
- const column = `last_${category}_scan_at`;
- await migrate_1.pool.query(`UPDATE dispensaries SET ${column} = NOW(), updated_at = NOW() WHERE id = $1`, [dispensaryId]);
-}
-async function getStoreIdForDispensary(dispensaryId) {
- // First check if dispensary has menu_url - if so, try to match with stores.dutchie_url
- const result = await migrate_1.pool.query(`SELECT s.id FROM stores s
- JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%'
- WHERE d.id = $1
- LIMIT 1`, [dispensaryId]);
- if (result.rows.length > 0) {
- return result.rows[0].id;
- }
- // Try matching by slug
- const result2 = await migrate_1.pool.query(`SELECT s.id FROM stores s
- JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%'
- WHERE d.id = $1
- LIMIT 1`, [dispensaryId]);
- return result2.rows[0]?.id || null;
-}
-async function createCategorySandboxEntry(dispensaryId, category, suspectedProvider, templateName, detectionSignals) {
- // Check for existing sandbox for this category
- const existing = await migrate_1.pool.query(`SELECT id FROM crawler_sandboxes
- WHERE dispensary_id = $1 AND category = $2 AND status NOT IN ('moved_to_production', 'failed')`, [dispensaryId, category]);
- if (existing.rows.length > 0) {
- await migrate_1.pool.query(`UPDATE crawler_sandboxes
- SET suspected_menu_provider = $2, template_name = $3, detection_signals = COALESCE($4, detection_signals), updated_at = NOW()
- WHERE id = $1`, [existing.rows[0].id, suspectedProvider, templateName, detectionSignals ? JSON.stringify(detectionSignals) : null]);
- return existing.rows[0].id;
- }
- const result = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, template_name, mode, detection_signals, status)
- VALUES ($1, $2, $3, $4, 'template_learning', $5, 'pending')
- RETURNING id`, [dispensaryId, category, suspectedProvider, templateName, detectionSignals ? JSON.stringify(detectionSignals) : '{}']);
- return result.rows[0].id;
-}
-async function createCategorySandboxJob(dispensaryId, sandboxId, category, templateName, jobType = 'crawl', priority = 0) {
- const result = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, template_name, job_type, status, priority)
- VALUES ($1, $2, $3, $4, $5, 'pending', $6)
- RETURNING id`, [dispensaryId, sandboxId, category, templateName, jobType, priority]);
- return result.rows[0].id;
-}
-async function updateSandboxQuality(sandboxId, metrics) {
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET
- quality_score = $1,
- products_extracted = $2,
- fields_missing = $3,
- error_count = $4,
- analysis_json = COALESCE(analysis_json, '{}'::jsonb) || $5::jsonb,
- analyzed_at = NOW(),
- updated_at = NOW()
- WHERE id = $6`, [
- metrics.quality_score,
- metrics.items_extracted,
- metrics.fields_missing,
- metrics.error_count,
- JSON.stringify({ sample_data: metrics.sample_data }),
- sandboxId,
- ]);
-}
-async function getCrawlerTemplate(provider, category, environment) {
- const result = await migrate_1.pool.query(`SELECT id, name, selector_config, navigation_config
- FROM crawler_templates
- WHERE provider = $1 AND environment = $2 AND is_active = true
- ORDER BY is_default_for_provider DESC, version DESC
- LIMIT 1`, [provider, environment]);
- return result.rows[0] || null;
-}
-// ========================================
-// Production Crawl Jobs
-// ========================================
-/**
- * CrawlProductsJob - Production product crawling
- * Uses Dutchie GraphQL API directly (NOT browser-based scraping)
- *
- * IMPORTANT: This function calls crawlDispensaryProducts() from dutchie-az
- * which uses the GraphQL API. The GraphQL response includes categories directly,
- * so no browser-based category discovery is needed.
- */
-async function runCrawlProductsJob(dispensaryId) {
- const category = 'product';
- const startTime = Date.now();
- const dispensary = await getDispensaryWithCategories(dispensaryId);
- if (!dispensary) {
- return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
- }
- // Verify production eligibility - accept either:
- // 1. product_provider = 'dutchie' with product_crawler_mode = 'production', OR
- // 2. menu_type = 'dutchie' with platform_dispensary_id (known Dutchie store)
- const isDutchieProduction = (dispensary.product_provider === 'dutchie' && dispensary.product_crawler_mode === 'production') ||
- (dispensary.menu_type === 'dutchie' && dispensary.platform_dispensary_id);
- if (!isDutchieProduction) {
- return { success: false, category, message: 'Not a Dutchie dispensary for products' };
- }
- if (!dispensary.platform_dispensary_id) {
- return { success: false, category, message: 'Missing platform_dispensary_id for GraphQL crawl' };
- }
- // Log job start
- crawler_logger_1.crawlerLogger.jobStarted({
- job_id: 0, // Category jobs don't have traditional job IDs
- store_id: dispensaryId, // Use dispensary ID since we're not using stores table
- store_name: dispensary.name,
- job_type: 'CrawlProductsJob',
- trigger_type: 'category_crawl',
- provider: 'dutchie',
- });
- try {
- // Build Dispensary object for GraphQL crawler
- // The crawler uses platformDispensaryId to call the Dutchie GraphQL API directly
- const dispensaryForCrawl = {
- id: dispensary.id,
- platform: 'dutchie',
- name: dispensary.name,
- slug: dispensary.name.toLowerCase().replace(/[^a-z0-9]+/g, '-'),
- city: '',
- state: 'AZ',
- menuType: dispensary.menu_type || 'dutchie',
- menuUrl: dispensary.menu_url || undefined,
- platformDispensaryId: dispensary.platform_dispensary_id || undefined,
- website: dispensary.website || undefined,
- createdAt: new Date(),
- updatedAt: new Date(),
- };
- // Use GraphQL crawler directly - this calls the Dutchie API, not browser scraping
- const crawlResult = await (0, product_crawler_1.crawlDispensaryProducts)(dispensaryForCrawl, 'rec', // Default to recreational pricing
- { useBothModes: true, downloadImages: true });
- // Update scan time
- await updateCategoryScanTime(dispensaryId, category);
- const durationMs = Date.now() - startTime;
- if (crawlResult.success) {
- // Log job completion with summary
- crawler_logger_1.crawlerLogger.jobCompleted({
- job_id: 0,
- store_id: dispensaryId,
- store_name: dispensary.name,
- duration_ms: durationMs,
- products_found: crawlResult.productsFound,
- products_new: 0, // GraphQL crawler doesn't track new vs updated separately
- products_updated: crawlResult.productsUpserted,
- provider: 'dutchie',
- });
- return {
- success: true,
- category,
- message: `GraphQL crawl completed: ${crawlResult.productsUpserted} products, ${crawlResult.snapshotsCreated} snapshots`,
- data: {
- dispensaryId,
- provider: 'dutchie',
- durationMs,
- productsFound: crawlResult.productsFound,
- productsUpserted: crawlResult.productsUpserted,
- snapshotsCreated: crawlResult.snapshotsCreated,
- modeAProducts: crawlResult.modeAProducts,
- modeBProducts: crawlResult.modeBProducts,
- },
- };
- }
- else {
- // Log job failure
- crawler_logger_1.crawlerLogger.jobFailed({
- job_id: 0,
- store_id: dispensaryId,
- store_name: dispensary.name,
- duration_ms: durationMs,
- error_message: crawlResult.errorMessage || 'Unknown error',
- provider: 'dutchie',
- });
- return { success: false, category, message: crawlResult.errorMessage || 'GraphQL crawl failed' };
- }
- }
- catch (error) {
- const durationMs = Date.now() - startTime;
- // Log job failure
- crawler_logger_1.crawlerLogger.jobFailed({
- job_id: 0,
- store_id: dispensaryId,
- store_name: dispensary.name,
- duration_ms: durationMs,
- error_message: error.message,
- provider: 'dutchie',
- });
- return { success: false, category, message: error.message };
- }
-}
-/**
- * CrawlSpecialsJob - Production specials crawling
- * Currently no production-ready providers, so always returns false
- */
-async function runCrawlSpecialsJob(dispensaryId) {
- const category = 'specials';
- const dispensary = await getDispensaryWithCategories(dispensaryId);
- if (!dispensary) {
- return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
- }
- // No production-ready providers for specials yet
- if (dispensary.specials_crawler_mode !== 'production') {
- return { success: false, category, message: 'Specials not in production mode' };
- }
- // Would implement provider-specific specials crawling here
- // For now, no providers are production-ready
- return {
- success: false,
- category,
- message: `No production crawler for specials provider: ${dispensary.specials_provider}`,
- };
-}
-/**
- * CrawlBrandIntelligenceJob - Production brand intelligence crawling
- * Currently no production-ready providers
- */
-async function runCrawlBrandIntelligenceJob(dispensaryId) {
- const category = 'brand';
- const dispensary = await getDispensaryWithCategories(dispensaryId);
- if (!dispensary) {
- return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
- }
- if (dispensary.brand_crawler_mode !== 'production') {
- return { success: false, category, message: 'Brand not in production mode' };
- }
- return {
- success: false,
- category,
- message: `No production crawler for brand provider: ${dispensary.brand_provider}`,
- };
-}
-/**
- * CrawlMetadataJob - Production metadata crawling
- * Currently no production-ready providers
- */
-async function runCrawlMetadataJob(dispensaryId) {
- const category = 'metadata';
- const dispensary = await getDispensaryWithCategories(dispensaryId);
- if (!dispensary) {
- return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
- }
- if (dispensary.metadata_crawler_mode !== 'production') {
- return { success: false, category, message: 'Metadata not in production mode' };
- }
- return {
- success: false,
- category,
- message: `No production crawler for metadata provider: ${dispensary.metadata_provider}`,
- };
-}
-// ========================================
-// Sandbox Crawl Jobs
-// ========================================
-/**
- * SandboxProductsJob - Sandbox product crawling
- * Works with any provider including Treez
- */
-async function runSandboxProductsJob(dispensaryId, sandboxId) {
- const category = 'product';
- const startTime = Date.now();
- const dispensary = await getDispensaryWithCategories(dispensaryId);
- if (!dispensary) {
- return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
- }
- // Get or create sandbox entry
- let sandbox;
- if (sandboxId) {
- const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
- sandbox = result.rows[0];
- }
- else {
- const result = await migrate_1.pool.query(`SELECT * FROM crawler_sandboxes
- WHERE dispensary_id = $1 AND category = $2 AND status NOT IN ('moved_to_production', 'failed')
- ORDER BY created_at DESC LIMIT 1`, [dispensaryId, category]);
- sandbox = result.rows[0];
- if (!sandbox) {
- const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.product_provider, null);
- const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
- sandbox = result.rows[0];
- }
- }
- const websiteUrl = dispensary.menu_url || dispensary.website;
- if (!websiteUrl) {
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = 'No website URL' WHERE id = $1`, [sandbox.id]);
- return { success: false, category, message: 'No website URL available' };
- }
- let browser = null;
- try {
- // Update status
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
- browser = await puppeteer_1.default.launch({
- headless: true,
- args: ['--no-sandbox', '--disable-setuid-sandbox'],
- });
- const page = await browser.newPage();
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
- // Get provider-specific template if available
- const provider = dispensary.product_provider || 'unknown';
- const template = await getCrawlerTemplate(provider, category, 'sandbox');
- let products = [];
- let metrics = {
- quality_score: 0,
- items_extracted: 0,
- fields_missing: 0,
- error_count: 0,
- };
- // Provider-specific extraction logic
- if (provider === 'treez' && template) {
- // Use Treez-specific extraction
- const treezResult = await extractTreezProducts(page, websiteUrl);
- products = treezResult.products;
- metrics = treezResult.metrics;
- }
- else {
- // Generic product extraction
- const genericResult = await extractGenericProducts(page, websiteUrl);
- products = genericResult.products;
- metrics = genericResult.metrics;
- }
- // Update sandbox with results
- metrics.sample_data = products.slice(0, 5);
- await updateSandboxQuality(sandbox.id, metrics);
- // Determine final status based on quality
- const status = metrics.quality_score >= 70 ? 'ready_for_review' :
- metrics.quality_score >= 40 ? 'needs_human_review' : 'pending';
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET
- status = $1,
- urls_tested = $2,
- updated_at = NOW()
- WHERE id = $3`, [status, JSON.stringify([websiteUrl]), sandbox.id]);
- // Update scan time
- await updateCategoryScanTime(dispensaryId, category);
- // Log sandbox completion
- crawler_logger_1.crawlerLogger.sandboxEvent({
- event: 'sandbox_completed',
- dispensary_id: dispensaryId,
- dispensary_name: dispensary.name,
- template_name: provider,
- category: 'product',
- quality_score: metrics.quality_score,
- products_extracted: products.length,
- fields_missing: metrics.fields_missing,
- provider: provider,
- });
- return {
- success: true,
- category,
- message: `Sandbox crawl completed. ${products.length} products extracted, quality score ${metrics.quality_score}`,
- data: {
- sandboxId: sandbox.id,
- productsExtracted: products.length,
- qualityScore: metrics.quality_score,
- status,
- },
- };
- }
- catch (error) {
- // Log sandbox failure
- crawler_logger_1.crawlerLogger.sandboxEvent({
- event: 'sandbox_failed',
- dispensary_id: dispensaryId,
- dispensary_name: dispensary.name,
- template_name: dispensary.product_provider || 'unknown',
- category: 'product',
- error_message: error.message,
- });
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1, error_count = error_count + 1 WHERE id = $2`, [error.message, sandbox.id]);
- return { success: false, category, message: error.message };
- }
- finally {
- if (browser)
- await browser.close();
- }
-}
-/**
- * SandboxSpecialsJob - Sandbox specials crawling
- */
-async function runSandboxSpecialsJob(dispensaryId, sandboxId) {
- const category = 'specials';
- const dispensary = await getDispensaryWithCategories(dispensaryId);
- if (!dispensary) {
- return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
- }
- let sandbox;
- if (sandboxId) {
- const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
- sandbox = result.rows[0];
- }
- else {
- const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.specials_provider, null);
- const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
- sandbox = result.rows[0];
- }
- const websiteUrl = dispensary.website;
- if (!websiteUrl) {
- return { success: false, category, message: 'No website URL available' };
- }
- let browser = null;
- try {
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
- browser = await puppeteer_1.default.launch({
- headless: true,
- args: ['--no-sandbox', '--disable-setuid-sandbox'],
- });
- const page = await browser.newPage();
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
- const result = await extractSpecials(page, websiteUrl);
- await updateSandboxQuality(sandbox.id, {
- ...result.metrics,
- sample_data: result.specials.slice(0, 5),
- });
- const status = result.metrics.quality_score >= 70 ? 'ready_for_review' :
- result.metrics.quality_score >= 40 ? 'needs_human_review' : 'pending';
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]);
- await updateCategoryScanTime(dispensaryId, category);
- return {
- success: true,
- category,
- message: `Sandbox specials crawl completed. ${result.specials.length} specials found.`,
- data: { sandboxId: sandbox.id, specialsCount: result.specials.length },
- };
- }
- catch (error) {
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]);
- return { success: false, category, message: error.message };
- }
- finally {
- if (browser)
- await browser.close();
- }
-}
-/**
- * SandboxBrandJob - Sandbox brand intelligence crawling
- */
-async function runSandboxBrandJob(dispensaryId, sandboxId) {
- const category = 'brand';
- const dispensary = await getDispensaryWithCategories(dispensaryId);
- if (!dispensary) {
- return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
- }
- let sandbox;
- if (sandboxId) {
- const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
- sandbox = result.rows[0];
- }
- else {
- const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.brand_provider, null);
- const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
- sandbox = result.rows[0];
- }
- const websiteUrl = dispensary.website;
- if (!websiteUrl) {
- return { success: false, category, message: 'No website URL available' };
- }
- let browser = null;
- try {
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
- browser = await puppeteer_1.default.launch({
- headless: true,
- args: ['--no-sandbox', '--disable-setuid-sandbox'],
- });
- const page = await browser.newPage();
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
- const result = await extractBrands(page, websiteUrl);
- await updateSandboxQuality(sandbox.id, {
- ...result.metrics,
- sample_data: result.brands.slice(0, 10),
- });
- const status = result.metrics.quality_score >= 70 ? 'ready_for_review' : 'pending';
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]);
- await updateCategoryScanTime(dispensaryId, category);
- return {
- success: true,
- category,
- message: `Sandbox brand crawl completed. ${result.brands.length} brands found.`,
- data: { sandboxId: sandbox.id, brandsCount: result.brands.length },
- };
- }
- catch (error) {
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]);
- return { success: false, category, message: error.message };
- }
- finally {
- if (browser)
- await browser.close();
- }
-}
-/**
- * SandboxMetadataJob - Sandbox metadata crawling
- */
-async function runSandboxMetadataJob(dispensaryId, sandboxId) {
- const category = 'metadata';
- const dispensary = await getDispensaryWithCategories(dispensaryId);
- if (!dispensary) {
- return { success: false, category, message: `Dispensary ${dispensaryId} not found` };
- }
- let sandbox;
- if (sandboxId) {
- const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
- sandbox = result.rows[0];
- }
- else {
- const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.metadata_provider, null);
- const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
- sandbox = result.rows[0];
- }
- const websiteUrl = dispensary.website;
- if (!websiteUrl) {
- return { success: false, category, message: 'No website URL available' };
- }
- let browser = null;
- try {
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
- browser = await puppeteer_1.default.launch({
- headless: true,
- args: ['--no-sandbox', '--disable-setuid-sandbox'],
- });
- const page = await browser.newPage();
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
- const result = await extractMetadata(page, websiteUrl);
- await updateSandboxQuality(sandbox.id, {
- ...result.metrics,
- sample_data: result.categories.slice(0, 20),
- });
- const status = result.metrics.quality_score >= 70 ? 'ready_for_review' : 'pending';
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]);
- await updateCategoryScanTime(dispensaryId, category);
- return {
- success: true,
- category,
- message: `Sandbox metadata crawl completed. ${result.categories.length} categories found.`,
- data: { sandboxId: sandbox.id, categoriesCount: result.categories.length },
- };
- }
- catch (error) {
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]);
- return { success: false, category, message: error.message };
- }
- finally {
- if (browser)
- await browser.close();
- }
-}
-// ========================================
-// Extraction Functions
-// ========================================
-/**
- * Extract products from Treez-powered sites
- */
-async function extractTreezProducts(page, baseUrl) {
- const products = [];
- let errorCount = 0;
- let fieldsMissing = 0;
- try {
- // Navigate to menu
- const menuUrls = ['/menu', '/shop', '/products', '/order'];
- let menuUrl = baseUrl;
- for (const path of menuUrls) {
- try {
- const testUrl = new URL(path, baseUrl).toString();
- await page.goto(testUrl, { waitUntil: 'networkidle2', timeout: 20000 });
- const hasProducts = await page.evaluate(() => {
- const text = document.body.innerText.toLowerCase();
- return text.includes('add to cart') || text.includes('thc') || text.includes('indica');
- });
- if (hasProducts) {
- menuUrl = testUrl;
- break;
- }
- }
- catch {
- // Try next URL
- }
- }
- await page.goto(menuUrl, { waitUntil: 'networkidle2', timeout: 30000 });
- await new Promise(r => setTimeout(r, 3000)); // Wait for dynamic content
- // Look for Treez API data in network requests or page content
- const pageProducts = await page.evaluate(() => {
- const extractedProducts = [];
- // Try common Treez selectors
- const selectors = [
- '.product-card',
- '.menu-item',
- '[data-product]',
- '.product-tile',
- '.menu-product',
- ];
- for (const selector of selectors) {
- const elements = document.querySelectorAll(selector);
- if (elements.length > 3) {
- elements.forEach((el) => {
- const nameEl = el.querySelector('h2, h3, .product-name, .name, [class*="name"]');
- const priceEl = el.querySelector('.price, [class*="price"]');
- const thcEl = el.querySelector('[class*="thc"], [class*="potency"]');
- if (nameEl) {
- extractedProducts.push({
- name: nameEl.textContent?.trim(),
- price: priceEl?.textContent?.trim(),
- thc: thcEl?.textContent?.trim(),
- html: el.outerHTML.slice(0, 500),
- });
- }
- });
- break;
- }
- }
- return extractedProducts;
- });
- products.push(...pageProducts);
- // Calculate quality metrics
- for (const product of products) {
- if (!product.name)
- fieldsMissing++;
- if (!product.price)
- fieldsMissing++;
- }
- }
- catch (error) {
- // Error tracked via errorCount - logged at job level
- errorCount++;
- }
- const qualityScore = products.length > 0
- ? Math.min(100, Math.max(0, 100 - (fieldsMissing * 5) - (errorCount * 10)))
- : 0;
- return {
- products,
- metrics: {
- quality_score: qualityScore,
- items_extracted: products.length,
- fields_missing: fieldsMissing,
- error_count: errorCount,
- },
- };
-}
-/**
- * Extract products using generic selectors
- */
-async function extractGenericProducts(page, baseUrl) {
- const products = [];
- let errorCount = 0;
- let fieldsMissing = 0;
- try {
- // Try common menu paths
- const menuPaths = ['/menu', '/shop', '/products', '/order'];
- let foundMenu = false;
- for (const path of menuPaths) {
- try {
- const fullUrl = new URL(path, baseUrl).toString();
- await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 20000 });
- const hasProducts = await page.evaluate(() => {
- const text = document.body.innerText.toLowerCase();
- return text.includes('add to cart') || text.includes('thc') || text.includes('gram');
- });
- if (hasProducts) {
- foundMenu = true;
- break;
- }
- }
- catch {
- continue;
- }
- }
- if (!foundMenu) {
- await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 30000 });
- }
- await new Promise(r => setTimeout(r, 2000));
- // Generic product extraction
- const pageProducts = await page.evaluate(() => {
- const extractedProducts = [];
- const selectors = [
- '.product',
- '.product-card',
- '.menu-item',
- '.item-card',
- '[data-product]',
- '.strain',
- '.listing',
- ];
- for (const selector of selectors) {
- const elements = document.querySelectorAll(selector);
- if (elements.length > 3) {
- elements.forEach((el) => {
- const nameEl = el.querySelector('h2, h3, h4, .name, .title, [class*="name"]');
- const priceEl = el.querySelector('.price, [class*="price"]');
- const brandEl = el.querySelector('.brand, [class*="brand"]');
- const categoryEl = el.querySelector('.category, [class*="category"], [class*="type"]');
- if (nameEl?.textContent?.trim()) {
- extractedProducts.push({
- name: nameEl.textContent.trim(),
- price: priceEl?.textContent?.trim(),
- brand: brandEl?.textContent?.trim(),
- category: categoryEl?.textContent?.trim(),
- });
- }
- });
- break;
- }
- }
- return extractedProducts;
- });
- products.push(...pageProducts);
- // Calculate missing fields
- for (const product of products) {
- if (!product.name)
- fieldsMissing++;
- if (!product.price)
- fieldsMissing++;
- }
- }
- catch (error) {
- // Error tracked via errorCount - logged at job level
- errorCount++;
- }
- const qualityScore = products.length > 0
- ? Math.min(100, Math.max(0, 80 - (fieldsMissing * 3) - (errorCount * 10)))
- : 0;
- return {
- products,
- metrics: {
- quality_score: qualityScore,
- items_extracted: products.length,
- fields_missing: fieldsMissing,
- error_count: errorCount,
- },
- };
-}
-/**
- * Extract specials/deals
- */
-async function extractSpecials(page, baseUrl) {
- const specials = [];
- let errorCount = 0;
- let fieldsMissing = 0;
- try {
- const specialsPaths = ['/specials', '/deals', '/promotions', '/offers', '/sale'];
- for (const path of specialsPaths) {
- try {
- const fullUrl = new URL(path, baseUrl).toString();
- await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 20000 });
- const pageSpecials = await page.evaluate(() => {
- const extracted = [];
- const selectors = [
- '.special',
- '.deal',
- '.promotion',
- '.offer',
- '[class*="special"]',
- '[class*="deal"]',
- ];
- for (const selector of selectors) {
- const elements = document.querySelectorAll(selector);
- elements.forEach((el) => {
- const titleEl = el.querySelector('h2, h3, h4, .title, .name');
- const descEl = el.querySelector('p, .description, .details');
- const discountEl = el.querySelector('.discount, .savings, [class*="percent"]');
- if (titleEl?.textContent?.trim()) {
- extracted.push({
- title: titleEl.textContent.trim(),
- description: descEl?.textContent?.trim(),
- discount: discountEl?.textContent?.trim(),
- });
- }
- });
- }
- return extracted;
- });
- specials.push(...pageSpecials);
- if (specials.length > 0)
- break;
- }
- catch {
- continue;
- }
- }
- for (const special of specials) {
- if (!special.title)
- fieldsMissing++;
- if (!special.description && !special.discount)
- fieldsMissing++;
- }
- }
- catch (error) {
- // Error tracked via errorCount - logged at job level
- errorCount++;
- }
- const qualityScore = specials.length > 0
- ? Math.min(100, Math.max(0, 70 - (fieldsMissing * 5) - (errorCount * 10)))
- : 0;
- return {
- specials,
- metrics: {
- quality_score: qualityScore,
- items_extracted: specials.length,
- fields_missing: fieldsMissing,
- error_count: errorCount,
- },
- };
-}
-/**
- * Extract brand information
- */
-async function extractBrands(page, baseUrl) {
- const brands = [];
- let errorCount = 0;
- let fieldsMissing = 0;
- try {
- const brandPaths = ['/brands', '/vendors', '/producers', '/menu'];
- for (const path of brandPaths) {
- try {
- const fullUrl = new URL(path, baseUrl).toString();
- await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 20000 });
- const pageBrands = await page.evaluate(() => {
- const extracted = [];
- const brandNames = new Set();
- // Look for brand elements
- const selectors = [
- '.brand',
- '[class*="brand"]',
- '.vendor',
- '.producer',
- ];
- for (const selector of selectors) {
- document.querySelectorAll(selector).forEach((el) => {
- const name = el.textContent?.trim();
- if (name && name.length > 1 && name.length < 100 && !brandNames.has(name)) {
- brandNames.add(name);
- extracted.push({ name });
- }
- });
- }
- // Also extract from filter dropdowns
- document.querySelectorAll('select option, [role="option"]').forEach((el) => {
- const name = el.textContent?.trim();
- if (name && name.length > 1 && name.length < 100 && !brandNames.has(name)) {
- const lowerName = name.toLowerCase();
- if (!['all', 'any', 'select', 'choose', '--'].some(skip => lowerName.includes(skip))) {
- brandNames.add(name);
- extracted.push({ name, source: 'filter' });
- }
- }
- });
- return extracted;
- });
- brands.push(...pageBrands);
- if (brands.length > 5)
- break;
- }
- catch {
- continue;
- }
- }
- }
- catch (error) {
- // Error tracked via errorCount - logged at job level
- errorCount++;
- }
- const qualityScore = brands.length > 0
- ? Math.min(100, Math.max(0, 60 + Math.min(30, brands.length * 2) - (errorCount * 10)))
- : 0;
- return {
- brands,
- metrics: {
- quality_score: qualityScore,
- items_extracted: brands.length,
- fields_missing: fieldsMissing,
- error_count: errorCount,
- },
- };
-}
-/**
- * Extract metadata (categories, taxonomy)
- */
-async function extractMetadata(page, baseUrl) {
- const categories = [];
- let errorCount = 0;
- let fieldsMissing = 0;
- try {
- await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 30000 });
- const menuPaths = ['/menu', '/shop', '/products'];
- for (const path of menuPaths) {
- try {
- await page.goto(new URL(path, baseUrl).toString(), { waitUntil: 'networkidle2', timeout: 15000 });
- break;
- }
- catch {
- continue;
- }
- }
- const pageCategories = await page.evaluate(() => {
- const extracted = [];
- const categoryNames = new Set();
- // Navigation/tab categories
- const navSelectors = [
- 'nav a',
- '.category-nav a',
- '.menu-categories a',
- '[class*="category"] a',
- '.tabs button',
- '.tab-list button',
- ];
- for (const selector of navSelectors) {
- document.querySelectorAll(selector).forEach((el) => {
- const name = el.textContent?.trim();
- if (name && name.length > 1 && name.length < 50 && !categoryNames.has(name)) {
- const lowerName = name.toLowerCase();
- const categoryKeywords = ['flower', 'edible', 'concentrate', 'vape', 'preroll', 'tincture', 'topical', 'accessory', 'indica', 'sativa', 'hybrid'];
- if (categoryKeywords.some(kw => lowerName.includes(kw)) || el.closest('[class*="category"], [class*="menu"]')) {
- categoryNames.add(name);
- extracted.push({ name, type: 'navigation' });
- }
- }
- });
- }
- // Filter categories
- document.querySelectorAll('select, [role="listbox"]').forEach((select) => {
- const label = select.getAttribute('aria-label') || select.previousElementSibling?.textContent?.trim();
- if (label?.toLowerCase().includes('category') || label?.toLowerCase().includes('type')) {
- select.querySelectorAll('option, [role="option"]').forEach((opt) => {
- const name = opt.textContent?.trim();
- if (name && name.length > 1 && !categoryNames.has(name)) {
- const lowerName = name.toLowerCase();
- if (!['all', 'any', 'select', 'choose'].some(skip => lowerName.includes(skip))) {
- categoryNames.add(name);
- extracted.push({ name, type: 'filter' });
- }
- }
- });
- }
- });
- return extracted;
- });
- categories.push(...pageCategories);
- }
- catch (error) {
- // Error tracked via errorCount - logged at job level
- errorCount++;
- }
- const qualityScore = categories.length > 0
- ? Math.min(100, Math.max(0, 50 + Math.min(40, categories.length * 3) - (errorCount * 10)))
- : 0;
- return {
- categories,
- metrics: {
- quality_score: qualityScore,
- items_extracted: categories.length,
- fields_missing: fieldsMissing,
- error_count: errorCount,
- },
- };
-}
-// ========================================
-// Queue Processing Functions
-// ========================================
-/**
- * Process pending category-specific sandbox jobs
- */
-async function processCategorySandboxJobs(category, limit = 5) {
- const jobs = await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
- SET status = 'running', worker_id = $1, started_at = NOW()
- WHERE id IN (
- SELECT id FROM sandbox_crawl_jobs
- WHERE status = 'pending' AND category = $2 AND scheduled_at <= NOW()
- ORDER BY priority DESC, scheduled_at ASC
- LIMIT $3
- FOR UPDATE SKIP LOCKED
- )
- RETURNING *`, [WORKER_ID, category, limit]);
- for (const job of jobs.rows) {
- try {
- let result;
- switch (category) {
- case 'product':
- result = await runSandboxProductsJob(job.dispensary_id, job.sandbox_id);
- break;
- case 'specials':
- result = await runSandboxSpecialsJob(job.dispensary_id, job.sandbox_id);
- break;
- case 'brand':
- result = await runSandboxBrandJob(job.dispensary_id, job.sandbox_id);
- break;
- case 'metadata':
- result = await runSandboxMetadataJob(job.dispensary_id, job.sandbox_id);
- break;
- default:
- result = { success: false, category, message: `Unknown category: ${category}` };
- }
- await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
- SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
- WHERE id = $4`, [
- result.success ? 'completed' : 'failed',
- JSON.stringify(result.data || {}),
- result.success ? null : result.message,
- job.id,
- ]);
- }
- catch (error) {
- await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]);
- }
- }
-}
-/**
- * Run all category production crawls for a dispensary
- * Each category runs independently - failures don't affect others
- */
-async function runAllCategoryProductionCrawls(dispensaryId) {
- const results = [];
- // Run all categories in parallel - independent failures
- const [productResult, specialsResult, brandResult, metadataResult] = await Promise.allSettled([
- runCrawlProductsJob(dispensaryId),
- runCrawlSpecialsJob(dispensaryId),
- runCrawlBrandIntelligenceJob(dispensaryId),
- runCrawlMetadataJob(dispensaryId),
- ]);
- if (productResult.status === 'fulfilled')
- results.push(productResult.value);
- else
- results.push({ success: false, category: 'product', message: productResult.reason?.message || 'Unknown error' });
- if (specialsResult.status === 'fulfilled')
- results.push(specialsResult.value);
- else
- results.push({ success: false, category: 'specials', message: specialsResult.reason?.message || 'Unknown error' });
- if (brandResult.status === 'fulfilled')
- results.push(brandResult.value);
- else
- results.push({ success: false, category: 'brand', message: brandResult.reason?.message || 'Unknown error' });
- if (metadataResult.status === 'fulfilled')
- results.push(metadataResult.value);
- else
- results.push({ success: false, category: 'metadata', message: metadataResult.reason?.message || 'Unknown error' });
- const successCount = results.filter(r => r.success).length;
- const summary = `${successCount}/4 categories succeeded: ${results.map(r => `${r.category}:${r.success ? 'ok' : 'fail'}`).join(', ')}`;
- // Individual category jobs log their own completion via crawlerLogger
- return { results, summary };
-}
-/**
- * Run all category sandbox crawls for a dispensary
- */
-async function runAllCategorySandboxCrawls(dispensaryId) {
- const results = [];
- const [productResult, specialsResult, brandResult, metadataResult] = await Promise.allSettled([
- runSandboxProductsJob(dispensaryId),
- runSandboxSpecialsJob(dispensaryId),
- runSandboxBrandJob(dispensaryId),
- runSandboxMetadataJob(dispensaryId),
- ]);
- if (productResult.status === 'fulfilled')
- results.push(productResult.value);
- else
- results.push({ success: false, category: 'product', message: productResult.reason?.message || 'Unknown error' });
- if (specialsResult.status === 'fulfilled')
- results.push(specialsResult.value);
- else
- results.push({ success: false, category: 'specials', message: specialsResult.reason?.message || 'Unknown error' });
- if (brandResult.status === 'fulfilled')
- results.push(brandResult.value);
- else
- results.push({ success: false, category: 'brand', message: brandResult.reason?.message || 'Unknown error' });
- if (metadataResult.status === 'fulfilled')
- results.push(metadataResult.value);
- else
- results.push({ success: false, category: 'metadata', message: metadataResult.reason?.message || 'Unknown error' });
- const successCount = results.filter(r => r.success).length;
- const summary = `${successCount}/4 sandbox crawls: ${results.map(r => `${r.category}:${r.success ? 'ok' : 'fail'}`).join(', ')}`;
- // Individual sandbox jobs log their own completion via crawlerLogger
- return { results, summary };
-}
diff --git a/backend/dist/services/category-discovery.js b/backend/dist/services/category-discovery.js
deleted file mode 100644
index ce53f818..00000000
--- a/backend/dist/services/category-discovery.js
+++ /dev/null
@@ -1,246 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.discoverCategories = discoverCategories;
-const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
-const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
-const migrate_1 = require("../db/migrate");
-const logger_1 = require("./logger");
-const age_gate_1 = require("../utils/age-gate");
-const dutchie_1 = require("../scrapers/templates/dutchie");
-// Apply stealth plugin
-puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
-const DUTCHIE_CATEGORIES = [
- { name: 'Shop', slug: 'shop' },
- { name: 'Flower', slug: 'flower', parentSlug: 'shop' },
- { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
- { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
- { name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
- { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
- { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
- { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' },
- { name: 'Brands', slug: 'brands' },
- { name: 'Specials', slug: 'specials' }
-];
-const CURALEAF_CATEGORIES = [
- { name: 'Shop', slug: 'shop' },
- { name: 'Flower', slug: 'flower', parentSlug: 'shop' },
- { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
- { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
- { name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
- { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
- { name: 'Tinctures', slug: 'tinctures', parentSlug: 'shop' },
- { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
- { name: 'Capsules', slug: 'capsules', parentSlug: 'shop' },
- { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' }
-];
-async function makePageStealthy(page) {
- await page.evaluateOnNewDocument(() => {
- Object.defineProperty(navigator, 'webdriver', { get: () => false });
- Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
- Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
- window.chrome = { runtime: {} };
- });
-}
-async function isDutchieMenu(page) {
- try {
- // Check page source for Dutchie markers
- const isDutchie = await page.evaluate(() => {
- // Check for window.reactEnv with dutchie URLs
- if (window.reactEnv) {
- const env = window.reactEnv;
- if (env.adminUrl?.includes('dutchie.com') ||
- env.apiUrl?.includes('dutchie.com') ||
- env.consumerUrl?.includes('dutchie.com')) {
- return true;
- }
- }
- // Check HTML source for dutchie references
- const htmlContent = document.documentElement.innerHTML;
- if (htmlContent.includes('admin.dutchie.com') ||
- htmlContent.includes('api.dutchie.com') ||
- htmlContent.includes('embedded-menu') ||
- htmlContent.includes('window.reactEnv')) {
- return true;
- }
- return false;
- });
- return isDutchie;
- }
- catch (error) {
- logger_1.logger.warn('categories', `Error detecting Dutchie menu: ${error}`);
- return false;
- }
-}
-async function discoverCategories(storeId) {
- let browser = null;
- try {
- logger_1.logger.info('categories', `Discovering categories for store ID: ${storeId}`);
- const storeResult = await migrate_1.pool.query(`
- SELECT id, name, slug, dutchie_url
- FROM stores
- WHERE id = $1
- `, [storeId]);
- if (storeResult.rows.length === 0) {
- throw new Error('Store not found');
- }
- const store = storeResult.rows[0];
- const baseUrl = store.dutchie_url;
- // Launch browser to check page source
- browser = await puppeteer_extra_1.default.launch({
- headless: 'new',
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-blink-features=AutomationControlled'
- ]
- });
- const page = await browser.newPage();
- await makePageStealthy(page);
- await page.setViewport({ width: 1920, height: 1080 });
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
- // Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites)
- const state = (0, age_gate_1.detectStateFromUrl)(baseUrl);
- await (0, age_gate_1.setAgeGateCookies)(page, baseUrl, state);
- logger_1.logger.info('categories', `Loading page to detect menu type: ${baseUrl}`);
- await page.goto(baseUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
- await page.waitForTimeout(3000);
- // If age gate still appears, try to bypass it
- await (0, age_gate_1.bypassAgeGate)(page, state);
- // Detect if it's a Dutchie menu by inspecting page source
- const isDutchie = await isDutchieMenu(page);
- await browser.close();
- browser = null;
- if (isDutchie) {
- logger_1.logger.info('categories', `✅ Detected Dutchie menu for ${store.name}`);
- await createDutchieCategories(storeId, store);
- }
- else {
- // Fallback: Use standard cannabis categories for non-Dutchie sites
- logger_1.logger.info('categories', `Non-Dutchie menu detected, using standard cannabis categories for ${store.name}`);
- await createCuraleafCategories(storeId, store);
- }
- }
- catch (error) {
- logger_1.logger.error('categories', `Category discovery error: ${error}`);
- if (browser)
- await browser.close();
- throw error;
- }
-}
-async function createDutchieCategories(storeId, store) {
- const client = await migrate_1.pool.connect();
- try {
- await client.query('BEGIN');
- logger_1.logger.info('categories', `Creating predefined Dutchie category structure`);
- const baseUrl = store.dutchie_url;
- for (const category of DUTCHIE_CATEGORIES) {
- let categoryUrl;
- // Use Dutchie template to build correct category URLs
- if (category.parentSlug) {
- // Subcategory: Use template's buildCategoryUrl (e.g., /products/flower)
- categoryUrl = dutchie_1.dutchieTemplate.buildCategoryUrl(baseUrl, category.name);
- }
- else {
- // Top-level: Use base URL with slug
- categoryUrl = `${baseUrl}/${category.slug}`;
- }
- if (!category.parentSlug) {
- // Create parent category
- await client.query(`
- INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
- VALUES ($1, $2, $3, $4, true)
- ON CONFLICT (store_id, slug)
- DO UPDATE SET name = $2, dutchie_url = $4
- RETURNING id
- `, [storeId, category.name, category.slug, categoryUrl]);
- logger_1.logger.info('categories', `📁 ${category.name}`);
- }
- else {
- // Create subcategory
- const parentResult = await client.query(`
- SELECT id FROM categories
- WHERE store_id = $1 AND slug = $2
- `, [storeId, category.parentSlug]);
- if (parentResult.rows.length > 0) {
- await client.query(`
- INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
- VALUES ($1, $2, $3, $4, true)
- ON CONFLICT (store_id, slug)
- DO UPDATE SET name = $2, dutchie_url = $4
- `, [storeId, category.name, category.slug, categoryUrl]);
- logger_1.logger.info('categories', ` └── ${category.name}`);
- }
- }
- }
- await client.query('COMMIT');
- logger_1.logger.info('categories', `✅ Created ${DUTCHIE_CATEGORIES.length} Dutchie categories successfully`);
- }
- catch (error) {
- await client.query('ROLLBACK');
- logger_1.logger.error('categories', `Failed to create Dutchie categories: ${error}`);
- throw error;
- }
- finally {
- client.release();
- }
-}
-async function createCuraleafCategories(storeId, store) {
- const client = await migrate_1.pool.connect();
- try {
- await client.query('BEGIN');
- logger_1.logger.info('categories', `Creating predefined Curaleaf category structure`);
- const baseUrl = store.dutchie_url;
- for (const category of CURALEAF_CATEGORIES) {
- let categoryUrl;
- if (category.parentSlug) {
- // Subcategory URL - Curaleaf uses pattern like: /stores/{store-slug}/{category}
- categoryUrl = `${baseUrl}?category=${category.slug}`;
- }
- else {
- // Top-level category
- categoryUrl = baseUrl;
- }
- if (!category.parentSlug) {
- // Create parent category
- await client.query(`
- INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
- VALUES ($1, $2, $3, $4, true)
- ON CONFLICT (store_id, slug)
- DO UPDATE SET name = $2, dutchie_url = $4
- RETURNING id
- `, [storeId, category.name, category.slug, categoryUrl]);
- logger_1.logger.info('categories', `📁 ${category.name}`);
- }
- else {
- // Create subcategory
- const parentResult = await client.query(`
- SELECT id FROM categories
- WHERE store_id = $1 AND slug = $2
- `, [storeId, category.parentSlug]);
- if (parentResult.rows.length > 0) {
- await client.query(`
- INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
- VALUES ($1, $2, $3, $4, true)
- ON CONFLICT (store_id, slug)
- DO UPDATE SET name = $2, dutchie_url = $4
- `, [storeId, category.name, category.slug, categoryUrl]);
- logger_1.logger.info('categories', ` └── ${category.name}`);
- }
- }
- }
- await client.query('COMMIT');
- logger_1.logger.info('categories', `✅ Created ${CURALEAF_CATEGORIES.length} Curaleaf categories successfully`);
- }
- catch (error) {
- await client.query('ROLLBACK');
- logger_1.logger.error('categories', `Failed to create Curaleaf categories: ${error}`);
- throw error;
- }
- finally {
- client.release();
- }
-}
diff --git a/backend/dist/services/crawl-scheduler.js b/backend/dist/services/crawl-scheduler.js
deleted file mode 100644
index 271609bc..00000000
--- a/backend/dist/services/crawl-scheduler.js
+++ /dev/null
@@ -1,536 +0,0 @@
-"use strict";
-/**
- * Crawl Scheduler Service
- *
- * This service manages crawl scheduling using a job queue approach.
- * It does NOT modify the crawler - it only TRIGGERS the existing crawler.
- *
- * Features:
- * - Global schedule: crawl all stores every N hours
- * - Daily special run: 12:01 AM local store time
- * - Per-store schedule overrides
- * - Job queue for tracking pending/running crawls
- */
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.getGlobalSchedule = getGlobalSchedule;
-exports.updateGlobalSchedule = updateGlobalSchedule;
-exports.getStoreScheduleStatuses = getStoreScheduleStatuses;
-exports.getStoreSchedule = getStoreSchedule;
-exports.updateStoreSchedule = updateStoreSchedule;
-exports.createCrawlJob = createCrawlJob;
-exports.getPendingJobs = getPendingJobs;
-exports.claimJob = claimJob;
-exports.completeJob = completeJob;
-exports.getRecentJobs = getRecentJobs;
-exports.getAllRecentJobs = getAllRecentJobs;
-exports.checkAndCreateScheduledJobs = checkAndCreateScheduledJobs;
-exports.checkAndCreateDailySpecialJobs = checkAndCreateDailySpecialJobs;
-exports.processJobs = processJobs;
-exports.processOrchestrator = processOrchestrator;
-exports.setSchedulerMode = setSchedulerMode;
-exports.getSchedulerMode = getSchedulerMode;
-exports.startCrawlScheduler = startCrawlScheduler;
-exports.stopCrawlScheduler = stopCrawlScheduler;
-exports.restartCrawlScheduler = restartCrawlScheduler;
-exports.triggerManualCrawl = triggerManualCrawl;
-exports.triggerAllStoresCrawl = triggerAllStoresCrawl;
-exports.cancelJob = cancelJob;
-const node_cron_1 = __importDefault(require("node-cron"));
-const migrate_1 = require("../db/migrate");
-const scraper_v2_1 = require("../scraper-v2");
-const store_crawl_orchestrator_1 = require("./store-crawl-orchestrator");
-// Worker identification
-const WORKER_ID = `worker-${process.pid}-${Date.now()}`;
-let schedulerCronJob = null;
-let jobProcessorRunning = false;
-let orchestratorProcessorRunning = false;
-// Scheduler mode: 'legacy' uses job queue, 'orchestrator' uses intelligent orchestration
-let schedulerMode = 'orchestrator';
-// ============================================
-// Schedule Management
-// ============================================
-/**
- * Get global schedule settings
- */
-async function getGlobalSchedule() {
- const result = await migrate_1.pool.query(`
- SELECT * FROM crawler_schedule ORDER BY id
- `);
- return result.rows;
-}
-/**
- * Update global schedule setting
- */
-async function updateGlobalSchedule(scheduleType, updates) {
- const setClauses = [];
- const values = [];
- let paramIndex = 1;
- if (updates.enabled !== undefined) {
- setClauses.push(`enabled = $${paramIndex++}`);
- values.push(updates.enabled);
- }
- if (updates.interval_hours !== undefined) {
- setClauses.push(`interval_hours = $${paramIndex++}`);
- values.push(updates.interval_hours);
- }
- if (updates.run_time !== undefined) {
- setClauses.push(`run_time = $${paramIndex++}`);
- values.push(updates.run_time);
- }
- values.push(scheduleType);
- const result = await migrate_1.pool.query(`
- UPDATE crawler_schedule
- SET ${setClauses.join(', ')}
- WHERE schedule_type = $${paramIndex}
- RETURNING *
- `, values);
- return result.rows[0];
-}
-/**
- * Get all store schedule statuses
- */
-async function getStoreScheduleStatuses() {
- const result = await migrate_1.pool.query(`SELECT * FROM crawl_schedule_status ORDER BY priority DESC, store_name`);
- return result.rows;
-}
-/**
- * Get or create per-store schedule override
- */
-async function getStoreSchedule(storeId) {
- const result = await migrate_1.pool.query(`
- SELECT * FROM store_crawl_schedule WHERE store_id = $1
- `, [storeId]);
- if (result.rows.length > 0) {
- return result.rows[0];
- }
- // Return default (use global)
- return {
- store_id: storeId,
- enabled: true,
- interval_hours: null,
- daily_special_enabled: true,
- daily_special_time: null,
- priority: 0
- };
-}
-/**
- * Update per-store schedule override
- */
-async function updateStoreSchedule(storeId, updates) {
- const result = await migrate_1.pool.query(`
- INSERT INTO store_crawl_schedule (store_id, enabled, interval_hours, daily_special_enabled, daily_special_time, priority)
- VALUES ($1, $2, $3, $4, $5, $6)
- ON CONFLICT (store_id) DO UPDATE SET
- enabled = COALESCE(EXCLUDED.enabled, store_crawl_schedule.enabled),
- interval_hours = EXCLUDED.interval_hours,
- daily_special_enabled = COALESCE(EXCLUDED.daily_special_enabled, store_crawl_schedule.daily_special_enabled),
- daily_special_time = EXCLUDED.daily_special_time,
- priority = COALESCE(EXCLUDED.priority, store_crawl_schedule.priority),
- updated_at = NOW()
- RETURNING *
- `, [
- storeId,
- updates.enabled ?? true,
- updates.interval_hours ?? null,
- updates.daily_special_enabled ?? true,
- updates.daily_special_time ?? null,
- updates.priority ?? 0
- ]);
- return result.rows[0];
-}
-// ============================================
-// Job Queue Management
-// ============================================
-/**
- * Create a new crawl job
- */
-async function createCrawlJob(storeId, jobType = 'full_crawl', triggerType = 'scheduled', scheduledAt = new Date(), priority = 0) {
- // Check if there's already a pending or running job for this store
- const existing = await migrate_1.pool.query(`
- SELECT id FROM crawl_jobs
- WHERE store_id = $1 AND status IN ('pending', 'running')
- LIMIT 1
- `, [storeId]);
- if (existing.rows.length > 0) {
- console.log(`Skipping job creation for store ${storeId} - already has pending/running job`);
- return existing.rows[0];
- }
- const result = await migrate_1.pool.query(`
- INSERT INTO crawl_jobs (store_id, job_type, trigger_type, scheduled_at, priority, status)
- VALUES ($1, $2, $3, $4, $5, 'pending')
- RETURNING *
- `, [storeId, jobType, triggerType, scheduledAt, priority]);
- console.log(`Created crawl job ${result.rows[0].id} for store ${storeId} (${triggerType})`);
- return result.rows[0];
-}
-/**
- * Get pending jobs ready to run
- */
-async function getPendingJobs(limit = 5) {
- const result = await migrate_1.pool.query(`
- SELECT cj.*, s.name as store_name
- FROM crawl_jobs cj
- JOIN stores s ON s.id = cj.store_id
- WHERE cj.status = 'pending'
- AND cj.scheduled_at <= NOW()
- ORDER BY cj.priority DESC, cj.scheduled_at ASC
- LIMIT $1
- `, [limit]);
- return result.rows;
-}
-/**
- * Claim a job for processing
- */
-async function claimJob(jobId) {
- const result = await migrate_1.pool.query(`
- UPDATE crawl_jobs
- SET status = 'running', started_at = NOW(), worker_id = $2
- WHERE id = $1 AND status = 'pending'
- RETURNING id
- `, [jobId, WORKER_ID]);
- return result.rows.length > 0;
-}
-/**
- * Complete a job
- */
-async function completeJob(jobId, success, results) {
- await migrate_1.pool.query(`
- UPDATE crawl_jobs
- SET
- status = $2,
- completed_at = NOW(),
- products_found = $3,
- error_message = $4
- WHERE id = $1
- `, [
- jobId,
- success ? 'completed' : 'failed',
- results?.products_found ?? null,
- results?.error_message ?? null
- ]);
-}
-/**
- * Get recent jobs for a store
- */
-async function getRecentJobs(storeId, limit = 10) {
- const result = await migrate_1.pool.query(`
- SELECT * FROM crawl_jobs
- WHERE store_id = $1
- ORDER BY created_at DESC
- LIMIT $2
- `, [storeId, limit]);
- return result.rows;
-}
-/**
- * Get all recent jobs
- */
-async function getAllRecentJobs(limit = 50) {
- const result = await migrate_1.pool.query(`
- SELECT cj.*, s.name as store_name, s.slug as store_slug
- FROM crawl_jobs cj
- JOIN stores s ON s.id = cj.store_id
- ORDER BY cj.created_at DESC
- LIMIT $1
- `, [limit]);
- return result.rows;
-}
-// ============================================
-// Scheduler Logic
-// ============================================
-/**
- * Check which stores are due for a crawl and create jobs
- */
-async function checkAndCreateScheduledJobs() {
- console.log('Checking for stores due for crawl...');
- // Get global schedule settings
- const globalSchedule = await migrate_1.pool.query(`
- SELECT * FROM crawler_schedule WHERE schedule_type = 'global_interval'
- `);
- if (globalSchedule.rows.length === 0 || !globalSchedule.rows[0].enabled) {
- console.log('Global scheduler is disabled');
- return 0;
- }
- const intervalHours = globalSchedule.rows[0].interval_hours || 4;
- // Find stores due for crawl
- const result = await migrate_1.pool.query(`
- SELECT
- s.id,
- s.name,
- s.timezone,
- s.last_scraped_at,
- COALESCE(scs.enabled, TRUE) as schedule_enabled,
- COALESCE(scs.interval_hours, $1) as interval_hours,
- COALESCE(scs.priority, 0) as priority
- FROM stores s
- LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
- WHERE s.active = TRUE
- AND s.scrape_enabled = TRUE
- AND COALESCE(scs.enabled, TRUE) = TRUE
- AND (
- s.last_scraped_at IS NULL
- OR s.last_scraped_at < NOW() - (COALESCE(scs.interval_hours, $1) || ' hours')::INTERVAL
- )
- AND NOT EXISTS (
- SELECT 1 FROM crawl_jobs cj
- WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
- )
- ORDER BY COALESCE(scs.priority, 0) DESC, s.last_scraped_at ASC NULLS FIRST
- `, [intervalHours]);
- let jobsCreated = 0;
- for (const store of result.rows) {
- try {
- await createCrawlJob(store.id, 'full_crawl', 'scheduled', new Date(), store.priority);
- jobsCreated++;
- console.log(`Scheduled crawl job for: ${store.name}`);
- }
- catch (error) {
- console.error(`Failed to create job for store ${store.name}:`, error);
- }
- }
- console.log(`Created ${jobsCreated} scheduled crawl jobs`);
- return jobsCreated;
-}
-/**
- * Check for daily special runs (12:01 AM local time)
- */
-async function checkAndCreateDailySpecialJobs() {
- console.log('Checking for daily special runs...');
- // Get daily special schedule
- const dailySchedule = await migrate_1.pool.query(`
- SELECT * FROM crawler_schedule WHERE schedule_type = 'daily_special'
- `);
- if (dailySchedule.rows.length === 0 || !dailySchedule.rows[0].enabled) {
- console.log('Daily special scheduler is disabled');
- return 0;
- }
- const targetTime = dailySchedule.rows[0].run_time || '00:01';
- // Find stores where it's currently the target time in their local timezone
- // and they haven't had a daily special run today
- const result = await migrate_1.pool.query(`
- SELECT
- s.id,
- s.name,
- s.timezone,
- COALESCE(scs.daily_special_enabled, TRUE) as daily_special_enabled,
- COALESCE(scs.daily_special_time, $1::TIME) as daily_special_time,
- COALESCE(scs.priority, 0) as priority
- FROM stores s
- LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
- WHERE s.active = TRUE
- AND s.scrape_enabled = TRUE
- AND COALESCE(scs.daily_special_enabled, TRUE) = TRUE
- -- Check if current time in store timezone matches the target time (within 2 minutes)
- AND ABS(
- EXTRACT(EPOCH FROM (
- (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::TIME
- - COALESCE(scs.daily_special_time, $1::TIME)
- ))
- ) < 120 -- within 2 minutes
- -- Ensure we haven't already created a daily_special job today for this store
- AND NOT EXISTS (
- SELECT 1 FROM crawl_jobs cj
- WHERE cj.store_id = s.id
- AND cj.trigger_type = 'daily_special'
- AND cj.created_at > (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::DATE
- )
- AND NOT EXISTS (
- SELECT 1 FROM crawl_jobs cj
- WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running')
- )
- ORDER BY COALESCE(scs.priority, 0) DESC
- `, [targetTime]);
- let jobsCreated = 0;
- for (const store of result.rows) {
- try {
- await createCrawlJob(store.id, 'full_crawl', 'daily_special', new Date(), store.priority + 10);
- jobsCreated++;
- console.log(`Created daily special job for: ${store.name} (${store.timezone})`);
- }
- catch (error) {
- console.error(`Failed to create daily special job for store ${store.name}:`, error);
- }
- }
- if (jobsCreated > 0) {
- console.log(`Created ${jobsCreated} daily special crawl jobs`);
- }
- return jobsCreated;
-}
-/**
- * Process pending jobs
- */
-async function processJobs() {
- if (jobProcessorRunning) {
- console.log('Job processor already running, skipping...');
- return;
- }
- jobProcessorRunning = true;
- try {
- const jobs = await getPendingJobs(1); // Process one at a time for safety
- for (const job of jobs) {
- console.log(`Processing job ${job.id} for store: ${job.store_name}`);
- const claimed = await claimJob(job.id);
- if (!claimed) {
- console.log(`Job ${job.id} already claimed by another worker`);
- continue;
- }
- try {
- // Call the existing scraper - DO NOT MODIFY SCRAPER LOGIC
- await (0, scraper_v2_1.scrapeStore)(job.store_id);
- // Update store's last_scraped_at
- await migrate_1.pool.query(`
- UPDATE stores SET last_scraped_at = NOW() WHERE id = $1
- `, [job.store_id]);
- await completeJob(job.id, true, {});
- console.log(`Job ${job.id} completed successfully`);
- }
- catch (error) {
- console.error(`Job ${job.id} failed:`, error);
- await completeJob(job.id, false, { error_message: error.message });
- }
- }
- }
- finally {
- jobProcessorRunning = false;
- }
-}
-/**
- * Process stores using the intelligent orchestrator
- * This replaces the simple job queue approach with intelligent provider detection
- */
-async function processOrchestrator() {
- if (orchestratorProcessorRunning) {
- console.log('Orchestrator processor already running, skipping...');
- return;
- }
- orchestratorProcessorRunning = true;
- try {
- // Get stores due for orchestration (respects schedule, intervals, etc.)
- const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(3); // Process up to 3 at a time
- if (storeIds.length === 0) {
- return;
- }
- console.log(`Orchestrator: Processing ${storeIds.length} stores due for crawl`);
- // Process each store through the orchestrator
- for (const storeId of storeIds) {
- try {
- console.log(`Orchestrator: Starting crawl for store ${storeId}`);
- const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId);
- console.log(`Orchestrator: Store ${storeId} completed - ${result.summary}`);
- }
- catch (error) {
- console.error(`Orchestrator: Store ${storeId} failed - ${error.message}`);
- }
- }
- console.log(`Orchestrator: Finished processing ${storeIds.length} stores`);
- }
- finally {
- orchestratorProcessorRunning = false;
- }
-}
-// ============================================
-// Scheduler Control
-// ============================================
-/**
- * Set scheduler mode
- */
-function setSchedulerMode(mode) {
- schedulerMode = mode;
- console.log(`Scheduler mode set to: ${mode}`);
-}
-/**
- * Get current scheduler mode
- */
-function getSchedulerMode() {
- return schedulerMode;
-}
-/**
- * Start the scheduler (runs every minute to check for due jobs)
- */
-async function startCrawlScheduler() {
- stopCrawlScheduler();
- console.log(`Starting crawl scheduler in ${schedulerMode} mode...`);
- // Run every minute
- schedulerCronJob = node_cron_1.default.schedule('* * * * *', async () => {
- try {
- if (schedulerMode === 'orchestrator') {
- // Use intelligent orchestrator (handles detection + crawl)
- await processOrchestrator();
- }
- else {
- // Legacy mode: job queue approach
- // Check for interval-based scheduled jobs
- await checkAndCreateScheduledJobs();
- // Check for daily special runs
- await checkAndCreateDailySpecialJobs();
- // Process any pending jobs
- await processJobs();
- }
- }
- catch (error) {
- console.error('Scheduler tick error:', error);
- }
- });
- console.log(`Crawl scheduler started in ${schedulerMode} mode (checking every minute)`);
-}
-/**
- * Stop the scheduler
- */
-function stopCrawlScheduler() {
- if (schedulerCronJob) {
- schedulerCronJob.stop();
- schedulerCronJob = null;
- console.log('Crawl scheduler stopped');
- }
-}
-/**
- * Restart the scheduler
- */
-async function restartCrawlScheduler() {
- await startCrawlScheduler();
-}
-// ============================================
-// Manual Triggers
-// ============================================
-/**
- * Manually trigger a crawl for a specific store (creates a job immediately)
- */
-async function triggerManualCrawl(storeId) {
- console.log(`Manual crawl triggered for store ID: ${storeId}`);
- return await createCrawlJob(storeId, 'full_crawl', 'manual', new Date(), 100); // High priority
-}
-/**
- * Manually trigger crawls for all stores
- */
-async function triggerAllStoresCrawl() {
- console.log('Manual crawl triggered for all stores');
- const result = await migrate_1.pool.query(`
- SELECT id, name FROM stores
- WHERE active = TRUE AND scrape_enabled = TRUE
- AND NOT EXISTS (
- SELECT 1 FROM crawl_jobs cj
- WHERE cj.store_id = stores.id AND cj.status IN ('pending', 'running')
- )
- `);
- let jobsCreated = 0;
- for (const store of result.rows) {
- await createCrawlJob(store.id, 'full_crawl', 'manual', new Date(), 50);
- jobsCreated++;
- }
- console.log(`Created ${jobsCreated} manual crawl jobs`);
- return jobsCreated;
-}
-/**
- * Cancel a pending job
- */
-async function cancelJob(jobId) {
- const result = await migrate_1.pool.query(`
- UPDATE crawl_jobs
- SET status = 'cancelled'
- WHERE id = $1 AND status = 'pending'
- RETURNING id
- `, [jobId]);
- return result.rows.length > 0;
-}
diff --git a/backend/dist/services/crawler-jobs.js b/backend/dist/services/crawler-jobs.js
deleted file mode 100644
index 6bf28e3f..00000000
--- a/backend/dist/services/crawler-jobs.js
+++ /dev/null
@@ -1,476 +0,0 @@
-"use strict";
-/**
- * Crawler Jobs Service
- *
- * Handles three types of jobs:
- * 1. DetectMenuProviderJob - Detect menu provider for a dispensary
- * 2. DutchieMenuCrawlJob - Production Dutchie crawl
- * 3. SandboxCrawlJob - Learning/testing crawl for unknown providers
- */
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.runDetectMenuProviderJob = runDetectMenuProviderJob;
-exports.runDutchieMenuCrawlJob = runDutchieMenuCrawlJob;
-exports.runSandboxCrawlJob = runSandboxCrawlJob;
-exports.processSandboxJobs = processSandboxJobs;
-const migrate_1 = require("../db/migrate");
-const logger_1 = require("./logger");
-const menu_provider_detector_1 = require("./menu-provider-detector");
-const scraper_v2_1 = require("../scraper-v2");
-const puppeteer_1 = __importDefault(require("puppeteer"));
-const fs_1 = require("fs");
-const path_1 = __importDefault(require("path"));
-const availability_1 = require("./availability");
-const WORKER_ID = `crawler-${process.pid}-${Date.now()}`;
-// ========================================
-// Helper Functions
-// ========================================
-async function getDispensary(dispensaryId) {
- const result = await migrate_1.pool.query(`SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence,
- crawler_mode, crawler_status, scraper_template
- FROM dispensaries WHERE id = $1`, [dispensaryId]);
- return result.rows[0] || null;
-}
-async function updateDispensary(dispensaryId, updates) {
- const setClauses = [];
- const values = [];
- let paramIndex = 1;
- for (const [key, value] of Object.entries(updates)) {
- setClauses.push(`${key} = $${paramIndex}`);
- values.push(value);
- paramIndex++;
- }
- setClauses.push(`updated_at = NOW()`);
- values.push(dispensaryId);
- await migrate_1.pool.query(`UPDATE dispensaries SET ${setClauses.join(', ')} WHERE id = $${paramIndex}`, values);
-}
-async function createSandboxEntry(dispensaryId, suspectedProvider, mode, detectionSignals) {
- // First, check if there's an existing active sandbox
- const existing = await migrate_1.pool.query(`SELECT id FROM crawler_sandboxes
- WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')`, [dispensaryId]);
- if (existing.rows.length > 0) {
- // Update existing
- await migrate_1.pool.query(`UPDATE crawler_sandboxes
- SET suspected_menu_provider = $2, mode = $3, detection_signals = COALESCE($4, detection_signals), updated_at = NOW()
- WHERE id = $1`, [existing.rows[0].id, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : null]);
- return existing.rows[0].id;
- }
- // Create new
- const result = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, suspected_menu_provider, mode, detection_signals, status)
- VALUES ($1, $2, $3, $4, 'pending')
- RETURNING id`, [dispensaryId, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : '{}']);
- return result.rows[0].id;
-}
-async function createSandboxJob(dispensaryId, sandboxId, jobType, priority = 0) {
- const result = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
- VALUES ($1, $2, $3, 'pending', $4)
- RETURNING id`, [dispensaryId, sandboxId, jobType, priority]);
- return result.rows[0].id;
-}
-// Get linked store ID for a dispensary (for using existing scraper)
-async function getStoreIdForDispensary(dispensaryId) {
- // Check if there's a stores entry linked to this dispensary
- const result = await migrate_1.pool.query(`SELECT s.id FROM stores s
- JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%'
- WHERE d.id = $1
- LIMIT 1`, [dispensaryId]);
- if (result.rows.length > 0) {
- return result.rows[0].id;
- }
- // Try to find by website
- const result2 = await migrate_1.pool.query(`SELECT s.id FROM stores s
- JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%'
- WHERE d.id = $1
- LIMIT 1`, [dispensaryId]);
- return result2.rows[0]?.id || null;
-}
-// ========================================
-// Job 1: Detect Menu Provider
-// ========================================
-async function runDetectMenuProviderJob(dispensaryId) {
- logger_1.logger.info('crawler-jobs', `Starting menu provider detection for dispensary ${dispensaryId}`);
- const dispensary = await getDispensary(dispensaryId);
- if (!dispensary) {
- return { success: false, message: `Dispensary ${dispensaryId} not found` };
- }
- // Check for website URL
- const websiteUrl = dispensary.website || dispensary.menu_url;
- if (!websiteUrl) {
- await updateDispensary(dispensaryId, {
- crawler_status: 'error_needs_review',
- last_menu_error_at: new Date(),
- last_error_message: 'No website URL available for detection',
- });
- return { success: false, message: 'No website URL available' };
- }
- try {
- // Run detection
- const detection = await (0, menu_provider_detector_1.detectMenuProvider)(websiteUrl, {
- checkMenuPaths: true,
- timeout: 30000,
- });
- // Update dispensary with results
- const updates = {
- menu_provider: detection.provider,
- menu_provider_confidence: detection.confidence,
- provider_detection_data: JSON.stringify({
- signals: detection.signals,
- urlsTested: detection.urlsTested,
- menuEntryPoints: detection.menuEntryPoints,
- rawSignals: detection.rawSignals,
- detectedAt: new Date().toISOString(),
- }),
- crawler_status: 'idle',
- };
- // Decide crawler mode based on provider
- if (detection.provider === 'dutchie' && detection.confidence >= 70) {
- // Dutchie with high confidence -> production
- updates.crawler_mode = 'production';
- logger_1.logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as Dutchie (${detection.confidence}%), setting to production`);
- }
- else {
- // Unknown or non-Dutchie -> sandbox
- updates.crawler_mode = 'sandbox';
- // Create sandbox entry for further analysis
- const sandboxId = await createSandboxEntry(dispensaryId, detection.provider, 'detection', {
- signals: detection.signals,
- rawSignals: detection.rawSignals,
- });
- // Queue sandbox crawl job
- await createSandboxJob(dispensaryId, sandboxId, 'detection');
- logger_1.logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as ${detection.provider} (${detection.confidence}%), setting to sandbox`);
- }
- // Update menu entry points if found
- if (detection.menuEntryPoints.length > 0 && !dispensary.menu_url) {
- updates.menu_url = detection.menuEntryPoints[0];
- }
- await updateDispensary(dispensaryId, updates);
- return {
- success: true,
- message: `Detected provider: ${detection.provider} (${detection.confidence}%)`,
- data: {
- provider: detection.provider,
- confidence: detection.confidence,
- mode: updates.crawler_mode,
- menuEntryPoints: detection.menuEntryPoints,
- },
- };
- }
- catch (error) {
- logger_1.logger.error('crawler-jobs', `Detection failed for dispensary ${dispensaryId}: ${error.message}`);
- await updateDispensary(dispensaryId, {
- crawler_status: 'error_needs_review',
- last_menu_error_at: new Date(),
- last_error_message: `Detection failed: ${error.message}`,
- });
- return { success: false, message: error.message };
- }
-}
-// ========================================
-// Job 2: Dutchie Menu Crawl (Production)
-// ========================================
-async function runDutchieMenuCrawlJob(dispensaryId) {
- logger_1.logger.info('crawler-jobs', `Starting Dutchie production crawl for dispensary ${dispensaryId}`);
- const dispensary = await getDispensary(dispensaryId);
- if (!dispensary) {
- return { success: false, message: `Dispensary ${dispensaryId} not found` };
- }
- // Verify it's a Dutchie production dispensary
- if (dispensary.menu_provider !== 'dutchie') {
- logger_1.logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not Dutchie, skipping production crawl`);
- return { success: false, message: 'Not a Dutchie dispensary' };
- }
- if (dispensary.crawler_mode !== 'production') {
- logger_1.logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not in production mode, skipping`);
- return { success: false, message: 'Not in production mode' };
- }
- // Find linked store ID
- const storeId = await getStoreIdForDispensary(dispensaryId);
- if (!storeId) {
- // Need to create a store entry or handle differently
- logger_1.logger.warn('crawler-jobs', `No linked store found for dispensary ${dispensaryId}`);
- return { success: false, message: 'No linked store found - needs setup' };
- }
- try {
- // Update status to running
- await updateDispensary(dispensaryId, { crawler_status: 'running' });
- // Run the existing Dutchie scraper
- await (0, scraper_v2_1.scrapeStore)(storeId, 3); // 3 parallel workers
- // Update success status
- await updateDispensary(dispensaryId, {
- crawler_status: 'ok',
- last_menu_scrape: new Date(),
- menu_scrape_status: 'active',
- });
- logger_1.logger.info('crawler-jobs', `Dutchie crawl completed for dispensary ${dispensaryId}`);
- return {
- success: true,
- message: 'Dutchie crawl completed successfully',
- data: { storeId },
- };
- }
- catch (error) {
- logger_1.logger.error('crawler-jobs', `Dutchie crawl failed for dispensary ${dispensaryId}: ${error.message}`);
- // Check if this might be a provider change
- let providerChanged = false;
- try {
- const browser = await puppeteer_1.default.launch({ headless: true, args: ['--no-sandbox'] });
- const page = await browser.newPage();
- const url = dispensary.menu_url || dispensary.website;
- if (url) {
- await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
- const changeResult = await (0, menu_provider_detector_1.detectProviderChange)(page, 'dutchie');
- providerChanged = changeResult.changed;
- if (providerChanged) {
- // Provider changed - move to sandbox
- await updateDispensary(dispensaryId, {
- crawler_mode: 'sandbox',
- crawler_status: 'error_needs_review',
- last_menu_error_at: new Date(),
- last_error_message: `Provider appears to have changed from Dutchie to ${changeResult.newProvider}`,
- });
- const sandboxId = await createSandboxEntry(dispensaryId, changeResult.newProvider || 'unknown', 'detection', { providerChangeDetected: true, previousProvider: 'dutchie' });
- await createSandboxJob(dispensaryId, sandboxId, 'detection');
- logger_1.logger.warn('crawler-jobs', `Provider change detected for dispensary ${dispensaryId}: Dutchie -> ${changeResult.newProvider}`);
- }
- }
- await browser.close();
- }
- catch {
- // Ignore detection errors during failure handling
- }
- if (!providerChanged) {
- await updateDispensary(dispensaryId, {
- crawler_status: 'error_needs_review',
- last_menu_error_at: new Date(),
- last_error_message: error.message,
- });
- }
- return { success: false, message: error.message };
- }
-}
-// ========================================
-// Job 3: Sandbox Crawl (Learning Mode)
-// ========================================
-async function runSandboxCrawlJob(dispensaryId, sandboxId) {
- logger_1.logger.info('crawler-jobs', `Starting sandbox crawl for dispensary ${dispensaryId}`);
- const dispensary = await getDispensary(dispensaryId);
- if (!dispensary) {
- return { success: false, message: `Dispensary ${dispensaryId} not found` };
- }
- // Get or create sandbox entry
- let sandbox;
- if (sandboxId) {
- const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
- sandbox = result.rows[0];
- }
- else {
- const result = await migrate_1.pool.query(`SELECT * FROM crawler_sandboxes
- WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')
- ORDER BY created_at DESC LIMIT 1`, [dispensaryId]);
- sandbox = result.rows[0];
- if (!sandbox) {
- const newSandboxId = await createSandboxEntry(dispensaryId, dispensary.menu_provider, 'template_learning');
- const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]);
- sandbox = result.rows[0];
- }
- }
- const websiteUrl = dispensary.menu_url || dispensary.website;
- if (!websiteUrl) {
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = 'No website URL' WHERE id = $1`, [sandbox.id]);
- return { success: false, message: 'No website URL available' };
- }
- let browser = null;
- try {
- // Update status
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]);
- await updateDispensary(dispensaryId, { crawler_status: 'running' });
- // Launch browser
- browser = await puppeteer_1.default.launch({
- headless: true,
- args: ['--no-sandbox', '--disable-setuid-sandbox'],
- });
- const page = await browser.newPage();
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
- // URLs to crawl (limited depth for sandbox)
- const urlsToVisit = [websiteUrl];
- const menuPaths = ['/menu', '/shop', '/products', '/order'];
- for (const path of menuPaths) {
- const baseUrl = new URL(websiteUrl).origin;
- urlsToVisit.push(`${baseUrl}${path}`);
- }
- const urlsTested = [];
- const menuEntryPoints = [];
- const capturedHtml = [];
- const analysisData = {
- provider_signals: {},
- selector_candidates: [],
- page_structures: [],
- };
- // Crawl each URL
- for (const url of urlsToVisit) {
- try {
- urlsTested.push(url);
- await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
- await new Promise(r => setTimeout(r, 2000)); // Wait for dynamic content
- // Get page HTML
- const html = await page.content();
- // Check if this looks like a menu page
- const hasMenuContent = await page.evaluate(() => {
- const text = document.body.innerText.toLowerCase();
- return (text.includes('add to cart') ||
- text.includes('thc') ||
- text.includes('indica') ||
- text.includes('sativa'));
- });
- if (hasMenuContent) {
- menuEntryPoints.push(url);
- capturedHtml.push({ url, html });
- // Analyze page structure for selector candidates
- const structure = await page.evaluate(() => {
- const candidates = [];
- // Look for product-like containers
- const productSelectors = [
- '.product', '.product-card', '.menu-item', '.item-card',
- '[data-product]', '[data-item]', '.strain', '.listing',
- ];
- for (const selector of productSelectors) {
- const els = document.querySelectorAll(selector);
- if (els.length > 3) { // Likely a list
- candidates.push({
- selector,
- count: els.length,
- type: 'product_container',
- });
- }
- }
- // Look for price patterns
- const pricePattern = /\$\d+(\.\d{2})?/;
- const textNodes = document.body.innerText;
- const priceMatches = textNodes.match(/\$\d+(\.\d{2})?/g);
- return {
- candidates,
- priceCount: priceMatches?.length || 0,
- hasAddToCart: textNodes.toLowerCase().includes('add to cart'),
- };
- });
- // Extract availability hints from page content
- const availabilityHints = (0, availability_1.extractAvailabilityHints)(html);
- analysisData.page_structures.push({
- url,
- ...structure,
- availabilityHints,
- });
- }
- }
- catch (pageError) {
- if (!pageError.message.includes('404')) {
- logger_1.logger.warn('crawler-jobs', `Sandbox crawl error for ${url}: ${pageError.message}`);
- }
- }
- }
- // Save HTML to storage (local for now, S3 later)
- let rawHtmlLocation = null;
- if (capturedHtml.length > 0) {
- const htmlDir = path_1.default.join(process.cwd(), 'sandbox-data', `dispensary-${dispensaryId}`);
- await fs_1.promises.mkdir(htmlDir, { recursive: true });
- for (const { url, html } of capturedHtml) {
- const filename = `${Date.now()}-${url.replace(/[^a-z0-9]/gi, '_')}.html`;
- await fs_1.promises.writeFile(path_1.default.join(htmlDir, filename), html);
- }
- rawHtmlLocation = htmlDir;
- }
- // Update sandbox with results
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET
- status = $1,
- urls_tested = $2,
- menu_entry_points = $3,
- raw_html_location = $4,
- analysis_json = $5,
- confidence_score = $6,
- analyzed_at = NOW(),
- updated_at = NOW()
- WHERE id = $7`, [
- menuEntryPoints.length > 0 ? 'needs_human_review' : 'pending',
- JSON.stringify(urlsTested),
- JSON.stringify(menuEntryPoints),
- rawHtmlLocation,
- JSON.stringify(analysisData),
- menuEntryPoints.length > 0 ? 50 : 20,
- sandbox.id,
- ]);
- // Update dispensary status
- await updateDispensary(dispensaryId, {
- crawler_status: 'error_needs_review', // Sandbox results need review
- });
- logger_1.logger.info('crawler-jobs', `Sandbox crawl completed for dispensary ${dispensaryId}: ${menuEntryPoints.length} menu pages found`);
- return {
- success: true,
- message: `Sandbox crawl completed. Found ${menuEntryPoints.length} menu entry points.`,
- data: {
- sandboxId: sandbox.id,
- urlsTested: urlsTested.length,
- menuEntryPoints,
- analysisData,
- },
- };
- }
- catch (error) {
- logger_1.logger.error('crawler-jobs', `Sandbox crawl failed for dispensary ${dispensaryId}: ${error.message}`);
- await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]);
- await updateDispensary(dispensaryId, {
- crawler_status: 'error_needs_review',
- last_menu_error_at: new Date(),
- last_error_message: `Sandbox crawl failed: ${error.message}`,
- });
- return { success: false, message: error.message };
- }
- finally {
- if (browser) {
- await browser.close();
- }
- }
-}
-// ========================================
-// Queue Processing Functions
-// ========================================
-/**
- * Process pending sandbox jobs
- */
-async function processSandboxJobs(limit = 5) {
- // Claim pending jobs
- const jobs = await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
- SET status = 'running', worker_id = $1, started_at = NOW()
- WHERE id IN (
- SELECT id FROM sandbox_crawl_jobs
- WHERE status = 'pending' AND scheduled_at <= NOW()
- ORDER BY priority DESC, scheduled_at ASC
- LIMIT $2
- FOR UPDATE SKIP LOCKED
- )
- RETURNING *`, [WORKER_ID, limit]);
- for (const job of jobs.rows) {
- try {
- let result;
- if (job.job_type === 'detection') {
- result = await runDetectMenuProviderJob(job.dispensary_id);
- }
- else {
- result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id);
- }
- await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
- SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
- WHERE id = $4`, [
- result.success ? 'completed' : 'failed',
- JSON.stringify(result.data || {}),
- result.success ? null : result.message,
- job.id,
- ]);
- }
- catch (error) {
- await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]);
- }
- }
-}
diff --git a/backend/dist/services/crawler-logger.js b/backend/dist/services/crawler-logger.js
deleted file mode 100644
index 72c0fcbe..00000000
--- a/backend/dist/services/crawler-logger.js
+++ /dev/null
@@ -1,202 +0,0 @@
-"use strict";
-/**
- * CrawlerLogger - Structured logging for crawler operations
- *
- * High-signal, low-noise logging with JSON output for:
- * - Job lifecycle (one summary per job)
- * - Provider/mode changes
- * - Sandbox events
- * - Queue failures
- *
- * NO per-product logging - that's too noisy.
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.crawlerLogger = void 0;
-class CrawlerLoggerService {
- formatLog(payload) {
- return JSON.stringify(payload);
- }
- log(payload) {
- const formatted = this.formatLog(payload);
- switch (payload.level) {
- case 'error':
- console.error(`[CRAWLER] ${formatted}`);
- break;
- case 'warn':
- console.warn(`[CRAWLER] ${formatted}`);
- break;
- case 'debug':
- console.debug(`[CRAWLER] ${formatted}`);
- break;
- default:
- console.log(`[CRAWLER] ${formatted}`);
- }
- }
- /**
- * Log when a crawl job starts
- */
- jobStarted(params) {
- this.log({
- timestamp: new Date().toISOString(),
- level: 'info',
- event: 'job_started',
- job_id: params.job_id,
- store_id: params.store_id,
- store_name: params.store_name,
- job_type: params.job_type,
- trigger_type: params.trigger_type,
- provider: params.provider,
- });
- }
- /**
- * Log when a crawl job completes successfully
- */
- jobCompleted(params) {
- this.log({
- timestamp: new Date().toISOString(),
- level: 'info',
- event: 'job_completed',
- job_id: params.job_id,
- store_id: params.store_id,
- store_name: params.store_name,
- duration_ms: params.duration_ms,
- products_found: params.products_found,
- products_new: params.products_new,
- products_updated: params.products_updated,
- products_marked_oos: params.products_marked_oos,
- provider: params.provider,
- });
- }
- /**
- * Log when a crawl job fails
- */
- jobFailed(params) {
- this.log({
- timestamp: new Date().toISOString(),
- level: 'error',
- event: 'job_failed',
- job_id: params.job_id,
- store_id: params.store_id,
- store_name: params.store_name,
- duration_ms: params.duration_ms,
- error_message: params.error_message,
- error_code: params.error_code,
- provider: params.provider,
- });
- }
- /**
- * Log when a provider is detected for a dispensary
- */
- providerDetected(params) {
- this.log({
- timestamp: new Date().toISOString(),
- level: 'info',
- event: 'provider_detected',
- dispensary_id: params.dispensary_id,
- dispensary_name: params.dispensary_name,
- detected_provider: params.detected_provider,
- confidence: params.confidence,
- detection_method: params.detection_method,
- menu_url: params.menu_url,
- category: params.category,
- });
- }
- /**
- * Log when a dispensary's provider changes
- */
- providerChanged(params) {
- this.log({
- timestamp: new Date().toISOString(),
- level: 'info',
- event: 'provider_changed',
- dispensary_id: params.dispensary_id,
- dispensary_name: params.dispensary_name,
- old_provider: params.old_provider,
- new_provider: params.new_provider,
- old_confidence: params.old_confidence,
- new_confidence: params.new_confidence,
- category: params.category,
- });
- }
- /**
- * Log when a dispensary's crawler mode changes (sandbox -> production, etc.)
- */
- modeChanged(params) {
- this.log({
- timestamp: new Date().toISOString(),
- level: 'info',
- event: 'mode_changed',
- dispensary_id: params.dispensary_id,
- dispensary_name: params.dispensary_name,
- old_mode: params.old_mode,
- new_mode: params.new_mode,
- reason: params.reason,
- category: params.category,
- provider: params.provider,
- });
- }
- /**
- * Log sandbox crawl events
- */
- sandboxEvent(params) {
- const level = params.event === 'sandbox_failed' ? 'error' : 'info';
- this.log({
- timestamp: new Date().toISOString(),
- level,
- event: params.event,
- dispensary_id: params.dispensary_id,
- dispensary_name: params.dispensary_name,
- template_name: params.template_name,
- category: params.category,
- quality_score: params.quality_score,
- products_extracted: params.products_extracted,
- fields_missing: params.fields_missing,
- error_message: params.error_message,
- provider: params.provider,
- });
- }
- /**
- * Log queue processing failures
- */
- queueFailure(params) {
- this.log({
- timestamp: new Date().toISOString(),
- level: 'error',
- event: 'queue_failure',
- queue_type: params.queue_type,
- error_message: params.error_message,
- affected_items: params.affected_items,
- });
- }
- /**
- * Log detection scan summary
- */
- detectionScan(params) {
- this.log({
- timestamp: new Date().toISOString(),
- level: 'info',
- event: 'detection_scan',
- total_scanned: params.total_scanned,
- detected: params.detected,
- failed: params.failed,
- skipped: params.skipped,
- duration_ms: params.duration_ms,
- });
- }
- /**
- * Log intelligence run summary
- */
- intelligenceRun(params) {
- this.log({
- timestamp: new Date().toISOString(),
- level: 'info',
- event: 'intelligence_run',
- run_type: params.run_type,
- dispensaries_processed: params.dispensaries_processed,
- jobs_queued: params.jobs_queued,
- duration_ms: params.duration_ms,
- });
- }
-}
-// Export singleton instance
-exports.crawlerLogger = new CrawlerLoggerService();
diff --git a/backend/dist/services/dispensary-orchestrator.js b/backend/dist/services/dispensary-orchestrator.js
deleted file mode 100644
index 69b92245..00000000
--- a/backend/dist/services/dispensary-orchestrator.js
+++ /dev/null
@@ -1,394 +0,0 @@
-"use strict";
-/**
- * Dispensary Crawl Orchestrator
- *
- * Orchestrates the complete crawl workflow for a dispensary:
- * 1. Load dispensary data
- * 2. Check if provider detection is needed
- * 3. Run provider detection if needed
- * 4. Queue appropriate crawl jobs based on provider/mode
- * 5. Update dispensary_crawl_schedule with meaningful status
- *
- * This works DIRECTLY with dispensaries (not through stores table).
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.runDispensaryOrchestrator = runDispensaryOrchestrator;
-exports.runBatchDispensaryOrchestrator = runBatchDispensaryOrchestrator;
-exports.getDispensariesDueForOrchestration = getDispensariesDueForOrchestration;
-exports.ensureAllDispensariesHaveSchedules = ensureAllDispensariesHaveSchedules;
-exports.processDispensaryScheduler = processDispensaryScheduler;
-const uuid_1 = require("uuid");
-const migrate_1 = require("../db/migrate");
-const crawler_logger_1 = require("./crawler-logger");
-const intelligence_detector_1 = require("./intelligence-detector");
-const category_crawler_jobs_1 = require("./category-crawler-jobs");
-// ========================================
-// Main Orchestrator Function
-// ========================================
-/**
- * Run the complete crawl orchestration for a dispensary
- *
- * Behavior:
- * 1. Load the dispensary info
- * 2. If product_provider is missing or stale (>7 days), run detection
- * 3. After detection:
- * - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
- * - Otherwise: Run sandbox crawl
- * 4. Update dispensary_crawl_schedule with status/summary
- */
-async function runDispensaryOrchestrator(dispensaryId, scheduleId) {
- const startTime = Date.now();
- const runId = (0, uuid_1.v4)();
- let result = {
- status: 'pending',
- summary: '',
- runId,
- dispensaryId,
- dispensaryName: '',
- detectionRan: false,
- crawlRan: false,
- durationMs: 0,
- };
- try {
- // Mark schedule as running
- await updateScheduleStatus(dispensaryId, 'running', 'Starting orchestrator...', null, runId);
- // 1. Load dispensary info
- const dispensary = await getDispensaryInfo(dispensaryId);
- if (!dispensary) {
- throw new Error(`Dispensary ${dispensaryId} not found`);
- }
- result.dispensaryName = dispensary.name;
- // 2. Check if provider detection is needed
- const needsDetection = await checkNeedsDetection(dispensary);
- if (needsDetection) {
- // Run provider detection
- const websiteUrl = dispensary.menu_url || dispensary.website;
- if (!websiteUrl) {
- result.status = 'error';
- result.summary = 'No website URL available for detection';
- result.error = 'Dispensary has no menu_url or website configured';
- await updateScheduleStatus(dispensaryId, 'error', result.summary, result.error, runId);
- result.durationMs = Date.now() - startTime;
- await createJobRecord(dispensaryId, scheduleId, result);
- return result;
- }
- await updateScheduleStatus(dispensaryId, 'running', 'Running provider detection...', null, runId);
- const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
- result.detectionRan = true;
- result.detectionResult = detectionResult;
- // Save detection results to dispensary
- await (0, intelligence_detector_1.updateAllCategoryProviders)(dispensaryId, detectionResult);
- crawler_logger_1.crawlerLogger.providerDetected({
- dispensary_id: dispensaryId,
- dispensary_name: dispensary.name,
- detected_provider: detectionResult.product.provider,
- confidence: detectionResult.product.confidence,
- detection_method: 'dispensary_orchestrator',
- menu_url: websiteUrl,
- category: 'product',
- });
- // Refresh dispensary info after detection
- const updatedDispensary = await getDispensaryInfo(dispensaryId);
- if (updatedDispensary) {
- Object.assign(dispensary, updatedDispensary);
- }
- }
- // 3. Determine crawl type and run
- // Use product_provider if available, otherwise fall back to menu_type
- const provider = dispensary.product_provider || dispensary.menu_type;
- const mode = dispensary.product_crawler_mode;
- // Run production Dutchie crawl if:
- // 1. product_provider is 'dutchie' with production mode, OR
- // 2. menu_type is 'dutchie' with platform_dispensary_id (known Dutchie store)
- const isDutchieProduction = (provider === 'dutchie' && mode === 'production') ||
- (dispensary.menu_type === 'dutchie' && dispensary.platform_dispensary_id);
- if (isDutchieProduction) {
- // Production Dutchie crawl
- await updateScheduleStatus(dispensaryId, 'running', 'Running Dutchie production crawl...', null, runId);
- try {
- // Run the category-specific crawl job
- const crawlResult = await (0, category_crawler_jobs_1.runCrawlProductsJob)(dispensaryId);
- result.crawlRan = true;
- result.crawlType = 'production';
- if (crawlResult.success) {
- result.productsFound = crawlResult.data?.productsFound || 0;
- const detectionPart = result.detectionRan ? 'Detection + ' : '';
- result.summary = `${detectionPart}Dutchie products crawl completed`;
- result.status = 'success';
- crawler_logger_1.crawlerLogger.jobCompleted({
- job_id: 0,
- store_id: 0,
- store_name: dispensary.name,
- duration_ms: Date.now() - startTime,
- products_found: result.productsFound || 0,
- products_new: 0,
- products_updated: 0,
- provider: 'dutchie',
- });
- }
- else {
- result.status = 'error';
- result.error = crawlResult.message;
- result.summary = `Dutchie crawl failed: ${crawlResult.message.slice(0, 100)}`;
- }
- }
- catch (crawlError) {
- result.status = 'error';
- result.error = crawlError.message;
- result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
- result.crawlRan = true;
- result.crawlType = 'production';
- crawler_logger_1.crawlerLogger.jobFailed({
- job_id: 0,
- store_id: 0,
- store_name: dispensary.name,
- duration_ms: Date.now() - startTime,
- error_message: crawlError.message,
- provider: 'dutchie',
- });
- }
- }
- else if (provider && provider !== 'unknown') {
- // Sandbox crawl for non-Dutchie or sandbox mode
- await updateScheduleStatus(dispensaryId, 'running', `Running ${provider} sandbox crawl...`, null, runId);
- try {
- const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(dispensaryId);
- result.crawlRan = true;
- result.crawlType = 'sandbox';
- result.productsFound = sandboxResult.data?.productsExtracted || 0;
- const detectionPart = result.detectionRan ? 'Detection + ' : '';
- if (sandboxResult.success) {
- result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
- result.status = 'sandbox_only';
- }
- else {
- result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
- result.status = 'error';
- result.error = sandboxResult.message;
- }
- }
- catch (sandboxError) {
- result.status = 'error';
- result.error = sandboxError.message;
- result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
- result.crawlRan = true;
- result.crawlType = 'sandbox';
- }
- }
- else {
- // No provider detected - detection only
- if (result.detectionRan) {
- result.summary = `Detection complete: provider=${dispensary.product_provider || 'unknown'}, confidence=${dispensary.product_confidence || 0}%`;
- result.status = 'detection_only';
- }
- else {
- result.summary = 'No provider detected and no crawl possible';
- result.status = 'error';
- result.error = 'Could not determine menu provider';
- }
- }
- }
- catch (error) {
- result.status = 'error';
- result.error = error.message;
- result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
- crawler_logger_1.crawlerLogger.queueFailure({
- queue_type: 'dispensary_orchestrator',
- error_message: error.message,
- });
- }
- result.durationMs = Date.now() - startTime;
- // Update final schedule status
- await updateScheduleStatus(dispensaryId, result.status, result.summary, result.error || null, runId);
- // Create job record
- await createJobRecord(dispensaryId, scheduleId, result);
- return result;
-}
-// ========================================
-// Helper Functions
-// ========================================
-async function getDispensaryInfo(dispensaryId) {
- const result = await migrate_1.pool.query(`SELECT id, name, city, website, menu_url, menu_type, platform_dispensary_id,
- product_provider, product_confidence, product_crawler_mode, last_product_scan_at
- FROM dispensaries
- WHERE id = $1`, [dispensaryId]);
- return result.rows[0] || null;
-}
-async function checkNeedsDetection(dispensary) {
- // If menu_type is already 'dutchie' and we have platform_dispensary_id, skip detection entirely
- // This avoids wasteful detection timeouts for known Dutchie stores
- if (dispensary.menu_type === 'dutchie' && dispensary.platform_dispensary_id) {
- return false;
- }
- // No provider = definitely needs detection
- if (!dispensary.product_provider)
- return true;
- // Unknown provider = needs detection
- if (dispensary.product_provider === 'unknown')
- return true;
- // Low confidence = needs re-detection
- if (dispensary.product_confidence !== null && dispensary.product_confidence < 50)
- return true;
- // Stale detection (> 7 days) = needs refresh
- if (dispensary.last_product_scan_at) {
- const daysSince = (Date.now() - new Date(dispensary.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24);
- if (daysSince > 7)
- return true;
- }
- return false;
-}
-async function updateScheduleStatus(dispensaryId, status, summary, error, runId) {
- await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, last_status, last_summary, last_error, last_run_at, updated_at)
- VALUES ($1, $2, $3, $4, NOW(), NOW())
- ON CONFLICT (dispensary_id) DO UPDATE SET
- last_status = $2,
- last_summary = $3,
- last_error = $4,
- last_run_at = NOW(),
- updated_at = NOW()`, [dispensaryId, status, summary, error]);
-}
-async function createJobRecord(dispensaryId, scheduleId, result) {
- await migrate_1.pool.query(`INSERT INTO dispensary_crawl_jobs (
- dispensary_id, schedule_id, job_type, trigger_type, status, priority,
- scheduled_at, started_at, completed_at, duration_ms,
- detection_ran, crawl_ran, crawl_type,
- products_found, products_new, products_updated,
- detected_provider, detected_confidence, detected_mode,
- error_message, run_id
- ) VALUES (
- $1, $2, 'orchestrator', 'manual', $3, 100,
- NOW(), NOW(), NOW(), $4,
- $5, $6, $7,
- $8, $9, $10,
- $11, $12, $13,
- $14, $15
- )`, [
- dispensaryId,
- scheduleId || null,
- result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed',
- result.durationMs,
- result.detectionRan,
- result.crawlRan,
- result.crawlType || null,
- result.productsFound || null,
- result.productsNew || null,
- result.productsUpdated || null,
- result.detectionResult?.product.provider || null,
- result.detectionResult?.product.confidence || null,
- result.detectionResult?.product.mode || null,
- result.error || null,
- result.runId,
- ]);
- // Update schedule stats
- if (result.status === 'success' || result.status === 'sandbox_only' || result.status === 'detection_only') {
- await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET
- total_runs = COALESCE(total_runs, 0) + 1,
- successful_runs = COALESCE(successful_runs, 0) + 1,
- consecutive_failures = 0,
- next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL,
- last_duration_ms = $2
- WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]);
- }
- else if (result.status === 'error') {
- await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET
- total_runs = COALESCE(total_runs, 0) + 1,
- consecutive_failures = COALESCE(consecutive_failures, 0) + 1,
- next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL,
- last_duration_ms = $2
- WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]);
- }
-}
-// ========================================
-// Batch Processing
-// ========================================
-/**
- * Run orchestrator for multiple dispensaries
- */
-async function runBatchDispensaryOrchestrator(dispensaryIds, concurrency = 3) {
- const results = [];
- // Process in batches
- for (let i = 0; i < dispensaryIds.length; i += concurrency) {
- const batch = dispensaryIds.slice(i, i + concurrency);
- console.log(`Processing batch ${Math.floor(i / concurrency) + 1}: dispensaries ${batch.join(', ')}`);
- const batchResults = await Promise.all(batch.map(id => runDispensaryOrchestrator(id)));
- results.push(...batchResults);
- // Small delay between batches to avoid overwhelming the system
- if (i + concurrency < dispensaryIds.length) {
- await new Promise(r => setTimeout(r, 1000));
- }
- }
- return results;
-}
-/**
- * Get dispensaries that are due for orchestration
- */
-async function getDispensariesDueForOrchestration(limit = 10) {
- const result = await migrate_1.pool.query(`SELECT d.id
- FROM dispensaries d
- LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id
- WHERE COALESCE(dcs.is_active, TRUE) = TRUE
- AND (
- dcs.next_run_at IS NULL
- OR dcs.next_run_at <= NOW()
- )
- AND (dcs.last_status IS NULL OR dcs.last_status NOT IN ('running', 'pending'))
- ORDER BY COALESCE(dcs.priority, 0) DESC, dcs.last_run_at ASC NULLS FIRST
- LIMIT $1`, [limit]);
- return result.rows.map(row => row.id);
-}
-/**
- * Ensure all dispensaries have schedule entries
- */
-async function ensureAllDispensariesHaveSchedules(intervalMinutes = 240) {
- // Get all dispensary IDs that don't have a schedule
- const result = await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
- SELECT d.id, TRUE, $1, 0
- FROM dispensaries d
- WHERE NOT EXISTS (
- SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id
- )
- RETURNING id`, [intervalMinutes]);
- const existingCount = await migrate_1.pool.query('SELECT COUNT(*) FROM dispensary_crawl_schedule');
- return {
- created: result.rowCount || 0,
- existing: parseInt(existingCount.rows[0].count) - (result.rowCount || 0),
- };
-}
-// ========================================
-// Scheduler Integration
-// ========================================
-let dispensarySchedulerRunning = false;
-/**
- * Process dispensaries using the intelligent orchestrator
- * Called periodically by the scheduler
- */
-async function processDispensaryScheduler() {
- if (dispensarySchedulerRunning) {
- console.log('Dispensary scheduler already running, skipping...');
- return;
- }
- dispensarySchedulerRunning = true;
- try {
- // Get dispensaries due for orchestration
- const dispensaryIds = await getDispensariesDueForOrchestration(3);
- if (dispensaryIds.length === 0) {
- return;
- }
- console.log(`Dispensary Scheduler: Processing ${dispensaryIds.length} dispensaries due for crawl`);
- // Process each dispensary through the orchestrator
- for (const dispensaryId of dispensaryIds) {
- try {
- console.log(`Dispensary Scheduler: Starting crawl for dispensary ${dispensaryId}`);
- const result = await runDispensaryOrchestrator(dispensaryId);
- console.log(`Dispensary Scheduler: Dispensary ${dispensaryId} completed - ${result.summary}`);
- }
- catch (error) {
- console.error(`Dispensary Scheduler: Dispensary ${dispensaryId} failed - ${error.message}`);
- }
- }
- console.log(`Dispensary Scheduler: Finished processing ${dispensaryIds.length} dispensaries`);
- }
- finally {
- dispensarySchedulerRunning = false;
- }
-}
diff --git a/backend/dist/services/geolocation.js b/backend/dist/services/geolocation.js
deleted file mode 100644
index 32917440..00000000
--- a/backend/dist/services/geolocation.js
+++ /dev/null
@@ -1,125 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.lookupProxyLocation = lookupProxyLocation;
-exports.updateProxyLocation = updateProxyLocation;
-exports.updateAllProxyLocations = updateAllProxyLocations;
-exports.queueProxyLocationUpdate = queueProxyLocationUpdate;
-const axios_1 = __importDefault(require("axios"));
-const migrate_1 = require("../db/migrate");
-// Free API - 45 requests/minute limit
-const GEOLOCATION_API = 'http://ip-api.com/json/';
-async function lookupProxyLocation(host) {
- try {
- const response = await axios_1.default.get(`${GEOLOCATION_API}${host}?fields=status,message,country,countryCode,regionName,city,query`);
- const data = response.data;
- if (data.status === 'fail') {
- console.log(`❌ Geolocation lookup failed for ${host}: ${data.message}`);
- return null;
- }
- return data;
- }
- catch (error) {
- console.error(`❌ Error looking up location for ${host}:`, error.message);
- return null;
- }
-}
-async function updateProxyLocation(proxyId, location) {
- await migrate_1.pool.query(`
- UPDATE proxies
- SET city = $1,
- state = $2,
- country = $3,
- country_code = $4,
- location_updated_at = CURRENT_TIMESTAMP
- WHERE id = $5
- `, [
- location.city,
- location.regionName,
- location.country,
- location.countryCode,
- proxyId
- ]);
-}
-async function updateAllProxyLocations(batchSize = 45) {
- console.log('🌍 Starting proxy location update job...');
- // Get all proxies without location data
- const result = await migrate_1.pool.query(`
- SELECT id, host
- FROM proxies
- WHERE location_updated_at IS NULL
- OR location_updated_at < CURRENT_TIMESTAMP - INTERVAL '30 days'
- ORDER BY id
- `);
- const proxies = result.rows;
- console.log(`📊 Found ${proxies.length} proxies to update`);
- let updated = 0;
- let failed = 0;
- // Process in batches to respect rate limit (45 req/min)
- for (let i = 0; i < proxies.length; i += batchSize) {
- const batch = proxies.slice(i, i + batchSize);
- console.log(`🔄 Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(proxies.length / batchSize)} (${batch.length} proxies)`);
- // Process batch
- for (const proxy of batch) {
- const location = await lookupProxyLocation(proxy.host);
- if (location) {
- await updateProxyLocation(proxy.id, location);
- console.log(`✅ Updated ${proxy.id}: ${location.city}, ${location.regionName} - ${location.country}`);
- updated++;
- }
- else {
- console.log(`⚠️ Failed to get location for proxy ${proxy.id} (${proxy.host})`);
- failed++;
- }
- // Small delay between requests
- await new Promise(resolve => setTimeout(resolve, 100));
- }
- // Wait 60 seconds before next batch to respect rate limit
- if (i + batchSize < proxies.length) {
- console.log(`⏳ Waiting 60s before next batch (rate limit: 45 req/min)...`);
- await new Promise(resolve => setTimeout(resolve, 60000));
- }
- }
- console.log(`✅ Proxy location update complete!`);
- console.log(` Updated: ${updated}`);
- console.log(` Failed: ${failed}`);
-}
-// Queue for background processing
-const locationUpdateQueue = new Set();
-let isProcessing = false;
-function queueProxyLocationUpdate(proxyId) {
- locationUpdateQueue.add(proxyId);
- processLocationQueue();
-}
-async function processLocationQueue() {
- if (isProcessing || locationUpdateQueue.size === 0)
- return;
- isProcessing = true;
- try {
- const proxyIds = Array.from(locationUpdateQueue);
- locationUpdateQueue.clear();
- console.log(`🌍 Processing ${proxyIds.length} proxy location updates from queue`);
- for (const proxyId of proxyIds) {
- const result = await migrate_1.pool.query('SELECT host FROM proxies WHERE id = $1', [proxyId]);
- if (result.rows.length === 0)
- continue;
- const host = result.rows[0].host;
- const location = await lookupProxyLocation(host);
- if (location) {
- await updateProxyLocation(proxyId, location);
- console.log(`✅ Queue: Updated ${proxyId}: ${location.city}, ${location.regionName} - ${location.country}`);
- }
- // Respect rate limit
- await new Promise(resolve => setTimeout(resolve, 1500)); // ~40 req/min
- }
- }
- finally {
- isProcessing = false;
- // Process any new items that were added while we were processing
- if (locationUpdateQueue.size > 0) {
- processLocationQueue();
- }
- }
-}
diff --git a/backend/dist/services/intelligence-detector.js b/backend/dist/services/intelligence-detector.js
deleted file mode 100644
index 0f5993b6..00000000
--- a/backend/dist/services/intelligence-detector.js
+++ /dev/null
@@ -1,493 +0,0 @@
-"use strict";
-/**
- * Multi-Category Intelligence Detector
- *
- * Detects providers for each intelligence category independently:
- * - Products: Which provider serves product data
- * - Specials: Which provider serves deals/specials
- * - Brand: Which provider serves brand information
- * - Metadata: Which provider serves taxonomy/category data
- */
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.detectMultiCategoryProviders = detectMultiCategoryProviders;
-exports.detectCategoryProviderChange = detectCategoryProviderChange;
-exports.updateDispensaryCategoryProvider = updateDispensaryCategoryProvider;
-exports.updateAllCategoryProviders = updateAllCategoryProviders;
-exports.moveCategoryToSandbox = moveCategoryToSandbox;
-const migrate_1 = require("../db/migrate");
-const logger_1 = require("./logger");
-const puppeteer_1 = __importDefault(require("puppeteer"));
-// Production-ready providers per category
-// Only these combinations can be set to production mode
-const PRODUCTION_READY = {
- product: ['dutchie'], // Only Dutchie products are production-ready
- specials: [], // None yet
- brand: [], // None yet
- metadata: [], // None yet
-};
-// Provider detection patterns
-const PROVIDER_PATTERNS = {
- dutchie: {
- scripts: [
- /dutchie\.com/i,
- /dutchie-plus/i,
- /dutchie\.js/i,
- /__DUTCHIE__/i,
- /dutchie-embed/i,
- ],
- iframes: [
- /dutchie\.com/i,
- /dutchie-plus\.com/i,
- /embed\.dutchie/i,
- ],
- html: [
- /class="dutchie/i,
- /id="dutchie/i,
- /data-dutchie/i,
- /"menuType":\s*"dutchie"/i,
- ],
- apiEndpoints: [
- /dutchie\.com\/graphql/i,
- /plus\.dutchie\.com/i,
- ],
- metaTags: [
- /dutchie/i,
- ],
- },
- treez: {
- scripts: [
- /treez\.io/i,
- /treez-ecommerce/i,
- /treez\.js/i,
- ],
- iframes: [
- /treez\.io/i,
- /shop\.treez/i,
- ],
- html: [
- /class="treez/i,
- /data-treez/i,
- /treez-menu/i,
- ],
- apiEndpoints: [
- /api\.treez\.io/i,
- /treez\.io\/api/i,
- ],
- metaTags: [],
- },
- jane: {
- scripts: [
- /jane\.co/i,
- /iheartjane\.com/i,
- /jane-frame/i,
- /jane\.js/i,
- ],
- iframes: [
- /jane\.co/i,
- /iheartjane\.com/i,
- /embed\.iheartjane/i,
- ],
- html: [
- /class="jane/i,
- /data-jane/i,
- /jane-embed/i,
- ],
- apiEndpoints: [
- /api\.iheartjane/i,
- /jane\.co\/api/i,
- ],
- metaTags: [],
- },
- weedmaps: {
- scripts: [
- /weedmaps\.com/i,
- /wm-menu/i,
- ],
- iframes: [
- /weedmaps\.com/i,
- /menu\.weedmaps/i,
- ],
- html: [
- /data-weedmaps/i,
- /wm-menu/i,
- ],
- apiEndpoints: [
- /api-g\.weedmaps/i,
- /weedmaps\.com\/api/i,
- ],
- metaTags: [],
- },
- leafly: {
- scripts: [
- /leafly\.com/i,
- /leafly-menu/i,
- ],
- iframes: [
- /leafly\.com/i,
- /order\.leafly/i,
- ],
- html: [
- /data-leafly/i,
- /leafly-embed/i,
- ],
- apiEndpoints: [
- /api\.leafly/i,
- ],
- metaTags: [],
- },
-};
-// Category-specific detection signals
-const CATEGORY_SIGNALS = {
- product: {
- urlPatterns: [/\/menu/i, /\/products/i, /\/shop/i, /\/order/i],
- htmlPatterns: [/product-card/i, /menu-item/i, /product-list/i, /product-grid/i],
- jsonKeys: ['products', 'menuItems', 'items', 'inventory'],
- },
- specials: {
- urlPatterns: [/\/specials/i, /\/deals/i, /\/promotions/i, /\/offers/i],
- htmlPatterns: [/special/i, /deal/i, /promotion/i, /discount/i, /sale/i],
- jsonKeys: ['specials', 'deals', 'promotions', 'offers'],
- },
- brand: {
- urlPatterns: [/\/brands/i, /\/vendors/i, /\/producers/i],
- htmlPatterns: [/brand-list/i, /vendor/i, /producer/i, /manufacturer/i],
- jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'],
- },
- metadata: {
- urlPatterns: [/\/categories/i, /\/taxonomy/i],
- htmlPatterns: [/category-nav/i, /menu-categories/i, /filter-category/i],
- jsonKeys: ['categories', 'taxonomy', 'filters', 'types'],
- },
-};
-// ========================================
-// Main Detection Function
-// ========================================
-async function detectMultiCategoryProviders(websiteUrl, options = {}) {
- const { timeout = 30000, headless = true, existingBrowser } = options;
- let browser = null;
- let page = null;
- const urlsTested = [];
- const rawSignals = {};
- try {
- browser = existingBrowser || await puppeteer_1.default.launch({
- headless,
- args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
- });
- page = await browser.newPage();
- await page.setViewport({ width: 1920, height: 1080 });
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
- // Navigate to main site
- const baseUrl = normalizeUrl(websiteUrl);
- urlsTested.push(baseUrl);
- await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout });
- // Collect signals from main page
- const mainPageSignals = await collectPageSignals(page);
- rawSignals.mainPage = mainPageSignals;
- // Try common menu URLs
- const menuUrls = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands'];
- for (const path of menuUrls) {
- try {
- const fullUrl = new URL(path, baseUrl).toString();
- urlsTested.push(fullUrl);
- await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 15000 });
- const signals = await collectPageSignals(page);
- rawSignals[path] = signals;
- }
- catch {
- // URL doesn't exist or timed out
- }
- }
- // Analyze signals for each category
- const result = {
- product: analyzeCategorySignals('product', rawSignals),
- specials: analyzeCategorySignals('specials', rawSignals),
- brand: analyzeCategorySignals('brand', rawSignals),
- metadata: analyzeCategorySignals('metadata', rawSignals),
- urlsTested,
- rawSignals,
- };
- logger_1.logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`);
- return result;
- }
- catch (error) {
- logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
- // Return unknown results for all categories
- return {
- product: createUnknownResult(),
- specials: createUnknownResult(),
- brand: createUnknownResult(),
- metadata: createUnknownResult(),
- urlsTested,
- rawSignals: { error: error.message },
- };
- }
- finally {
- if (page)
- await page.close().catch(() => { });
- if (browser && !existingBrowser)
- await browser.close().catch(() => { });
- }
-}
-// ========================================
-// Helper Functions
-// ========================================
-function normalizeUrl(url) {
- if (!url.startsWith('http')) {
- url = 'https://' + url;
- }
- return url.replace(/\/$/, '');
-}
-async function collectPageSignals(page) {
- return page.evaluate(() => {
- const signals = {
- scripts: [],
- iframes: [],
- links: [],
- metaTags: [],
- bodyClasses: document.body?.className || '',
- bodyId: document.body?.id || '',
- htmlSnippet: document.documentElement.outerHTML.slice(0, 10000),
- };
- // Collect script sources
- document.querySelectorAll('script[src]').forEach((el) => {
- signals.scripts.push(el.src);
- });
- // Collect inline scripts
- document.querySelectorAll('script:not([src])').forEach((el) => {
- const content = el.textContent || '';
- if (content.length < 5000) {
- signals.scripts.push(`inline:${content.slice(0, 500)}`);
- }
- });
- // Collect iframes
- document.querySelectorAll('iframe').forEach((el) => {
- signals.iframes.push(el.src);
- });
- // Collect links
- document.querySelectorAll('a[href]').forEach((el) => {
- signals.links.push(el.href);
- });
- // Collect meta tags
- document.querySelectorAll('meta').forEach((el) => {
- const content = el.getAttribute('content') || '';
- const name = el.getAttribute('name') || el.getAttribute('property') || '';
- if (content || name) {
- signals.metaTags.push(`${name}:${content}`);
- }
- });
- // Look for JSON data
- const jsonBlocks = [];
- document.querySelectorAll('script[type="application/json"]').forEach((el) => {
- jsonBlocks.push(el.textContent?.slice(0, 2000) || '');
- });
- signals.jsonBlocks = jsonBlocks;
- return signals;
- });
-}
-function analyzeCategorySignals(category, allSignals) {
- const providerScores = {};
- const detectedSignals = {};
- // Initialize scores
- for (const provider of Object.keys(PROVIDER_PATTERNS)) {
- providerScores[provider] = 0;
- }
- // Analyze each page's signals
- for (const [pagePath, signals] of Object.entries(allSignals)) {
- if (!signals || typeof signals !== 'object')
- continue;
- // Check for provider-specific patterns
- for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
- let score = 0;
- // Check scripts
- if (signals.scripts) {
- for (const script of signals.scripts) {
- for (const pattern of patterns.scripts) {
- if (pattern.test(script)) {
- score += 20;
- detectedSignals[`${provider}_script_${pagePath}`] = script;
- }
- }
- }
- }
- // Check iframes
- if (signals.iframes) {
- for (const iframe of signals.iframes) {
- for (const pattern of patterns.iframes) {
- if (pattern.test(iframe)) {
- score += 25;
- detectedSignals[`${provider}_iframe_${pagePath}`] = iframe;
- }
- }
- }
- }
- // Check HTML content
- if (signals.htmlSnippet) {
- for (const pattern of patterns.html) {
- if (pattern.test(signals.htmlSnippet)) {
- score += 15;
- detectedSignals[`${provider}_html_${pagePath}`] = true;
- }
- }
- }
- providerScores[provider] += score;
- }
- // Check for category-specific signals on relevant pages
- const categorySignals = CATEGORY_SIGNALS[category];
- const isRelevantPage = categorySignals.urlPatterns.some((p) => p.test(pagePath));
- if (isRelevantPage && signals.htmlSnippet) {
- for (const pattern of categorySignals.htmlPatterns) {
- if (pattern.test(signals.htmlSnippet)) {
- detectedSignals[`${category}_html_pattern`] = true;
- }
- }
- }
- // Check JSON blocks for category data
- if (signals.jsonBlocks) {
- for (const json of signals.jsonBlocks) {
- for (const key of categorySignals.jsonKeys) {
- if (json.toLowerCase().includes(`"${key}"`)) {
- detectedSignals[`${category}_json_key_${key}`] = true;
- }
- }
- }
- }
- }
- // Determine winning provider
- let bestProvider = 'unknown';
- let bestScore = 0;
- for (const [provider, score] of Object.entries(providerScores)) {
- if (score > bestScore) {
- bestScore = score;
- bestProvider = provider;
- }
- }
- // Calculate confidence (0-100)
- const confidence = Math.min(100, bestScore);
- // Determine mode based on provider and confidence
- const isProductionReady = PRODUCTION_READY[category].includes(bestProvider);
- const mode = isProductionReady && confidence >= 70
- ? 'production'
- : 'sandbox';
- // Get template name if available
- let templateName;
- if (bestProvider === 'dutchie' && category === 'product') {
- templateName = 'dutchie_standard';
- }
- else if (bestProvider === 'treez') {
- templateName = 'treez_products_v0';
- }
- return {
- provider: bestProvider,
- confidence,
- mode,
- signals: detectedSignals,
- templateName,
- };
-}
-function createUnknownResult() {
- return {
- provider: 'unknown',
- confidence: 0,
- mode: 'sandbox',
- signals: {},
- };
-}
-// ========================================
-// Lightweight Per-Category Change Detection
-// ========================================
-async function detectCategoryProviderChange(page, category, expectedProvider) {
- try {
- const signals = await collectPageSignals(page);
- const result = analyzeCategorySignals(category, { currentPage: signals });
- if (result.provider !== expectedProvider && result.confidence > 50) {
- logger_1.logger.warn('provider-detection', `Provider change detected for ${category}: ${expectedProvider} -> ${result.provider}`);
- return {
- changed: true,
- newProvider: result.provider,
- confidence: result.confidence,
- };
- }
- return { changed: false };
- }
- catch (error) {
- logger_1.logger.error('provider-detection', `Change detection failed: ${error.message}`);
- return { changed: false };
- }
-}
-// ========================================
-// Database Operations
-// ========================================
-async function updateDispensaryCategoryProvider(dispensaryId, category, result) {
- const columnPrefix = category === 'product' ? 'product' :
- category === 'specials' ? 'specials' :
- category === 'brand' ? 'brand' : 'metadata';
- await migrate_1.pool.query(`UPDATE dispensaries SET
- ${columnPrefix}_provider = $1,
- ${columnPrefix}_confidence = $2,
- ${columnPrefix}_crawler_mode = $3,
- ${columnPrefix}_detection_data = $4,
- updated_at = NOW()
- WHERE id = $5`, [
- result.provider,
- result.confidence,
- result.mode,
- JSON.stringify(result.signals),
- dispensaryId,
- ]);
-}
-async function updateAllCategoryProviders(dispensaryId, result) {
- await migrate_1.pool.query(`UPDATE dispensaries SET
- product_provider = $1,
- product_confidence = $2,
- product_crawler_mode = $3,
- product_detection_data = $4,
- specials_provider = $5,
- specials_confidence = $6,
- specials_crawler_mode = $7,
- specials_detection_data = $8,
- brand_provider = $9,
- brand_confidence = $10,
- brand_crawler_mode = $11,
- brand_detection_data = $12,
- metadata_provider = $13,
- metadata_confidence = $14,
- metadata_crawler_mode = $15,
- metadata_detection_data = $16,
- updated_at = NOW()
- WHERE id = $17`, [
- result.product.provider,
- result.product.confidence,
- result.product.mode,
- JSON.stringify(result.product.signals),
- result.specials.provider,
- result.specials.confidence,
- result.specials.mode,
- JSON.stringify(result.specials.signals),
- result.brand.provider,
- result.brand.confidence,
- result.brand.mode,
- JSON.stringify(result.brand.signals),
- result.metadata.provider,
- result.metadata.confidence,
- result.metadata.mode,
- JSON.stringify(result.metadata.signals),
- dispensaryId,
- ]);
-}
-async function moveCategoryToSandbox(dispensaryId, category, reason) {
- const columnPrefix = category === 'product' ? 'product' :
- category === 'specials' ? 'specials' :
- category === 'brand' ? 'brand' : 'metadata';
- await migrate_1.pool.query(`UPDATE dispensaries SET
- ${columnPrefix}_crawler_mode = 'sandbox',
- ${columnPrefix}_detection_data = ${columnPrefix}_detection_data || $1::jsonb,
- updated_at = NOW()
- WHERE id = $2`, [
- JSON.stringify({ sandbox_reason: reason, sandbox_at: new Date().toISOString() }),
- dispensaryId,
- ]);
- logger_1.logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`);
-}
diff --git a/backend/dist/services/logger.js b/backend/dist/services/logger.js
deleted file mode 100644
index da69295c..00000000
--- a/backend/dist/services/logger.js
+++ /dev/null
@@ -1,56 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.logger = void 0;
-class LogService {
- logs = [];
- maxLogs = 1000;
- log(level, category, message) {
- const entry = {
- timestamp: new Date(),
- level,
- category,
- message
- };
- this.logs.unshift(entry);
- if (this.logs.length > this.maxLogs) {
- this.logs = this.logs.slice(0, this.maxLogs);
- }
- const timestamp = entry.timestamp.toISOString();
- const prefix = `[${timestamp}] [${category.toUpperCase()}] [${level.toUpperCase()}]`;
- if (level === 'error') {
- console.error(prefix, message);
- }
- else if (level === 'warn') {
- console.warn(prefix, message);
- }
- else {
- console.log(prefix, message);
- }
- }
- info(category, message) {
- this.log('info', category, message);
- }
- error(category, message) {
- this.log('error', category, message);
- }
- warn(category, message) {
- this.log('warn', category, message);
- }
- debug(category, message) {
- this.log('debug', category, message);
- }
- getLogs(limit = 100, level, category) {
- let filtered = this.logs;
- if (level) {
- filtered = filtered.filter(log => log.level === level);
- }
- if (category) {
- filtered = filtered.filter(log => log.category === category);
- }
- return filtered.slice(0, limit);
- }
- clear() {
- this.logs = [];
- }
-}
-exports.logger = new LogService();
diff --git a/backend/dist/services/menu-provider-detector.js b/backend/dist/services/menu-provider-detector.js
deleted file mode 100644
index f3faa9a9..00000000
--- a/backend/dist/services/menu-provider-detector.js
+++ /dev/null
@@ -1,612 +0,0 @@
-"use strict";
-/**
- * Menu Provider Detection Service
- *
- * Detects which menu platform a dispensary is using by analyzing:
- * - HTML content patterns (scripts, iframes, classes)
- * - URL patterns (embedded menu paths)
- * - API endpoint signatures
- * - Meta tags and headers
- */
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.detectMenuProvider = detectMenuProvider;
-exports.quickDutchieCheck = quickDutchieCheck;
-exports.detectProviderChange = detectProviderChange;
-const puppeteer_1 = __importDefault(require("puppeteer"));
-const logger_1 = require("./logger");
-// Provider detection patterns
-const PROVIDER_PATTERNS = {
- dutchie: {
- scripts: [
- /dutchie/i,
- /dutchie-plus/i,
- /dutchie\.com/i,
- /dutchie-embed/i,
- ],
- iframes: [
- /dutchie\.com/i,
- /embed\.dutchie/i,
- /iframe\.dutchie/i,
- ],
- classes: [
- /dutchie-/i,
- /DutchieEmbed/i,
- ],
- urls: [
- /dutchie\.com/i,
- /\.dutchie\./i,
- ],
- meta: [
- /dutchie/i,
- ],
- apiEndpoints: [
- /graphql.*dutchie/i,
- /api\.dutchie/i,
- ],
- htmlPatterns: [
- /data-dutchie/i,
- /__DUTCHIE__/i,
- /dutchie-plus-iframe/i,
- ],
- },
- treez: {
- scripts: [
- /treez/i,
- /treez\.io/i,
- /treezpay/i,
- ],
- iframes: [
- /treez\.io/i,
- /menu\.treez/i,
- ],
- classes: [
- /treez-/i,
- ],
- urls: [
- /treez\.io/i,
- /\.treez\./i,
- ],
- meta: [
- /treez/i,
- ],
- apiEndpoints: [
- /api\.treez/i,
- ],
- htmlPatterns: [
- /data-treez/i,
- /treez-embed/i,
- ],
- },
- jane: {
- scripts: [
- /jane\.co/i,
- /iheartjane/i,
- /jane-embed/i,
- /janetechnologies/i,
- ],
- iframes: [
- /jane\.co/i,
- /iheartjane\.com/i,
- /menu\.jane/i,
- ],
- classes: [
- /jane-/i,
- /iheartjane/i,
- ],
- urls: [
- /jane\.co/i,
- /iheartjane\.com/i,
- ],
- meta: [
- /jane/i,
- /iheartjane/i,
- ],
- apiEndpoints: [
- /api\.iheartjane/i,
- /api\.jane\.co/i,
- ],
- htmlPatterns: [
- /data-jane/i,
- /jane-root/i,
- /jane-embed/i,
- ],
- },
- weedmaps: {
- scripts: [
- /weedmaps/i,
- /wm\.com/i,
- ],
- iframes: [
- /weedmaps\.com/i,
- /menu\.weedmaps/i,
- ],
- classes: [
- /weedmaps-/i,
- /wm-/i,
- ],
- urls: [
- /weedmaps\.com/i,
- ],
- meta: [
- /weedmaps/i,
- ],
- apiEndpoints: [
- /api.*weedmaps/i,
- ],
- htmlPatterns: [
- /data-weedmaps/i,
- ],
- },
- leafly: {
- scripts: [
- /leafly/i,
- /leafly\.com/i,
- ],
- iframes: [
- /leafly\.com/i,
- /menu\.leafly/i,
- ],
- classes: [
- /leafly-/i,
- ],
- urls: [
- /leafly\.com/i,
- ],
- meta: [
- /leafly/i,
- ],
- apiEndpoints: [
- /api\.leafly/i,
- ],
- htmlPatterns: [
- /data-leafly/i,
- ],
- },
- meadow: {
- scripts: [
- /meadow/i,
- /getmeadow/i,
- ],
- iframes: [
- /getmeadow\.com/i,
- ],
- classes: [
- /meadow-/i,
- ],
- urls: [
- /getmeadow\.com/i,
- ],
- meta: [],
- apiEndpoints: [
- /api\.getmeadow/i,
- ],
- htmlPatterns: [],
- },
- greenlight: {
- scripts: [
- /greenlight/i,
- /greenlightmenu/i,
- ],
- iframes: [
- /greenlight/i,
- ],
- classes: [
- /greenlight-/i,
- ],
- urls: [
- /greenlight/i,
- ],
- meta: [],
- apiEndpoints: [],
- htmlPatterns: [],
- },
- blaze: {
- scripts: [
- /blaze\.me/i,
- /blazepos/i,
- ],
- iframes: [
- /blaze\.me/i,
- ],
- classes: [
- /blaze-/i,
- ],
- urls: [
- /blaze\.me/i,
- ],
- meta: [],
- apiEndpoints: [
- /api\.blaze/i,
- ],
- htmlPatterns: [],
- },
- flowhub: {
- scripts: [
- /flowhub/i,
- ],
- iframes: [
- /flowhub\.com/i,
- ],
- classes: [
- /flowhub-/i,
- ],
- urls: [
- /flowhub\.com/i,
- ],
- meta: [],
- apiEndpoints: [],
- htmlPatterns: [],
- },
- dispense: {
- scripts: [
- /dispenseapp/i,
- ],
- iframes: [
- /dispenseapp\.com/i,
- ],
- classes: [
- /dispense-/i,
- ],
- urls: [
- /dispenseapp\.com/i,
- ],
- meta: [],
- apiEndpoints: [],
- htmlPatterns: [],
- },
- cova: {
- scripts: [
- /covasoftware/i,
- /cova\.software/i,
- ],
- iframes: [
- /cova/i,
- ],
- classes: [
- /cova-/i,
- ],
- urls: [
- /cova/i,
- ],
- meta: [],
- apiEndpoints: [],
- htmlPatterns: [],
- },
-};
-// Common menu URL paths to check
-const MENU_PATHS = [
- '/menu',
- '/shop',
- '/products',
- '/order',
- '/store',
- '/dispensary-menu',
- '/online-menu',
- '/shop-all',
- '/browse',
- '/catalog',
-];
-/**
- * Analyze a single page for provider signals
- */
-async function analyzePageForProviders(page, url) {
- const signals = [];
- try {
- // Get page HTML
- const html = await page.content();
- const lowerHtml = html.toLowerCase();
- // Check each provider's patterns
- for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
- // Check script sources
- const scripts = await page.$$eval('script[src]', els => els.map(el => el.getAttribute('src') || ''));
- for (const script of scripts) {
- for (const pattern of patterns.scripts) {
- if (pattern.test(script)) {
- signals.push({
- provider: provider,
- confidence: 90,
- source: 'script_src',
- details: script,
- });
- }
- }
- }
- // Check inline scripts
- const inlineScripts = await page.$$eval('script:not([src])', els => els.map(el => el.textContent || ''));
- for (const scriptContent of inlineScripts) {
- for (const pattern of patterns.scripts) {
- if (pattern.test(scriptContent)) {
- signals.push({
- provider: provider,
- confidence: 70,
- source: 'inline_script',
- details: `Pattern: ${pattern}`,
- });
- }
- }
- }
- // Check iframes
- const iframes = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
- for (const iframe of iframes) {
- for (const pattern of patterns.iframes) {
- if (pattern.test(iframe)) {
- signals.push({
- provider: provider,
- confidence: 95,
- source: 'iframe_src',
- details: iframe,
- });
- }
- }
- }
- // Check HTML patterns
- for (const pattern of patterns.htmlPatterns) {
- if (pattern.test(html)) {
- signals.push({
- provider: provider,
- confidence: 85,
- source: 'html_pattern',
- details: `Pattern: ${pattern}`,
- });
- }
- }
- // Check CSS classes
- for (const pattern of patterns.classes) {
- if (pattern.test(html)) {
- signals.push({
- provider: provider,
- confidence: 60,
- source: 'css_class',
- details: `Pattern: ${pattern}`,
- });
- }
- }
- // Check meta tags
- const metaTags = await page.$$eval('meta', els => els.map(el => `${el.getAttribute('name')} ${el.getAttribute('content')}`));
- for (const meta of metaTags) {
- for (const pattern of patterns.meta) {
- if (pattern.test(meta)) {
- signals.push({
- provider: provider,
- confidence: 80,
- source: 'meta_tag',
- details: meta,
- });
- }
- }
- }
- }
- // Check for network requests (if we intercepted them)
- // This would be enhanced with request interception
- }
- catch (error) {
- logger_1.logger.error('provider-detection', `Error analyzing page ${url}: ${error}`);
- }
- return signals;
-}
-/**
- * Aggregate signals into a final detection result
- */
-function aggregateSignals(signals) {
- if (signals.length === 0) {
- return { provider: 'unknown', confidence: 0 };
- }
- // Group signals by provider
- const providerScores = {};
- for (const signal of signals) {
- if (!providerScores[signal.provider]) {
- providerScores[signal.provider] = [];
- }
- providerScores[signal.provider].push(signal.confidence);
- }
- // Calculate weighted score for each provider
- const scores = [];
- for (const [provider, confidences] of Object.entries(providerScores)) {
- // Use max confidence + bonus for multiple signals
- const maxConf = Math.max(...confidences);
- const multiSignalBonus = Math.min(10, (confidences.length - 1) * 3);
- const score = Math.min(100, maxConf + multiSignalBonus);
- scores.push({ provider: provider, score });
- }
- // Sort by score descending
- scores.sort((a, b) => b.score - a.score);
- const best = scores[0];
- // If there's a clear winner (20+ point lead), use it
- if (scores.length === 1 || best.score - scores[1].score >= 20) {
- return { provider: best.provider, confidence: best.score };
- }
- // Multiple contenders - reduce confidence
- return { provider: best.provider, confidence: Math.max(50, best.score - 20) };
-}
-/**
- * Detect the menu provider for a dispensary
- */
-async function detectMenuProvider(websiteUrl, options = {}) {
- const { checkMenuPaths = true, timeout = 30000 } = options;
- const result = {
- provider: 'unknown',
- confidence: 0,
- signals: [],
- urlsTested: [],
- menuEntryPoints: [],
- rawSignals: {},
- };
- let browser = null;
- try {
- // Normalize URL
- let baseUrl = websiteUrl.trim();
- if (!baseUrl.startsWith('http')) {
- baseUrl = `https://${baseUrl}`;
- }
- baseUrl = baseUrl.replace(/\/$/, ''); // Remove trailing slash
- // Launch browser
- browser = await puppeteer_1.default.launch({
- headless: true,
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-gpu',
- ],
- });
- const page = await browser.newPage();
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
- // Track network requests for API detection
- const apiRequests = [];
- await page.setRequestInterception(true);
- page.on('request', (request) => {
- const url = request.url();
- if (url.includes('api') || url.includes('graphql')) {
- apiRequests.push(url);
- }
- request.continue();
- });
- // URLs to check
- const urlsToCheck = [baseUrl];
- if (checkMenuPaths) {
- for (const path of MENU_PATHS) {
- urlsToCheck.push(`${baseUrl}${path}`);
- }
- }
- // Check each URL
- for (const url of urlsToCheck) {
- try {
- result.urlsTested.push(url);
- await page.goto(url, {
- waitUntil: 'networkidle2',
- timeout,
- });
- // Wait a bit for dynamic content
- await new Promise(r => setTimeout(r, 2000));
- // Analyze page
- const pageSignals = await analyzePageForProviders(page, url);
- result.signals.push(...pageSignals);
- // Track if this URL has menu content
- const hasMenuContent = await page.evaluate(() => {
- const text = document.body.innerText.toLowerCase();
- return (text.includes('add to cart') ||
- text.includes('add to bag') ||
- text.includes('product') ||
- text.includes('indica') ||
- text.includes('sativa') ||
- text.includes('hybrid') ||
- text.includes('thc') ||
- text.includes('cbd'));
- });
- if (hasMenuContent && url !== baseUrl) {
- result.menuEntryPoints.push(url);
- }
- }
- catch (pageError) {
- // 404s are fine, just skip
- if (!pageError.message?.includes('404')) {
- logger_1.logger.warn('provider-detection', `Could not load ${url}: ${pageError.message}`);
- }
- }
- }
- // Check API requests for provider hints
- for (const apiUrl of apiRequests) {
- for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) {
- for (const pattern of patterns.apiEndpoints) {
- if (pattern.test(apiUrl)) {
- result.signals.push({
- provider: provider,
- confidence: 95,
- source: 'api_request',
- details: apiUrl,
- });
- }
- }
- }
- }
- // Record raw signals
- result.rawSignals = {
- apiRequestsFound: apiRequests.length,
- menuEntryPointsFound: result.menuEntryPoints.length,
- totalSignals: result.signals.length,
- uniqueProviders: [...new Set(result.signals.map(s => s.provider))].length,
- };
- // Aggregate signals into final result
- const aggregated = aggregateSignals(result.signals);
- result.provider = aggregated.provider;
- result.confidence = aggregated.confidence;
- }
- catch (error) {
- result.error = error.message;
- logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`);
- }
- finally {
- if (browser) {
- await browser.close();
- }
- }
- return result;
-}
-/**
- * Quick check if a site has Dutchie - used during production crawls
- */
-async function quickDutchieCheck(page) {
- try {
- const html = await page.content();
- // Check for Dutchie-specific patterns
- const dutchiePatterns = [
- /dutchie/i,
- /dutchie-plus/i,
- /__DUTCHIE__/i,
- /data-dutchie/i,
- /embed\.dutchie/i,
- ];
- for (const pattern of dutchiePatterns) {
- if (pattern.test(html)) {
- return true;
- }
- }
- // Check iframes
- const iframes = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || ''));
- for (const iframe of iframes) {
- if (/dutchie/i.test(iframe)) {
- return true;
- }
- }
- return false;
- }
- catch {
- return false;
- }
-}
-/**
- * Check if provider has changed from expected
- */
-async function detectProviderChange(page, expectedProvider) {
- try {
- const signals = await analyzePageForProviders(page, page.url());
- const aggregated = aggregateSignals(signals);
- // If we expected Dutchie but found something else with high confidence
- if (expectedProvider === 'dutchie' && aggregated.provider !== 'dutchie' && aggregated.confidence >= 70) {
- return {
- changed: true,
- newProvider: aggregated.provider,
- confidence: aggregated.confidence,
- };
- }
- // If we expected Dutchie and found nothing/low confidence, might have switched
- if (expectedProvider === 'dutchie' && aggregated.confidence < 30) {
- // Check if Dutchie is definitely NOT present
- const hasDutchie = await quickDutchieCheck(page);
- if (!hasDutchie) {
- return {
- changed: true,
- newProvider: aggregated.provider !== 'unknown' ? aggregated.provider : 'other',
- confidence: Math.max(30, aggregated.confidence),
- };
- }
- }
- return { changed: false };
- }
- catch {
- return { changed: false };
- }
-}
diff --git a/backend/dist/services/proxy.js b/backend/dist/services/proxy.js
deleted file mode 100644
index 0989c314..00000000
--- a/backend/dist/services/proxy.js
+++ /dev/null
@@ -1,323 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.isBotDetectionError = isBotDetectionError;
-exports.putProxyInTimeout = putProxyInTimeout;
-exports.isProxyInTimeout = isProxyInTimeout;
-exports.getActiveProxy = getActiveProxy;
-exports.testProxy = testProxy;
-exports.saveProxyTestResult = saveProxyTestResult;
-exports.testAllProxies = testAllProxies;
-exports.addProxy = addProxy;
-exports.addProxiesFromList = addProxiesFromList;
-exports.moveProxyToFailed = moveProxyToFailed;
-exports.incrementProxyFailure = incrementProxyFailure;
-const axios_1 = __importDefault(require("axios"));
-const socks_proxy_agent_1 = require("socks-proxy-agent");
-const https_proxy_agent_1 = require("https-proxy-agent");
-const migrate_1 = require("../db/migrate");
-// In-memory proxy timeout tracking
-// Maps proxy ID to timestamp when timeout expires
-const proxyTimeouts = new Map();
-const PROXY_TIMEOUT_MS = 35000; // 35 seconds timeout for bot-detected proxies
-// Check if error message indicates bot detection
-function isBotDetectionError(errorMsg) {
- const botPatterns = [
- /bot detection/i,
- /captcha/i,
- /challenge/i,
- /cloudflare/i,
- /access denied/i,
- /rate limit/i,
- /too many requests/i,
- /temporarily blocked/i,
- /suspicious activity/i,
- ];
- return botPatterns.some(pattern => pattern.test(errorMsg));
-}
-// Put proxy in timeout (bot detection cooldown)
-function putProxyInTimeout(proxyId, reason) {
- const timeoutUntil = Date.now() + PROXY_TIMEOUT_MS;
- proxyTimeouts.set(proxyId, timeoutUntil);
- console.log(`🚫 Proxy ${proxyId} in timeout for ${PROXY_TIMEOUT_MS / 1000}s: ${reason}`);
-}
-// Check if proxy is currently in timeout
-function isProxyInTimeout(proxyId) {
- const timeoutUntil = proxyTimeouts.get(proxyId);
- if (!timeoutUntil)
- return false;
- if (Date.now() >= timeoutUntil) {
- // Timeout expired, remove it
- proxyTimeouts.delete(proxyId);
- console.log(`✅ Proxy ${proxyId} timeout expired, back in rotation`);
- return false;
- }
- return true;
-}
-// Get active proxy that's not in timeout
-async function getActiveProxy() {
- const result = await migrate_1.pool.query(`
- SELECT id, host, port, protocol, username, password
- FROM proxies
- WHERE active = true
- ORDER BY RANDOM()
- `);
- // Filter out proxies in timeout
- for (const proxy of result.rows) {
- if (!isProxyInTimeout(proxy.id)) {
- return proxy;
- }
- }
- // All proxies are in timeout, wait for first one to expire
- if (proxyTimeouts.size > 0) {
- const nextAvailable = Math.min(...Array.from(proxyTimeouts.values()));
- const waitTime = Math.max(0, nextAvailable - Date.now());
- console.log(`⏳ All proxies in timeout, waiting ${Math.ceil(waitTime / 1000)}s for next available...`);
- await new Promise(resolve => setTimeout(resolve, waitTime));
- // Try again after waiting
- return getActiveProxy();
- }
- console.log('⚠️ No active proxies available');
- return null;
-}
-async function getSettings() {
- const result = await migrate_1.pool.query(`
- SELECT key, value FROM settings
- WHERE key IN ('proxy_timeout_ms', 'proxy_test_url')
- `);
- const settings = {};
- result.rows.forEach((row) => {
- settings[row.key] = row.value;
- });
- return {
- timeout: parseInt(settings.proxy_timeout_ms || '3000'),
- testUrl: settings.proxy_test_url || 'https://httpbin.org/ip'
- };
-}
-async function testProxy(host, port, protocol, username, password) {
- try {
- const { timeout, testUrl } = await getSettings();
- const startTime = Date.now();
- // Construct proxy URL
- let proxyUrl;
- if (username && password) {
- proxyUrl = `${protocol}://${username}:${password}@${host}:${port}`;
- }
- else {
- proxyUrl = `${protocol}://${host}:${port}`;
- }
- // Create appropriate agent based on protocol
- let agent;
- if (protocol === 'socks5' || protocol === 'socks') {
- agent = new socks_proxy_agent_1.SocksProxyAgent(proxyUrl);
- }
- else if (protocol === 'http' || protocol === 'https') {
- agent = new https_proxy_agent_1.HttpsProxyAgent(proxyUrl);
- }
- else {
- return {
- success: false,
- error: `Unsupported protocol: ${protocol}`
- };
- }
- // Make test request
- const response = await axios_1.default.get(testUrl, {
- httpAgent: agent,
- httpsAgent: agent,
- timeout,
- });
- const responseTimeMs = Date.now() - startTime;
- // Check anonymity - the test URL should return our IP
- // If it returns the proxy's IP, we're anonymous
- let isAnonymous = false;
- if (response.data && response.data.origin) {
- // If the returned IP is different from our actual IP, the proxy is working
- // For simplicity, we'll consider it anonymous if we get a response
- isAnonymous = true;
- }
- return {
- success: true,
- responseTimeMs,
- isAnonymous
- };
- }
- catch (error) {
- return {
- success: false,
- error: error.message || 'Unknown error'
- };
- }
-}
-async function saveProxyTestResult(proxyId, result) {
- await migrate_1.pool.query(`
- UPDATE proxies
- SET last_tested_at = CURRENT_TIMESTAMP,
- test_result = $1,
- response_time_ms = $2,
- is_anonymous = $3,
- active = $4,
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $5
- `, [
- result.success ? 'success' : 'failed',
- result.responseTimeMs || null,
- result.isAnonymous || false,
- result.success,
- proxyId
- ]);
-}
-async function testAllProxies() {
- console.log('🔍 Testing all proxies...');
- const result = await migrate_1.pool.query(`
- SELECT id, host, port, protocol, username, password
- FROM proxies
- `);
- for (const proxy of result.rows) {
- console.log(`Testing proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
- const testResult = await testProxy(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
- await saveProxyTestResult(proxy.id, testResult);
- if (testResult.success) {
- console.log(`✅ Proxy OK (${testResult.responseTimeMs}ms, anonymous: ${testResult.isAnonymous})`);
- }
- else {
- console.log(`❌ Proxy failed: ${testResult.error}`);
- }
- // Small delay between tests
- await new Promise(resolve => setTimeout(resolve, 500));
- }
- console.log('✅ Proxy testing complete');
-}
-async function addProxy(host, port, protocol, username, password) {
- // Test the proxy first
- const testResult = await testProxy(host, port, protocol, username, password);
- if (!testResult.success) {
- throw new Error(`Proxy test failed: ${testResult.error}`);
- }
- // Insert into database
- const result = await migrate_1.pool.query(`
- INSERT INTO proxies (host, port, protocol, username, password, active, is_anonymous, test_result, response_time_ms, last_tested_at)
- VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, CURRENT_TIMESTAMP)
- RETURNING id
- `, [
- host,
- port,
- protocol,
- username,
- password,
- testResult.success,
- testResult.isAnonymous,
- 'success',
- testResult.responseTimeMs
- ]);
- return result.rows[0].id;
-}
-async function addProxiesFromList(proxies) {
- let added = 0;
- let failed = 0;
- let duplicates = 0;
- const errors = [];
- console.log(`📥 Importing ${proxies.length} proxies without testing...`);
- for (const proxy of proxies) {
- try {
- // Insert without testing first
- await migrate_1.pool.query(`
- INSERT INTO proxies (host, port, protocol, username, password, active)
- VALUES ($1, $2, $3, $4, $5, false)
- ON CONFLICT (host, port, protocol) DO NOTHING
- `, [
- proxy.host,
- proxy.port,
- proxy.protocol,
- proxy.username,
- proxy.password
- ]);
- // Check if it was actually inserted
- const result = await migrate_1.pool.query(`
- SELECT id FROM proxies
- WHERE host = $1 AND port = $2 AND protocol = $3
- `, [proxy.host, proxy.port, proxy.protocol]);
- if (result.rows.length > 0) {
- // Check if it was just inserted (no last_tested_at means new)
- const checkResult = await migrate_1.pool.query(`
- SELECT last_tested_at FROM proxies
- WHERE host = $1 AND port = $2 AND protocol = $3
- `, [proxy.host, proxy.port, proxy.protocol]);
- if (checkResult.rows[0].last_tested_at === null) {
- added++;
- if (added % 100 === 0) {
- console.log(`📥 Imported ${added} proxies...`);
- }
- }
- else {
- duplicates++;
- }
- }
- }
- catch (error) {
- failed++;
- const errorMsg = `${proxy.host}:${proxy.port} - ${error.message}`;
- errors.push(errorMsg);
- console.log(`❌ Failed to add proxy: ${errorMsg}`);
- }
- }
- console.log(`✅ Import complete: ${added} added, ${duplicates} duplicates, ${failed} failed`);
- return { added, failed, duplicates, errors };
-}
-async function moveProxyToFailed(proxyId, errorMsg) {
- // Get proxy details
- const proxyResult = await migrate_1.pool.query(`
- SELECT host, port, protocol, username, password, failure_count
- FROM proxies
- WHERE id = $1
- `, [proxyId]);
- if (proxyResult.rows.length === 0) {
- return;
- }
- const proxy = proxyResult.rows[0];
- // Insert into failed_proxies table
- await migrate_1.pool.query(`
- INSERT INTO failed_proxies (host, port, protocol, username, password, failure_count, last_error)
- VALUES ($1, $2, $3, $4, $5, $6, $7)
- ON CONFLICT (host, port, protocol)
- DO UPDATE SET
- failure_count = $6,
- last_error = $7,
- failed_at = CURRENT_TIMESTAMP
- `, [
- proxy.host,
- proxy.port,
- proxy.protocol,
- proxy.username,
- proxy.password,
- proxy.failure_count,
- errorMsg
- ]);
- // Delete from active proxies
- await migrate_1.pool.query(`DELETE FROM proxies WHERE id = $1`, [proxyId]);
- console.log(`🔴 Moved proxy to failed: ${proxy.protocol}://${proxy.host}:${proxy.port} (${proxy.failure_count} failures)`);
-}
-async function incrementProxyFailure(proxyId, errorMsg) {
- // Increment failure count
- const result = await migrate_1.pool.query(`
- UPDATE proxies
- SET failure_count = failure_count + 1,
- active = false,
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $1
- RETURNING failure_count, host, port, protocol
- `, [proxyId]);
- if (result.rows.length === 0) {
- return false;
- }
- const proxy = result.rows[0];
- const failureCount = proxy.failure_count;
- console.log(`⚠️ Proxy failure #${failureCount}: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
- // If failed 3 times, move to failed table
- if (failureCount >= 3) {
- await moveProxyToFailed(proxyId, errorMsg);
- return true; // Moved to failed
- }
- return false; // Still in active proxies
-}
diff --git a/backend/dist/services/proxyTestQueue.js b/backend/dist/services/proxyTestQueue.js
deleted file mode 100644
index e79c5735..00000000
--- a/backend/dist/services/proxyTestQueue.js
+++ /dev/null
@@ -1,174 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.cleanupOrphanedJobs = cleanupOrphanedJobs;
-exports.createProxyTestJob = createProxyTestJob;
-exports.getProxyTestJob = getProxyTestJob;
-exports.getActiveProxyTestJob = getActiveProxyTestJob;
-exports.cancelProxyTestJob = cancelProxyTestJob;
-const migrate_1 = require("../db/migrate");
-const proxy_1 = require("./proxy");
-// Simple in-memory queue - could be replaced with Bull/Bee-Queue for production
-const activeJobs = new Map();
-// Clean up orphaned jobs on server startup
-async function cleanupOrphanedJobs() {
- try {
- const result = await migrate_1.pool.query(`
- UPDATE proxy_test_jobs
- SET status = 'cancelled',
- completed_at = CURRENT_TIMESTAMP,
- updated_at = CURRENT_TIMESTAMP
- WHERE status IN ('pending', 'running')
- RETURNING id
- `);
- if (result.rows.length > 0) {
- console.log(`🧹 Cleaned up ${result.rows.length} orphaned proxy test jobs`);
- }
- }
- catch (error) {
- console.error('Error cleaning up orphaned jobs:', error);
- }
-}
-async function createProxyTestJob() {
- // Check for existing running jobs first
- const existingJob = await getActiveProxyTestJob();
- if (existingJob) {
- throw new Error('A proxy test job is already running. Please cancel it first.');
- }
- const result = await migrate_1.pool.query(`
- SELECT COUNT(*) as count FROM proxies
- `);
- const totalProxies = parseInt(result.rows[0].count);
- const jobResult = await migrate_1.pool.query(`
- INSERT INTO proxy_test_jobs (status, total_proxies)
- VALUES ('pending', $1)
- RETURNING id
- `, [totalProxies]);
- const jobId = jobResult.rows[0].id;
- // Start job in background
- runProxyTestJob(jobId).catch(err => {
- console.error(`❌ Proxy test job ${jobId} failed:`, err);
- });
- return jobId;
-}
-async function getProxyTestJob(jobId) {
- const result = await migrate_1.pool.query(`
- SELECT id, status, total_proxies, tested_proxies, passed_proxies, failed_proxies
- FROM proxy_test_jobs
- WHERE id = $1
- `, [jobId]);
- if (result.rows.length === 0) {
- return null;
- }
- return result.rows[0];
-}
-async function getActiveProxyTestJob() {
- const result = await migrate_1.pool.query(`
- SELECT id, status, total_proxies, tested_proxies, passed_proxies, failed_proxies
- FROM proxy_test_jobs
- WHERE status IN ('pending', 'running')
- ORDER BY created_at DESC
- LIMIT 1
- `);
- if (result.rows.length === 0) {
- return null;
- }
- return result.rows[0];
-}
-async function cancelProxyTestJob(jobId) {
- // Try to cancel in-memory job first
- const jobControl = activeJobs.get(jobId);
- if (jobControl) {
- jobControl.cancelled = true;
- }
- // Always update database to handle orphaned jobs
- const result = await migrate_1.pool.query(`
- UPDATE proxy_test_jobs
- SET status = 'cancelled',
- completed_at = CURRENT_TIMESTAMP,
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $1 AND status IN ('pending', 'running')
- RETURNING id
- `, [jobId]);
- return result.rows.length > 0;
-}
-async function runProxyTestJob(jobId) {
- // Register job as active
- activeJobs.set(jobId, { cancelled: false });
- try {
- // Update status to running
- await migrate_1.pool.query(`
- UPDATE proxy_test_jobs
- SET status = 'running',
- started_at = CURRENT_TIMESTAMP,
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $1
- `, [jobId]);
- console.log(`🔍 Starting proxy test job ${jobId}...`);
- // Get all proxies
- const result = await migrate_1.pool.query(`
- SELECT id, host, port, protocol, username, password
- FROM proxies
- ORDER BY id
- `);
- let tested = 0;
- let passed = 0;
- let failed = 0;
- for (const proxy of result.rows) {
- // Check if job was cancelled
- const jobControl = activeJobs.get(jobId);
- if (jobControl?.cancelled) {
- console.log(`⏸️ Proxy test job ${jobId} cancelled`);
- break;
- }
- // Test the proxy
- const testResult = await (0, proxy_1.testProxy)(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password);
- // Save result
- await (0, proxy_1.saveProxyTestResult)(proxy.id, testResult);
- tested++;
- if (testResult.success) {
- passed++;
- }
- else {
- failed++;
- }
- // Update job progress
- await migrate_1.pool.query(`
- UPDATE proxy_test_jobs
- SET tested_proxies = $1,
- passed_proxies = $2,
- failed_proxies = $3,
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $4
- `, [tested, passed, failed, jobId]);
- // Log progress every 10 proxies
- if (tested % 10 === 0) {
- console.log(`📊 Job ${jobId}: ${tested}/${result.rows.length} proxies tested (${passed} passed, ${failed} failed)`);
- }
- }
- // Mark job as completed
- const jobControl = activeJobs.get(jobId);
- const finalStatus = jobControl?.cancelled ? 'cancelled' : 'completed';
- await migrate_1.pool.query(`
- UPDATE proxy_test_jobs
- SET status = $1,
- completed_at = CURRENT_TIMESTAMP,
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $2
- `, [finalStatus, jobId]);
- console.log(`✅ Proxy test job ${jobId} ${finalStatus}: ${tested} tested, ${passed} passed, ${failed} failed`);
- }
- catch (error) {
- console.error(`❌ Proxy test job ${jobId} error:`, error);
- await migrate_1.pool.query(`
- UPDATE proxy_test_jobs
- SET status = 'failed',
- completed_at = CURRENT_TIMESTAMP,
- updated_at = CURRENT_TIMESTAMP
- WHERE id = $1
- `, [jobId]);
- }
- finally {
- // Remove from active jobs
- activeJobs.delete(jobId);
- }
-}
diff --git a/backend/dist/services/scheduler.js b/backend/dist/services/scheduler.js
deleted file mode 100644
index dfa670a4..00000000
--- a/backend/dist/services/scheduler.js
+++ /dev/null
@@ -1,104 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.startScheduler = startScheduler;
-exports.stopScheduler = stopScheduler;
-exports.restartScheduler = restartScheduler;
-exports.triggerStoreScrape = triggerStoreScrape;
-exports.triggerAllStoresScrape = triggerAllStoresScrape;
-const node_cron_1 = __importDefault(require("node-cron"));
-const migrate_1 = require("../db/migrate");
-const scraper_v2_1 = require("../scraper-v2");
-let scheduledJobs = [];
-async function getSettings() {
- const result = await migrate_1.pool.query(`
- SELECT key, value FROM settings
- WHERE key IN ('scrape_interval_hours', 'scrape_specials_time')
- `);
- const settings = {};
- result.rows.forEach((row) => {
- settings[row.key] = row.value;
- });
- return {
- scrapeIntervalHours: parseInt(settings.scrape_interval_hours || '4'),
- scrapeSpecialsTime: settings.scrape_specials_time || '00:01'
- };
-}
-async function scrapeAllStores() {
- console.log('🔄 Starting scheduled scrape for all stores...');
- const result = await migrate_1.pool.query(`
- SELECT id, name FROM stores WHERE active = true AND scrape_enabled = true
- `);
- for (const store of result.rows) {
- try {
- console.log(`Scraping store: ${store.name}`);
- await (0, scraper_v2_1.scrapeStore)(store.id);
- }
- catch (error) {
- console.error(`Failed to scrape store ${store.name}:`, error);
- }
- }
- console.log('✅ Scheduled scrape completed');
-}
-async function scrapeSpecials() {
- console.log('🌟 Starting scheduled specials scrape...');
- const result = await migrate_1.pool.query(`
- SELECT s.id, s.name, c.id as category_id
- FROM stores s
- JOIN categories c ON c.store_id = s.id
- WHERE s.active = true AND s.scrape_enabled = true
- AND c.slug = 'specials' AND c.scrape_enabled = true
- `);
- for (const row of result.rows) {
- try {
- console.log(`Scraping specials for: ${row.name}`);
- await (0, scraper_v2_1.scrapeCategory)(row.id, row.category_id);
- }
- catch (error) {
- console.error(`Failed to scrape specials for ${row.name}:`, error);
- }
- }
- console.log('✅ Specials scrape completed');
-}
-async function startScheduler() {
- // Stop any existing jobs
- stopScheduler();
- const settings = await getSettings();
- // Schedule regular store scrapes (every N hours)
- const scrapeIntervalCron = `0 */${settings.scrapeIntervalHours} * * *`;
- const storeJob = node_cron_1.default.schedule(scrapeIntervalCron, scrapeAllStores);
- scheduledJobs.push(storeJob);
- console.log(`📅 Scheduled store scraping: every ${settings.scrapeIntervalHours} hours`);
- // Schedule specials scraping (daily at specified time)
- const [hours, minutes] = settings.scrapeSpecialsTime.split(':');
- const specialsCron = `${minutes} ${hours} * * *`;
- const specialsJob = node_cron_1.default.schedule(specialsCron, scrapeSpecials);
- scheduledJobs.push(specialsJob);
- console.log(`📅 Scheduled specials scraping: daily at ${settings.scrapeSpecialsTime}`);
- // Initial scrape on startup (after 10 seconds)
- setTimeout(() => {
- console.log('🚀 Running initial scrape...');
- scrapeAllStores().catch(console.error);
- }, 10000);
-}
-function stopScheduler() {
- scheduledJobs.forEach(job => job.stop());
- scheduledJobs = [];
- console.log('🛑 Scheduler stopped');
-}
-async function restartScheduler() {
- console.log('🔄 Restarting scheduler...');
- stopScheduler();
- await startScheduler();
-}
-// Manual trigger functions for admin
-async function triggerStoreScrape(storeId) {
- console.log(`🔧 Manual scrape triggered for store ID: ${storeId}`);
- await (0, scraper_v2_1.scrapeStore)(storeId);
-}
-async function triggerAllStoresScrape() {
- console.log('🔧 Manual scrape triggered for all stores');
- await scrapeAllStores();
-}
diff --git a/backend/dist/services/scraper-debug.js b/backend/dist/services/scraper-debug.js
deleted file mode 100644
index 2050279f..00000000
--- a/backend/dist/services/scraper-debug.js
+++ /dev/null
@@ -1,83 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.debugDutchiePage = debugDutchiePage;
-const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
-const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
-const logger_1 = require("./logger");
-// Apply stealth plugin
-puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
-async function debugDutchiePage(url) {
- const browser = await puppeteer_extra_1.default.launch({
- headless: 'new',
- args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
- });
- const page = await browser.newPage();
- await page.setViewport({ width: 1920, height: 1080 });
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
- logger_1.logger.info('scraper', `Loading: ${url}`);
- try {
- await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
- logger_1.logger.info('scraper', 'Page loaded, waiting for content...');
- // Wait for content to render
- await page.waitForTimeout(8000);
- const debug = await page.evaluate(() => {
- // Try to find product cards
- const productSelectors = [
- '[data-testid*="product"]',
- '[class*="Product"]',
- '[class*="product"]',
- 'article',
- '[role="article"]',
- 'li'
- ];
- const results = {
- selectors: {}
- };
- for (const selector of productSelectors) {
- const elements = document.querySelectorAll(selector);
- results.selectors[selector] = elements.length;
- }
- // Get sample HTML from first few matches
- const firstMatch = document.querySelector('[class*="product" i], article, [data-testid*="product"]');
- if (firstMatch) {
- results.sampleHTML = firstMatch.outerHTML.substring(0, 1000);
- results.sampleText = firstMatch.textContent?.substring(0, 500);
- }
- // Get all class names that might be products
- const allElements = document.querySelectorAll('*');
- const classNames = new Set();
- allElements.forEach(el => {
- const classes = el.className;
- if (typeof classes === 'string' && classes.toLowerCase().includes('product')) {
- classes.split(' ').forEach(c => classNames.add(c));
- }
- });
- results.productClasses = Array.from(classNames).slice(0, 20);
- results.bodyTextSample = document.body.innerText.substring(0, 500);
- return results;
- });
- logger_1.logger.info('scraper', `Debug results:\n${JSON.stringify(debug, null, 2)}`);
- }
- catch (error) {
- logger_1.logger.error('scraper', `Debug navigation error: ${error}`);
- // Try to get whatever we can
- try {
- const partialDebug = await page.evaluate(() => {
- return {
- url: window.location.href,
- title: document.title,
- bodyLength: document.body?.innerHTML?.length || 0,
- bodyStart: document.body?.innerHTML?.substring(0, 500) || ''
- };
- });
- logger_1.logger.info('scraper', `Partial debug:\n${JSON.stringify(partialDebug, null, 2)}`);
- }
- catch (e) {
- logger_1.logger.error('scraper', `Could not get partial debug: ${e}`);
- }
- }
- await browser.close();
-}
diff --git a/backend/dist/services/scraper-playwright.js b/backend/dist/services/scraper-playwright.js
deleted file mode 100644
index ad2ec2fa..00000000
--- a/backend/dist/services/scraper-playwright.js
+++ /dev/null
@@ -1,236 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.scrapeCategoryPlaywright = scrapeCategoryPlaywright;
-exports.testScrapeCategoryPlaywright = testScrapeCategoryPlaywright;
-const age_gate_playwright_1 = require("../utils/age-gate-playwright");
-const logger_1 = require("./logger");
-const stealthBrowser_1 = require("../utils/stealthBrowser");
-const dutchie_1 = require("../scrapers/templates/dutchie");
-/**
- * Scrapes a category page using Playwright with stealth mode to extract product information
- */
-async function scrapeCategoryPlaywright(categoryUrl, categoryName, state = 'Arizona', proxy) {
- logger_1.logger.info('scraper', `Scraping category: ${categoryName}`);
- logger_1.logger.info('scraper', `URL: ${categoryUrl}`);
- // Create stealth browser with optional proxy
- const browser = await (0, stealthBrowser_1.createStealthBrowser)({ proxy, headless: true });
- try {
- // Create stealth context with age gate cookies
- const context = await (0, stealthBrowser_1.createStealthContext)(browser, { state });
- // Try to load saved session cookies
- const cookiesPath = `/tmp/dutchie-session-${state.toLowerCase()}.json`;
- await (0, stealthBrowser_1.loadCookies)(context, cookiesPath);
- const page = await context.newPage();
- // Navigate to category page
- logger_1.logger.info('scraper', `Loading page: ${categoryUrl}`);
- await page.goto(categoryUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
- // Random delay to appear more human
- await (0, stealthBrowser_1.randomDelay)(1000, 2000);
- // Check for Cloudflare challenge
- if (await (0, stealthBrowser_1.isCloudflareChallenge)(page)) {
- logger_1.logger.info('scraper', '🛡️ Cloudflare challenge detected, waiting...');
- const passed = await (0, stealthBrowser_1.waitForCloudflareChallenge)(page, 30000);
- if (!passed) {
- logger_1.logger.error('scraper', '❌ Failed to pass Cloudflare challenge');
- await browser.close();
- return [];
- }
- // Save successful session cookies
- await (0, stealthBrowser_1.saveCookies)(context, cookiesPath);
- }
- // Wait for page to be fully loaded
- await (0, stealthBrowser_1.waitForPageLoad)(page);
- // Simulate human behavior
- await (0, stealthBrowser_1.simulateHumanBehavior)(page);
- // Check for and bypass age gate
- const bypassed = await (0, age_gate_playwright_1.bypassAgeGatePlaywright)(page, state);
- if (!bypassed) {
- logger_1.logger.error('scraper', 'Failed to bypass age gate');
- await browser.close();
- return [];
- }
- // Wait for products to load with random delay
- logger_1.logger.info('scraper', 'Waiting for products to load...');
- await (0, stealthBrowser_1.randomDelay)(2000, 4000);
- // Scroll to load all products with human-like behavior
- logger_1.logger.info('scraper', 'Scrolling to load all products...');
- await scrollToBottomHuman(page);
- // Extract products
- logger_1.logger.info('scraper', 'Extracting products from page...');
- const products = await extractProducts(page, categoryUrl, categoryName);
- logger_1.logger.info('scraper', `Found ${products.length} products`);
- await browser.close();
- return products;
- }
- catch (error) {
- logger_1.logger.error('scraper', `Error scraping category: ${error}`);
- await browser.close();
- return [];
- }
-}
-/**
- * Scrolls to the bottom of the page with human-like behavior
- */
-async function scrollToBottomHuman(page) {
- let previousHeight = 0;
- let currentHeight = await page.evaluate(() => document.body.scrollHeight);
- let attempts = 0;
- const maxAttempts = 20;
- while (previousHeight < currentHeight && attempts < maxAttempts) {
- previousHeight = currentHeight;
- // Scroll down in chunks with randomized delays
- const scrollAmount = Math.floor(Math.random() * 200) + 300; // 300-500px
- await (0, stealthBrowser_1.humanScroll)(page, scrollAmount);
- // Random pause like a human reading
- await (0, stealthBrowser_1.randomDelay)(500, 1500);
- // Check new height
- currentHeight = await page.evaluate(() => document.body.scrollHeight);
- attempts++;
- }
- // Final wait for any lazy-loaded content
- await (0, stealthBrowser_1.randomDelay)(1000, 2000);
-}
-/**
- * Extracts product information from the page
- */
-async function extractProducts(page, categoryUrl, categoryName) {
- let products = [];
- // Check if we have a template for this URL
- const template = (0, dutchie_1.getTemplateForUrl)(categoryUrl);
- if (template) {
- logger_1.logger.info('scraper', `Using ${template.name} template for extraction`);
- try {
- const templateProducts = await template.extractProducts(page);
- // Add category to products from template
- products = templateProducts.map(p => ({
- ...p,
- category: categoryName,
- }));
- logger_1.logger.info('scraper', `Template extracted ${products.length} products`);
- return products;
- }
- catch (err) {
- logger_1.logger.error('scraper', `Template extraction failed: ${err}`);
- // Fall through to fallback methods
- }
- }
- // Fallback Method 1: Dutchie products (for Sol Flower, etc.)
- try {
- const dutchieProducts = await page.locator('[data-testid^="product-"], .product-card, [class*="ProductCard"]').all();
- if (dutchieProducts.length > 0) {
- logger_1.logger.info('scraper', `Found ${dutchieProducts.length} Dutchie-style products`);
- for (const productEl of dutchieProducts) {
- try {
- const name = await productEl.locator('[data-testid="product-name"], .product-name, h3, h4').first().textContent() || '';
- const brand = await productEl.locator('[data-testid="product-brand"], .product-brand, .brand').first().textContent().catch(() => '');
- const priceText = await productEl.locator('[data-testid="product-price"], .product-price, .price').first().textContent().catch(() => '');
- const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => '');
- const productLink = await productEl.locator('a').first().getAttribute('href').catch(() => '');
- // Parse price
- const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined;
- if (name) {
- products.push({
- name: name.trim(),
- brand: brand ? brand.trim() : undefined,
- category: categoryName,
- price,
- image_url: imageUrl || undefined,
- product_url: productLink ? new URL(productLink, categoryUrl).toString() : categoryUrl,
- in_stock: true
- });
- }
- }
- catch (err) {
- logger_1.logger.warn('scraper', `Error extracting Dutchie product: ${err}`);
- }
- }
- }
- }
- catch (err) {
- logger_1.logger.warn('scraper', `Dutchie product extraction failed: ${err}`);
- }
- // Method 2: Curaleaf products
- if (products.length === 0) {
- try {
- const curaleafProducts = await page.locator('.product, [class*="Product"], [class*="item"]').all();
- if (curaleafProducts.length > 0) {
- logger_1.logger.info('scraper', `Found ${curaleafProducts.length} Curaleaf-style products`);
- for (const productEl of curaleafProducts) {
- try {
- const name = await productEl.locator('h1, h2, h3, h4, .title, .name').first().textContent() || '';
- const priceText = await productEl.locator('.price, [class*="price"]').first().textContent().catch(() => '');
- const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => '');
- const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined;
- if (name && name.length > 3) {
- products.push({
- name: name.trim(),
- category: categoryName,
- price,
- image_url: imageUrl || undefined,
- product_url: categoryUrl,
- in_stock: true
- });
- }
- }
- catch (err) {
- logger_1.logger.warn('scraper', `Error extracting Curaleaf product: ${err}`);
- }
- }
- }
- }
- catch (err) {
- logger_1.logger.warn('scraper', `Curaleaf product extraction failed: ${err}`);
- }
- }
- // Method 3: Generic product cards
- if (products.length === 0) {
- try {
- const genericProducts = await page.locator('article, [role="article"], .card, [class*="card"]').all();
- logger_1.logger.info('scraper', `Trying generic selectors, found ${genericProducts.length} elements`);
- for (const productEl of genericProducts) {
- try {
- const text = await productEl.textContent() || '';
- // Only consider elements that look like products
- if (text.includes('$') || text.toLowerCase().includes('price') || text.toLowerCase().includes('thc')) {
- const name = await productEl.locator('h1, h2, h3, h4').first().textContent() || '';
- if (name && name.length > 3) {
- products.push({
- name: name.trim(),
- category: categoryName,
- product_url: categoryUrl,
- in_stock: true
- });
- }
- }
- }
- catch (err) {
- // Skip this element
- }
- }
- }
- catch (err) {
- logger_1.logger.warn('scraper', `Generic product extraction failed: ${err}`);
- }
- }
- return products;
-}
-/**
- * Test function to scrape a single category
- */
-async function testScrapeCategoryPlaywright(url, categoryName, state = 'Arizona') {
- console.log(`\n🎭 Testing Playwright Category Scraper\n`);
- console.log(`Category: ${categoryName}`);
- console.log(`URL: ${url}\n`);
- const products = await scrapeCategoryPlaywright(url, categoryName, state);
- console.log(`\n✅ Found ${products.length} products\n`);
- products.slice(0, 5).forEach((p, i) => {
- console.log(`${i + 1}. ${p.name}`);
- if (p.brand)
- console.log(` Brand: ${p.brand}`);
- if (p.price)
- console.log(` Price: $${p.price}`);
- console.log(` URL: ${p.product_url}`);
- console.log('');
- });
- return products;
-}
diff --git a/backend/dist/services/scraper.js b/backend/dist/services/scraper.js
deleted file mode 100644
index aaaa917d..00000000
--- a/backend/dist/services/scraper.js
+++ /dev/null
@@ -1,717 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.USER_AGENT_GROUPS = exports.USER_AGENTS = void 0;
-exports.getUserAgent = getUserAgent;
-exports.scrapeCategory = scrapeCategory;
-exports.saveProducts = saveProducts;
-exports.scrapeStore = scrapeStore;
-const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
-const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
-const migrate_1 = require("../db/migrate");
-const minio_1 = require("../utils/minio");
-const logger_1 = require("./logger");
-const scraper_monitor_1 = require("../routes/scraper-monitor");
-const proxy_1 = require("./proxy");
-const age_gate_1 = require("../utils/age-gate");
-const availability_1 = require("./availability");
-// Apply stealth plugin for antidetect/anti-fingerprinting
-puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
-exports.USER_AGENTS = {
- 'chrome-windows': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'chrome-mac': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'chrome-linux': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'mobile-ios': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
- 'mobile-android': 'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36',
- 'googlebot': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
- 'bingbot': 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'
-};
-exports.USER_AGENT_GROUPS = {
- desktop: ['chrome-windows', 'chrome-mac', 'chrome-linux'],
- mobile: ['mobile-ios', 'mobile-android'],
- serp: ['googlebot', 'bingbot']
-};
-function getRandomUserAgentFromGroup(group) {
- const randomKey = group[Math.floor(Math.random() * group.length)];
- return exports.USER_AGENTS[randomKey];
-}
-function getUserAgent(key) {
- if (!key)
- return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
- // Check if it's a group
- if (key === 'rotate-desktop')
- return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
- if (key === 'rotate-mobile')
- return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.mobile);
- if (key === 'rotate-serp')
- return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.serp);
- // Otherwise treat as specific UA
- return exports.USER_AGENTS[key] || getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop);
-}
-function extractImageIdFromUrl(url) {
- try {
- const match = url.match(/images\.dutchie\.com\/([a-f0-9]+)/i);
- return match ? match[1] : null;
- }
- catch (e) {
- return null;
- }
-}
-function getFullSizeImageUrl(imageUrl) {
- const imageId = extractImageIdFromUrl(imageUrl);
- if (!imageId)
- return imageUrl;
- return `https://images.dutchie.com/${imageId}?auto=format&fit=max&q=95&w=2000&h=2000`;
-}
-function sanitizeProductData(product) {
- return {
- ...product,
- name: product.name?.substring(0, 500) || 'Unnamed Product',
- description: product.description || null,
- brand: product.brand?.substring(0, 500) || null,
- weight: product.weight?.substring(0, 100) || null,
- thc: product.thc && product.thc < 100 ? product.thc : null,
- cbd: product.cbd && product.cbd < 100 ? product.cbd : null
- };
-}
-async function makePageStealthy(page) {
- await page.evaluateOnNewDocument(() => {
- Object.defineProperty(navigator, 'webdriver', {
- get: () => false,
- });
- });
- await page.evaluateOnNewDocument(() => {
- Object.defineProperty(navigator, 'plugins', {
- get: () => [1, 2, 3, 4, 5],
- });
- });
- await page.evaluateOnNewDocument(() => {
- Object.defineProperty(navigator, 'languages', {
- get: () => ['en-US', 'en'],
- });
- });
- await page.evaluateOnNewDocument(() => {
- window.chrome = {
- runtime: {},
- };
- });
- await page.evaluateOnNewDocument(() => {
- const originalQuery = window.navigator.permissions.query;
- window.navigator.permissions.query = (parameters) => parameters.name === 'notifications'
- ? Promise.resolve({ state: 'denied' })
- : originalQuery(parameters);
- });
-}
-async function scrapeProductDetails(page, productUrl, productName) {
- const maxRetries = 3;
- let lastError = null;
- for (let attempt = 1; attempt <= maxRetries; attempt++) {
- try {
- await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
- const details = await page.evaluate(() => {
- const allText = document.body.textContent || '';
- let fullSizeImage = null;
- const mainImageSelectors = [
- 'img[class*="ProductImage"]',
- 'img[class*="product-image"]',
- '[class*="ImageGallery"] img',
- 'main img',
- 'img[src*="images.dutchie.com"]'
- ];
- for (const sel of mainImageSelectors) {
- const img = document.querySelector(sel);
- if (img?.src && img.src.includes('dutchie.com')) {
- fullSizeImage = img.src;
- break;
- }
- }
- let description = '';
- const descSelectors = [
- '[class*="description"]',
- '[class*="Description"]',
- '[data-testid*="description"]',
- 'p[class*="product"]'
- ];
- for (const sel of descSelectors) {
- const el = document.querySelector(sel);
- if (el?.textContent?.trim() && el.textContent.length > 20) {
- description = el.textContent.trim();
- break;
- }
- }
- let thc = null;
- const thcPatterns = [
- /THC[:\s]*(\d+\.?\d*)\s*%/i,
- /Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i,
- /(\d+\.?\d*)\s*%\s+THC/i
- ];
- for (const pattern of thcPatterns) {
- const match = allText.match(pattern);
- if (match) {
- thc = parseFloat(match[1]);
- break;
- }
- }
- let cbd = null;
- const cbdPatterns = [
- /CBD[:\s]*(\d+\.?\d*)\s*%/i,
- /Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i,
- /(\d+\.?\d*)\s*%\s+CBD/i
- ];
- for (const pattern of cbdPatterns) {
- const match = allText.match(pattern);
- if (match) {
- cbd = parseFloat(match[1]);
- break;
- }
- }
- let strainType = null;
- if (allText.match(/\bindica\b/i))
- strainType = 'Indica';
- else if (allText.match(/\bsativa\b/i))
- strainType = 'Sativa';
- else if (allText.match(/\bhybrid\b/i))
- strainType = 'Hybrid';
- const terpenes = [];
- const terpeneNames = [
- 'Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool',
- 'Humulene', 'Terpinolene', 'Ocimene', 'Bisabolol', 'Valencene'
- ];
- terpeneNames.forEach(terp => {
- if (allText.match(new RegExp(`\\b${terp}\\b`, 'i'))) {
- terpenes.push(terp);
- }
- });
- const effects = [];
- const effectNames = [
- 'Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative',
- 'Energetic', 'Focused', 'Calm', 'Sleepy', 'Hungry',
- 'Talkative', 'Giggly', 'Aroused'
- ];
- effectNames.forEach(effect => {
- if (allText.match(new RegExp(`\\b${effect}\\b`, 'i'))) {
- effects.push(effect);
- }
- });
- let brand = null;
- const brandSelectors = [
- '[class*="brand"]',
- '[class*="Brand"]',
- '[data-testid*="brand"]'
- ];
- for (const sel of brandSelectors) {
- const el = document.querySelector(sel);
- if (el?.textContent?.trim()) {
- brand = el.textContent.trim();
- break;
- }
- }
- let lineage = null;
- const lineageMatch = allText.match(/(?:Lineage|Genetics|Parents?)[:\s]*([^\n]+)/i);
- if (lineageMatch) {
- lineage = lineageMatch[1].trim();
- }
- const flavors = [];
- const flavorNames = [
- 'Sweet', 'Citrus', 'Earthy', 'Pine', 'Berry', 'Diesel',
- 'Sour', 'Floral', 'Spicy', 'Woody', 'Tropical', 'Fruity',
- 'Vanilla', 'Mint', 'Cheese', 'Grape', 'Lemon', 'Orange'
- ];
- flavorNames.forEach(flavor => {
- if (allText.match(new RegExp(`\\b${flavor}\\b`, 'i'))) {
- flavors.push(flavor);
- }
- });
- const weights = [];
- const weightMatches = allText.matchAll(/(\d+\.?\d*\s*(?:g|oz|mg|gram))/gi);
- for (const match of weightMatches) {
- const weight = match[1].trim();
- if (!weights.includes(weight)) {
- weights.push(weight);
- }
- }
- return {
- fullSizeImage,
- description,
- thc,
- cbd,
- strainType,
- terpenes,
- effects,
- brand,
- lineage,
- flavors,
- weights
- };
- });
- return details;
- }
- catch (error) {
- lastError = error;
- logger_1.logger.warn('scraper', ` Attempt ${attempt}/${maxRetries} failed for ${productName}: ${error}`);
- // No delays - just retry immediately
- }
- }
- logger_1.logger.error('scraper', ` ✗ All attempts failed for ${productName}`);
- return {
- fullSizeImage: null,
- description: null,
- thc: null,
- cbd: null,
- strainType: null,
- terpenes: [],
- effects: [],
- brand: null,
- lineage: null,
- flavors: [],
- weights: []
- };
-}
-async function scrapeCategory(storeId, categoryId, userAgent) {
- let browser = null;
- const scraperId = `cat-${categoryId}-${Date.now()}`;
- let proxyId = null;
- try {
- const categoryResult = await migrate_1.pool.query(`
- SELECT c.*, s.slug as store_slug, s.name as store_name
- FROM categories c
- JOIN stores s ON c.store_id = s.id
- WHERE c.id = $1
- `, [categoryId]);
- if (categoryResult.rows.length === 0) {
- throw new Error('Category not found');
- }
- const category = categoryResult.rows[0];
- logger_1.logger.info('scraper', `Scraping category: ${category.name} for ${category.store_name}`);
- // Register scraper with monitoring system
- (0, scraper_monitor_1.registerScraper)(scraperId, storeId, category.store_name, categoryId, category.name);
- const proxy = await (0, proxy_1.getActiveProxy)();
- if (proxy) {
- proxyId = proxy.id;
- }
- const launchOptions = {
- headless: 'new',
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-blink-features=AutomationControlled',
- '--window-size=1920,1080'
- ]
- };
- if (proxy) {
- if (proxy.protocol === 'socks5') {
- launchOptions.args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`);
- }
- else if (proxy.protocol === 'http' || proxy.protocol === 'https') {
- launchOptions.args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`);
- }
- logger_1.logger.info('scraper', `Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`);
- }
- browser = await puppeteer_extra_1.default.launch(launchOptions);
- const page = await browser.newPage();
- await makePageStealthy(page);
- await page.setViewport({ width: 1920, height: 1080 });
- // Use provided userAgent or random if not specified
- const ua = getUserAgent(userAgent);
- await page.setUserAgent(ua);
- // Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites)
- const state = (0, age_gate_1.detectStateFromUrl)(category.dutchie_url);
- await (0, age_gate_1.setAgeGateCookies)(page, category.dutchie_url, state);
- logger_1.logger.info('scraper', `Loading page: ${category.dutchie_url}`);
- try {
- await page.goto(category.dutchie_url, {
- waitUntil: 'networkidle2',
- timeout: 60000
- });
- // If age gate still appears, try to bypass it
- await (0, age_gate_1.bypassAgeGate)(page, state);
- // Wait for products to load
- await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', {
- timeout: 30000,
- }).catch(() => {
- logger_1.logger.warn('scraper', 'No product selectors found, trying anyway...');
- });
- logger_1.logger.info('scraper', 'Scrolling to load all products...');
- await autoScroll(page);
- }
- catch (navError) {
- logger_1.logger.error('scraper', `Navigation error: ${navError}`);
- // Check if this is bot detection - put proxy in timeout instead of hard failure
- if (proxyId) {
- const errorMsg = String(navError);
- if ((0, proxy_1.isBotDetectionError)(errorMsg)) {
- // Bot detection! Put this proxy in timeout and get a new one
- logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
- (0, proxy_1.putProxyInTimeout)(proxyId, errorMsg);
- throw new Error(`Bot detection: ${errorMsg}`);
- }
- else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
- errorMsg.includes('ERR_') || errorMsg.includes('Navigation')) {
- // Regular proxy failure - increment failure count
- logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
- await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg);
- }
- }
- throw navError;
- }
- logger_1.logger.info('scraper', 'Extracting product list from page...');
- const products = await page.evaluate(() => {
- const items = [];
- const cards = document.querySelectorAll('[data-testid="product-list-item"]');
- console.log(`Found ${cards.length} product cards`);
- cards.forEach((card) => {
- try {
- const allText = card.textContent || '';
- let name = '';
- const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4'];
- for (const sel of nameSelectors) {
- const el = card.querySelector(sel);
- if (el?.textContent?.trim()) {
- name = el.textContent.trim();
- name = name.split('\n')[0].trim();
- break;
- }
- }
- if (!name || name.length < 2)
- return;
- let price = null;
- let originalPrice = null;
- const priceMatches = allText.match(/\$(\d+\.?\d*)/g);
- if (priceMatches && priceMatches.length > 0) {
- price = parseFloat(priceMatches[0].replace('$', ''));
- if (priceMatches.length > 1) {
- originalPrice = parseFloat(priceMatches[1].replace('$', ''));
- }
- }
- // Extract variant (weight/size) - look for common patterns
- let variant = null;
- const variantPatterns = [
- /(\d+\.?\d*\s*(?:g|oz|mg|ml|gram|ounce))/i, // Weight units
- /(\d+\s*pack)/i, // Pack sizes
- /(\d+\s*ct)/i, // Count
- /(\d+\s*x\s*\d+\.?\d*\s*(?:g|mg|ml))/i // Multi-pack (e.g., 5x0.5g)
- ];
- for (const pattern of variantPatterns) {
- const match = allText.match(pattern);
- if (match) {
- variant = match[1].trim();
- break;
- }
- }
- const linkEl = card.querySelector('a[href*="/product/"]');
- let href = linkEl?.href || linkEl?.getAttribute('href') || '';
- if (href && href.startsWith('/')) {
- href = 'https://dutchie.com' + href;
- }
- items.push({
- name,
- variant,
- price,
- originalPrice,
- href: href || window.location.href
- });
- }
- catch (err) {
- console.error('Error parsing product card:', err);
- }
- });
- return items;
- });
- logger_1.logger.info('scraper', `Found ${products.length} products total`);
- logger_1.logger.info('scraper', `Now visiting each product page for complete details...`);
- let successCount = 0;
- let failCount = 0;
- // Update initial stats
- (0, scraper_monitor_1.updateScraperStats)(scraperId, {
- productsProcessed: 0,
- productsTotal: products.length
- });
- for (let i = 0; i < products.length; i++) {
- const product = products[i];
- try {
- logger_1.logger.info('scraper', ` [${i + 1}/${products.length}] ${product.name}`);
- (0, scraper_monitor_1.updateScraperStats)(scraperId, {
- productsProcessed: i + 1,
- productsTotal: products.length
- }, `Processing: ${product.name}`);
- if (!product.href) {
- logger_1.logger.warn('scraper', ` ⚠ No product URL, skipping details`);
- product.metadata = {};
- failCount++;
- continue;
- }
- const details = await scrapeProductDetails(page, product.href, product.name);
- product.imageUrl = details.fullSizeImage ? getFullSizeImageUrl(details.fullSizeImage) : null;
- product.description = details.description;
- product.thc = details.thc;
- product.cbd = details.cbd;
- product.strainType = details.strainType;
- product.brand = details.brand;
- product.weight = details.weights.length > 0 ? details.weights[0] : null;
- product.metadata = {
- terpenes: details.terpenes,
- effects: details.effects,
- lineage: details.lineage,
- flavors: details.flavors,
- allWeights: details.weights
- };
- if (details.thc || details.cbd || details.description) {
- logger_1.logger.info('scraper', ` ✓ THC: ${details.thc}%, CBD: ${details.cbd}%`);
- successCount++;
- }
- else {
- logger_1.logger.warn('scraper', ` ⚠ Limited data extracted`);
- failCount++;
- }
- // No delays - scrape fast!
- }
- catch (error) {
- logger_1.logger.error('scraper', ` ✗ Unexpected error: ${error}`);
- product.metadata = {};
- failCount++;
- }
- }
- await browser.close();
- logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
- logger_1.logger.info('scraper', `✅ Category complete: ${category.name}`);
- logger_1.logger.info('scraper', ` Total products: ${products.length}`);
- logger_1.logger.info('scraper', ` Success: ${successCount}`);
- logger_1.logger.info('scraper', ` Failed: ${failCount}`);
- logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
- await migrate_1.pool.query(`
- UPDATE categories
- SET last_scraped_at = CURRENT_TIMESTAMP
- WHERE id = $1
- `, [categoryId]);
- // Mark scraper as complete
- (0, scraper_monitor_1.completeScraper)(scraperId);
- const formattedProducts = products.map((p, index) => {
- const sanitized = sanitizeProductData(p);
- // Normalize availability from Dutchie product data
- const availability = (0, availability_1.normalizeAvailability)(p);
- return {
- dutchieProductId: `${category.store_slug}-${category.slug}-${Date.now()}-${index}`,
- name: sanitized.name,
- variant: p.variant || null,
- description: sanitized.description,
- price: p.price,
- originalPrice: p.originalPrice,
- thcPercentage: sanitized.thc,
- cbdPercentage: sanitized.cbd,
- strainType: p.strainType,
- brand: sanitized.brand,
- weight: sanitized.weight,
- imageUrl: p.imageUrl,
- dutchieUrl: p.href,
- metadata: p.metadata || {},
- availabilityStatus: availability.status,
- availabilityRaw: availability.raw,
- stockQuantity: availability.quantity
- };
- });
- return formattedProducts;
- }
- catch (error) {
- logger_1.logger.error('scraper', `❌ Category scraping error: ${error}`);
- // Smart proxy error handling
- if (proxyId) {
- const errorMsg = String(error);
- if ((0, proxy_1.isBotDetectionError)(errorMsg)) {
- // Bot detection! Put this proxy in timeout
- logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`);
- (0, proxy_1.putProxyInTimeout)(proxyId, errorMsg);
- }
- else if (errorMsg.includes('timeout') || errorMsg.includes('net::') ||
- errorMsg.includes('ERR_') || errorMsg.includes('Navigation') ||
- errorMsg.includes('Protocol error') || errorMsg.includes('Target closed')) {
- // Regular proxy failure - increment failure count
- logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`);
- await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg);
- }
- }
- // Mark scraper as failed
- (0, scraper_monitor_1.completeScraper)(scraperId, String(error));
- if (browser) {
- try {
- await browser.close();
- }
- catch (e) {
- logger_1.logger.error('scraper', `Error closing browser: ${e}`);
- }
- }
- throw error;
- }
-}
-async function autoScroll(page) {
- await page.evaluate(async () => {
- await new Promise((resolve) => {
- let totalHeight = 0;
- const distance = 500;
- const timer = setInterval(() => {
- const scrollHeight = document.body.scrollHeight;
- window.scrollBy(0, distance);
- totalHeight += distance;
- if (totalHeight >= scrollHeight) {
- clearInterval(timer);
- resolve();
- }
- }, 200);
- });
- });
-}
-async function saveProducts(storeId, categoryId, products) {
- const client = await migrate_1.pool.connect();
- try {
- await client.query('BEGIN');
- logger_1.logger.info('scraper', `Saving ${products.length} products to database...`);
- // Mark all products as out-of-stock before processing (they'll be re-marked if found)
- // Also update availability_status and last_seen_out_of_stock_at for state transition tracking
- await client.query(`
- UPDATE products
- SET in_stock = false,
- availability_status = 'out_of_stock',
- last_seen_out_of_stock_at = CASE
- WHEN availability_status != 'out_of_stock' THEN CURRENT_TIMESTAMP
- ELSE last_seen_out_of_stock_at
- END
- WHERE store_id = $1 AND category_id = $2 AND in_stock = true
- `, [storeId, categoryId]);
- for (const product of products) {
- try {
- // Get availability from product (defaults to in_stock if product exists in scraped data)
- const availStatus = product.availabilityStatus || 'in_stock';
- const availRaw = product.availabilityRaw ? JSON.stringify(product.availabilityRaw) : null;
- const stockQty = product.stockQuantity ?? null;
- const existingResult = await client.query(`
- SELECT id, image_url, local_image_path, availability_status
- FROM products
- WHERE store_id = $1 AND name = $2 AND category_id = $3
- AND (variant = $4 OR (variant IS NULL AND $4 IS NULL))
- `, [storeId, product.name, categoryId, product.variant || null]);
- let localImagePath = null;
- let productId;
- if (existingResult.rows.length > 0) {
- productId = existingResult.rows[0].id;
- localImagePath = existingResult.rows[0].local_image_path;
- const prevStatus = existingResult.rows[0].availability_status;
- // Determine if we need to update last_seen_in_stock_at
- const isNowInStock = availStatus === 'in_stock' || availStatus === 'limited';
- const wasOutOfStock = prevStatus === 'out_of_stock' || prevStatus === 'unknown';
- await client.query(`
- UPDATE products
- SET name = $1, variant = $2, description = $3, price = $4,
- strain_type = $5, thc_percentage = $6, cbd_percentage = $7,
- brand = $8, weight = $9, image_url = $10, dutchie_url = $11,
- in_stock = true, metadata = $12, last_seen_at = CURRENT_TIMESTAMP,
- updated_at = CURRENT_TIMESTAMP,
- availability_status = $14,
- availability_raw = $15,
- stock_quantity = $16,
- last_seen_in_stock_at = CASE
- WHEN $17 THEN CURRENT_TIMESTAMP
- ELSE last_seen_in_stock_at
- END
- WHERE id = $13
- `, [
- product.name, product.variant, product.description, product.price,
- product.strainType, product.thcPercentage, product.cbdPercentage,
- product.brand, product.weight, product.imageUrl, product.dutchieUrl,
- JSON.stringify(product.metadata), productId, availStatus, availRaw, stockQty,
- isNowInStock && wasOutOfStock
- ]);
- }
- else {
- // Generate unique slug from product name + timestamp + random suffix
- const baseSlug = product.name
- .toLowerCase()
- .replace(/[^a-z0-9]+/g, '-')
- .replace(/^-|-$/g, '')
- .substring(0, 150);
- const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).substr(2, 6)}`;
- const slug = `${baseSlug}-${uniqueSuffix}`;
- const insertResult = await client.query(`
- INSERT INTO products (
- store_id, category_id, dutchie_product_id, name, slug, variant, description,
- price, strain_type, thc_percentage, cbd_percentage,
- brand, weight, image_url, dutchie_url, in_stock, metadata,
- availability_status, availability_raw, stock_quantity, last_seen_in_stock_at
- ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16, $17, $18, $19, CURRENT_TIMESTAMP)
- RETURNING id
- `, [
- storeId, categoryId, product.dutchieProductId, product.name, slug, product.variant, product.description,
- product.price, product.strainType, product.thcPercentage, product.cbdPercentage,
- product.brand, product.weight, product.imageUrl, product.dutchieUrl,
- JSON.stringify(product.metadata), availStatus, availRaw, stockQty
- ]);
- productId = insertResult.rows[0].id;
- }
- if (product.imageUrl && !localImagePath) {
- try {
- localImagePath = await (0, minio_1.uploadImageFromUrl)(product.imageUrl, productId);
- await client.query(`
- UPDATE products
- SET local_image_path = $1
- WHERE id = $2
- `, [localImagePath, productId]);
- }
- catch (error) {
- logger_1.logger.error('images', `Failed to download image for ${product.name}: ${error}`);
- }
- }
- }
- catch (productError) {
- logger_1.logger.error('scraper', `Failed to save product ${product.name}: ${productError}`);
- }
- }
- await client.query('COMMIT');
- logger_1.logger.info('scraper', `✅ Saved ${products.length} products successfully`);
- }
- catch (error) {
- await client.query('ROLLBACK');
- logger_1.logger.error('scraper', `Error saving products: ${error}`);
- throw error;
- }
- finally {
- client.release();
- }
-}
-async function scrapeStore(storeId, parallel = 3, userAgent) {
- try {
- logger_1.logger.info('scraper', `🏪 Starting scrape for store ID: ${storeId} (${parallel} parallel, UA: ${userAgent || 'random'})`);
- const categoriesResult = await migrate_1.pool.query(`
- SELECT c.id, c.name, c.slug, c.dutchie_url
- FROM categories c
- WHERE c.store_id = $1
- AND c.scrape_enabled = true
- ORDER BY c.name
- `, [storeId]);
- logger_1.logger.info('scraper', `Found ${categoriesResult.rows.length} categories to scrape`);
- for (const category of categoriesResult.rows) {
- try {
- logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
- logger_1.logger.info('scraper', `📂 Scraping: ${category.name}`);
- logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
- const products = await scrapeCategory(storeId, category.id, userAgent);
- await saveProducts(storeId, category.id, products);
- logger_1.logger.info('scraper', `✅ Completed ${category.name} - ${products.length} products saved`);
- }
- catch (error) {
- logger_1.logger.error('scraper', `❌ Failed to scrape ${category.name}: ${error}`);
- }
- // No delays - scrape fast!
- }
- await migrate_1.pool.query(`
- UPDATE stores
- SET last_scraped_at = CURRENT_TIMESTAMP
- WHERE id = $1
- `, [storeId]);
- logger_1.logger.info('scraper', `🎉 Store scrape completed: ID ${storeId}`);
- }
- catch (error) {
- logger_1.logger.error('scraper', `❌ Store scrape failed: ${error}`);
- throw error;
- }
-}
diff --git a/backend/dist/services/store-crawl-orchestrator.js b/backend/dist/services/store-crawl-orchestrator.js
deleted file mode 100644
index 11831849..00000000
--- a/backend/dist/services/store-crawl-orchestrator.js
+++ /dev/null
@@ -1,351 +0,0 @@
-"use strict";
-/**
- * Store Crawl Orchestrator
- *
- * Orchestrates the complete crawl workflow for a store:
- * 1. Load store and its linked dispensary
- * 2. Check if provider detection is needed
- * 3. Run provider detection if needed
- * 4. Queue appropriate crawl jobs based on provider/mode
- * 5. Update store_crawl_schedule with meaningful status
- *
- * This replaces the simple "triggerManualCrawl" with intelligent orchestration.
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.runStoreCrawlOrchestrator = runStoreCrawlOrchestrator;
-exports.runBatchOrchestrator = runBatchOrchestrator;
-exports.getStoresDueForOrchestration = getStoresDueForOrchestration;
-const uuid_1 = require("uuid");
-const migrate_1 = require("../db/migrate");
-const crawler_logger_1 = require("./crawler-logger");
-const intelligence_detector_1 = require("./intelligence-detector");
-const category_crawler_jobs_1 = require("./category-crawler-jobs");
-// DEPRECATED: scrapeStore writes to legacy products table
-// import { scrapeStore } from '../scraper-v2';
-// Import the new dutchie-az pipeline for Dutchie crawling
-const product_crawler_1 = require("../dutchie-az/services/product-crawler");
-const connection_1 = require("../dutchie-az/db/connection");
-// ========================================
-// Main Orchestrator Function
-// ========================================
-/**
- * Run the complete crawl orchestration for a store
- *
- * Behavior:
- * 1. Load the store and its linked dispensary
- * 2. If no dispensary is linked, report error
- * 3. If product_provider is missing or stale (>7 days), run detection
- * 4. After detection:
- * - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl
- * - Otherwise: Run sandbox crawl
- * 5. Update store_crawl_schedule with status/summary
- */
-async function runStoreCrawlOrchestrator(storeId) {
- const startTime = Date.now();
- const runId = (0, uuid_1.v4)();
- let result = {
- status: 'pending',
- summary: '',
- runId,
- storeId,
- dispensaryId: null,
- detectionRan: false,
- crawlRan: false,
- durationMs: 0,
- };
- try {
- // Mark schedule as running
- await updateScheduleStatus(storeId, 'running', 'Starting orchestrator...', runId);
- // 1. Load store with dispensary info
- const store = await getStoreWithDispensary(storeId);
- if (!store) {
- throw new Error(`Store ${storeId} not found`);
- }
- result.dispensaryId = store.dispensary_id;
- // 2. Check if dispensary is linked
- if (!store.dispensary_id) {
- result.status = 'error';
- result.summary = 'No dispensary linked - cannot determine provider';
- result.error = 'Store is not linked to a dispensary. Link it in the Dispensaries page.';
- await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
- result.durationMs = Date.now() - startTime;
- return result;
- }
- // 3. Check if provider detection is needed
- const needsDetection = await checkNeedsDetection(store);
- if (needsDetection) {
- // Run provider detection
- const websiteUrl = store.dispensary_menu_url || store.dispensary_website;
- if (!websiteUrl) {
- result.status = 'error';
- result.summary = 'No website URL available for detection';
- result.error = 'Dispensary has no menu_url or website configured';
- await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error);
- result.durationMs = Date.now() - startTime;
- return result;
- }
- await updateScheduleStatus(storeId, 'running', 'Running provider detection...', runId);
- const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl);
- result.detectionRan = true;
- result.detectionResult = detectionResult;
- // Save detection results to dispensary
- await (0, intelligence_detector_1.updateAllCategoryProviders)(store.dispensary_id, detectionResult);
- crawler_logger_1.crawlerLogger.providerDetected({
- dispensary_id: store.dispensary_id,
- dispensary_name: store.dispensary_name || store.name,
- detected_provider: detectionResult.product.provider,
- confidence: detectionResult.product.confidence,
- detection_method: 'orchestrator_run',
- menu_url: websiteUrl,
- category: 'product',
- });
- // Refresh store info after detection
- const updatedStore = await getStoreWithDispensary(storeId);
- if (updatedStore) {
- Object.assign(store, updatedStore);
- }
- }
- // 4. Determine crawl type and run
- const provider = store.product_provider;
- const mode = store.product_crawler_mode;
- if (provider === 'dutchie' && mode === 'production') {
- // Production Dutchie crawl - now uses the new dutchie-az GraphQL pipeline
- await updateScheduleStatus(storeId, 'running', 'Running Dutchie GraphQL crawl (dutchie-az)...', runId);
- try {
- // Look up the dispensary in the dutchie-az database
- // The dutchie-az pipeline has its own dispensaries table
- // We try multiple matching strategies: name, slug, or platform_dispensary_id
- const dispensaryResult = await (0, connection_1.query)(`SELECT * FROM dispensaries
- WHERE name ILIKE $1
- OR slug ILIKE $2
- LIMIT 1`, [store.dispensary_name, store.slug]);
- if (dispensaryResult.rows.length === 0) {
- throw new Error(`Dispensary not found in dutchie-az database. ` +
- `You must add this dispensary to the dutchie-az pipeline first. ` +
- `Store: ${store.name} (${store.dispensary_name})`);
- }
- const dutchieDispensary = dispensaryResult.rows[0];
- // Run the new dutchie-az GraphQL crawler
- const crawlResult = await (0, product_crawler_1.crawlDispensaryProducts)(dutchieDispensary, 'rec', { useBothModes: true });
- result.crawlRan = true;
- result.crawlType = 'production';
- result.productsFound = crawlResult.productsFound ?? undefined;
- result.productsNew = crawlResult.productsUpserted ?? undefined;
- result.productsUpdated = crawlResult.snapshotsCreated ?? undefined;
- if (crawlResult.success) {
- const detectionPart = result.detectionRan ? 'Detection + ' : '';
- result.summary = `${detectionPart}Dutchie GraphQL crawl (${crawlResult.productsFound || 0} items, ${crawlResult.productsUpserted || 0} upserted, ${crawlResult.snapshotsCreated || 0} snapshots)`;
- result.status = 'success';
- // Update store's last_scraped_at
- await migrate_1.pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]);
- crawler_logger_1.crawlerLogger.jobCompleted({
- job_id: 0, // Orchestrator doesn't create traditional jobs
- store_id: storeId,
- store_name: store.name,
- duration_ms: crawlResult.durationMs,
- products_found: crawlResult.productsFound || 0,
- products_new: crawlResult.productsUpserted || 0,
- products_updated: crawlResult.snapshotsCreated || 0,
- provider: 'dutchie',
- });
- }
- else {
- throw new Error(crawlResult.errorMessage || 'Crawl failed');
- }
- }
- catch (crawlError) {
- result.status = 'error';
- result.error = crawlError.message;
- result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`;
- result.crawlRan = true;
- result.crawlType = 'production';
- crawler_logger_1.crawlerLogger.jobFailed({
- job_id: 0,
- store_id: storeId,
- store_name: store.name,
- duration_ms: Date.now() - startTime,
- error_message: crawlError.message,
- provider: 'dutchie',
- });
- }
- }
- else if (provider && provider !== 'unknown') {
- // Sandbox crawl for non-Dutchie or sandbox mode
- await updateScheduleStatus(storeId, 'running', `Running ${provider} sandbox crawl...`, runId);
- try {
- const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(store.dispensary_id);
- result.crawlRan = true;
- result.crawlType = 'sandbox';
- result.productsFound = sandboxResult.data?.productsExtracted || 0;
- const detectionPart = result.detectionRan ? 'Detection + ' : '';
- if (sandboxResult.success) {
- result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`;
- result.status = 'sandbox_only';
- }
- else {
- result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`;
- result.status = 'error';
- result.error = sandboxResult.message;
- }
- }
- catch (sandboxError) {
- result.status = 'error';
- result.error = sandboxError.message;
- result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`;
- result.crawlRan = true;
- result.crawlType = 'sandbox';
- }
- }
- else {
- // No provider detected - detection only
- if (result.detectionRan) {
- result.summary = `Detection complete: provider=${store.product_provider || 'unknown'}, confidence=${store.product_confidence || 0}%`;
- result.status = 'detection_only';
- }
- else {
- result.summary = 'No provider detected and no crawl possible';
- result.status = 'error';
- result.error = 'Could not determine menu provider';
- }
- }
- }
- catch (error) {
- result.status = 'error';
- result.error = error.message;
- result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`;
- crawler_logger_1.crawlerLogger.queueFailure({
- queue_type: 'orchestrator',
- error_message: error.message,
- });
- }
- result.durationMs = Date.now() - startTime;
- // Update final schedule status
- await updateScheduleStatus(storeId, result.status, result.summary, runId, result.error);
- // Create a crawl_job record for tracking
- await createOrchestratorJobRecord(storeId, result);
- return result;
-}
-// ========================================
-// Helper Functions
-// ========================================
-async function getStoreWithDispensary(storeId) {
- const result = await migrate_1.pool.query(`SELECT
- s.id, s.name, s.slug, s.timezone, s.dispensary_id,
- d.name as dispensary_name,
- d.menu_url as dispensary_menu_url,
- d.website as dispensary_website,
- d.product_provider,
- d.product_confidence,
- d.product_crawler_mode,
- d.last_product_scan_at
- FROM stores s
- LEFT JOIN dispensaries d ON d.id = s.dispensary_id
- WHERE s.id = $1`, [storeId]);
- return result.rows[0] || null;
-}
-async function checkNeedsDetection(store) {
- // No dispensary = can't detect
- if (!store.dispensary_id)
- return false;
- // No provider = definitely needs detection
- if (!store.product_provider)
- return true;
- // Unknown provider = needs detection
- if (store.product_provider === 'unknown')
- return true;
- // Low confidence = needs re-detection
- if (store.product_confidence !== null && store.product_confidence < 50)
- return true;
- // Stale detection (> 7 days) = needs refresh
- if (store.last_product_scan_at) {
- const daysSince = (Date.now() - new Date(store.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24);
- if (daysSince > 7)
- return true;
- }
- return false;
-}
-async function updateScheduleStatus(storeId, status, summary, runId, error) {
- await migrate_1.pool.query(`INSERT INTO store_crawl_schedule (store_id, last_status, last_summary, last_run_at, last_error)
- VALUES ($1, $2, $3, NOW(), $4)
- ON CONFLICT (store_id) DO UPDATE SET
- last_status = $2,
- last_summary = $3,
- last_run_at = NOW(),
- last_error = $4,
- updated_at = NOW()`, [storeId, status, summary, error || null]);
-}
-async function getLatestCrawlStats(storeId) {
- // Get count of products for this store
- const result = await migrate_1.pool.query(`SELECT
- COUNT(*) as total,
- COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '1 hour') as recent_new,
- COUNT(*) FILTER (WHERE updated_at > NOW() - INTERVAL '1 hour' AND created_at < NOW() - INTERVAL '1 hour') as recent_updated
- FROM products
- WHERE store_id = $1`, [storeId]);
- return {
- products_found: parseInt(result.rows[0]?.total || '0'),
- products_new: parseInt(result.rows[0]?.recent_new || '0'),
- products_updated: parseInt(result.rows[0]?.recent_updated || '0'),
- };
-}
-async function createOrchestratorJobRecord(storeId, result) {
- await migrate_1.pool.query(`INSERT INTO crawl_jobs (
- store_id, job_type, trigger_type, status, priority,
- scheduled_at, started_at, completed_at,
- products_found, products_new, products_updated,
- error_message, orchestrator_run_id, detection_result
- ) VALUES (
- $1, 'orchestrator', 'manual', $2, 100,
- NOW(), NOW(), NOW(),
- $3, $4, $5,
- $6, $7, $8
- )`, [
- storeId,
- result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed',
- result.productsFound || null,
- result.productsNew || null,
- result.productsUpdated || null,
- result.error || null,
- result.runId,
- result.detectionResult ? JSON.stringify({
- product_provider: result.detectionResult.product.provider,
- product_confidence: result.detectionResult.product.confidence,
- product_mode: result.detectionResult.product.mode,
- }) : null,
- ]);
-}
-// ========================================
-// Batch Orchestration
-// ========================================
-/**
- * Run orchestrator for multiple stores
- */
-async function runBatchOrchestrator(storeIds, concurrency = 3) {
- const results = [];
- // Process in batches
- for (let i = 0; i < storeIds.length; i += concurrency) {
- const batch = storeIds.slice(i, i + concurrency);
- const batchResults = await Promise.all(batch.map(storeId => runStoreCrawlOrchestrator(storeId)));
- results.push(...batchResults);
- }
- return results;
-}
-/**
- * Get stores that are due for orchestration
- */
-async function getStoresDueForOrchestration(limit = 10) {
- const result = await migrate_1.pool.query(`SELECT s.id
- FROM stores s
- LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id
- WHERE s.active = TRUE
- AND s.scrape_enabled = TRUE
- AND COALESCE(scs.enabled, TRUE) = TRUE
- AND (
- scs.last_run_at IS NULL
- OR scs.last_run_at < NOW() - (COALESCE(scs.interval_hours, 4) || ' hours')::INTERVAL
- )
- AND (scs.last_status IS NULL OR scs.last_status NOT IN ('running', 'pending'))
- ORDER BY COALESCE(scs.priority, 0) DESC, scs.last_run_at ASC NULLS FIRST
- LIMIT $1`, [limit]);
- return result.rows.map(row => row.id);
-}
diff --git a/backend/dist/utils/age-gate-playwright.js b/backend/dist/utils/age-gate-playwright.js
deleted file mode 100644
index ac32cce4..00000000
--- a/backend/dist/utils/age-gate-playwright.js
+++ /dev/null
@@ -1,175 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.hasAgeGatePlaywright = hasAgeGatePlaywright;
-exports.bypassAgeGatePlaywright = bypassAgeGatePlaywright;
-exports.detectStateFromUrlPlaywright = detectStateFromUrlPlaywright;
-const logger_1 = require("../services/logger");
-/**
- * Detects if a Playwright page has an age verification gate
- */
-async function hasAgeGatePlaywright(page) {
- try {
- const url = page.url();
- const bodyText = await page.textContent('body') || '';
- const hasAgeVerification = url.includes('/age-gate') ||
- bodyText.includes('age verification') ||
- bodyText.includes('Please select your state') ||
- bodyText.includes('are you 21') ||
- bodyText.includes('are you 18') ||
- bodyText.includes('Enter your date of birth') ||
- bodyText.toLowerCase().includes('verify your age');
- return hasAgeVerification;
- }
- catch (err) {
- logger_1.logger.warn('age-gate', `Error detecting age gate: ${err}`);
- return false;
- }
-}
-/**
- * Attempts to bypass an age gate using Playwright
- * Handles multiple age gate patterns including Curaleaf's complex React-based gate
- *
- * @param page - Playwright page object
- * @param state - State to select (e.g., 'Arizona', 'California')
- * @returns Promise - true if bypass succeeded, false otherwise
- */
-async function bypassAgeGatePlaywright(page, state = 'Arizona') {
- try {
- const hasGate = await hasAgeGatePlaywright(page);
- if (!hasGate) {
- logger_1.logger.info('age-gate', 'No age gate detected');
- return true;
- }
- logger_1.logger.info('age-gate', `Age gate detected - attempting to bypass with state: ${state}...`);
- // Wait for age gate to fully render
- await page.waitForTimeout(2000);
- // Method 1: Curaleaf-style (state dropdown + "I'm over 21" button)
- try {
- const stateButton = page.locator('button#state, button[id="state"]').first();
- const stateButtonExists = await stateButton.count() > 0;
- if (stateButtonExists) {
- logger_1.logger.info('age-gate', 'Found Curaleaf-style state dropdown...');
- await stateButton.click();
- await page.waitForTimeout(1000);
- // Select state
- const stateOption = page.locator('[role="option"]').filter({ hasText: new RegExp(`^${state}$`, 'i') });
- const stateExists = await stateOption.count() > 0;
- if (stateExists) {
- logger_1.logger.info('age-gate', `Clicking ${state} option...`);
- await stateOption.first().click();
- await page.waitForTimeout(2000);
- // Look for "I'm over 21" button
- const ageButton = page.locator('button').filter({ hasText: /I'm over 21|I am 21|I'm 21|over 21/i });
- const ageButtonExists = await ageButton.count() > 0;
- if (ageButtonExists) {
- logger_1.logger.info('age-gate', 'Clicking age verification button...');
- await ageButton.first().click();
- await page.waitForLoadState('domcontentloaded', { timeout: 15000 });
- await page.waitForTimeout(3000);
- // Check if we successfully bypassed
- const finalUrl = page.url();
- if (!finalUrl.includes('/age-gate')) {
- logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
- return true;
- }
- }
- }
- }
- }
- catch (e) {
- logger_1.logger.warn('age-gate', `Curaleaf method failed: ${e}`);
- }
- // Method 2: Simple "Yes" or "I'm 21" button (for simpler age gates)
- try {
- const simpleButton = page.locator('button, a, [role="button"]').filter({
- hasText: /yes|i am 21|i'm 21|enter the site|continue|confirm/i
- });
- const simpleExists = await simpleButton.count() > 0;
- if (simpleExists) {
- logger_1.logger.info('age-gate', 'Found simple age gate button...');
- await simpleButton.first().click();
- await page.waitForLoadState('domcontentloaded', { timeout: 10000 });
- await page.waitForTimeout(2000);
- const finalUrl = page.url();
- if (!finalUrl.includes('/age-gate')) {
- logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
- return true;
- }
- }
- }
- catch (e) {
- logger_1.logger.warn('age-gate', `Simple button method failed: ${e}`);
- }
- // Method 3: Standard select dropdown
- try {
- const selectExists = await page.locator('select').count() > 0;
- if (selectExists) {
- logger_1.logger.info('age-gate', 'Found select dropdown...');
- const select = page.locator('select').first();
- await select.selectOption({ label: state });
- await page.waitForTimeout(1000);
- // Look for submit button
- const submitButton = page.locator('button[type="submit"], input[type="submit"]');
- const submitExists = await submitButton.count() > 0;
- if (submitExists) {
- await submitButton.first().click();
- await page.waitForLoadState('domcontentloaded', { timeout: 10000 });
- await page.waitForTimeout(2000);
- const finalUrl = page.url();
- if (!finalUrl.includes('/age-gate')) {
- logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
- return true;
- }
- }
- }
- }
- catch (e) {
- logger_1.logger.warn('age-gate', `Select dropdown method failed: ${e}`);
- }
- // Verify final state
- const finalUrl = page.url();
- if (finalUrl.includes('/age-gate')) {
- logger_1.logger.error('age-gate', `❌ Age gate bypass failed - still at: ${finalUrl}`);
- return false;
- }
- logger_1.logger.info('age-gate', `✅ Age gate bypass successful`);
- return true;
- }
- catch (err) {
- logger_1.logger.error('age-gate', `Error bypassing age gate: ${err}`);
- return false;
- }
-}
-/**
- * Helper to detect the state from a store URL
- */
-function detectStateFromUrlPlaywright(url) {
- const stateMap = {
- '-az-': 'Arizona',
- 'arizona': 'Arizona',
- '-ca-': 'California',
- 'california': 'California',
- '-co-': 'Colorado',
- 'colorado': 'Colorado',
- '-fl-': 'Florida',
- 'florida': 'Florida',
- '-il-': 'Illinois',
- 'illinois': 'Illinois',
- '-ma-': 'Massachusetts',
- '-mi-': 'Michigan',
- '-nv-': 'Nevada',
- '-nj-': 'New Jersey',
- '-ny-': 'New York',
- '-or-': 'Oregon',
- '-pa-': 'Pennsylvania',
- '-wa-': 'Washington',
- };
- const lowerUrl = url.toLowerCase();
- for (const [pattern, stateName] of Object.entries(stateMap)) {
- if (lowerUrl.includes(pattern)) {
- return stateName;
- }
- }
- // Default to Arizona
- return 'Arizona';
-}
diff --git a/backend/dist/utils/age-gate.js b/backend/dist/utils/age-gate.js
deleted file mode 100644
index 392e7b6e..00000000
--- a/backend/dist/utils/age-gate.js
+++ /dev/null
@@ -1,263 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.setAgeGateCookies = setAgeGateCookies;
-exports.hasAgeGate = hasAgeGate;
-exports.bypassAgeGate = bypassAgeGate;
-exports.detectStateFromUrl = detectStateFromUrl;
-const logger_1 = require("../services/logger");
-/**
- * Sets age gate bypass cookies before navigating to a page
- * This should be called BEFORE page.goto() to prevent the age gate from showing
- *
- * @param page - Puppeteer page object
- * @param url - URL to extract domain from
- * @param state - State to set in cookie
- */
-async function setAgeGateCookies(page, url, state = 'Arizona') {
- try {
- const urlObj = new URL(url);
- const domain = urlObj.hostname.replace('www.', '');
- // Set cookies that bypass age gates
- await page.setCookie({
- name: 'age_gate_passed',
- value: 'true',
- domain: `.${domain}`,
- path: '/',
- expires: Date.now() / 1000 + 365 * 24 * 60 * 60, // 1 year
- httpOnly: false,
- secure: false,
- sameSite: 'Lax'
- }, {
- name: 'selected_state',
- value: state,
- domain: `.${domain}`,
- path: '/',
- expires: Date.now() / 1000 + 365 * 24 * 60 * 60, // 1 year
- httpOnly: false,
- secure: false,
- sameSite: 'Lax'
- }, {
- name: 'age_verified',
- value: 'true',
- domain: `.${domain}`,
- path: '/',
- expires: Date.now() / 1000 + 365 * 24 * 60 * 60,
- httpOnly: false,
- secure: false,
- sameSite: 'Lax'
- });
- logger_1.logger.info('age-gate', `Set age gate bypass cookies for ${domain} (state: ${state})`);
- }
- catch (err) {
- logger_1.logger.warn('age-gate', `Failed to set age gate cookies: ${err}`);
- }
-}
-/**
- * Detects if a page has an age verification gate
- */
-async function hasAgeGate(page) {
- return await page.evaluate(() => {
- const bodyText = document.body.textContent || '';
- const hasAgeVerification = bodyText.includes('age verification') ||
- bodyText.includes('Please select your state') ||
- bodyText.includes('are you 21') ||
- bodyText.includes('are you 18') ||
- bodyText.includes('Enter your date of birth') ||
- bodyText.toLowerCase().includes('verify');
- return hasAgeVerification;
- });
-}
-/**
- * Attempts to bypass an age gate by selecting the appropriate state
- * Works with multiple age gate patterns used by cannabis dispensaries
- *
- * @param page - Puppeteer page object
- * @param state - State to select (e.g., 'Arizona', 'California'). Defaults to 'Arizona'
- * @returns Promise - true if bypass was attempted, false if no age gate found
- */
-async function bypassAgeGate(page, state = 'Arizona', useSavedCookies = true) {
- try {
- const hasGate = await hasAgeGate(page);
- if (!hasGate) {
- logger_1.logger.info('age-gate', 'No age gate detected');
- return false;
- }
- logger_1.logger.info('age-gate', `Age gate detected - attempting to bypass with state: ${state}...`);
- // Wait a bit for React components to fully render
- await page.waitForTimeout(2000);
- // Try Method 0: Custom dropdown button (shadcn/radix style - Curaleaf)
- let customDropdownWorked = false;
- try {
- // Click button to open dropdown
- const dropdownButton = await page.$('button#state, button[id="state"]');
- if (dropdownButton) {
- logger_1.logger.info('age-gate', 'Found state dropdown button, clicking...');
- await dropdownButton.click();
- await page.waitForTimeout(800);
- // Click the state option and trigger React events
- const stateClicked = await page.evaluate((selectedState) => {
- const options = Array.from(document.querySelectorAll('[role="option"]'));
- const stateOption = options.find(el => el.textContent?.toLowerCase() === selectedState.toLowerCase());
- if (stateOption instanceof HTMLElement) {
- // Trigger multiple events that React might be listening for
- stateOption.dispatchEvent(new MouseEvent('mousedown', { bubbles: true }));
- stateOption.dispatchEvent(new MouseEvent('mouseup', { bubbles: true }));
- stateOption.click();
- stateOption.dispatchEvent(new MouseEvent('click', { bubbles: true }));
- stateOption.dispatchEvent(new Event('change', { bubbles: true }));
- stateOption.dispatchEvent(new Event('input', { bubbles: true }));
- return true;
- }
- return false;
- }, state);
- if (stateClicked) {
- logger_1.logger.info('age-gate', `Clicked ${state} option with React events`);
- await page.waitForTimeout(1000);
- // Look for and click any submit/continue button that appeared
- const submitClicked = await page.evaluate(() => {
- const buttons = Array.from(document.querySelectorAll('button, [role="button"], a'));
- const submitBtn = buttons.find(el => {
- const text = el.textContent?.toLowerCase() || '';
- const ariaLabel = el.getAttribute('aria-label')?.toLowerCase() || '';
- return text.includes('continue') || text.includes('submit') ||
- text.includes('enter') || text.includes('confirm') ||
- ariaLabel.includes('continue') || ariaLabel.includes('submit');
- });
- if (submitBtn instanceof HTMLElement && submitBtn.offsetParent !== null) {
- submitBtn.click();
- return true;
- }
- return false;
- });
- if (submitClicked) {
- logger_1.logger.info('age-gate', `Found and clicked submit button`);
- }
- customDropdownWorked = true;
- }
- }
- }
- catch (e) {
- logger_1.logger.warn('age-gate', `Dropdown method failed: ${e}`);
- }
- // Try Method 1: Dropdown select
- const selectFound = await page.evaluate((selectedState) => {
- const selects = Array.from(document.querySelectorAll('select'));
- for (const select of selects) {
- const options = Array.from(select.options);
- const stateOption = options.find(opt => opt.text.toLowerCase().includes(selectedState.toLowerCase()) ||
- opt.value.toLowerCase().includes(selectedState.toLowerCase()));
- if (stateOption) {
- select.value = stateOption.value;
- select.dispatchEvent(new Event('change', { bubbles: true }));
- select.dispatchEvent(new Event('input', { bubbles: true }));
- return true;
- }
- }
- return false;
- }, state);
- // Try Method 2: State button/card (click state, then click confirm)
- let stateClicked = false;
- if (!selectFound) {
- stateClicked = await page.evaluate((selectedState) => {
- const allElements = Array.from(document.querySelectorAll('button, a, div, span, [role="button"], [class*="state"], [class*="State"], [class*="card"], [class*="option"]'));
- const stateButton = allElements.find(el => el.textContent?.toLowerCase().includes(selectedState.toLowerCase()));
- if (stateButton instanceof HTMLElement) {
- stateButton.click();
- return true;
- }
- return false;
- }, state);
- if (stateClicked) {
- // Wait for confirm button to appear and click it
- await page.waitForTimeout(1000);
- await page.evaluate(() => {
- const confirmBtns = Array.from(document.querySelectorAll('button, a, [role="button"]'));
- const confirmBtn = confirmBtns.find(el => {
- const text = el.textContent?.toLowerCase() || '';
- return text.includes('enter') || text.includes('continue') || text.includes('yes') || text.includes('confirm');
- });
- if (confirmBtn instanceof HTMLElement) {
- confirmBtn.click();
- }
- });
- }
- }
- // Try Method 3: Direct "Yes" or age confirmation button
- const yesClicked = await page.evaluate(() => {
- const confirmButtons = Array.from(document.querySelectorAll('button, a, [role="button"]'));
- const yesButton = confirmButtons.find(el => {
- const text = el.textContent?.toLowerCase() || '';
- return text.includes('yes') ||
- text.includes('i am 21') ||
- text.includes('i am 18') ||
- text.includes('enter the site') ||
- text.includes('enter') ||
- text.includes('continue');
- });
- if (yesButton instanceof HTMLElement) {
- yesButton.click();
- return true;
- }
- return false;
- });
- const bypassed = customDropdownWorked || selectFound || stateClicked || yesClicked;
- if (bypassed) {
- // Wait for navigation to complete after clicking age gate button
- logger_1.logger.info('age-gate', `Waiting for navigation after age gate bypass...`);
- try {
- await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 15000 });
- }
- catch (e) {
- // Navigation might not trigger, that's ok - wait a bit anyway
- await page.waitForTimeout(3000);
- }
- // Give the page extra time to load content
- await page.waitForTimeout(3000);
- // Verify we actually bypassed by checking the URL
- const finalUrl = page.url();
- if (finalUrl.includes('/age-gate')) {
- logger_1.logger.error('age-gate', `❌ Age gate bypass failed - still at age gate URL: ${finalUrl}`);
- return false;
- }
- logger_1.logger.info('age-gate', `✅ Age gate bypass completed - now at: ${finalUrl}`);
- return true;
- }
- else {
- logger_1.logger.warn('age-gate', `Could not find ${state} option or confirmation button in age gate`);
- return false;
- }
- }
- catch (err) {
- logger_1.logger.error('age-gate', `Error bypassing age gate: ${err}`);
- return false;
- }
-}
-/**
- * Helper to detect the state from a store URL
- * @param url - Store URL
- * @returns State name (e.g., 'Arizona', 'California')
- */
-function detectStateFromUrl(url) {
- const stateMap = {
- '-az-': 'Arizona',
- '-ca-': 'California',
- '-co-': 'Colorado',
- '-fl-': 'Florida',
- '-il-': 'Illinois',
- '-ma-': 'Massachusetts',
- '-mi-': 'Michigan',
- '-nv-': 'Nevada',
- '-nj-': 'New Jersey',
- '-ny-': 'New York',
- '-or-': 'Oregon',
- '-pa-': 'Pennsylvania',
- '-wa-': 'Washington',
- };
- for (const [pattern, stateName] of Object.entries(stateMap)) {
- if (url.toLowerCase().includes(pattern)) {
- return stateName;
- }
- }
- // Default to Arizona if state not detected
- return 'Arizona';
-}
diff --git a/backend/dist/utils/image-storage.js b/backend/dist/utils/image-storage.js
deleted file mode 100644
index 8f346232..00000000
--- a/backend/dist/utils/image-storage.js
+++ /dev/null
@@ -1,296 +0,0 @@
-"use strict";
-/**
- * Local Image Storage Utility
- *
- * Downloads and stores product images to local filesystem.
- * Replaces MinIO-based storage with simple local file storage.
- *
- * Directory structure:
- * /images/products//.webp
- * /images/products//-thumb.webp
- * /images/products//-medium.webp
- * /images/brands/.webp
- */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.downloadProductImage = downloadProductImage;
-exports.downloadBrandLogo = downloadBrandLogo;
-exports.imageExists = imageExists;
-exports.deleteProductImages = deleteProductImages;
-exports.initializeImageStorage = initializeImageStorage;
-exports.getStorageStats = getStorageStats;
-const axios_1 = __importDefault(require("axios"));
-const sharp_1 = __importDefault(require("sharp"));
-const fs = __importStar(require("fs/promises"));
-const path = __importStar(require("path"));
-const crypto_1 = require("crypto");
-// Base path for image storage - configurable via env
-const IMAGES_BASE_PATH = process.env.IMAGES_PATH || '/app/public/images';
-// Public URL base for serving images
-const IMAGES_PUBLIC_URL = process.env.IMAGES_PUBLIC_URL || '/images';
-/**
- * Ensure a directory exists
- */
-async function ensureDir(dirPath) {
- try {
- await fs.mkdir(dirPath, { recursive: true });
- }
- catch (error) {
- if (error.code !== 'EEXIST')
- throw error;
- }
-}
-/**
- * Generate a short hash from a URL for deduplication
- */
-function hashUrl(url) {
- return (0, crypto_1.createHash)('md5').update(url).digest('hex').substring(0, 8);
-}
-/**
- * Download an image from a URL and return the buffer
- */
-async function downloadImage(imageUrl) {
- const response = await axios_1.default.get(imageUrl, {
- responseType: 'arraybuffer',
- timeout: 30000,
- headers: {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
- 'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
- },
- });
- return Buffer.from(response.data);
-}
-/**
- * Process and save image in multiple sizes
- * Returns the file paths relative to IMAGES_BASE_PATH
- */
-async function processAndSaveImage(buffer, outputDir, baseFilename) {
- await ensureDir(outputDir);
- const fullPath = path.join(outputDir, `${baseFilename}.webp`);
- const mediumPath = path.join(outputDir, `${baseFilename}-medium.webp`);
- const thumbPath = path.join(outputDir, `${baseFilename}-thumb.webp`);
- // Process images in parallel
- const [fullBuffer, mediumBuffer, thumbBuffer] = await Promise.all([
- // Full: max 1200x1200, high quality
- (0, sharp_1.default)(buffer)
- .resize(1200, 1200, { fit: 'inside', withoutEnlargement: true })
- .webp({ quality: 85 })
- .toBuffer(),
- // Medium: 600x600
- (0, sharp_1.default)(buffer)
- .resize(600, 600, { fit: 'inside', withoutEnlargement: true })
- .webp({ quality: 80 })
- .toBuffer(),
- // Thumb: 200x200
- (0, sharp_1.default)(buffer)
- .resize(200, 200, { fit: 'inside', withoutEnlargement: true })
- .webp({ quality: 75 })
- .toBuffer(),
- ]);
- // Save all sizes
- await Promise.all([
- fs.writeFile(fullPath, fullBuffer),
- fs.writeFile(mediumPath, mediumBuffer),
- fs.writeFile(thumbPath, thumbBuffer),
- ]);
- const totalBytes = fullBuffer.length + mediumBuffer.length + thumbBuffer.length;
- return {
- full: fullPath,
- medium: mediumPath,
- thumb: thumbPath,
- totalBytes,
- };
-}
-/**
- * Convert a file path to a public URL
- */
-function pathToUrl(filePath) {
- const relativePath = filePath.replace(IMAGES_BASE_PATH, '');
- return `${IMAGES_PUBLIC_URL}${relativePath}`;
-}
-/**
- * Download and store a product image locally
- *
- * @param imageUrl - The third-party image URL to download
- * @param dispensaryId - The dispensary ID (for directory organization)
- * @param productId - The product ID or external ID (for filename)
- * @returns Download result with local URLs
- */
-async function downloadProductImage(imageUrl, dispensaryId, productId) {
- try {
- if (!imageUrl) {
- return { success: false, error: 'No image URL provided' };
- }
- // Download the image
- const buffer = await downloadImage(imageUrl);
- // Organize by dispensary ID
- const outputDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId));
- // Use product ID + URL hash for uniqueness
- const urlHash = hashUrl(imageUrl);
- const baseFilename = `${productId}-${urlHash}`;
- // Process and save
- const result = await processAndSaveImage(buffer, outputDir, baseFilename);
- return {
- success: true,
- urls: {
- full: pathToUrl(result.full),
- medium: pathToUrl(result.medium),
- thumb: pathToUrl(result.thumb),
- },
- bytesDownloaded: result.totalBytes,
- };
- }
- catch (error) {
- return {
- success: false,
- error: error.message || 'Failed to download image',
- };
- }
-}
-/**
- * Download and store a brand logo locally
- *
- * @param logoUrl - The brand logo URL
- * @param brandId - The brand ID or slug
- * @returns Download result with local URL
- */
-async function downloadBrandLogo(logoUrl, brandId) {
- try {
- if (!logoUrl) {
- return { success: false, error: 'No logo URL provided' };
- }
- // Download the image
- const buffer = await downloadImage(logoUrl);
- // Brand logos go in /images/brands/
- const outputDir = path.join(IMAGES_BASE_PATH, 'brands');
- // Sanitize brand ID for filename
- const safeBrandId = brandId.replace(/[^a-zA-Z0-9-_]/g, '_');
- const urlHash = hashUrl(logoUrl);
- const baseFilename = `${safeBrandId}-${urlHash}`;
- // Process and save (single size for logos)
- await ensureDir(outputDir);
- const logoPath = path.join(outputDir, `${baseFilename}.webp`);
- const logoBuffer = await (0, sharp_1.default)(buffer)
- .resize(400, 400, { fit: 'inside', withoutEnlargement: true })
- .webp({ quality: 85 })
- .toBuffer();
- await fs.writeFile(logoPath, logoBuffer);
- return {
- success: true,
- urls: {
- full: pathToUrl(logoPath),
- medium: pathToUrl(logoPath),
- thumb: pathToUrl(logoPath),
- },
- bytesDownloaded: logoBuffer.length,
- };
- }
- catch (error) {
- return {
- success: false,
- error: error.message || 'Failed to download brand logo',
- };
- }
-}
-/**
- * Check if a local image already exists
- */
-async function imageExists(dispensaryId, productId, imageUrl) {
- const urlHash = hashUrl(imageUrl);
- const imagePath = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId), `${productId}-${urlHash}.webp`);
- try {
- await fs.access(imagePath);
- return true;
- }
- catch {
- return false;
- }
-}
-/**
- * Delete a product's local images
- */
-async function deleteProductImages(dispensaryId, productId, imageUrl) {
- const productDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId));
- const prefix = imageUrl
- ? `${productId}-${hashUrl(imageUrl)}`
- : String(productId);
- try {
- const files = await fs.readdir(productDir);
- const toDelete = files.filter(f => f.startsWith(prefix));
- await Promise.all(toDelete.map(f => fs.unlink(path.join(productDir, f))));
- }
- catch {
- // Directory might not exist, that's fine
- }
-}
-/**
- * Initialize the image storage directories
- */
-async function initializeImageStorage() {
- await ensureDir(path.join(IMAGES_BASE_PATH, 'products'));
- await ensureDir(path.join(IMAGES_BASE_PATH, 'brands'));
- console.log(`✅ Image storage initialized at ${IMAGES_BASE_PATH}`);
-}
-/**
- * Get storage stats
- */
-async function getStorageStats() {
- const productsDir = path.join(IMAGES_BASE_PATH, 'products');
- const brandsDir = path.join(IMAGES_BASE_PATH, 'brands');
- let productCount = 0;
- let brandCount = 0;
- try {
- const productDirs = await fs.readdir(productsDir);
- for (const dir of productDirs) {
- const files = await fs.readdir(path.join(productsDir, dir));
- productCount += files.filter(f => f.endsWith('.webp') && !f.includes('-')).length;
- }
- }
- catch { /* ignore */ }
- try {
- const brandFiles = await fs.readdir(brandsDir);
- brandCount = brandFiles.filter(f => f.endsWith('.webp')).length;
- }
- catch { /* ignore */ }
- return {
- productsDir,
- brandsDir,
- productCount,
- brandCount,
- };
-}
diff --git a/backend/dist/utils/minio.js b/backend/dist/utils/minio.js
deleted file mode 100644
index 552cdffb..00000000
--- a/backend/dist/utils/minio.js
+++ /dev/null
@@ -1,262 +0,0 @@
-"use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.isMinioEnabled = isMinioEnabled;
-exports.initializeMinio = initializeMinio;
-exports.uploadImageFromUrl = uploadImageFromUrl;
-exports.getImageUrl = getImageUrl;
-exports.deleteImage = deleteImage;
-exports.minioClient = getMinioClient;
-const Minio = __importStar(require("minio"));
-const axios_1 = __importDefault(require("axios"));
-const uuid_1 = require("uuid");
-const sharp_1 = __importDefault(require("sharp"));
-const fs = __importStar(require("fs/promises"));
-const path = __importStar(require("path"));
-let minioClient = null;
-// Check if MinIO is configured
-function isMinioEnabled() {
- return !!process.env.MINIO_ENDPOINT;
-}
-// Local storage path for images when MinIO is not configured
-const LOCAL_IMAGES_PATH = process.env.LOCAL_IMAGES_PATH || '/app/public/images';
-function getMinioClient() {
- if (!minioClient) {
- minioClient = new Minio.Client({
- endPoint: process.env.MINIO_ENDPOINT || 'minio',
- port: parseInt(process.env.MINIO_PORT || '9000'),
- useSSL: process.env.MINIO_USE_SSL === 'true',
- accessKey: process.env.MINIO_ACCESS_KEY || 'minioadmin',
- secretKey: process.env.MINIO_SECRET_KEY || 'minioadmin',
- });
- }
- return minioClient;
-}
-const BUCKET_NAME = process.env.MINIO_BUCKET || 'dutchie';
-async function initializeMinio() {
- // Skip MinIO initialization if not configured
- if (!isMinioEnabled()) {
- console.log('ℹ️ MinIO not configured (MINIO_ENDPOINT not set), using local filesystem storage');
- // Ensure local images directory exists
- try {
- await fs.mkdir(LOCAL_IMAGES_PATH, { recursive: true });
- await fs.mkdir(path.join(LOCAL_IMAGES_PATH, 'products'), { recursive: true });
- console.log(`✅ Local images directory ready: ${LOCAL_IMAGES_PATH}`);
- }
- catch (error) {
- console.error('❌ Failed to create local images directory:', error);
- throw error;
- }
- return;
- }
- try {
- const client = getMinioClient();
- // Check if bucket exists
- const exists = await client.bucketExists(BUCKET_NAME);
- if (!exists) {
- // Create bucket
- await client.makeBucket(BUCKET_NAME, 'us-east-1');
- console.log(`✅ Minio bucket created: ${BUCKET_NAME}`);
- // Set public read policy
- const policy = {
- Version: '2012-10-17',
- Statement: [
- {
- Effect: 'Allow',
- Principal: { AWS: ['*'] },
- Action: ['s3:GetObject'],
- Resource: [`arn:aws:s3:::${BUCKET_NAME}/*`],
- },
- ],
- };
- await client.setBucketPolicy(BUCKET_NAME, JSON.stringify(policy));
- console.log(`✅ Bucket policy set to public read`);
- }
- else {
- console.log(`✅ Minio bucket already exists: ${BUCKET_NAME}`);
- }
- }
- catch (error) {
- console.error('❌ Minio initialization error:', error);
- throw error;
- }
-}
-async function removeBackground(buffer) {
- try {
- // Get image metadata to check if it has an alpha channel
- const metadata = await (0, sharp_1.default)(buffer).metadata();
- // If image already has transparency, trim and optimize it
- if (metadata.hasAlpha) {
- return await (0, sharp_1.default)(buffer)
- .trim() // Remove transparent borders
- .toBuffer();
- }
- // For images without alpha (like JPEGs with solid backgrounds),
- // we'll use a threshold-based approach to detect and remove solid backgrounds
- // This works well for product images on solid color backgrounds
- // Convert to PNG with alpha channel, then flatten with transparency
- const withAlpha = await (0, sharp_1.default)(buffer)
- .ensureAlpha() // Add alpha channel
- .toBuffer();
- // Use threshold to make similar colors transparent (targets solid backgrounds)
- // This is a simple approach - for better results, use remove.bg API or ML models
- return await (0, sharp_1.default)(withAlpha)
- .flatten({ background: { r: 0, g: 0, b: 0, alpha: 0 } })
- .trim()
- .toBuffer();
- }
- catch (error) {
- console.warn('Background removal failed, using original image:', error);
- return buffer;
- }
-}
-async function uploadToLocalFilesystem(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename) {
- const thumbnailPath = `${baseFilename}-thumb.png`;
- const mediumPath = `${baseFilename}-medium.png`;
- const fullPath = `${baseFilename}-full.png`;
- // Ensure the target directory exists (in case initializeMinio wasn't called)
- // Extract directory from baseFilename (e.g., 'products/store-slug' or just 'products')
- const targetDir = path.join(LOCAL_IMAGES_PATH, path.dirname(baseFilename));
- await fs.mkdir(targetDir, { recursive: true });
- await Promise.all([
- fs.writeFile(path.join(LOCAL_IMAGES_PATH, thumbnailPath), thumbnailBuffer),
- fs.writeFile(path.join(LOCAL_IMAGES_PATH, mediumPath), mediumBuffer),
- fs.writeFile(path.join(LOCAL_IMAGES_PATH, fullPath), fullBuffer),
- ]);
- return {
- thumbnail: thumbnailPath,
- medium: mediumPath,
- full: fullPath,
- };
-}
-async function uploadToMinio(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename) {
- const client = getMinioClient();
- const thumbnailPath = `${baseFilename}-thumb.png`;
- const mediumPath = `${baseFilename}-medium.png`;
- const fullPath = `${baseFilename}-full.png`;
- await Promise.all([
- client.putObject(BUCKET_NAME, thumbnailPath, thumbnailBuffer, thumbnailBuffer.length, {
- 'Content-Type': 'image/png',
- }),
- client.putObject(BUCKET_NAME, mediumPath, mediumBuffer, mediumBuffer.length, {
- 'Content-Type': 'image/png',
- }),
- client.putObject(BUCKET_NAME, fullPath, fullBuffer, fullBuffer.length, {
- 'Content-Type': 'image/png',
- }),
- ]);
- return {
- thumbnail: thumbnailPath,
- medium: mediumPath,
- full: fullPath,
- };
-}
-async function uploadImageFromUrl(imageUrl, productId, storeSlug, removeBackgrounds = true) {
- try {
- // Download image
- const response = await axios_1.default.get(imageUrl, { responseType: 'arraybuffer' });
- let buffer = Buffer.from(response.data);
- // Remove background if enabled
- if (removeBackgrounds) {
- buffer = await removeBackground(buffer);
- }
- // Generate unique base filename - organize by store if slug provided
- const storeDir = storeSlug ? `products/${storeSlug}` : 'products';
- const baseFilename = `${storeDir}/${productId}-${(0, uuid_1.v4)()}`;
- // Create multiple sizes with Sharp and convert to WebP/PNG for better compression
- // Use PNG for images with transparency
- const [thumbnailBuffer, mediumBuffer, fullBuffer] = await Promise.all([
- // Thumbnail: 300x300
- (0, sharp_1.default)(buffer)
- .resize(300, 300, { fit: 'inside', background: { r: 0, g: 0, b: 0, alpha: 0 } })
- .png({ quality: 80, compressionLevel: 9 })
- .toBuffer(),
- // Medium: 800x800
- (0, sharp_1.default)(buffer)
- .resize(800, 800, { fit: 'inside', background: { r: 0, g: 0, b: 0, alpha: 0 } })
- .png({ quality: 85, compressionLevel: 9 })
- .toBuffer(),
- // Full: 2000x2000 (optimized)
- (0, sharp_1.default)(buffer)
- .resize(2000, 2000, { fit: 'inside', withoutEnlargement: true, background: { r: 0, g: 0, b: 0, alpha: 0 } })
- .png({ quality: 90, compressionLevel: 9 })
- .toBuffer(),
- ]);
- // Upload to appropriate storage backend
- let result;
- if (isMinioEnabled()) {
- result = await uploadToMinio(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename);
- }
- else {
- result = await uploadToLocalFilesystem(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename);
- }
- console.log(`✅ Uploaded 3 sizes for product ${productId}: ${thumbnailBuffer.length + mediumBuffer.length + fullBuffer.length} bytes total`);
- return result;
- }
- catch (error) {
- console.error('Error uploading image:', error);
- throw error;
- }
-}
-function getImageUrl(imagePath) {
- if (isMinioEnabled()) {
- // Use MinIO endpoint for browser access
- const endpoint = process.env.MINIO_PUBLIC_ENDPOINT || 'http://localhost:9020';
- return `${endpoint}/${BUCKET_NAME}/${imagePath}`;
- }
- else {
- // Use local path - served via Express static middleware
- const publicUrl = process.env.PUBLIC_URL || '';
- return `${publicUrl}/images/${imagePath}`;
- }
-}
-async function deleteImage(imagePath) {
- try {
- if (isMinioEnabled()) {
- const client = getMinioClient();
- await client.removeObject(BUCKET_NAME, imagePath);
- }
- else {
- const fullPath = path.join(LOCAL_IMAGES_PATH, imagePath);
- await fs.unlink(fullPath);
- }
- }
- catch (error) {
- console.error('Error deleting image:', error);
- }
-}
diff --git a/backend/dist/utils/product-normalizer.js b/backend/dist/utils/product-normalizer.js
deleted file mode 100644
index 6d98adcd..00000000
--- a/backend/dist/utils/product-normalizer.js
+++ /dev/null
@@ -1,181 +0,0 @@
-"use strict";
-/**
- * Product Normalizer Utility
- *
- * Functions for normalizing product data to enable consistent matching
- * and prevent duplicate product entries.
- */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.normalizeProductName = normalizeProductName;
-exports.normalizeBrandName = normalizeBrandName;
-exports.normalizeWeight = normalizeWeight;
-exports.generateProductFingerprint = generateProductFingerprint;
-exports.stringSimilarity = stringSimilarity;
-exports.areProductsSimilar = areProductsSimilar;
-/**
- * Normalize product name for matching
- * - Lowercase
- * - Remove punctuation
- * - Remove THC/CBD percentages often appended to names
- * - Remove weight suffixes
- * - Remove emoji
- * - Normalize whitespace
- */
-function normalizeProductName(name) {
- if (!name)
- return '';
- return name
- .toLowerCase()
- .trim()
- // Remove special characters except alphanumeric and spaces
- .replace(/[^\w\s]/g, ' ')
- // Remove common suffixes like THC/CBD percentages appended to names
- .replace(/\s*(thc|cbd|cbg|cbn|tac)\s*[:=]?\s*[\d.]+\s*%?/gi, '')
- // Remove weight/size suffixes often appended
- .replace(/\s*\d+(\.\d+)?\s*(mg|g|oz|ml|gram|grams|ounce|ounces)\b/gi, '')
- // Remove emoji
- .replace(/[\u{1F300}-\u{1F9FF}]/gu, '')
- // Remove "special offer" type suffixes
- .replace(/\s*special\s*offer\s*/gi, '')
- // Normalize multiple spaces to single space
- .replace(/\s+/g, ' ')
- .trim();
-}
-/**
- * Normalize brand name for matching
- */
-function normalizeBrandName(brand) {
- if (!brand)
- return '';
- return brand
- .toLowerCase()
- .trim()
- // Remove special characters
- .replace(/[^\w\s]/g, ' ')
- // Normalize whitespace
- .replace(/\s+/g, ' ')
- .trim();
-}
-/**
- * Normalize weight string to standard format
- * e.g., "3.5 grams" -> "3.5g", "1/8 oz" -> "3.5g"
- */
-function normalizeWeight(weight) {
- if (!weight)
- return '';
- const w = weight.toLowerCase().trim();
- // Handle fractional ounces
- if (w.includes('1/8') || w.includes('eighth')) {
- return '3.5g';
- }
- if (w.includes('1/4') || w.includes('quarter')) {
- return '7g';
- }
- if (w.includes('1/2') || w.includes('half')) {
- return '14g';
- }
- if (w.includes('1 oz') || w === 'oz' || w === '1oz') {
- return '28g';
- }
- // Extract numeric value and unit
- const match = w.match(/([\d.]+)\s*(mg|g|oz|ml|gram|grams?|ounce|ounces?)?/i);
- if (!match)
- return w;
- const value = parseFloat(match[1]);
- let unit = (match[2] || 'g').toLowerCase();
- // Normalize unit names
- unit = unit.replace(/gram(s)?/, 'g').replace(/ounce(s)?/, 'oz');
- // Convert oz to grams for consistency
- if (unit === 'oz') {
- return `${(value * 28).toFixed(1)}g`;
- }
- return `${value}${unit}`;
-}
-/**
- * Generate a matching fingerprint for a product
- * Used for deduplication
- */
-function generateProductFingerprint(name, brand, weight, categoryId) {
- const parts = [
- normalizeProductName(name),
- normalizeBrandName(brand),
- normalizeWeight(weight),
- categoryId?.toString() || ''
- ];
- return parts.filter(Boolean).join('|');
-}
-/**
- * Calculate similarity between two strings (0-100)
- * Uses Levenshtein distance
- */
-function stringSimilarity(str1, str2) {
- if (str1 === str2)
- return 100;
- if (!str1 || !str2)
- return 0;
- const s1 = str1.toLowerCase();
- const s2 = str2.toLowerCase();
- if (s1 === s2)
- return 100;
- const longer = s1.length > s2.length ? s1 : s2;
- const shorter = s1.length > s2.length ? s2 : s1;
- const longerLength = longer.length;
- if (longerLength === 0)
- return 100;
- const distance = levenshteinDistance(longer, shorter);
- return Math.round(((longerLength - distance) / longerLength) * 100);
-}
-/**
- * Levenshtein distance between two strings
- */
-function levenshteinDistance(str1, str2) {
- const m = str1.length;
- const n = str2.length;
- // Create distance matrix
- const dp = Array(m + 1).fill(null).map(() => Array(n + 1).fill(0));
- // Initialize first row and column
- for (let i = 0; i <= m; i++)
- dp[i][0] = i;
- for (let j = 0; j <= n; j++)
- dp[0][j] = j;
- // Fill in the rest
- for (let i = 1; i <= m; i++) {
- for (let j = 1; j <= n; j++) {
- const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
- dp[i][j] = Math.min(dp[i - 1][j] + 1, // deletion
- dp[i][j - 1] + 1, // insertion
- dp[i - 1][j - 1] + cost // substitution
- );
- }
- }
- return dp[m][n];
-}
-/**
- * Check if two products are likely the same
- * Returns confidence score (0-100)
- */
-function areProductsSimilar(product1, product2, threshold = 92) {
- const name1 = normalizeProductName(product1.name);
- const name2 = normalizeProductName(product2.name);
- const nameSimilarity = stringSimilarity(name1, name2);
- // If names are very similar, likely same product
- if (nameSimilarity >= threshold) {
- return { isSimilar: true, confidence: nameSimilarity };
- }
- // Check brand match for additional confidence
- const brand1 = normalizeBrandName(product1.brand);
- const brand2 = normalizeBrandName(product2.brand);
- if (brand1 && brand2 && brand1 === brand2) {
- // Same brand, lower threshold for name match
- if (nameSimilarity >= threshold - 10) {
- return { isSimilar: true, confidence: nameSimilarity + 5 };
- }
- }
- // Check weight match
- const weight1 = normalizeWeight(product1.weight);
- const weight2 = normalizeWeight(product2.weight);
- if (weight1 && weight2 && weight1 === weight2 && nameSimilarity >= threshold - 15) {
- return { isSimilar: true, confidence: nameSimilarity + 3 };
- }
- return { isSimilar: false, confidence: nameSimilarity };
-}
diff --git a/backend/dist/utils/proxyManager.js b/backend/dist/utils/proxyManager.js
deleted file mode 100644
index 688939b4..00000000
--- a/backend/dist/utils/proxyManager.js
+++ /dev/null
@@ -1,112 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.getProxy = getProxy;
-exports.getPhoenixProxy = getPhoenixProxy;
-exports.getStateProxy = getStateProxy;
-exports.getCityProxy = getCityProxy;
-exports.getRandomProxy = getRandomProxy;
-exports.getProxyLocationStats = getProxyLocationStats;
-const migrate_1 = require("../db/migrate");
-const logger_1 = require("../services/logger");
-/**
- * Get an active proxy from the database, optionally filtered by location
- */
-async function getProxy(locationFilter) {
- try {
- let query = `
- SELECT protocol, host, port, username, password
- FROM proxies
- WHERE active = true
- `;
- const params = [];
- let paramIndex = 1;
- if (locationFilter) {
- if (locationFilter.city) {
- query += ` AND LOWER(city) = LOWER($${paramIndex})`;
- params.push(locationFilter.city);
- paramIndex++;
- }
- if (locationFilter.state) {
- query += ` AND LOWER(state) = LOWER($${paramIndex})`;
- params.push(locationFilter.state);
- paramIndex++;
- }
- if (locationFilter.country) {
- query += ` AND LOWER(country) = LOWER($${paramIndex})`;
- params.push(locationFilter.country);
- paramIndex++;
- }
- if (locationFilter.countryCode) {
- query += ` AND LOWER(country_code) = LOWER($${paramIndex})`;
- params.push(locationFilter.countryCode);
- paramIndex++;
- }
- }
- // Use RANDOM() for true randomization instead of least recently used
- query += ` ORDER BY RANDOM() LIMIT 1`;
- const result = await migrate_1.pool.query(query, params);
- if (result.rows.length === 0) {
- logger_1.logger.warn('proxy', `No active proxies found with filter: ${JSON.stringify(locationFilter)}`);
- return null;
- }
- const proxy = result.rows[0];
- return {
- server: `${proxy.protocol}://${proxy.host}:${proxy.port}`,
- username: proxy.username || undefined,
- password: proxy.password || undefined,
- };
- }
- catch (error) {
- logger_1.logger.error('proxy', `Error fetching proxy: ${error}`);
- return null;
- }
-}
-/**
- * Get a proxy from Phoenix, AZ, USA (ideal for Arizona dispensaries)
- */
-async function getPhoenixProxy() {
- return getProxy({ city: 'Phoenix', state: 'Arizona', country: 'United States' });
-}
-/**
- * Get a proxy from a specific US state
- */
-async function getStateProxy(state) {
- return getProxy({ state, country: 'United States' });
-}
-/**
- * Get a proxy from a specific city
- */
-async function getCityProxy(city, state) {
- return getProxy({ city, state });
-}
-/**
- * Get a random active proxy (no location filter)
- */
-async function getRandomProxy() {
- return getProxy();
-}
-/**
- * Get proxy location statistics
- */
-async function getProxyLocationStats() {
- try {
- const result = await migrate_1.pool.query(`
- SELECT
- country,
- state,
- city,
- COUNT(*) as count,
- SUM(CASE WHEN active THEN 1 ELSE 0 END) as active_count
- FROM proxies
- WHERE country IS NOT NULL
- GROUP BY country, state, city
- ORDER BY count DESC
- LIMIT 50
- `);
- return result.rows;
- }
- catch (error) {
- logger_1.logger.error('proxy', `Error fetching proxy stats: ${error}`);
- return [];
- }
-}
diff --git a/backend/dist/utils/stealthBrowser.js b/backend/dist/utils/stealthBrowser.js
deleted file mode 100644
index c6161cac..00000000
--- a/backend/dist/utils/stealthBrowser.js
+++ /dev/null
@@ -1,264 +0,0 @@
-"use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- var desc = Object.getOwnPropertyDescriptor(m, k);
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
- desc = { enumerable: true, get: function() { return m[k]; } };
- }
- Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
- if (k2 === undefined) k2 = k;
- o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
- Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
- o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
- var ownKeys = function(o) {
- ownKeys = Object.getOwnPropertyNames || function (o) {
- var ar = [];
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
- return ar;
- };
- return ownKeys(o);
- };
- return function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
- __setModuleDefault(result, mod);
- return result;
- };
-})();
-var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.createStealthBrowser = createStealthBrowser;
-exports.createStealthContext = createStealthContext;
-exports.randomDelay = randomDelay;
-exports.humanMouseMove = humanMouseMove;
-exports.humanScroll = humanScroll;
-exports.humanType = humanType;
-exports.simulateHumanBehavior = simulateHumanBehavior;
-exports.waitForPageLoad = waitForPageLoad;
-exports.isCloudflareChallenge = isCloudflareChallenge;
-exports.waitForCloudflareChallenge = waitForCloudflareChallenge;
-exports.saveCookies = saveCookies;
-exports.loadCookies = loadCookies;
-const playwright_extra_1 = require("playwright-extra");
-const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
-// Add stealth plugin
-playwright_extra_1.chromium.use((0, puppeteer_extra_plugin_stealth_1.default)());
-/**
- * Create a stealth browser instance with anti-detection measures
- */
-async function createStealthBrowser(options = {}) {
- const launchOptions = {
- headless: options.headless !== false,
- args: [
- '--disable-blink-features=AutomationControlled',
- '--disable-features=IsolateOrigins,site-per-process',
- '--disable-web-security',
- '--disable-features=VizDisplayCompositor',
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-accelerated-2d-canvas',
- '--no-first-run',
- '--no-zygote',
- '--disable-gpu',
- ],
- };
- if (options.proxy) {
- launchOptions.proxy = options.proxy;
- }
- const browser = await playwright_extra_1.chromium.launch(launchOptions);
- return browser;
-}
-/**
- * Create a stealth context with realistic browser fingerprint
- */
-async function createStealthContext(browser, options = {}) {
- const userAgent = options.userAgent ||
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
- const context = await browser.newContext({
- userAgent,
- viewport: { width: 1920, height: 1080 },
- locale: 'en-US',
- timezoneId: 'America/Phoenix',
- permissions: ['geolocation'],
- geolocation: { latitude: 33.4484, longitude: -112.074 }, // Phoenix, AZ
- colorScheme: 'light',
- deviceScaleFactor: 1,
- hasTouch: false,
- isMobile: false,
- javaScriptEnabled: true,
- extraHTTPHeaders: {
- 'Accept-Language': 'en-US,en;q=0.9',
- 'Accept-Encoding': 'gzip, deflate, br',
- Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
- 'Sec-Fetch-Site': 'none',
- 'Sec-Fetch-Mode': 'navigate',
- 'Sec-Fetch-User': '?1',
- 'Sec-Fetch-Dest': 'document',
- 'Upgrade-Insecure-Requests': '1',
- },
- });
- // Set age verification cookies for Dutchie
- await context.addCookies([
- {
- name: 'age_verified',
- value: 'true',
- domain: '.dutchie.com',
- path: '/',
- expires: Math.floor(Date.now() / 1000) + 86400 * 30, // 30 days
- },
- {
- name: 'initial_location',
- value: JSON.stringify({ state: options.state || 'Arizona' }),
- domain: '.dutchie.com',
- path: '/',
- expires: Math.floor(Date.now() / 1000) + 86400 * 30,
- },
- ]);
- return context;
-}
-/**
- * Random delay between min and max milliseconds
- */
-function randomDelay(min, max) {
- const delay = Math.floor(Math.random() * (max - min + 1)) + min;
- return new Promise((resolve) => setTimeout(resolve, delay));
-}
-/**
- * Simulate human-like mouse movement
- */
-async function humanMouseMove(page, x, y) {
- const steps = 20;
- const currentPos = await page.evaluate(() => ({ x: 0, y: 0 }));
- for (let i = 0; i <= steps; i++) {
- const progress = i / steps;
- const easeProgress = easeInOutQuad(progress);
- const nextX = currentPos.x + (x - currentPos.x) * easeProgress;
- const nextY = currentPos.y + (y - currentPos.y) * easeProgress;
- await page.mouse.move(nextX, nextY);
- await randomDelay(5, 15);
- }
-}
-/**
- * Easing function for smooth mouse movement
- */
-function easeInOutQuad(t) {
- return t < 0.5 ? 2 * t * t : -1 + (4 - 2 * t) * t;
-}
-/**
- * Simulate human-like scrolling
- */
-async function humanScroll(page, scrollAmount = 500) {
- const scrollSteps = 10;
- const stepSize = scrollAmount / scrollSteps;
- for (let i = 0; i < scrollSteps; i++) {
- await page.mouse.wheel(0, stepSize);
- await randomDelay(50, 150);
- }
-}
-/**
- * Simulate human-like typing
- */
-async function humanType(page, selector, text) {
- await page.click(selector);
- await randomDelay(100, 300);
- for (const char of text) {
- await page.keyboard.type(char);
- await randomDelay(50, 150);
- }
-}
-/**
- * Random realistic behavior before interacting with page
- */
-async function simulateHumanBehavior(page) {
- // Random small mouse movements
- for (let i = 0; i < 3; i++) {
- const x = Math.random() * 500 + 100;
- const y = Math.random() * 300 + 100;
- await humanMouseMove(page, x, y);
- await randomDelay(200, 500);
- }
- // Small scroll
- await humanScroll(page, 100);
- await randomDelay(300, 700);
-}
-/**
- * Wait for page to be fully loaded with human-like delay
- */
-async function waitForPageLoad(page, timeout = 60000) {
- try {
- await page.waitForLoadState('networkidle', { timeout });
- await randomDelay(500, 1500); // Random delay after load
- }
- catch (error) {
- // If networkidle times out, try domcontentloaded as fallback
- console.log('⚠️ networkidle timeout, waiting for domcontentloaded...');
- await page.waitForLoadState('domcontentloaded', { timeout: 30000 });
- await randomDelay(1000, 2000);
- }
-}
-/**
- * Check if we're on a Cloudflare challenge page
- */
-async function isCloudflareChallenge(page) {
- const title = await page.title();
- const content = await page.content();
- return (title.includes('Cloudflare') ||
- title.includes('Just a moment') ||
- title.includes('Attention Required') ||
- content.includes('challenge-platform') ||
- content.includes('cf-challenge') ||
- content.includes('Checking your browser'));
-}
-/**
- * Wait for Cloudflare challenge to complete
- */
-async function waitForCloudflareChallenge(page, maxWaitMs = 60000) {
- const startTime = Date.now();
- let attempts = 0;
- while (Date.now() - startTime < maxWaitMs) {
- attempts++;
- if (!(await isCloudflareChallenge(page))) {
- console.log(`✅ Cloudflare challenge passed after ${attempts} attempts (${Math.floor((Date.now() - startTime) / 1000)}s)`);
- return true;
- }
- const remaining = Math.floor((maxWaitMs - (Date.now() - startTime)) / 1000);
- console.log(`⏳ Waiting for Cloudflare challenge... (attempt ${attempts}, ${remaining}s remaining)`);
- // Random delay between checks
- await randomDelay(2000, 3000);
- }
- console.log('❌ Cloudflare challenge timeout - may need residential proxy or manual intervention');
- return false;
-}
-/**
- * Save session cookies to file
- */
-async function saveCookies(context, filepath) {
- const cookies = await context.cookies();
- const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
- await fs.writeFile(filepath, JSON.stringify(cookies, null, 2));
-}
-/**
- * Load session cookies from file
- */
-async function loadCookies(context, filepath) {
- try {
- const fs = await Promise.resolve().then(() => __importStar(require('fs/promises')));
- const cookiesString = await fs.readFile(filepath, 'utf-8');
- const cookies = JSON.parse(cookiesString);
- await context.addCookies(cookies);
- return true;
- }
- catch (error) {
- return false;
- }
-}