diff --git a/backend/dist/auth/middleware.js b/backend/dist/auth/middleware.js deleted file mode 100644 index 280a8cf7..00000000 --- a/backend/dist/auth/middleware.js +++ /dev/null @@ -1,113 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.generateToken = generateToken; -exports.verifyToken = verifyToken; -exports.authenticateUser = authenticateUser; -exports.authMiddleware = authMiddleware; -exports.requireRole = requireRole; -const jsonwebtoken_1 = __importDefault(require("jsonwebtoken")); -const bcrypt_1 = __importDefault(require("bcrypt")); -const migrate_1 = require("../db/migrate"); -const JWT_SECRET = process.env.JWT_SECRET || 'change_this_in_production'; -function generateToken(user) { - return jsonwebtoken_1.default.sign({ id: user.id, email: user.email, role: user.role }, JWT_SECRET, { expiresIn: '7d' }); -} -function verifyToken(token) { - try { - return jsonwebtoken_1.default.verify(token, JWT_SECRET); - } - catch (error) { - return null; - } -} -async function authenticateUser(email, password) { - const result = await migrate_1.pool.query('SELECT id, email, password_hash, role FROM users WHERE email = $1', [email]); - if (result.rows.length === 0) { - return null; - } - const user = result.rows[0]; - const isValid = await bcrypt_1.default.compare(password, user.password_hash); - if (!isValid) { - return null; - } - return { - id: user.id, - email: user.email, - role: user.role - }; -} -async function authMiddleware(req, res, next) { - const authHeader = req.headers.authorization; - if (!authHeader || !authHeader.startsWith('Bearer ')) { - return res.status(401).json({ error: 'No token provided' }); - } - const token = authHeader.substring(7); - // Try JWT first - const jwtUser = verifyToken(token); - if (jwtUser) { - req.user = jwtUser; - return next(); - } - // If JWT 
fails, try API token - try { - const result = await migrate_1.pool.query(` - SELECT id, name, rate_limit, active, expires_at, allowed_endpoints - FROM api_tokens - WHERE token = $1 - `, [token]); - if (result.rows.length === 0) { - return res.status(401).json({ error: 'Invalid token' }); - } - const apiToken = result.rows[0]; - // Check if token is active - if (!apiToken.active) { - return res.status(401).json({ error: 'Token is disabled' }); - } - // Check if token is expired - if (apiToken.expires_at && new Date(apiToken.expires_at) < new Date()) { - return res.status(401).json({ error: 'Token has expired' }); - } - // Check allowed endpoints - if (apiToken.allowed_endpoints && apiToken.allowed_endpoints.length > 0) { - const isAllowed = apiToken.allowed_endpoints.some((pattern) => { - // Simple wildcard matching - const regex = new RegExp('^' + pattern.replace('*', '.*') + '$'); - return regex.test(req.path); - }); - if (!isAllowed) { - return res.status(403).json({ error: 'Endpoint not allowed for this token' }); - } - } - // Set API token on request for tracking - req.apiToken = { - id: apiToken.id, - name: apiToken.name, - rate_limit: apiToken.rate_limit - }; - // Set a generic user for compatibility with existing code - req.user = { - id: apiToken.id, - email: `api-token-${apiToken.id}@system`, - role: 'api' - }; - next(); - } - catch (error) { - console.error('Error verifying API token:', error); - return res.status(500).json({ error: 'Authentication failed' }); - } -} -function requireRole(...roles) { - return (req, res, next) => { - if (!req.user) { - return res.status(401).json({ error: 'Not authenticated' }); - } - if (!roles.includes(req.user.role)) { - return res.status(403).json({ error: 'Insufficient permissions' }); - } - next(); - }; -} diff --git a/backend/dist/db/add-jobs-table.js b/backend/dist/db/add-jobs-table.js deleted file mode 100644 index 58db75bc..00000000 --- a/backend/dist/db/add-jobs-table.js +++ /dev/null @@ -1,41 +0,0 @@ -"use 
strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const migrate_1 = require("./migrate"); -async function addJobsTable() { - const client = await migrate_1.pool.connect(); - try { - await client.query('BEGIN'); - await client.query(` - CREATE TABLE IF NOT EXISTS jobs ( - id SERIAL PRIMARY KEY, - type VARCHAR(50) NOT NULL, - status VARCHAR(50) DEFAULT 'pending', - store_id INTEGER REFERENCES stores(id) ON DELETE CASCADE, - progress INTEGER DEFAULT 0, - total_items INTEGER, - processed_items INTEGER DEFAULT 0, - error TEXT, - started_at TIMESTAMP, - completed_at TIMESTAMP, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ); - - CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status); - CREATE INDEX IF NOT EXISTS idx_jobs_type ON jobs(type); - CREATE INDEX IF NOT EXISTS idx_jobs_store_id ON jobs(store_id); - `); - await client.query('COMMIT'); - console.log('✅ Jobs table created successfully'); - } - catch (error) { - await client.query('ROLLBACK'); - console.error('❌ Failed to create jobs table:', error); - throw error; - } - finally { - client.release(); - } -} -addJobsTable() - .then(() => process.exit(0)) - .catch(() => process.exit(1)); diff --git a/backend/dist/db/migrate.js b/backend/dist/db/migrate.js deleted file mode 100644 index 5af42b0c..00000000 --- a/backend/dist/db/migrate.js +++ /dev/null @@ -1,321 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.pool = void 0; -exports.runMigrations = runMigrations; -const pg_1 = require("pg"); -// Consolidated DB connection: -// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod) -// - Then DATABASE_URL (default) -const DATABASE_URL = process.env.CRAWLSY_DATABASE_URL || - process.env.DATABASE_URL || - 'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local'; -const pool = new pg_1.Pool({ - connectionString: DATABASE_URL, -}); -exports.pool = pool; -async function runMigrations() { - const client = await 
pool.connect(); - try { - await client.query('BEGIN'); - // Users table - await client.query(` - CREATE TABLE IF NOT EXISTS users ( - id SERIAL PRIMARY KEY, - email VARCHAR(255) UNIQUE NOT NULL, - password_hash VARCHAR(255) NOT NULL, - role VARCHAR(50) DEFAULT 'admin', - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ); - `); - // Stores table - await client.query(` - CREATE TABLE IF NOT EXISTS stores ( - id SERIAL PRIMARY KEY, - name VARCHAR(255) NOT NULL, - slug VARCHAR(255) UNIQUE NOT NULL, - dutchie_url TEXT NOT NULL, - active BOOLEAN DEFAULT true, - scrape_enabled BOOLEAN DEFAULT true, - last_scraped_at TIMESTAMP, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ); - `); - // Categories table (shop, brands, specials) - await client.query(` - CREATE TABLE IF NOT EXISTS categories ( - id SERIAL PRIMARY KEY, - store_id INTEGER REFERENCES stores(id) ON DELETE CASCADE, - name VARCHAR(255) NOT NULL, - slug VARCHAR(255) NOT NULL, - dutchie_url TEXT NOT NULL, - scrape_enabled BOOLEAN DEFAULT true, - last_scraped_at TIMESTAMP, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - UNIQUE(store_id, slug) - ); - `); - // Products table - await client.query(` - CREATE TABLE IF NOT EXISTS products ( - id SERIAL PRIMARY KEY, - store_id INTEGER REFERENCES stores(id) ON DELETE CASCADE, - category_id INTEGER REFERENCES categories(id) ON DELETE SET NULL, - dutchie_product_id VARCHAR(255), - name VARCHAR(500) NOT NULL, - slug VARCHAR(500), - description TEXT, - price DECIMAL(10, 2), - original_price DECIMAL(10, 2), - strain_type VARCHAR(100), - thc_percentage DECIMAL(5, 2), - cbd_percentage DECIMAL(5, 2), - brand VARCHAR(255), - weight VARCHAR(100), - image_url TEXT, - local_image_path TEXT, - dutchie_url TEXT NOT NULL, - in_stock BOOLEAN DEFAULT true, - is_special BOOLEAN DEFAULT false, - metadata JSONB, - first_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - last_seen_at 
TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - UNIQUE(store_id, dutchie_product_id) - ); - `); - // Campaigns table - await client.query(` - CREATE TABLE IF NOT EXISTS campaigns ( - id SERIAL PRIMARY KEY, - name VARCHAR(255) NOT NULL, - slug VARCHAR(255) UNIQUE NOT NULL, - description TEXT, - display_style VARCHAR(50) DEFAULT 'grid', - active BOOLEAN DEFAULT true, - start_date TIMESTAMP, - end_date TIMESTAMP, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ); - `); - // Add variant column to products table (for different sizes/options of same product) - await client.query(` - ALTER TABLE products ADD COLUMN IF NOT EXISTS variant VARCHAR(255); - `); - // Add special tracking columns (DEPRECATED - not used with new approach) - await client.query(` - ALTER TABLE products ADD COLUMN IF NOT EXISTS special_ends_at TIMESTAMP; - ALTER TABLE products ADD COLUMN IF NOT EXISTS special_text TEXT; - ALTER TABLE products ADD COLUMN IF NOT EXISTS special_type VARCHAR(100); - `); - // ====== NEW SCHEMA ADDITIONS ====== - // Add array columns for product attributes - await client.query(` - ALTER TABLE products ADD COLUMN IF NOT EXISTS terpenes TEXT[]; - ALTER TABLE products ADD COLUMN IF NOT EXISTS effects TEXT[]; - ALTER TABLE products ADD COLUMN IF NOT EXISTS flavors TEXT[]; - `); - // Add new price columns (regular_price = market price, sale_price = discount price) - await client.query(` - ALTER TABLE products ADD COLUMN IF NOT EXISTS regular_price DECIMAL(10, 2); - ALTER TABLE products ADD COLUMN IF NOT EXISTS sale_price DECIMAL(10, 2); - `); - // Migrate existing price data - await client.query(` - UPDATE products - SET regular_price = original_price - WHERE regular_price IS NULL AND original_price IS NOT NULL; - `); - await client.query(` - UPDATE products - SET sale_price = price - WHERE sale_price IS NULL AND price IS NOT NULL AND 
original_price IS NOT NULL AND price < original_price; - `); - // Make slug NOT NULL and add unique constraint - await client.query(` - UPDATE products SET slug = dutchie_product_id WHERE slug IS NULL; - ALTER TABLE products ALTER COLUMN slug SET NOT NULL; - `); - // Drop old unique constraint and add new one on slug - await client.query(` - ALTER TABLE products DROP CONSTRAINT IF EXISTS products_store_id_dutchie_product_id_key; - DO $$ - BEGIN - IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'products_store_id_slug_unique') THEN - ALTER TABLE products ADD CONSTRAINT products_store_id_slug_unique UNIQUE (store_id, slug); - END IF; - END$$; - `); - // Product Categories (many-to-many) - products can appear in multiple categories - await client.query(` - CREATE TABLE IF NOT EXISTS product_categories ( - id SERIAL PRIMARY KEY, - product_id INTEGER REFERENCES products(id) ON DELETE CASCADE, - category_slug VARCHAR(255) NOT NULL, - first_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - last_seen_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - UNIQUE(product_id, category_slug) - ); - `); - await client.query(` - CREATE INDEX IF NOT EXISTS idx_product_categories_slug ON product_categories(category_slug, last_seen_at DESC); - CREATE INDEX IF NOT EXISTS idx_product_categories_product ON product_categories(product_id); - `); - // Price History - track regular and sale price changes over time - await client.query(` - CREATE TABLE IF NOT EXISTS price_history ( - id SERIAL PRIMARY KEY, - product_id INTEGER REFERENCES products(id) ON DELETE CASCADE, - regular_price DECIMAL(10, 2), - sale_price DECIMAL(10, 2), - recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ); - `); - await client.query(` - CREATE INDEX IF NOT EXISTS idx_price_history_product ON price_history(product_id, recorded_at DESC); - CREATE INDEX IF NOT EXISTS idx_price_history_recorded ON price_history(recorded_at DESC); - `); - // Batch History - track cannabinoid/terpene changes (different batches) - await 
client.query(` - CREATE TABLE IF NOT EXISTS batch_history ( - id SERIAL PRIMARY KEY, - product_id INTEGER REFERENCES products(id) ON DELETE CASCADE, - thc_percentage DECIMAL(5, 2), - cbd_percentage DECIMAL(5, 2), - terpenes TEXT[], - strain_type VARCHAR(100), - recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ); - `); - await client.query(` - CREATE INDEX IF NOT EXISTS idx_batch_history_product ON batch_history(product_id, recorded_at DESC); - CREATE INDEX IF NOT EXISTS idx_batch_history_recorded ON batch_history(recorded_at DESC); - `); - // Campaign products (many-to-many with ordering) - await client.query(` - CREATE TABLE IF NOT EXISTS campaign_products ( - id SERIAL PRIMARY KEY, - campaign_id INTEGER REFERENCES campaigns(id) ON DELETE CASCADE, - product_id INTEGER REFERENCES products(id) ON DELETE CASCADE, - display_order INTEGER DEFAULT 0, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - UNIQUE(campaign_id, product_id) - ); - `); - // Click tracking - await client.query(` - CREATE TABLE IF NOT EXISTS clicks ( - id SERIAL PRIMARY KEY, - product_id INTEGER REFERENCES products(id) ON DELETE CASCADE, - campaign_id INTEGER REFERENCES campaigns(id) ON DELETE SET NULL, - ip_address VARCHAR(45), - user_agent TEXT, - referrer TEXT, - clicked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ); - `); - // Create index on clicked_at for analytics queries - await client.query(` - CREATE INDEX IF NOT EXISTS idx_clicks_clicked_at ON clicks(clicked_at); - CREATE INDEX IF NOT EXISTS idx_clicks_product_id ON clicks(product_id); - CREATE INDEX IF NOT EXISTS idx_clicks_campaign_id ON clicks(campaign_id); - `); - // Proxies table - await client.query(` - CREATE TABLE IF NOT EXISTS proxies ( - id SERIAL PRIMARY KEY, - host VARCHAR(255) NOT NULL, - port INTEGER NOT NULL, - protocol VARCHAR(10) NOT NULL, - username VARCHAR(255), - password VARCHAR(255), - active BOOLEAN DEFAULT true, - is_anonymous BOOLEAN DEFAULT false, - last_tested_at TIMESTAMP, - test_result VARCHAR(50), - 
response_time_ms INTEGER, - failure_count INTEGER DEFAULT 0, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - UNIQUE(host, port, protocol) - ); - `); - // Add failure_count column if it doesn't exist - await client.query(` - ALTER TABLE proxies ADD COLUMN IF NOT EXISTS failure_count INTEGER DEFAULT 0; - `); - // Failed proxies table - await client.query(` - CREATE TABLE IF NOT EXISTS failed_proxies ( - id SERIAL PRIMARY KEY, - host VARCHAR(255) NOT NULL, - port INTEGER NOT NULL, - protocol VARCHAR(10) NOT NULL, - username VARCHAR(255), - password VARCHAR(255), - failure_count INTEGER NOT NULL, - last_error TEXT, - failed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - UNIQUE(host, port, protocol) - ); - `); - // Proxy test jobs table - await client.query(` - CREATE TABLE IF NOT EXISTS proxy_test_jobs ( - id SERIAL PRIMARY KEY, - status VARCHAR(20) NOT NULL DEFAULT 'pending', - total_proxies INTEGER NOT NULL DEFAULT 0, - tested_proxies INTEGER NOT NULL DEFAULT 0, - passed_proxies INTEGER NOT NULL DEFAULT 0, - failed_proxies INTEGER NOT NULL DEFAULT 0, - started_at TIMESTAMP, - completed_at TIMESTAMP, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ); - `); - await client.query(` - CREATE INDEX IF NOT EXISTS idx_proxy_test_jobs_status ON proxy_test_jobs(status); - CREATE INDEX IF NOT EXISTS idx_proxy_test_jobs_created_at ON proxy_test_jobs(created_at DESC); - `); - // Settings table - await client.query(` - CREATE TABLE IF NOT EXISTS settings ( - key VARCHAR(255) PRIMARY KEY, - value TEXT NOT NULL, - description TEXT, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ); - `); - // Insert default settings - await client.query(` - INSERT INTO settings (key, value, description) VALUES - ('scrape_interval_hours', '4', 'How often to scrape stores (in hours)'), - ('scrape_specials_time', '00:01', 'Time to scrape specials 
daily (HH:MM in 24h format)'), - ('analytics_retention_days', '365', 'How many days to keep analytics data'), - ('proxy_timeout_ms', '3000', 'Proxy timeout in milliseconds'), - ('proxy_test_url', 'https://httpbin.org/ip', 'URL to test proxies against') - ON CONFLICT (key) DO NOTHING; - `); - await client.query('COMMIT'); - console.log('✅ Migrations completed successfully'); - } - catch (error) { - await client.query('ROLLBACK'); - console.error('❌ Migration failed:', error); - throw error; - } - finally { - client.release(); - } -} -// Run migrations if this file is executed directly -if (require.main === module) { - runMigrations() - .then(() => process.exit(0)) - .catch(() => process.exit(1)); -} diff --git a/backend/dist/db/run-notifications-migration.js b/backend/dist/db/run-notifications-migration.js deleted file mode 100644 index 008b33d1..00000000 --- a/backend/dist/db/run-notifications-migration.js +++ /dev/null @@ -1,56 +0,0 @@ -"use strict"; -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? 
(function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -Object.defineProperty(exports, "__esModule", { value: true }); -const migrate_1 = require("./migrate"); -const fs = __importStar(require("fs")); -const path = __importStar(require("path")); -async function runNotificationsMigration() { - const client = await migrate_1.pool.connect(); - try { - console.log('Running notifications migration...'); - const migrationSQL = fs.readFileSync(path.join(__dirname, '../../migrations/005_notifications.sql'), 'utf-8'); - await client.query(migrationSQL); - console.log('✅ Notifications migration completed successfully'); - process.exit(0); - } - catch (error) { - console.error('❌ Migration failed:', error); - process.exit(1); - } - finally { - client.release(); - } -} -runNotificationsMigration(); diff --git a/backend/dist/db/seed.js b/backend/dist/db/seed.js deleted file mode 100644 index 638b9bc2..00000000 --- a/backend/dist/db/seed.js +++ /dev/null @@ -1,72 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.seedDatabase = seedDatabase; -const migrate_1 = require("./migrate"); -const bcrypt_1 = __importDefault(require("bcrypt")); -async function seedDatabase() { - const client = await migrate_1.pool.connect(); - try { - // Create admin user - const adminEmail = process.env.ADMIN_EMAIL || 'admin@example.com'; - const adminPassword = process.env.ADMIN_PASSWORD || 'password'; - const passwordHash = await bcrypt_1.default.hash(adminPassword, 10); - await client.query(` - INSERT INTO users (email, password_hash, role) - VALUES ($1, $2, 'superadmin') - ON CONFLICT (email) DO UPDATE - SET password_hash = $2, role = 'superadmin' - `, [adminEmail, passwordHash]); - console.log(`✅ Admin user created: ${adminEmail}`); - // Create Deeply Rooted store - const storeResult = await client.query(` - INSERT INTO stores (name, slug, dutchie_url, active, scrape_enabled) - VALUES ('Deeply Rooted', 'AZ-Deeply-Rooted', 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted', true, true) - ON CONFLICT (slug) DO UPDATE - SET name = 'Deeply Rooted', dutchie_url = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted' - RETURNING id - `); - const storeId = storeResult.rows[0].id; - console.log(`✅ Store created: Deeply Rooted (ID: ${storeId})`); - // Create categories for the store - const categories = [ - { name: 'Shop', slug: 'shop', url: 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted' }, - { name: 'Brands', slug: 'brands', url: 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/brands' }, - { name: 'Specials', slug: 'specials', url: 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/specials/sale/66501e094faefa00079b1835' } - ]; - for (const cat of categories) { - await client.query(` - INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled) - VALUES ($1, $2, $3, $4, true) - ON CONFLICT (store_id, slug) DO UPDATE - SET name = $2, dutchie_url = $4 - `, [storeId, cat.name, 
cat.slug, cat.url]); - } - console.log('✅ Categories created: Shop, Brands, Specials'); - // Create a default "Featured Products" campaign - await client.query(` - INSERT INTO campaigns (name, slug, description, display_style, active) - VALUES ('Featured Products', 'featured', 'Default featured products campaign', 'grid', true) - ON CONFLICT (slug) DO NOTHING - `); - console.log('✅ Default campaign created: Featured Products'); - console.log('\n🎉 Seeding completed successfully!'); - console.log(`\n📧 Login: ${adminEmail}`); - console.log(`🔑 Password: ${adminPassword}`); - } - catch (error) { - console.error('❌ Seeding failed:', error); - throw error; - } - finally { - client.release(); - } -} -// Run seed if this file is executed directly -if (require.main === module) { - seedDatabase() - .then(() => process.exit(0)) - .catch(() => process.exit(1)); -} diff --git a/backend/dist/db/update-categories-hierarchy.js b/backend/dist/db/update-categories-hierarchy.js deleted file mode 100644 index 02f15fee..00000000 --- a/backend/dist/db/update-categories-hierarchy.js +++ /dev/null @@ -1,48 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const migrate_1 = require("./migrate"); -async function updateCategoriesHierarchy() { - const client = await migrate_1.pool.connect(); - try { - await client.query('BEGIN'); - // Add parent_id for nested categories - await client.query(` - ALTER TABLE categories - ADD COLUMN IF NOT EXISTS parent_id INTEGER REFERENCES categories(id) ON DELETE CASCADE; - - ALTER TABLE categories - ADD COLUMN IF NOT EXISTS display_order INTEGER DEFAULT 0; - - ALTER TABLE categories - ADD COLUMN IF NOT EXISTS description TEXT; - - CREATE INDEX IF NOT EXISTS idx_categories_parent_id ON categories(parent_id); - `); - // Add category_path for easy searching (e.g., 'shop/flower') - await client.query(` - ALTER TABLE categories - ADD COLUMN IF NOT EXISTS path VARCHAR(500); - - CREATE INDEX IF NOT EXISTS idx_categories_path ON 
categories(path); - `); - // Update existing categories to have paths - await client.query(` - UPDATE categories - SET path = slug - WHERE path IS NULL; - `); - await client.query('COMMIT'); - console.log('✅ Categories hierarchy updated successfully'); - } - catch (error) { - await client.query('ROLLBACK'); - console.error('❌ Failed to update categories:', error); - throw error; - } - finally { - client.release(); - } -} -updateCategoriesHierarchy() - .then(() => process.exit(0)) - .catch(() => process.exit(1)); diff --git a/backend/dist/dutchie-az/config/dutchie.js b/backend/dist/dutchie-az/config/dutchie.js deleted file mode 100644 index f9b2088b..00000000 --- a/backend/dist/dutchie-az/config/dutchie.js +++ /dev/null @@ -1,106 +0,0 @@ -"use strict"; -/** - * Dutchie Configuration - * - * Centralized configuration for Dutchie GraphQL API interaction. - * Update hashes here when Dutchie changes their persisted query system. - */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = exports.dutchieConfig = void 0; -exports.dutchieConfig = { - // ============================================================ - // GRAPHQL ENDPOINT - // ============================================================ - /** GraphQL endpoint - must be the api-3 graphql endpoint (NOT api-gw.dutchie.com which no longer exists) */ - graphqlEndpoint: 'https://dutchie.com/api-3/graphql', - // ============================================================ - // GRAPHQL PERSISTED QUERY HASHES - // ============================================================ - // - // These hashes identify specific GraphQL operations. - // If Dutchie changes their schema, you may need to capture - // new hashes from live browser traffic (Network tab → graphql requests). 
- /** FilteredProducts - main product listing query */ - filteredProductsHash: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0', - /** GetAddressBasedDispensaryData - resolve slug to internal ID */ - getDispensaryDataHash: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b', - /** - * ConsumerDispensaries - geo-based discovery - * NOTE: This is a placeholder guess. If discovery fails, either: - * 1. Capture the real hash from live traffic - * 2. Rely on known AZDHS slugs instead (set useDiscovery: false) - */ - consumerDispensariesHash: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b', - // ============================================================ - // BEHAVIOR FLAGS - // ============================================================ - /** Enable geo-based discovery (false = use known AZDHS slugs only) */ - useDiscovery: true, - /** Prefer GET requests (true) or POST (false). GET is default. */ - preferGet: true, - /** - * Enable POST fallback when GET fails with 405 or blocked. - * If true, will retry failed GETs as POSTs. 
- */ - enablePostFallback: true, - // ============================================================ - // PAGINATION & RETRY - // ============================================================ - /** Products per page for pagination */ - perPage: 100, - /** Maximum pages to fetch (safety limit) */ - maxPages: 200, - /** Number of retries for failed page fetches */ - maxRetries: 1, - /** Delay between pages in ms */ - pageDelayMs: 500, - /** Delay between modes in ms */ - modeDelayMs: 2000, - // ============================================================ - // HTTP HEADERS - // ============================================================ - /** Default headers to mimic browser requests */ - defaultHeaders: { - 'accept': 'application/json, text/plain, */*', - 'accept-language': 'en-US,en;q=0.9', - 'apollographql-client-name': 'Marketplace (production)', - }, - /** User agent string */ - userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - // ============================================================ - // BROWSER LAUNCH OPTIONS - // ============================================================ - browserArgs: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-blink-features=AutomationControlled', - ], - /** Navigation timeout in ms */ - navigationTimeout: 60000, - /** Initial page load delay in ms */ - pageLoadDelay: 2000, -}; -/** - * Get GraphQL hashes object for backward compatibility - */ -exports.GRAPHQL_HASHES = { - FilteredProducts: exports.dutchieConfig.filteredProductsHash, - GetAddressBasedDispensaryData: exports.dutchieConfig.getDispensaryDataHash, - ConsumerDispensaries: exports.dutchieConfig.consumerDispensariesHash, -}; -/** - * Arizona geo centerpoints for discovery scans - */ -exports.ARIZONA_CENTERPOINTS = [ - { name: 'Phoenix', lat: 33.4484, lng: -112.074 }, - { name: 'Tucson', lat: 32.2226, lng: -110.9747 }, - { name: 'Flagstaff', lat: 
35.1983, lng: -111.6513 }, - { name: 'Mesa', lat: 33.4152, lng: -111.8315 }, - { name: 'Scottsdale', lat: 33.4942, lng: -111.9261 }, - { name: 'Tempe', lat: 33.4255, lng: -111.94 }, - { name: 'Yuma', lat: 32.6927, lng: -114.6277 }, - { name: 'Prescott', lat: 34.54, lng: -112.4685 }, - { name: 'Lake Havasu', lat: 34.4839, lng: -114.3224 }, - { name: 'Sierra Vista', lat: 31.5455, lng: -110.2773 }, -]; diff --git a/backend/dist/dutchie-az/db/connection.js b/backend/dist/dutchie-az/db/connection.js deleted file mode 100644 index e3b32e39..00000000 --- a/backend/dist/dutchie-az/db/connection.js +++ /dev/null @@ -1,79 +0,0 @@ -"use strict"; -/** - * Dutchie AZ Database Connection - * - * Isolated database connection for Dutchie Arizona data. - * Uses a separate database/schema to prevent cross-contamination with main app data. - */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.getDutchieAZPool = getDutchieAZPool; -exports.query = query; -exports.getClient = getClient; -exports.closePool = closePool; -exports.healthCheck = healthCheck; -const pg_1 = require("pg"); -// Consolidated DB naming: -// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod) -// - Then DUTCHIE_AZ_DATABASE_URL (legacy) -// - Finally DATABASE_URL (legacy main DB) -const DUTCHIE_AZ_DATABASE_URL = process.env.CRAWLSY_DATABASE_URL || - process.env.DUTCHIE_AZ_DATABASE_URL || - process.env.DATABASE_URL || - 'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local'; -let pool = null; -/** - * Get the Dutchie AZ database pool (singleton) - */ -function getDutchieAZPool() { - if (!pool) { - pool = new pg_1.Pool({ - connectionString: DUTCHIE_AZ_DATABASE_URL, - max: 10, - idleTimeoutMillis: 30000, - connectionTimeoutMillis: 5000, - }); - pool.on('error', (err) => { - console.error('[DutchieAZ DB] Unexpected error on idle client:', err); - }); - console.log('[DutchieAZ DB] Pool initialized'); - } - return pool; -} -/** - * Execute a query on the Dutchie AZ 
database - */ -async function query(text, params) { - const p = getDutchieAZPool(); - const result = await p.query(text, params); - return { rows: result.rows, rowCount: result.rowCount || 0 }; -} -/** - * Get a client from the pool for transaction use - */ -async function getClient() { - const p = getDutchieAZPool(); - return p.connect(); -} -/** - * Close the pool connection - */ -async function closePool() { - if (pool) { - await pool.end(); - pool = null; - console.log('[DutchieAZ DB] Pool closed'); - } -} -/** - * Check if the database is accessible - */ -async function healthCheck() { - try { - const result = await query('SELECT 1 as ok'); - return result.rows.length > 0 && result.rows[0].ok === 1; - } - catch (error) { - console.error('[DutchieAZ DB] Health check failed:', error); - return false; - } -} diff --git a/backend/dist/dutchie-az/db/migrate.js b/backend/dist/dutchie-az/db/migrate.js deleted file mode 100644 index a4ea4eae..00000000 --- a/backend/dist/dutchie-az/db/migrate.js +++ /dev/null @@ -1,30 +0,0 @@ -"use strict"; -/** - * Dutchie AZ Schema Bootstrap - * - * Run this to create/update the dutchie_az tables (dutchie_products, dutchie_product_snapshots, etc.) - * in the AZ pipeline database. This is separate from the legacy schema. 
- * - * Usage: - * TS_NODE_TRANSPILE_ONLY=1 npx ts-node src/dutchie-az/db/migrate.ts - * or (after build) - * node dist/dutchie-az/db/migrate.js - */ -Object.defineProperty(exports, "__esModule", { value: true }); -const schema_1 = require("./schema"); -const connection_1 = require("./connection"); -async function main() { - try { - console.log('[DutchieAZ] Running schema migration...'); - await (0, schema_1.createSchema)(); - console.log('[DutchieAZ] Schema migration complete.'); - } - catch (err) { - console.error('[DutchieAZ] Schema migration failed:', err.message); - process.exitCode = 1; - } - finally { - await (0, connection_1.closePool)(); - } -} -main(); diff --git a/backend/dist/dutchie-az/db/schema.js b/backend/dist/dutchie-az/db/schema.js deleted file mode 100644 index 493692a3..00000000 --- a/backend/dist/dutchie-az/db/schema.js +++ /dev/null @@ -1,405 +0,0 @@ -"use strict"; -/** - * Dutchie AZ Database Schema - * - * Creates all tables for the isolated Dutchie Arizona data pipeline. - * Run this to initialize the dutchie_az database. 
- */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.createSchema = createSchema; -exports.dropSchema = dropSchema; -exports.schemaExists = schemaExists; -exports.ensureSchema = ensureSchema; -const connection_1 = require("./connection"); -/** - * SQL statements to create all tables - */ -const SCHEMA_SQL = ` --- ============================================================ --- DISPENSARIES TABLE --- Stores discovered Dutchie dispensaries in Arizona --- ============================================================ -CREATE TABLE IF NOT EXISTS dispensaries ( - id SERIAL PRIMARY KEY, - platform VARCHAR(20) NOT NULL DEFAULT 'dutchie', - name VARCHAR(255) NOT NULL, - slug VARCHAR(255) NOT NULL, - city VARCHAR(100) NOT NULL, - state VARCHAR(10) NOT NULL DEFAULT 'AZ', - postal_code VARCHAR(20), - address TEXT, - latitude DECIMAL(10, 7), - longitude DECIMAL(10, 7), - platform_dispensary_id VARCHAR(100), - is_delivery BOOLEAN DEFAULT false, - is_pickup BOOLEAN DEFAULT true, - raw_metadata JSONB, - last_crawled_at TIMESTAMPTZ, - product_count INTEGER DEFAULT 0, - created_at TIMESTAMPTZ DEFAULT NOW(), - updated_at TIMESTAMPTZ DEFAULT NOW(), - - CONSTRAINT uk_dispensaries_platform_slug UNIQUE (platform, slug, city, state) -); - -CREATE INDEX IF NOT EXISTS idx_dispensaries_platform ON dispensaries(platform); -CREATE INDEX IF NOT EXISTS idx_dispensaries_platform_id ON dispensaries(platform_dispensary_id); -CREATE INDEX IF NOT EXISTS idx_dispensaries_state ON dispensaries(state); -CREATE INDEX IF NOT EXISTS idx_dispensaries_city ON dispensaries(city); - --- ============================================================ --- DUTCHIE_PRODUCTS TABLE --- Canonical product identity per store --- ============================================================ -CREATE TABLE IF NOT EXISTS dutchie_products ( - id SERIAL PRIMARY KEY, - dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE, - platform VARCHAR(20) NOT NULL DEFAULT 'dutchie', - - 
external_product_id VARCHAR(100) NOT NULL, - platform_dispensary_id VARCHAR(100) NOT NULL, - c_name VARCHAR(500), - name VARCHAR(500) NOT NULL, - - -- Brand - brand_name VARCHAR(255), - brand_id VARCHAR(100), - brand_logo_url TEXT, - - -- Classification - type VARCHAR(100), - subcategory VARCHAR(100), - strain_type VARCHAR(50), - provider VARCHAR(100), - - -- Potency - thc DECIMAL(10, 4), - thc_content DECIMAL(10, 4), - cbd DECIMAL(10, 4), - cbd_content DECIMAL(10, 4), - cannabinoids_v2 JSONB, - effects JSONB, - - -- Status / flags - status VARCHAR(50), - medical_only BOOLEAN DEFAULT false, - rec_only BOOLEAN DEFAULT false, - featured BOOLEAN DEFAULT false, - coming_soon BOOLEAN DEFAULT false, - certificate_of_analysis_enabled BOOLEAN DEFAULT false, - - is_below_threshold BOOLEAN DEFAULT false, - is_below_kiosk_threshold BOOLEAN DEFAULT false, - options_below_threshold BOOLEAN DEFAULT false, - options_below_kiosk_threshold BOOLEAN DEFAULT false, - - -- Derived stock status: 'in_stock', 'out_of_stock', 'unknown' - stock_status VARCHAR(20) DEFAULT 'unknown', - total_quantity_available INTEGER DEFAULT 0, - - -- Images - primary_image_url TEXT, - images JSONB, - - -- Misc - measurements JSONB, - weight VARCHAR(50), - past_c_names TEXT[], - - created_at_dutchie TIMESTAMPTZ, - updated_at_dutchie TIMESTAMPTZ, - - latest_raw_payload JSONB, - - created_at TIMESTAMPTZ DEFAULT NOW(), - updated_at TIMESTAMPTZ DEFAULT NOW(), - - CONSTRAINT uk_dutchie_products UNIQUE (dispensary_id, external_product_id) -); - -CREATE INDEX IF NOT EXISTS idx_dutchie_products_dispensary ON dutchie_products(dispensary_id); -CREATE INDEX IF NOT EXISTS idx_dutchie_products_external_id ON dutchie_products(external_product_id); -CREATE INDEX IF NOT EXISTS idx_dutchie_products_platform_disp ON dutchie_products(platform_dispensary_id); -CREATE INDEX IF NOT EXISTS idx_dutchie_products_brand ON dutchie_products(brand_name); -CREATE INDEX IF NOT EXISTS idx_dutchie_products_type ON dutchie_products(type); 
-CREATE INDEX IF NOT EXISTS idx_dutchie_products_subcategory ON dutchie_products(subcategory); -CREATE INDEX IF NOT EXISTS idx_dutchie_products_status ON dutchie_products(status); -CREATE INDEX IF NOT EXISTS idx_dutchie_products_strain ON dutchie_products(strain_type); -CREATE INDEX IF NOT EXISTS idx_dutchie_products_stock_status ON dutchie_products(stock_status); - --- ============================================================ --- DUTCHIE_PRODUCT_SNAPSHOTS TABLE --- Historical state per crawl, includes options[] --- ============================================================ -CREATE TABLE IF NOT EXISTS dutchie_product_snapshots ( - id SERIAL PRIMARY KEY, - dutchie_product_id INTEGER NOT NULL REFERENCES dutchie_products(id) ON DELETE CASCADE, - dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id) ON DELETE CASCADE, - platform_dispensary_id VARCHAR(100) NOT NULL, - external_product_id VARCHAR(100) NOT NULL, - pricing_type VARCHAR(20) DEFAULT 'unknown', - crawl_mode VARCHAR(20) DEFAULT 'mode_a', -- 'mode_a' (UI parity) or 'mode_b' (max coverage) - - status VARCHAR(50), - featured BOOLEAN DEFAULT false, - special BOOLEAN DEFAULT false, - medical_only BOOLEAN DEFAULT false, - rec_only BOOLEAN DEFAULT false, - - -- Flag indicating if product was present in feed (false = missing_from_feed snapshot) - is_present_in_feed BOOLEAN DEFAULT true, - - -- Derived stock status - stock_status VARCHAR(20) DEFAULT 'unknown', - - -- Price summary (in cents) - rec_min_price_cents INTEGER, - rec_max_price_cents INTEGER, - rec_min_special_price_cents INTEGER, - med_min_price_cents INTEGER, - med_max_price_cents INTEGER, - med_min_special_price_cents INTEGER, - wholesale_min_price_cents INTEGER, - - -- Inventory summary - total_quantity_available INTEGER, - total_kiosk_quantity_available INTEGER, - manual_inventory BOOLEAN DEFAULT false, - is_below_threshold BOOLEAN DEFAULT false, - is_below_kiosk_threshold BOOLEAN DEFAULT false, - - -- Option-level data (from 
POSMetaData.children) - options JSONB, - - -- Full raw product node - raw_payload JSONB NOT NULL, - - crawled_at TIMESTAMPTZ NOT NULL, - created_at TIMESTAMPTZ DEFAULT NOW(), - updated_at TIMESTAMPTZ DEFAULT NOW() -); - -CREATE INDEX IF NOT EXISTS idx_snapshots_product ON dutchie_product_snapshots(dutchie_product_id); -CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary ON dutchie_product_snapshots(dispensary_id); -CREATE INDEX IF NOT EXISTS idx_snapshots_crawled_at ON dutchie_product_snapshots(crawled_at); -CREATE INDEX IF NOT EXISTS idx_snapshots_platform_disp ON dutchie_product_snapshots(platform_dispensary_id); -CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON dutchie_product_snapshots(external_product_id); -CREATE INDEX IF NOT EXISTS idx_snapshots_special ON dutchie_product_snapshots(special) WHERE special = true; -CREATE INDEX IF NOT EXISTS idx_snapshots_stock_status ON dutchie_product_snapshots(stock_status); -CREATE INDEX IF NOT EXISTS idx_snapshots_crawl_mode ON dutchie_product_snapshots(crawl_mode); - --- ============================================================ --- CRAWL_JOBS TABLE --- Tracks crawl execution status --- ============================================================ -CREATE TABLE IF NOT EXISTS crawl_jobs ( - id SERIAL PRIMARY KEY, - job_type VARCHAR(50) NOT NULL, - dispensary_id INTEGER REFERENCES dispensaries(id) ON DELETE SET NULL, - status VARCHAR(20) NOT NULL DEFAULT 'pending', - started_at TIMESTAMPTZ, - completed_at TIMESTAMPTZ, - error_message TEXT, - products_found INTEGER, - snapshots_created INTEGER, - metadata JSONB, - created_at TIMESTAMPTZ DEFAULT NOW(), - updated_at TIMESTAMPTZ DEFAULT NOW() -); - -CREATE INDEX IF NOT EXISTS idx_crawl_jobs_type ON crawl_jobs(job_type); -CREATE INDEX IF NOT EXISTS idx_crawl_jobs_status ON crawl_jobs(status); -CREATE INDEX IF NOT EXISTS idx_crawl_jobs_dispensary ON crawl_jobs(dispensary_id); -CREATE INDEX IF NOT EXISTS idx_crawl_jobs_created ON crawl_jobs(created_at); - --- 
============================================================ --- JOB_SCHEDULES TABLE --- Stores schedule configuration for recurring jobs with jitter support --- Each job has independent timing that "wanders" over time --- ============================================================ -CREATE TABLE IF NOT EXISTS job_schedules ( - id SERIAL PRIMARY KEY, - job_name VARCHAR(100) NOT NULL UNIQUE, - description TEXT, - enabled BOOLEAN DEFAULT true, - - -- Timing configuration (jitter makes times "wander") - base_interval_minutes INTEGER NOT NULL DEFAULT 240, -- e.g., 4 hours - jitter_minutes INTEGER NOT NULL DEFAULT 30, -- e.g., ±30 min - - -- Last run tracking - last_run_at TIMESTAMPTZ, - last_status VARCHAR(20), -- 'success', 'error', 'partial', 'running' - last_error_message TEXT, - last_duration_ms INTEGER, - - -- Next run (calculated with jitter after each run) - next_run_at TIMESTAMPTZ, - - -- Additional config - job_config JSONB, -- e.g., { pricingType: 'rec', useBothModes: true } - - created_at TIMESTAMPTZ DEFAULT NOW(), - updated_at TIMESTAMPTZ DEFAULT NOW() -); - -CREATE INDEX IF NOT EXISTS idx_job_schedules_enabled ON job_schedules(enabled); -CREATE INDEX IF NOT EXISTS idx_job_schedules_next_run ON job_schedules(next_run_at); - --- ============================================================ --- JOB_RUN_LOGS TABLE --- Stores history of job runs for monitoring --- ============================================================ -CREATE TABLE IF NOT EXISTS job_run_logs ( - id SERIAL PRIMARY KEY, - schedule_id INTEGER NOT NULL REFERENCES job_schedules(id) ON DELETE CASCADE, - job_name VARCHAR(100) NOT NULL, - status VARCHAR(20) NOT NULL, -- 'pending', 'running', 'success', 'error', 'partial' - started_at TIMESTAMPTZ, - completed_at TIMESTAMPTZ, - duration_ms INTEGER, - error_message TEXT, - - -- Results summary - items_processed INTEGER, - items_succeeded INTEGER, - items_failed INTEGER, - - metadata JSONB, -- Additional run details - - created_at TIMESTAMPTZ DEFAULT 
NOW() -); - -CREATE INDEX IF NOT EXISTS idx_job_run_logs_schedule ON job_run_logs(schedule_id); -CREATE INDEX IF NOT EXISTS idx_job_run_logs_job_name ON job_run_logs(job_name); -CREATE INDEX IF NOT EXISTS idx_job_run_logs_status ON job_run_logs(status); -CREATE INDEX IF NOT EXISTS idx_job_run_logs_created ON job_run_logs(created_at); - --- ============================================================ --- VIEWS FOR EASY QUERYING --- ============================================================ - --- Categories derived from products -CREATE OR REPLACE VIEW v_categories AS -SELECT - type, - subcategory, - COUNT(DISTINCT id) as product_count, - COUNT(DISTINCT dispensary_id) as dispensary_count, - AVG(thc) as avg_thc, - MIN(thc) as min_thc, - MAX(thc) as max_thc -FROM dutchie_products -WHERE type IS NOT NULL -GROUP BY type, subcategory -ORDER BY type, subcategory; - --- Brands derived from products -CREATE OR REPLACE VIEW v_brands AS -SELECT - brand_name, - brand_id, - MAX(brand_logo_url) as brand_logo_url, - COUNT(DISTINCT id) as product_count, - COUNT(DISTINCT dispensary_id) as dispensary_count, - ARRAY_AGG(DISTINCT type) FILTER (WHERE type IS NOT NULL) as product_types -FROM dutchie_products -WHERE brand_name IS NOT NULL -GROUP BY brand_name, brand_id -ORDER BY product_count DESC; - --- Latest snapshot per product (most recent crawl data) -CREATE OR REPLACE VIEW v_latest_snapshots AS -SELECT DISTINCT ON (dutchie_product_id) - s.* -FROM dutchie_product_snapshots s -ORDER BY dutchie_product_id, crawled_at DESC; - --- Dashboard stats -CREATE OR REPLACE VIEW v_dashboard_stats AS -SELECT - (SELECT COUNT(*) FROM dispensaries WHERE state = 'AZ') as dispensary_count, - (SELECT COUNT(*) FROM dutchie_products) as product_count, - (SELECT COUNT(*) FROM dutchie_product_snapshots WHERE crawled_at > NOW() - INTERVAL '24 hours') as snapshots_24h, - (SELECT MAX(crawled_at) FROM dutchie_product_snapshots) as last_crawl_time, - (SELECT COUNT(*) FROM crawl_jobs WHERE status = 'failed' 
AND created_at > NOW() - INTERVAL '24 hours') as failed_jobs_24h, - (SELECT COUNT(DISTINCT brand_name) FROM dutchie_products WHERE brand_name IS NOT NULL) as brand_count, - (SELECT COUNT(DISTINCT (type, subcategory)) FROM dutchie_products WHERE type IS NOT NULL) as category_count; -`; -/** - * Run the schema migration - */ -async function createSchema() { - console.log('[DutchieAZ Schema] Creating database schema...'); - const client = await (0, connection_1.getClient)(); - try { - await client.query('BEGIN'); - // Split into individual statements and execute - const statements = SCHEMA_SQL - .split(';') - .map(s => s.trim()) - .filter(s => s.length > 0 && !s.startsWith('--')); - for (const statement of statements) { - if (statement.trim()) { - await client.query(statement + ';'); - } - } - await client.query('COMMIT'); - console.log('[DutchieAZ Schema] Schema created successfully'); - } - catch (error) { - await client.query('ROLLBACK'); - console.error('[DutchieAZ Schema] Failed to create schema:', error); - throw error; - } - finally { - client.release(); - } -} -/** - * Drop all tables (for development/testing) - */ -async function dropSchema() { - console.log('[DutchieAZ Schema] Dropping all tables...'); - await (0, connection_1.query)(` - DROP VIEW IF EXISTS v_dashboard_stats CASCADE; - DROP VIEW IF EXISTS v_latest_snapshots CASCADE; - DROP VIEW IF EXISTS v_brands CASCADE; - DROP VIEW IF EXISTS v_categories CASCADE; - DROP TABLE IF EXISTS crawl_schedule CASCADE; - DROP TABLE IF EXISTS crawl_jobs CASCADE; - DROP TABLE IF EXISTS dutchie_product_snapshots CASCADE; - DROP TABLE IF EXISTS dutchie_products CASCADE; - DROP TABLE IF EXISTS dispensaries CASCADE; - `); - console.log('[DutchieAZ Schema] All tables dropped'); -} -/** - * Check if schema exists - */ -async function schemaExists() { - try { - const result = await (0, connection_1.query)(` - SELECT EXISTS ( - SELECT FROM information_schema.tables - WHERE table_name = 'dispensaries' - ) as exists - `); - 
return result.rows[0]?.exists === true; - } - catch (error) { - return false; - } -} -/** - * Initialize schema if it doesn't exist - */ -async function ensureSchema() { - const exists = await schemaExists(); - if (!exists) { - await createSchema(); - } - else { - console.log('[DutchieAZ Schema] Schema already exists'); - } -} diff --git a/backend/dist/dutchie-az/index.js b/backend/dist/dutchie-az/index.js deleted file mode 100644 index b0887874..00000000 --- a/backend/dist/dutchie-az/index.js +++ /dev/null @@ -1,95 +0,0 @@ -"use strict"; -/** - * Dutchie AZ Data Pipeline - * - * Isolated data pipeline for crawling and storing Dutchie Arizona dispensary data. - * This module is completely separate from the main application database. - * - * Features: - * - Two-mode crawling (Mode A: UI parity, Mode B: MAX COVERAGE) - * - Derived stockStatus field (in_stock, out_of_stock, unknown) - * - Full raw payload storage for 100% data preservation - * - AZDHS dispensary list as canonical source - */ -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __exportStar = (this && this.__exportStar) || function(m, exports) { - for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p); -}; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.dutchieAZRouter = exports.getImportStats = exports.importFromJSON = exports.importAZDHSDispensaries = exports.getRunLogs = exports.initializeDefaultSchedules = exports.triggerScheduleNow = exports.deleteSchedule = exports.updateSchedule = exports.createSchedule = exports.getScheduleById = exports.getAllSchedules = exports.crawlSingleDispensary = exports.getSchedulerStatus = exports.triggerImmediateCrawl = exports.stopScheduler = exports.startScheduler = exports.crawlAllArizonaDispensaries = exports.crawlDispensaryProducts = exports.normalizeSnapshot = exports.normalizeProduct = exports.getDispensariesWithPlatformIds = exports.getDispensaryById = exports.getAllDispensaries = exports.resolvePlatformDispensaryIds = exports.discoverAndSaveDispensaries = exports.importFromExistingDispensaries = exports.discoverDispensaries = exports.discoverArizonaDispensaries = exports.fetchAllProductsBothModes = exports.fetchAllProducts = exports.resolveDispensaryId = exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = exports.ensureSchema = exports.schemaExists = exports.dropSchema = exports.createSchema = exports.healthCheck = exports.closePool = exports.getClient = exports.query = exports.getDutchieAZPool = void 0; -// Types -__exportStar(require("./types"), exports); -// Database -var connection_1 = require("./db/connection"); -Object.defineProperty(exports, "getDutchieAZPool", { enumerable: true, get: function () { return connection_1.getDutchieAZPool; } }); -Object.defineProperty(exports, "query", { enumerable: true, get: function () { return connection_1.query; } }); -Object.defineProperty(exports, "getClient", { enumerable: true, get: function () { return connection_1.getClient; } }); -Object.defineProperty(exports, "closePool", { enumerable: true, get: function () { return connection_1.closePool; } }); -Object.defineProperty(exports, "healthCheck", { enumerable: true, 
get: function () { return connection_1.healthCheck; } }); -var schema_1 = require("./db/schema"); -Object.defineProperty(exports, "createSchema", { enumerable: true, get: function () { return schema_1.createSchema; } }); -Object.defineProperty(exports, "dropSchema", { enumerable: true, get: function () { return schema_1.dropSchema; } }); -Object.defineProperty(exports, "schemaExists", { enumerable: true, get: function () { return schema_1.schemaExists; } }); -Object.defineProperty(exports, "ensureSchema", { enumerable: true, get: function () { return schema_1.ensureSchema; } }); -// Services - GraphQL Client -var graphql_client_1 = require("./services/graphql-client"); -Object.defineProperty(exports, "GRAPHQL_HASHES", { enumerable: true, get: function () { return graphql_client_1.GRAPHQL_HASHES; } }); -Object.defineProperty(exports, "ARIZONA_CENTERPOINTS", { enumerable: true, get: function () { return graphql_client_1.ARIZONA_CENTERPOINTS; } }); -Object.defineProperty(exports, "resolveDispensaryId", { enumerable: true, get: function () { return graphql_client_1.resolveDispensaryId; } }); -Object.defineProperty(exports, "fetchAllProducts", { enumerable: true, get: function () { return graphql_client_1.fetchAllProducts; } }); -Object.defineProperty(exports, "fetchAllProductsBothModes", { enumerable: true, get: function () { return graphql_client_1.fetchAllProductsBothModes; } }); -Object.defineProperty(exports, "discoverArizonaDispensaries", { enumerable: true, get: function () { return graphql_client_1.discoverArizonaDispensaries; } }); -// Alias for backward compatibility -Object.defineProperty(exports, "discoverDispensaries", { enumerable: true, get: function () { return graphql_client_1.discoverArizonaDispensaries; } }); -// Services - Discovery -var discovery_1 = require("./services/discovery"); -Object.defineProperty(exports, "importFromExistingDispensaries", { enumerable: true, get: function () { return discovery_1.importFromExistingDispensaries; } }); 
-Object.defineProperty(exports, "discoverAndSaveDispensaries", { enumerable: true, get: function () { return discovery_1.discoverDispensaries; } }); -Object.defineProperty(exports, "resolvePlatformDispensaryIds", { enumerable: true, get: function () { return discovery_1.resolvePlatformDispensaryIds; } }); -Object.defineProperty(exports, "getAllDispensaries", { enumerable: true, get: function () { return discovery_1.getAllDispensaries; } }); -Object.defineProperty(exports, "getDispensaryById", { enumerable: true, get: function () { return discovery_1.getDispensaryById; } }); -Object.defineProperty(exports, "getDispensariesWithPlatformIds", { enumerable: true, get: function () { return discovery_1.getDispensariesWithPlatformIds; } }); -// Services - Product Crawler -var product_crawler_1 = require("./services/product-crawler"); -Object.defineProperty(exports, "normalizeProduct", { enumerable: true, get: function () { return product_crawler_1.normalizeProduct; } }); -Object.defineProperty(exports, "normalizeSnapshot", { enumerable: true, get: function () { return product_crawler_1.normalizeSnapshot; } }); -Object.defineProperty(exports, "crawlDispensaryProducts", { enumerable: true, get: function () { return product_crawler_1.crawlDispensaryProducts; } }); -Object.defineProperty(exports, "crawlAllArizonaDispensaries", { enumerable: true, get: function () { return product_crawler_1.crawlAllArizonaDispensaries; } }); -// Services - Scheduler -var scheduler_1 = require("./services/scheduler"); -Object.defineProperty(exports, "startScheduler", { enumerable: true, get: function () { return scheduler_1.startScheduler; } }); -Object.defineProperty(exports, "stopScheduler", { enumerable: true, get: function () { return scheduler_1.stopScheduler; } }); -Object.defineProperty(exports, "triggerImmediateCrawl", { enumerable: true, get: function () { return scheduler_1.triggerImmediateCrawl; } }); -Object.defineProperty(exports, "getSchedulerStatus", { enumerable: true, get: 
function () { return scheduler_1.getSchedulerStatus; } }); -Object.defineProperty(exports, "crawlSingleDispensary", { enumerable: true, get: function () { return scheduler_1.crawlSingleDispensary; } }); -// Schedule config CRUD -Object.defineProperty(exports, "getAllSchedules", { enumerable: true, get: function () { return scheduler_1.getAllSchedules; } }); -Object.defineProperty(exports, "getScheduleById", { enumerable: true, get: function () { return scheduler_1.getScheduleById; } }); -Object.defineProperty(exports, "createSchedule", { enumerable: true, get: function () { return scheduler_1.createSchedule; } }); -Object.defineProperty(exports, "updateSchedule", { enumerable: true, get: function () { return scheduler_1.updateSchedule; } }); -Object.defineProperty(exports, "deleteSchedule", { enumerable: true, get: function () { return scheduler_1.deleteSchedule; } }); -Object.defineProperty(exports, "triggerScheduleNow", { enumerable: true, get: function () { return scheduler_1.triggerScheduleNow; } }); -Object.defineProperty(exports, "initializeDefaultSchedules", { enumerable: true, get: function () { return scheduler_1.initializeDefaultSchedules; } }); -// Run logs -Object.defineProperty(exports, "getRunLogs", { enumerable: true, get: function () { return scheduler_1.getRunLogs; } }); -// Services - AZDHS Import -var azdhs_import_1 = require("./services/azdhs-import"); -Object.defineProperty(exports, "importAZDHSDispensaries", { enumerable: true, get: function () { return azdhs_import_1.importAZDHSDispensaries; } }); -Object.defineProperty(exports, "importFromJSON", { enumerable: true, get: function () { return azdhs_import_1.importFromJSON; } }); -Object.defineProperty(exports, "getImportStats", { enumerable: true, get: function () { return azdhs_import_1.getImportStats; } }); -// Routes -var routes_1 = require("./routes"); -Object.defineProperty(exports, "dutchieAZRouter", { enumerable: true, get: function () { return __importDefault(routes_1).default; } }); 
diff --git a/backend/dist/dutchie-az/routes/index.js b/backend/dist/dutchie-az/routes/index.js deleted file mode 100644 index 5e4c313a..00000000 --- a/backend/dist/dutchie-az/routes/index.js +++ /dev/null @@ -1,1729 +0,0 @@ -"use strict"; -/** - * Dutchie AZ API Routes - * - * Express routes for the Dutchie AZ data pipeline. - * Provides API endpoints for stores, products, categories, and dashboard. - */ -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const connection_1 = require("../db/connection"); -const schema_1 = require("../db/schema"); -const azdhs_import_1 = require("../services/azdhs-import"); -const discovery_1 = require("../services/discovery"); -const 
product_crawler_1 = require("../services/product-crawler"); -// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences) -const DISPENSARY_COLUMNS = ` - id, name, dba_name, slug, city, state, zip, address, latitude, longitude, - menu_type, menu_url, platform_dispensary_id, website, - provider_detection_data, created_at, updated_at -`; -const scheduler_1 = require("../services/scheduler"); -const router = (0, express_1.Router)(); -// ============================================================ -// DASHBOARD -// ============================================================ -/** - * GET /api/dutchie-az/dashboard - * Dashboard stats overview - */ -router.get('/dashboard', async (_req, res) => { - try { - const { rows } = await (0, connection_1.query)(`SELECT * FROM v_dashboard_stats`); - const stats = rows[0] || {}; - res.json({ - dispensaryCount: parseInt(stats.dispensary_count || '0', 10), - productCount: parseInt(stats.product_count || '0', 10), - snapshotCount24h: parseInt(stats.snapshots_24h || '0', 10), - lastCrawlTime: stats.last_crawl_time, - failedJobCount: parseInt(stats.failed_jobs_24h || '0', 10), - brandCount: parseInt(stats.brand_count || '0', 10), - categoryCount: parseInt(stats.category_count || '0', 10), - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -// ============================================================ -// DISPENSARIES (STORES) -// ============================================================ -/** - * GET /api/dutchie-az/stores - * List all stores with optional filters - */ -router.get('/stores', async (req, res) => { - try { - const { city, hasPlatformId, limit = '100', offset = '0' } = req.query; - let whereClause = 'WHERE state = \'AZ\''; - const params = []; - let paramIndex = 1; - if (city) { - whereClause += ` AND city = $${paramIndex}`; - params.push(city); - paramIndex++; - } - if (hasPlatformId === 'true') { - whereClause += ' AND platform_dispensary_id IS 
NOT NULL'; - } - else if (hasPlatformId === 'false') { - whereClause += ' AND platform_dispensary_id IS NULL'; - } - params.push(parseInt(limit, 10), parseInt(offset, 10)); - const { rows, rowCount } = await (0, connection_1.query)(` - SELECT ${DISPENSARY_COLUMNS} FROM dispensaries - ${whereClause} - ORDER BY name - LIMIT $${paramIndex} OFFSET $${paramIndex + 1} - `, params); - // Get total count - const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM dispensaries ${whereClause}`, params.slice(0, -2)); - res.json({ - stores: rows, - total: parseInt(countRows[0]?.total || '0', 10), - limit: parseInt(limit, 10), - offset: parseInt(offset, 10), - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/stores/slug/:slug - * Resolve a store by slug (case-insensitive) or platform_dispensary_id - */ -router.get('/stores/slug/:slug', async (req, res) => { - try { - const { slug } = req.params; - const normalized = slug.toLowerCase(); - const { rows } = await (0, connection_1.query)(` - SELECT ${DISPENSARY_COLUMNS} - FROM dispensaries - WHERE lower(slug) = $1 - OR lower(platform_dispensary_id) = $1 - LIMIT 1 - `, [normalized]); - if (!rows || rows.length === 0) { - return res.status(404).json({ error: 'Store not found' }); - } - res.json(rows[0]); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/stores/:id - * Get a single store by ID - */ -router.get('/stores/:id', async (req, res) => { - try { - const { id } = req.params; - const store = await (0, discovery_1.getDispensaryById)(parseInt(id, 10)); - if (!store) { - return res.status(404).json({ error: 'Store not found' }); - } - res.json(store); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/stores/:id/summary - * Get store summary with product count, categories, and brands - * This is the main endpoint for 
the DispensaryDetail panel - */ -router.get('/stores/:id/summary', async (req, res) => { - try { - const { id } = req.params; - // Get dispensary info - const { rows: dispensaryRows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [parseInt(id, 10)]); - if (dispensaryRows.length === 0) { - return res.status(404).json({ error: 'Store not found' }); - } - const dispensary = dispensaryRows[0]; - // Get product counts by stock status - const { rows: countRows } = await (0, connection_1.query)(` - SELECT - COUNT(*) as total_products, - COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count, - COUNT(*) FILTER (WHERE stock_status = 'out_of_stock') as out_of_stock_count, - COUNT(*) FILTER (WHERE stock_status = 'unknown') as unknown_count, - COUNT(*) FILTER (WHERE stock_status = 'missing_from_feed') as missing_count - FROM dutchie_products - WHERE dispensary_id = $1 - `, [id]); - // Get categories with counts for this store - const { rows: categories } = await (0, connection_1.query)(` - SELECT - type, - subcategory, - COUNT(*) as product_count - FROM dutchie_products - WHERE dispensary_id = $1 AND type IS NOT NULL - GROUP BY type, subcategory - ORDER BY type, subcategory - `, [id]); - // Get brands with counts for this store - const { rows: brands } = await (0, connection_1.query)(` - SELECT - brand_name, - COUNT(*) as product_count - FROM dutchie_products - WHERE dispensary_id = $1 AND brand_name IS NOT NULL - GROUP BY brand_name - ORDER BY product_count DESC - `, [id]); - // Get last crawl info - const { rows: lastCrawl } = await (0, connection_1.query)(` - SELECT - id, - status, - started_at, - completed_at, - products_found, - products_new, - products_updated, - error_message - FROM dispensary_crawl_jobs - WHERE dispensary_id = $1 - ORDER BY created_at DESC - LIMIT 1 - `, [id]); - const counts = countRows[0] || {}; - res.json({ - dispensary, - totalProducts: parseInt(counts.total_products || '0', 10), - 
inStockCount: parseInt(counts.in_stock_count || '0', 10), - outOfStockCount: parseInt(counts.out_of_stock_count || '0', 10), - unknownStockCount: parseInt(counts.unknown_count || '0', 10), - missingFromFeedCount: parseInt(counts.missing_count || '0', 10), - categories, - brands, - brandCount: brands.length, - categoryCount: categories.length, - lastCrawl: lastCrawl[0] || null, - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/stores/:id/products - * Get paginated products for a store with latest snapshot data - */ -router.get('/stores/:id/products', async (req, res) => { - try { - const { id } = req.params; - const { stockStatus, type, subcategory, brandName, search, limit = '50', offset = '0', } = req.query; - let whereClause = 'WHERE p.dispensary_id = $1'; - const params = [parseInt(id, 10)]; - let paramIndex = 2; - if (stockStatus) { - whereClause += ` AND p.stock_status = $${paramIndex}`; - params.push(stockStatus); - paramIndex++; - } - if (type) { - whereClause += ` AND p.type = $${paramIndex}`; - params.push(type); - paramIndex++; - } - if (subcategory) { - whereClause += ` AND p.subcategory = $${paramIndex}`; - params.push(subcategory); - paramIndex++; - } - if (brandName) { - whereClause += ` AND p.brand_name ILIKE $${paramIndex}`; - params.push(`%${brandName}%`); - paramIndex++; - } - if (search) { - whereClause += ` AND (p.name ILIKE $${paramIndex} OR p.brand_name ILIKE $${paramIndex})`; - params.push(`%${search}%`); - paramIndex++; - } - params.push(parseInt(limit, 10), parseInt(offset, 10)); - // Get products with their latest snapshot data - const { rows: products } = await (0, connection_1.query)(` - SELECT - p.id, - p.external_product_id, - p.name, - p.brand_name, - p.type, - p.subcategory, - p.strain_type, - p.stock_status, - p.created_at, - p.updated_at, - p.primary_image_url, - p.thc_content, - p.cbd_content, - -- Latest snapshot data (prices in cents) - s.rec_min_price_cents, - 
s.rec_max_price_cents, - s.med_min_price_cents, - s.med_max_price_cents, - s.rec_min_special_price_cents, - s.med_min_special_price_cents, - s.total_quantity_available, - s.options, - s.stock_status as snapshot_stock_status, - s.crawled_at as snapshot_at - FROM dutchie_products p - LEFT JOIN LATERAL ( - SELECT * FROM dutchie_product_snapshots - WHERE dutchie_product_id = p.id - ORDER BY crawled_at DESC - LIMIT 1 - ) s ON true - ${whereClause} - ORDER BY p.updated_at DESC - LIMIT $${paramIndex} OFFSET $${paramIndex + 1} - `, params); - // Get total count - const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM dutchie_products p ${whereClause}`, params.slice(0, -2)); - // Transform products for frontend compatibility - const transformedProducts = products.map((p) => ({ - id: p.id, - external_id: p.external_product_id, - name: p.name, - brand: p.brand_name, - type: p.type, - subcategory: p.subcategory, - strain_type: p.strain_type, - stock_status: p.snapshot_stock_status || p.stock_status, - in_stock: (p.snapshot_stock_status || p.stock_status) === 'in_stock', - // Prices from latest snapshot (convert cents to dollars) - regular_price: p.rec_min_price_cents ? p.rec_min_price_cents / 100 : null, - regular_price_max: p.rec_max_price_cents ? p.rec_max_price_cents / 100 : null, - sale_price: p.rec_min_special_price_cents ? p.rec_min_special_price_cents / 100 : null, - med_price: p.med_min_price_cents ? p.med_min_price_cents / 100 : null, - med_price_max: p.med_max_price_cents ? p.med_max_price_cents / 100 : null, - med_sale_price: p.med_min_special_price_cents ? 
p.med_min_special_price_cents / 100 : null, - // Potency from products table - thc_percentage: p.thc_content, - cbd_percentage: p.cbd_content, - // Images from products table - image_url: p.primary_image_url, - // Other - options: p.options, - total_quantity: p.total_quantity_available, - // Timestamps - created_at: p.created_at, - updated_at: p.updated_at, - snapshot_at: p.snapshot_at, - })); - res.json({ - products: transformedProducts, - total: parseInt(countRows[0]?.total || '0', 10), - limit: parseInt(limit, 10), - offset: parseInt(offset, 10), - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/stores/:id/brands - * Get brands for a specific store - */ -router.get('/stores/:id/brands', async (req, res) => { - try { - const { id } = req.params; - const { rows: brands } = await (0, connection_1.query)(` - SELECT - brand_name as brand, - COUNT(*) as product_count - FROM dutchie_products - WHERE dispensary_id = $1 AND brand_name IS NOT NULL - GROUP BY brand_name - ORDER BY product_count DESC - `, [parseInt(id, 10)]); - res.json({ brands }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/stores/:id/categories - * Get categories for a specific store - */ -router.get('/stores/:id/categories', async (req, res) => { - try { - const { id } = req.params; - const { rows: categories } = await (0, connection_1.query)(` - SELECT - type, - subcategory, - COUNT(*) as product_count - FROM dutchie_products - WHERE dispensary_id = $1 AND type IS NOT NULL - GROUP BY type, subcategory - ORDER BY type, subcategory - `, [parseInt(id, 10)]); - res.json({ categories }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -// ============================================================ -// PRODUCTS -// ============================================================ -/** - * GET /api/dutchie-az/products - * List products with filtering 
on our own DB - */ -router.get('/products', async (req, res) => { - try { - const { storeId, stockStatus, type, subcategory, brandName, search, limit = '50', offset = '0', } = req.query; - let whereClause = 'WHERE 1=1'; - const params = []; - let paramIndex = 1; - if (storeId) { - whereClause += ` AND dispensary_id = $${paramIndex}`; - params.push(parseInt(storeId, 10)); - paramIndex++; - } - if (stockStatus) { - whereClause += ` AND stock_status = $${paramIndex}`; - params.push(stockStatus); - paramIndex++; - } - if (type) { - whereClause += ` AND type = $${paramIndex}`; - params.push(type); - paramIndex++; - } - if (subcategory) { - whereClause += ` AND subcategory = $${paramIndex}`; - params.push(subcategory); - paramIndex++; - } - if (brandName) { - whereClause += ` AND brand_name ILIKE $${paramIndex}`; - params.push(`%${brandName}%`); - paramIndex++; - } - if (search) { - whereClause += ` AND (name ILIKE $${paramIndex} OR brand_name ILIKE $${paramIndex})`; - params.push(`%${search}%`); - paramIndex++; - } - params.push(parseInt(limit, 10), parseInt(offset, 10)); - const { rows } = await (0, connection_1.query)(` - SELECT - p.*, - d.name as store_name, - d.city as store_city - FROM dutchie_products p - JOIN dispensaries d ON p.dispensary_id = d.id - ${whereClause} - ORDER BY p.updated_at DESC - LIMIT $${paramIndex} OFFSET $${paramIndex + 1} - `, params); - // Get total count - const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM dutchie_products ${whereClause}`, params.slice(0, -2)); - res.json({ - products: rows, - total: parseInt(countRows[0]?.total || '0', 10), - limit: parseInt(limit, 10), - offset: parseInt(offset, 10), - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/products/:id - * Get a single product with its latest snapshot - */ -router.get('/products/:id', async (req, res) => { - try { - const { id } = req.params; - const { rows: productRows } = 
await (0, connection_1.query)(` - SELECT - p.*, - d.name as store_name, - d.city as store_city, - d.slug as store_slug - FROM dutchie_products p - JOIN dispensaries d ON p.dispensary_id = d.id - WHERE p.id = $1 - `, [id]); - if (productRows.length === 0) { - return res.status(404).json({ error: 'Product not found' }); - } - // Get latest snapshot - const { rows: snapshotRows } = await (0, connection_1.query)(` - SELECT * FROM dutchie_product_snapshots - WHERE dutchie_product_id = $1 - ORDER BY crawled_at DESC - LIMIT 1 - `, [id]); - res.json({ - product: productRows[0], - latestSnapshot: snapshotRows[0] || null, - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/products/:id/snapshots - * Get snapshot history for a product - */ -router.get('/products/:id/snapshots', async (req, res) => { - try { - const { id } = req.params; - const { limit = '50', offset = '0' } = req.query; - const { rows } = await (0, connection_1.query)(` - SELECT * FROM dutchie_product_snapshots - WHERE dutchie_product_id = $1 - ORDER BY crawled_at DESC - LIMIT $2 OFFSET $3 - `, [id, parseInt(limit, 10), parseInt(offset, 10)]); - res.json({ snapshots: rows }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -// ============================================================ -// CATEGORIES -// ============================================================ -/** - * GET /api/dutchie-az/categories - * Get all categories with counts - */ -router.get('/categories', async (_req, res) => { - try { - const { rows } = await (0, connection_1.query)(`SELECT * FROM v_categories`); - res.json({ categories: rows }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -// ============================================================ -// BRANDS -// ============================================================ -/** - * GET /api/dutchie-az/brands - * Get all brands with counts - */ 
-router.get('/brands', async (req, res) => { - try { - const { limit = '100', offset = '0' } = req.query; - const { rows } = await (0, connection_1.query)(` - SELECT * FROM v_brands - LIMIT $1 OFFSET $2 - `, [parseInt(limit, 10), parseInt(offset, 10)]); - res.json({ brands: rows }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -// ============================================================ -// ADMIN ACTIONS -// ============================================================ -/** - * POST /api/dutchie-az/admin/init-schema - * Initialize the database schema - */ -router.post('/admin/init-schema', async (_req, res) => { - try { - await (0, schema_1.ensureSchema)(); - res.json({ success: true, message: 'Schema initialized' }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/import-azdhs - * Import dispensaries from AZDHS (main database) - */ -router.post('/admin/import-azdhs', async (_req, res) => { - try { - const result = await (0, azdhs_import_1.importAZDHSDispensaries)(); - res.json(result); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/resolve-platform-ids - * Resolve Dutchie platform IDs for all dispensaries - */ -router.post('/admin/resolve-platform-ids', async (_req, res) => { - try { - const result = await (0, discovery_1.resolvePlatformDispensaryIds)(); - res.json(result); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/crawl-store/:id - * Crawl a single store - */ -router.post('/admin/crawl-store/:id', async (req, res) => { - try { - const { id } = req.params; - const { pricingType = 'rec', useBothModes = true } = req.body; - const dispensary = await (0, discovery_1.getDispensaryById)(parseInt(id, 10)); - if (!dispensary) { - return res.status(404).json({ error: 'Store not found' }); - } - const result = await (0, 
product_crawler_1.crawlDispensaryProducts)(dispensary, pricingType, { useBothModes }); - res.json(result); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/admin/stats - * Get import and crawl statistics - */ -router.get('/admin/stats', async (_req, res) => { - try { - const importStats = await (0, azdhs_import_1.getImportStats)(); - // Get stock status distribution - const { rows: stockStats } = await (0, connection_1.query)(` - SELECT - stock_status, - COUNT(*) as count - FROM dutchie_products - GROUP BY stock_status - `); - // Get recent crawl jobs - const { rows: recentJobs } = await (0, connection_1.query)(` - SELECT * FROM dispensary_crawl_jobs - ORDER BY created_at DESC - LIMIT 10 - `); - res.json({ - import: importStats, - stockDistribution: stockStats, - recentJobs, - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -// ============================================================ -// SCHEDULER ADMIN -// ============================================================ -/** - * GET /api/dutchie-az/admin/scheduler/status - * Get scheduler status - */ -router.get('/admin/scheduler/status', async (_req, res) => { - try { - const status = (0, scheduler_1.getSchedulerStatus)(); - res.json(status); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/scheduler/start - * Start the scheduler - */ -router.post('/admin/scheduler/start', async (_req, res) => { - try { - (0, scheduler_1.startScheduler)(); - res.json({ success: true, message: 'Scheduler started' }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/scheduler/stop - * Stop the scheduler - */ -router.post('/admin/scheduler/stop', async (_req, res) => { - try { - (0, scheduler_1.stopScheduler)(); - res.json({ success: true, message: 'Scheduler stopped' }); - } - catch (error) { - 
res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/scheduler/trigger - * Trigger an immediate crawl cycle - */ -router.post('/admin/scheduler/trigger', async (_req, res) => { - try { - const result = await (0, scheduler_1.triggerImmediateCrawl)(); - res.json(result); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/crawl/:id - * Crawl a single dispensary with job tracking - */ -router.post('/admin/crawl/:id', async (req, res) => { - try { - const { id } = req.params; - const { pricingType = 'rec', useBothModes = true } = req.body; - // Fetch the dispensary first - const dispensary = await (0, discovery_1.getDispensaryById)(parseInt(id, 10)); - if (!dispensary) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - const result = await (0, scheduler_1.crawlSingleDispensary)(dispensary, pricingType, { useBothModes }); - res.json(result); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -const job_queue_1 = require("../services/job-queue"); -/** - * GET /api/dutchie-az/admin/dutchie-stores - * Get all Dutchie stores with their crawl status - */ -router.get('/admin/dutchie-stores', async (_req, res) => { - try { - const { rows } = await (0, connection_1.query)(` - SELECT - d.id, - d.name, - d.dba_name, - d.city, - d.state, - d.menu_type, - d.platform_dispensary_id, - d.menu_url, - d.website, - d.last_crawl_at, - d.consecutive_failures, - d.failed_at, - ( - SELECT COUNT(*) - FROM dutchie_products - WHERE dispensary_id = d.id - ) as product_count, - ( - SELECT MAX(crawled_at) - FROM dutchie_product_snapshots s - JOIN dutchie_products p ON s.dutchie_product_id = p.id - WHERE p.dispensary_id = d.id - ) as last_snapshot_at - FROM dispensaries d - WHERE d.menu_type = 'dutchie' - AND d.state = 'AZ' - ORDER BY d.name - `); - const ready = rows.filter((r) => r.platform_dispensary_id && !r.failed_at); - const 
needsPlatformId = rows.filter((r) => !r.platform_dispensary_id && !r.failed_at); - const failed = rows.filter((r) => r.failed_at); - res.json({ - total: rows.length, - ready: ready.length, - needsPlatformId: needsPlatformId.length, - failed: failed.length, - stores: rows.map((r) => ({ - id: r.id, - name: r.dba_name || r.name, - city: r.city, - state: r.state, - menuType: r.menu_type, - platformDispensaryId: r.platform_dispensary_id, - menuUrl: r.menu_url, - website: r.website, - lastCrawlAt: r.last_crawl_at, - productCount: parseInt(r.product_count || '0', 10), - lastSnapshotAt: r.last_snapshot_at, - status: r.failed_at - ? 'failed' - : r.platform_dispensary_id - ? 'ready' - : 'needs_platform_id', - })), - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/crawl-all - * Enqueue crawl jobs for ALL ready Dutchie stores - * This is a convenience endpoint to queue all stores without triggering the scheduler - */ -router.post('/admin/crawl-all', async (req, res) => { - try { - const { pricingType = 'rec', useBothModes = true } = req.body; - // Get all "ready" dispensaries (menu_type='dutchie' AND platform_dispensary_id IS NOT NULL AND not failed) - const { rows: rawRows } = await (0, connection_1.query)(` - SELECT id, name, platform_dispensary_id FROM dispensaries - WHERE state = 'AZ' - AND menu_type = 'dutchie' - AND platform_dispensary_id IS NOT NULL - AND failed_at IS NULL - ORDER BY last_crawl_at ASC NULLS FIRST - `); - if (rawRows.length === 0) { - return res.json({ - success: true, - message: 'No ready dispensaries to crawl. 
Run menu detection first.', - enqueued: 0, - skipped: 0, - dispensaries: [], - }); - } - const dispensaryIds = rawRows.map((r) => r.id); - // Bulk enqueue jobs (skips dispensaries that already have pending/running jobs) - const { enqueued, skipped } = await (0, job_queue_1.bulkEnqueueJobs)('dutchie_product_crawl', dispensaryIds, { - priority: 0, - metadata: { pricingType, useBothModes }, - }); - // Get current queue stats - const queueStats = await (0, job_queue_1.getQueueStats)(); - res.json({ - success: true, - message: `Enqueued ${enqueued} crawl jobs for Dutchie stores`, - totalReady: rawRows.length, - enqueued, - skipped, - queueStats, - dispensaries: rawRows.map((r) => ({ - id: r.id, - name: r.name, - platformDispensaryId: r.platform_dispensary_id, - })), - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/admin/jobs - * Get crawl job history - */ -router.get('/admin/jobs', async (req, res) => { - try { - const { status, dispensaryId, limit = '50', offset = '0' } = req.query; - let whereClause = 'WHERE 1=1'; - const params = []; - let paramIndex = 1; - if (status) { - whereClause += ` AND status = $${paramIndex}`; - params.push(status); - paramIndex++; - } - if (dispensaryId) { - whereClause += ` AND dispensary_id = $${paramIndex}`; - params.push(parseInt(dispensaryId, 10)); - paramIndex++; - } - params.push(parseInt(limit, 10), parseInt(offset, 10)); - const { rows } = await (0, connection_1.query)(` - SELECT - cj.*, - d.name as dispensary_name, - d.slug as dispensary_slug - FROM dispensary_crawl_jobs cj - LEFT JOIN dispensaries d ON cj.dispensary_id = d.id - ${whereClause} - ORDER BY cj.created_at DESC - LIMIT $${paramIndex} OFFSET $${paramIndex + 1} - `, params); - const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM dispensary_crawl_jobs ${whereClause}`, params.slice(0, -2)); - res.json({ - jobs: rows, - total: parseInt(countRows[0]?.total || '0', 10), - 
limit: parseInt(limit, 10), - offset: parseInt(offset, 10), - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -// ============================================================ -// SCHEDULES (CONFIG CRUD) -// ============================================================ -/** - * GET /api/dutchie-az/admin/schedules - * Get all schedule configurations - */ -router.get('/admin/schedules', async (_req, res) => { - try { - const schedules = await (0, scheduler_1.getAllSchedules)(); - res.json({ schedules }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/admin/schedules/:id - * Get a single schedule by ID - */ -router.get('/admin/schedules/:id', async (req, res) => { - try { - const { id } = req.params; - const schedule = await (0, scheduler_1.getScheduleById)(parseInt(id, 10)); - if (!schedule) { - return res.status(404).json({ error: 'Schedule not found' }); - } - res.json(schedule); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/schedules - * Create a new schedule - */ -router.post('/admin/schedules', async (req, res) => { - try { - const { jobName, description, enabled = true, baseIntervalMinutes, jitterMinutes, jobConfig, startImmediately = false, } = req.body; - if (!jobName || typeof baseIntervalMinutes !== 'number' || typeof jitterMinutes !== 'number') { - return res.status(400).json({ - error: 'jobName, baseIntervalMinutes, and jitterMinutes are required', - }); - } - const schedule = await (0, scheduler_1.createSchedule)({ - jobName, - description, - enabled, - baseIntervalMinutes, - jitterMinutes, - jobConfig, - startImmediately, - }); - res.status(201).json(schedule); - } - catch (error) { - // Handle unique constraint violation - if (error.code === '23505') { - return res.status(409).json({ error: `Schedule "${req.body.jobName}" already exists` }); - } - res.status(500).json({ error: 
error.message }); - } -}); -/** - * PUT /api/dutchie-az/admin/schedules/:id - * Update a schedule - */ -router.put('/admin/schedules/:id', async (req, res) => { - try { - const { id } = req.params; - const { description, enabled, baseIntervalMinutes, jitterMinutes, jobConfig } = req.body; - const schedule = await (0, scheduler_1.updateSchedule)(parseInt(id, 10), { - description, - enabled, - baseIntervalMinutes, - jitterMinutes, - jobConfig, - }); - if (!schedule) { - return res.status(404).json({ error: 'Schedule not found' }); - } - res.json(schedule); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * DELETE /api/dutchie-az/admin/schedules/:id - * Delete a schedule - */ -router.delete('/admin/schedules/:id', async (req, res) => { - try { - const { id } = req.params; - const deleted = await (0, scheduler_1.deleteSchedule)(parseInt(id, 10)); - if (!deleted) { - return res.status(404).json({ error: 'Schedule not found' }); - } - res.json({ success: true, message: 'Schedule deleted' }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/schedules/:id/trigger - * Trigger immediate execution of a schedule - */ -router.post('/admin/schedules/:id/trigger', async (req, res) => { - try { - const { id } = req.params; - const result = await (0, scheduler_1.triggerScheduleNow)(parseInt(id, 10)); - if (!result.success) { - return res.status(400).json({ error: result.message }); - } - res.json(result); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/schedules/init - * Initialize default schedules if they don't exist - */ -router.post('/admin/schedules/init', async (_req, res) => { - try { - await (0, scheduler_1.initializeDefaultSchedules)(); - const schedules = await (0, scheduler_1.getAllSchedules)(); - res.json({ success: true, schedules }); - } - catch (error) { - res.status(500).json({ error: 
error.message }); - } -}); -/** - * GET /api/dutchie-az/admin/schedules/:id/logs - * Get run logs for a specific schedule - */ -router.get('/admin/schedules/:id/logs', async (req, res) => { - try { - const { id } = req.params; - const { limit = '50', offset = '0' } = req.query; - const result = await (0, scheduler_1.getRunLogs)({ - scheduleId: parseInt(id, 10), - limit: parseInt(limit, 10), - offset: parseInt(offset, 10), - }); - res.json(result); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/admin/run-logs - * Get all run logs with filtering - */ -router.get('/admin/run-logs', async (req, res) => { - try { - const { scheduleId, jobName, limit = '50', offset = '0' } = req.query; - const result = await (0, scheduler_1.getRunLogs)({ - scheduleId: scheduleId ? parseInt(scheduleId, 10) : undefined, - jobName: jobName, - limit: parseInt(limit, 10), - offset: parseInt(offset, 10), - }); - res.json(result); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -// ============================================================ -// DEBUG ROUTES -// ============================================================ -/** - * GET /api/dutchie-az/debug/summary - * Get overall system summary for debugging - */ -router.get('/debug/summary', async (_req, res) => { - try { - // Get table counts - const { rows: tableCounts } = await (0, connection_1.query)(` - SELECT - (SELECT COUNT(*) FROM dispensaries) as dispensary_count, - (SELECT COUNT(*) FROM dispensaries WHERE platform_dispensary_id IS NOT NULL) as dispensaries_with_platform_id, - (SELECT COUNT(*) FROM dutchie_products) as product_count, - (SELECT COUNT(*) FROM dutchie_product_snapshots) as snapshot_count, - (SELECT COUNT(*) FROM dispensary_crawl_jobs) as job_count, - (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed') as completed_jobs, - (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'failed') as failed_jobs - 
`); - // Get stock status distribution - const { rows: stockDistribution } = await (0, connection_1.query)(` - SELECT - stock_status, - COUNT(*) as count - FROM dutchie_products - GROUP BY stock_status - ORDER BY count DESC - `); - // Get products by dispensary - const { rows: productsByDispensary } = await (0, connection_1.query)(` - SELECT - d.id, - d.name, - d.slug, - d.platform_dispensary_id, - COUNT(p.id) as product_count, - MAX(p.updated_at) as last_product_update - FROM dispensaries d - LEFT JOIN dutchie_products p ON d.id = p.dispensary_id - WHERE d.state = 'AZ' - GROUP BY d.id, d.name, d.slug, d.platform_dispensary_id - ORDER BY product_count DESC - LIMIT 20 - `); - // Get recent snapshots - const { rows: recentSnapshots } = await (0, connection_1.query)(` - SELECT - s.id, - s.dutchie_product_id, - p.name as product_name, - d.name as dispensary_name, - s.crawled_at - FROM dutchie_product_snapshots s - JOIN dutchie_products p ON s.dutchie_product_id = p.id - JOIN dispensaries d ON p.dispensary_id = d.id - ORDER BY s.crawled_at DESC - LIMIT 10 - `); - res.json({ - tableCounts: tableCounts[0], - stockDistribution, - productsByDispensary, - recentSnapshots, - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/debug/store/:id - * Get detailed debug info for a specific store - */ -router.get('/debug/store/:id', async (req, res) => { - try { - const { id } = req.params; - // Get dispensary info - const { rows: dispensaryRows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [parseInt(id, 10)]); - if (dispensaryRows.length === 0) { - return res.status(404).json({ error: 'Store not found' }); - } - const dispensary = dispensaryRows[0]; - // Get product stats - const { rows: productStats } = await (0, connection_1.query)(` - SELECT - COUNT(*) as total_products, - COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock, - COUNT(*) FILTER (WHERE 
stock_status = 'out_of_stock') as out_of_stock, - COUNT(*) FILTER (WHERE stock_status = 'unknown') as unknown, - COUNT(*) FILTER (WHERE stock_status = 'missing_from_feed') as missing_from_feed, - MIN(first_seen_at) as earliest_product, - MAX(last_seen_at) as latest_product, - MAX(updated_at) as last_update - FROM dutchie_products - WHERE dispensary_id = $1 - `, [id]); - // Get snapshot stats - const { rows: snapshotStats } = await (0, connection_1.query)(` - SELECT - COUNT(*) as total_snapshots, - MIN(crawled_at) as earliest_snapshot, - MAX(crawled_at) as latest_snapshot, - COUNT(DISTINCT dutchie_product_id) as products_with_snapshots - FROM dutchie_product_snapshots s - JOIN dutchie_products p ON s.dutchie_product_id = p.id - WHERE p.dispensary_id = $1 - `, [id]); - // Get crawl job history - const { rows: recentJobs } = await (0, connection_1.query)(` - SELECT - id, - status, - started_at, - completed_at, - products_found, - products_new, - products_updated, - error_message, - created_at - FROM dispensary_crawl_jobs - WHERE dispensary_id = $1 - ORDER BY created_at DESC - LIMIT 10 - `, [id]); - // Get sample products (5 in-stock, 5 out-of-stock) - const { rows: sampleInStock } = await (0, connection_1.query)(` - SELECT - p.id, - p.name, - p.brand_name, - p.type, - p.stock_status, - p.updated_at - FROM dutchie_products p - WHERE p.dispensary_id = $1 AND p.stock_status = 'in_stock' - ORDER BY p.updated_at DESC - LIMIT 5 - `, [id]); - const { rows: sampleOutOfStock } = await (0, connection_1.query)(` - SELECT - p.id, - p.name, - p.brand_name, - p.type, - p.stock_status, - p.updated_at - FROM dutchie_products p - WHERE p.dispensary_id = $1 AND p.stock_status = 'out_of_stock' - ORDER BY p.updated_at DESC - LIMIT 5 - `, [id]); - // Get categories breakdown - const { rows: categories } = await (0, connection_1.query)(` - SELECT - type, - subcategory, - COUNT(*) as count - FROM dutchie_products - WHERE dispensary_id = $1 - GROUP BY type, subcategory - ORDER BY count DESC 
- `, [id]); - res.json({ - dispensary, - productStats: productStats[0], - snapshotStats: snapshotStats[0], - recentJobs, - sampleProducts: { - inStock: sampleInStock, - outOfStock: sampleOutOfStock, - }, - categories, - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -// ============================================================ -// LIVE CRAWLER STATUS ROUTES -// ============================================================ -const job_queue_2 = require("../services/job-queue"); -/** - * GET /api/dutchie-az/monitor/active-jobs - * Get all currently running jobs with real-time status including worker info - */ -router.get('/monitor/active-jobs', async (_req, res) => { - try { - // Get running jobs from job_run_logs (scheduled jobs like "enqueue all") - const { rows: runningScheduledJobs } = await (0, connection_1.query)(` - SELECT - jrl.id, - jrl.schedule_id, - jrl.job_name, - jrl.status, - jrl.started_at, - jrl.items_processed, - jrl.items_succeeded, - jrl.items_failed, - jrl.metadata, - js.description as job_description, - EXTRACT(EPOCH FROM (NOW() - jrl.started_at)) as duration_seconds - FROM job_run_logs jrl - LEFT JOIN job_schedules js ON jrl.schedule_id = js.id - WHERE jrl.status = 'running' - ORDER BY jrl.started_at DESC - `); - // Get running crawl jobs (individual store crawls with worker info) - const { rows: runningCrawlJobs } = await (0, connection_1.query)(` - SELECT - cj.id, - cj.job_type, - cj.dispensary_id, - d.name as dispensary_name, - d.city, - d.platform_dispensary_id, - cj.status, - cj.started_at, - cj.claimed_by as worker_id, - cj.worker_hostname, - cj.claimed_at, - cj.products_found, - cj.products_upserted, - cj.snapshots_created, - cj.current_page, - cj.total_pages, - cj.last_heartbeat_at, - cj.retry_count, - cj.metadata, - EXTRACT(EPOCH FROM (NOW() - cj.started_at)) as duration_seconds - FROM dispensary_crawl_jobs cj - LEFT JOIN dispensaries d ON cj.dispensary_id = d.id - WHERE cj.status = 'running' - 
ORDER BY cj.started_at DESC - `); - // Get queue stats - const queueStats = await (0, job_queue_2.getQueueStats)(); - // Get active workers - const activeWorkers = await (0, job_queue_2.getActiveWorkers)(); - // Also get in-memory scrapers if any (from the legacy system) - let inMemoryScrapers = []; - try { - const { activeScrapers } = await Promise.resolve().then(() => __importStar(require('../../routes/scraper-monitor'))); - inMemoryScrapers = Array.from(activeScrapers.values()).map(scraper => ({ - ...scraper, - source: 'in_memory', - duration_seconds: (Date.now() - scraper.startTime.getTime()) / 1000, - })); - } - catch { - // Legacy scraper monitor not available - } - res.json({ - scheduledJobs: runningScheduledJobs, - crawlJobs: runningCrawlJobs, - inMemoryScrapers, - activeWorkers, - queueStats, - totalActive: runningScheduledJobs.length + runningCrawlJobs.length + inMemoryScrapers.length, - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/monitor/recent-jobs - * Get recent completed jobs - */ -router.get('/monitor/recent-jobs', async (req, res) => { - try { - const { limit = '50' } = req.query; - const limitNum = Math.min(parseInt(limit, 10), 200); - // Recent job run logs - const { rows: recentJobLogs } = await (0, connection_1.query)(` - SELECT - jrl.id, - jrl.schedule_id, - jrl.job_name, - jrl.status, - jrl.started_at, - jrl.completed_at, - jrl.duration_ms, - jrl.error_message, - jrl.items_processed, - jrl.items_succeeded, - jrl.items_failed, - jrl.metadata, - js.description as job_description - FROM job_run_logs jrl - LEFT JOIN job_schedules js ON jrl.schedule_id = js.id - ORDER BY jrl.created_at DESC - LIMIT $1 - `, [limitNum]); - // Recent crawl jobs - const { rows: recentCrawlJobs } = await (0, connection_1.query)(` - SELECT - cj.id, - cj.job_type, - cj.dispensary_id, - d.name as dispensary_name, - d.city, - cj.status, - cj.started_at, - cj.completed_at, - cj.error_message, - 
cj.products_found, - cj.snapshots_created, - cj.metadata, - EXTRACT(EPOCH FROM (COALESCE(cj.completed_at, NOW()) - cj.started_at)) * 1000 as duration_ms - FROM dispensary_crawl_jobs cj - LEFT JOIN dispensaries d ON cj.dispensary_id = d.id - ORDER BY cj.created_at DESC - LIMIT $1 - `, [limitNum]); - res.json({ - jobLogs: recentJobLogs, - crawlJobs: recentCrawlJobs, - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/monitor/errors - * Get recent job errors - */ -router.get('/monitor/errors', async (req, res) => { - try { - const { limit = '20', hours = '24' } = req.query; - const limitNum = Math.min(parseInt(limit, 10), 100); - const hoursNum = Math.min(parseInt(hours, 10), 168); - // Errors from job_run_logs - const { rows: jobErrors } = await (0, connection_1.query)(` - SELECT - 'job_run_log' as source, - jrl.id, - jrl.job_name, - jrl.status, - jrl.started_at, - jrl.completed_at, - jrl.error_message, - jrl.items_processed, - jrl.items_failed, - jrl.metadata - FROM job_run_logs jrl - WHERE jrl.status IN ('error', 'partial') - AND jrl.created_at > NOW() - INTERVAL '${hoursNum} hours' - ORDER BY jrl.created_at DESC - LIMIT $1 - `, [limitNum]); - // Errors from dispensary_crawl_jobs - const { rows: crawlErrors } = await (0, connection_1.query)(` - SELECT - 'crawl_job' as source, - cj.id, - cj.job_type as job_name, - d.name as dispensary_name, - cj.status, - cj.started_at, - cj.completed_at, - cj.error_message, - cj.products_found as items_processed, - cj.metadata - FROM dispensary_crawl_jobs cj - LEFT JOIN dispensaries d ON cj.dispensary_id = d.id - WHERE cj.status = 'failed' - AND cj.created_at > NOW() - INTERVAL '${hoursNum} hours' - ORDER BY cj.created_at DESC - LIMIT $1 - `, [limitNum]); - res.json({ - errors: [...jobErrors, ...crawlErrors].sort((a, b) => new Date(b.started_at || b.created_at).getTime() - - new Date(a.started_at || a.created_at).getTime()).slice(0, limitNum), - }); - } - catch (error) 
{ - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/monitor/summary - * Get overall monitoring summary - */ -router.get('/monitor/summary', async (_req, res) => { - try { - const { rows: stats } = await (0, connection_1.query)(` - SELECT - (SELECT COUNT(*) FROM job_run_logs WHERE status = 'running') as running_scheduled_jobs, - (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'running') as running_dispensary_crawl_jobs, - (SELECT COUNT(*) FROM job_run_logs WHERE status = 'success' AND created_at > NOW() - INTERVAL '24 hours') as successful_jobs_24h, - (SELECT COUNT(*) FROM job_run_logs WHERE status IN ('error', 'partial') AND created_at > NOW() - INTERVAL '24 hours') as failed_jobs_24h, - (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'completed' AND created_at > NOW() - INTERVAL '24 hours') as successful_crawls_24h, - (SELECT COUNT(*) FROM dispensary_crawl_jobs WHERE status = 'failed' AND created_at > NOW() - INTERVAL '24 hours') as failed_crawls_24h, - (SELECT SUM(products_found) FROM dispensary_crawl_jobs WHERE status = 'completed' AND created_at > NOW() - INTERVAL '24 hours') as products_found_24h, - (SELECT SUM(snapshots_created) FROM dispensary_crawl_jobs WHERE status = 'completed' AND created_at > NOW() - INTERVAL '24 hours') as snapshots_created_24h, - (SELECT MAX(started_at) FROM job_run_logs) as last_job_started, - (SELECT MAX(completed_at) FROM job_run_logs WHERE status = 'success') as last_job_completed - `); - // Get next scheduled runs - const { rows: nextRuns } = await (0, connection_1.query)(` - SELECT - id, - job_name, - description, - enabled, - next_run_at, - last_status, - last_run_at - FROM job_schedules - WHERE enabled = true AND next_run_at IS NOT NULL - ORDER BY next_run_at ASC - LIMIT 5 - `); - res.json({ - ...(stats[0] || {}), - nextRuns, - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -// 
============================================================ -// MENU DETECTION ROUTES -// ============================================================ -const menu_detection_1 = require("../services/menu-detection"); -/** - * GET /api/dutchie-az/admin/detection/stats - * Get menu detection statistics - */ -router.get('/admin/detection/stats', async (_req, res) => { - try { - const stats = await (0, menu_detection_1.getDetectionStats)(); - res.json(stats); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/admin/detection/pending - * Get dispensaries that need menu detection - */ -router.get('/admin/detection/pending', async (req, res) => { - try { - const { state = 'AZ', limit = '100' } = req.query; - const dispensaries = await (0, menu_detection_1.getDispensariesNeedingDetection)({ - state: state, - limit: parseInt(limit, 10), - }); - res.json({ dispensaries, total: dispensaries.length }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/detection/detect/:id - * Detect menu provider and resolve platform ID for a single dispensary - */ -router.post('/admin/detection/detect/:id', async (req, res) => { - try { - const { id } = req.params; - const result = await (0, menu_detection_1.detectAndResolveDispensary)(parseInt(id, 10)); - res.json(result); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/detection/detect-all - * Run bulk menu detection on all dispensaries needing it - */ -router.post('/admin/detection/detect-all', async (req, res) => { - try { - const { state = 'AZ', onlyUnknown = true, onlyMissingPlatformId = false, limit } = req.body; - const result = await (0, menu_detection_1.runBulkDetection)({ - state, - onlyUnknown, - onlyMissingPlatformId, - limit, - }); - res.json(result); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - 
* POST /api/dutchie-az/admin/detection/trigger - * Trigger the menu detection scheduled job immediately - */ -router.post('/admin/detection/trigger', async (_req, res) => { - try { - // Find the menu detection schedule and trigger it - const schedules = await (0, scheduler_1.getAllSchedules)(); - const menuDetection = schedules.find(s => s.jobName === 'dutchie_az_menu_detection'); - if (!menuDetection) { - return res.status(404).json({ error: 'Menu detection schedule not found. Run /admin/schedules/init first.' }); - } - const result = await (0, scheduler_1.triggerScheduleNow)(menuDetection.id); - res.json(result); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -// ============================================================ -// FAILED DISPENSARIES ROUTES -// ============================================================ -/** - * GET /api/dutchie-az/admin/dispensaries/failed - * Get all dispensaries flagged as failed (for admin review) - */ -router.get('/admin/dispensaries/failed', async (_req, res) => { - try { - const { rows } = await (0, connection_1.query)(` - SELECT - id, - name, - city, - state, - menu_url, - menu_type, - platform_dispensary_id, - consecutive_failures, - last_failure_at, - last_failure_reason, - failed_at, - failure_notes, - last_crawl_at, - updated_at - FROM dispensaries - WHERE failed_at IS NOT NULL - ORDER BY failed_at DESC - `); - res.json({ - failed: rows, - total: rows.length, - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/admin/dispensaries/at-risk - * Get dispensaries with high failure counts (but not yet flagged as failed) - */ -router.get('/admin/dispensaries/at-risk', async (_req, res) => { - try { - const { rows } = await (0, connection_1.query)(` - SELECT - id, - name, - city, - state, - menu_url, - menu_type, - consecutive_failures, - last_failure_at, - last_failure_reason, - last_crawl_at - FROM dispensaries - WHERE 
consecutive_failures >= 1 - AND failed_at IS NULL - ORDER BY consecutive_failures DESC, last_failure_at DESC - `); - res.json({ - atRisk: rows, - total: rows.length, - }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/dispensaries/:id/unfail - * Restore a failed dispensary - clears failed status and resets for re-detection - */ -router.post('/admin/dispensaries/:id/unfail', async (req, res) => { - try { - const { id } = req.params; - await (0, connection_1.query)(` - UPDATE dispensaries - SET failed_at = NULL, - consecutive_failures = 0, - last_failure_at = NULL, - last_failure_reason = NULL, - failure_notes = NULL, - menu_type = NULL, - platform_dispensary_id = NULL, - updated_at = NOW() - WHERE id = $1 - `, [id]); - res.json({ success: true, message: `Dispensary ${id} restored for re-detection` }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/dutchie-az/admin/dispensaries/:id/reset-failures - * Reset failure counter for a dispensary (without unflagging) - */ -router.post('/admin/dispensaries/:id/reset-failures', async (req, res) => { - try { - const { id } = req.params; - await (0, connection_1.query)(` - UPDATE dispensaries - SET consecutive_failures = 0, - last_failure_at = NULL, - last_failure_reason = NULL, - updated_at = NOW() - WHERE id = $1 - `, [id]); - res.json({ success: true, message: `Failure counter reset for dispensary ${id}` }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/dutchie-az/admin/dispensaries/health-summary - * Get a summary of dispensary health status - */ -router.get('/admin/dispensaries/health-summary', async (_req, res) => { - try { - const { rows } = await (0, connection_1.query)(` - SELECT - COUNT(*) as total, - COUNT(*) FILTER (WHERE state = 'AZ') as arizona_total, - COUNT(*) FILTER (WHERE failed_at IS NOT NULL) as failed, - COUNT(*) FILTER (WHERE 
consecutive_failures >= 1 AND failed_at IS NULL) as at_risk, - COUNT(*) FILTER (WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL AND failed_at IS NULL) as ready_to_crawl, - COUNT(*) FILTER (WHERE menu_type = 'dutchie' AND failed_at IS NULL) as dutchie_detected, - COUNT(*) FILTER (WHERE (menu_type IS NULL OR menu_type = 'unknown') AND failed_at IS NULL) as needs_detection, - COUNT(*) FILTER (WHERE menu_type NOT IN ('dutchie', 'unknown') AND menu_type IS NOT NULL AND failed_at IS NULL) as non_dutchie - FROM dispensaries - WHERE state = 'AZ' - `); - res.json(rows[0] || {}); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -exports.default = router; diff --git a/backend/dist/dutchie-az/services/azdhs-import.js b/backend/dist/dutchie-az/services/azdhs-import.js deleted file mode 100644 index bad6cdcf..00000000 --- a/backend/dist/dutchie-az/services/azdhs-import.js +++ /dev/null @@ -1,229 +0,0 @@ -"use strict"; -/** - * AZDHS Import Service - * - * Imports Arizona dispensaries from the main database's dispensaries table - * (which was populated from AZDHS data) into the isolated Dutchie AZ database. - * - * This establishes the canonical list of AZ dispensaries to match against Dutchie. - */ -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? 
(function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -Object.defineProperty(exports, "__esModule", { value: true }); -exports.importAZDHSDispensaries = importAZDHSDispensaries; -exports.importFromJSON = importFromJSON; -exports.getImportStats = getImportStats; -const pg_1 = require("pg"); -const connection_1 = require("../db/connection"); -// Main database connection (source of AZDHS data) -const MAIN_DATABASE_URL = process.env.DATABASE_URL || - 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus'; -/** - * Create a temporary connection to the main database - */ -function getMainDBPool() { - return new pg_1.Pool({ - connectionString: MAIN_DATABASE_URL, - max: 5, - idleTimeoutMillis: 30000, - connectionTimeoutMillis: 5000, - }); -} -/** - * Fetch all AZ dispensaries from the main database - */ -async function fetchAZDHSDispensaries() { - const pool = getMainDBPool(); - try { - const result = await pool.query(` - SELECT - id, azdhs_id, name, company_name, address, city, state, zip, - latitude, longitude, dba_name, phone, email, website, - google_rating, google_review_count, slug, - menu_provider, product_provider, - created_at, updated_at - FROM dispensaries - WHERE state = 'AZ' - ORDER BY id - `); - return result.rows; - } - finally { - await pool.end(); - } -} -/** - * Import a single dispensary 
into the Dutchie AZ database - */ -async function importDispensary(disp) { - const result = await (0, connection_1.query)(` - INSERT INTO dispensaries ( - platform, name, slug, city, state, postal_code, address, - latitude, longitude, is_delivery, is_pickup, raw_metadata, updated_at - ) VALUES ( - $1, $2, $3, $4, $5, $6, $7, - $8, $9, $10, $11, $12, NOW() - ) - ON CONFLICT (platform, slug, city, state) DO UPDATE SET - name = EXCLUDED.name, - postal_code = EXCLUDED.postal_code, - address = EXCLUDED.address, - latitude = EXCLUDED.latitude, - longitude = EXCLUDED.longitude, - raw_metadata = EXCLUDED.raw_metadata, - updated_at = NOW() - RETURNING id - `, [ - 'dutchie', // Will be updated when Dutchie match is found - disp.dba_name || disp.name, - disp.slug, - disp.city, - disp.state, - disp.zip, - disp.address, - disp.latitude, - disp.longitude, - false, // is_delivery - unknown - true, // is_pickup - assume true - JSON.stringify({ - azdhs_id: disp.azdhs_id, - main_db_id: disp.id, - company_name: disp.company_name, - phone: disp.phone, - email: disp.email, - website: disp.website, - google_rating: disp.google_rating, - google_review_count: disp.google_review_count, - menu_provider: disp.menu_provider, - product_provider: disp.product_provider, - }), - ]); - return result.rows[0].id; -} -/** - * Import all AZDHS dispensaries into the Dutchie AZ database - */ -async function importAZDHSDispensaries() { - console.log('[AZDHS Import] Starting import from main database...'); - const result = { - total: 0, - imported: 0, - skipped: 0, - errors: [], - }; - try { - const dispensaries = await fetchAZDHSDispensaries(); - result.total = dispensaries.length; - console.log(`[AZDHS Import] Found ${dispensaries.length} AZ dispensaries in main DB`); - for (const disp of dispensaries) { - try { - const id = await importDispensary(disp); - result.imported++; - console.log(`[AZDHS Import] Imported: ${disp.name} (${disp.city}) -> id=${id}`); - } - catch (error) { - if 
(error.message.includes('duplicate')) { - result.skipped++; - } - else { - result.errors.push(`${disp.name}: ${error.message}`); - } - } - } - } - catch (error) { - result.errors.push(`Failed to fetch from main DB: ${error.message}`); - } - console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped, ${result.errors.length} errors`); - return result; -} -/** - * Import dispensaries from JSON file (backup export) - */ -async function importFromJSON(jsonPath) { - console.log(`[AZDHS Import] Importing from JSON: ${jsonPath}`); - const result = { - total: 0, - imported: 0, - skipped: 0, - errors: [], - }; - try { - const fs = await Promise.resolve().then(() => __importStar(require('fs/promises'))); - const data = await fs.readFile(jsonPath, 'utf-8'); - const dispensaries = JSON.parse(data); - result.total = dispensaries.length; - console.log(`[AZDHS Import] Found ${dispensaries.length} dispensaries in JSON file`); - for (const disp of dispensaries) { - try { - const id = await importDispensary(disp); - result.imported++; - } - catch (error) { - if (error.message.includes('duplicate')) { - result.skipped++; - } - else { - result.errors.push(`${disp.name}: ${error.message}`); - } - } - } - } - catch (error) { - result.errors.push(`Failed to read JSON file: ${error.message}`); - } - console.log(`[AZDHS Import] Complete: ${result.imported} imported, ${result.skipped} skipped`); - return result; -} -/** - * Get import statistics - */ -async function getImportStats() { - const { rows } = await (0, connection_1.query)(` - SELECT - COUNT(*) as total, - COUNT(platform_dispensary_id) as with_platform_id, - COUNT(*) - COUNT(platform_dispensary_id) as without_platform_id, - MAX(updated_at) as last_updated - FROM dispensaries - WHERE state = 'AZ' - `); - const stats = rows[0]; - return { - totalDispensaries: parseInt(stats.total, 10), - withPlatformIds: parseInt(stats.with_platform_id, 10), - withoutPlatformIds: parseInt(stats.without_platform_id, 
10), - lastImportedAt: stats.last_updated, - }; -} diff --git a/backend/dist/dutchie-az/services/directory-matcher.js b/backend/dist/dutchie-az/services/directory-matcher.js deleted file mode 100644 index 1ce11368..00000000 --- a/backend/dist/dutchie-az/services/directory-matcher.js +++ /dev/null @@ -1,380 +0,0 @@ -"use strict"; -/** - * Directory-Based Store Matcher - * - * Scrapes provider directory pages (Curaleaf, Sol, etc.) to get store lists, - * then matches them to existing dispensaries by fuzzy name/city/address matching. - * - * This allows us to: - * 1. Find specific store URLs for directory-style websites - * 2. Match stores confidently by name+city - * 3. Mark non-Dutchie providers as not_crawlable until we build crawlers - */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.scrapeSolDirectory = scrapeSolDirectory; -exports.scrapeCuraleafDirectory = scrapeCuraleafDirectory; -exports.matchDirectoryToDispensaries = matchDirectoryToDispensaries; -exports.previewDirectoryMatches = previewDirectoryMatches; -exports.applyHighConfidenceMatches = applyHighConfidenceMatches; -const connection_1 = require("../db/connection"); -// ============================================================ -// NORMALIZATION FUNCTIONS -// ============================================================ -/** - * Normalize a string for comparison: - * - Lowercase - * - Remove common suffixes (dispensary, cannabis, etc.) 
- * - Remove punctuation - * - Collapse whitespace - */ -function normalizeForComparison(str) { - if (!str) - return ''; - return str - .toLowerCase() - .replace(/\s+(dispensary|cannabis|marijuana|medical|recreational|shop|store|flower|wellness)(\s|$)/gi, ' ') - .replace(/[^\w\s]/g, ' ') // Remove punctuation - .replace(/\s+/g, ' ') // Collapse whitespace - .trim(); -} -/** - * Normalize city name for comparison - */ -function normalizeCity(city) { - if (!city) - return ''; - return city - .toLowerCase() - .replace(/[^\w\s]/g, '') - .trim(); -} -/** - * Calculate similarity between two strings (0-1) - * Uses Levenshtein distance normalized by max length - */ -function stringSimilarity(a, b) { - if (!a || !b) - return 0; - if (a === b) - return 1; - const longer = a.length > b.length ? a : b; - const shorter = a.length > b.length ? b : a; - if (longer.length === 0) - return 1; - const distance = levenshteinDistance(longer, shorter); - return (longer.length - distance) / longer.length; -} -/** - * Levenshtein distance between two strings - */ -function levenshteinDistance(a, b) { - const matrix = []; - for (let i = 0; i <= b.length; i++) { - matrix[i] = [i]; - } - for (let j = 0; j <= a.length; j++) { - matrix[0][j] = j; - } - for (let i = 1; i <= b.length; i++) { - for (let j = 1; j <= a.length; j++) { - if (b.charAt(i - 1) === a.charAt(j - 1)) { - matrix[i][j] = matrix[i - 1][j - 1]; - } - else { - matrix[i][j] = Math.min(matrix[i - 1][j - 1] + 1, // substitution - matrix[i][j - 1] + 1, // insertion - matrix[i - 1][j] + 1 // deletion - ); - } - } - } - return matrix[b.length][a.length]; -} -/** - * Check if string contains another (with normalization) - */ -function containsNormalized(haystack, needle) { - return normalizeForComparison(haystack).includes(normalizeForComparison(needle)); -} -// ============================================================ -// PROVIDER DIRECTORY SCRAPERS -// ============================================================ -/** - * Sol 
Flower (livewithsol.com) - Static HTML, easy to scrape - */ -async function scrapeSolDirectory() { - console.log('[DirectoryMatcher] Scraping Sol Flower directory...'); - try { - const response = await fetch('https://www.livewithsol.com/locations/', { - headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - Accept: 'text/html', - }, - }); - if (!response.ok) { - throw new Error(`HTTP ${response.status}`); - } - const html = await response.text(); - // Extract store entries from HTML - // Sol's structure: Each location has name, address in specific divs - const stores = []; - // Pattern to find location cards - // Format: NAME with address nearby - const locationRegex = /]+href="(\/locations\/[^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?(\d+[^<]+(?:Ave|St|Blvd|Dr|Rd|Way)[^<]*)/gi; - let match; - while ((match = locationRegex.exec(html)) !== null) { - const [, path, name, address] = match; - // Extract city from common Arizona cities - let city = 'Unknown'; - const cityPatterns = [ - { pattern: /phoenix/i, city: 'Phoenix' }, - { pattern: /scottsdale/i, city: 'Scottsdale' }, - { pattern: /tempe/i, city: 'Tempe' }, - { pattern: /tucson/i, city: 'Tucson' }, - { pattern: /mesa/i, city: 'Mesa' }, - { pattern: /sun city/i, city: 'Sun City' }, - { pattern: /glendale/i, city: 'Glendale' }, - ]; - for (const { pattern, city: cityName } of cityPatterns) { - if (pattern.test(name) || pattern.test(address)) { - city = cityName; - break; - } - } - stores.push({ - name: name.trim(), - city, - state: 'AZ', - address: address.trim(), - storeUrl: `https://www.livewithsol.com${path}`, - }); - } - // If regex didn't work, use known hardcoded values (fallback) - if (stores.length === 0) { - console.log('[DirectoryMatcher] Using hardcoded Sol locations'); - return [ - { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 
'https://www.livewithsol.com/locations/deer-valley/' }, - { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' }, - { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' }, - { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' }, - { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' }, - { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' }, - { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' }, - { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' }, - { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' }, - ]; - } - console.log(`[DirectoryMatcher] Found ${stores.length} Sol Flower locations`); - return stores; - } - catch (error) { - console.error('[DirectoryMatcher] Error scraping Sol directory:', error.message); - // Return hardcoded fallback - return [ - { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' }, - { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 
'https://www.livewithsol.com/locations/scottsdale-airpark/' }, - { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' }, - { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' }, - { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' }, - { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' }, - { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' }, - { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' }, - { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' }, - ]; - } -} -/** - * Curaleaf - Has age-gate, so we need hardcoded AZ locations - * In production, this would use Playwright to bypass age-gate - */ -async function scrapeCuraleafDirectory() { - console.log('[DirectoryMatcher] Using hardcoded Curaleaf AZ locations (age-gate blocks simple fetch)...'); - // Hardcoded Arizona Curaleaf locations from public knowledge - // These would be scraped via Playwright in production - return [ - { name: 'Curaleaf Phoenix Camelback', city: 'Phoenix', state: 'AZ', address: '4811 E Camelback Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-camelback' }, - { name: 'Curaleaf Phoenix Midtown', city: 'Phoenix', state: 'AZ', address: '1928 E Highland Ave', storeUrl: 
'https://curaleaf.com/stores/curaleaf-az-phoenix-midtown' }, - { name: 'Curaleaf Glendale East', city: 'Glendale', state: 'AZ', address: '5150 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-east' }, - { name: 'Curaleaf Glendale West', city: 'Glendale', state: 'AZ', address: '6501 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-west' }, - { name: 'Curaleaf Gilbert', city: 'Gilbert', state: 'AZ', address: '1736 E Williams Field Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-gilbert' }, - { name: 'Curaleaf Mesa', city: 'Mesa', state: 'AZ', address: '1540 S Power Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-mesa' }, - { name: 'Curaleaf Tempe', city: 'Tempe', state: 'AZ', address: '1815 E Broadway Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tempe' }, - { name: 'Curaleaf Scottsdale', city: 'Scottsdale', state: 'AZ', address: '8904 E Indian Bend Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-scottsdale' }, - { name: 'Curaleaf Tucson Prince', city: 'Tucson', state: 'AZ', address: '3955 W Prince Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-prince' }, - { name: 'Curaleaf Tucson Midvale', city: 'Tucson', state: 'AZ', address: '2936 N Midvale Park Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-midvale' }, - { name: 'Curaleaf Sedona', city: 'Sedona', state: 'AZ', address: '525 AZ-179', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-sedona' }, - { name: 'Curaleaf Youngtown', city: 'Youngtown', state: 'AZ', address: '11125 W Grand Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-youngtown' }, - ]; -} -/** - * Match a directory store to an existing dispensary - */ -function matchStoreToDispensary(store, dispensaries) { - const normalizedStoreName = normalizeForComparison(store.name); - const normalizedStoreCity = normalizeCity(store.city); - let bestMatch = null; - let bestScore = 0; - let matchReason = ''; - for (const disp of 
dispensaries) { - const normalizedDispName = normalizeForComparison(disp.name); - const normalizedDispCity = normalizeCity(disp.city || ''); - let score = 0; - const reasons = []; - // 1. Name similarity (max 50 points) - const nameSimilarity = stringSimilarity(normalizedStoreName, normalizedDispName); - score += nameSimilarity * 50; - if (nameSimilarity > 0.8) - reasons.push(`name_match(${(nameSimilarity * 100).toFixed(0)}%)`); - // 2. City match (25 points for exact, 15 for partial) - if (normalizedStoreCity && normalizedDispCity) { - if (normalizedStoreCity === normalizedDispCity) { - score += 25; - reasons.push('city_exact'); - } - else if (normalizedStoreCity.includes(normalizedDispCity) || - normalizedDispCity.includes(normalizedStoreCity)) { - score += 15; - reasons.push('city_partial'); - } - } - // 3. Address contains street name (15 points) - if (store.address && disp.address) { - const storeStreet = store.address.toLowerCase().split(/\s+/).slice(1, 4).join(' '); - const dispStreet = disp.address.toLowerCase().split(/\s+/).slice(1, 4).join(' '); - if (storeStreet && dispStreet && stringSimilarity(storeStreet, dispStreet) > 0.7) { - score += 15; - reasons.push('address_match'); - } - } - // 4. 
Brand name in dispensary name (10 points) - const brandName = store.name.split(' ')[0].toLowerCase(); // e.g., "Curaleaf", "Sol" - if (disp.name.toLowerCase().includes(brandName)) { - score += 10; - reasons.push('brand_match'); - } - if (score > bestScore) { - bestScore = score; - bestMatch = disp; - matchReason = reasons.join(', '); - } - } - // Determine confidence level - let confidence; - if (bestScore >= 70) { - confidence = 'high'; - } - else if (bestScore >= 50) { - confidence = 'medium'; - } - else if (bestScore >= 30) { - confidence = 'low'; - } - else { - confidence = 'none'; - } - return { - directoryStore: store, - dispensaryId: bestMatch?.id || null, - dispensaryName: bestMatch?.name || null, - confidence, - matchReason: matchReason || 'no_match', - }; -} -// ============================================================ -// MAIN FUNCTIONS -// ============================================================ -/** - * Run directory matching for a provider and update database - * Only applies high-confidence matches automatically - */ -async function matchDirectoryToDispensaries(provider, dryRun = true) { - console.log(`[DirectoryMatcher] Running ${provider} directory matching (dryRun=${dryRun})...`); - // Get directory stores - let directoryStores; - if (provider === 'curaleaf') { - directoryStores = await scrapeCuraleafDirectory(); - } - else if (provider === 'sol') { - directoryStores = await scrapeSolDirectory(); - } - else { - throw new Error(`Unknown provider: ${provider}`); - } - // Get all AZ dispensaries from database - const { rows: dispensaries } = await (0, connection_1.query)(`SELECT id, name, city, state, address, menu_type, menu_url, website - FROM dispensaries - WHERE state = 'AZ'`); - console.log(`[DirectoryMatcher] Matching ${directoryStores.length} directory stores against ${dispensaries.length} dispensaries`); - // Match each directory store - const results = []; - for (const store of directoryStores) { - const match = 
matchStoreToDispensary(store, dispensaries); - results.push(match); - // Only apply high-confidence matches if not dry run - if (!dryRun && match.confidence === 'high' && match.dispensaryId) { - await applyDirectoryMatch(match.dispensaryId, provider, store); - } - } - // Count results - const report = { - provider, - totalDirectoryStores: directoryStores.length, - highConfidenceMatches: results.filter((r) => r.confidence === 'high').length, - mediumConfidenceMatches: results.filter((r) => r.confidence === 'medium').length, - lowConfidenceMatches: results.filter((r) => r.confidence === 'low').length, - unmatched: results.filter((r) => r.confidence === 'none').length, - results, - }; - console.log(`[DirectoryMatcher] ${provider} matching complete:`); - console.log(` - High confidence: ${report.highConfidenceMatches}`); - console.log(` - Medium confidence: ${report.mediumConfidenceMatches}`); - console.log(` - Low confidence: ${report.lowConfidenceMatches}`); - console.log(` - Unmatched: ${report.unmatched}`); - return report; -} -/** - * Apply a directory match to a dispensary - */ -async function applyDirectoryMatch(dispensaryId, provider, store) { - console.log(`[DirectoryMatcher] Applying match: dispensary ${dispensaryId} -> ${store.storeUrl}`); - await (0, connection_1.query)(` - UPDATE dispensaries SET - menu_type = $1, - menu_url = $2, - platform_dispensary_id = NULL, - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', $1::text, - 'detection_method', 'directory_match'::text, - 'detected_at', NOW(), - 'directory_store_name', $3::text, - 'directory_store_url', $2::text, - 'directory_store_city', $4::text, - 'directory_store_address', $5::text, - 'not_crawlable', true, - 'not_crawlable_reason', $6::text - ), - updated_at = NOW() - WHERE id = $7 - `, [ - provider, - store.storeUrl, - store.name, - store.city, - store.address, - `${provider} proprietary menu - no crawler available`, - 
dispensaryId, - ]); -} -/** - * Preview matches without applying them - */ -async function previewDirectoryMatches(provider) { - return matchDirectoryToDispensaries(provider, true); -} -/** - * Apply high-confidence matches - */ -async function applyHighConfidenceMatches(provider) { - return matchDirectoryToDispensaries(provider, false); -} diff --git a/backend/dist/dutchie-az/services/discovery.js b/backend/dist/dutchie-az/services/discovery.js deleted file mode 100644 index 0b09a9f5..00000000 --- a/backend/dist/dutchie-az/services/discovery.js +++ /dev/null @@ -1,515 +0,0 @@ -"use strict"; -/** - * Dutchie AZ Discovery Service - * - * Discovers and manages dispensaries from Dutchie for Arizona. - */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.importFromExistingDispensaries = importFromExistingDispensaries; -exports.discoverDispensaries = discoverDispensaries; -exports.isObjectId = isObjectId; -exports.extractFromMenuUrl = extractFromMenuUrl; -exports.extractCNameFromMenuUrl = extractCNameFromMenuUrl; -exports.resolvePlatformDispensaryIds = resolvePlatformDispensaryIds; -exports.getAllDispensaries = getAllDispensaries; -exports.mapDbRowToDispensary = mapDbRowToDispensary; -exports.getDispensaryById = getDispensaryById; -exports.getDispensariesWithPlatformIds = getDispensariesWithPlatformIds; -exports.reResolveDispensaryPlatformId = reResolveDispensaryPlatformId; -exports.updateMenuUrlAndResolve = updateMenuUrlAndResolve; -exports.markDispensaryNotCrawlable = markDispensaryNotCrawlable; -exports.getDispensaryCName = getDispensaryCName; -const connection_1 = require("../db/connection"); -const graphql_client_1 = require("./graphql-client"); -/** - * Upsert a dispensary record - */ -async function upsertDispensary(dispensary) { - const result = await (0, connection_1.query)(` - INSERT INTO dispensaries ( - platform, name, slug, city, state, postal_code, address, - latitude, longitude, platform_dispensary_id, - is_delivery, is_pickup, 
raw_metadata, updated_at - ) VALUES ( - $1, $2, $3, $4, $5, $6, $7, - $8, $9, $10, - $11, $12, $13, NOW() - ) - ON CONFLICT (platform, slug, city, state) DO UPDATE SET - name = EXCLUDED.name, - postal_code = EXCLUDED.postal_code, - address = EXCLUDED.address, - latitude = EXCLUDED.latitude, - longitude = EXCLUDED.longitude, - platform_dispensary_id = COALESCE(EXCLUDED.platform_dispensary_id, dispensaries.platform_dispensary_id), - is_delivery = EXCLUDED.is_delivery, - is_pickup = EXCLUDED.is_pickup, - raw_metadata = EXCLUDED.raw_metadata, - updated_at = NOW() - RETURNING id - `, [ - dispensary.platform || 'dutchie', - dispensary.name, - dispensary.slug, - dispensary.city, - dispensary.state || 'AZ', - dispensary.postalCode, - dispensary.address, - dispensary.latitude, - dispensary.longitude, - dispensary.platformDispensaryId, - dispensary.isDelivery || false, - dispensary.isPickup || true, - dispensary.rawMetadata ? JSON.stringify(dispensary.rawMetadata) : null, - ]); - return result.rows[0].id; -} -/** - * Normalize a raw discovery result to Dispensary - */ -function normalizeDispensary(raw) { - return { - platform: 'dutchie', - name: raw.name || raw.Name || '', - slug: raw.slug || raw.cName || raw.id || '', - city: raw.city || raw.address?.city || '', - state: 'AZ', - postalCode: raw.postalCode || raw.address?.postalCode || raw.address?.zip, - address: raw.streetAddress || raw.address?.streetAddress, - latitude: raw.latitude || raw.location?.lat, - longitude: raw.longitude || raw.location?.lng, - platformDispensaryId: raw.dispensaryId || raw.id || null, - isDelivery: raw.isDelivery || raw.delivery || false, - isPickup: raw.isPickup || raw.pickup || true, - rawMetadata: raw, - }; -} -/** - * Import dispensaries from the existing dispensaries table (from AZDHS data) - * This creates records in the dutchie_az database for AZ dispensaries - */ -async function importFromExistingDispensaries() { - console.log('[Discovery] Importing from existing dispensaries 
table...'); - // This is a workaround - we'll use the dispensaries we already know about - // and try to resolve their Dutchie IDs - const knownDispensaries = [ - { name: 'Deeply Rooted', slug: 'AZ-Deeply-Rooted', city: 'Phoenix', state: 'AZ' }, - { name: 'Curaleaf Gilbert', slug: 'curaleaf-gilbert', city: 'Gilbert', state: 'AZ' }, - { name: 'Zen Leaf Prescott', slug: 'AZ-zen-leaf-prescott', city: 'Prescott', state: 'AZ' }, - // Add more known Dutchie stores here - ]; - let imported = 0; - for (const disp of knownDispensaries) { - try { - const id = await upsertDispensary({ - platform: 'dutchie', - name: disp.name, - slug: disp.slug, - city: disp.city, - state: disp.state, - }); - imported++; - console.log(`[Discovery] Imported: ${disp.name} (id=${id})`); - } - catch (error) { - console.error(`[Discovery] Failed to import ${disp.name}:`, error.message); - } - } - return { imported }; -} -/** - * Discover all Arizona Dutchie dispensaries via GraphQL - */ -async function discoverDispensaries() { - console.log('[Discovery] Starting Arizona dispensary discovery...'); - const errors = []; - let discovered = 0; - try { - const rawDispensaries = await (0, graphql_client_1.discoverArizonaDispensaries)(); - console.log(`[Discovery] Found ${rawDispensaries.length} dispensaries from GraphQL`); - for (const raw of rawDispensaries) { - try { - const normalized = normalizeDispensary(raw); - if (normalized.name && normalized.slug && normalized.city) { - await upsertDispensary(normalized); - discovered++; - } - } - catch (error) { - errors.push(`${raw.name || raw.slug}: ${error.message}`); - } - } - } - catch (error) { - errors.push(`Discovery failed: ${error.message}`); - } - console.log(`[Discovery] Completed: ${discovered} dispensaries, ${errors.length} errors`); - return { discovered, errors }; -} -/** - * Check if a string looks like a MongoDB ObjectId (24 hex chars) - */ -function isObjectId(value) { - return /^[a-f0-9]{24}$/i.test(value); -} -function 
extractFromMenuUrl(menuUrl) { - if (!menuUrl) - return null; - try { - const url = new URL(menuUrl); - const pathname = url.pathname; - // Match /api/v2/embedded-menu/.js - this contains the platform_dispensary_id directly - const apiMatch = pathname.match(/^\/api\/v2\/embedded-menu\/([a-f0-9]{24})\.js$/i); - if (apiMatch) { - return { type: 'platformId', value: apiMatch[1] }; - } - // Match /embedded-menu/ or /dispensary/ - const embeddedMatch = pathname.match(/^\/embedded-menu\/([^/?]+)/); - if (embeddedMatch) { - const value = embeddedMatch[1]; - // Check if it's actually an ObjectId (some URLs use ID directly) - if (isObjectId(value)) { - return { type: 'platformId', value }; - } - return { type: 'cName', value }; - } - const dispensaryMatch = pathname.match(/^\/dispensary\/([^/?]+)/); - if (dispensaryMatch) { - const value = dispensaryMatch[1]; - if (isObjectId(value)) { - return { type: 'platformId', value }; - } - return { type: 'cName', value }; - } - return null; - } - catch { - return null; - } -} -/** - * Extract cName (slug) from a Dutchie menu_url - * Backward compatible - use extractFromMenuUrl for full info - */ -function extractCNameFromMenuUrl(menuUrl) { - const extraction = extractFromMenuUrl(menuUrl); - return extraction?.value || null; -} -/** - * Resolve platform dispensary IDs for all dispensaries that don't have one - * CRITICAL: Uses cName extracted from menu_url, NOT the slug column! - * - * Uses the new resolveDispensaryIdWithDetails which: - * 1. Extracts dispensaryId from window.reactEnv in the embedded menu page (preferred) - * 2. Falls back to GraphQL if reactEnv extraction fails - * 3. 
Returns HTTP status so we can mark 403/404 stores as not_crawlable - */ -async function resolvePlatformDispensaryIds() { - console.log('[Discovery] Resolving platform dispensary IDs...'); - const { rows: dispensaries } = await (0, connection_1.query)(` - SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id, crawl_status - FROM dispensaries - WHERE menu_type = 'dutchie' - AND platform_dispensary_id IS NULL - AND menu_url IS NOT NULL - AND (crawl_status IS NULL OR crawl_status != 'not_crawlable') - ORDER BY id - `); - let resolved = 0; - let failed = 0; - let skipped = 0; - let notCrawlable = 0; - for (const dispensary of dispensaries) { - try { - // Extract cName from menu_url - this is the CORRECT way to get the Dutchie slug - const cName = extractCNameFromMenuUrl(dispensary.menu_url); - if (!cName) { - console.log(`[Discovery] Skipping ${dispensary.name}: Could not extract cName from menu_url: ${dispensary.menu_url}`); - skipped++; - continue; - } - console.log(`[Discovery] Resolving ID for: ${dispensary.name} (cName=${cName}, menu_url=${dispensary.menu_url})`); - // Use the new detailed resolver that extracts from reactEnv first - const result = await (0, graphql_client_1.resolveDispensaryIdWithDetails)(cName); - if (result.dispensaryId) { - // SUCCESS: Store resolved - await (0, connection_1.query)(` - UPDATE dispensaries - SET platform_dispensary_id = $1, - platform_dispensary_id_resolved_at = NOW(), - crawl_status = 'ready', - crawl_status_reason = $2, - crawl_status_updated_at = NOW(), - last_tested_menu_url = $3, - last_http_status = $4, - updated_at = NOW() - WHERE id = $5 - `, [ - result.dispensaryId, - `Resolved from ${result.source || 'page'}`, - dispensary.menu_url, - result.httpStatus, - dispensary.id, - ]); - resolved++; - console.log(`[Discovery] Resolved: ${cName} -> ${result.dispensaryId} (source: ${result.source})`); - } - else if (result.httpStatus === 403 || result.httpStatus === 404) { - // NOT CRAWLABLE: Store removed or not 
accessible - await (0, connection_1.query)(` - UPDATE dispensaries - SET platform_dispensary_id = NULL, - crawl_status = 'not_crawlable', - crawl_status_reason = $1, - crawl_status_updated_at = NOW(), - last_tested_menu_url = $2, - last_http_status = $3, - updated_at = NOW() - WHERE id = $4 - `, [ - result.error || `HTTP ${result.httpStatus}: Removed from Dutchie`, - dispensary.menu_url, - result.httpStatus, - dispensary.id, - ]); - notCrawlable++; - console.log(`[Discovery] Marked not crawlable: ${cName} (HTTP ${result.httpStatus})`); - } - else { - // FAILED: Could not resolve but page loaded - await (0, connection_1.query)(` - UPDATE dispensaries - SET crawl_status = 'not_ready', - crawl_status_reason = $1, - crawl_status_updated_at = NOW(), - last_tested_menu_url = $2, - last_http_status = $3, - updated_at = NOW() - WHERE id = $4 - `, [ - result.error || 'Could not extract dispensaryId from page', - dispensary.menu_url, - result.httpStatus, - dispensary.id, - ]); - failed++; - console.log(`[Discovery] Could not resolve: ${cName} - ${result.error}`); - } - // Delay between requests - await new Promise((r) => setTimeout(r, 2000)); - } - catch (error) { - failed++; - console.error(`[Discovery] Error resolving ${dispensary.name}:`, error.message); - } - } - console.log(`[Discovery] Completed: ${resolved} resolved, ${failed} failed, ${skipped} skipped, ${notCrawlable} not crawlable`); - return { resolved, failed, skipped, notCrawlable }; -} -/** - * Get all dispensaries - */ -// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences) -const DISPENSARY_COLUMNS = ` - id, name, slug, city, state, zip, address, latitude, longitude, - menu_type, menu_url, platform_dispensary_id, website, - provider_detection_data, created_at, updated_at -`; -async function getAllDispensaries() { - const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE menu_type = 'dutchie' ORDER BY name`); - return 
rows.map(mapDbRowToDispensary); -} -/** - * Map snake_case DB row to camelCase Dispensary object - * CRITICAL: DB returns snake_case (platform_dispensary_id) but TypeScript expects camelCase (platformDispensaryId) - * This function is exported for use in other modules that query dispensaries directly. - * - * NOTE: The consolidated dispensaries table column mappings: - * - zip → postalCode - * - menu_type → menuType (keep platform as 'dutchie') - * - last_crawl_at → lastCrawledAt - * - platform_dispensary_id → platformDispensaryId - */ -function mapDbRowToDispensary(row) { - // Extract website from raw_metadata if available (field may not exist in all environments) - let rawMetadata = undefined; - if (row.raw_metadata !== undefined) { - rawMetadata = typeof row.raw_metadata === 'string' - ? JSON.parse(row.raw_metadata) - : row.raw_metadata; - } - const website = row.website || rawMetadata?.website || undefined; - return { - id: row.id, - platform: row.platform || 'dutchie', // keep platform as-is, default to 'dutchie' - name: row.name, - dbaName: row.dbaName || row.dba_name, - slug: row.slug, - city: row.city, - state: row.state, - postalCode: row.postalCode || row.zip || row.postal_code, - latitude: row.latitude ? parseFloat(row.latitude) : undefined, - longitude: row.longitude ? parseFloat(row.longitude) : undefined, - address: row.address, - platformDispensaryId: row.platformDispensaryId || row.platform_dispensary_id, // CRITICAL mapping! - isDelivery: row.is_delivery, - isPickup: row.is_pickup, - rawMetadata: rawMetadata, - lastCrawledAt: row.lastCrawledAt || row.last_crawl_at, // use last_crawl_at - productCount: row.product_count, - createdAt: row.created_at, - updatedAt: row.updated_at, - menuType: row.menuType || row.menu_type, - menuUrl: row.menuUrl || row.menu_url, - scrapeEnabled: row.scrapeEnabled ?? 
row.scrape_enabled, - providerDetectionData: row.provider_detection_data, - platformDispensaryIdResolvedAt: row.platform_dispensary_id_resolved_at, - website, - }; -} -/** - * Get dispensary by ID - * NOTE: Uses SQL aliases to map snake_case → camelCase directly - */ -async function getDispensaryById(id) { - const { rows } = await (0, connection_1.query)(` - SELECT - id, - name, - dba_name AS "dbaName", - slug, - city, - state, - zip AS "postalCode", - address, - latitude, - longitude, - menu_type AS "menuType", - menu_url AS "menuUrl", - platform_dispensary_id AS "platformDispensaryId", - website, - provider_detection_data AS "providerDetectionData", - created_at, - updated_at - FROM dispensaries - WHERE id = $1 - `, [id]); - if (!rows[0]) - return null; - return mapDbRowToDispensary(rows[0]); -} -/** - * Get dispensaries with platform IDs (ready for crawling) - */ -async function getDispensariesWithPlatformIds() { - const { rows } = await (0, connection_1.query)(` - SELECT ${DISPENSARY_COLUMNS} FROM dispensaries - WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL - ORDER BY name - `); - return rows.map(mapDbRowToDispensary); -} -/** - * Re-resolve a single dispensary's platform ID - * Clears the existing ID and re-resolves from the menu_url cName - */ -async function reResolveDispensaryPlatformId(dispensaryId) { - console.log(`[Discovery] Re-resolving platform ID for dispensary ${dispensaryId}...`); - const dispensary = await getDispensaryById(dispensaryId); - if (!dispensary) { - return { success: false, platformId: null, cName: null, error: 'Dispensary not found' }; - } - const cName = extractCNameFromMenuUrl(dispensary.menuUrl); - if (!cName) { - console.log(`[Discovery] Could not extract cName from menu_url: ${dispensary.menuUrl}`); - return { - success: false, - platformId: null, - cName: null, - error: `Could not extract cName from menu_url: ${dispensary.menuUrl}`, - }; - } - console.log(`[Discovery] Extracted cName: ${cName} from menu_url: 
${dispensary.menuUrl}`); - try { - const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName); - if (platformId) { - await (0, connection_1.query)(` - UPDATE dispensaries - SET platform_dispensary_id = $1, - platform_dispensary_id_resolved_at = NOW(), - updated_at = NOW() - WHERE id = $2 - `, [platformId, dispensaryId]); - console.log(`[Discovery] Resolved: ${cName} -> ${platformId}`); - return { success: true, platformId, cName }; - } - else { - // Clear the invalid platform ID and mark as not crawlable - await (0, connection_1.query)(` - UPDATE dispensaries - SET platform_dispensary_id = NULL, - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - '{"resolution_error": "cName no longer exists on Dutchie", "not_crawlable": true}'::jsonb, - updated_at = NOW() - WHERE id = $1 - `, [dispensaryId]); - console.log(`[Discovery] Could not resolve: ${cName} - marked as not crawlable`); - return { - success: false, - platformId: null, - cName, - error: `cName "${cName}" no longer exists on Dutchie`, - }; - } - } - catch (error) { - console.error(`[Discovery] Error resolving ${cName}:`, error.message); - return { success: false, platformId: null, cName, error: error.message }; - } -} -/** - * Update menu_url for a dispensary and re-resolve platform ID - */ -async function updateMenuUrlAndResolve(dispensaryId, newMenuUrl) { - console.log(`[Discovery] Updating menu_url for dispensary ${dispensaryId} to: ${newMenuUrl}`); - const cName = extractCNameFromMenuUrl(newMenuUrl); - if (!cName) { - return { - success: false, - platformId: null, - cName: null, - error: `Could not extract cName from new menu_url: ${newMenuUrl}`, - }; - } - // Update the menu_url first - await (0, connection_1.query)(` - UPDATE dispensaries - SET menu_url = $1, - menu_type = 'dutchie', - platform_dispensary_id = NULL, - updated_at = NOW() - WHERE id = $2 - `, [newMenuUrl, dispensaryId]); - // Now resolve the platform ID with the new cName - return await 
reResolveDispensaryPlatformId(dispensaryId); -} -/** - * Mark a dispensary as not crawlable (when resolution fails permanently) - */ -async function markDispensaryNotCrawlable(dispensaryId, reason) { - await (0, connection_1.query)(` - UPDATE dispensaries - SET platform_dispensary_id = NULL, - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object('not_crawlable', true, 'not_crawlable_reason', $1::text, 'not_crawlable_at', NOW()::text), - updated_at = NOW() - WHERE id = $2 - `, [reason, dispensaryId]); - console.log(`[Discovery] Marked dispensary ${dispensaryId} as not crawlable: ${reason}`); -} -/** - * Get the cName for a dispensary (extracted from menu_url) - */ -function getDispensaryCName(dispensary) { - return extractCNameFromMenuUrl(dispensary.menuUrl); -} diff --git a/backend/dist/dutchie-az/services/graphql-client.js b/backend/dist/dutchie-az/services/graphql-client.js deleted file mode 100644 index b19f7146..00000000 --- a/backend/dist/dutchie-az/services/graphql-client.js +++ /dev/null @@ -1,538 +0,0 @@ -"use strict"; -/** - * Dutchie GraphQL Client - * - * Uses Puppeteer to establish a session (get CF cookies), then makes - * SERVER-SIDE fetch calls to api-gw.dutchie.com with those cookies. - * - * DUTCHIE FETCH RULES: - * 1. Server-side only - use axios (never browser fetch with CORS) - * 2. Use dispensaryFilter.cNameOrID, NOT dispensaryId directly - * 3. Headers must mimic Chrome: User-Agent, Origin, Referer - * 4. If 403, extract CF cookies from Puppeteer session and include them - * 5. Log status codes, error bodies, and product counts - */ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.ARIZONA_CENTERPOINTS = exports.GRAPHQL_HASHES = void 0; -exports.resolveDispensaryId = resolveDispensaryId; -exports.resolveDispensaryIdWithDetails = resolveDispensaryIdWithDetails; -exports.discoverArizonaDispensaries = discoverArizonaDispensaries; -exports.fetchAllProducts = fetchAllProducts; -exports.fetchAllProductsBothModes = fetchAllProductsBothModes; -const axios_1 = __importDefault(require("axios")); -const puppeteer_extra_1 = __importDefault(require("puppeteer-extra")); -const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); -const dutchie_1 = require("../config/dutchie"); -Object.defineProperty(exports, "GRAPHQL_HASHES", { enumerable: true, get: function () { return dutchie_1.GRAPHQL_HASHES; } }); -Object.defineProperty(exports, "ARIZONA_CENTERPOINTS", { enumerable: true, get: function () { return dutchie_1.ARIZONA_CENTERPOINTS; } }); -puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)()); -/** - * Create a session by navigating to the embedded menu page - * and extracting CF clearance cookies for server-side requests. - * Also extracts dispensaryId from window.reactEnv if available. 
- */ -async function createSession(cName) { - const browser = await puppeteer_extra_1.default.launch({ - headless: 'new', - args: dutchie_1.dutchieConfig.browserArgs, - }); - const page = await browser.newPage(); - const userAgent = dutchie_1.dutchieConfig.userAgent; - await page.setUserAgent(userAgent); - await page.setViewport({ width: 1920, height: 1080 }); - await page.evaluateOnNewDocument(() => { - Object.defineProperty(navigator, 'webdriver', { get: () => false }); - window.chrome = { runtime: {} }; - }); - // Navigate to the embedded menu page for this dispensary - const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`; - console.log(`[GraphQL Client] Loading ${embeddedMenuUrl} to get CF cookies...`); - let httpStatus; - let dispensaryId; - try { - const response = await page.goto(embeddedMenuUrl, { - waitUntil: 'networkidle2', - timeout: dutchie_1.dutchieConfig.navigationTimeout, - }); - httpStatus = response?.status(); - await new Promise((r) => setTimeout(r, dutchie_1.dutchieConfig.pageLoadDelay)); - // Try to extract dispensaryId from window.reactEnv - try { - dispensaryId = await page.evaluate(() => { - return window.reactEnv?.dispensaryId || null; - }); - if (dispensaryId) { - console.log(`[GraphQL Client] Extracted dispensaryId from reactEnv: ${dispensaryId}`); - } - } - catch (evalError) { - console.log(`[GraphQL Client] Could not extract dispensaryId from reactEnv: ${evalError.message}`); - } - } - catch (error) { - console.warn(`[GraphQL Client] Navigation warning: ${error.message}`); - // Continue anyway - we may have gotten cookies - } - // Extract cookies - const cookies = await page.cookies(); - const cookieString = cookies.map((c) => `${c.name}=${c.value}`).join('; '); - console.log(`[GraphQL Client] Got ${cookies.length} cookies, HTTP status: ${httpStatus}`); - if (cookies.length > 0) { - console.log(`[GraphQL Client] Cookie names: ${cookies.map(c => c.name).join(', ')}`); - } - return { cookies: cookieString, userAgent, 
browser, page, dispensaryId, httpStatus }; -} -/** - * Close session (browser) - */ -async function closeSession(session) { - await session.browser.close(); -} -// ============================================================ -// SERVER-SIDE GRAPHQL FETCH USING AXIOS -// ============================================================ -/** - * Build headers that mimic a real browser request - */ -function buildHeaders(session, cName) { - const embeddedMenuUrl = `https://dutchie.com/embedded-menu/${cName}`; - return { - 'accept': 'application/json, text/plain, */*', - 'accept-language': 'en-US,en;q=0.9', - 'accept-encoding': 'gzip, deflate, br', - 'content-type': 'application/json', - 'origin': 'https://dutchie.com', - 'referer': embeddedMenuUrl, - 'user-agent': session.userAgent, - 'apollographql-client-name': 'Marketplace (production)', - 'sec-ch-ua': '"Chromium";v="120", "Google Chrome";v="120", "Not-A.Brand";v="99"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"Windows"', - 'sec-fetch-dest': 'empty', - 'sec-fetch-mode': 'cors', - 'sec-fetch-site': 'same-site', - ...(session.cookies ? 
{ 'cookie': session.cookies } : {}), - }; -} -/** - * Execute GraphQL query server-side using axios - * Uses cookies from the browser session to bypass CF - */ -async function executeGraphQL(session, operationName, variables, hash, cName) { - const endpoint = dutchie_1.dutchieConfig.graphqlEndpoint; - const headers = buildHeaders(session, cName); - // Build request body for POST - const body = { - operationName, - variables, - extensions: { - persistedQuery: { version: 1, sha256Hash: hash }, - }, - }; - console.log(`[GraphQL Client] POST: ${operationName} -> ${endpoint}`); - console.log(`[GraphQL Client] Variables: ${JSON.stringify(variables).slice(0, 300)}...`); - try { - const response = await axios_1.default.post(endpoint, body, { - headers, - timeout: 30000, - validateStatus: () => true, // Don't throw on non-2xx - }); - // Log response details - console.log(`[GraphQL Client] Response status: ${response.status}`); - if (response.status !== 200) { - const bodyPreview = typeof response.data === 'string' - ? 
response.data.slice(0, 500) - : JSON.stringify(response.data).slice(0, 500); - console.error(`[GraphQL Client] HTTP ${response.status}: ${bodyPreview}`); - throw new Error(`HTTP ${response.status}`); - } - // Check for GraphQL errors - if (response.data?.errors && response.data.errors.length > 0) { - console.error(`[GraphQL Client] GraphQL errors: ${JSON.stringify(response.data.errors[0])}`); - } - return response.data; - } - catch (error) { - if (axios_1.default.isAxiosError(error)) { - const axiosError = error; - console.error(`[GraphQL Client] Axios error: ${axiosError.message}`); - if (axiosError.response) { - console.error(`[GraphQL Client] Response status: ${axiosError.response.status}`); - console.error(`[GraphQL Client] Response data: ${JSON.stringify(axiosError.response.data).slice(0, 500)}`); - } - if (axiosError.code) { - console.error(`[GraphQL Client] Error code: ${axiosError.code}`); - } - } - else { - console.error(`[GraphQL Client] Error: ${error.message}`); - } - throw error; - } -} -/** - * Resolve a dispensary slug to its internal platform ID. - * - * STRATEGY: - * 1. Navigate to embedded menu page and extract window.reactEnv.dispensaryId (preferred) - * 2. Fall back to GraphQL GetAddressBasedDispensaryData query if reactEnv fails - * - * Returns the dispensaryId (platform_dispensary_id) or null if not found. - * Throws if page returns 403/404 so caller can mark as not_crawlable. - */ -async function resolveDispensaryId(slug) { - const result = await resolveDispensaryIdWithDetails(slug); - return result.dispensaryId; -} -/** - * Resolve a dispensary slug with full details (HTTP status, source, error). - * Use this when you need to know WHY resolution failed. 
- */ -async function resolveDispensaryIdWithDetails(slug) { - console.log(`[GraphQL Client] Resolving dispensary ID for slug: ${slug}`); - const session = await createSession(slug); - try { - // Check HTTP status first - if 403/404, the store is not crawlable - if (session.httpStatus && (session.httpStatus === 403 || session.httpStatus === 404)) { - console.log(`[GraphQL Client] Page returned HTTP ${session.httpStatus} for ${slug} - not crawlable`); - return { - dispensaryId: null, - httpStatus: session.httpStatus, - error: `HTTP ${session.httpStatus}: Store removed or not accessible`, - source: 'reactEnv', - }; - } - // PREFERRED: Use dispensaryId from window.reactEnv (extracted during createSession) - if (session.dispensaryId) { - console.log(`[GraphQL Client] Resolved ${slug} -> ${session.dispensaryId} (from reactEnv)`); - return { - dispensaryId: session.dispensaryId, - httpStatus: session.httpStatus, - source: 'reactEnv', - }; - } - // FALLBACK: Try GraphQL query - console.log(`[GraphQL Client] reactEnv.dispensaryId not found for ${slug}, trying GraphQL...`); - const variables = { - dispensaryFilter: { - cNameOrID: slug, - }, - }; - const result = await executeGraphQL(session, 'GetAddressBasedDispensaryData', variables, dutchie_1.GRAPHQL_HASHES.GetAddressBasedDispensaryData, slug); - const dispensaryId = result?.data?.dispensaryBySlug?.id || - result?.data?.dispensary?.id || - result?.data?.getAddressBasedDispensaryData?.dispensary?.id; - if (dispensaryId) { - console.log(`[GraphQL Client] Resolved ${slug} -> ${dispensaryId} (from GraphQL)`); - return { - dispensaryId, - httpStatus: session.httpStatus, - source: 'graphql', - }; - } - console.log(`[GraphQL Client] Could not resolve ${slug}, GraphQL response:`, JSON.stringify(result).slice(0, 300)); - return { - dispensaryId: null, - httpStatus: session.httpStatus, - error: 'Could not extract dispensaryId from reactEnv or GraphQL', - }; - } - finally { - await closeSession(session); - } -} -/** - * Discover 
Arizona dispensaries via geo-based query - */ -async function discoverArizonaDispensaries() { - console.log('[GraphQL Client] Discovering Arizona dispensaries...'); - // Use Phoenix as the default center - const session = await createSession('AZ-Deeply-Rooted'); - const allDispensaries = []; - const seenIds = new Set(); - try { - for (const centerpoint of dutchie_1.ARIZONA_CENTERPOINTS) { - console.log(`[GraphQL Client] Scanning ${centerpoint.name}...`); - const variables = { - dispensariesFilter: { - latitude: centerpoint.lat, - longitude: centerpoint.lng, - distance: 100, - state: 'AZ', - }, - }; - try { - const result = await executeGraphQL(session, 'ConsumerDispensaries', variables, dutchie_1.GRAPHQL_HASHES.ConsumerDispensaries, 'AZ-Deeply-Rooted'); - const dispensaries = result?.data?.consumerDispensaries || []; - for (const d of dispensaries) { - const id = d.id || d.dispensaryId; - if (id && !seenIds.has(id)) { - seenIds.add(id); - allDispensaries.push(d); - } - } - console.log(`[GraphQL Client] Found ${dispensaries.length} in ${centerpoint.name} (${allDispensaries.length} total unique)`); - } - catch (error) { - console.warn(`[GraphQL Client] Error scanning ${centerpoint.name}: ${error.message}`); - } - // Delay between requests - await new Promise((r) => setTimeout(r, 1000)); - } - } - finally { - await closeSession(session); - } - console.log(`[GraphQL Client] Discovery complete: ${allDispensaries.length} dispensaries`); - return allDispensaries; -} -// ============================================================ -// PRODUCT FILTERING VARIABLES -// ============================================================ -/** - * Build filter variables for FilteredProducts query - * - * CRITICAL: Uses dispensaryId directly (the MongoDB ObjectId, e.g. "6405ef617056e8014d79101b") - * NOT dispensaryFilter.cNameOrID! 
/**
 * Build the GraphQL variables object for a FilteredProducts query.
 *
 * CRITICAL: uses productsFilter.dispensaryId directly (the platform MongoDB
 * ObjectId, e.g. "6405ef617056e8014d79101b") — NOT dispensaryFilter.cNameOrID.
 *
 * Mode A ('mode_a') = UI parity: adds Status: "Active" so only live products
 * are returned. Mode B = MAX COVERAGE: omits Status so OOS/inactive products
 * are returned as well.
 *
 * @param {string} platformDispensaryId - platform dispensary ObjectId
 * @param {string} pricingType - 'rec' or 'med'
 * @param {string} crawlMode - 'mode_a' or 'mode_b'
 * @param {number} page - zero-based page index
 * @param {number} perPage - page size
 * @returns {{productsFilter: object, page: number, perPage: number}}
 */
function buildFilterVariables(platformDispensaryId, pricingType, crawlMode, page, perPage) {
    const isModeA = crawlMode === 'mode_a';
    // Per CLAUDE.md Rule #11: Use simple productsFilter with dispensaryId directly
    // Do NOT use dispensaryFilter.cNameOrID - that's outdated
    const productsFilter = {
        dispensaryId: platformDispensaryId,
        pricingType: pricingType,
    };
    // Mode A: Only active products (UI parity) - Status: "Active"
    // Mode B: No Status filter = returns all products including OOS/inactive
    if (isModeA) {
        productsFilter.Status = 'Active';
    }
    return {
        productsFilter,
        page,
        perPage,
    };
}
// ============================================================
// PRODUCT FETCHING WITH PAGINATION
// ============================================================
/**
 * Fetch all product pages for one crawl mode.
 *
 * Per page: retries transient failures with linear backoff (1s, 2s, ...),
 * stops on GraphQL errors, tolerates one empty page, and treats a non-empty
 * short page as the final page. Returns { products, totalCount, crawlMode }.
 *
 * BUG FIX: previously the "incomplete page" check (products.length < perPage)
 * also matched empty pages (0 < perPage), so the consecutiveEmptyPages
 * tolerance below was dead code and a single transient empty page aborted
 * pagination early. The short-page check now only applies to non-empty pages.
 */
async function fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode) {
    const perPage = dutchie_1.dutchieConfig.perPage;
    const maxPages = dutchie_1.dutchieConfig.maxPages;
    const maxRetries = dutchie_1.dutchieConfig.maxRetries;
    const pageDelayMs = dutchie_1.dutchieConfig.pageDelayMs;
    const allProducts = [];
    let pageNum = 0;
    let totalCount = 0;
    let consecutiveEmptyPages = 0;
    console.log(`[GraphQL Client] Fetching products for ${cName} (platformId: ${platformDispensaryId}, ${pricingType}, ${crawlMode})...`);
    while (pageNum < maxPages) {
        const variables = buildFilterVariables(platformDispensaryId, pricingType, crawlMode, pageNum, perPage);
        let result = null;
        let lastError = null;
        // Retry logic: up to maxRetries extra attempts with linear backoff
        for (let attempt = 0; attempt <= maxRetries; attempt++) {
            try {
                result = await executeGraphQL(session, 'FilteredProducts', variables, dutchie_1.GRAPHQL_HASHES.FilteredProducts, cName);
                lastError = null;
                break;
            }
            catch (error) {
                lastError = error;
                console.warn(`[GraphQL Client] Page ${pageNum} attempt ${attempt + 1} failed: ${error.message}`);
                if (attempt < maxRetries) {
                    await new Promise((r) => setTimeout(r, 1000 * (attempt + 1)));
                }
            }
        }
        if (lastError) {
            console.error(`[GraphQL Client] Page ${pageNum} failed after ${maxRetries + 1} attempts`);
            break;
        }
        if (result?.errors) {
            console.error('[GraphQL Client] GraphQL errors:', JSON.stringify(result.errors));
            break;
        }
        // Log response shape on first page (debugging aid for upstream API drift)
        if (pageNum === 0) {
            console.log(`[GraphQL Client] Response keys: ${Object.keys(result || {}).join(', ')}`);
            if (result?.data) {
                console.log(`[GraphQL Client] data keys: ${Object.keys(result.data || {}).join(', ')}`);
            }
            if (!result?.data?.filteredProducts) {
                console.log(`[GraphQL Client] WARNING: No filteredProducts in response!`);
                console.log(`[GraphQL Client] Full response: ${JSON.stringify(result).slice(0, 1000)}`);
            }
        }
        const products = result?.data?.filteredProducts?.products || [];
        const queryInfo = result?.data?.filteredProducts?.queryInfo;
        if (queryInfo?.totalCount) {
            totalCount = queryInfo.totalCount;
        }
        console.log(`[GraphQL Client] Page ${pageNum}: ${products.length} products (total so far: ${allProducts.length + products.length}/${totalCount})`);
        if (products.length === 0) {
            // Tolerate a single empty page; two in a row means we're done.
            consecutiveEmptyPages++;
            if (consecutiveEmptyPages >= 2) {
                console.log('[GraphQL Client] Multiple empty pages, stopping pagination');
                break;
            }
        }
        else {
            consecutiveEmptyPages = 0;
            allProducts.push(...products);
            // A non-empty short page is the last page (see BUG FIX note above)
            if (products.length < perPage) {
                console.log(`[GraphQL Client] Incomplete page (${products.length} < ${perPage}), stopping`);
                break;
            }
        }
        pageNum++;
        await new Promise((r) => setTimeout(r, pageDelayMs));
    }
    console.log(`[GraphQL Client] Fetched ${allProducts.length} total products (${crawlMode})`);
    return { products: allProducts, totalCount: totalCount || allProducts.length, crawlMode };
}
// ============================================================
// LEGACY SINGLE-MODE INTERFACE
// ============================================================
/**
 * Fetch all products for a dispensary (single mode).
 * Creates (and always closes) a session bound to options.cName.
 * @throws if options.cName is missing — a session from another store must
 *         never be reused.
 */
async function fetchAllProducts(platformDispensaryId, pricingType = 'rec', options = {}) {
    const { crawlMode = 'mode_a' } = options;
    // cName is now REQUIRED - no default fallback to avoid using wrong store's session
    const cName = options.cName;
    if (!cName) {
        throw new Error('[GraphQL Client] cName is required for fetchAllProducts - cannot use another store\'s session');
    }
    const session = await createSession(cName);
    try {
        return await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, crawlMode);
    }
    finally {
        await closeSession(session);
    }
}
// ============================================================
// MODE A+B MERGING
// ============================================================
/**
 * Merge POSMetaData.children arrays from Mode A and Mode B products.
 * Mode A children win on key collisions; keys fall back through
 * canonicalID -> canonicalSKU -> canonicalPackageId -> option. Children with
 * no usable key are dropped.
 */
function mergeProductOptions(modeAProduct, modeBProduct) {
    const modeAChildren = modeAProduct.POSMetaData?.children || [];
    const modeBChildren = modeBProduct.POSMetaData?.children || [];
    const getOptionKey = (child) => {
        return child.canonicalID || child.canonicalSKU || child.canonicalPackageId || child.option || '';
    };
    const mergedMap = new Map();
    for (const child of modeAChildren) {
        const key = getOptionKey(child);
        if (key)
            mergedMap.set(key, child);
    }
    for (const child of modeBChildren) {
        const key = getOptionKey(child);
        if (key && !mergedMap.has(key)) {
            mergedMap.set(key, child);
        }
    }
    return Array.from(mergedMap.values());
}
/**
 * Merge a Mode A product with a Mode B product.
 * The Mode A product's fields win; only POSMetaData.children is merged.
 * Returns the Mode A product unchanged when no Mode B counterpart exists.
 */
function mergeProducts(modeAProduct, modeBProduct) {
    if (!modeBProduct) {
        return modeAProduct;
    }
    const mergedChildren = mergeProductOptions(modeAProduct, modeBProduct);
    return {
        ...modeAProduct,
        POSMetaData: {
            ...modeAProduct.POSMetaData,
            children: mergedChildren,
        },
    };
}
// ============================================================
// MAIN EXPORT: TWO-MODE CRAWL
// ============================================================
/**
 * Fetch products using BOTH crawl modes with a SINGLE session.
 * Runs Mode A then Mode B (with a configured delay between them) and merges
 * the results by product _id. Returns { modeA, modeB, merged }.
 * @throws if options.cName is missing.
 */
async function fetchAllProductsBothModes(platformDispensaryId, pricingType = 'rec', options = {}) {
    // cName is now REQUIRED - no default fallback to avoid using wrong store's session
    const cName = options.cName;
    if (!cName) {
        throw new Error('[GraphQL Client] cName is required for fetchAllProductsBothModes - cannot use another store\'s session');
    }
    console.log(`[GraphQL Client] Running two-mode crawl for ${cName} (${pricingType})...`);
    console.log(`[GraphQL Client] Platform ID: ${platformDispensaryId}, cName: ${cName}`);
    const session = await createSession(cName);
    try {
        // Mode A (UI parity)
        const modeAResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_a');
        // Delay between modes
        await new Promise((r) => setTimeout(r, dutchie_1.dutchieConfig.modeDelayMs));
        // Mode B (MAX COVERAGE)
        const modeBResult = await fetchProductsForMode(session, platformDispensaryId, cName, pricingType, 'mode_b');
        // Merge results
        const modeBMap = new Map();
        for (const product of modeBResult.products) {
            modeBMap.set(product._id, product);
        }
        const productMap = new Map();
        // Add Mode A products, merging with Mode B if exists
        for (const product of modeAResult.products) {
            const modeBProduct = modeBMap.get(product._id);
            const mergedProduct = mergeProducts(product, modeBProduct);
            productMap.set(product._id, mergedProduct);
        }
        // Add Mode B products not in Mode A
        for (const product of modeBResult.products) {
            if (!productMap.has(product._id)) {
                productMap.set(product._id, product);
            }
        }
        const mergedProducts = Array.from(productMap.values());
        console.log(`[GraphQL Client] Merged: ${mergedProducts.length} unique products`);
        console.log(`[GraphQL Client] Mode A: ${modeAResult.products.length}, Mode B: ${modeBResult.products.length}`);
        return {
            modeA: { products: modeAResult.products, totalCount: modeAResult.totalCount },
            modeB: { products: modeBResult.products, totalCount: modeBResult.totalCount },
            merged: { products: mergedProducts, totalCount: mergedProducts.length },
        };
    }
    finally {
        await closeSession(session);
    }
}
// Tail of the tsc-emitted __setModuleDefault helper (its head is on the
// preceding line of this compiled file): copies a module onto `default`.
(function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// tsc-emitted namespace-import helper: wraps a CommonJS module so
// `import * as x` semantics work.
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.getWorkerId = getWorkerId;
exports.getWorkerHostname = getWorkerHostname;
exports.enqueueJob = enqueueJob;
exports.bulkEnqueueJobs = bulkEnqueueJobs;
exports.claimNextJob = claimNextJob;
exports.updateJobProgress = updateJobProgress;
exports.heartbeat = heartbeat;
exports.completeJob = completeJob;
exports.failJob = failJob;
exports.getQueueStats = getQueueStats;
exports.getActiveWorkers = getActiveWorkers;
exports.getRunningJobs = getRunningJobs;
exports.recoverStaleJobs = recoverStaleJobs;
exports.cleanupOldJobs = cleanupOldJobs;
const connection_1 = require("../db/connection");
const uuid_1 = require("uuid");
const os = __importStar(require("os"));
// ============================================================
// WORKER IDENTITY
// ============================================================
// Process-wide cached worker id; computed lazily by getWorkerId().
let _workerId = null;
/**
 * Get or create a unique worker ID for this process
 * In Kubernetes, uses POD_NAME for clarity; otherwise generates a unique ID
 * of the form "<hostname>-<pid>-<8-char-uuid>".
 */
function getWorkerId() {
    if (!_workerId) {
        // Prefer POD_NAME in K8s (set via fieldRef)
        const podName = process.env.POD_NAME;
        if (podName) {
            _workerId = podName;
        }
        else {
            const hostname = os.hostname();
            const pid = process.pid;
            const uuid = (0, uuid_1.v4)().slice(0, 8);
            _workerId = `${hostname}-${pid}-${uuid}`;
        }
    }
    return _workerId;
}
/**
 * Get hostname for worker tracking
 * In Kubernetes, uses POD_NAME; otherwise uses os.hostname()
 */
function getWorkerHostname() {
    return process.env.POD_NAME || os.hostname();
}
// ============================================================
// JOB ENQUEUEING
// ============================================================
/**
 * Enqueue a new job for processing
 * Returns null if a pending/running job already exists for this dispensary
 * (dedup is only applied when a dispensaryId is given).
 * NOTE(review): the existence check and the INSERT are separate statements,
 * so two concurrent enqueuers can still race — confirm whether a unique
 * partial index backs this up.
 */
async function enqueueJob(options) {
    const { jobType, dispensaryId, priority = 0, metadata, maxRetries = 3, } = options;
    // Check if there's already a pending/running job for this dispensary
    if (dispensaryId) {
        const { rows: existing } = await (0, connection_1.query)(`SELECT id FROM dispensary_crawl_jobs
       WHERE dispensary_id = $1 AND status IN ('pending', 'running')
       LIMIT 1`, [dispensaryId]);
        if (existing.length > 0) {
            console.log(`[JobQueue] Skipping enqueue - job already exists for dispensary ${dispensaryId}`);
            return null;
        }
    }
    const { rows } = await (0, connection_1.query)(`INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
     VALUES ($1, $2, 'pending', $3, $4, $5, NOW())
     RETURNING id`, [jobType, dispensaryId || null, priority, maxRetries, metadata ? JSON.stringify(metadata) : null]);
    const jobId = rows[0].id;
    console.log(`[JobQueue] Enqueued job ${jobId} (type=${jobType}, dispensary=${dispensaryId})`);
    return jobId;
}
/**
 * Bulk enqueue jobs for multiple dispensaries
 * Skips dispensaries that already have pending/running jobs.
 * Returns { enqueued, skipped }.
 * NOTE(review): duplicate ids inside dispensaryIds are not de-duplicated
 * here, so a caller passing repeats can create duplicate jobs — verify
 * callers pass unique ids.
 */
async function bulkEnqueueJobs(jobType, dispensaryIds, options = {}) {
    const { priority = 0, metadata } = options;
    // Get dispensaries that already have pending/running jobs
    const { rows: existing } = await (0, connection_1.query)(`SELECT DISTINCT dispensary_id FROM dispensary_crawl_jobs
     WHERE dispensary_id = ANY($1) AND status IN ('pending', 'running')`, [dispensaryIds]);
    const existingSet = new Set(existing.map((r) => r.dispensary_id));
    // Filter out dispensaries with existing jobs
    const toEnqueue = dispensaryIds.filter(id => !existingSet.has(id));
    if (toEnqueue.length === 0) {
        return { enqueued: 0, skipped: dispensaryIds.length };
    }
    // Bulk insert - each row needs 4 params: job_type, dispensary_id, priority, metadata
    const metadataJson = metadata ? JSON.stringify(metadata) : null;
    const values = toEnqueue.map((_, i) => {
        const offset = i * 4;
        return `($${offset + 1}, $${offset + 2}, 'pending', $${offset + 3}, 3, $${offset + 4}, NOW())`;
    }).join(', ');
    const params = [];
    toEnqueue.forEach(dispensaryId => {
        params.push(jobType, dispensaryId, priority, metadataJson);
    });
    await (0, connection_1.query)(`INSERT INTO dispensary_crawl_jobs (job_type, dispensary_id, status, priority, max_retries, metadata, created_at)
     VALUES ${values}`, params);
    console.log(`[JobQueue] Bulk enqueued ${toEnqueue.length} jobs, skipped ${existingSet.size}`);
    return { enqueued: toEnqueue.length, skipped: existingSet.size };
}
// ============================================================
// JOB CLAIMING (with locking)
// ============================================================
/**
 * Claim the next available job from the queue
 * Uses SELECT FOR UPDATE SKIP LOCKED to prevent double-claims.
 * Claims are ordered by priority DESC, then created_at ASC, and the claim
 * holds a lease of lockDurationMinutes (default 30) via locked_until.
 * Returns the mapped job, or null when no pending job matches.
 */
async function claimNextJob(options) {
    const { workerId, jobTypes, lockDurationMinutes = 30 } = options;
    const hostname = getWorkerHostname();
    const client = await (0, connection_1.getClient)();
    try {
        await client.query('BEGIN');
        // Build job type filter
        let typeFilter = '';
        const params = [workerId, hostname, lockDurationMinutes];
        // $1-$3 are fixed above, so an optional job-type filter starts at $4
        let paramIndex = 4;
        if (jobTypes && jobTypes.length > 0) {
            typeFilter = `AND job_type = ANY($${paramIndex})`;
            params.push(jobTypes);
            paramIndex++;
        }
        // Claim the next pending job using FOR UPDATE SKIP LOCKED
        // This atomically selects and locks a row, skipping any already locked by other workers
        const { rows } = await client.query(`UPDATE dispensary_crawl_jobs
       SET
         status = 'running',
         claimed_by = $1,
         claimed_at = NOW(),
         worker_id = $1,
         worker_hostname = $2,
         started_at = NOW(),
         locked_until = NOW() + ($3 || ' minutes')::INTERVAL,
         last_heartbeat_at = NOW(),
         updated_at = NOW()
       WHERE id = (
         SELECT id FROM dispensary_crawl_jobs
         WHERE status = 'pending'
         ${typeFilter}
         ORDER BY priority DESC, created_at ASC
         FOR UPDATE SKIP LOCKED
         LIMIT 1
       )
       RETURNING *`, params);
        await client.query('COMMIT');
        if (rows.length === 0) {
            return null;
        }
        const job = mapDbRowToJob(rows[0]);
        console.log(`[JobQueue] Worker ${workerId} claimed job ${job.id} (type=${job.jobType}, dispensary=${job.dispensaryId})`);
        return job;
    }
    catch (error) {
        await client.query('ROLLBACK');
        throw error;
    }
    finally {
        client.release();
    }
}
// ============================================================
// JOB PROGRESS & COMPLETION
// ============================================================
/**
 * Update job progress (for live monitoring)
 * Only the fields present on `progress` are written; every call also
 * refreshes last_heartbeat_at/updated_at.
 */
async function updateJobProgress(jobId, progress) {
    const updates = ['last_heartbeat_at = NOW()', 'updated_at = NOW()'];
    const params = [];
    let paramIndex = 1;
    if (progress.productsFound !== undefined) {
        updates.push(`products_found = $${paramIndex++}`);
        params.push(progress.productsFound);
    }
    if (progress.productsUpserted !== undefined) {
        updates.push(`products_upserted = $${paramIndex++}`);
        params.push(progress.productsUpserted);
    }
    if (progress.snapshotsCreated !== undefined) {
        updates.push(`snapshots_created = $${paramIndex++}`);
        params.push(progress.snapshotsCreated);
    }
    if (progress.currentPage !== undefined) {
        updates.push(`current_page = $${paramIndex++}`);
        params.push(progress.currentPage);
    }
    if (progress.totalPages !== undefined) {
        updates.push(`total_pages = $${paramIndex++}`);
        params.push(progress.totalPages);
    }
    params.push(jobId);
    await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params);
}
/**
 * Send heartbeat to keep job alive (prevents timeout)
 * Extends the lease by a fixed 30 minutes; no-op unless the job is running.
 */
async function heartbeat(jobId) {
    await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
     SET last_heartbeat_at = NOW(), locked_until = NOW() + INTERVAL '30 minutes'
     WHERE id = $1 AND status = 'running'`, [jobId]);
}
/**
 * Mark job as completed
 * Any count left undefined in `result` keeps the value already on the row
 * (COALESCE in SQL).
 */
async function completeJob(jobId, result) {
    await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
     SET
       status = 'completed',
       completed_at = NOW(),
       products_found = COALESCE($2, products_found),
       products_upserted = COALESCE($3, products_upserted),
       snapshots_created = COALESCE($4, snapshots_created),
       updated_at = NOW()
     WHERE id = $1`, [jobId, result.productsFound, result.productsUpserted, result.snapshotsCreated]);
    console.log(`[JobQueue] Job ${jobId} completed`);
}
/**
 * Mark job as failed
 * Re-queues the job (clearing claim/worker fields) while retry_count is
 * below max_retries; otherwise marks it failed permanently.
 * @returns {boolean} true when the job was re-queued for retry.
 */
async function failJob(jobId, errorMessage) {
    // Check if we should retry
    const { rows } = await (0, connection_1.query)(`SELECT retry_count, max_retries FROM dispensary_crawl_jobs WHERE id = $1`, [jobId]);
    if (rows.length === 0)
        return false;
    const { retry_count, max_retries } = rows[0];
    if (retry_count < max_retries) {
        // Re-queue for retry
        await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
       SET
         status = 'pending',
         retry_count = retry_count + 1,
         claimed_by = NULL,
         claimed_at = NULL,
         worker_id = NULL,
         worker_hostname = NULL,
         started_at = NULL,
         locked_until = NULL,
         last_heartbeat_at = NULL,
         error_message = $2,
         updated_at = NOW()
       WHERE id = $1`, [jobId, errorMessage]);
        console.log(`[JobQueue] Job ${jobId} failed, re-queued for retry (${retry_count + 1}/${max_retries})`);
        return true; // Will retry
    }
    else {
        // Mark as failed permanently
        await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
       SET
         status = 'failed',
         completed_at = NOW(),
         error_message = $2,
         updated_at = NOW()
       WHERE id = $1`, [jobId, errorMessage]);
        console.log(`[JobQueue] Job ${jobId} failed permanently after ${retry_count} retries`);
        return false; // No more retries
    }
}
// ============================================================
// QUEUE MONITORING
// ============================================================
/**
 * Get queue statistics
 * Reads the v_queue_stats DB view; every counter defaults to 0 when the
 * view returns no row or NULL columns.
 */
async function getQueueStats() {
    const { rows } = await (0, connection_1.query)(`SELECT * FROM v_queue_stats`);
    const stats = rows[0] || {};
    return {
        pending: parseInt(stats.pending_jobs || '0', 10),
        running: parseInt(stats.running_jobs || '0', 10),
        completed1h: parseInt(stats.completed_1h || '0', 10),
        failed1h: parseInt(stats.failed_1h || '0', 10),
        activeWorkers: parseInt(stats.active_workers || '0', 10),
        avgDurationSeconds: stats.avg_duration_seconds ? parseFloat(stats.avg_duration_seconds) : null,
    };
}
/**
 * Get active workers
 * Reads the v_active_workers DB view and maps each row into a camelCase
 * worker summary (counts parsed as base-10 integers, timestamps as Dates).
 */
async function getActiveWorkers() {
    const { rows } = await (0, connection_1.query)(`SELECT * FROM v_active_workers`);
    return rows.map((row) => ({
        workerId: row.worker_id,
        hostname: row.worker_hostname,
        currentJobs: parseInt(row.current_jobs || '0', 10),
        totalProductsFound: parseInt(row.total_products_found || '0', 10),
        totalProductsUpserted: parseInt(row.total_products_upserted || '0', 10),
        totalSnapshots: parseInt(row.total_snapshots || '0', 10),
        firstClaimedAt: new Date(row.first_claimed_at),
        lastHeartbeat: row.last_heartbeat ? new Date(row.last_heartbeat) : null,
    }));
}
/**
 * Get running jobs with worker info
 * Joins dispensaries for display name/city; newest-started first.
 */
async function getRunningJobs() {
    const { rows } = await (0, connection_1.query)(`SELECT cj.*, d.name as dispensary_name, d.city
     FROM dispensary_crawl_jobs cj
     LEFT JOIN dispensaries d ON cj.dispensary_id = d.id
     WHERE cj.status = 'running'
     ORDER BY cj.started_at DESC`);
    return rows.map(mapDbRowToJob);
}
/**
 * Recover stale jobs (workers that died without completing)
 * Re-queues running jobs whose heartbeat is older than staleMinutes and
 * which still have retries left, clearing all claim/worker fields.
 * @returns {number} count of jobs recovered.
 */
async function recoverStaleJobs(staleMinutes = 15) {
    const { rowCount } = await (0, connection_1.query)(`UPDATE dispensary_crawl_jobs
     SET
       status = 'pending',
       claimed_by = NULL,
       claimed_at = NULL,
       worker_id = NULL,
       worker_hostname = NULL,
       started_at = NULL,
       locked_until = NULL,
       error_message = 'Recovered from stale worker',
       retry_count = retry_count + 1,
       updated_at = NOW()
     WHERE status = 'running'
       AND last_heartbeat_at < NOW() - ($1 || ' minutes')::INTERVAL
       AND retry_count < max_retries`, [staleMinutes]);
    if (rowCount && rowCount > 0) {
        console.log(`[JobQueue] Recovered ${rowCount} stale jobs`);
    }
    return rowCount || 0;
}
/**
 * Clean up old completed/failed jobs
 * Deletes terminal jobs whose completed_at is older than olderThanDays.
 * @returns {number} count of jobs deleted.
 */
async function cleanupOldJobs(olderThanDays = 7) {
    const { rowCount } = await (0, connection_1.query)(`DELETE FROM dispensary_crawl_jobs
     WHERE status IN ('completed', 'failed')
       AND completed_at < NOW() - ($1 || ' days')::INTERVAL`, [olderThanDays]);
    if (rowCount && rowCount > 0) {
        console.log(`[JobQueue] Cleaned up ${rowCount} old jobs`);
    }
    return rowCount || 0;
}
// ============================================================
// HELPERS
// ============================================================
// Map a snake_case dispensary_crawl_jobs row (optionally joined with
// dispensaries) into the camelCase job object used throughout this module.
function mapDbRowToJob(row) {
    return {
        id: row.id,
        jobType: row.job_type,
        dispensaryId: row.dispensary_id,
        status: row.status,
        priority: row.priority || 0,
        retryCount: row.retry_count || 0,
        maxRetries: row.max_retries || 3,
        claimedBy: row.claimed_by,
        claimedAt: row.claimed_at ? new Date(row.claimed_at) : null,
        workerHostname: row.worker_hostname,
        startedAt: row.started_at ? new Date(row.started_at) : null,
        completedAt: row.completed_at ? new Date(row.completed_at) : null,
        errorMessage: row.error_message,
        productsFound: row.products_found || 0,
        productsUpserted: row.products_upserted || 0,
        snapshotsCreated: row.snapshots_created || 0,
        currentPage: row.current_page || 0,
        totalPages: row.total_pages,
        lastHeartbeatAt: row.last_heartbeat_at ? new Date(row.last_heartbeat_at) : null,
        metadata: row.metadata,
        createdAt: new Date(row.created_at),
        // Add extra fields from join if present
        ...(row.dispensary_name && { dispensaryName: row.dispensary_name }),
        ...(row.city && { city: row.city }),
    };
}
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlWebsiteForMenuLinks = crawlWebsiteForMenuLinks;
exports.detectProviderFromUrl = detectProviderFromUrl;
exports.detectAndResolveDispensary = detectAndResolveDispensary;
exports.runBulkDetection = runBulkDetection;
exports.executeMenuDetectionJob = executeMenuDetectionJob;
exports.getDetectionStats = getDetectionStats;
exports.getDispensariesNeedingDetection = getDispensariesNeedingDetection;
const connection_1 = require("../db/connection");
const discovery_1 = require("./discovery");
const graphql_client_1 = require("./graphql-client");
// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences)
const DISPENSARY_COLUMNS = `
  id, name, slug, city, state, zip, address, latitude, longitude,
  menu_type, menu_url, platform_dispensary_id, website,
  provider_detection_data, created_at, updated_at
`;
// ============================================================
// PROVIDER DETECTION PATTERNS
// ============================================================
// Ordered provider -> URL-pattern table; first provider whose pattern
// matches wins, so dutchie (including white-label domains) takes priority.
const PROVIDER_URL_PATTERNS = [
    // We detect provider based on the actual menu link we find, not just the site domain.
    {
        provider: 'dutchie',
        patterns: [
            /dutchie\.com/i,
            /\/embedded-menu\//i,
            /\/dispensary\/[A-Z]{2}-/i, // e.g., /dispensary/AZ-store-name
            /dutchie-plus/i,
            /curaleaf\.com/i, // Curaleaf uses Dutchie platform
            /livewithsol\.com/i, // Sol Flower uses Dutchie platform
        ],
    },
    {
        provider: 'treez',
        patterns: [
            /treez\.io/i,
            /shop\.treez/i,
            /treez-ecommerce/i,
        ],
    },
    {
        provider: 'jane',
        patterns: [
            /jane\.co/i,
            /iheartjane\.com/i,
            /embed\.iheartjane/i,
        ],
    },
    {
        provider: 'weedmaps',
        patterns: [
            /weedmaps\.com/i,
            /menu\.weedmaps/i,
        ],
    },
    {
        provider: 'leafly',
        patterns: [
            /leafly\.com/i,
            /order\.leafly/i,
        ],
    },
    {
        provider: 'meadow',
        patterns: [
            /getmeadow\.com/i,
            /meadow\.co/i,
        ],
    },
    {
        provider: 'blaze',
        patterns: [
            /blaze\.me/i,
            /blazepos\.com/i,
        ],
    },
    {
        provider: 'flowhub',
        patterns: [
            /flowhub\.com/i,
            /flowhub\.co/i,
        ],
    },
    {
        provider: 'dispense',
        patterns: [
            /dispense\.io/i,
            /dispenseapp\.com/i,
        ],
    },
];
/**
 * Link patterns that suggest a menu or ordering page
 * (used to decide which same-site links are worth following).
 */
const MENU_LINK_PATTERNS = [
    /\/menu/i,
    /\/order/i,
    /\/shop/i,
    /\/products/i,
    /\/dispensary/i,
    /\/store/i,
    /curaleaf\.com/i,
    /dutchie\.com/i,
    /treez\.io/i,
    /jane\.co/i,
    /iheartjane\.com/i,
    /weedmaps\.com/i,
    /leafly\.com/i,
    /getmeadow\.com/i,
    /blaze\.me/i,
    /flowhub\.com/i,
    /dispense\.io/i,
];
/**
 * Check if a URL is a Curaleaf store URL
 * (matches curaleaf.com/stores/... or curaleaf.com/dispensary/...).
 */
function isCuraleafUrl(url) {
    if (!url)
        return false;
    return /curaleaf\.com\/(stores|dispensary)\//i.test(url);
}
/**
 * Fetch a page and extract all links
 *
 * Returns { links, error? }. Links are absolutized against the page URL and
 * de-duplicated. If the HTML embeds a Dutchie reactEnv dispensaryId (a
 * 24-hex-char ObjectId), a single sentinel link of the form
 * "dutchie-reactenv:<id>" is returned instead of the page's anchors.
 * Aborts via AbortController after `timeout` ms (default 10s).
 */
async function fetchPageLinks(url, timeout = 10000) {
    try {
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), timeout);
        // Use Googlebot User-Agent to bypass age gates on dispensary websites
        const response = await fetch(url, {
            signal: controller.signal,
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            },
            redirect: 'follow',
        });
        clearTimeout(timeoutId);
        if (!response.ok) {
            return { links: [], error: `HTTP ${response.status}` };
        }
        const html = await response.text();
        // Quick check: if the page contains reactEnv.dispensaryId, treat it as Dutchie
        // Use direct match for dispensaryId - the [^}]* pattern fails with nested braces in JSON
        const reactEnvMatch = /"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i.exec(html);
        if (reactEnvMatch && reactEnvMatch[1]) {
            return { links: [`dutchie-reactenv:${reactEnvMatch[1]}`] };
        }
        // Extract all href attributes from anchor tags
        // (stateful /g regex: each exec() advances lastIndex through the HTML)
        const linkRegex = /href=["']([^"']+)["']/gi;
        const links = [];
        let match;
        while ((match = linkRegex.exec(html)) !== null) {
            const href = match[1];
            // Convert relative URLs to absolute
            try {
                const absoluteUrl = new URL(href, url).href;
                links.push(absoluteUrl);
            }
            catch {
                // Skip invalid URLs
            }
        }
        // Also look for iframe src attributes (common for embedded menus)
        // NOTE(review): this pattern matches ANY src= attribute (img/script too),
        // but only provider-matching URLs are kept, so the noise is filtered.
        const iframeRegex = /src=["']([^"']+)["']/gi;
        while ((match = iframeRegex.exec(html)) !== null) {
            const src = match[1];
            try {
                const absoluteUrl = new URL(src, url).href;
                // Only add if it matches a provider pattern
                for (const { patterns } of PROVIDER_URL_PATTERNS) {
                    if (patterns.some(p => p.test(absoluteUrl))) {
                        links.push(absoluteUrl);
                        break;
                    }
                }
            }
            catch {
                // Skip invalid URLs
            }
        }
        return { links: [...new Set(links)] }; // Deduplicate
    }
    catch (error) {
        if (error.name === 'AbortError') {
            return { links: [], error: 'Timeout' };
        }
        return { links: [], error: error.message };
    }
}
/**
 * Crawl a dispensary's website to find menu provider links
 *
 * Strategy:
 * 1. Fetch the homepage and extract all links
 * 2. Look for an embedded Dutchie reactEnv dispensaryId on the homepage
 * 3. Look for links that match known provider patterns (dutchie, treez, etc.)
 * 4. If no direct match, follow up to 3 menu/order/shop links and check
 *    those pages for provider patterns
 *
 * Returns { menuUrl, provider, foundLinks, crawledPages, error?,
 * platformDispensaryId? } — the function returns early at the first
 * confident detection.
 */
async function crawlWebsiteForMenuLinks(websiteUrl) {
    console.log(`[WebsiteCrawl] Crawling ${websiteUrl} for menu links...`);
    const result = {
        menuUrl: null,
        provider: 'unknown',
        foundLinks: [],
        crawledPages: [],
    };
    // Normalize URL (add https:// when a bare domain is given)
    let baseUrl;
    try {
        baseUrl = new URL(websiteUrl);
        if (!baseUrl.protocol.startsWith('http')) {
            baseUrl = new URL(`https://${websiteUrl}`);
        }
    }
    catch {
        result.error = 'Invalid website URL';
        return result;
    }
    // Step 1: Fetch the homepage
    const homepage = baseUrl.href;
    result.crawledPages.push(homepage);
    const { links: homepageLinks, error: homepageError } = await fetchPageLinks(homepage);
    if (homepageError) {
        result.error = `Failed to fetch homepage: ${homepageError}`;
        return result;
    }
    result.foundLinks = homepageLinks;
    // Step 2a: Try to extract reactEnv.dispensaryId (embedded Dutchie menu) from homepage HTML
    // NOTE(review): this re-fetches the homepage that fetchPageLinks already
    // downloaded — a second network round-trip for the same URL.
    try {
        // Use Googlebot User-Agent to bypass age gates on dispensary websites
        const resp = await fetch(homepage, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            },
            redirect: 'follow',
        });
        if (resp.ok) {
            const html = await resp.text();
            // Look for dispensaryId directly - the [^}]* pattern fails with nested braces
            const reactEnvMatch = /"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i.exec(html);
            if (reactEnvMatch && reactEnvMatch[1]) {
                result.provider = 'dutchie';
                result.menuUrl = homepage;
                result.platformDispensaryId = reactEnvMatch[1];
                console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${reactEnvMatch[1]} on homepage ${homepage}`);
                return result;
            }
        }
    }
    catch (err) {
        console.log(`[WebsiteCrawl] reactEnv check failed for ${homepage}: ${err.message}`);
    }
    // Step 2b: Check for reactEnv token from fetchPageLinks (encoded as dutchie-reactenv:<id>)
    for (const link of homepageLinks) {
        const reactEnvToken = /^dutchie-reactenv:(.+)$/.exec(link);
        if (reactEnvToken) {
            result.menuUrl = homepage;
            result.provider = 'dutchie';
            result.platformDispensaryId = reactEnvToken[1];
            console.log(`[WebsiteCrawl] Found reactEnv.dispensaryId=${reactEnvToken[1]} on ${homepage}`);
            return result;
        }
    }
    // Step 3: Check for direct provider matches in homepage links
    for (const link of homepageLinks) {
        for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
            if (patterns.some(p => p.test(link))) {
                console.log(`[WebsiteCrawl] Found ${provider} link on homepage: ${link}`);
                result.menuUrl = link;
                result.provider = provider;
                return result;
            }
        }
    }
    // Step 4a: Find menu/order/shop links to follow
    const menuLinks = homepageLinks.filter(link => {
        // Must be same domain or a known provider domain
        try {
            const linkUrl = new URL(link);
            const isSameDomain = linkUrl.hostname === baseUrl.hostname ||
                linkUrl.hostname.endsWith(`.${baseUrl.hostname}`);
            const isProviderDomain = PROVIDER_URL_PATTERNS.some(({ patterns }) => patterns.some(p => p.test(link)));
            const isMenuPath = MENU_LINK_PATTERNS.some(p => p.test(link));
            return (isSameDomain && isMenuPath) || isProviderDomain;
        }
        catch {
            return false;
        }
    });
    console.log(`[WebsiteCrawl] Found ${menuLinks.length} potential menu links to follow`);
    // Step 4b: Follow menu links (limit to 3 to avoid excessive crawling)
    for (const menuLink of menuLinks.slice(0, 3)) {
        // Skip if we've already crawled this page
        if (result.crawledPages.includes(menuLink))
            continue;
        // Check if this link itself is a provider URL
        for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
            if (patterns.some(p => p.test(menuLink))) {
                console.log(`[WebsiteCrawl] Menu link is a ${provider} URL: ${menuLink}`);
                result.menuUrl = menuLink;
                result.provider = provider;
                return result;
            }
        }
        result.crawledPages.push(menuLink);
        // Rate limit
        await new Promise(r => setTimeout(r, 500));
        const { links: pageLinks, error: pageError } = await fetchPageLinks(menuLink);
        if (pageError) {
            console.log(`[WebsiteCrawl] Failed to fetch ${menuLink}: ${pageError}`);
            continue;
        }
        result.foundLinks.push(...pageLinks);
        // Check for provider matches on this page
        for (const link of pageLinks) {
            for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
                if (patterns.some(p => p.test(link))) {
                    console.log(`[WebsiteCrawl] Found ${provider} link on ${menuLink}: ${link}`);
                    result.menuUrl = link;
                    result.provider = provider;
                    return result;
                }
            }
        }
    }
    console.log(`[WebsiteCrawl] No menu provider found on ${websiteUrl}`);
    return result;
}
// ============================================================
// CORE DETECTION FUNCTIONS
// ============================================================
/**
 * Detect menu provider from a URL
 * Returns the first matching provider name, 'custom' for a valid non-localhost
 * URL that matches no known provider, or 'unknown' for null/invalid input.
 */
function detectProviderFromUrl(menuUrl) {
    if (!menuUrl)
        return 'unknown';
    for (const { provider, patterns } of PROVIDER_URL_PATTERNS) {
        for (const pattern of patterns) {
            if (pattern.test(menuUrl)) {
                return provider;
            }
        }
    }
    // Check if it's a custom website (has a domain but doesn't match known providers)
    try {
        const url = new URL(menuUrl);
        if (url.hostname && !url.hostname.includes('localhost')) {
            return 'custom';
        }
    }
    catch {
        // Invalid URL
    }
    return 'unknown';
}
=== 0) { - return { - dispensaryId, - dispensaryName: 'Unknown', - previousMenuType: null, - detectedProvider: 'unknown', - cName: null, - platformDispensaryId: null, - success: false, - error: 'Dispensary not found', - }; - } - const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]); - let menuUrl = dispensary.menuUrl; - const previousMenuType = dispensary.menuType || null; - const website = dispensary.website; - // If menu_url is null or empty, try to discover it by crawling the dispensary website - if (!menuUrl || menuUrl.trim() === '') { - console.log(`[MenuDetection] ${dispensary.name}: No menu_url - attempting website crawl`); - // Check if website is available - if (!website || website.trim() === '') { - console.log(`[MenuDetection] ${dispensary.name}: No website available - marking as not crawlable`); - await (0, connection_1.query)(` - UPDATE dispensaries SET - menu_type = 'unknown', - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', 'unknown'::text, - 'detection_method', 'no_data'::text, - 'detected_at', NOW(), - 'resolution_error', 'No menu_url and no website available'::text, - 'not_crawlable', true, - 'website_crawl_attempted', false - ), - updated_at = NOW() - WHERE id = $1 - `, [dispensaryId]); - return { - dispensaryId, - dispensaryName: dispensary.name, - previousMenuType, - detectedProvider: 'unknown', - cName: null, - platformDispensaryId: null, - success: true, - error: 'No menu_url and no website available - marked as not crawlable', - }; - } - // Crawl the website to find menu provider links - console.log(`[MenuDetection] ${dispensary.name}: Crawling website ${website} for menu links...`); - const crawlResult = await crawlWebsiteForMenuLinks(website); - if (crawlResult.menuUrl && crawlResult.provider !== 'unknown') { - // SUCCESS: Found a menu URL from website crawl! 
- console.log(`[MenuDetection] ${dispensary.name}: Found ${crawlResult.provider} menu at ${crawlResult.menuUrl}`); - menuUrl = crawlResult.menuUrl; - // Update the dispensary with the discovered menu_url - await (0, connection_1.query)(` - UPDATE dispensaries SET - menu_url = $1, - menu_type = $2, - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', $2::text, - 'detection_method', 'website_crawl'::text, - 'detected_at', NOW(), - 'website_crawled', $3::text, - 'website_crawl_pages', $4::jsonb, - 'not_crawlable', false - ), - updated_at = NOW() - WHERE id = $5 - `, [ - crawlResult.menuUrl, - crawlResult.provider, - website, - JSON.stringify(crawlResult.crawledPages), - dispensaryId - ]); - // Continue with full detection flow using the discovered menu_url - } - else { - // Website crawl failed to find a menu provider - const errorReason = crawlResult.error || 'No menu provider links found on website'; - console.log(`[MenuDetection] ${dispensary.name}: Website crawl failed - ${errorReason}`); - await (0, connection_1.query)(` - UPDATE dispensaries SET - menu_type = 'unknown', - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', 'unknown'::text, - 'detection_method', 'website_crawl'::text, - 'detected_at', NOW(), - 'website_crawled', $1::text, - 'website_crawl_pages', $2::jsonb, - 'resolution_error', $3::text, - 'not_crawlable', true - ), - updated_at = NOW() - WHERE id = $4 - `, [ - website, - JSON.stringify(crawlResult.crawledPages), - errorReason, - dispensaryId - ]); - return { - dispensaryId, - dispensaryName: dispensary.name, - previousMenuType, - detectedProvider: 'unknown', - cName: null, - platformDispensaryId: null, - success: true, - error: `Website crawl failed: ${errorReason}`, - }; - } - } - // Detect provider from URL - const detectedProvider = detectProviderFromUrl(menuUrl); - console.log(`[MenuDetection] 
${dispensary.name}: Detected provider = ${detectedProvider} from URL: ${menuUrl}`); - // Initialize result - const result = { - dispensaryId, - dispensaryName: dispensary.name, - previousMenuType, - detectedProvider, - cName: null, - platformDispensaryId: null, - success: false, - }; - // If not dutchie, just update menu_type (non-dutchie providers) - // Note: curaleaf.com and livewithsol.com are detected directly as 'dutchie' via PROVIDER_URL_PATTERNS - if (detectedProvider !== 'dutchie') { - await (0, connection_1.query)(` - UPDATE dispensaries SET - menu_type = $1, - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', $1::text, - 'detection_method', 'url_pattern'::text, - 'detected_at', NOW(), - 'not_crawlable', false - ), - updated_at = NOW() - WHERE id = $2 - `, [detectedProvider, dispensaryId]); - result.success = true; - console.log(`[MenuDetection] ${dispensary.name}: Updated menu_type to ${detectedProvider}`); - return result; - } - // For dutchie: extract cName or platformId from menu_url - const extraction = (0, discovery_1.extractFromMenuUrl)(menuUrl); - if (!extraction) { - result.error = `Could not extract cName or platformId from menu_url: ${menuUrl}`; - await (0, connection_1.query)(` - UPDATE dispensaries SET - menu_type = 'dutchie', - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', 'dutchie'::text, - 'detection_method', 'url_pattern'::text, - 'detected_at', NOW(), - 'resolution_error', $1::text, - 'not_crawlable', true - ), - updated_at = NOW() - WHERE id = $2 - `, [result.error, dispensaryId]); - console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`); - return result; - } - // If URL contains platform_dispensary_id directly (e.g., /api/v2/embedded-menu/.js), skip GraphQL resolution - if (extraction.type === 'platformId') { - const platformId = extraction.value; - result.platformDispensaryId = 
platformId; - result.success = true; - await (0, connection_1.query)(` - UPDATE dispensaries SET - menu_type = 'dutchie', - platform_dispensary_id = $1, - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', 'dutchie'::text, - 'detection_method', 'url_direct_platform_id'::text, - 'detected_at', NOW(), - 'platform_id_source', 'url_embedded'::text, - 'platform_id_resolved', true, - 'platform_id_resolved_at', NOW(), - 'resolution_error', NULL::text, - 'not_crawlable', false - ), - updated_at = NOW() - WHERE id = $2 - `, [platformId, dispensaryId]); - console.log(`[MenuDetection] ${dispensary.name}: Platform ID extracted directly from URL = ${platformId}`); - return result; - } - // Otherwise, we have a cName that needs GraphQL resolution - const cName = extraction.value; - result.cName = cName; - // Resolve platform_dispensary_id from cName - console.log(`[MenuDetection] ${dispensary.name}: Resolving platform ID for cName = ${cName}`); - try { - const platformId = await (0, graphql_client_1.resolveDispensaryId)(cName); - if (platformId) { - result.platformDispensaryId = platformId; - result.success = true; - await (0, connection_1.query)(` - UPDATE dispensaries SET - menu_type = 'dutchie', - platform_dispensary_id = $1, - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', 'dutchie'::text, - 'detection_method', 'url_pattern'::text, - 'detected_at', NOW(), - 'cname_extracted', $2::text, - 'platform_id_resolved', true, - 'platform_id_resolved_at', NOW(), - 'resolution_error', NULL::text, - 'not_crawlable', false - ), - updated_at = NOW() - WHERE id = $3 - `, [platformId, cName, dispensaryId]); - console.log(`[MenuDetection] ${dispensary.name}: Resolved platform ID = ${platformId}`); - } - else { - // cName resolution failed - try crawling website as fallback - console.log(`[MenuDetection] ${dispensary.name}: cName "${cName}" not found 
on Dutchie, trying website crawl fallback...`); - if (website && website.trim() !== '') { - const fallbackCrawl = await crawlWebsiteForMenuLinks(website); - if (fallbackCrawl.menuUrl && fallbackCrawl.provider === 'dutchie') { - // Found Dutchie menu via website crawl! - console.log(`[MenuDetection] ${dispensary.name}: Found Dutchie menu via website crawl: ${fallbackCrawl.menuUrl}`); - // Extract from the new menu URL - const newExtraction = (0, discovery_1.extractFromMenuUrl)(fallbackCrawl.menuUrl); - if (newExtraction) { - let fallbackPlatformId = null; - if (newExtraction.type === 'platformId') { - fallbackPlatformId = newExtraction.value; - } - else { - // Try to resolve the new cName - fallbackPlatformId = await (0, graphql_client_1.resolveDispensaryId)(newExtraction.value); - } - if (fallbackPlatformId) { - result.platformDispensaryId = fallbackPlatformId; - result.success = true; - result.cName = newExtraction.value; - await (0, connection_1.query)(` - UPDATE dispensaries SET - menu_type = 'dutchie', - menu_url = $1, - platform_dispensary_id = $2, - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', 'dutchie'::text, - 'detection_method', 'website_crawl_fallback'::text, - 'detected_at', NOW(), - 'original_cname', $3::text, - 'fallback_cname', $4::text, - 'website_crawled', $5::text, - 'platform_id_resolved', true, - 'platform_id_resolved_at', NOW(), - 'not_crawlable', false - ), - updated_at = NOW() - WHERE id = $6 - `, [fallbackCrawl.menuUrl, fallbackPlatformId, cName, newExtraction.value, website, dispensaryId]); - console.log(`[MenuDetection] ${dispensary.name}: Resolved via website crawl, platform ID = ${fallbackPlatformId}`); - return result; - } - } - } - } - // Website crawl fallback didn't work either - result.error = `cName "${cName}" could not be resolved - may not exist on Dutchie`; - await (0, connection_1.query)(` - UPDATE dispensaries SET - menu_type = 'dutchie', - 
platform_dispensary_id = NULL, - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', 'dutchie'::text, - 'detection_method', 'url_pattern'::text, - 'detected_at', NOW(), - 'cname_extracted', $1::text, - 'platform_id_resolved', false, - 'resolution_error', $2::text, - 'website_crawl_attempted', true, - 'not_crawlable', true - ), - updated_at = NOW() - WHERE id = $3 - `, [cName, result.error, dispensaryId]); - console.log(`[MenuDetection] ${dispensary.name}: ${result.error}`); - } - } - catch (error) { - result.error = `Resolution failed: ${error.message}`; - await (0, connection_1.query)(` - UPDATE dispensaries SET - menu_type = 'dutchie', - provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || - jsonb_build_object( - 'detected_provider', 'dutchie'::text, - 'detection_method', 'url_pattern'::text, - 'detected_at', NOW(), - 'cname_extracted', $1::text, - 'platform_id_resolved', false, - 'resolution_error', $2::text, - 'not_crawlable', true - ), - updated_at = NOW() - WHERE id = $3 - `, [cName, result.error, dispensaryId]); - console.error(`[MenuDetection] ${dispensary.name}: ${result.error}`); - } - return result; -} -/** - * Run bulk detection on all dispensaries with unknown/missing menu_type or platform_dispensary_id - * Also includes dispensaries with no menu_url but with a website (for website crawl discovery) - */ -async function runBulkDetection(options = {}) { - const { state, onlyUnknown = true, onlyMissingPlatformId = false, includeWebsiteCrawl = true, includeDutchieMissingPlatformId = true, limit, } = options; - console.log('[MenuDetection] Starting bulk detection...'); - // Build query to find dispensaries needing detection - // Includes: dispensaries with menu_url OR (no menu_url but has website and not already marked not_crawlable) - // Optionally includes dutchie stores missing platform ID - let whereClause = `WHERE ( - menu_url IS NOT NULL - ${includeWebsiteCrawl 
? `OR ( - menu_url IS NULL - AND website IS NOT NULL - AND website != '' - AND (provider_detection_data IS NULL OR NOT (provider_detection_data->>'not_crawlable')::boolean) - )` : ''} - ${includeDutchieMissingPlatformId ? `OR ( - menu_type = 'dutchie' AND platform_dispensary_id IS NULL - )` : ''} - )`; - const params = []; - let paramIndex = 1; - if (state) { - whereClause += ` AND state = $${paramIndex++}`; - params.push(state); - } - // Handle filters for unknown and/or missing platform IDs - if (onlyUnknown && onlyMissingPlatformId) { - whereClause += ` AND ( - (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown') - OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL) - )`; - } - else if (onlyUnknown) { - whereClause += ` AND ( - (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown') - ${includeDutchieMissingPlatformId ? `OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)` : ''} - )`; - } - else if (onlyMissingPlatformId) { - whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`; - } - else if (includeDutchieMissingPlatformId) { - // Always attempt to resolve dutchie stores missing platform IDs - whereClause += ` AND (menu_type = 'dutchie' AND platform_dispensary_id IS NULL)`; - } - let query_str = ` - SELECT ${DISPENSARY_COLUMNS} FROM dispensaries - ${whereClause} - ORDER BY name - `; - if (limit) { - query_str += ` LIMIT $${paramIndex}`; - params.push(limit); - } - const { rows: dispensaries } = await (0, connection_1.query)(query_str, params); - console.log(`[MenuDetection] Found ${dispensaries.length} dispensaries to process (includeWebsiteCrawl=${includeWebsiteCrawl})`); - const result = { - totalProcessed: 0, - totalSucceeded: 0, - totalFailed: 0, - totalSkipped: 0, - results: [], - errors: [], - }; - for (const row of dispensaries) { - result.totalProcessed++; - try { - const detectionResult = await detectAndResolveDispensary(row.id); - result.results.push(detectionResult); - if 
(detectionResult.success) { - result.totalSucceeded++; - } - else { - result.totalFailed++; - if (detectionResult.error) { - result.errors.push(`${detectionResult.dispensaryName}: ${detectionResult.error}`); - } - } - // Rate limit between requests - await new Promise(r => setTimeout(r, 1000)); - } - catch (error) { - result.totalFailed++; - result.errors.push(`${row.name || row.id}: ${error.message}`); - } - } - console.log(`[MenuDetection] Bulk detection complete: ${result.totalSucceeded} succeeded, ${result.totalFailed} failed`); - return result; -} -// ============================================================ -// SCHEDULED JOB EXECUTOR -// ============================================================ -/** - * Execute the menu detection job (called by scheduler) - */ -async function executeMenuDetectionJob(config = {}) { - const state = config.state || 'AZ'; - const onlyUnknown = config.onlyUnknown !== false; - // Default to true - always try to resolve platform IDs for dutchie stores - const onlyMissingPlatformId = config.onlyMissingPlatformId !== false; - const includeDutchieMissingPlatformId = config.includeDutchieMissingPlatformId !== false; - console.log(`[MenuDetection] Executing scheduled job for state=${state}...`); - try { - const result = await runBulkDetection({ - state, - onlyUnknown, - onlyMissingPlatformId, - includeDutchieMissingPlatformId, - }); - const status = result.totalFailed === 0 ? 'success' : - result.totalSucceeded === 0 ? 'error' : 'partial'; - return { - status, - itemsProcessed: result.totalProcessed, - itemsSucceeded: result.totalSucceeded, - itemsFailed: result.totalFailed, - errorMessage: result.errors.length > 0 ? 
result.errors.slice(0, 5).join('; ') : undefined, - metadata: { - state, - onlyUnknown, - onlyMissingPlatformId, - providerCounts: countByProvider(result.results), - }, - }; - } - catch (error) { - return { - status: 'error', - itemsProcessed: 0, - itemsSucceeded: 0, - itemsFailed: 0, - errorMessage: error.message, - }; - } -} -/** - * Count results by detected provider - */ -function countByProvider(results) { - const counts = {}; - for (const r of results) { - counts[r.detectedProvider] = (counts[r.detectedProvider] || 0) + 1; - } - return counts; -} -// ============================================================ -// UTILITY FUNCTIONS -// ============================================================ -/** - * Get detection stats for dashboard - */ -async function getDetectionStats() { - const { rows } = await (0, connection_1.query)(` - SELECT - COUNT(*) as total, - COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != '' AND menu_type != 'unknown') as with_menu_type, - COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id, - COUNT(*) FILTER (WHERE menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown')) as needs_detection - FROM dispensaries - WHERE state = 'AZ' - `); - const stats = rows[0] || {}; - // Get provider breakdown - const { rows: providerRows } = await (0, connection_1.query)(` - SELECT menu_type, COUNT(*) as count - FROM dispensaries - WHERE state = 'AZ' AND menu_type IS NOT NULL AND menu_type != '' - GROUP BY menu_type - ORDER BY count DESC - `); - const byProvider = {}; - for (const row of providerRows) { - byProvider[row.menu_type] = parseInt(row.count, 10); - } - return { - totalDispensaries: parseInt(stats.total || '0', 10), - withMenuType: parseInt(stats.with_menu_type || '0', 10), - withPlatformId: parseInt(stats.with_platform_id || '0', 10), - needsDetection: parseInt(stats.needs_detection || '0', 10), - byProvider, - }; -} -/** - * Get dispensaries needing detection - * 
Includes dispensaries with website but no menu_url for website crawl discovery - */ -async function getDispensariesNeedingDetection(options = {}) { - const { state = 'AZ', limit = 100, includeWebsiteCrawl = true } = options; - const { rows } = await (0, connection_1.query)(` - SELECT ${DISPENSARY_COLUMNS} FROM dispensaries - WHERE state = $1 - AND ( - (menu_url IS NOT NULL AND (menu_type IS NULL OR menu_type = '' OR menu_type = 'unknown' - OR (menu_type = 'dutchie' AND platform_dispensary_id IS NULL))) - ${includeWebsiteCrawl ? `OR ( - menu_url IS NULL - AND website IS NOT NULL - AND website != '' - AND (provider_detection_data IS NULL OR NOT (provider_detection_data->>'not_crawlable')::boolean) - )` : ''} - ) - ORDER BY name - LIMIT $2 - `, [state, limit]); - return rows.map(discovery_1.mapDbRowToDispensary); -} diff --git a/backend/dist/dutchie-az/services/product-crawler.js b/backend/dist/dutchie-az/services/product-crawler.js deleted file mode 100644 index b831835d..00000000 --- a/backend/dist/dutchie-az/services/product-crawler.js +++ /dev/null @@ -1,843 +0,0 @@ -"use strict"; -/** - * Dutchie AZ Product Crawler Service - * - * Crawls products from Dutchie dispensaries and stores them in the dutchie_az database. - * Handles normalization from GraphQL response to database entities. - * - * IMPORTANT: Uses chunked batch processing per CLAUDE.md Rule #15 to avoid OOM. 
- */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.normalizeProduct = normalizeProduct; -exports.normalizeSnapshot = normalizeSnapshot; -exports.crawlDispensaryProducts = crawlDispensaryProducts; -exports.crawlAllArizonaDispensaries = crawlAllArizonaDispensaries; -const connection_1 = require("../db/connection"); -const graphql_client_1 = require("./graphql-client"); -const discovery_1 = require("./discovery"); -const types_1 = require("../types"); -const image_storage_1 = require("../../utils/image-storage"); -// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences) -const DISPENSARY_COLUMNS = ` - id, name, slug, city, state, zip, address, latitude, longitude, - menu_type, menu_url, platform_dispensary_id, website, - provider_detection_data, created_at, updated_at -`; -// ============================================================ -// BATCH PROCESSING CONFIGURATION -// ============================================================ -/** Chunk size for batch DB writes (per CLAUDE.md Rule #15) */ -const BATCH_CHUNK_SIZE = 100; -// ============================================================ -// NORMALIZATION FUNCTIONS -// ============================================================ -/** - * Convert price to cents - */ -function toCents(price) { - if (price === undefined || price === null) - return undefined; - return Math.round(price * 100); -} -/** - * Get min value from array of numbers - */ -function getMin(arr) { - if (!arr || arr.length === 0) - return undefined; - return Math.min(...arr.filter((n) => n !== null && n !== undefined)); -} -/** - * Get max value from array of numbers - */ -function getMax(arr) { - if (!arr || arr.length === 0) - return undefined; - return Math.max(...arr.filter((n) => n !== null && n !== undefined)); -} -/** - * Normalize a value to boolean - * Handles Dutchie API returning {} or [] or other non-boolean values - * that would cause "invalid input syntax for type 
boolean" errors - */ -function normBool(v, defaultVal = false) { - if (v === true) - return true; - if (v === false) - return false; - // Log unexpected object/array values once for debugging - if (v !== null && v !== undefined && typeof v === 'object') { - console.warn(`[normBool] Unexpected object value, coercing to ${defaultVal}:`, JSON.stringify(v)); - } - return defaultVal; -} -/** - * Normalize a value to Date or undefined - * Handles Dutchie API returning {} or [] or other non-date values - * that would cause "invalid input syntax for type timestamp" errors - */ -function normDate(v) { - if (!v) - return undefined; - // Reject objects/arrays that aren't dates - if (typeof v === 'object' && !(v instanceof Date)) { - console.warn(`[normDate] Unexpected object value, ignoring:`, JSON.stringify(v)); - return undefined; - } - // Try parsing - const d = new Date(v); - if (isNaN(d.getTime())) { - console.warn(`[normDate] Invalid date value, ignoring:`, v); - return undefined; - } - return d; -} -/** - * Extract cName (Dutchie slug) from menuUrl or dispensary slug - * Handles URL formats: - * - https://dutchie.com/embedded-menu/AZ-Deeply-Rooted -> AZ-Deeply-Rooted - * - https://dutchie.com/dispensary/sol-flower-dispensary-mcclintock -> sol-flower-dispensary-mcclintock - * Falls back to dispensary.slug if menuUrl extraction fails - */ -function extractCName(dispensary) { - if (dispensary.menuUrl) { - try { - const url = new URL(dispensary.menuUrl); - // Extract last path segment: /embedded-menu/X or /dispensary/X - const segments = url.pathname.split('/').filter(Boolean); - if (segments.length >= 2) { - const cName = segments[segments.length - 1]; - if (cName) { - console.log(`[ProductCrawler] Extracted cName "${cName}" from menuUrl`); - return cName; - } - } - } - catch (e) { - console.warn(`[ProductCrawler] Failed to parse menuUrl: ${dispensary.menuUrl}`); - } - } - // Fallback to slug - console.log(`[ProductCrawler] Using dispensary slug "${dispensary.slug}" as 
cName`); - return dispensary.slug; -} -/** - * Normalize a POSMetaData.children entry to DutchieProductOptionSnapshot - */ -function normalizeOption(child) { - return { - optionId: child.canonicalID || child.canonicalPackageId || child.canonicalSKU || child.option || 'unknown', - canonicalId: child.canonicalID, - canonicalPackageId: child.canonicalPackageId, - canonicalSKU: child.canonicalSKU, - canonicalName: child.canonicalName, - canonicalCategory: child.canonicalCategory, - canonicalCategoryId: child.canonicalCategoryId, - canonicalBrandId: child.canonicalBrandId, - canonicalBrandName: child.canonicalBrandName, - canonicalStrainId: child.canonicalStrainId, - canonicalVendorId: child.canonicalVendorId, - optionLabel: child.option, - packageQuantity: child.packageQuantity, - recEquivalent: child.recEquivalent, - standardEquivalent: child.standardEquivalent, - priceCents: toCents(child.price), - recPriceCents: toCents(child.recPrice), - medPriceCents: toCents(child.medPrice), - quantity: child.quantity, - quantityAvailable: child.quantityAvailable, - kioskQuantityAvailable: child.kioskQuantityAvailable, - activeBatchTags: child.activeBatchTags, - canonicalImgUrl: child.canonicalImgUrl, - canonicalLabResultUrl: child.canonicalLabResultUrl, - canonicalEffectivePotencyMg: child.canonicalEffectivePotencyMg, - rawChildPayload: child, - }; -} -/** - * Normalize a raw Dutchie product to DutchieProduct (canonical identity) - */ -function normalizeProduct(raw, dispensaryId, platformDispensaryId) { - return { - dispensaryId, - platform: 'dutchie', - externalProductId: raw._id || raw.id || '', - platformDispensaryId, - cName: raw.cName, - name: raw.Name, - // Brand - brandName: raw.brandName || raw.brand?.name, - brandId: raw.brandId || raw.brand?.id, - brandLogoUrl: raw.brandLogo || raw.brand?.imageUrl, - // Classification - type: raw.type, - subcategory: raw.subcategory, - strainType: raw.strainType, - provider: raw.provider, - // Potency - thc: raw.THC, - thcContent: 
raw.THCContent?.range?.[0], - cbd: raw.CBD, - cbdContent: raw.CBDContent?.range?.[0], - cannabinoidsV2: raw.cannabinoidsV2, - effects: raw.effects, - // Status / flags - status: raw.Status, - medicalOnly: normBool(raw.medicalOnly, false), - recOnly: normBool(raw.recOnly, false), - featured: normBool(raw.featured, false), - comingSoon: normBool(raw.comingSoon, false), - certificateOfAnalysisEnabled: normBool(raw.certificateOfAnalysisEnabled, false), - isBelowThreshold: normBool(raw.isBelowThreshold, false), - isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false), - optionsBelowThreshold: normBool(raw.optionsBelowThreshold, false), - optionsBelowKioskThreshold: normBool(raw.optionsBelowKioskThreshold, false), - // Derived stock status - stockStatus: (0, types_1.deriveStockStatus)(raw), - totalQuantityAvailable: (0, types_1.calculateTotalQuantity)(raw), - // Images - primaryImageUrl: raw.Image || raw.images?.[0]?.url, - images: raw.images, - // Misc - measurements: raw.measurements, - weight: typeof raw.weight === 'number' ? 
String(raw.weight) : raw.weight, - pastCNames: raw.pastCNames, - createdAtDutchie: normDate(raw.createdAt), - updatedAtDutchie: normDate(raw.updatedAt), - latestRawPayload: raw, - }; -} -/** - * Normalize a raw Dutchie product to DutchieProductSnapshot (time-series data) - */ -function normalizeSnapshot(raw, dutchieProductId, dispensaryId, platformDispensaryId, pricingType, crawlMode = 'mode_a') { - const children = raw.POSMetaData?.children || []; - const options = children.map(normalizeOption); - // Aggregate prices from various sources - const recPrices = raw.recPrices || []; - const medPrices = raw.medicalPrices || []; - const recSpecialPrices = raw.recSpecialPrices || []; - const medSpecialPrices = raw.medicalSpecialPrices || []; - const wholesalePrices = raw.wholesalePrices || []; - // Also consider child prices - const childRecPrices = children.map((c) => c.recPrice).filter((p) => p !== undefined); - const childMedPrices = children.map((c) => c.medPrice).filter((p) => p !== undefined); - const childPrices = children.map((c) => c.price).filter((p) => p !== undefined); - // Aggregate inventory - use calculateTotalQuantity for proper null handling - const totalQty = (0, types_1.calculateTotalQuantity)(raw); - const hasAnyKioskQty = children.some(c => typeof c.kioskQuantityAvailable === 'number'); - const totalKioskQty = hasAnyKioskQty - ? 
children.reduce((sum, c) => sum + (c.kioskQuantityAvailable || 0), 0) - : null; - // Determine if on special - const isOnSpecial = raw.special === true || - (raw.specialData?.saleSpecials && raw.specialData.saleSpecials.length > 0) || - (recSpecialPrices.length > 0 && recSpecialPrices[0] !== null) || - (medSpecialPrices.length > 0 && medSpecialPrices[0] !== null); - return { - dutchieProductId, - dispensaryId, - platformDispensaryId, - externalProductId: raw._id || raw.id || '', - pricingType, - crawlMode, - status: raw.Status, - featured: normBool(raw.featured, false), - special: normBool(isOnSpecial, false), - medicalOnly: normBool(raw.medicalOnly, false), - recOnly: normBool(raw.recOnly, false), - // Product was present in feed - isPresentInFeed: true, - // Derived stock status - stockStatus: (0, types_1.deriveStockStatus)(raw), - // Price summary - recMinPriceCents: toCents(getMin([...recPrices, ...childRecPrices, ...childPrices])), - recMaxPriceCents: toCents(getMax([...recPrices, ...childRecPrices, ...childPrices])), - recMinSpecialPriceCents: toCents(getMin(recSpecialPrices)), - medMinPriceCents: toCents(getMin([...medPrices, ...childMedPrices])), - medMaxPriceCents: toCents(getMax([...medPrices, ...childMedPrices])), - medMinSpecialPriceCents: toCents(getMin(medSpecialPrices)), - wholesaleMinPriceCents: toCents(getMin(wholesalePrices)), - // Inventory summary - null = unknown, 0 = all OOS - totalQuantityAvailable: totalQty, - totalKioskQuantityAvailable: totalKioskQty, - manualInventory: normBool(raw.manualInventory, false), - isBelowThreshold: normBool(raw.isBelowThreshold, false), - isBelowKioskThreshold: normBool(raw.isBelowKioskThreshold, false), - options, - rawPayload: raw, - crawledAt: new Date(), - }; -} -// ============================================================ -// DATABASE OPERATIONS -// ============================================================ -/** - * Upsert a DutchieProduct record - */ -async function upsertProduct(product) { - const 
result = await (0, connection_1.query)(` - INSERT INTO dutchie_products ( - dispensary_id, platform, external_product_id, platform_dispensary_id, - c_name, name, brand_name, brand_id, brand_logo_url, - type, subcategory, strain_type, provider, - thc, thc_content, cbd, cbd_content, cannabinoids_v2, effects, - status, medical_only, rec_only, featured, coming_soon, certificate_of_analysis_enabled, - is_below_threshold, is_below_kiosk_threshold, options_below_threshold, options_below_kiosk_threshold, - stock_status, total_quantity_available, - primary_image_url, images, measurements, weight, past_c_names, - created_at_dutchie, updated_at_dutchie, latest_raw_payload, updated_at - ) VALUES ( - $1, $2, $3, $4, - $5, $6, $7, $8, $9, - $10, $11, $12, $13, - $14, $15, $16, $17, $18, $19, - $20, $21, $22, $23, $24, $25, - $26, $27, $28, $29, - $30, $31, - $32, $33, $34, $35, $36, - $37, $38, $39, NOW() - ) - ON CONFLICT (dispensary_id, external_product_id) DO UPDATE SET - c_name = EXCLUDED.c_name, - name = EXCLUDED.name, - brand_name = EXCLUDED.brand_name, - brand_id = EXCLUDED.brand_id, - brand_logo_url = EXCLUDED.brand_logo_url, - type = EXCLUDED.type, - subcategory = EXCLUDED.subcategory, - strain_type = EXCLUDED.strain_type, - provider = EXCLUDED.provider, - thc = EXCLUDED.thc, - thc_content = EXCLUDED.thc_content, - cbd = EXCLUDED.cbd, - cbd_content = EXCLUDED.cbd_content, - cannabinoids_v2 = EXCLUDED.cannabinoids_v2, - effects = EXCLUDED.effects, - status = EXCLUDED.status, - medical_only = EXCLUDED.medical_only, - rec_only = EXCLUDED.rec_only, - featured = EXCLUDED.featured, - coming_soon = EXCLUDED.coming_soon, - certificate_of_analysis_enabled = EXCLUDED.certificate_of_analysis_enabled, - is_below_threshold = EXCLUDED.is_below_threshold, - is_below_kiosk_threshold = EXCLUDED.is_below_kiosk_threshold, - options_below_threshold = EXCLUDED.options_below_threshold, - options_below_kiosk_threshold = EXCLUDED.options_below_kiosk_threshold, - stock_status = 
EXCLUDED.stock_status, - total_quantity_available = EXCLUDED.total_quantity_available, - primary_image_url = EXCLUDED.primary_image_url, - images = EXCLUDED.images, - measurements = EXCLUDED.measurements, - weight = EXCLUDED.weight, - past_c_names = EXCLUDED.past_c_names, - created_at_dutchie = EXCLUDED.created_at_dutchie, - updated_at_dutchie = EXCLUDED.updated_at_dutchie, - latest_raw_payload = EXCLUDED.latest_raw_payload, - updated_at = NOW() - RETURNING id - `, [ - product.dispensaryId, - product.platform, - product.externalProductId, - product.platformDispensaryId, - product.cName, - product.name, - product.brandName, - product.brandId, - product.brandLogoUrl, - product.type, - product.subcategory, - product.strainType, - product.provider, - product.thc, - product.thcContent, - product.cbd, - product.cbdContent, - product.cannabinoidsV2 ? JSON.stringify(product.cannabinoidsV2) : null, - product.effects ? JSON.stringify(product.effects) : null, - product.status, - product.medicalOnly, - product.recOnly, - product.featured, - product.comingSoon, - product.certificateOfAnalysisEnabled, - product.isBelowThreshold, - product.isBelowKioskThreshold, - product.optionsBelowThreshold, - product.optionsBelowKioskThreshold, - product.stockStatus, - product.totalQuantityAvailable, - product.primaryImageUrl, - product.images ? JSON.stringify(product.images) : null, - product.measurements ? JSON.stringify(product.measurements) : null, - product.weight, - product.pastCNames, - product.createdAtDutchie, - product.updatedAtDutchie, - product.latestRawPayload ? 
JSON.stringify(product.latestRawPayload) : null, - ]); - return result.rows[0].id; -} -/** - * Download product image and update local image URLs - * Skips download if local image already exists for this product+URL combo - */ -async function downloadAndUpdateProductImage(productId, dispensaryId, externalProductId, primaryImageUrl) { - if (!primaryImageUrl) { - return { downloaded: false, error: 'No image URL' }; - } - try { - // Check if we already have this image locally - const exists = await (0, image_storage_1.imageExists)(dispensaryId, externalProductId, primaryImageUrl); - if (exists) { - return { downloaded: false }; - } - // Download and process the image - const result = await (0, image_storage_1.downloadProductImage)(primaryImageUrl, dispensaryId, externalProductId); - if (!result.success || !result.urls) { - return { downloaded: false, error: result.error }; - } - // Update the product record with local image URLs - await (0, connection_1.query)(` - UPDATE dutchie_products - SET - local_image_url = $1, - local_image_thumb_url = $2, - local_image_medium_url = $3, - original_image_url = COALESCE(original_image_url, primary_image_url), - updated_at = NOW() - WHERE id = $4 - `, [result.urls.full, result.urls.thumb, result.urls.medium, productId]); - return { downloaded: true }; - } - catch (error) { - return { downloaded: false, error: error.message }; - } -} -/** - * Insert a snapshot record - */ -async function insertSnapshot(snapshot) { - const result = await (0, connection_1.query)(` - INSERT INTO dutchie_product_snapshots ( - dutchie_product_id, dispensary_id, platform_dispensary_id, external_product_id, - pricing_type, crawl_mode, status, featured, special, medical_only, rec_only, - is_present_in_feed, stock_status, - rec_min_price_cents, rec_max_price_cents, rec_min_special_price_cents, - med_min_price_cents, med_max_price_cents, med_min_special_price_cents, - wholesale_min_price_cents, - total_quantity_available, total_kiosk_quantity_available, 
manual_inventory, - is_below_threshold, is_below_kiosk_threshold, - options, raw_payload, crawled_at - ) VALUES ( - $1, $2, $3, $4, - $5, $6, $7, $8, $9, $10, $11, - $12, $13, - $14, $15, $16, - $17, $18, $19, - $20, - $21, $22, $23, - $24, $25, - $26, $27, $28 - ) - RETURNING id - `, [ - snapshot.dutchieProductId, - snapshot.dispensaryId, - snapshot.platformDispensaryId, - snapshot.externalProductId, - snapshot.pricingType, - snapshot.crawlMode, - snapshot.status, - snapshot.featured, - snapshot.special, - snapshot.medicalOnly, - snapshot.recOnly, - snapshot.isPresentInFeed ?? true, - snapshot.stockStatus, - snapshot.recMinPriceCents, - snapshot.recMaxPriceCents, - snapshot.recMinSpecialPriceCents, - snapshot.medMinPriceCents, - snapshot.medMaxPriceCents, - snapshot.medMinSpecialPriceCents, - snapshot.wholesaleMinPriceCents, - snapshot.totalQuantityAvailable, - snapshot.totalKioskQuantityAvailable, - snapshot.manualInventory, - snapshot.isBelowThreshold, - snapshot.isBelowKioskThreshold, - JSON.stringify(snapshot.options || []), - JSON.stringify(snapshot.rawPayload || {}), - snapshot.crawledAt, - ]); - return result.rows[0].id; -} -// ============================================================ -// BATCH DATABASE OPERATIONS (per CLAUDE.md Rule #15) -// ============================================================ -/** - * Helper to chunk an array into smaller arrays - */ -function chunkArray(array, size) { - const chunks = []; - for (let i = 0; i < array.length; i += size) { - chunks.push(array.slice(i, i + size)); - } - return chunks; -} -/** - * Batch upsert products - processes in chunks to avoid OOM - * Returns a Map of externalProductId -> database id - */ -async function batchUpsertProducts(products) { - const productIdMap = new Map(); - const chunks = chunkArray(products, BATCH_CHUNK_SIZE); - console.log(`[ProductCrawler] Batch upserting ${products.length} products in ${chunks.length} chunks of ${BATCH_CHUNK_SIZE}...`); - for (let i = 0; i < chunks.length; 
i++) { - const chunk = chunks[i]; - // Process each product in the chunk - for (const product of chunk) { - try { - const id = await upsertProduct(product); - if (product.externalProductId) { - productIdMap.set(product.externalProductId, id); - } - } - catch (error) { - console.error(`[ProductCrawler] Error upserting product ${product.externalProductId}:`, error.message); - } - } - // Log progress - if ((i + 1) % 5 === 0 || i === chunks.length - 1) { - console.log(`[ProductCrawler] Upserted chunk ${i + 1}/${chunks.length} (${productIdMap.size} products so far)`); - } - } - return productIdMap; -} -/** - * Batch insert snapshots - processes in chunks to avoid OOM - */ -async function batchInsertSnapshots(snapshots) { - const chunks = chunkArray(snapshots, BATCH_CHUNK_SIZE); - let inserted = 0; - console.log(`[ProductCrawler] Batch inserting ${snapshots.length} snapshots in ${chunks.length} chunks of ${BATCH_CHUNK_SIZE}...`); - for (let i = 0; i < chunks.length; i++) { - const chunk = chunks[i]; - // Process each snapshot in the chunk - for (const snapshot of chunk) { - try { - await insertSnapshot(snapshot); - inserted++; - } - catch (error) { - console.error(`[ProductCrawler] Error inserting snapshot for ${snapshot.externalProductId}:`, error.message); - } - } - // Log progress - if ((i + 1) % 5 === 0 || i === chunks.length - 1) { - console.log(`[ProductCrawler] Inserted snapshot chunk ${i + 1}/${chunks.length} (${inserted} snapshots so far)`); - } - } - return inserted; -} -/** - * Update dispensary last_crawled_at and product_count - */ -async function updateDispensaryCrawlStats(dispensaryId, productCount) { - // Update last_crawl_at to track when we last crawled - // Skip product_count as that column may not exist - await (0, connection_1.query)(` - UPDATE dispensaries - SET last_crawl_at = NOW(), updated_at = NOW() - WHERE id = $1 - `, [dispensaryId]); -} -/** - * Mark products as missing from feed - * Creates a snapshot with isPresentInFeed=false and 
stockStatus='missing_from_feed' - * for products that were NOT in the UNION of Mode A and Mode B product lists - * - * IMPORTANT: Uses UNION of both modes to avoid false positives - * If the union is empty (possible outage), we skip marking to avoid data corruption - */ -async function markMissingProducts(dispensaryId, platformDispensaryId, modeAProductIds, modeBProductIds, pricingType) { - // Build UNION of Mode A + Mode B product IDs - const unionProductIds = new Set([...Array.from(modeAProductIds), ...Array.from(modeBProductIds)]); - // OUTAGE DETECTION: If union is empty, something went wrong - don't mark anything as missing - if (unionProductIds.size === 0) { - console.warn('[ProductCrawler] OUTAGE DETECTED: Both Mode A and Mode B returned 0 products. Skipping missing product marking.'); - return 0; - } - // Get all existing products for this dispensary that were not in the UNION - const { rows: missingProducts } = await (0, connection_1.query)(` - SELECT id, external_product_id, name - FROM dutchie_products - WHERE dispensary_id = $1 - AND external_product_id NOT IN (SELECT unnest($2::text[])) - `, [dispensaryId, Array.from(unionProductIds)]); - if (missingProducts.length === 0) { - return 0; - } - console.log(`[ProductCrawler] Marking ${missingProducts.length} products as missing from feed (union of ${modeAProductIds.size} Mode A + ${modeBProductIds.size} Mode B = ${unionProductIds.size} unique)...`); - const crawledAt = new Date(); - // Build all missing snapshots first (per CLAUDE.md Rule #15 - batch writes) - const missingSnapshots = missingProducts.map(product => ({ - dutchieProductId: product.id, - dispensaryId, - platformDispensaryId, - externalProductId: product.external_product_id, - pricingType, - crawlMode: 'mode_a', // Use mode_a for missing snapshots (convention) - status: undefined, - featured: false, - special: false, - medicalOnly: false, - recOnly: false, - isPresentInFeed: false, - stockStatus: 'missing_from_feed', - totalQuantityAvailable: 
undefined, // null = unknown, not 0 - manualInventory: false, - isBelowThreshold: false, - isBelowKioskThreshold: false, - options: [], - rawPayload: { _missingFromFeed: true, lastKnownName: product.name }, - crawledAt, - })); - // Batch insert missing snapshots - const snapshotsInserted = await batchInsertSnapshots(missingSnapshots); - // Batch update product stock status in chunks - const productIds = missingProducts.map(p => p.id); - const productChunks = chunkArray(productIds, BATCH_CHUNK_SIZE); - console.log(`[ProductCrawler] Updating ${productIds.length} product statuses in ${productChunks.length} chunks...`); - for (const chunk of productChunks) { - await (0, connection_1.query)(` - UPDATE dutchie_products - SET stock_status = 'missing_from_feed', total_quantity_available = NULL, updated_at = NOW() - WHERE id = ANY($1::int[]) - `, [chunk]); - } - console.log(`[ProductCrawler] Marked ${snapshotsInserted} products as missing from feed`); - return snapshotsInserted; -} -/** - * Process a batch of products from a single crawl mode - * IMPORTANT: Stores ALL products, never filters before DB - * Uses chunked batch processing per CLAUDE.md Rule #15 to avoid OOM - * Returns the set of external product IDs that were processed - */ -async function processProducts(products, dispensary, pricingType, crawlMode, options = {}) { - const { downloadImages = true } = options; - const productIds = new Set(); - let imagesDownloaded = 0; - let imageErrors = 0; - console.log(`[ProductCrawler] Processing ${products.length} products using chunked batch processing...`); - // Step 1: Normalize all products and collect IDs - const normalizedProducts = []; - const rawByExternalId = new Map(); - for (const raw of products) { - const externalId = raw._id || raw.id || ''; - productIds.add(externalId); - rawByExternalId.set(externalId, raw); - const normalized = normalizeProduct(raw, dispensary.id, dispensary.platformDispensaryId); - normalizedProducts.push(normalized); - } - // Step 2: 
Batch upsert products (chunked) - const productIdMap = await batchUpsertProducts(normalizedProducts); - const upserted = productIdMap.size; - // Step 3: Create and batch insert snapshots (chunked) - // IMPORTANT: Do this BEFORE image downloads to ensure snapshots are created even if images fail - const snapshots = []; - for (const [externalId, productId] of Array.from(productIdMap.entries())) { - const raw = rawByExternalId.get(externalId); - if (raw) { - const snapshot = normalizeSnapshot(raw, productId, dispensary.id, dispensary.platformDispensaryId, pricingType, crawlMode); - snapshots.push(snapshot); - } - } - const snapshotsInserted = await batchInsertSnapshots(snapshots); - // Step 4: Download images in chunks (if enabled) - // This is done AFTER snapshots to ensure core data is saved even if image downloads fail - if (downloadImages) { - const imageChunks = chunkArray(Array.from(productIdMap.entries()), BATCH_CHUNK_SIZE); - console.log(`[ProductCrawler] Downloading images in ${imageChunks.length} chunks...`); - for (let i = 0; i < imageChunks.length; i++) { - const chunk = imageChunks[i]; - for (const [externalId, productId] of chunk) { - const normalized = normalizedProducts.find(p => p.externalProductId === externalId); - if (normalized?.primaryImageUrl) { - try { - const imageResult = await downloadAndUpdateProductImage(productId, dispensary.id, externalId, normalized.primaryImageUrl); - if (imageResult.downloaded) { - imagesDownloaded++; - } - else if (imageResult.error && imageResult.error !== 'No image URL') { - imageErrors++; - } - } - catch (error) { - imageErrors++; - } - } - } - if ((i + 1) % 5 === 0 || i === imageChunks.length - 1) { - console.log(`[ProductCrawler] Image download chunk ${i + 1}/${imageChunks.length} (${imagesDownloaded} downloaded, ${imageErrors} errors)`); - } - } - } - // Clear references to help GC - normalizedProducts.length = 0; - rawByExternalId.clear(); - return { upserted, snapshots: snapshotsInserted, productIds, 
imagesDownloaded, imageErrors }; -} -async function crawlDispensaryProducts(dispensary, pricingType = 'rec', options = {}) { - const { useBothModes = true, downloadImages = true, onProgress } = options; - const startTime = Date.now(); - if (!dispensary.platformDispensaryId) { - return { - success: false, - dispensaryId: dispensary.id, - productsFound: 0, - productsFetched: 0, - productsUpserted: 0, - snapshotsCreated: 0, - errorMessage: 'Missing platformDispensaryId', - durationMs: Date.now() - startTime, - }; - } - try { - console.log(`[ProductCrawler] Crawling ${dispensary.name} (${dispensary.platformDispensaryId})...`); - let totalUpserted = 0; - let totalSnapshots = 0; - let totalImagesDownloaded = 0; - let totalImageErrors = 0; - let modeAProducts = 0; - let modeBProducts = 0; - let missingMarked = 0; - // Track product IDs separately for each mode (needed for missing product detection) - const modeAProductIds = new Set(); - const modeBProductIds = new Set(); - // Extract cName for this specific dispensary (used for Puppeteer session & headers) - const cName = extractCName(dispensary); - console.log(`[ProductCrawler] Using cName="${cName}" for dispensary ${dispensary.name}`); - if (useBothModes) { - // Run two-mode crawl for maximum coverage - const bothResults = await (0, graphql_client_1.fetchAllProductsBothModes)(dispensary.platformDispensaryId, pricingType, { cName }); - modeAProducts = bothResults.modeA.products.length; - modeBProducts = bothResults.modeB.products.length; - console.log(`[ProductCrawler] Two-mode crawl: Mode A=${modeAProducts}, Mode B=${modeBProducts}, Merged=${bothResults.merged.products.length}`); - // Collect Mode A product IDs - for (const p of bothResults.modeA.products) { - modeAProductIds.add(p._id); - } - // Collect Mode B product IDs - for (const p of bothResults.modeB.products) { - modeBProductIds.add(p._id); - } - // Process MERGED products (includes options from both modes) - if (bothResults.merged.products.length > 0) { - 
const mergedResult = await processProducts(bothResults.merged.products, dispensary, pricingType, 'mode_a', // Use mode_a for merged products (convention) - { downloadImages }); - totalUpserted = mergedResult.upserted; - totalSnapshots = mergedResult.snapshots; - totalImagesDownloaded = mergedResult.imagesDownloaded; - totalImageErrors = mergedResult.imageErrors; - // Report progress - if (onProgress) { - await onProgress({ - productsFound: bothResults.merged.products.length, - productsUpserted: totalUpserted, - snapshotsCreated: totalSnapshots, - currentPage: 1, - totalPages: 1, - }); - } - } - } - else { - // Single mode crawl (Mode A only) - const { products, crawlMode } = await (0, graphql_client_1.fetchAllProducts)(dispensary.platformDispensaryId, pricingType, { crawlMode: 'mode_a', cName }); - modeAProducts = products.length; - // Collect Mode A product IDs - for (const p of products) { - modeAProductIds.add(p._id); - } - const result = await processProducts(products, dispensary, pricingType, crawlMode, { downloadImages }); - totalUpserted = result.upserted; - totalSnapshots = result.snapshots; - totalImagesDownloaded = result.imagesDownloaded; - totalImageErrors = result.imageErrors; - // Report progress - if (onProgress) { - await onProgress({ - productsFound: products.length, - productsUpserted: totalUpserted, - snapshotsCreated: totalSnapshots, - currentPage: 1, - totalPages: 1, - }); - } - } - // Mark products as missing using UNION of Mode A + Mode B - // The function handles outage detection (empty union = skip marking) - missingMarked = await markMissingProducts(dispensary.id, dispensary.platformDispensaryId, modeAProductIds, modeBProductIds, pricingType); - totalSnapshots += missingMarked; - // Update dispensary stats - await updateDispensaryCrawlStats(dispensary.id, totalUpserted); - console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing, ${totalImagesDownloaded} images 
downloaded`); - const totalProductsFound = modeAProducts + modeBProducts; - return { - success: true, - dispensaryId: dispensary.id, - productsFound: totalProductsFound, - productsFetched: totalProductsFound, - productsUpserted: totalUpserted, - snapshotsCreated: totalSnapshots, - modeAProducts, - modeBProducts, - missingProductsMarked: missingMarked, - imagesDownloaded: totalImagesDownloaded, - imageErrors: totalImageErrors, - durationMs: Date.now() - startTime, - }; - } - catch (error) { - console.error(`[ProductCrawler] Failed to crawl ${dispensary.name}:`, error.message); - return { - success: false, - dispensaryId: dispensary.id, - productsFound: 0, - productsFetched: 0, - productsUpserted: 0, - snapshotsCreated: 0, - errorMessage: error.message, - durationMs: Date.now() - startTime, - }; - } -} -/** - * Crawl all Arizona dispensaries - */ -async function crawlAllArizonaDispensaries(pricingType = 'rec') { - const results = []; - // Get all AZ dispensaries with platform IDs - const { rows: rawRows } = await (0, connection_1.query)(` - SELECT ${DISPENSARY_COLUMNS} FROM dispensaries - WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL - ORDER BY id - `); - const dispensaries = rawRows.map(discovery_1.mapDbRowToDispensary); - console.log(`[ProductCrawler] Starting crawl of ${dispensaries.length} dispensaries...`); - for (const dispensary of dispensaries) { - const result = await crawlDispensaryProducts(dispensary, pricingType); - results.push(result); - // Delay between dispensaries - await new Promise((r) => setTimeout(r, 2000)); - } - const successful = results.filter((r) => r.success).length; - const totalProducts = results.reduce((sum, r) => sum + r.productsUpserted, 0); - const totalSnapshots = results.reduce((sum, r) => sum + r.snapshotsCreated, 0); - console.log(`[ProductCrawler] Completed: ${successful}/${dispensaries.length} stores, ${totalProducts} products, ${totalSnapshots} snapshots`); - return results; -} diff --git 
a/backend/dist/dutchie-az/services/scheduler.js b/backend/dist/dutchie-az/services/scheduler.js deleted file mode 100644 index 2911df96..00000000 --- a/backend/dist/dutchie-az/services/scheduler.js +++ /dev/null @@ -1,595 +0,0 @@ -"use strict"; -/** - * Dutchie AZ Scheduler Service - * - * Handles scheduled crawling with JITTER - no fixed intervals! - * Each job re-schedules itself with a NEW random offset after each run. - * This makes timing "wander" around the clock, avoiding detectable patterns. - * - * Jitter Logic: - * nextRunAt = lastRunAt + baseIntervalMinutes + random(-jitterMinutes, +jitterMinutes) - * - * Example: 4-hour base with ±30min jitter = runs anywhere from 3h30m to 4h30m apart - */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.crawlSingleDispensary = void 0; -exports.getAllSchedules = getAllSchedules; -exports.getScheduleById = getScheduleById; -exports.createSchedule = createSchedule; -exports.updateSchedule = updateSchedule; -exports.deleteSchedule = deleteSchedule; -exports.getRunLogs = getRunLogs; -exports.startScheduler = startScheduler; -exports.stopScheduler = stopScheduler; -exports.getSchedulerStatus = getSchedulerStatus; -exports.triggerScheduleNow = triggerScheduleNow; -exports.initializeDefaultSchedules = initializeDefaultSchedules; -exports.triggerImmediateCrawl = triggerImmediateCrawl; -const connection_1 = require("../db/connection"); -const menu_detection_1 = require("./menu-detection"); -const job_queue_1 = require("./job-queue"); -// Scheduler poll interval (how often we check for due jobs) -const SCHEDULER_POLL_INTERVAL_MS = 60 * 1000; // 1 minute -// Track running state -let isSchedulerRunning = false; -let schedulerInterval = null; -// ============================================================ -// JITTER CALCULATION -// ============================================================ -/** - * Generate a random jitter value in minutes - * Returns a value between -jitterMinutes and +jitterMinutes - */ 
-function getRandomJitterMinutes(jitterMinutes) { - // random() returns [0, 1), we want [-jitter, +jitter] - return (Math.random() * 2 - 1) * jitterMinutes; -} -/** - * Calculate next run time with jitter - * nextRunAt = baseTime + baseIntervalMinutes + random(-jitter, +jitter) - */ -function calculateNextRunAt(baseTime, baseIntervalMinutes, jitterMinutes) { - const jitter = getRandomJitterMinutes(jitterMinutes); - const totalMinutes = baseIntervalMinutes + jitter; - const totalMs = totalMinutes * 60 * 1000; - return new Date(baseTime.getTime() + totalMs); -} -// ============================================================ -// DATABASE OPERATIONS -// ============================================================ -/** - * Get all job schedules - */ -async function getAllSchedules() { - const { rows } = await (0, connection_1.query)(` - SELECT - id, job_name, description, enabled, - base_interval_minutes, jitter_minutes, - last_run_at, last_status, last_error_message, last_duration_ms, - next_run_at, job_config, created_at, updated_at - FROM job_schedules - ORDER BY job_name - `); - return rows.map(row => ({ - id: row.id, - jobName: row.job_name, - description: row.description, - enabled: row.enabled, - baseIntervalMinutes: row.base_interval_minutes, - jitterMinutes: row.jitter_minutes, - lastRunAt: row.last_run_at, - lastStatus: row.last_status, - lastErrorMessage: row.last_error_message, - lastDurationMs: row.last_duration_ms, - nextRunAt: row.next_run_at, - jobConfig: row.job_config, - createdAt: row.created_at, - updatedAt: row.updated_at, - })); -} -/** - * Get a single schedule by ID - */ -async function getScheduleById(id) { - const { rows } = await (0, connection_1.query)(`SELECT * FROM job_schedules WHERE id = $1`, [id]); - if (rows.length === 0) - return null; - const row = rows[0]; - return { - id: row.id, - jobName: row.job_name, - description: row.description, - enabled: row.enabled, - baseIntervalMinutes: row.base_interval_minutes, - jitterMinutes: 
row.jitter_minutes, - lastRunAt: row.last_run_at, - lastStatus: row.last_status, - lastErrorMessage: row.last_error_message, - lastDurationMs: row.last_duration_ms, - nextRunAt: row.next_run_at, - jobConfig: row.job_config, - createdAt: row.created_at, - updatedAt: row.updated_at, - }; -} -/** - * Create a new schedule - */ -async function createSchedule(schedule) { - // Calculate initial nextRunAt - const nextRunAt = schedule.startImmediately - ? new Date() // Start immediately - : calculateNextRunAt(new Date(), schedule.baseIntervalMinutes, schedule.jitterMinutes); - const { rows } = await (0, connection_1.query)(` - INSERT INTO job_schedules ( - job_name, description, enabled, - base_interval_minutes, jitter_minutes, - next_run_at, job_config - ) VALUES ($1, $2, $3, $4, $5, $6, $7) - RETURNING * - `, [ - schedule.jobName, - schedule.description || null, - schedule.enabled ?? true, - schedule.baseIntervalMinutes, - schedule.jitterMinutes, - nextRunAt, - schedule.jobConfig ? JSON.stringify(schedule.jobConfig) : null, - ]); - const row = rows[0]; - console.log(`[Scheduler] Created schedule "${schedule.jobName}" - next run at ${nextRunAt.toISOString()}`); - return { - id: row.id, - jobName: row.job_name, - description: row.description, - enabled: row.enabled, - baseIntervalMinutes: row.base_interval_minutes, - jitterMinutes: row.jitter_minutes, - lastRunAt: row.last_run_at, - lastStatus: row.last_status, - lastErrorMessage: row.last_error_message, - lastDurationMs: row.last_duration_ms, - nextRunAt: row.next_run_at, - jobConfig: row.job_config, - createdAt: row.created_at, - updatedAt: row.updated_at, - }; -} -/** - * Update a schedule - */ -async function updateSchedule(id, updates) { - const setClauses = []; - const params = []; - let paramIndex = 1; - if (updates.description !== undefined) { - setClauses.push(`description = $${paramIndex++}`); - params.push(updates.description); - } - if (updates.enabled !== undefined) { - setClauses.push(`enabled = 
$${paramIndex++}`); - params.push(updates.enabled); - } - if (updates.baseIntervalMinutes !== undefined) { - setClauses.push(`base_interval_minutes = $${paramIndex++}`); - params.push(updates.baseIntervalMinutes); - } - if (updates.jitterMinutes !== undefined) { - setClauses.push(`jitter_minutes = $${paramIndex++}`); - params.push(updates.jitterMinutes); - } - if (updates.jobConfig !== undefined) { - setClauses.push(`job_config = $${paramIndex++}`); - params.push(JSON.stringify(updates.jobConfig)); - } - if (setClauses.length === 0) { - return getScheduleById(id); - } - setClauses.push(`updated_at = NOW()`); - params.push(id); - const { rows } = await (0, connection_1.query)(`UPDATE job_schedules SET ${setClauses.join(', ')} WHERE id = $${paramIndex} RETURNING *`, params); - if (rows.length === 0) - return null; - const row = rows[0]; - return { - id: row.id, - jobName: row.job_name, - description: row.description, - enabled: row.enabled, - baseIntervalMinutes: row.base_interval_minutes, - jitterMinutes: row.jitter_minutes, - lastRunAt: row.last_run_at, - lastStatus: row.last_status, - lastErrorMessage: row.last_error_message, - lastDurationMs: row.last_duration_ms, - nextRunAt: row.next_run_at, - jobConfig: row.job_config, - createdAt: row.created_at, - updatedAt: row.updated_at, - }; -} -/** - * Delete a schedule - */ -async function deleteSchedule(id) { - const result = await (0, connection_1.query)(`DELETE FROM job_schedules WHERE id = $1`, [id]); - return (result.rowCount || 0) > 0; -} -/** - * Mark a schedule as running - */ -async function markScheduleRunning(id) { - await (0, connection_1.query)(`UPDATE job_schedules SET last_status = 'running', updated_at = NOW() WHERE id = $1`, [id]); -} -/** - * Update schedule after job completion with NEW jittered next_run_at - */ -async function updateScheduleAfterRun(id, status, durationMs, errorMessage) { - // Get current schedule to calculate new nextRunAt - const schedule = await getScheduleById(id); - if 
(!schedule) - return; - const now = new Date(); - const newNextRunAt = calculateNextRunAt(now, schedule.baseIntervalMinutes, schedule.jitterMinutes); - console.log(`[Scheduler] Schedule "${schedule.jobName}" completed (${status}). Next run: ${newNextRunAt.toISOString()}`); - await (0, connection_1.query)(` - UPDATE job_schedules SET - last_run_at = $2, - last_status = $3, - last_error_message = $4, - last_duration_ms = $5, - next_run_at = $6, - updated_at = NOW() - WHERE id = $1 - `, [id, now, status, errorMessage || null, durationMs, newNextRunAt]); -} -/** - * Create a job run log entry - */ -async function createRunLog(scheduleId, jobName, status) { - const { rows } = await (0, connection_1.query)(` - INSERT INTO job_run_logs (schedule_id, job_name, status, started_at) - VALUES ($1, $2, $3, NOW()) - RETURNING id - `, [scheduleId, jobName, status]); - return rows[0].id; -} -/** - * Update a job run log entry - */ -async function updateRunLog(runLogId, status, results) { - await (0, connection_1.query)(` - UPDATE job_run_logs SET - status = $2, - completed_at = NOW(), - duration_ms = $3, - error_message = $4, - items_processed = $5, - items_succeeded = $6, - items_failed = $7, - metadata = $8 - WHERE id = $1 - `, [ - runLogId, - status, - results.durationMs, - results.errorMessage || null, - results.itemsProcessed || 0, - results.itemsSucceeded || 0, - results.itemsFailed || 0, - results.metadata ? 
JSON.stringify(results.metadata) : null, - ]); -} -/** - * Get job run logs - */ -async function getRunLogs(options) { - const { scheduleId, jobName, limit = 50, offset = 0 } = options; - let whereClause = 'WHERE 1=1'; - const params = []; - let paramIndex = 1; - if (scheduleId) { - whereClause += ` AND schedule_id = $${paramIndex++}`; - params.push(scheduleId); - } - if (jobName) { - whereClause += ` AND job_name = $${paramIndex++}`; - params.push(jobName); - } - params.push(limit, offset); - const { rows } = await (0, connection_1.query)(` - SELECT * FROM job_run_logs - ${whereClause} - ORDER BY created_at DESC - LIMIT $${paramIndex} OFFSET $${paramIndex + 1} - `, params); - const { rows: countRows } = await (0, connection_1.query)(`SELECT COUNT(*) as total FROM job_run_logs ${whereClause}`, params.slice(0, -2)); - return { - logs: rows, - total: parseInt(countRows[0]?.total || '0', 10), - }; -} -// ============================================================ -// JOB EXECUTION -// ============================================================ -/** - * Execute a job based on its name - */ -async function executeJob(schedule) { - const config = schedule.jobConfig || {}; - switch (schedule.jobName) { - case 'dutchie_az_product_crawl': - return executeProductCrawl(config); - case 'dutchie_az_discovery': - return executeDiscovery(config); - case 'dutchie_az_menu_detection': - return (0, menu_detection_1.executeMenuDetectionJob)(config); - default: - throw new Error(`Unknown job type: ${schedule.jobName}`); - } -} -/** - * Execute the AZ Dutchie product crawl job - * - * NEW BEHAVIOR: Instead of running crawls directly, this now ENQUEUES jobs - * into the crawl_jobs queue. Workers (running as separate replicas) will - * pick up and process these jobs. 
- * - * This allows: - * - Multiple workers to process jobs in parallel - * - No double-crawls (DB-level locking per dispensary) - * - Better scalability (add more worker replicas) - * - Live monitoring of individual job progress - */ -async function executeProductCrawl(config) { - const pricingType = config.pricingType || 'rec'; - const useBothModes = config.useBothModes !== false; - // Get all "ready" dispensaries (menu_type='dutchie' AND platform_dispensary_id IS NOT NULL AND not failed) - // Note: Menu detection is handled separately by the dutchie_az_menu_detection schedule - const { rows: rawRows } = await (0, connection_1.query)(` - SELECT id FROM dispensaries - WHERE state = 'AZ' - AND menu_type = 'dutchie' - AND platform_dispensary_id IS NOT NULL - AND failed_at IS NULL - ORDER BY last_crawl_at ASC NULLS FIRST - `); - const dispensaryIds = rawRows.map((r) => r.id); - if (dispensaryIds.length === 0) { - return { - status: 'success', - itemsProcessed: 0, - itemsSucceeded: 0, - itemsFailed: 0, - metadata: { message: 'No ready dispensaries to crawl. Run menu detection to discover more.' }, - }; - } - console.log(`[Scheduler] Enqueueing crawl jobs for ${dispensaryIds.length} dispensaries...`); - // Bulk enqueue jobs (skips dispensaries that already have pending/running jobs) - const { enqueued, skipped } = await (0, job_queue_1.bulkEnqueueJobs)('dutchie_product_crawl', dispensaryIds, { - priority: 0, - metadata: { pricingType, useBothModes }, - }); - console.log(`[Scheduler] Enqueued ${enqueued} jobs, skipped ${skipped} (already queued)`); - // Get current queue stats - const queueStats = await (0, job_queue_1.getQueueStats)(); - return { - status: 'success', - itemsProcessed: dispensaryIds.length, - itemsSucceeded: enqueued, - itemsFailed: 0, // Enqueue itself doesn't fail - metadata: { - enqueued, - skipped, - queueStats, - pricingType, - useBothModes, - message: `Enqueued ${enqueued} jobs. Workers will process them. 
Check /scraper-monitor for progress.`, - }, - }; -} -/** - * Execute the AZ Dutchie discovery job (placeholder) - */ -async function executeDiscovery(_config) { - // Placeholder - implement discovery logic - return { - status: 'success', - itemsProcessed: 0, - itemsSucceeded: 0, - itemsFailed: 0, - metadata: { message: 'Discovery not yet implemented' }, - }; -} -// ============================================================ -// SCHEDULER RUNNER -// ============================================================ -/** - * Check for due jobs and run them - */ -async function checkAndRunDueJobs() { - try { - // Get enabled schedules where nextRunAt <= now - const { rows } = await (0, connection_1.query)(` - SELECT * FROM job_schedules - WHERE enabled = true - AND next_run_at IS NOT NULL - AND next_run_at <= NOW() - AND (last_status IS NULL OR last_status != 'running') - ORDER BY next_run_at ASC - `); - if (rows.length === 0) - return; - console.log(`[Scheduler] Found ${rows.length} due job(s)`); - for (const row of rows) { - const schedule = { - id: row.id, - jobName: row.job_name, - description: row.description, - enabled: row.enabled, - baseIntervalMinutes: row.base_interval_minutes, - jitterMinutes: row.jitter_minutes, - lastRunAt: row.last_run_at, - lastStatus: row.last_status, - lastErrorMessage: row.last_error_message, - lastDurationMs: row.last_duration_ms, - nextRunAt: row.next_run_at, - jobConfig: row.job_config, - createdAt: row.created_at, - updatedAt: row.updated_at, - }; - await runScheduledJob(schedule); - } - } - catch (error) { - console.error('[Scheduler] Error checking for due jobs:', error); - } -} -/** - * Run a single scheduled job - */ -async function runScheduledJob(schedule) { - const startTime = Date.now(); - console.log(`[Scheduler] Starting job "${schedule.jobName}"...`); - // Mark as running - await markScheduleRunning(schedule.id); - // Create run log entry - const runLogId = await createRunLog(schedule.id, schedule.jobName, 'running'); - try 
{ - // Execute the job - const result = await executeJob(schedule); - const durationMs = Date.now() - startTime; - // Determine final status (exclude 'running' and null) - const finalStatus = result.status === 'running' || result.status === null - ? 'success' - : result.status; - // Update run log - await updateRunLog(runLogId, finalStatus, { - durationMs, - errorMessage: result.errorMessage, - itemsProcessed: result.itemsProcessed, - itemsSucceeded: result.itemsSucceeded, - itemsFailed: result.itemsFailed, - metadata: result.metadata, - }); - // Update schedule with NEW jittered next_run_at - await updateScheduleAfterRun(schedule.id, result.status, durationMs, result.errorMessage); - console.log(`[Scheduler] Job "${schedule.jobName}" completed in ${Math.round(durationMs / 1000)}s (${result.status})`); - } - catch (error) { - const durationMs = Date.now() - startTime; - console.error(`[Scheduler] Job "${schedule.jobName}" failed:`, error.message); - // Update run log with error - await updateRunLog(runLogId, 'error', { - durationMs, - errorMessage: error.message, - itemsProcessed: 0, - itemsSucceeded: 0, - itemsFailed: 0, - }); - // Update schedule with NEW jittered next_run_at - await updateScheduleAfterRun(schedule.id, 'error', durationMs, error.message); - } -} -// ============================================================ -// PUBLIC API -// ============================================================ -/** - * Start the scheduler - */ -function startScheduler() { - if (isSchedulerRunning) { - console.log('[Scheduler] Scheduler is already running'); - return; - } - isSchedulerRunning = true; - console.log(`[Scheduler] Starting scheduler (polling every ${SCHEDULER_POLL_INTERVAL_MS / 1000}s)...`); - // Immediately check for due jobs - checkAndRunDueJobs(); - // Set up interval to check for due jobs - schedulerInterval = setInterval(checkAndRunDueJobs, SCHEDULER_POLL_INTERVAL_MS); -} -/** - * Stop the scheduler - */ -function stopScheduler() { - if 
(!isSchedulerRunning) { - console.log('[Scheduler] Scheduler is not running'); - return; - } - isSchedulerRunning = false; - if (schedulerInterval) { - clearInterval(schedulerInterval); - schedulerInterval = null; - } - console.log('[Scheduler] Scheduler stopped'); -} -/** - * Get scheduler status - */ -function getSchedulerStatus() { - return { - running: isSchedulerRunning, - pollIntervalMs: SCHEDULER_POLL_INTERVAL_MS, - }; -} -/** - * Trigger immediate execution of a schedule - */ -async function triggerScheduleNow(scheduleId) { - const schedule = await getScheduleById(scheduleId); - if (!schedule) { - return { success: false, message: 'Schedule not found' }; - } - if (schedule.lastStatus === 'running') { - return { success: false, message: 'Job is already running' }; - } - // Run the job - await runScheduledJob(schedule); - return { success: true, message: 'Job triggered successfully' }; -} -/** - * Initialize default schedules if they don't exist - */ -async function initializeDefaultSchedules() { - const schedules = await getAllSchedules(); - // Check if product crawl schedule exists - const productCrawlExists = schedules.some(s => s.jobName === 'dutchie_az_product_crawl'); - if (!productCrawlExists) { - await createSchedule({ - jobName: 'dutchie_az_product_crawl', - description: 'Crawl all AZ Dutchie dispensary products', - enabled: true, - baseIntervalMinutes: 240, // 4 hours - jitterMinutes: 30, // ±30 minutes - jobConfig: { pricingType: 'rec', useBothModes: true }, - startImmediately: false, - }); - console.log('[Scheduler] Created default product crawl schedule'); - } - // Check if menu detection schedule exists - const menuDetectionExists = schedules.some(s => s.jobName === 'dutchie_az_menu_detection'); - if (!menuDetectionExists) { - await createSchedule({ - jobName: 'dutchie_az_menu_detection', - description: 'Detect menu providers and resolve platform IDs for AZ dispensaries', - enabled: true, - baseIntervalMinutes: 1440, // 24 hours - jitterMinutes: 
60, // ±1 hour - jobConfig: { state: 'AZ', onlyUnknown: true }, - startImmediately: false, - }); - console.log('[Scheduler] Created default menu detection schedule'); - } -} -// Re-export for backward compatibility -var product_crawler_1 = require("./product-crawler"); -Object.defineProperty(exports, "crawlSingleDispensary", { enumerable: true, get: function () { return product_crawler_1.crawlDispensaryProducts; } }); -async function triggerImmediateCrawl() { - const schedules = await getAllSchedules(); - const productCrawl = schedules.find(s => s.jobName === 'dutchie_az_product_crawl'); - if (productCrawl) { - return triggerScheduleNow(productCrawl.id); - } - return { success: false, message: 'Product crawl schedule not found' }; -} diff --git a/backend/dist/dutchie-az/services/worker.js b/backend/dist/dutchie-az/services/worker.js deleted file mode 100644 index 43f0fbf6..00000000 --- a/backend/dist/dutchie-az/services/worker.js +++ /dev/null @@ -1,440 +0,0 @@ -"use strict"; -/** - * Worker Service - * - * Polls the job queue and processes crawl jobs. - * Each worker instance runs independently, claiming jobs atomically. - */ -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? 
(function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -Object.defineProperty(exports, "__esModule", { value: true }); -exports.startWorker = startWorker; -exports.stopWorker = stopWorker; -exports.getWorkerStatus = getWorkerStatus; -const job_queue_1 = require("./job-queue"); -const product_crawler_1 = require("./product-crawler"); -const discovery_1 = require("./discovery"); -const connection_1 = require("../db/connection"); -// Explicit column list for dispensaries table (avoids SELECT * issues with schema differences) -// NOTE: failed_at is included for worker compatibility checks -const DISPENSARY_COLUMNS = ` - id, name, slug, city, state, zip, address, latitude, longitude, - menu_type, menu_url, platform_dispensary_id, website, - provider_detection_data, created_at, updated_at, failed_at -`; -// ============================================================ -// WORKER CONFIG -// ============================================================ -const POLL_INTERVAL_MS = 5000; // Check for jobs every 5 seconds -const HEARTBEAT_INTERVAL_MS = 60000; // Send heartbeat every 60 seconds -const STALE_CHECK_INTERVAL_MS = 300000; // Check for stale jobs every 5 minutes -const SHUTDOWN_GRACE_PERIOD_MS = 30000; // Wait 30s for job to complete on shutdown -// ============================================================ -// WORKER 
STATE -// ============================================================ -let isRunning = false; -let currentJob = null; -let pollTimer = null; -let heartbeatTimer = null; -let staleCheckTimer = null; -let shutdownPromise = null; -// ============================================================ -// WORKER LIFECYCLE -// ============================================================ -/** - * Start the worker - */ -async function startWorker() { - if (isRunning) { - console.log('[Worker] Already running'); - return; - } - const workerId = (0, job_queue_1.getWorkerId)(); - const hostname = (0, job_queue_1.getWorkerHostname)(); - console.log(`[Worker] Starting worker ${workerId} on ${hostname}`); - isRunning = true; - // Set up graceful shutdown - setupShutdownHandlers(); - // Start polling for jobs - pollTimer = setInterval(pollForJobs, POLL_INTERVAL_MS); - // Start stale job recovery (only one worker should do this, but it's idempotent) - staleCheckTimer = setInterval(async () => { - try { - await (0, job_queue_1.recoverStaleJobs)(15); - } - catch (error) { - console.error('[Worker] Error recovering stale jobs:', error); - } - }, STALE_CHECK_INTERVAL_MS); - // Immediately poll for a job - await pollForJobs(); - console.log(`[Worker] Worker ${workerId} started, polling every ${POLL_INTERVAL_MS}ms`); -} -/** - * Stop the worker gracefully - */ -async function stopWorker() { - if (!isRunning) - return; - console.log('[Worker] Stopping worker...'); - isRunning = false; - // Clear timers - if (pollTimer) { - clearInterval(pollTimer); - pollTimer = null; - } - if (heartbeatTimer) { - clearInterval(heartbeatTimer); - heartbeatTimer = null; - } - if (staleCheckTimer) { - clearInterval(staleCheckTimer); - staleCheckTimer = null; - } - // Wait for current job to complete - if (currentJob) { - console.log(`[Worker] Waiting for job ${currentJob.id} to complete...`); - const startWait = Date.now(); - while (currentJob && Date.now() - startWait < SHUTDOWN_GRACE_PERIOD_MS) { - await new 
Promise(r => setTimeout(r, 1000)); - } - if (currentJob) { - console.log(`[Worker] Job ${currentJob.id} did not complete in time, marking for retry`); - await (0, job_queue_1.failJob)(currentJob.id, 'Worker shutdown'); - } - } - console.log('[Worker] Worker stopped'); -} -/** - * Get worker status - */ -function getWorkerStatus() { - return { - isRunning, - workerId: (0, job_queue_1.getWorkerId)(), - hostname: (0, job_queue_1.getWorkerHostname)(), - currentJob, - }; -} -// ============================================================ -// JOB PROCESSING -// ============================================================ -/** - * Poll for and process the next available job - */ -async function pollForJobs() { - if (!isRunning || currentJob) { - return; // Already processing a job - } - try { - const workerId = (0, job_queue_1.getWorkerId)(); - // Try to claim a job - const job = await (0, job_queue_1.claimNextJob)({ - workerId, - jobTypes: ['dutchie_product_crawl', 'menu_detection', 'menu_detection_single'], - lockDurationMinutes: 30, - }); - if (!job) { - return; // No jobs available - } - currentJob = job; - console.log(`[Worker] Processing job ${job.id} (type=${job.jobType}, dispensary=${job.dispensaryId})`); - // Start heartbeat for this job - heartbeatTimer = setInterval(async () => { - if (currentJob) { - try { - await (0, job_queue_1.heartbeat)(currentJob.id); - } - catch (error) { - console.error('[Worker] Heartbeat error:', error); - } - } - }, HEARTBEAT_INTERVAL_MS); - // Process the job - await processJob(job); - } - catch (error) { - console.error('[Worker] Error polling for jobs:', error); - if (currentJob) { - try { - await (0, job_queue_1.failJob)(currentJob.id, error.message); - } - catch (failError) { - console.error('[Worker] Error failing job:', failError); - } - } - } - finally { - // Clear heartbeat timer - if (heartbeatTimer) { - clearInterval(heartbeatTimer); - heartbeatTimer = null; - } - currentJob = null; - } -} -/** - * Process a single job - 
*/ -async function processJob(job) { - try { - switch (job.jobType) { - case 'dutchie_product_crawl': - await processProductCrawlJob(job); - break; - case 'menu_detection': - await processMenuDetectionJob(job); - break; - case 'menu_detection_single': - await processSingleDetectionJob(job); - break; - default: - throw new Error(`Unknown job type: ${job.jobType}`); - } - } - catch (error) { - console.error(`[Worker] Job ${job.id} failed:`, error); - await (0, job_queue_1.failJob)(job.id, error.message); - } -} -// Maximum consecutive failures before flagging a dispensary -const MAX_CONSECUTIVE_FAILURES = 3; -/** - * Record a successful crawl - resets failure counter - */ -async function recordCrawlSuccess(dispensaryId) { - await (0, connection_1.query)(`UPDATE dispensaries - SET consecutive_failures = 0, - last_crawl_at = NOW(), - updated_at = NOW() - WHERE id = $1`, [dispensaryId]); -} -/** - * Record a crawl failure - increments counter and may flag dispensary - * Returns true if dispensary was flagged as failed - */ -async function recordCrawlFailure(dispensaryId, errorMessage) { - // Increment failure counter - const { rows } = await (0, connection_1.query)(`UPDATE dispensaries - SET consecutive_failures = consecutive_failures + 1, - last_failure_at = NOW(), - last_failure_reason = $2, - updated_at = NOW() - WHERE id = $1 - RETURNING consecutive_failures`, [dispensaryId, errorMessage]); - const failures = rows[0]?.consecutive_failures || 0; - // If we've hit the threshold, flag the dispensary as failed - if (failures >= MAX_CONSECUTIVE_FAILURES) { - await (0, connection_1.query)(`UPDATE dispensaries - SET failed_at = NOW(), - menu_type = NULL, - platform_dispensary_id = NULL, - failure_notes = $2, - updated_at = NOW() - WHERE id = $1`, [dispensaryId, `Auto-flagged after ${failures} consecutive failures. 
Last error: ${errorMessage}`]); - console.log(`[Worker] Dispensary ${dispensaryId} flagged as FAILED after ${failures} consecutive failures`); - return true; - } - console.log(`[Worker] Dispensary ${dispensaryId} failure recorded (${failures}/${MAX_CONSECUTIVE_FAILURES})`); - return false; -} -/** - * Process a product crawl job for a single dispensary - */ -async function processProductCrawlJob(job) { - if (!job.dispensaryId) { - throw new Error('Product crawl job requires dispensary_id'); - } - // Get dispensary details - const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [job.dispensaryId]); - if (rows.length === 0) { - throw new Error(`Dispensary ${job.dispensaryId} not found`); - } - const dispensary = (0, discovery_1.mapDbRowToDispensary)(rows[0]); - // Check if dispensary is already flagged as failed - if (rows[0].failed_at) { - console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already flagged as failed`); - await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 }); - return; - } - if (!dispensary.platformDispensaryId) { - // Record failure and potentially flag - await recordCrawlFailure(job.dispensaryId, 'Missing platform_dispensary_id'); - throw new Error(`Dispensary ${job.dispensaryId} has no platform_dispensary_id`); - } - // Get crawl options from job metadata - const pricingType = job.metadata?.pricingType || 'rec'; - const useBothModes = job.metadata?.useBothModes !== false; - try { - // Crawl the dispensary - const result = await (0, product_crawler_1.crawlDispensaryProducts)(dispensary, pricingType, { - useBothModes, - onProgress: async (progress) => { - // Update progress for live monitoring - await (0, job_queue_1.updateJobProgress)(job.id, { - productsFound: progress.productsFound, - productsUpserted: progress.productsUpserted, - snapshotsCreated: progress.snapshotsCreated, - currentPage: progress.currentPage, - totalPages: progress.totalPages, - 
}); - }, - }); - if (result.success) { - // Success! Reset failure counter - await recordCrawlSuccess(job.dispensaryId); - await (0, job_queue_1.completeJob)(job.id, { - productsFound: result.productsFetched, - productsUpserted: result.productsUpserted, - snapshotsCreated: result.snapshotsCreated, - }); - } - else { - // Crawl returned failure - record it - const wasFlagged = await recordCrawlFailure(job.dispensaryId, result.errorMessage || 'Crawl failed'); - if (wasFlagged) { - // Don't throw - the dispensary is now flagged, job is "complete" - await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 }); - } - else { - throw new Error(result.errorMessage || 'Crawl failed'); - } - } - } - catch (error) { - // Record the failure - const wasFlagged = await recordCrawlFailure(job.dispensaryId, error.message); - if (wasFlagged) { - // Dispensary is now flagged - complete the job rather than fail it - await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 }); - } - else { - throw error; - } - } -} -/** - * Process a menu detection job (bulk) - */ -async function processMenuDetectionJob(job) { - const { executeMenuDetectionJob } = await Promise.resolve().then(() => __importStar(require('./menu-detection'))); - const config = job.metadata || {}; - const result = await executeMenuDetectionJob(config); - if (result.status === 'error') { - throw new Error(result.errorMessage || 'Menu detection failed'); - } - await (0, job_queue_1.completeJob)(job.id, { - productsFound: result.itemsProcessed, - productsUpserted: result.itemsSucceeded, - }); -} -/** - * Process a single dispensary menu detection job - * This is the parallelizable version - each worker can detect one dispensary at a time - */ -async function processSingleDetectionJob(job) { - if (!job.dispensaryId) { - throw new Error('Single detection job requires dispensary_id'); - } - const { detectAndResolveDispensary } = await Promise.resolve().then(() => 
__importStar(require('./menu-detection'))); - // Get dispensary details - const { rows } = await (0, connection_1.query)(`SELECT ${DISPENSARY_COLUMNS} FROM dispensaries WHERE id = $1`, [job.dispensaryId]); - if (rows.length === 0) { - throw new Error(`Dispensary ${job.dispensaryId} not found`); - } - const dispensary = rows[0]; - // Skip if already detected or failed - if (dispensary.failed_at) { - console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already flagged as failed`); - await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 }); - return; - } - if (dispensary.menu_type && dispensary.menu_type !== 'unknown') { - console.log(`[Worker] Skipping dispensary ${job.dispensaryId} - already detected as ${dispensary.menu_type}`); - await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 1 }); - return; - } - console.log(`[Worker] Detecting menu for dispensary ${job.dispensaryId} (${dispensary.name})...`); - try { - const result = await detectAndResolveDispensary(job.dispensaryId); - if (result.success) { - console.log(`[Worker] Dispensary ${job.dispensaryId}: detected ${result.detectedProvider}, platformId=${result.platformDispensaryId || 'none'}`); - await (0, job_queue_1.completeJob)(job.id, { - productsFound: 1, - productsUpserted: result.platformDispensaryId ? 
1 : 0, - }); - } - else { - // Detection failed - record failure - await recordCrawlFailure(job.dispensaryId, result.error || 'Detection failed'); - throw new Error(result.error || 'Detection failed'); - } - } - catch (error) { - // Record the failure - const wasFlagged = await recordCrawlFailure(job.dispensaryId, error.message); - if (wasFlagged) { - // Dispensary is now flagged - complete the job rather than fail it - await (0, job_queue_1.completeJob)(job.id, { productsFound: 0, productsUpserted: 0 }); - } - else { - throw error; - } - } -} -// ============================================================ -// SHUTDOWN HANDLING -// ============================================================ -function setupShutdownHandlers() { - const shutdown = async (signal) => { - if (shutdownPromise) - return shutdownPromise; - console.log(`\n[Worker] Received ${signal}, shutting down...`); - shutdownPromise = stopWorker(); - await shutdownPromise; - process.exit(0); - }; - process.on('SIGTERM', () => shutdown('SIGTERM')); - process.on('SIGINT', () => shutdown('SIGINT')); -} -// ============================================================ -// STANDALONE WORKER ENTRY POINT -// ============================================================ -if (require.main === module) { - // Run as standalone worker - startWorker().catch((error) => { - console.error('[Worker] Fatal error:', error); - process.exit(1); - }); -} diff --git a/backend/dist/dutchie-az/types/index.js b/backend/dist/dutchie-az/types/index.js deleted file mode 100644 index 098e21a3..00000000 --- a/backend/dist/dutchie-az/types/index.js +++ /dev/null @@ -1,96 +0,0 @@ -"use strict"; -/** - * Dutchie AZ Data Types - * - * Complete TypeScript interfaces for the isolated Dutchie Arizona data pipeline. - * These types map directly to Dutchie's GraphQL FilteredProducts response. 
- */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.getOptionQuantity = getOptionQuantity; -exports.deriveOptionStockStatus = deriveOptionStockStatus; -exports.deriveStockStatus = deriveStockStatus; -exports.calculateTotalQuantity = calculateTotalQuantity; -exports.calculateTotalKioskQuantity = calculateTotalKioskQuantity; -/** - * Get available quantity for a single option - * Priority: quantityAvailable > kioskQuantityAvailable > quantity - */ -function getOptionQuantity(child) { - if (typeof child.quantityAvailable === 'number') - return child.quantityAvailable; - if (typeof child.kioskQuantityAvailable === 'number') - return child.kioskQuantityAvailable; - if (typeof child.quantity === 'number') - return child.quantity; - return null; // No quantity data available -} -/** - * Derive stock status for a single option - * Returns: 'in_stock' if qty > 0, 'out_of_stock' if qty === 0, 'unknown' if no data - */ -function deriveOptionStockStatus(child) { - const qty = getOptionQuantity(child); - if (qty === null) - return 'unknown'; - return qty > 0 ? 'in_stock' : 'out_of_stock'; -} -/** - * Derive product-level stock status from POSMetaData.children - * - * Logic per spec: - * - If ANY child is "in_stock" → product is "in_stock" - * - Else if ALL children are "out_of_stock" → product is "out_of_stock" - * - Else → product is "unknown" - * - * IMPORTANT: Threshold flags (isBelowThreshold, etc.) do NOT override stock status. - * They only indicate "low stock" - if qty > 0, status stays "in_stock". 
- */ -function deriveStockStatus(product) { - const children = product.POSMetaData?.children; - // No children data - unknown - if (!children || children.length === 0) { - return 'unknown'; - } - // Get stock status for each option - const optionStatuses = children.map(deriveOptionStockStatus); - // If ANY option is in_stock → product is in_stock - if (optionStatuses.some(status => status === 'in_stock')) { - return 'in_stock'; - } - // If ALL options are out_of_stock → product is out_of_stock - if (optionStatuses.every(status => status === 'out_of_stock')) { - return 'out_of_stock'; - } - // Otherwise (mix of out_of_stock and unknown) → unknown - return 'unknown'; -} -/** - * Calculate total quantity available across all options - * Returns null if no children data (unknown inventory), 0 if children exist but all have 0 qty - */ -function calculateTotalQuantity(product) { - const children = product.POSMetaData?.children; - // No children = unknown inventory, return null (NOT 0) - if (!children || children.length === 0) - return null; - // Check if any child has quantity data - const hasAnyQtyData = children.some(child => getOptionQuantity(child) !== null); - if (!hasAnyQtyData) - return null; // All children lack qty data = unknown - return children.reduce((sum, child) => { - const qty = getOptionQuantity(child); - return sum + (qty ?? 0); - }, 0); -} -/** - * Calculate total kiosk quantity available across all options - */ -function calculateTotalKioskQuantity(product) { - const children = product.POSMetaData?.children; - if (!children || children.length === 0) - return null; - const hasAnyKioskQty = children.some(child => typeof child.kioskQuantityAvailable === 'number'); - if (!hasAnyKioskQty) - return null; - return children.reduce((sum, child) => sum + (child.kioskQuantityAvailable ?? 
0), 0); -} diff --git a/backend/dist/index.js b/backend/dist/index.js deleted file mode 100644 index 2ac40a57..00000000 --- a/backend/dist/index.js +++ /dev/null @@ -1,119 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = __importDefault(require("express")); -const cors_1 = __importDefault(require("cors")); -const dotenv_1 = __importDefault(require("dotenv")); -const minio_1 = require("./utils/minio"); -const image_storage_1 = require("./utils/image-storage"); -const logger_1 = require("./services/logger"); -const proxyTestQueue_1 = require("./services/proxyTestQueue"); -dotenv_1.default.config(); -const app = (0, express_1.default)(); -const PORT = process.env.PORT || 3010; -app.use((0, cors_1.default)()); -app.use(express_1.default.json()); -// Serve static images when MinIO is not configured -const LOCAL_IMAGES_PATH = process.env.LOCAL_IMAGES_PATH || '/app/public/images'; -app.use('/images', express_1.default.static(LOCAL_IMAGES_PATH)); -// Serve static downloads (plugin files, etc.) 
-const LOCAL_DOWNLOADS_PATH = process.env.LOCAL_DOWNLOADS_PATH || '/app/public/downloads'; -app.use('/downloads', express_1.default.static(LOCAL_DOWNLOADS_PATH)); -app.get('/health', (req, res) => { - res.json({ status: 'ok', timestamp: new Date().toISOString() }); -}); -// Endpoint to check server's outbound IP (for proxy whitelist setup) -app.get('/outbound-ip', async (req, res) => { - try { - const axios = require('axios'); - const response = await axios.get('https://api.ipify.org?format=json', { timeout: 10000 }); - res.json({ outbound_ip: response.data.ip }); - } - catch (error) { - res.status(500).json({ error: error.message }); - } -}); -const auth_1 = __importDefault(require("./routes/auth")); -const dashboard_1 = __importDefault(require("./routes/dashboard")); -const stores_1 = __importDefault(require("./routes/stores")); -const dispensaries_1 = __importDefault(require("./routes/dispensaries")); -const changes_1 = __importDefault(require("./routes/changes")); -const categories_1 = __importDefault(require("./routes/categories")); -const products_1 = __importDefault(require("./routes/products")); -const campaigns_1 = __importDefault(require("./routes/campaigns")); -const analytics_1 = __importDefault(require("./routes/analytics")); -const settings_1 = __importDefault(require("./routes/settings")); -const proxies_1 = __importDefault(require("./routes/proxies")); -const logs_1 = __importDefault(require("./routes/logs")); -const scraper_monitor_1 = __importDefault(require("./routes/scraper-monitor")); -const api_tokens_1 = __importDefault(require("./routes/api-tokens")); -const api_permissions_1 = __importDefault(require("./routes/api-permissions")); -const parallel_scrape_1 = __importDefault(require("./routes/parallel-scrape")); -const schedule_1 = __importDefault(require("./routes/schedule")); -const crawler_sandbox_1 = __importDefault(require("./routes/crawler-sandbox")); -const version_1 = __importDefault(require("./routes/version")); -const public_api_1 = 
__importDefault(require("./routes/public-api")); -const dutchie_az_1 = require("./dutchie-az"); -const apiTokenTracker_1 = require("./middleware/apiTokenTracker"); -const crawl_scheduler_1 = require("./services/crawl-scheduler"); -const wordpressPermissions_1 = require("./middleware/wordpressPermissions"); -// Apply WordPress permissions validation first (sets req.apiToken) -app.use(wordpressPermissions_1.validateWordPressPermissions); -// Apply API tracking middleware globally -app.use(apiTokenTracker_1.trackApiUsage); -app.use(apiTokenTracker_1.checkRateLimit); -app.use('/api/auth', auth_1.default); -app.use('/api/dashboard', dashboard_1.default); -app.use('/api/stores', stores_1.default); -app.use('/api/dispensaries', dispensaries_1.default); -app.use('/api/changes', changes_1.default); -app.use('/api/categories', categories_1.default); -app.use('/api/products', products_1.default); -app.use('/api/campaigns', campaigns_1.default); -app.use('/api/analytics', analytics_1.default); -app.use('/api/settings', settings_1.default); -app.use('/api/proxies', proxies_1.default); -app.use('/api/logs', logs_1.default); -app.use('/api/scraper-monitor', scraper_monitor_1.default); -app.use('/api/api-tokens', api_tokens_1.default); -app.use('/api/api-permissions', api_permissions_1.default); -app.use('/api/parallel-scrape', parallel_scrape_1.default); -app.use('/api/schedule', schedule_1.default); -app.use('/api/crawler-sandbox', crawler_sandbox_1.default); -app.use('/api/version', version_1.default); -// Vendor-agnostic AZ data pipeline routes (new public surface) -app.use('/api/az', dutchie_az_1.dutchieAZRouter); -// Legacy alias (kept temporarily for backward compatibility) -app.use('/api/dutchie-az', dutchie_az_1.dutchieAZRouter); -// Public API v1 - External consumer endpoints (WordPress, etc.) 
-// Uses dutchie_az data pipeline with per-dispensary API key auth -app.use('/api/v1', public_api_1.default); -async function startServer() { - try { - logger_1.logger.info('system', 'Starting server...'); - await (0, minio_1.initializeMinio)(); - await (0, image_storage_1.initializeImageStorage)(); - logger_1.logger.info('system', (0, minio_1.isMinioEnabled)() ? 'MinIO storage initialized' : 'Local filesystem storage initialized'); - // Clean up any orphaned proxy test jobs from previous server runs - await (0, proxyTestQueue_1.cleanupOrphanedJobs)(); - // Start the crawl scheduler (checks every minute for jobs to run) - (0, crawl_scheduler_1.startCrawlScheduler)(); - logger_1.logger.info('system', 'Crawl scheduler started'); - // Start the Dutchie AZ scheduler (enqueues jobs for workers) - await (0, dutchie_az_1.initializeDefaultSchedules)(); - (0, dutchie_az_1.startScheduler)(); - logger_1.logger.info('system', 'Dutchie AZ scheduler started'); - app.listen(PORT, () => { - logger_1.logger.info('system', `Server running on port ${PORT}`); - console.log(`🚀 Server running on port ${PORT}`); - }); - } - catch (error) { - logger_1.logger.error('system', `Failed to start server: ${error}`); - console.error('Failed to start server:', error); - process.exit(1); - } -} -startServer(); diff --git a/backend/dist/middleware/apiTokenTracker.js b/backend/dist/middleware/apiTokenTracker.js deleted file mode 100644 index 013da933..00000000 --- a/backend/dist/middleware/apiTokenTracker.js +++ /dev/null @@ -1,94 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.trackApiUsage = trackApiUsage; -exports.checkRateLimit = checkRateLimit; -const migrate_1 = require("../db/migrate"); -async function trackApiUsage(req, res, next) { - // Only track if authenticated via API token - if (!req.apiToken) { - return next(); - } - const startTime = Date.now(); - req.startTime = startTime; - // Get request size - const requestSize = 
req.headers['content-length'] - ? parseInt(req.headers['content-length']) - : 0; - // Capture original res.json to measure response - const originalJson = res.json.bind(res); - let responseSize = 0; - res.json = function (body) { - responseSize = JSON.stringify(body).length; - return originalJson(body); - }; - // Track after response is sent - res.on('finish', async () => { - const responseTime = Date.now() - startTime; - try { - await migrate_1.pool.query(` - INSERT INTO api_token_usage ( - token_id, - endpoint, - method, - status_code, - response_time_ms, - request_size, - response_size, - ip_address, - user_agent - ) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) - `, [ - req.apiToken.id, - req.path, - req.method, - res.statusCode, - responseTime, - requestSize, - responseSize, - req.ip, - req.headers['user-agent'] || null - ]); - // Update last_used_at - await migrate_1.pool.query('UPDATE api_tokens SET last_used_at = CURRENT_TIMESTAMP WHERE id = $1', [req.apiToken.id]); - } - catch (error) { - console.error('Error tracking API usage:', error); - } - }); - next(); -} -// Rate limiting check -async function checkRateLimit(req, res, next) { - if (!req.apiToken) { - return next(); - } - const { id, rate_limit } = req.apiToken; - try { - // Count requests in the last minute - const result = await migrate_1.pool.query(` - SELECT COUNT(*) as request_count - FROM api_token_usage - WHERE token_id = $1 - AND created_at > NOW() - INTERVAL '1 minute' - `, [id]); - const requestCount = parseInt(result.rows[0].request_count); - if (requestCount >= rate_limit) { - return res.status(429).json({ - error: 'Rate limit exceeded', - limit: rate_limit, - current: requestCount, - retry_after: 60 - }); - } - // Add rate limit headers - res.setHeader('X-RateLimit-Limit', rate_limit.toString()); - res.setHeader('X-RateLimit-Remaining', (rate_limit - requestCount).toString()); - res.setHeader('X-RateLimit-Reset', new Date(Date.now() + 60000).toISOString()); - next(); - } - catch (error) 
{ - console.error('Error checking rate limit:', error); - next(); - } -} diff --git a/backend/dist/middleware/wordpressPermissions.js b/backend/dist/middleware/wordpressPermissions.js deleted file mode 100644 index c4e13c55..00000000 --- a/backend/dist/middleware/wordpressPermissions.js +++ /dev/null @@ -1,163 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.validateWordPressPermissions = validateWordPressPermissions; -const migrate_1 = require("../db/migrate"); -const ipaddr_js_1 = __importDefault(require("ipaddr.js")); -/** - * Validates if an IP address matches any of the allowed IP patterns - * Supports CIDR notation and wildcards - */ -function isIpAllowed(clientIp, allowedIps) { - try { - const clientAddr = ipaddr_js_1.default.process(clientIp); - for (const allowedIp of allowedIps) { - const trimmed = allowedIp.trim(); - if (!trimmed) - continue; - // Check for CIDR notation - if (trimmed.includes('/')) { - try { - const [subnet, bits] = trimmed.split('/'); - const range = ipaddr_js_1.default.parseCIDR(trimmed); - if (clientAddr.match(range)) { - return true; - } - } - catch (e) { - console.warn(`Invalid CIDR notation: ${trimmed}`); - continue; - } - } - else { - // Exact match - try { - const allowedAddr = ipaddr_js_1.default.process(trimmed); - if (clientAddr.toString() === allowedAddr.toString()) { - return true; - } - } - catch (e) { - console.warn(`Invalid IP address: ${trimmed}`); - continue; - } - } - } - return false; - } - catch (error) { - console.error('Error processing client IP:', error); - return false; - } -} -/** - * Validates if a domain matches any of the allowed domain patterns - * Supports wildcard subdomains (*.example.com) - */ -function isDomainAllowed(origin, allowedDomains) { - try { - // Extract domain from origin URL - const url = new URL(origin); - 
const domain = url.hostname; - for (const allowedDomain of allowedDomains) { - const trimmed = allowedDomain.trim(); - if (!trimmed) - continue; - // Wildcard subdomain support - if (trimmed.startsWith('*.')) { - const baseDomain = trimmed.substring(2); - if (domain === baseDomain || domain.endsWith('.' + baseDomain)) { - return true; - } - } - else { - // Exact match - if (domain === trimmed) { - return true; - } - } - } - return false; - } - catch (error) { - console.error('Error processing domain:', error); - return false; - } -} -/** - * WordPress API Permissions Middleware - * Validates API access based on WordPress permissions table - */ -async function validateWordPressPermissions(req, res, next) { - // Get API key from header - const apiKey = req.headers['x-api-key']; - // If no API key provided, skip WordPress validation - if (!apiKey) { - return next(); - } - try { - // Query WordPress permissions table - const result = await migrate_1.pool.query(` - SELECT id, user_name, api_key, allowed_ips, allowed_domains, is_active - FROM wp_dutchie_api_permissions - WHERE api_key = $1 AND is_active = 1 - `, [apiKey]); - if (result.rows.length === 0) { - return res.status(401).json({ - error: 'Invalid API key' - }); - } - const permission = result.rows[0]; - // Get client IP - const clientIp = req.headers['x-forwarded-for']?.split(',')[0].trim() || - req.headers['x-real-ip'] || - req.ip || - req.connection.remoteAddress || - ''; - // Validate IP if configured - if (permission.allowed_ips) { - const allowedIps = permission.allowed_ips.split('\n').filter((ip) => ip.trim()); - if (allowedIps.length > 0 && !isIpAllowed(clientIp, allowedIps)) { - return res.status(403).json({ - error: 'IP address not allowed', - client_ip: clientIp - }); - } - } - // Validate domain if configured - const origin = req.get('origin') || req.get('referer') || ''; - if (permission.allowed_domains && origin) { - const allowedDomains = permission.allowed_domains.split('\n').filter((d) => 
d.trim()); - if (allowedDomains.length > 0 && !isDomainAllowed(origin, allowedDomains)) { - return res.status(403).json({ - error: 'Domain not allowed', - origin: origin - }); - } - } - // Update last_used_at timestamp (async, don't wait) - migrate_1.pool.query(` - UPDATE wp_dutchie_api_permissions - SET last_used_at = CURRENT_TIMESTAMP - WHERE id = $1 - `, [permission.id]).catch((err) => { - console.error('Error updating last_used_at:', err); - }); - // Set apiToken on request for tracking middleware - // Default rate limit of 100 requests/minute for WordPress permissions - req.apiToken = { - id: permission.id, - name: permission.user_name, - rate_limit: 100 - }; - next(); - } - catch (error) { - console.error('WordPress permissions validation error:', error); - return res.status(500).json({ - error: 'Internal server error during API validation' - }); - } -} diff --git a/backend/dist/migrations-runner/009_image_sizes.js b/backend/dist/migrations-runner/009_image_sizes.js deleted file mode 100644 index 30858a3d..00000000 --- a/backend/dist/migrations-runner/009_image_sizes.js +++ /dev/null @@ -1,32 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const migrate_1 = require("../db/migrate"); -(async () => { - try { - console.log('🔄 Running image sizes migration...'); - // Add thumbnail and medium paths - await migrate_1.pool.query(` - ALTER TABLE products - ADD COLUMN IF NOT EXISTS thumbnail_path TEXT, - ADD COLUMN IF NOT EXISTS medium_path TEXT - `); - console.log('✅ Added thumbnail_path and medium_path columns'); - // Rename local_image_path to full_path - await migrate_1.pool.query(` - ALTER TABLE products - RENAME COLUMN local_image_path TO full_path - `); - console.log('✅ Renamed local_image_path to full_path'); - // Add index - await migrate_1.pool.query(` - CREATE INDEX IF NOT EXISTS idx_products_images ON products(full_path, thumbnail_path, medium_path) - `); - console.log('✅ Created image index'); - console.log('✅ 
Migration complete!'); - process.exit(0); - } - catch (error) { - console.error('❌ Migration failed:', error); - process.exit(1); - } -})(); diff --git a/backend/dist/routes/analytics.js b/backend/dist/routes/analytics.js deleted file mode 100644 index b14eed37..00000000 --- a/backend/dist/routes/analytics.js +++ /dev/null @@ -1,121 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Get analytics overview -router.get('/overview', async (req, res) => { - try { - const { days = 30 } = req.query; - // Total clicks - const clicksResult = await migrate_1.pool.query(` - SELECT COUNT(*) as total_clicks - FROM clicks - WHERE clicked_at >= NOW() - INTERVAL '${parseInt(days)} days' - `); - // Unique products clicked - const uniqueProductsResult = await migrate_1.pool.query(` - SELECT COUNT(DISTINCT product_id) as unique_products - FROM clicks - WHERE clicked_at >= NOW() - INTERVAL '${parseInt(days)} days' - `); - // Clicks by day - const clicksByDayResult = await migrate_1.pool.query(` - SELECT DATE(clicked_at) as date, COUNT(*) as clicks - FROM clicks - WHERE clicked_at >= NOW() - INTERVAL '${parseInt(days)} days' - GROUP BY DATE(clicked_at) - ORDER BY date DESC - `); - // Top products - const topProductsResult = await migrate_1.pool.query(` - SELECT p.id, p.name, p.price, COUNT(c.id) as click_count - FROM clicks c - JOIN products p ON c.product_id = p.id - WHERE c.clicked_at >= NOW() - INTERVAL '${parseInt(days)} days' - GROUP BY p.id, p.name, p.price - ORDER BY click_count DESC - LIMIT 10 - `); - res.json({ - overview: { - total_clicks: parseInt(clicksResult.rows[0].total_clicks), - unique_products: parseInt(uniqueProductsResult.rows[0].unique_products) - }, - clicks_by_day: clicksByDayResult.rows, - top_products: 
topProductsResult.rows - }); - } - catch (error) { - console.error('Error fetching analytics:', error); - res.status(500).json({ error: 'Failed to fetch analytics' }); - } -}); -// Get product analytics -router.get('/products/:id', async (req, res) => { - try { - const { id } = req.params; - const { days = 30 } = req.query; - // Total clicks for this product - const totalResult = await migrate_1.pool.query(` - SELECT COUNT(*) as total_clicks - FROM clicks - WHERE product_id = $1 - AND clicked_at >= NOW() - INTERVAL '${parseInt(days)} days' - `, [id]); - // Clicks by day - const byDayResult = await migrate_1.pool.query(` - SELECT DATE(clicked_at) as date, COUNT(*) as clicks - FROM clicks - WHERE product_id = $1 - AND clicked_at >= NOW() - INTERVAL '${parseInt(days)} days' - GROUP BY DATE(clicked_at) - ORDER BY date DESC - `, [id]); - res.json({ - product_id: parseInt(id), - total_clicks: parseInt(totalResult.rows[0].total_clicks), - clicks_by_day: byDayResult.rows - }); - } - catch (error) { - console.error('Error fetching product analytics:', error); - res.status(500).json({ error: 'Failed to fetch product analytics' }); - } -}); -// Get campaign analytics -router.get('/campaigns/:id', async (req, res) => { - try { - const { id } = req.params; - const { days = 30 } = req.query; - // Total clicks for this campaign - const totalResult = await migrate_1.pool.query(` - SELECT COUNT(*) as total_clicks - FROM clicks - WHERE campaign_id = $1 - AND clicked_at >= NOW() - INTERVAL '${parseInt(days)} days' - `, [id]); - // Clicks by product in this campaign - const byProductResult = await migrate_1.pool.query(` - SELECT p.id, p.name, COUNT(c.id) as clicks - FROM clicks c - JOIN products p ON c.product_id = p.id - WHERE c.campaign_id = $1 - AND c.clicked_at >= NOW() - INTERVAL '${parseInt(days)} days' - GROUP BY p.id, p.name - ORDER BY clicks DESC - `, [id]); - res.json({ - campaign_id: parseInt(id), - total_clicks: parseInt(totalResult.rows[0].total_clicks), - 
clicks_by_product: byProductResult.rows - }); - } - catch (error) { - console.error('Error fetching campaign analytics:', error); - res.status(500).json({ error: 'Failed to fetch campaign analytics' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/api-permissions.js b/backend/dist/routes/api-permissions.js deleted file mode 100644 index 8123a646..00000000 --- a/backend/dist/routes/api-permissions.js +++ /dev/null @@ -1,174 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const crypto_1 = __importDefault(require("crypto")); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Generate secure random API key (64-character hex) -function generateApiKey() { - return crypto_1.default.randomBytes(32).toString('hex'); -} -// Get all API permissions -router.get('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const result = await migrate_1.pool.query(` - SELECT * - FROM wp_dutchie_api_permissions - ORDER BY created_at DESC - `); - res.json({ permissions: result.rows }); - } - catch (error) { - console.error('Error fetching API permissions:', error); - res.status(500).json({ error: 'Failed to fetch API permissions' }); - } -}); -// Get all dispensaries for dropdown (must be before /:id to avoid route conflict) -router.get('/dispensaries', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const result = await migrate_1.pool.query(` - SELECT id, name - FROM dispensaries - ORDER BY name - `); - res.json({ dispensaries: result.rows }); - } - catch (error) { - console.error('Error fetching dispensaries:', error); - res.status(500).json({ 
error: 'Failed to fetch dispensaries' }); - } -}); -// Get single API permission -router.get('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query(` - SELECT * - FROM wp_dutchie_api_permissions - WHERE id = $1 - `, [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Permission not found' }); - } - res.json({ permission: result.rows[0] }); - } - catch (error) { - console.error('Error fetching API permission:', error); - res.status(500).json({ error: 'Failed to fetch API permission' }); - } -}); -// Create new API permission -router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - // Support both store_id (existing) and dispensary_id (for compatibility) - const { user_name, allowed_ips, allowed_domains, store_id, dispensary_id } = req.body; - const storeIdToUse = store_id || dispensary_id; - if (!user_name) { - return res.status(400).json({ error: 'User name is required' }); - } - if (!storeIdToUse) { - return res.status(400).json({ error: 'Store/Dispensary is required' }); - } - // Get dispensary name for display - const dispensaryResult = await migrate_1.pool.query('SELECT name FROM dispensaries WHERE id = $1', [storeIdToUse]); - if (dispensaryResult.rows.length === 0) { - return res.status(400).json({ error: 'Invalid store/dispensary ID' }); - } - const storeName = dispensaryResult.rows[0].name; - const apiKey = generateApiKey(); - const result = await migrate_1.pool.query(` - INSERT INTO wp_dutchie_api_permissions ( - user_name, - api_key, - allowed_ips, - allowed_domains, - is_active, - store_id, - store_name - ) - VALUES ($1, $2, $3, $4, 1, $5, $6) - RETURNING * - `, [ - user_name, - apiKey, - allowed_ips || null, - allowed_domains || null, - storeIdToUse, - storeName - ]); - res.status(201).json({ - permission: result.rows[0], - message: 'API permission created successfully. 
Save the API key securely - it cannot be retrieved later.' - }); - } - catch (error) { - console.error('Error creating API permission:', error); - res.status(500).json({ error: 'Failed to create API permission' }); - } -}); -// Update API permission -router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const { user_name, allowed_ips, allowed_domains, is_active } = req.body; - const result = await migrate_1.pool.query(` - UPDATE wp_dutchie_api_permissions - SET - user_name = COALESCE($1, user_name), - allowed_ips = COALESCE($2, allowed_ips), - allowed_domains = COALESCE($3, allowed_domains), - is_active = COALESCE($4, is_active) - WHERE id = $5 - RETURNING * - `, [user_name, allowed_ips, allowed_domains, is_active, id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Permission not found' }); - } - res.json({ permission: result.rows[0] }); - } - catch (error) { - console.error('Error updating API permission:', error); - res.status(500).json({ error: 'Failed to update API permission' }); - } -}); -// Toggle permission active status -router.patch('/:id/toggle', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query(` - UPDATE wp_dutchie_api_permissions - SET is_active = NOT is_active - WHERE id = $1 - RETURNING * - `, [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Permission not found' }); - } - res.json({ permission: result.rows[0] }); - } - catch (error) { - console.error('Error toggling API permission:', error); - res.status(500).json({ error: 'Failed to toggle API permission' }); - } -}); -// Delete API permission -router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query('DELETE FROM wp_dutchie_api_permissions 
WHERE id = $1 RETURNING *', [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Permission not found' }); - } - res.json({ message: 'API permission deleted successfully' }); - } - catch (error) { - console.error('Error deleting API permission:', error); - res.status(500).json({ error: 'Failed to delete API permission' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/api-tokens.js b/backend/dist/routes/api-tokens.js deleted file mode 100644 index 39139e9c..00000000 --- a/backend/dist/routes/api-tokens.js +++ /dev/null @@ -1,265 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const crypto_1 = __importDefault(require("crypto")); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Generate secure random token -function generateToken() { - return crypto_1.default.randomBytes(32).toString('hex'); -} -// Get all API tokens -router.get('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const result = await migrate_1.pool.query(` - SELECT - t.*, - u.email as created_by_email, - ( - SELECT COUNT(*) - FROM api_token_usage - WHERE token_id = t.id - AND created_at > NOW() - INTERVAL '24 hours' - ) as requests_24h, - ( - SELECT COUNT(*) - FROM api_token_usage - WHERE token_id = t.id - AND created_at > NOW() - INTERVAL '7 days' - ) as requests_7d, - ( - SELECT COUNT(*) - FROM api_token_usage - WHERE token_id = t.id - ) as total_requests - FROM api_tokens t - LEFT JOIN users u ON t.user_id = u.id - ORDER BY t.created_at DESC - `); - res.json({ tokens: result.rows }); - } - catch (error) { - console.error('Error fetching API tokens:', error); - 
res.status(500).json({ error: 'Failed to fetch API tokens' }); - } -}); -// Get single API token -router.get('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query(` - SELECT - t.*, - u.email as created_by_email - FROM api_tokens t - LEFT JOIN users u ON t.user_id = u.id - WHERE t.id = $1 - `, [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Token not found' }); - } - res.json({ token: result.rows[0] }); - } - catch (error) { - console.error('Error fetching API token:', error); - res.status(500).json({ error: 'Failed to fetch API token' }); - } -}); -// Create new API token -router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { name, description, rate_limit, allowed_endpoints, expires_at } = req.body; - const userId = req.user.userId; - if (!name) { - return res.status(400).json({ error: 'Name is required' }); - } - const token = generateToken(); - const result = await migrate_1.pool.query(` - INSERT INTO api_tokens ( - name, - token, - description, - user_id, - rate_limit, - allowed_endpoints, - expires_at - ) - VALUES ($1, $2, $3, $4, $5, $6, $7) - RETURNING * - `, [ - name, - token, - description || null, - userId, - rate_limit || 100, - allowed_endpoints || null, - expires_at || null - ]); - res.status(201).json({ - token: result.rows[0], - message: 'API token created successfully. Save this token securely - it cannot be retrieved later.' 
- }); - } - catch (error) { - console.error('Error creating API token:', error); - res.status(500).json({ error: 'Failed to create API token' }); - } -}); -// Update API token -router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const { name, description, active, rate_limit, allowed_endpoints, expires_at } = req.body; - const result = await migrate_1.pool.query(` - UPDATE api_tokens - SET - name = COALESCE($1, name), - description = COALESCE($2, description), - active = COALESCE($3, active), - rate_limit = COALESCE($4, rate_limit), - allowed_endpoints = COALESCE($5, allowed_endpoints), - expires_at = COALESCE($6, expires_at) - WHERE id = $7 - RETURNING * - `, [name, description, active, rate_limit, allowed_endpoints, expires_at, id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Token not found' }); - } - res.json({ token: result.rows[0] }); - } - catch (error) { - console.error('Error updating API token:', error); - res.status(500).json({ error: 'Failed to update API token' }); - } -}); -// Delete API token -router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query('DELETE FROM api_tokens WHERE id = $1 RETURNING *', [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Token not found' }); - } - res.json({ message: 'API token deleted successfully' }); - } - catch (error) { - console.error('Error deleting API token:', error); - res.status(500).json({ error: 'Failed to delete API token' }); - } -}); -// Get token usage statistics -router.get('/:id/usage', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const { days = 7 } = req.query; - // Get hourly usage for the past N days - const hourlyUsage = await migrate_1.pool.query(` - SELECT - DATE_TRUNC('hour', 
created_at) as hour, - COUNT(*) as requests, - AVG(response_time_ms) as avg_response_time, - SUM(CASE WHEN status_code >= 200 AND status_code < 300 THEN 1 ELSE 0 END) as successful_requests, - SUM(CASE WHEN status_code >= 400 THEN 1 ELSE 0 END) as failed_requests - FROM api_token_usage - WHERE token_id = $1 - AND created_at > NOW() - INTERVAL '${parseInt(days)} days' - GROUP BY hour - ORDER BY hour DESC - `, [id]); - // Get endpoint usage - const endpointUsage = await migrate_1.pool.query(` - SELECT - endpoint, - method, - COUNT(*) as requests, - AVG(response_time_ms) as avg_response_time - FROM api_token_usage - WHERE token_id = $1 - AND created_at > NOW() - INTERVAL '${parseInt(days)} days' - GROUP BY endpoint, method - ORDER BY requests DESC - LIMIT 20 - `, [id]); - // Get recent requests - const recentRequests = await migrate_1.pool.query(` - SELECT - endpoint, - method, - status_code, - response_time_ms, - ip_address, - created_at - FROM api_token_usage - WHERE token_id = $1 - ORDER BY created_at DESC - LIMIT 100 - `, [id]); - res.json({ - hourly_usage: hourlyUsage.rows, - endpoint_usage: endpointUsage.rows, - recent_requests: recentRequests.rows - }); - } - catch (error) { - console.error('Error fetching token usage:', error); - res.status(500).json({ error: 'Failed to fetch token usage' }); - } -}); -// Get overall API usage statistics -router.get('/stats/overview', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { days = 7 } = req.query; - const stats = await migrate_1.pool.query(` - SELECT - COUNT(DISTINCT token_id) as active_tokens, - COUNT(*) as total_requests, - AVG(response_time_ms) as avg_response_time, - SUM(CASE WHEN status_code >= 200 AND status_code < 300 THEN 1 ELSE 0 END) as successful_requests, - SUM(CASE WHEN status_code >= 400 THEN 1 ELSE 0 END) as failed_requests - FROM api_token_usage - WHERE created_at > NOW() - INTERVAL '${parseInt(days)} days' - `); - // Top tokens by usage - const topTokens = 
await migrate_1.pool.query(` - SELECT - t.id, - t.name, - COUNT(u.id) as requests, - AVG(u.response_time_ms) as avg_response_time - FROM api_tokens t - LEFT JOIN api_token_usage u ON t.id = u.token_id - WHERE u.created_at > NOW() - INTERVAL '${parseInt(days)} days' - GROUP BY t.id, t.name - ORDER BY requests DESC - LIMIT 10 - `); - // Most used endpoints - const topEndpoints = await migrate_1.pool.query(` - SELECT - endpoint, - method, - COUNT(*) as requests, - AVG(response_time_ms) as avg_response_time - FROM api_token_usage - WHERE created_at > NOW() - INTERVAL '${parseInt(days)} days' - GROUP BY endpoint, method - ORDER BY requests DESC - LIMIT 10 - `); - res.json({ - overview: stats.rows[0], - top_tokens: topTokens.rows, - top_endpoints: topEndpoints.rows - }); - } - catch (error) { - console.error('Error fetching API stats:', error); - res.status(500).json({ error: 'Failed to fetch API stats' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/auth.js b/backend/dist/routes/auth.js deleted file mode 100644 index 8c495798..00000000 --- a/backend/dist/routes/auth.js +++ /dev/null @@ -1,43 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const router = (0, express_1.Router)(); -// Login -router.post('/login', async (req, res) => { - try { - const { email, password } = req.body; - if (!email || !password) { - return res.status(400).json({ error: 'Email and password required' }); - } - const user = await (0, middleware_1.authenticateUser)(email, password); - if (!user) { - return res.status(401).json({ error: 'Invalid credentials' }); - } - const token = (0, middleware_1.generateToken)(user); - res.json({ - token, - user: { - id: user.id, - email: user.email, - role: user.role - } - }); - } - catch (error) { - console.error('Login error:', error); - res.status(500).json({ error: 'Internal server error' }); - } -}); 
-// Get current user -router.get('/me', middleware_1.authMiddleware, async (req, res) => { - res.json({ - user: req.user - }); -}); -// Refresh token -router.post('/refresh', middleware_1.authMiddleware, async (req, res) => { - const token = (0, middleware_1.generateToken)(req.user); - res.json({ token }); -}); -exports.default = router; diff --git a/backend/dist/routes/campaigns.js b/backend/dist/routes/campaigns.js deleted file mode 100644 index e96ee8a9..00000000 --- a/backend/dist/routes/campaigns.js +++ /dev/null @@ -1,163 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Get all campaigns -router.get('/', async (req, res) => { - try { - const result = await migrate_1.pool.query(` - SELECT c.*, COUNT(cp.product_id) as product_count - FROM campaigns c - LEFT JOIN campaign_products cp ON c.id = cp.campaign_id - GROUP BY c.id - ORDER BY c.created_at DESC - `); - res.json({ campaigns: result.rows }); - } - catch (error) { - console.error('Error fetching campaigns:', error); - res.status(500).json({ error: 'Failed to fetch campaigns' }); - } -}); -// Get single campaign with products -router.get('/:id', async (req, res) => { - try { - const { id } = req.params; - const campaignResult = await migrate_1.pool.query(` - SELECT * FROM campaigns WHERE id = $1 - `, [id]); - if (campaignResult.rows.length === 0) { - return res.status(404).json({ error: 'Campaign not found' }); - } - const productsResult = await migrate_1.pool.query(` - SELECT p.*, cp.display_order - FROM products p - JOIN campaign_products cp ON p.id = cp.product_id - WHERE cp.campaign_id = $1 - ORDER BY cp.display_order - `, [id]); - res.json({ - campaign: campaignResult.rows[0], - products: productsResult.rows - }); - } - catch (error) { - 
console.error('Error fetching campaign:', error); - res.status(500).json({ error: 'Failed to fetch campaign' }); - } -}); -// Create campaign -router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { name, slug, description, display_style, active, start_date, end_date } = req.body; - if (!name || !slug) { - return res.status(400).json({ error: 'Name and slug required' }); - } - const result = await migrate_1.pool.query(` - INSERT INTO campaigns (name, slug, description, display_style, active, start_date, end_date) - VALUES ($1, $2, $3, $4, $5, $6, $7) - RETURNING * - `, [name, slug, description, display_style || 'grid', active !== false, start_date, end_date]); - res.status(201).json({ campaign: result.rows[0] }); - } - catch (error) { - console.error('Error creating campaign:', error); - if (error.code === '23505') { - return res.status(409).json({ error: 'Campaign slug already exists' }); - } - res.status(500).json({ error: 'Failed to create campaign' }); - } -}); -// Update campaign -router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const { name, slug, description, display_style, active, start_date, end_date } = req.body; - const result = await migrate_1.pool.query(` - UPDATE campaigns - SET name = COALESCE($1, name), - slug = COALESCE($2, slug), - description = COALESCE($3, description), - display_style = COALESCE($4, display_style), - active = COALESCE($5, active), - start_date = COALESCE($6, start_date), - end_date = COALESCE($7, end_date), - updated_at = CURRENT_TIMESTAMP - WHERE id = $8 - RETURNING * - `, [name, slug, description, display_style, active, start_date, end_date, id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Campaign not found' }); - } - res.json({ campaign: result.rows[0] }); - } - catch (error) { - console.error('Error updating campaign:', error); - if (error.code === 
'23505') { - return res.status(409).json({ error: 'Campaign slug already exists' }); - } - res.status(500).json({ error: 'Failed to update campaign' }); - } -}); -// Delete campaign -router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query(` - DELETE FROM campaigns WHERE id = $1 RETURNING id - `, [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Campaign not found' }); - } - res.json({ message: 'Campaign deleted successfully' }); - } - catch (error) { - console.error('Error deleting campaign:', error); - res.status(500).json({ error: 'Failed to delete campaign' }); - } -}); -// Add product to campaign -router.post('/:id/products', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const { product_id, display_order } = req.body; - if (!product_id) { - return res.status(400).json({ error: 'Product ID required' }); - } - const result = await migrate_1.pool.query(` - INSERT INTO campaign_products (campaign_id, product_id, display_order) - VALUES ($1, $2, $3) - ON CONFLICT (campaign_id, product_id) - DO UPDATE SET display_order = $3 - RETURNING * - `, [id, product_id, display_order || 0]); - res.status(201).json({ campaign_product: result.rows[0] }); - } - catch (error) { - console.error('Error adding product to campaign:', error); - res.status(500).json({ error: 'Failed to add product to campaign' }); - } -}); -// Remove product from campaign -router.delete('/:id/products/:product_id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id, product_id } = req.params; - const result = await migrate_1.pool.query(` - DELETE FROM campaign_products - WHERE campaign_id = $1 AND product_id = $2 - RETURNING * - `, [id, product_id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Product not in campaign' 
}); - } - res.json({ message: 'Product removed from campaign' }); - } - catch (error) { - console.error('Error removing product from campaign:', error); - res.status(500).json({ error: 'Failed to remove product from campaign' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/categories.js b/backend/dist/routes/categories.js deleted file mode 100644 index e04ca1e7..00000000 --- a/backend/dist/routes/categories.js +++ /dev/null @@ -1,84 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Get categories (flat list) -router.get('/', async (req, res) => { - try { - const { store_id } = req.query; - let query = ` - SELECT - c.*, - COUNT(DISTINCT p.id) as product_count, - pc.name as parent_name - FROM categories c - LEFT JOIN products p ON c.id = p.category_id - LEFT JOIN categories pc ON c.parent_id = pc.id - `; - const params = []; - if (store_id) { - query += ' WHERE c.store_id = $1'; - params.push(store_id); - } - query += ` - GROUP BY c.id, pc.name - ORDER BY c.display_order, c.name - `; - const result = await migrate_1.pool.query(query, params); - res.json({ categories: result.rows }); - } - catch (error) { - console.error('Error fetching categories:', error); - res.status(500).json({ error: 'Failed to fetch categories' }); - } -}); -// Get category tree (hierarchical) -router.get('/tree', async (req, res) => { - try { - const { store_id } = req.query; - if (!store_id) { - return res.status(400).json({ error: 'store_id is required' }); - } - // Get all categories for the store - const result = await migrate_1.pool.query(` - SELECT - c.*, - COUNT(DISTINCT p.id) as product_count - FROM categories c - LEFT JOIN products p ON c.id = p.category_id AND p.in_stock = true - WHERE c.store_id = $1 - 
GROUP BY c.id - ORDER BY c.display_order, c.name - `, [store_id]); - // Build tree structure - const categories = result.rows; - const categoryMap = new Map(); - const tree = []; - // First pass: create map - categories.forEach((cat) => { - categoryMap.set(cat.id, { ...cat, children: [] }); - }); - // Second pass: build tree - categories.forEach((cat) => { - const node = categoryMap.get(cat.id); - if (cat.parent_id) { - const parent = categoryMap.get(cat.parent_id); - if (parent) { - parent.children.push(node); - } - } - else { - tree.push(node); - } - }); - res.json({ tree }); - } - catch (error) { - console.error('Error fetching category tree:', error); - res.status(500).json({ error: 'Failed to fetch category tree' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/changes.js b/backend/dist/routes/changes.js deleted file mode 100644 index 0af6afd6..00000000 --- a/backend/dist/routes/changes.js +++ /dev/null @@ -1,152 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Get all changes with optional status filter -router.get('/', async (req, res) => { - try { - const { status } = req.query; - let query = ` - SELECT - dc.id, - dc.dispensary_id, - dc.field_name, - dc.old_value, - dc.new_value, - dc.source, - dc.confidence_score, - dc.change_notes, - dc.status, - dc.requires_recrawl, - dc.created_at, - dc.reviewed_at, - dc.reviewed_by, - dc.rejection_reason, - d.name as dispensary_name, - d.slug as dispensary_slug, - d.city, - d.state - FROM dispensary_changes dc - JOIN dispensaries d ON dc.dispensary_id = d.id - `; - const params = []; - if (status) { - query += ` WHERE dc.status = $1`; - params.push(status); - } - query += ` ORDER BY dc.created_at DESC`; - const result = await 
migrate_1.pool.query(query, params); - res.json({ changes: result.rows }); - } - catch (error) { - console.error('Error fetching changes:', error); - res.status(500).json({ error: 'Failed to fetch changes' }); - } -}); -// Get changes statistics (for alert banner) -router.get('/stats', async (req, res) => { - try { - const result = await migrate_1.pool.query(` - SELECT - COUNT(*) FILTER (WHERE status = 'pending') as pending_count, - COUNT(*) FILTER (WHERE status = 'pending' AND requires_recrawl = TRUE) as pending_recrawl_count, - COUNT(*) FILTER (WHERE status = 'approved') as approved_count, - COUNT(*) FILTER (WHERE status = 'rejected') as rejected_count - FROM dispensary_changes - `); - res.json(result.rows[0]); - } - catch (error) { - console.error('Error fetching change stats:', error); - res.status(500).json({ error: 'Failed to fetch change stats' }); - } -}); -// Approve a change and apply it to the dispensary -router.post('/:id/approve', async (req, res) => { - const client = await migrate_1.pool.connect(); - try { - await client.query('BEGIN'); - const { id } = req.params; - const userId = req.user?.id; // From auth middleware - // Get the change record - const changeResult = await client.query(` - SELECT * FROM dispensary_changes WHERE id = $1 AND status = 'pending' - `, [id]); - if (changeResult.rows.length === 0) { - await client.query('ROLLBACK'); - return res.status(404).json({ error: 'Pending change not found' }); - } - const change = changeResult.rows[0]; - // Apply the change to the dispensary table - const updateQuery = ` - UPDATE dispensaries - SET ${change.field_name} = $1, updated_at = CURRENT_TIMESTAMP - WHERE id = $2 - RETURNING * - `; - const dispensaryResult = await client.query(updateQuery, [ - change.new_value, - change.dispensary_id - ]); - if (dispensaryResult.rows.length === 0) { - await client.query('ROLLBACK'); - return res.status(404).json({ error: 'Dispensary not found' }); - } - // Mark the change as approved - await client.query(` 
- UPDATE dispensary_changes - SET - status = 'approved', - reviewed_at = CURRENT_TIMESTAMP, - reviewed_by = $1 - WHERE id = $2 - `, [userId, id]); - await client.query('COMMIT'); - res.json({ - message: 'Change approved and applied', - dispensary: dispensaryResult.rows[0], - requires_recrawl: change.requires_recrawl - }); - } - catch (error) { - await client.query('ROLLBACK'); - console.error('Error approving change:', error); - res.status(500).json({ error: 'Failed to approve change' }); - } - finally { - client.release(); - } -}); -// Reject a change with optional reason -router.post('/:id/reject', async (req, res) => { - try { - const { id } = req.params; - const { reason } = req.body; - const userId = req.user?.id; // From auth middleware - const result = await migrate_1.pool.query(` - UPDATE dispensary_changes - SET - status = 'rejected', - reviewed_at = CURRENT_TIMESTAMP, - reviewed_by = $1, - rejection_reason = $2 - WHERE id = $3 AND status = 'pending' - RETURNING * - `, [userId, reason, id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Pending change not found' }); - } - res.json({ - message: 'Change rejected', - change: result.rows[0] - }); - } - catch (error) { - console.error('Error rejecting change:', error); - res.status(500).json({ error: 'Failed to reject change' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/crawler-sandbox.js b/backend/dist/routes/crawler-sandbox.js deleted file mode 100644 index b7d2870f..00000000 --- a/backend/dist/routes/crawler-sandbox.js +++ /dev/null @@ -1,497 +0,0 @@ -"use strict"; -/** - * Crawler Sandbox API Routes - * - * Endpoints for managing sandbox crawls, templates, and provider detection - */ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = __importDefault(require("express")); -const migrate_1 = require("../db/migrate"); -const middleware_1 = require("../auth/middleware"); -const logger_1 = require("../services/logger"); -const crawler_jobs_1 = require("../services/crawler-jobs"); -const router = express_1.default.Router(); -// Apply auth middleware to all routes -router.use(middleware_1.authMiddleware); -// ======================================== -// Sandbox Entries -// ======================================== -/** - * GET /api/crawler-sandbox - * List sandbox entries with optional filters - */ -router.get('/', async (req, res) => { - try { - const { status, dispensaryId, limit = 50, offset = 0 } = req.query; - let query = ` - SELECT cs.*, d.name as dispensary_name, d.website, d.menu_provider, d.crawler_status - FROM crawler_sandboxes cs - JOIN dispensaries d ON d.id = cs.dispensary_id - WHERE 1=1 - `; - const params = []; - let paramIndex = 1; - if (status) { - query += ` AND cs.status = $${paramIndex}`; - params.push(status); - paramIndex++; - } - if (dispensaryId) { - query += ` AND cs.dispensary_id = $${paramIndex}`; - params.push(Number(dispensaryId)); - paramIndex++; - } - query += ` ORDER BY cs.created_at DESC LIMIT $${paramIndex} OFFSET $${paramIndex + 1}`; - params.push(Number(limit), Number(offset)); - const result = await migrate_1.pool.query(query, params); - // Get total count - const countResult = await migrate_1.pool.query(`SELECT COUNT(*) FROM crawler_sandboxes cs WHERE 1=1 - ${status ? 'AND cs.status = $1' : ''} - ${dispensaryId ? `AND cs.dispensary_id = $${status ? 2 : 1}` : ''}`, status && dispensaryId ? [status, dispensaryId] : status ? [status] : dispensaryId ? 
[dispensaryId] : []); - res.json({ - sandboxes: result.rows, - total: parseInt(countResult.rows[0].count), - limit: Number(limit), - offset: Number(offset), - }); - } - catch (error) { - logger_1.logger.error('api', `Get sandboxes error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/crawler-sandbox/:id - * Get a single sandbox entry with full details - */ -router.get('/:id', async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query(`SELECT cs.*, d.name as dispensary_name, d.website, d.menu_url, - d.menu_provider, d.menu_provider_confidence, d.crawler_mode, d.crawler_status - FROM crawler_sandboxes cs - JOIN dispensaries d ON d.id = cs.dispensary_id - WHERE cs.id = $1`, [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Sandbox entry not found' }); - } - // Get related jobs - const jobs = await migrate_1.pool.query(`SELECT * FROM sandbox_crawl_jobs - WHERE sandbox_id = $1 OR dispensary_id = $2 - ORDER BY created_at DESC - LIMIT 10`, [id, result.rows[0].dispensary_id]); - res.json({ - sandbox: result.rows[0], - jobs: jobs.rows, - }); - } - catch (error) { - logger_1.logger.error('api', `Get sandbox error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/crawler-sandbox/:id/analyze - * Trigger re-analysis of a sandbox entry - */ -router.post('/:id/analyze', (0, middleware_1.requireRole)('admin'), async (req, res) => { - try { - const { id } = req.params; - const sandbox = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [id]); - if (sandbox.rows.length === 0) { - return res.status(404).json({ error: 'Sandbox entry not found' }); - } - // Queue a new sandbox job - const job = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority) - VALUES ($1, $2, 'deep_crawl', 'pending', 20) - RETURNING id`, 
[sandbox.rows[0].dispensary_id, id]); - // Update sandbox status - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'pending', updated_at = NOW() WHERE id = $1`, [id]); - res.json({ - message: 'Analysis job queued', - jobId: job.rows[0].id, - }); - } - catch (error) { - logger_1.logger.error('api', `Analyze sandbox error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/crawler-sandbox/:id/move-to-production - * Move a sandbox entry to production (for Dutchie dispensaries) - */ -router.post('/:id/move-to-production', (0, middleware_1.requireRole)('admin'), async (req, res) => { - try { - const { id } = req.params; - const sandbox = await migrate_1.pool.query(`SELECT cs.*, d.menu_provider - FROM crawler_sandboxes cs - JOIN dispensaries d ON d.id = cs.dispensary_id - WHERE cs.id = $1`, [id]); - if (sandbox.rows.length === 0) { - return res.status(404).json({ error: 'Sandbox entry not found' }); - } - // Can only move to production if provider is dutchie - if (sandbox.rows[0].menu_provider !== 'dutchie') { - return res.status(400).json({ - error: 'Only Dutchie dispensaries can be moved to production currently', - }); - } - // Update dispensary to production mode - await migrate_1.pool.query(`UPDATE dispensaries - SET crawler_mode = 'production', crawler_status = 'idle', updated_at = NOW() - WHERE id = $1`, [sandbox.rows[0].dispensary_id]); - // Mark sandbox as moved - await migrate_1.pool.query(`UPDATE crawler_sandboxes - SET status = 'moved_to_production', updated_at = NOW() - WHERE id = $1`, [id]); - res.json({ message: 'Dispensary moved to production' }); - } - catch (error) { - logger_1.logger.error('api', `Move to production error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -/** - * PATCH /api/crawler-sandbox/:id - * Update sandbox entry (e.g., add human review notes) - */ -router.patch('/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => { - try { 
- const { id } = req.params; - const { human_review_notes, status, suspected_menu_provider } = req.body; - const updates = []; - const params = []; - let paramIndex = 1; - if (human_review_notes !== undefined) { - updates.push(`human_review_notes = $${paramIndex}`); - params.push(human_review_notes); - paramIndex++; - } - if (status) { - updates.push(`status = $${paramIndex}`); - params.push(status); - paramIndex++; - } - if (suspected_menu_provider !== undefined) { - updates.push(`suspected_menu_provider = $${paramIndex}`); - params.push(suspected_menu_provider); - paramIndex++; - } - if (updates.length === 0) { - return res.status(400).json({ error: 'No updates provided' }); - } - updates.push('updated_at = NOW()'); - if (human_review_notes !== undefined) { - updates.push('reviewed_at = NOW()'); - } - params.push(id); - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params); - res.json({ message: 'Sandbox updated' }); - } - catch (error) { - logger_1.logger.error('api', `Update sandbox error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -// ======================================== -// Templates -// ======================================== -/** - * GET /api/crawler-sandbox/templates - * List all crawler templates - */ -router.get('/templates/list', async (req, res) => { - try { - const result = await migrate_1.pool.query(`SELECT * FROM crawler_templates ORDER BY provider, is_default_for_provider DESC, name`); - res.json({ templates: result.rows }); - } - catch (error) { - logger_1.logger.error('api', `Get templates error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -/** - * GET /api/crawler-sandbox/templates/:id - * Get a single template - */ -router.get('/templates/:id', async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]); - if 
(result.rows.length === 0) { - return res.status(404).json({ error: 'Template not found' }); - } - res.json({ template: result.rows[0] }); - } - catch (error) { - logger_1.logger.error('api', `Get template error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/crawler-sandbox/templates - * Create a new template - */ -router.post('/templates', (0, middleware_1.requireRole)('admin'), async (req, res) => { - try { - const { provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, } = req.body; - if (!provider || !name) { - return res.status(400).json({ error: 'provider and name are required' }); - } - const result = await migrate_1.pool.query(`INSERT INTO crawler_templates - (provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, created_by) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8) - RETURNING *`, [ - provider, - name, - JSON.stringify(selector_config || {}), - JSON.stringify(navigation_config || {}), - JSON.stringify(transform_config || {}), - JSON.stringify(validation_rules || {}), - notes, - req.user?.email || 'system', - ]); - res.status(201).json({ template: result.rows[0] }); - } - catch (error) { - logger_1.logger.error('api', `Create template error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -/** - * PUT /api/crawler-sandbox/templates/:id - * Update a template - */ -router.put('/templates/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => { - try { - const { id } = req.params; - const { is_active, is_default_for_provider, selector_config, navigation_config, transform_config, validation_rules, notes, } = req.body; - const updates = []; - const params = []; - let paramIndex = 1; - if (is_active !== undefined) { - updates.push(`is_active = $${paramIndex}`); - params.push(is_active); - paramIndex++; - } - if (is_default_for_provider !== undefined) { - 
updates.push(`is_default_for_provider = $${paramIndex}`); - params.push(is_default_for_provider); - paramIndex++; - } - if (selector_config !== undefined) { - updates.push(`selector_config = $${paramIndex}`); - params.push(JSON.stringify(selector_config)); - paramIndex++; - } - if (navigation_config !== undefined) { - updates.push(`navigation_config = $${paramIndex}`); - params.push(JSON.stringify(navigation_config)); - paramIndex++; - } - if (transform_config !== undefined) { - updates.push(`transform_config = $${paramIndex}`); - params.push(JSON.stringify(transform_config)); - paramIndex++; - } - if (validation_rules !== undefined) { - updates.push(`validation_rules = $${paramIndex}`); - params.push(JSON.stringify(validation_rules)); - paramIndex++; - } - if (notes !== undefined) { - updates.push(`notes = $${paramIndex}`); - params.push(notes); - paramIndex++; - } - if (updates.length === 0) { - return res.status(400).json({ error: 'No updates provided' }); - } - updates.push('updated_at = NOW()'); - params.push(id); - await migrate_1.pool.query(`UPDATE crawler_templates SET ${updates.join(', ')} WHERE id = $${paramIndex}`, params); - const result = await migrate_1.pool.query('SELECT * FROM crawler_templates WHERE id = $1', [id]); - res.json({ template: result.rows[0] }); - } - catch (error) { - logger_1.logger.error('api', `Update template error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -// ======================================== -// Jobs -// ======================================== -/** - * GET /api/crawler-sandbox/jobs - * List sandbox crawl jobs - */ -router.get('/jobs/list', async (req, res) => { - try { - const { status, dispensaryId, limit = 50 } = req.query; - let query = ` - SELECT sj.*, d.name as dispensary_name - FROM sandbox_crawl_jobs sj - JOIN dispensaries d ON d.id = sj.dispensary_id - WHERE 1=1 - `; - const params = []; - let paramIndex = 1; - if (status) { - query += ` AND sj.status = $${paramIndex}`; - 
params.push(status); - paramIndex++; - } - if (dispensaryId) { - query += ` AND sj.dispensary_id = $${paramIndex}`; - params.push(Number(dispensaryId)); - paramIndex++; - } - query += ` ORDER BY sj.created_at DESC LIMIT $${paramIndex}`; - params.push(Number(limit)); - const result = await migrate_1.pool.query(query, params); - res.json({ jobs: result.rows }); - } - catch (error) { - logger_1.logger.error('api', `Get jobs error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/crawler-sandbox/jobs/detect/:dispensaryId - * Trigger provider detection for a dispensary - */ -router.post('/jobs/detect/:dispensaryId', (0, middleware_1.requireRole)('admin'), async (req, res) => { - try { - const { dispensaryId } = req.params; - // Create detection job - const job = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority) - VALUES ($1, 'detection', 'pending', 30) - RETURNING id`, [dispensaryId]); - // Update dispensary status - await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`, [dispensaryId]); - res.json({ - message: 'Detection job queued', - jobId: job.rows[0].id, - }); - } - catch (error) { - logger_1.logger.error('api', `Queue detection error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -/** - * POST /api/crawler-sandbox/jobs/run/:id - * Immediately run a sandbox job - */ -router.post('/jobs/run/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => { - try { - const { id } = req.params; - const job = await migrate_1.pool.query('SELECT * FROM sandbox_crawl_jobs WHERE id = $1', [id]); - if (job.rows.length === 0) { - return res.status(404).json({ error: 'Job not found' }); - } - const jobData = job.rows[0]; - // Run the job immediately - let result; - if (jobData.job_type === 'detection') { - result = await (0, 
crawler_jobs_1.runDetectMenuProviderJob)(jobData.dispensary_id); - } - else { - result = await (0, crawler_jobs_1.runSandboxCrawlJob)(jobData.dispensary_id, jobData.sandbox_id); - } - // Update job status - await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs - SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3 - WHERE id = $4`, [ - result.success ? 'completed' : 'failed', - JSON.stringify(result.data || {}), - result.success ? null : result.message, - id, - ]); - res.json(result); - } - catch (error) { - logger_1.logger.error('api', `Run job error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -// ======================================== -// Stats -// ======================================== -/** - * GET /api/crawler-sandbox/stats - * Get sandbox/crawler statistics - */ -router.get('/stats/overview', async (req, res) => { - try { - // Dispensary provider stats - const providerStats = await migrate_1.pool.query(` - SELECT - menu_provider, - COUNT(*) as count, - AVG(menu_provider_confidence)::integer as avg_confidence - FROM dispensaries - WHERE menu_provider IS NOT NULL - GROUP BY menu_provider - ORDER BY count DESC - `); - // Mode stats - const modeStats = await migrate_1.pool.query(` - SELECT - crawler_mode, - COUNT(*) as count - FROM dispensaries - GROUP BY crawler_mode - `); - // Status stats - const statusStats = await migrate_1.pool.query(` - SELECT - crawler_status, - COUNT(*) as count - FROM dispensaries - GROUP BY crawler_status - ORDER BY count DESC - `); - // Sandbox stats - const sandboxStats = await migrate_1.pool.query(` - SELECT - status, - COUNT(*) as count - FROM crawler_sandboxes - GROUP BY status - `); - // Job stats - const jobStats = await migrate_1.pool.query(` - SELECT - status, - job_type, - COUNT(*) as count - FROM sandbox_crawl_jobs - GROUP BY status, job_type - `); - // Recent activity - const recentActivity = await migrate_1.pool.query(` - SELECT 'sandbox' as type, id, 
dispensary_id, status, created_at - FROM crawler_sandboxes - ORDER BY created_at DESC - LIMIT 5 - `); - res.json({ - providers: providerStats.rows, - modes: modeStats.rows, - statuses: statusStats.rows, - sandbox: sandboxStats.rows, - jobs: jobStats.rows, - recentActivity: recentActivity.rows, - }); - } - catch (error) { - logger_1.logger.error('api', `Get stats error: ${error.message}`); - res.status(500).json({ error: error.message }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/dashboard.js b/backend/dist/routes/dashboard.js deleted file mode 100644 index 2fbaeab3..00000000 --- a/backend/dist/routes/dashboard.js +++ /dev/null @@ -1,116 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const connection_1 = require("../dutchie-az/db/connection"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Get dashboard stats - uses consolidated dutchie-az DB -router.get('/stats', async (req, res) => { - try { - // Store stats from dispensaries table in consolidated DB - const dispensariesResult = await (0, connection_1.query)(` - SELECT - COUNT(*) as total, - COUNT(*) FILTER (WHERE menu_type IS NOT NULL AND menu_type != 'unknown') as active, - COUNT(*) FILTER (WHERE platform_dispensary_id IS NOT NULL) as with_platform_id, - COUNT(*) FILTER (WHERE menu_url IS NOT NULL) as with_menu_url, - MIN(last_crawled_at) as oldest_crawl, - MAX(last_crawled_at) as latest_crawl - FROM dispensaries - `); - // Product stats from dutchie_products table - const productsResult = await (0, connection_1.query)(` - SELECT - COUNT(*) as total, - COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock, - COUNT(*) FILTER (WHERE primary_image_url IS NOT NULL) as with_images, - COUNT(DISTINCT brand_name) FILTER (WHERE brand_name IS NOT NULL AND brand_name != '') as unique_brands, - COUNT(DISTINCT 
dispensary_id) as dispensaries_with_products - FROM dutchie_products - `); - // Brand stats from dutchie_products - const brandResult = await (0, connection_1.query)(` - SELECT COUNT(DISTINCT brand_name) as total - FROM dutchie_products - WHERE brand_name IS NOT NULL AND brand_name != '' - `); - // Recent products added (last 24 hours) - const recentProductsResult = await (0, connection_1.query)(` - SELECT COUNT(*) as new_products_24h - FROM dutchie_products - WHERE created_at >= NOW() - INTERVAL '24 hours' - `); - // Combine results - const storeStats = dispensariesResult.rows[0]; - const productStats = productsResult.rows[0]; - res.json({ - stores: { - total: parseInt(storeStats.total) || 0, - active: parseInt(storeStats.active) || 0, - with_menu_url: parseInt(storeStats.with_menu_url) || 0, - with_platform_id: parseInt(storeStats.with_platform_id) || 0, - oldest_crawl: storeStats.oldest_crawl, - latest_crawl: storeStats.latest_crawl - }, - products: { - total: parseInt(productStats.total) || 0, - in_stock: parseInt(productStats.in_stock) || 0, - with_images: parseInt(productStats.with_images) || 0, - unique_brands: parseInt(productStats.unique_brands) || 0, - dispensaries_with_products: parseInt(productStats.dispensaries_with_products) || 0 - }, - brands: { - total: parseInt(brandResult.rows[0].total) || 0 - }, - campaigns: { total: 0, active: 0 }, // Legacy - no longer used - clicks: { clicks_24h: 0 }, // Legacy - no longer used - recent: recentProductsResult.rows[0] - }); - } - catch (error) { - console.error('Error fetching dashboard stats:', error); - res.status(500).json({ error: 'Failed to fetch dashboard stats' }); - } -}); -// Get recent activity - from consolidated dutchie-az DB -router.get('/activity', async (req, res) => { - try { - const { limit = 20 } = req.query; - // Recent crawls from dispensaries (with product counts from dutchie_products) - const scrapesResult = await (0, connection_1.query)(` - SELECT - d.name, - d.last_crawled_at as 
last_scraped_at, - d.product_count - FROM dispensaries d - WHERE d.last_crawled_at IS NOT NULL - ORDER BY d.last_crawled_at DESC - LIMIT $1 - `, [limit]); - // Recent products from dutchie_products - const productsResult = await (0, connection_1.query)(` - SELECT - p.name, - 0 as price, - p.brand_name as brand, - p.thc as thc_percentage, - p.cbd as cbd_percentage, - d.name as store_name, - p.created_at as first_seen_at - FROM dutchie_products p - JOIN dispensaries d ON p.dispensary_id = d.id - ORDER BY p.created_at DESC - LIMIT $1 - `, [limit]); - res.json({ - recent_scrapes: scrapesResult.rows, - recent_products: productsResult.rows - }); - } - catch (error) { - console.error('Error fetching dashboard activity:', error); - res.status(500).json({ error: 'Failed to fetch dashboard activity' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/dispensaries.js b/backend/dist/routes/dispensaries.js deleted file mode 100644 index cbb08c75..00000000 --- a/backend/dist/routes/dispensaries.js +++ /dev/null @@ -1,437 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Valid menu_type values -const VALID_MENU_TYPES = ['dutchie', 'treez', 'jane', 'weedmaps', 'leafly', 'meadow', 'blaze', 'flowhub', 'dispense', 'cova', 'other', 'unknown']; -// Get all dispensaries -router.get('/', async (req, res) => { - try { - const { menu_type } = req.query; - let query = ` - SELECT - id, - azdhs_id, - name, - company_name, - slug, - address, - city, - state, - zip, - phone, - email, - website, - dba_name, - google_rating, - google_review_count, - status_line, - azdhs_url, - latitude, - longitude, - menu_url, - menu_type, - menu_provider, - menu_provider_confidence, - scraper_template, - last_menu_scrape, - 
menu_scrape_status, - platform_dispensary_id, - created_at, - updated_at - FROM dispensaries - `; - const params = []; - // Filter by menu_type if provided - if (menu_type) { - query += ` WHERE menu_type = $1`; - params.push(menu_type); - } - query += ` ORDER BY name`; - const result = await migrate_1.pool.query(query, params); - res.json({ dispensaries: result.rows }); - } - catch (error) { - console.error('Error fetching dispensaries:', error); - res.status(500).json({ error: 'Failed to fetch dispensaries' }); - } -}); -// Get menu type stats -router.get('/stats/menu-types', async (req, res) => { - try { - const result = await migrate_1.pool.query(` - SELECT menu_type, COUNT(*) as count - FROM dispensaries - GROUP BY menu_type - ORDER BY count DESC - `); - res.json({ menu_types: result.rows, valid_types: VALID_MENU_TYPES }); - } - catch (error) { - console.error('Error fetching menu type stats:', error); - res.status(500).json({ error: 'Failed to fetch menu type stats' }); - } -}); -// Get single dispensary by slug -router.get('/:slug', async (req, res) => { - try { - const { slug } = req.params; - const result = await migrate_1.pool.query(` - SELECT - id, - azdhs_id, - name, - company_name, - slug, - address, - city, - state, - zip, - phone, - email, - website, - dba_name, - google_rating, - google_review_count, - status_line, - azdhs_url, - latitude, - longitude, - menu_url, - menu_type, - menu_provider, - menu_provider_confidence, - scraper_template, - scraper_config, - last_menu_scrape, - menu_scrape_status, - platform_dispensary_id, - created_at, - updated_at - FROM dispensaries - WHERE slug = $1 - `, [slug]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - res.json(result.rows[0]); - } - catch (error) { - console.error('Error fetching dispensary:', error); - res.status(500).json({ error: 'Failed to fetch dispensary' }); - } -}); -// Update dispensary -router.put('/:id', async (req, res) => { - try { 
- const { id } = req.params; - const { dba_name, website, phone, email, google_rating, google_review_count, menu_url, menu_type, scraper_template, scraper_config, menu_scrape_status } = req.body; - // Validate menu_type if provided - if (menu_type !== undefined && menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) { - return res.status(400).json({ - error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')}` - }); - } - const result = await migrate_1.pool.query(` - UPDATE dispensaries - SET - dba_name = COALESCE($1, dba_name), - website = COALESCE($2, website), - phone = COALESCE($3, phone), - email = COALESCE($4, email), - google_rating = COALESCE($5, google_rating), - google_review_count = COALESCE($6, google_review_count), - menu_url = COALESCE($7, menu_url), - menu_type = COALESCE($8, menu_type), - scraper_template = COALESCE($9, scraper_template), - scraper_config = COALESCE($10, scraper_config), - menu_scrape_status = COALESCE($11, menu_scrape_status), - updated_at = CURRENT_TIMESTAMP - WHERE id = $12 - RETURNING * - `, [ - dba_name, - website, - phone, - email, - google_rating, - google_review_count, - menu_url, - menu_type, - scraper_template, - scraper_config, - menu_scrape_status, - id - ]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - res.json(result.rows[0]); - } - catch (error) { - console.error('Error updating dispensary:', error); - res.status(500).json({ error: 'Failed to update dispensary' }); - } -}); -// Get products for a dispensary by slug -router.get('/:slug/products', async (req, res) => { - try { - const { slug } = req.params; - const { category } = req.query; - // First get the dispensary ID from slug - const dispensaryResult = await migrate_1.pool.query(` - SELECT id FROM dispensaries WHERE slug = $1 - `, [slug]); - if (dispensaryResult.rows.length === 0) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - const 
dispensaryId = dispensaryResult.rows[0].id; - // Build query for products - let query = ` - SELECT - p.id, - p.name, - p.brand, - p.variant, - p.slug, - p.description, - p.regular_price, - p.sale_price, - p.thc_percentage, - p.cbd_percentage, - p.strain_type, - p.terpenes, - p.effects, - p.flavors, - p.image_url, - p.dutchie_url, - p.in_stock, - p.created_at, - p.updated_at - FROM products p - WHERE p.dispensary_id = $1 - `; - const params = [dispensaryId]; - if (category) { - query += ` AND p.category = $2`; - params.push(category); - } - query += ` ORDER BY p.created_at DESC`; - const result = await migrate_1.pool.query(query, params); - res.json({ products: result.rows }); - } - catch (error) { - console.error('Error fetching dispensary products:', error); - res.status(500).json({ error: 'Failed to fetch products' }); - } -}); -// Get unique brands for a dispensary by slug -router.get('/:slug/brands', async (req, res) => { - try { - const { slug } = req.params; - const { search } = req.query; - // First get the dispensary ID from slug - const dispensaryResult = await migrate_1.pool.query(` - SELECT id FROM dispensaries WHERE slug = $1 - `, [slug]); - if (dispensaryResult.rows.length === 0) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - const dispensaryId = dispensaryResult.rows[0].id; - // Build query with optional search filter - let query = ` - SELECT DISTINCT - brand, - COUNT(*) as product_count - FROM products - WHERE dispensary_id = $1 AND brand IS NOT NULL - `; - const params = [dispensaryId]; - // Add search filter if provided - if (search) { - query += ` AND brand ILIKE $2`; - params.push(`%${search}%`); - } - query += ` GROUP BY brand ORDER BY product_count DESC, brand ASC`; - const result = await migrate_1.pool.query(query, params); - res.json({ brands: result.rows }); - } - catch (error) { - console.error('Error fetching dispensary brands:', error); - res.status(500).json({ error: 'Failed to fetch brands' }); - } -}); -// 
Get products with discounts/specials for a dispensary by slug -router.get('/:slug/specials', async (req, res) => { - try { - const { slug } = req.params; - const { search } = req.query; - // First get the dispensary ID from slug - const dispensaryResult = await migrate_1.pool.query(` - SELECT id FROM dispensaries WHERE slug = $1 - `, [slug]); - if (dispensaryResult.rows.length === 0) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - const dispensaryId = dispensaryResult.rows[0].id; - // Build query to get products with discounts - let query = ` - SELECT - p.id, - p.name, - p.brand, - p.variant, - p.slug, - p.description, - p.regular_price, - p.sale_price, - p.discount_type, - p.discount_value, - p.thc_percentage, - p.cbd_percentage, - p.strain_type, - p.terpenes, - p.effects, - p.flavors, - p.image_url, - p.dutchie_url, - p.in_stock, - p.created_at, - p.updated_at - FROM products p - WHERE p.dispensary_id = $1 - AND p.discount_type IS NOT NULL - AND p.discount_value IS NOT NULL - `; - const params = [dispensaryId]; - // Add search filter if provided - if (search) { - query += ` AND (p.name ILIKE $2 OR p.brand ILIKE $2 OR p.description ILIKE $2)`; - params.push(`%${search}%`); - } - query += ` ORDER BY p.created_at DESC`; - const result = await migrate_1.pool.query(query, params); - res.json({ specials: result.rows }); - } - catch (error) { - console.error('Error fetching dispensary specials:', error); - res.status(500).json({ error: 'Failed to fetch specials' }); - } -}); -// Trigger scraping for a dispensary -router.post('/:slug/scrape', async (req, res) => { - try { - const { slug } = req.params; - const { type } = req.body; // 'products' | 'brands' | 'specials' | 'all' - if (!['products', 'brands', 'specials', 'all'].includes(type)) { - return res.status(400).json({ error: 'Invalid type. 
Must be: products, brands, specials, or all' }); - } - // Get the dispensary - const dispensaryResult = await migrate_1.pool.query(` - SELECT id, name, slug, website, menu_url, scraper_template, scraper_config - FROM dispensaries - WHERE slug = $1 - `, [slug]); - if (dispensaryResult.rows.length === 0) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - const dispensary = dispensaryResult.rows[0]; - if (!dispensary.menu_url && !dispensary.website) { - return res.status(400).json({ error: 'Dispensary has no menu URL or website configured' }); - } - // Update last_menu_scrape time and status - await migrate_1.pool.query(` - UPDATE dispensaries - SET - last_menu_scrape = CURRENT_TIMESTAMP, - menu_scrape_status = 'pending', - updated_at = CURRENT_TIMESTAMP - WHERE id = $1 - `, [dispensary.id]); - // Log the scrape request - console.log(`[SCRAPE REQUEST] Dispensary: ${dispensary.name} (${slug}), Type: ${type}`); - console.log(` Menu URL: ${dispensary.menu_url || dispensary.website}`); - console.log(` Template: ${dispensary.scraper_template || 'N/A'}`); - // TODO: Actually trigger the scraper here - // For now, this is a placeholder that updates the status - // You can integrate with your existing scraper infrastructure - res.json({ - success: true, - message: `Scraping queued for ${dispensary.name}`, - type, - dispensary: { - id: dispensary.id, - name: dispensary.name, - slug: dispensary.slug - } - }); - } - catch (error) { - console.error('Error triggering scrape:', error); - res.status(500).json({ error: 'Failed to trigger scraping' }); - } -}); -// Update menu_type for a dispensary (dedicated endpoint) -router.patch('/:id/menu-type', async (req, res) => { - try { - const { id } = req.params; - const { menu_type } = req.body; - // Validate menu_type - if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) { - return res.status(400).json({ - error: `Invalid menu_type. 
Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)` - }); - } - const result = await migrate_1.pool.query(` - UPDATE dispensaries - SET menu_type = $1, updated_at = CURRENT_TIMESTAMP - WHERE id = $2 - RETURNING id, name, slug, menu_type, menu_provider, menu_url - `, [menu_type || null, id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - res.json({ - success: true, - dispensary: result.rows[0] - }); - } - catch (error) { - console.error('Error updating menu_type:', error); - res.status(500).json({ error: 'Failed to update menu_type' }); - } -}); -// Bulk update menu_type for multiple dispensaries -router.post('/bulk/menu-type', async (req, res) => { - try { - const { dispensary_ids, menu_type } = req.body; - if (!Array.isArray(dispensary_ids) || dispensary_ids.length === 0) { - return res.status(400).json({ error: 'dispensary_ids must be a non-empty array' }); - } - // Validate menu_type - if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) { - return res.status(400).json({ - error: `Invalid menu_type. 
Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)` - }); - } - const result = await migrate_1.pool.query(` - UPDATE dispensaries - SET menu_type = $1, updated_at = CURRENT_TIMESTAMP - WHERE id = ANY($2::int[]) - RETURNING id, name, slug, menu_type - `, [menu_type || null, dispensary_ids]); - res.json({ - success: true, - updated_count: result.rowCount, - dispensaries: result.rows - }); - } - catch (error) { - console.error('Error bulk updating menu_type:', error); - res.status(500).json({ error: 'Failed to bulk update menu_type' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/logs.js b/backend/dist/routes/logs.js deleted file mode 100644 index b26654c6..00000000 --- a/backend/dist/routes/logs.js +++ /dev/null @@ -1,29 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const logger_1 = require("../services/logger"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -router.get('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { limit = '100', level, category } = req.query; - const logs = logger_1.logger.getLogs(parseInt(limit), level, category); - res.json({ logs }); - } - catch (error) { - console.error('Error fetching logs:', error); - res.status(500).json({ error: 'Failed to fetch logs' }); - } -}); -router.delete('/', (0, middleware_1.requireRole)('superadmin'), async (req, res) => { - try { - logger_1.logger.clear(); - res.json({ message: 'Logs cleared' }); - } - catch (error) { - console.error('Error clearing logs:', error); - res.status(500).json({ error: 'Failed to clear logs' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/parallel-scrape.js b/backend/dist/routes/parallel-scrape.js deleted file mode 100644 index 5384c256..00000000 --- a/backend/dist/routes/parallel-scrape.js +++ 
/dev/null @@ -1,182 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const migrate_1 = require("../db/migrate"); -const proxy_1 = require("../services/proxy"); -const middleware_1 = require("../auth/middleware"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'; -// In-memory job tracking -const activeJobs = new Map(); -// Get job status -router.get('/status/:jobId', (req, res) => { - const job = activeJobs.get(req.params.jobId); - if (!job) { - return res.status(404).json({ error: 'Job not found' }); - } - res.json(job); -}); -// List active jobs -router.get('/jobs', (req, res) => { - const jobs = Array.from(activeJobs.values()); - res.json({ jobs }); -}); -// Start parallel scrape -router.post('/start', async (req, res) => { - const { storeName = 'Deeply Rooted', workers = 15, useProxies = true } = req.body; - try { - // Find the store - const storeResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${storeName}%`]); - if (storeResult.rows.length === 0) { - return res.status(404).json({ error: `Store not found: ${storeName}` }); - } - const store = storeResult.rows[0]; - // Get categories - const categoriesResult = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [store.id]); - if (categoriesResult.rows.length === 0) { - return res.status(404).json({ error: 'No categories found for this store' }); - } - const categories = categoriesResult.rows; - // Create job - const jobId = `scrape-${Date.now()}`; - const job = { - id: jobId, - storeName: store.name, - status: 'running', - workers, - startedAt: new Date(), - results: [] - }; - activeJobs.set(jobId, job); - // Start scraping in background - 
runParallelScrape(job, store, categories, workers, useProxies).catch(err => { - console.error('Parallel scrape error:', err); - job.status = 'failed'; - }); - res.json({ - message: 'Parallel scrape started', - jobId, - store: store.name, - categories: categories.length, - workers - }); - } - catch (error) { - console.error('Failed to start parallel scrape:', error); - res.status(500).json({ error: error.message }); - } -}); -async function runParallelScrape(job, store, categories, numWorkers, useProxies) { - const puppeteer = require('puppeteer-extra'); - const StealthPlugin = require('puppeteer-extra-plugin-stealth'); - puppeteer.use(StealthPlugin()); - // Expand categories for multiple passes - const expandedCategories = []; - const passes = Math.ceil(numWorkers / Math.max(categories.length, 1)); - for (let i = 0; i < passes; i++) { - expandedCategories.push(...categories); - } - const categoryIndex = { current: 0 }; - const worker = async (workerId) => { - while (categoryIndex.current < expandedCategories.length) { - const idx = categoryIndex.current++; - const category = expandedCategories[idx]; - if (!category) - break; - const result = await scrapeCategory(puppeteer, workerId, category, useProxies); - job.results.push({ - category: category.name, - success: result.success, - products: result.products, - error: result.error - }); - // Delay between requests - await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000)); - } - }; - // Start workers with staggered starts - const workers = []; - for (let i = 0; i < numWorkers; i++) { - workers.push(worker(i + 1)); - await new Promise(resolve => setTimeout(resolve, 500)); - } - await Promise.all(workers); - job.status = 'completed'; - job.completedAt = new Date(); - // Clean up job after 1 hour - setTimeout(() => activeJobs.delete(job.id), 60 * 60 * 1000); -} -async function scrapeCategory(puppeteer, workerId, category, useProxies) { - let browser = null; - let proxyId = null; - try { - let proxy 
= null; - if (useProxies) { - proxy = await (0, proxy_1.getActiveProxy)(); - } - const args = [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-accelerated-2d-canvas', - '--disable-gpu', - '--window-size=1920,1080', - ]; - if (proxy) { - proxyId = proxy.id; - if (proxy.protocol === 'socks5' || proxy.protocol === 'socks') { - args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`); - } - else { - args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`); - } - } - browser = await puppeteer.launch({ - headless: 'new', - args, - executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium', - }); - const page = await browser.newPage(); - await page.setUserAgent(FIREFOX_USER_AGENT); - await page.setViewport({ width: 1920, height: 1080 }); - if (proxy?.username && proxy?.password) { - await page.authenticate({ - username: proxy.username, - password: proxy.password, - }); - } - console.log(`[Worker ${workerId}] Scraping: ${category.name} (${category.url})`); - const response = await page.goto(category.url, { - waitUntil: 'networkidle2', - timeout: 60000, - }); - if (!response || !response.ok()) { - throw new Error(`Failed to load page: ${response?.status()}`); - } - await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', { - timeout: 30000, - }).catch(() => { }); - const products = await page.evaluate(() => { - // Try data-testid first, then fall back to product links - const listItems = document.querySelectorAll('[data-testid="product-list-item"]'); - if (listItems.length > 0) - return listItems.length; - return document.querySelectorAll('a[href*="/product/"]').length; - }); - console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`); - await browser.close(); - return { success: true, products }; - } - catch (error) { - console.error(`[Worker ${workerId}] Error:`, error.message); - if (proxyId && (0, 
proxy_1.isBotDetectionError)(error.message)) { - (0, proxy_1.putProxyInTimeout)(proxyId, error.message); - } - if (browser) { - await browser.close().catch(() => { }); - } - return { success: false, products: 0, error: error.message }; - } -} -exports.default = router; diff --git a/backend/dist/routes/products.js b/backend/dist/routes/products.js deleted file mode 100644 index 3cab78b3..00000000 --- a/backend/dist/routes/products.js +++ /dev/null @@ -1,341 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const minio_1 = require("../utils/minio"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Freshness threshold: data older than this is considered stale -const STALE_THRESHOLD_HOURS = 4; -function calculateFreshness(lastCrawlAt) { - if (!lastCrawlAt) { - return { - last_crawl_at: null, - is_stale: true, - freshness: 'Never crawled', - hours_since_crawl: null - }; - } - const now = new Date(); - const diffMs = now.getTime() - lastCrawlAt.getTime(); - const diffHours = diffMs / (1000 * 60 * 60); - const isStale = diffHours > STALE_THRESHOLD_HOURS; - let freshnessText; - if (diffHours < 1) { - const mins = Math.round(diffHours * 60); - freshnessText = `Last crawled ${mins} minute${mins !== 1 ? 's' : ''} ago`; - } - else if (diffHours < 24) { - const hrs = Math.round(diffHours); - freshnessText = `Last crawled ${hrs} hour${hrs !== 1 ? 's' : ''} ago`; - } - else { - const days = Math.round(diffHours / 24); - freshnessText = `Last crawled ${days} day${days !== 1 ? 
's' : ''} ago`; - } - if (isStale) { - freshnessText += ' (STALE)'; - } - return { - last_crawl_at: lastCrawlAt.toISOString(), - is_stale: isStale, - freshness: freshnessText, - hours_since_crawl: Math.round(diffHours * 10) / 10 - }; -} -// Helper function to filter fields from object -function selectFields(obj, fields) { - if (!fields || fields.length === 0) - return obj; - const result = {}; - fields.forEach(field => { - if (obj.hasOwnProperty(field)) { - result[field] = obj[field]; - } - }); - return result; -} -// Get all products with filters, sorting, and field selection -router.get('/', async (req, res) => { - try { - const { store_id, category_id, in_stock, search, brand, min_price, max_price, min_thc, max_thc, strain_type, sort_by = 'last_seen_at', sort_order = 'desc', limit = 50, offset = 0, fields } = req.query; - // Validate sort field to prevent SQL injection - const allowedSortFields = [ - 'id', 'name', 'brand', 'price', 'thc_percentage', - 'cbd_percentage', 'last_seen_at', 'created_at' - ]; - const sortField = allowedSortFields.includes(sort_by) - ? sort_by - : 'last_seen_at'; - const sortDirection = sort_order.toLowerCase() === 'asc' ? 
'ASC' : 'DESC'; - let query = ` - SELECT p.*, s.name as store_name, c.name as category_name - FROM products p - LEFT JOIN stores s ON p.store_id = s.id - LEFT JOIN categories c ON p.category_id = c.id - WHERE 1=1 - `; - const params = []; - let paramCount = 1; - // Store filter - if (store_id) { - query += ` AND p.store_id = $${paramCount}`; - params.push(store_id); - paramCount++; - } - // Category filter - if (category_id) { - query += ` AND p.category_id = $${paramCount}`; - params.push(category_id); - paramCount++; - } - // Stock filter - if (in_stock !== undefined) { - query += ` AND p.in_stock = $${paramCount}`; - params.push(in_stock === 'true'); - paramCount++; - } - // Search filter - if (search) { - query += ` AND (p.name ILIKE $${paramCount} OR p.brand ILIKE $${paramCount} OR p.description ILIKE $${paramCount})`; - params.push(`%${search}%`); - paramCount++; - } - // Brand filter - if (brand) { - query += ` AND p.brand ILIKE $${paramCount}`; - params.push(`%${brand}%`); - paramCount++; - } - // Price range filter - if (min_price) { - query += ` AND p.price >= $${paramCount}`; - params.push(parseFloat(min_price)); - paramCount++; - } - if (max_price) { - query += ` AND p.price <= $${paramCount}`; - params.push(parseFloat(max_price)); - paramCount++; - } - // THC range filter - if (min_thc) { - query += ` AND p.thc_percentage >= $${paramCount}`; - params.push(parseFloat(min_thc)); - paramCount++; - } - if (max_thc) { - query += ` AND p.thc_percentage <= $${paramCount}`; - params.push(parseFloat(max_thc)); - paramCount++; - } - // Strain type filter - if (strain_type) { - query += ` AND p.strain_type = $${paramCount}`; - params.push(strain_type); - paramCount++; - } - // Sorting - query += ` ORDER BY p.${sortField} ${sortDirection} LIMIT $${paramCount} OFFSET $${paramCount + 1}`; - params.push(limit, offset); - const result = await migrate_1.pool.query(query, params); - // Add image URLs - let products = result.rows.map((p) => ({ - ...p, - image_url_full: 
p.local_image_path ? (0, minio_1.getImageUrl)(p.local_image_path) : p.image_url, - thumbnail_url: p.thumbnail_path ? (0, minio_1.getImageUrl)(p.thumbnail_path) : null, - medium_url: p.medium_path ? (0, minio_1.getImageUrl)(p.medium_path) : null, - })); - // Field selection - if (fields) { - const selectedFields = fields.split(',').map(f => f.trim()); - products = products.map((p) => selectFields(p, selectedFields)); - } - // Get total count (reuse same filters) - let countQuery = `SELECT COUNT(*) FROM products p WHERE 1=1`; - const countParams = []; - let countParamCount = 1; - if (store_id) { - countQuery += ` AND p.store_id = $${countParamCount}`; - countParams.push(store_id); - countParamCount++; - } - if (category_id) { - countQuery += ` AND p.category_id = $${countParamCount}`; - countParams.push(category_id); - countParamCount++; - } - if (in_stock !== undefined) { - countQuery += ` AND p.in_stock = $${countParamCount}`; - countParams.push(in_stock === 'true'); - countParamCount++; - } - if (search) { - countQuery += ` AND (p.name ILIKE $${countParamCount} OR p.brand ILIKE $${countParamCount} OR p.description ILIKE $${countParamCount})`; - countParams.push(`%${search}%`); - countParamCount++; - } - if (brand) { - countQuery += ` AND p.brand ILIKE $${countParamCount}`; - countParams.push(`%${brand}%`); - countParamCount++; - } - if (min_price) { - countQuery += ` AND p.price >= $${countParamCount}`; - countParams.push(parseFloat(min_price)); - countParamCount++; - } - if (max_price) { - countQuery += ` AND p.price <= $${countParamCount}`; - countParams.push(parseFloat(max_price)); - countParamCount++; - } - if (min_thc) { - countQuery += ` AND p.thc_percentage >= $${countParamCount}`; - countParams.push(parseFloat(min_thc)); - countParamCount++; - } - if (max_thc) { - countQuery += ` AND p.thc_percentage <= $${countParamCount}`; - countParams.push(parseFloat(max_thc)); - countParamCount++; - } - if (strain_type) { - countQuery += ` AND p.strain_type = 
$${countParamCount}`; - countParams.push(strain_type); - countParamCount++; - } - const countResult = await migrate_1.pool.query(countQuery, countParams); - // Get freshness info if store_id is specified - let freshnessInfo = null; - let storeInfo = null; - if (store_id) { - const storeResult = await migrate_1.pool.query('SELECT id, name, last_scraped_at FROM stores WHERE id = $1', [store_id]); - if (storeResult.rows.length > 0) { - const store = storeResult.rows[0]; - storeInfo = { id: store.id, name: store.name }; - freshnessInfo = calculateFreshness(store.last_scraped_at); - } - } - res.json({ - products, - total: parseInt(countResult.rows[0].count), - limit: parseInt(limit), - offset: parseInt(offset), - // Add freshness metadata when store_id is provided - ...(freshnessInfo && { - store: storeInfo, - last_crawl_at: freshnessInfo.last_crawl_at, - is_stale: freshnessInfo.is_stale, - freshness: freshnessInfo.freshness, - hours_since_crawl: freshnessInfo.hours_since_crawl - }), - filters: { - store_id, - category_id, - in_stock, - search, - brand, - min_price, - max_price, - min_thc, - max_thc, - strain_type, - sort_by: sortField, - sort_order: sortDirection - } - }); - } - catch (error) { - console.error('Error fetching products:', error); - res.status(500).json({ error: 'Failed to fetch products' }); - } -}); -// Get single product with optional field selection -router.get('/:id', async (req, res) => { - try { - const { id } = req.params; - const { fields } = req.query; - const result = await migrate_1.pool.query(` - SELECT p.*, s.name as store_name, c.name as category_name - FROM products p - LEFT JOIN stores s ON p.store_id = s.id - LEFT JOIN categories c ON p.category_id = c.id - WHERE p.id = $1 - `, [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Product not found' }); - } - let product = result.rows[0]; - product.image_url_full = product.local_image_path - ? 
(0, minio_1.getImageUrl)(product.local_image_path) - : product.image_url; - product.thumbnail_url = product.thumbnail_path ? (0, minio_1.getImageUrl)(product.thumbnail_path) : null; - product.medium_url = product.medium_path ? (0, minio_1.getImageUrl)(product.medium_path) : null; - // Field selection - if (fields) { - const selectedFields = fields.split(',').map(f => f.trim()); - product = selectFields(product, selectedFields); - } - res.json({ product }); - } - catch (error) { - console.error('Error fetching product:', error); - res.status(500).json({ error: 'Failed to fetch product' }); - } -}); -// Get available brands (for filter dropdowns) -router.get('/meta/brands', async (req, res) => { - try { - const { store_id } = req.query; - let query = ` - SELECT DISTINCT brand - FROM products - WHERE brand IS NOT NULL AND brand != '' - `; - const params = []; - if (store_id) { - query += ' AND store_id = $1'; - params.push(store_id); - } - query += ' ORDER BY brand'; - const result = await migrate_1.pool.query(query, params); - const brands = result.rows.map((row) => row.brand); - res.json({ brands }); - } - catch (error) { - console.error('Error fetching brands:', error); - res.status(500).json({ error: 'Failed to fetch brands' }); - } -}); -// Get price range (for filter sliders) -router.get('/meta/price-range', async (req, res) => { - try { - const { store_id } = req.query; - let query = ` - SELECT - MIN(price) as min_price, - MAX(price) as max_price, - AVG(price) as avg_price - FROM products - WHERE price IS NOT NULL - `; - const params = []; - if (store_id) { - query += ' AND store_id = $1'; - params.push(store_id); - } - const result = await migrate_1.pool.query(query, params); - res.json({ - min_price: parseFloat(result.rows[0].min_price) || 0, - max_price: parseFloat(result.rows[0].max_price) || 0, - avg_price: parseFloat(result.rows[0].avg_price) || 0 - }); - } - catch (error) { - console.error('Error fetching price range:', error); - res.status(500).json({ 
error: 'Failed to fetch price range' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/proxies.js b/backend/dist/routes/proxies.js deleted file mode 100644 index 24d2d1d2..00000000 --- a/backend/dist/routes/proxies.js +++ /dev/null @@ -1,262 +0,0 @@ -"use strict"; -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const proxy_1 = require("../services/proxy"); -const proxyTestQueue_1 = require("../services/proxyTestQueue"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Get all proxies -router.get('/', async (req, res) => 
{ - try { - const result = await migrate_1.pool.query(` - SELECT id, host, port, protocol, active, is_anonymous, - last_tested_at, test_result, response_time_ms, created_at, - city, state, country, country_code, location_updated_at - FROM proxies - ORDER BY created_at DESC - `); - res.json({ proxies: result.rows }); - } - catch (error) { - console.error('Error fetching proxies:', error); - res.status(500).json({ error: 'Failed to fetch proxies' }); - } -}); -// Get active proxy test job (must be before /:id route) -router.get('/test-job', async (req, res) => { - try { - const job = await (0, proxyTestQueue_1.getActiveProxyTestJob)(); - res.json({ job }); - } - catch (error) { - console.error('Error fetching active job:', error); - res.status(500).json({ error: 'Failed to fetch active job' }); - } -}); -// Get proxy test job status (must be before /:id route) -router.get('/test-job/:jobId', async (req, res) => { - try { - const { jobId } = req.params; - const job = await (0, proxyTestQueue_1.getProxyTestJob)(parseInt(jobId)); - if (!job) { - return res.status(404).json({ error: 'Job not found' }); - } - res.json({ job }); - } - catch (error) { - console.error('Error fetching job status:', error); - res.status(500).json({ error: 'Failed to fetch job status' }); - } -}); -// Get single proxy -router.get('/:id', async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query(` - SELECT id, host, port, protocol, username, active, is_anonymous, - last_tested_at, test_result, response_time_ms, created_at - FROM proxies - WHERE id = $1 - `, [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Proxy not found' }); - } - res.json({ proxy: result.rows[0] }); - } - catch (error) { - console.error('Error fetching proxy:', error); - res.status(500).json({ error: 'Failed to fetch proxy' }); - } -}); -// Add single proxy -router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - 
try { - const { host, port, protocol, username, password } = req.body; - if (!host || !port || !protocol) { - return res.status(400).json({ error: 'Host, port, and protocol required' }); - } - // Test and add proxy - const proxyId = await (0, proxy_1.addProxy)(host, port, protocol, username, password); - const result = await migrate_1.pool.query(` - SELECT * FROM proxies WHERE id = $1 - `, [proxyId]); - res.status(201).json({ proxy: result.rows[0] }); - } - catch (error) { - console.error('Error adding proxy:', error); - res.status(400).json({ error: error.message || 'Failed to add proxy' }); - } -}); -// Add multiple proxies -router.post('/bulk', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { proxies } = req.body; - if (!proxies || !Array.isArray(proxies)) { - return res.status(400).json({ error: 'Proxies array required' }); - } - const result = await (0, proxy_1.addProxiesFromList)(proxies); - res.status(201).json(result); - } - catch (error) { - console.error('Error adding proxies:', error); - res.status(500).json({ error: 'Failed to add proxies' }); - } -}); -// Test single proxy -router.post('/:id/test', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const proxyResult = await migrate_1.pool.query(` - SELECT host, port, protocol, username, password - FROM proxies - WHERE id = $1 - `, [id]); - if (proxyResult.rows.length === 0) { - return res.status(404).json({ error: 'Proxy not found' }); - } - const proxy = proxyResult.rows[0]; - const testResult = await (0, proxy_1.testProxy)(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password); - // Update proxy with test results - await migrate_1.pool.query(` - UPDATE proxies - SET last_tested_at = CURRENT_TIMESTAMP, - test_result = $1, - response_time_ms = $2, - is_anonymous = $3, - active = $4 - WHERE id = $5 - `, [ - testResult.success ? 
'success' : 'failed', - testResult.responseTimeMs, - testResult.isAnonymous, - testResult.success, - id - ]); - res.json({ test_result: testResult }); - } - catch (error) { - console.error('Error testing proxy:', error); - res.status(500).json({ error: 'Failed to test proxy' }); - } -}); -// Start proxy test job -router.post('/test-all', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const jobId = await (0, proxyTestQueue_1.createProxyTestJob)(); - res.json({ jobId, message: 'Proxy test job started' }); - } - catch (error) { - console.error('Error starting proxy test job:', error); - res.status(500).json({ error: 'Failed to start proxy test job' }); - } -}); -// Cancel proxy test job -router.post('/test-job/:jobId/cancel', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { jobId } = req.params; - const cancelled = await (0, proxyTestQueue_1.cancelProxyTestJob)(parseInt(jobId)); - if (!cancelled) { - return res.status(404).json({ error: 'Job not found or already completed' }); - } - res.json({ message: 'Job cancelled successfully' }); - } - catch (error) { - console.error('Error cancelling job:', error); - res.status(500).json({ error: 'Failed to cancel job' }); - } -}); -// Update proxy -router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const { host, port, protocol, username, password, active } = req.body; - const result = await migrate_1.pool.query(` - UPDATE proxies - SET host = COALESCE($1, host), - port = COALESCE($2, port), - protocol = COALESCE($3, protocol), - username = COALESCE($4, username), - password = COALESCE($5, password), - active = COALESCE($6, active), - updated_at = CURRENT_TIMESTAMP - WHERE id = $7 - RETURNING * - `, [host, port, protocol, username, password, active, id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Proxy not found' }); - } - res.json({ 
proxy: result.rows[0] }); - } - catch (error) { - console.error('Error updating proxy:', error); - res.status(500).json({ error: 'Failed to update proxy' }); - } -}); -// Delete proxy -router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query(` - DELETE FROM proxies WHERE id = $1 RETURNING id - `, [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Proxy not found' }); - } - res.json({ message: 'Proxy deleted successfully' }); - } - catch (error) { - console.error('Error deleting proxy:', error); - res.status(500).json({ error: 'Failed to delete proxy' }); - } -}); -// Update all proxy locations -router.post('/update-locations', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { updateAllProxyLocations } = await Promise.resolve().then(() => __importStar(require('../services/geolocation'))); - // Run in background - updateAllProxyLocations().catch(err => { - console.error('❌ Location update failed:', err); - }); - res.json({ message: 'Location update job started' }); - } - catch (error) { - console.error('Error starting location update:', error); - res.status(500).json({ error: 'Failed to start location update' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/public-api.js b/backend/dist/routes/public-api.js deleted file mode 100644 index 88b78aa6..00000000 --- a/backend/dist/routes/public-api.js +++ /dev/null @@ -1,668 +0,0 @@ -"use strict"; -/** - * Public API Routes for External Consumers (WordPress, etc.) - * - * These routes use the dutchie_az data pipeline and are protected by API key auth. - * Designed for Deeply Rooted and other WordPress sites consuming menu data. - */ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const migrate_1 = require("../db/migrate"); -const connection_1 = require("../dutchie-az/db/connection"); -const ipaddr_js_1 = __importDefault(require("ipaddr.js")); -const router = (0, express_1.Router)(); -// ============================================================ -// MIDDLEWARE -// ============================================================ -/** - * Validates if an IP address matches any of the allowed IP patterns - */ -function isIpAllowed(clientIp, allowedIps) { - try { - const clientAddr = ipaddr_js_1.default.process(clientIp); - for (const allowedIp of allowedIps) { - const trimmed = allowedIp.trim(); - if (!trimmed) - continue; - if (trimmed.includes('/')) { - try { - const range = ipaddr_js_1.default.parseCIDR(trimmed); - if (clientAddr.match(range)) { - return true; - } - } - catch (e) { - console.warn(`Invalid CIDR notation: ${trimmed}`); - continue; - } - } - else { - try { - const allowedAddr = ipaddr_js_1.default.process(trimmed); - if (clientAddr.toString() === allowedAddr.toString()) { - return true; - } - } - catch (e) { - console.warn(`Invalid IP address: ${trimmed}`); - continue; - } - } - } - return false; - } - catch (error) { - console.error('Error processing client IP:', error); - return false; - } -} -/** - * Validates if a domain matches any of the allowed domain patterns - */ -function isDomainAllowed(origin, allowedDomains) { - try { - const url = new URL(origin); - const domain = url.hostname; - for (const allowedDomain of allowedDomains) { - const trimmed = allowedDomain.trim(); - if (!trimmed) - continue; - if (trimmed.startsWith('*.')) { - const baseDomain = trimmed.substring(2); - if (domain === baseDomain || domain.endsWith('.' 
+ baseDomain)) { - return true; - } - } - else { - if (domain === trimmed) { - return true; - } - } - } - return false; - } - catch (error) { - console.error('Error processing domain:', error); - return false; - } -} -/** - * Middleware to validate API key and resolve dispensary -> dutchie_az store mapping - */ -async function validatePublicApiKey(req, res, next) { - const apiKey = req.headers['x-api-key']; - if (!apiKey) { - return res.status(401).json({ - error: 'Missing API key', - message: 'Provide your API key in the X-API-Key header' - }); - } - try { - // Query WordPress permissions table with store info - const result = await migrate_1.pool.query(` - SELECT - p.id, - p.user_name, - p.api_key, - p.allowed_ips, - p.allowed_domains, - p.is_active, - p.store_id, - p.store_name - FROM wp_dutchie_api_permissions p - WHERE p.api_key = $1 AND p.is_active = 1 - `, [apiKey]); - if (result.rows.length === 0) { - return res.status(401).json({ - error: 'Invalid API key' - }); - } - const permission = result.rows[0]; - // Validate IP if configured - const clientIp = req.headers['x-forwarded-for']?.split(',')[0].trim() || - req.headers['x-real-ip'] || - req.ip || - req.connection.remoteAddress || - ''; - if (permission.allowed_ips) { - const allowedIps = permission.allowed_ips.split('\n').filter((ip) => ip.trim()); - if (allowedIps.length > 0 && !isIpAllowed(clientIp, allowedIps)) { - return res.status(403).json({ - error: 'IP address not allowed', - client_ip: clientIp - }); - } - } - // Validate domain if configured - const origin = req.get('origin') || req.get('referer') || ''; - if (permission.allowed_domains && origin) { - const allowedDomains = permission.allowed_domains.split('\n').filter((d) => d.trim()); - if (allowedDomains.length > 0 && !isDomainAllowed(origin, allowedDomains)) { - return res.status(403).json({ - error: 'Domain not allowed', - origin: origin - }); - } - } - // Resolve the dutchie_az store for this store - // Match by store name (from main DB) 
to dutchie_az.dispensaries.name - const storeResult = await (0, connection_1.query)(` - SELECT id FROM dispensaries - WHERE LOWER(TRIM(name)) = LOWER(TRIM($1)) - OR LOWER(TRIM(name)) LIKE LOWER(TRIM($1)) || '%' - OR LOWER(TRIM($1)) LIKE LOWER(TRIM(name)) || '%' - ORDER BY - CASE WHEN LOWER(TRIM(name)) = LOWER(TRIM($1)) THEN 0 ELSE 1 END, - id - LIMIT 1 - `, [permission.store_name]); - if (storeResult.rows.length > 0) { - permission.dutchie_az_store_id = storeResult.rows[0].id; - } - // Update last_used_at timestamp (async, don't wait) - migrate_1.pool.query(` - UPDATE wp_dutchie_api_permissions - SET last_used_at = CURRENT_TIMESTAMP - WHERE id = $1 - `, [permission.id]).catch((err) => { - console.error('Error updating last_used_at:', err); - }); - req.apiPermission = permission; - next(); - } - catch (error) { - console.error('Public API validation error:', error); - return res.status(500).json({ - error: 'Internal server error during API validation' - }); - } -} -// Apply middleware to all routes -router.use(validatePublicApiKey); -// ============================================================ -// PRODUCT ENDPOINTS -// ============================================================ -/** - * GET /api/v1/products - * Get products for the authenticated dispensary - * - * Query params: - * - category: Filter by product type (e.g., 'flower', 'edible') - * - brand: Filter by brand name - * - in_stock_only: Only return in-stock products (default: false) - * - limit: Max products to return (default: 100, max: 500) - * - offset: Pagination offset (default: 0) - */ -router.get('/products', async (req, res) => { - try { - const permission = req.apiPermission; - // Check if we have a dutchie_az store mapping - if (!permission.dutchie_az_store_id) { - return res.status(503).json({ - error: 'No menu data available', - message: `Menu data for ${permission.store_name} is not yet available. 
The dispensary may not be set up in the new data pipeline.`, - dispensary_name: permission.store_name - }); - } - const { category, brand, in_stock_only = 'false', limit = '100', offset = '0' } = req.query; - // Build query - let whereClause = 'WHERE p.dispensary_id = $1'; - const params = [permission.dutchie_az_store_id]; - let paramIndex = 2; - // Filter by stock status if requested - if (in_stock_only === 'true' || in_stock_only === '1') { - whereClause += ` AND p.stock_status = 'in_stock'`; - } - // Filter by category (maps to 'type' in dutchie_az) - if (category) { - whereClause += ` AND LOWER(p.type) = LOWER($${paramIndex})`; - params.push(category); - paramIndex++; - } - // Filter by brand - if (brand) { - whereClause += ` AND LOWER(p.brand_name) LIKE LOWER($${paramIndex})`; - params.push(`%${brand}%`); - paramIndex++; - } - // Enforce limits - const limitNum = Math.min(parseInt(limit, 10) || 100, 500); - const offsetNum = parseInt(offset, 10) || 0; - params.push(limitNum, offsetNum); - // Query products with latest snapshot data - const { rows: products } = await (0, connection_1.query)(` - SELECT - p.id, - p.external_product_id as dutchie_id, - p.name, - p.brand_name as brand, - p.type as category, - p.subcategory, - p.strain_type, - p.stock_status, - p.thc, - p.cbd, - p.primary_image_url as image_url, - p.images, - p.effects, - p.created_at, - p.updated_at, - -- Latest snapshot data for pricing - s.rec_min_price_cents, - s.rec_max_price_cents, - s.rec_min_special_price_cents, - s.med_min_price_cents, - s.med_max_price_cents, - s.med_min_special_price_cents, - s.total_quantity_available, - s.options, - s.special, - s.crawled_at as snapshot_at - FROM dutchie_products p - LEFT JOIN LATERAL ( - SELECT * FROM dutchie_product_snapshots - WHERE dutchie_product_id = p.id - ORDER BY crawled_at DESC - LIMIT 1 - ) s ON true - ${whereClause} - ORDER BY p.name ASC - LIMIT $${paramIndex} OFFSET $${paramIndex + 1} - `, params); - // Get total count for pagination - 
const { rows: countRows } = await (0, connection_1.query)(` - SELECT COUNT(*) as total FROM dutchie_products p ${whereClause} - `, params.slice(0, -2)); - // Transform products to backward-compatible format - const transformedProducts = products.map((p) => { - // Extract first image URL from images array - let imageUrl = p.image_url; - if (!imageUrl && p.images && Array.isArray(p.images) && p.images.length > 0) { - const firstImage = p.images[0]; - imageUrl = typeof firstImage === 'string' ? firstImage : firstImage?.url; - } - // Convert prices from cents to dollars - const regularPrice = p.rec_min_price_cents - ? (p.rec_min_price_cents / 100).toFixed(2) - : null; - const salePrice = p.rec_min_special_price_cents - ? (p.rec_min_special_price_cents / 100).toFixed(2) - : null; - return { - id: p.id, - dutchie_id: p.dutchie_id, - name: p.name, - brand: p.brand || null, - category: p.category || null, - subcategory: p.subcategory || null, - strain_type: p.strain_type || null, - description: null, // Not stored in dutchie_products, would need snapshot - regular_price: regularPrice, - sale_price: salePrice, - thc_percentage: p.thc ? parseFloat(p.thc) : null, - cbd_percentage: p.cbd ? 
parseFloat(p.cbd) : null, - image_url: imageUrl || null, - in_stock: p.stock_status === 'in_stock', - on_special: p.special || false, - effects: p.effects || [], - options: p.options || [], - quantity_available: p.total_quantity_available || 0, - created_at: p.created_at, - updated_at: p.updated_at, - snapshot_at: p.snapshot_at - }; - }); - res.json({ - success: true, - dispensary: permission.store_name, - products: transformedProducts, - pagination: { - total: parseInt(countRows[0]?.total || '0', 10), - limit: limitNum, - offset: offsetNum, - has_more: offsetNum + products.length < parseInt(countRows[0]?.total || '0', 10) - } - }); - } - catch (error) { - console.error('Public API products error:', error); - res.status(500).json({ - error: 'Failed to fetch products', - message: error.message - }); - } -}); -/** - * GET /api/v1/products/:id - * Get a single product by ID - */ -router.get('/products/:id', async (req, res) => { - try { - const permission = req.apiPermission; - const { id } = req.params; - if (!permission.dutchie_az_store_id) { - return res.status(503).json({ - error: 'No menu data available', - message: `Menu data for ${permission.store_name} is not yet available.` - }); - } - // Get product with latest snapshot - const { rows: products } = await (0, connection_1.query)(` - SELECT - p.*, - s.rec_min_price_cents, - s.rec_max_price_cents, - s.rec_min_special_price_cents, - s.med_min_price_cents, - s.med_max_price_cents, - s.total_quantity_available, - s.options, - s.special, - s.crawled_at as snapshot_at - FROM dutchie_products p - LEFT JOIN LATERAL ( - SELECT * FROM dutchie_product_snapshots - WHERE dutchie_product_id = p.id - ORDER BY crawled_at DESC - LIMIT 1 - ) s ON true - WHERE p.id = $1 AND p.dispensary_id = $2 - `, [id, permission.dutchie_az_store_id]); - if (products.length === 0) { - return res.status(404).json({ - error: 'Product not found' - }); - } - const p = products[0]; - // Extract first image URL - let imageUrl = p.primary_image_url; 
- if (!imageUrl && p.images && Array.isArray(p.images) && p.images.length > 0) { - const firstImage = p.images[0]; - imageUrl = typeof firstImage === 'string' ? firstImage : firstImage?.url; - } - res.json({ - success: true, - product: { - id: p.id, - dutchie_id: p.external_product_id, - name: p.name, - brand: p.brand_name || null, - category: p.type || null, - subcategory: p.subcategory || null, - strain_type: p.strain_type || null, - regular_price: p.rec_min_price_cents ? (p.rec_min_price_cents / 100).toFixed(2) : null, - sale_price: p.rec_min_special_price_cents ? (p.rec_min_special_price_cents / 100).toFixed(2) : null, - thc_percentage: p.thc ? parseFloat(p.thc) : null, - cbd_percentage: p.cbd ? parseFloat(p.cbd) : null, - image_url: imageUrl || null, - images: p.images || [], - in_stock: p.stock_status === 'in_stock', - on_special: p.special || false, - effects: p.effects || [], - options: p.options || [], - quantity_available: p.total_quantity_available || 0, - created_at: p.created_at, - updated_at: p.updated_at, - snapshot_at: p.snapshot_at - } - }); - } - catch (error) { - console.error('Public API product detail error:', error); - res.status(500).json({ - error: 'Failed to fetch product', - message: error.message - }); - } -}); -/** - * GET /api/v1/categories - * Get all categories for the authenticated dispensary - */ -router.get('/categories', async (req, res) => { - try { - const permission = req.apiPermission; - if (!permission.dutchie_az_store_id) { - return res.status(503).json({ - error: 'No menu data available', - message: `Menu data for ${permission.store_name} is not yet available.` - }); - } - const { rows: categories } = await (0, connection_1.query)(` - SELECT - type as category, - subcategory, - COUNT(*) as product_count, - COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count - FROM dutchie_products - WHERE dispensary_id = $1 AND type IS NOT NULL - GROUP BY type, subcategory - ORDER BY type, subcategory - `, 
[permission.dutchie_az_store_id]); - res.json({ - success: true, - dispensary: permission.store_name, - categories - }); - } - catch (error) { - console.error('Public API categories error:', error); - res.status(500).json({ - error: 'Failed to fetch categories', - message: error.message - }); - } -}); -/** - * GET /api/v1/brands - * Get all brands for the authenticated dispensary - */ -router.get('/brands', async (req, res) => { - try { - const permission = req.apiPermission; - if (!permission.dutchie_az_store_id) { - return res.status(503).json({ - error: 'No menu data available', - message: `Menu data for ${permission.store_name} is not yet available.` - }); - } - const { rows: brands } = await (0, connection_1.query)(` - SELECT - brand_name as brand, - COUNT(*) as product_count, - COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count - FROM dutchie_products - WHERE dispensary_id = $1 AND brand_name IS NOT NULL - GROUP BY brand_name - ORDER BY product_count DESC - `, [permission.dutchie_az_store_id]); - res.json({ - success: true, - dispensary: permission.store_name, - brands - }); - } - catch (error) { - console.error('Public API brands error:', error); - res.status(500).json({ - error: 'Failed to fetch brands', - message: error.message - }); - } -}); -/** - * GET /api/v1/specials - * Get products on special/sale for the authenticated dispensary - */ -router.get('/specials', async (req, res) => { - try { - const permission = req.apiPermission; - if (!permission.dutchie_az_store_id) { - return res.status(503).json({ - error: 'No menu data available', - message: `Menu data for ${permission.store_name} is not yet available.` - }); - } - const { limit = '100', offset = '0' } = req.query; - const limitNum = Math.min(parseInt(limit, 10) || 100, 500); - const offsetNum = parseInt(offset, 10) || 0; - // Get products with special pricing from latest snapshot - const { rows: products } = await (0, connection_1.query)(` - SELECT - p.id, - 
p.external_product_id as dutchie_id, - p.name, - p.brand_name as brand, - p.type as category, - p.subcategory, - p.strain_type, - p.stock_status, - p.primary_image_url as image_url, - s.rec_min_price_cents, - s.rec_min_special_price_cents, - s.special, - s.options, - p.updated_at, - s.crawled_at as snapshot_at - FROM dutchie_products p - INNER JOIN LATERAL ( - SELECT * FROM dutchie_product_snapshots - WHERE dutchie_product_id = p.id - ORDER BY crawled_at DESC - LIMIT 1 - ) s ON true - WHERE p.dispensary_id = $1 - AND s.special = true - AND p.stock_status = 'in_stock' - ORDER BY p.name ASC - LIMIT $2 OFFSET $3 - `, [permission.dutchie_az_store_id, limitNum, offsetNum]); - // Get total count - const { rows: countRows } = await (0, connection_1.query)(` - SELECT COUNT(*) as total - FROM dutchie_products p - INNER JOIN LATERAL ( - SELECT special FROM dutchie_product_snapshots - WHERE dutchie_product_id = p.id - ORDER BY crawled_at DESC - LIMIT 1 - ) s ON true - WHERE p.dispensary_id = $1 - AND s.special = true - AND p.stock_status = 'in_stock' - `, [permission.dutchie_az_store_id]); - const transformedProducts = products.map((p) => ({ - id: p.id, - dutchie_id: p.dutchie_id, - name: p.name, - brand: p.brand || null, - category: p.category || null, - strain_type: p.strain_type || null, - regular_price: p.rec_min_price_cents ? (p.rec_min_price_cents / 100).toFixed(2) : null, - sale_price: p.rec_min_special_price_cents ? 
(p.rec_min_special_price_cents / 100).toFixed(2) : null, - image_url: p.image_url || null, - in_stock: p.stock_status === 'in_stock', - options: p.options || [], - updated_at: p.updated_at, - snapshot_at: p.snapshot_at - })); - res.json({ - success: true, - dispensary: permission.store_name, - specials: transformedProducts, - pagination: { - total: parseInt(countRows[0]?.total || '0', 10), - limit: limitNum, - offset: offsetNum, - has_more: offsetNum + products.length < parseInt(countRows[0]?.total || '0', 10) - } - }); - } - catch (error) { - console.error('Public API specials error:', error); - res.status(500).json({ - error: 'Failed to fetch specials', - message: error.message - }); - } -}); -/** - * GET /api/v1/menu - * Get complete menu summary for the authenticated dispensary - */ -router.get('/menu', async (req, res) => { - try { - const permission = req.apiPermission; - if (!permission.dutchie_az_store_id) { - return res.status(503).json({ - error: 'No menu data available', - message: `Menu data for ${permission.store_name} is not yet available.` - }); - } - // Get counts by category - const { rows: categoryCounts } = await (0, connection_1.query)(` - SELECT - type as category, - COUNT(*) as total, - COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock - FROM dutchie_products - WHERE dispensary_id = $1 AND type IS NOT NULL - GROUP BY type - ORDER BY total DESC - `, [permission.dutchie_az_store_id]); - // Get overall stats - const { rows: stats } = await (0, connection_1.query)(` - SELECT - COUNT(*) as total_products, - COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock_count, - COUNT(DISTINCT brand_name) as brand_count, - COUNT(DISTINCT type) as category_count, - MAX(updated_at) as last_updated - FROM dutchie_products - WHERE dispensary_id = $1 - `, [permission.dutchie_az_store_id]); - // Get specials count - const { rows: specialsCount } = await (0, connection_1.query)(` - SELECT COUNT(*) as count - FROM dutchie_products p - INNER JOIN 
LATERAL ( - SELECT special FROM dutchie_product_snapshots - WHERE dutchie_product_id = p.id - ORDER BY crawled_at DESC - LIMIT 1 - ) s ON true - WHERE p.dispensary_id = $1 - AND s.special = true - AND p.stock_status = 'in_stock' - `, [permission.dutchie_az_store_id]); - const summary = stats[0] || {}; - res.json({ - success: true, - dispensary: permission.store_name, - menu: { - total_products: parseInt(summary.total_products || '0', 10), - in_stock_count: parseInt(summary.in_stock_count || '0', 10), - brand_count: parseInt(summary.brand_count || '0', 10), - category_count: parseInt(summary.category_count || '0', 10), - specials_count: parseInt(specialsCount[0]?.count || '0', 10), - last_updated: summary.last_updated, - categories: categoryCounts.map((c) => ({ - name: c.category, - total: parseInt(c.total, 10), - in_stock: parseInt(c.in_stock, 10) - })) - } - }); - } - catch (error) { - console.error('Public API menu error:', error); - res.status(500).json({ - error: 'Failed to fetch menu summary', - message: error.message - }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/schedule.js b/backend/dist/routes/schedule.js deleted file mode 100644 index 1bad705c..00000000 --- a/backend/dist/routes/schedule.js +++ /dev/null @@ -1,887 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const crawl_scheduler_1 = require("../services/crawl-scheduler"); -const store_crawl_orchestrator_1 = require("../services/store-crawl-orchestrator"); -const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator"); -const migrate_1 = require("../db/migrate"); -const graphql_client_1 = require("../dutchie-az/services/graphql-client"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// ============================================ -// Global Schedule Endpoints -// 
============================================ -/** - * GET /api/schedule/global - * Get global schedule settings - */ -router.get('/global', async (req, res) => { - try { - const schedules = await (0, crawl_scheduler_1.getGlobalSchedule)(); - res.json({ schedules }); - } - catch (error) { - console.error('Error fetching global schedule:', error); - res.status(500).json({ error: 'Failed to fetch global schedule' }); - } -}); -/** - * PUT /api/schedule/global/:type - * Update global schedule setting - */ -router.put('/global/:type', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { type } = req.params; - const { enabled, interval_hours, run_time } = req.body; - if (type !== 'global_interval' && type !== 'daily_special') { - return res.status(400).json({ error: 'Invalid schedule type' }); - } - const schedule = await (0, crawl_scheduler_1.updateGlobalSchedule)(type, { - enabled, - interval_hours, - run_time - }); - // Restart scheduler to apply changes - await (0, crawl_scheduler_1.restartCrawlScheduler)(); - res.json({ schedule, message: 'Schedule updated and scheduler restarted' }); - } - catch (error) { - console.error('Error updating global schedule:', error); - res.status(500).json({ error: 'Failed to update global schedule' }); - } -}); -// ============================================ -// Store Schedule Endpoints -// ============================================ -/** - * GET /api/schedule/stores - * Get all store schedule statuses - */ -router.get('/stores', async (req, res) => { - try { - const stores = await (0, crawl_scheduler_1.getStoreScheduleStatuses)(); - res.json({ stores }); - } - catch (error) { - console.error('Error fetching store schedules:', error); - res.status(500).json({ error: 'Failed to fetch store schedules' }); - } -}); -/** - * GET /api/schedule/stores/:storeId - * Get schedule for a specific store - */ -router.get('/stores/:storeId', async (req, res) => { - try { - const storeId = 
parseInt(req.params.storeId); - if (isNaN(storeId)) { - return res.status(400).json({ error: 'Invalid store ID' }); - } - const schedule = await (0, crawl_scheduler_1.getStoreSchedule)(storeId); - res.json({ schedule }); - } - catch (error) { - console.error('Error fetching store schedule:', error); - res.status(500).json({ error: 'Failed to fetch store schedule' }); - } -}); -/** - * PUT /api/schedule/stores/:storeId - * Update schedule for a specific store - */ -router.put('/stores/:storeId', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const storeId = parseInt(req.params.storeId); - if (isNaN(storeId)) { - return res.status(400).json({ error: 'Invalid store ID' }); - } - const { enabled, interval_hours, daily_special_enabled, daily_special_time, priority } = req.body; - const schedule = await (0, crawl_scheduler_1.updateStoreSchedule)(storeId, { - enabled, - interval_hours, - daily_special_enabled, - daily_special_time, - priority - }); - res.json({ schedule }); - } - catch (error) { - console.error('Error updating store schedule:', error); - res.status(500).json({ error: 'Failed to update store schedule' }); - } -}); -// ============================================ -// Job Queue Endpoints -// ============================================ -/** - * GET /api/schedule/jobs - * Get recent jobs - */ -router.get('/jobs', async (req, res) => { - try { - const limit = parseInt(req.query.limit) || 50; - const jobs = await (0, crawl_scheduler_1.getAllRecentJobs)(Math.min(limit, 200)); - res.json({ jobs }); - } - catch (error) { - console.error('Error fetching jobs:', error); - res.status(500).json({ error: 'Failed to fetch jobs' }); - } -}); -/** - * GET /api/schedule/jobs/store/:storeId - * Get recent jobs for a specific store - */ -router.get('/jobs/store/:storeId', async (req, res) => { - try { - const storeId = parseInt(req.params.storeId); - if (isNaN(storeId)) { - return res.status(400).json({ error: 'Invalid store ID' }); - } 
- const limit = parseInt(req.query.limit) || 10; - const jobs = await (0, crawl_scheduler_1.getRecentJobs)(storeId, Math.min(limit, 100)); - res.json({ jobs }); - } - catch (error) { - console.error('Error fetching store jobs:', error); - res.status(500).json({ error: 'Failed to fetch store jobs' }); - } -}); -/** - * POST /api/schedule/jobs/:jobId/cancel - * Cancel a pending job - */ -router.post('/jobs/:jobId/cancel', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const jobId = parseInt(req.params.jobId); - if (isNaN(jobId)) { - return res.status(400).json({ error: 'Invalid job ID' }); - } - const cancelled = await (0, crawl_scheduler_1.cancelJob)(jobId); - if (cancelled) { - res.json({ success: true, message: 'Job cancelled' }); - } - else { - res.status(400).json({ error: 'Job could not be cancelled (may not be pending)' }); - } - } - catch (error) { - console.error('Error cancelling job:', error); - res.status(500).json({ error: 'Failed to cancel job' }); - } -}); -// ============================================ -// Manual Trigger Endpoints -// ============================================ -/** - * POST /api/schedule/trigger/store/:storeId - * Manually trigger orchestrated crawl for a specific store - * Uses the intelligent orchestrator which: - * - Checks provider detection status - * - Runs detection if needed - * - Queues appropriate crawl type (production/sandbox) - */ -router.post('/trigger/store/:storeId', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const storeId = parseInt(req.params.storeId); - if (isNaN(storeId)) { - return res.status(400).json({ error: 'Invalid store ID' }); - } - // Use the orchestrator instead of simple triggerManualCrawl - const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId); - res.json({ - result, - message: result.summary, - success: result.status === 'success' || result.status === 'sandbox_only', - }); - } - catch 
(error) { - console.error('Error triggering orchestrated crawl:', error); - res.status(500).json({ error: 'Failed to trigger crawl' }); - } -}); -/** - * POST /api/schedule/trigger/store/:storeId/legacy - * Legacy: Simple job queue trigger (no orchestration) - */ -router.post('/trigger/store/:storeId/legacy', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const storeId = parseInt(req.params.storeId); - if (isNaN(storeId)) { - return res.status(400).json({ error: 'Invalid store ID' }); - } - const job = await (0, crawl_scheduler_1.triggerManualCrawl)(storeId); - res.json({ job, message: 'Crawl job created' }); - } - catch (error) { - console.error('Error triggering manual crawl:', error); - res.status(500).json({ error: 'Failed to trigger crawl' }); - } -}); -/** - * POST /api/schedule/trigger/all - * Manually trigger crawls for all stores - */ -router.post('/trigger/all', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const jobsCreated = await (0, crawl_scheduler_1.triggerAllStoresCrawl)(); - res.json({ jobs_created: jobsCreated, message: `Created ${jobsCreated} crawl jobs` }); - } - catch (error) { - console.error('Error triggering all crawls:', error); - res.status(500).json({ error: 'Failed to trigger crawls' }); - } -}); -/** - * POST /api/schedule/restart - * Restart the scheduler - */ -router.post('/restart', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - await (0, crawl_scheduler_1.restartCrawlScheduler)(); - res.json({ message: 'Scheduler restarted', mode: (0, crawl_scheduler_1.getSchedulerMode)() }); - } - catch (error) { - console.error('Error restarting scheduler:', error); - res.status(500).json({ error: 'Failed to restart scheduler' }); - } -}); -// ============================================ -// Scheduler Mode Endpoints -// ============================================ -/** - * GET /api/schedule/mode - * Get current scheduler mode - */ 
-router.get('/mode', async (req, res) => { - try { - const mode = (0, crawl_scheduler_1.getSchedulerMode)(); - res.json({ mode }); - } - catch (error) { - console.error('Error getting scheduler mode:', error); - res.status(500).json({ error: 'Failed to get scheduler mode' }); - } -}); -/** - * PUT /api/schedule/mode - * Set scheduler mode (legacy or orchestrator) - */ -router.put('/mode', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { mode } = req.body; - if (mode !== 'legacy' && mode !== 'orchestrator') { - return res.status(400).json({ error: 'Invalid mode. Must be "legacy" or "orchestrator"' }); - } - (0, crawl_scheduler_1.setSchedulerMode)(mode); - // Restart scheduler with new mode - await (0, crawl_scheduler_1.restartCrawlScheduler)(); - res.json({ mode, message: `Scheduler mode set to ${mode} and restarted` }); - } - catch (error) { - console.error('Error setting scheduler mode:', error); - res.status(500).json({ error: 'Failed to set scheduler mode' }); - } -}); -/** - * GET /api/schedule/due - * Get stores that are due for orchestration - */ -router.get('/due', async (req, res) => { - try { - const limit = parseInt(req.query.limit) || 10; - const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(Math.min(limit, 50)); - res.json({ stores_due: storeIds, count: storeIds.length }); - } - catch (error) { - console.error('Error getting stores due for orchestration:', error); - res.status(500).json({ error: 'Failed to get stores due' }); - } -}); -// ============================================ -// Dispensary Schedule Endpoints (NEW - dispensary-centric) -// ============================================ -/** - * GET /api/schedule/dispensaries - * Get all dispensary schedule statuses with optional filters - * Query params: - * - state: filter by state (e.g., 'AZ') - * - search: search by name or slug - */ -router.get('/dispensaries', async (req, res) => { - try { - const { state, search } = 
req.query; - // Build dynamic query with optional filters - const conditions = []; - const params = []; - let paramIndex = 1; - if (state) { - conditions.push(`d.state = $${paramIndex}`); - params.push(state); - paramIndex++; - } - if (search) { - conditions.push(`(d.name ILIKE $${paramIndex} OR d.slug ILIKE $${paramIndex} OR d.dba_name ILIKE $${paramIndex})`); - params.push(`%${search}%`); - paramIndex++; - } - const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : ''; - const query = ` - SELECT - d.id AS dispensary_id, - COALESCE(d.dba_name, d.name) AS dispensary_name, - d.slug AS dispensary_slug, - d.city, - d.state, - d.menu_url, - d.menu_type, - d.platform_dispensary_id, - d.scrape_enabled, - d.last_crawl_at, - d.crawl_status, - d.product_crawler_mode, - d.product_provider, - cs.interval_minutes, - cs.is_active, - cs.priority, - cs.last_run_at, - cs.next_run_at, - cs.last_status AS schedule_last_status, - cs.last_error AS schedule_last_error, - cs.consecutive_failures, - j.id AS latest_job_id, - j.status AS latest_job_status, - j.job_type AS latest_job_type, - j.started_at AS latest_job_started, - j.completed_at AS latest_job_completed, - j.products_found AS latest_products_found, - j.products_new AS latest_products_created, - j.products_updated AS latest_products_updated, - j.error_message AS latest_job_error, - CASE - WHEN d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL THEN true - ELSE false - END AS can_crawl, - CASE - WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'menu_type not detected' - WHEN d.menu_type != 'dutchie' THEN 'not dutchie platform' - WHEN d.platform_dispensary_id IS NULL THEN 'platform ID not resolved' - WHEN d.scrape_enabled = false THEN 'scraping disabled' - ELSE 'ready' - END AS schedule_status_reason - FROM public.dispensaries d - LEFT JOIN public.dispensary_crawl_schedule cs ON cs.dispensary_id = d.id - LEFT JOIN LATERAL ( - SELECT * - FROM public.dispensary_crawl_jobs dj - WHERE 
dj.dispensary_id = d.id - ORDER BY dj.created_at DESC - LIMIT 1 - ) j ON true - ${whereClause} - ORDER BY cs.priority DESC NULLS LAST, COALESCE(d.dba_name, d.name) - `; - const result = await migrate_1.pool.query(query, params); - res.json({ dispensaries: result.rows }); - } - catch (error) { - console.error('Error fetching dispensary schedules:', error); - res.status(500).json({ error: 'Failed to fetch dispensary schedules' }); - } -}); -/** - * GET /api/schedule/dispensaries/:id - * Get schedule for a specific dispensary - */ -router.get('/dispensaries/:id', async (req, res) => { - try { - const dispensaryId = parseInt(req.params.id); - if (isNaN(dispensaryId)) { - return res.status(400).json({ error: 'Invalid dispensary ID' }); - } - const result = await migrate_1.pool.query(` - SELECT * FROM dispensary_crawl_status - WHERE dispensary_id = $1 - `, [dispensaryId]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - res.json({ schedule: result.rows[0] }); - } - catch (error) { - console.error('Error fetching dispensary schedule:', error); - res.status(500).json({ error: 'Failed to fetch dispensary schedule' }); - } -}); -/** - * PUT /api/schedule/dispensaries/:id - * Update schedule for a specific dispensary - */ -router.put('/dispensaries/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const dispensaryId = parseInt(req.params.id); - if (isNaN(dispensaryId)) { - return res.status(400).json({ error: 'Invalid dispensary ID' }); - } - const { is_active, interval_minutes, priority } = req.body; - // Upsert schedule - const result = await migrate_1.pool.query(` - INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority) - VALUES ($1, COALESCE($2, TRUE), COALESCE($3, 240), COALESCE($4, 0)) - ON CONFLICT (dispensary_id) DO UPDATE SET - is_active = COALESCE($2, dispensary_crawl_schedule.is_active), - interval_minutes = COALESCE($3, 
dispensary_crawl_schedule.interval_minutes), - priority = COALESCE($4, dispensary_crawl_schedule.priority), - updated_at = NOW() - RETURNING * - `, [dispensaryId, is_active, interval_minutes, priority]); - res.json({ schedule: result.rows[0] }); - } - catch (error) { - console.error('Error updating dispensary schedule:', error); - res.status(500).json({ error: 'Failed to update dispensary schedule' }); - } -}); -/** - * GET /api/schedule/dispensary-jobs - * Get recent dispensary crawl jobs - */ -router.get('/dispensary-jobs', async (req, res) => { - try { - const limit = parseInt(req.query.limit) || 50; - const result = await migrate_1.pool.query(` - SELECT dcj.*, d.name as dispensary_name - FROM dispensary_crawl_jobs dcj - JOIN dispensaries d ON d.id = dcj.dispensary_id - ORDER BY dcj.created_at DESC - LIMIT $1 - `, [Math.min(limit, 200)]); - res.json({ jobs: result.rows }); - } - catch (error) { - console.error('Error fetching dispensary jobs:', error); - res.status(500).json({ error: 'Failed to fetch dispensary jobs' }); - } -}); -/** - * GET /api/schedule/dispensary-jobs/:dispensaryId - * Get recent jobs for a specific dispensary - */ -router.get('/dispensary-jobs/:dispensaryId', async (req, res) => { - try { - const dispensaryId = parseInt(req.params.dispensaryId); - if (isNaN(dispensaryId)) { - return res.status(400).json({ error: 'Invalid dispensary ID' }); - } - const limit = parseInt(req.query.limit) || 10; - const result = await migrate_1.pool.query(` - SELECT dcj.*, d.name as dispensary_name - FROM dispensary_crawl_jobs dcj - JOIN dispensaries d ON d.id = dcj.dispensary_id - WHERE dcj.dispensary_id = $1 - ORDER BY dcj.created_at DESC - LIMIT $2 - `, [dispensaryId, Math.min(limit, 100)]); - res.json({ jobs: result.rows }); - } - catch (error) { - console.error('Error fetching dispensary jobs:', error); - res.status(500).json({ error: 'Failed to fetch dispensary jobs' }); - } -}); -/** - * POST /api/schedule/trigger/dispensary/:id - * Trigger orchestrator 
for a specific dispensary (Run Now button) - */ -router.post('/trigger/dispensary/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const dispensaryId = parseInt(req.params.id); - if (isNaN(dispensaryId)) { - return res.status(400).json({ error: 'Invalid dispensary ID' }); - } - // Run the dispensary orchestrator - const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(dispensaryId); - res.json({ - result, - message: result.summary, - success: result.status === 'success' || result.status === 'sandbox_only' || result.status === 'detection_only', - }); - } - catch (error) { - console.error('Error triggering dispensary orchestrator:', error); - res.status(500).json({ error: 'Failed to trigger orchestrator' }); - } -}); -/** - * POST /api/schedule/trigger/dispensaries/batch - * Trigger orchestrator for multiple dispensaries - */ -router.post('/trigger/dispensaries/batch', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { dispensary_ids, concurrency } = req.body; - if (!Array.isArray(dispensary_ids) || dispensary_ids.length === 0) { - return res.status(400).json({ error: 'dispensary_ids must be a non-empty array' }); - } - const results = await (0, dispensary_orchestrator_1.runBatchDispensaryOrchestrator)(dispensary_ids, concurrency || 3); - const summary = { - total: results.length, - success: results.filter(r => r.status === 'success').length, - sandbox_only: results.filter(r => r.status === 'sandbox_only').length, - detection_only: results.filter(r => r.status === 'detection_only').length, - error: results.filter(r => r.status === 'error').length, - }; - res.json({ results, summary }); - } - catch (error) { - console.error('Error triggering batch orchestrator:', error); - res.status(500).json({ error: 'Failed to trigger batch orchestrator' }); - } -}); -/** - * GET /api/schedule/dispensary-due - * Get dispensaries that are due for orchestration - */ 
-router.get('/dispensary-due', async (req, res) => { - try { - const limit = parseInt(req.query.limit) || 10; - const dispensaryIds = await (0, dispensary_orchestrator_1.getDispensariesDueForOrchestration)(Math.min(limit, 50)); - // Get details for the due dispensaries - if (dispensaryIds.length > 0) { - const details = await migrate_1.pool.query(` - SELECT d.id, d.name, d.product_provider, d.product_crawler_mode, - dcs.next_run_at, dcs.last_status, dcs.priority - FROM dispensaries d - LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id - WHERE d.id = ANY($1) - ORDER BY COALESCE(dcs.priority, 0) DESC, dcs.last_run_at ASC NULLS FIRST - `, [dispensaryIds]); - res.json({ dispensaries_due: details.rows, count: dispensaryIds.length }); - } - else { - res.json({ dispensaries_due: [], count: 0 }); - } - } - catch (error) { - console.error('Error getting dispensaries due for orchestration:', error); - res.status(500).json({ error: 'Failed to get dispensaries due' }); - } -}); -/** - * POST /api/schedule/dispensaries/bootstrap - * Ensure all dispensaries have schedule entries - */ -router.post('/dispensaries/bootstrap', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { interval_minutes } = req.body; - const result = await (0, dispensary_orchestrator_1.ensureAllDispensariesHaveSchedules)(interval_minutes || 240); - res.json({ - message: `Created ${result.created} new schedules, ${result.existing} already existed`, - created: result.created, - existing: result.existing, - }); - } - catch (error) { - console.error('Error bootstrapping dispensary schedules:', error); - res.status(500).json({ error: 'Failed to bootstrap schedules' }); - } -}); -// ============================================ -// Platform ID & Menu Type Detection Endpoints -// ============================================ -/** - * POST /api/schedule/dispensaries/:id/resolve-platform-id - * Resolve the Dutchie platform_dispensary_id from menu_url slug - */ 
-router.post('/dispensaries/:id/resolve-platform-id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const dispensaryId = parseInt(req.params.id); - if (isNaN(dispensaryId)) { - return res.status(400).json({ error: 'Invalid dispensary ID' }); - } - // Get dispensary info - const dispensaryResult = await migrate_1.pool.query(` - SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id - FROM dispensaries WHERE id = $1 - `, [dispensaryId]); - if (dispensaryResult.rows.length === 0) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - const dispensary = dispensaryResult.rows[0]; - // Check if already resolved - if (dispensary.platform_dispensary_id) { - return res.json({ - success: true, - message: 'Platform ID already resolved', - platform_dispensary_id: dispensary.platform_dispensary_id, - already_resolved: true - }); - } - // Extract slug from menu_url for Dutchie URLs - let slugToResolve = dispensary.slug; - if (dispensary.menu_url) { - // Match embedded-menu or dispensary URLs - const match = dispensary.menu_url.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i); - if (match) { - slugToResolve = match[1]; - } - } - if (!slugToResolve) { - return res.status(400).json({ - error: 'No slug available to resolve platform ID', - menu_url: dispensary.menu_url - }); - } - console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`); - // Resolve platform ID using GraphQL client - const platformId = await (0, graphql_client_1.resolveDispensaryId)(slugToResolve); - if (!platformId) { - return res.status(404).json({ - error: 'Could not resolve platform ID', - slug_tried: slugToResolve, - message: 'The dispensary might not be on Dutchie or the slug is incorrect' - }); - } - // Update the dispensary with resolved platform ID - await migrate_1.pool.query(` - UPDATE dispensaries - SET platform_dispensary_id = $1, - menu_type = COALESCE(menu_type, 'dutchie'), - 
updated_at = NOW() - WHERE id = $2 - `, [platformId, dispensaryId]); - res.json({ - success: true, - platform_dispensary_id: platformId, - slug_resolved: slugToResolve, - message: `Platform ID resolved: ${platformId}` - }); - } - catch (error) { - console.error('Error resolving platform ID:', error); - res.status(500).json({ error: 'Failed to resolve platform ID', details: error.message }); - } -}); -/** - * POST /api/schedule/dispensaries/:id/detect-menu-type - * Detect menu type from menu_url - */ -router.post('/dispensaries/:id/detect-menu-type', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const dispensaryId = parseInt(req.params.id); - if (isNaN(dispensaryId)) { - return res.status(400).json({ error: 'Invalid dispensary ID' }); - } - // Get dispensary info - const dispensaryResult = await migrate_1.pool.query(` - SELECT id, name, menu_url, website FROM dispensaries WHERE id = $1 - `, [dispensaryId]); - if (dispensaryResult.rows.length === 0) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - const dispensary = dispensaryResult.rows[0]; - const urlToCheck = dispensary.menu_url || dispensary.website; - if (!urlToCheck) { - return res.status(400).json({ error: 'No menu_url or website to detect from' }); - } - // Detect menu type from URL patterns - let detectedType = 'unknown'; - if (urlToCheck.includes('dutchie.com') || urlToCheck.includes('embedded-menu')) { - detectedType = 'dutchie'; - } - else if (urlToCheck.includes('iheartjane.com') || urlToCheck.includes('jane.co')) { - detectedType = 'jane'; - } - else if (urlToCheck.includes('weedmaps.com')) { - detectedType = 'weedmaps'; - } - else if (urlToCheck.includes('leafly.com')) { - detectedType = 'leafly'; - } - else if (urlToCheck.includes('treez.io') || urlToCheck.includes('treez.co')) { - detectedType = 'treez'; - } - else if (urlToCheck.includes('meadow.com')) { - detectedType = 'meadow'; - } - else if (urlToCheck.includes('blaze.me') || 
urlToCheck.includes('blazepay')) { - detectedType = 'blaze'; - } - else if (urlToCheck.includes('flowhub.com')) { - detectedType = 'flowhub'; - } - else if (urlToCheck.includes('dispense.app')) { - detectedType = 'dispense'; - } - else if (urlToCheck.includes('covasoft.com')) { - detectedType = 'cova'; - } - // Update menu_type - await migrate_1.pool.query(` - UPDATE dispensaries - SET menu_type = $1, updated_at = NOW() - WHERE id = $2 - `, [detectedType, dispensaryId]); - res.json({ - success: true, - menu_type: detectedType, - url_checked: urlToCheck, - message: `Menu type detected: ${detectedType}` - }); - } - catch (error) { - console.error('Error detecting menu type:', error); - res.status(500).json({ error: 'Failed to detect menu type' }); - } -}); -/** - * POST /api/schedule/dispensaries/:id/refresh-detection - * Combined: detect menu_type AND resolve platform_dispensary_id if dutchie - */ -router.post('/dispensaries/:id/refresh-detection', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const dispensaryId = parseInt(req.params.id); - if (isNaN(dispensaryId)) { - return res.status(400).json({ error: 'Invalid dispensary ID' }); - } - // Get dispensary info - const dispensaryResult = await migrate_1.pool.query(` - SELECT id, name, slug, menu_url, website FROM dispensaries WHERE id = $1 - `, [dispensaryId]); - if (dispensaryResult.rows.length === 0) { - return res.status(404).json({ error: 'Dispensary not found' }); - } - const dispensary = dispensaryResult.rows[0]; - const urlToCheck = dispensary.menu_url || dispensary.website; - if (!urlToCheck) { - return res.status(400).json({ error: 'No menu_url or website to detect from' }); - } - // Detect menu type from URL patterns - let detectedType = 'unknown'; - if (urlToCheck.includes('dutchie.com') || urlToCheck.includes('embedded-menu')) { - detectedType = 'dutchie'; - } - else if (urlToCheck.includes('iheartjane.com') || urlToCheck.includes('jane.co')) { - detectedType = 
'jane'; - } - else if (urlToCheck.includes('weedmaps.com')) { - detectedType = 'weedmaps'; - } - else if (urlToCheck.includes('leafly.com')) { - detectedType = 'leafly'; - } - else if (urlToCheck.includes('treez.io') || urlToCheck.includes('treez.co')) { - detectedType = 'treez'; - } - else if (urlToCheck.includes('meadow.com')) { - detectedType = 'meadow'; - } - else if (urlToCheck.includes('blaze.me') || urlToCheck.includes('blazepay')) { - detectedType = 'blaze'; - } - else if (urlToCheck.includes('flowhub.com')) { - detectedType = 'flowhub'; - } - else if (urlToCheck.includes('dispense.app')) { - detectedType = 'dispense'; - } - else if (urlToCheck.includes('covasoft.com')) { - detectedType = 'cova'; - } - // Update menu_type first - await migrate_1.pool.query(` - UPDATE dispensaries SET menu_type = $1, updated_at = NOW() WHERE id = $2 - `, [detectedType, dispensaryId]); - let platformId = null; - // If dutchie, also try to resolve platform ID - if (detectedType === 'dutchie') { - let slugToResolve = dispensary.slug; - const match = urlToCheck.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i); - if (match) { - slugToResolve = match[1]; - } - if (slugToResolve) { - try { - console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`); - platformId = await (0, graphql_client_1.resolveDispensaryId)(slugToResolve); - if (platformId) { - await migrate_1.pool.query(` - UPDATE dispensaries SET platform_dispensary_id = $1, updated_at = NOW() WHERE id = $2 - `, [platformId, dispensaryId]); - } - } - catch (err) { - console.warn(`[Schedule] Failed to resolve platform ID: ${err.message}`); - } - } - } - res.json({ - success: true, - menu_type: detectedType, - platform_dispensary_id: platformId, - url_checked: urlToCheck, - can_crawl: detectedType === 'dutchie' && !!platformId - }); - } - catch (error) { - console.error('Error refreshing detection:', error); - res.status(500).json({ error: 'Failed to refresh detection' }); - 
} -}); -/** - * PUT /api/schedule/dispensaries/:id/toggle-active - * Enable or disable schedule for a dispensary - */ -router.put('/dispensaries/:id/toggle-active', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const dispensaryId = parseInt(req.params.id); - if (isNaN(dispensaryId)) { - return res.status(400).json({ error: 'Invalid dispensary ID' }); - } - const { is_active } = req.body; - // Upsert schedule with new is_active value - const result = await migrate_1.pool.query(` - INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority) - VALUES ($1, $2, 240, 0) - ON CONFLICT (dispensary_id) DO UPDATE SET - is_active = $2, - updated_at = NOW() - RETURNING * - `, [dispensaryId, is_active]); - res.json({ - success: true, - schedule: result.rows[0], - message: is_active ? 'Schedule enabled' : 'Schedule disabled' - }); - } - catch (error) { - console.error('Error toggling schedule active status:', error); - res.status(500).json({ error: 'Failed to toggle schedule' }); - } -}); -/** - * DELETE /api/schedule/dispensaries/:id/schedule - * Delete schedule for a dispensary - */ -router.delete('/dispensaries/:id/schedule', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const dispensaryId = parseInt(req.params.id); - if (isNaN(dispensaryId)) { - return res.status(400).json({ error: 'Invalid dispensary ID' }); - } - const result = await migrate_1.pool.query(` - DELETE FROM dispensary_crawl_schedule WHERE dispensary_id = $1 RETURNING id - `, [dispensaryId]); - const deleted = (result.rowCount ?? 0) > 0; - res.json({ - success: true, - deleted, - message: deleted ? 
'Schedule deleted' : 'No schedule to delete' - }); - } - catch (error) { - console.error('Error deleting schedule:', error); - res.status(500).json({ error: 'Failed to delete schedule' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/scraper-monitor.js b/backend/dist/routes/scraper-monitor.js deleted file mode 100644 index 62bd924b..00000000 --- a/backend/dist/routes/scraper-monitor.js +++ /dev/null @@ -1,349 +0,0 @@ -"use strict"; -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? 
(function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -Object.defineProperty(exports, "__esModule", { value: true }); -exports.activeScrapers = void 0; -exports.registerScraper = registerScraper; -exports.updateScraperStats = updateScraperStats; -exports.completeScraper = completeScraper; -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -exports.activeScrapers = new Map(); -// Get all active scrapers -router.get('/active', async (req, res) => { - try { - const scrapers = Array.from(exports.activeScrapers.values()).map(scraper => ({ - ...scraper, - duration: Date.now() - scraper.startTime.getTime(), - isStale: Date.now() - scraper.lastUpdate.getTime() > 60000 // 1 minute - })); - res.json({ scrapers }); - } - catch (error) { - console.error('Error fetching active scrapers:', error); - res.status(500).json({ error: 'Failed to fetch active scrapers' }); - } -}); -// Get scraper by ID -router.get('/active/:id', async (req, res) => { - try { - const { id } = req.params; - const scraper = exports.activeScrapers.get(id); - if (!scraper) { - return res.status(404).json({ error: 'Scraper not found' }); - } - res.json({ - scraper: { - ...scraper, - 
duration: Date.now() - scraper.startTime.getTime(), - isStale: Date.now() - scraper.lastUpdate.getTime() > 60000 - } - }); - } - catch (error) { - console.error('Error fetching scraper:', error); - res.status(500).json({ error: 'Failed to fetch scraper' }); - } -}); -// Get scraper history (last 50 completed scrapes) -router.get('/history', async (req, res) => { - try { - const { limit = 50, dispensary_id } = req.query; - let query = ` - SELECT - d.id as dispensary_id, - COALESCE(d.dba_name, d.name) as dispensary_name, - d.city, - d.state, - dcj.id as job_id, - dcj.job_type, - dcj.status, - dcj.products_found, - dcj.products_new, - dcj.products_updated, - dcj.in_stock_count, - dcj.out_of_stock_count, - dcj.duration_ms, - dcj.completed_at as last_scraped_at, - dcj.error_message, - ( - SELECT COUNT(*) - FROM products p - WHERE p.dispensary_id = d.id - AND p.last_seen_at >= NOW() - INTERVAL '7 days' - ) as product_count - FROM dispensary_crawl_jobs dcj - JOIN dispensaries d ON d.id = dcj.dispensary_id - WHERE dcj.completed_at IS NOT NULL - `; - const params = []; - let paramCount = 1; - if (dispensary_id) { - query += ` AND d.id = $${paramCount}`; - params.push(dispensary_id); - paramCount++; - } - query += ` ORDER BY dcj.completed_at DESC LIMIT $${paramCount}`; - params.push(limit); - const result = await migrate_1.pool.query(query, params); - res.json({ history: result.rows }); - } - catch (error) { - console.error('Error fetching scraper history:', error); - res.status(500).json({ error: 'Failed to fetch scraper history' }); - } -}); -// Helper function to register a scraper -function registerScraper(id, storeId, storeName, categoryId, categoryName) { - exports.activeScrapers.set(id, { - id, - storeId, - storeName, - categoryId, - categoryName, - startTime: new Date(), - lastUpdate: new Date(), - status: 'running', - stats: { - requestsTotal: 0, - requestsSuccess: 0, - itemsSaved: 0, - itemsDropped: 0, - errorsCount: 0 - } - }); -} -// Helper function to update 
scraper stats -function updateScraperStats(id, stats, currentActivity) { - const scraper = exports.activeScrapers.get(id); - if (scraper) { - scraper.stats = { ...scraper.stats, ...stats }; - scraper.lastUpdate = new Date(); - if (currentActivity) { - scraper.currentActivity = currentActivity; - } - } -} -// Helper function to mark scraper as completed -function completeScraper(id, error) { - const scraper = exports.activeScrapers.get(id); - if (scraper) { - scraper.status = error ? 'error' : 'completed'; - scraper.lastUpdate = new Date(); - // Remove after 5 minutes - setTimeout(() => { - exports.activeScrapers.delete(id); - }, 5 * 60 * 1000); - } -} -// Dispensary crawl jobs endpoints -router.get('/jobs/stats', async (req, res) => { - try { - const { dispensary_id } = req.query; - let whereClause = ''; - const params = []; - if (dispensary_id) { - whereClause = 'WHERE dispensary_id = $1'; - params.push(dispensary_id); - } - const result = await migrate_1.pool.query(` - SELECT - status, - COUNT(*) as count, - SUM(products_found) as total_products_found, - SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved - FROM dispensary_crawl_jobs - ${whereClause} - GROUP BY status - `, params); - const stats = { - pending: 0, - in_progress: 0, - completed: 0, - failed: 0, - total_products_found: 0, - total_products_saved: 0 - }; - result.rows.forEach((row) => { - stats[row.status] = parseInt(row.count); - if (row.status === 'completed') { - stats.total_products_found += parseInt(row.total_products_found || '0'); - stats.total_products_saved += parseInt(row.total_products_saved || '0'); - } - }); - res.json(stats); - } - catch (error) { - console.error('Error fetching job stats:', error); - res.status(500).json({ error: 'Failed to fetch job stats' }); - } -}); -router.get('/jobs/active', async (req, res) => { - try { - const { dispensary_id } = req.query; - let whereClause = "WHERE dcj.status = 'in_progress'"; - const params = []; - let 
paramCount = 1; - if (dispensary_id) { - whereClause += ` AND dcj.dispensary_id = $${paramCount}`; - params.push(dispensary_id); - paramCount++; - } - const result = await migrate_1.pool.query(` - SELECT - dcj.id, - dcj.dispensary_id, - COALESCE(d.dba_name, d.name) as dispensary_name, - dcj.job_type, - dcj.status, - dcj.worker_id, - dcj.started_at, - dcj.products_found, - COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved, - EXTRACT(EPOCH FROM (NOW() - dcj.started_at)) as duration_seconds - FROM dispensary_crawl_jobs dcj - JOIN dispensaries d ON d.id = dcj.dispensary_id - ${whereClause} - ORDER BY dcj.started_at DESC - `, params); - res.json({ jobs: result.rows }); - } - catch (error) { - console.error('Error fetching active jobs:', error); - res.status(500).json({ error: 'Failed to fetch active jobs' }); - } -}); -router.get('/jobs/recent', async (req, res) => { - try { - const { limit = 50, dispensary_id, status } = req.query; - let whereClause = ''; - const params = []; - let paramCount = 1; - const conditions = []; - if (dispensary_id) { - conditions.push(`dcj.dispensary_id = $${paramCount}`); - params.push(dispensary_id); - paramCount++; - } - if (status) { - conditions.push(`dcj.status = $${paramCount}`); - params.push(status); - paramCount++; - } - if (conditions.length > 0) { - whereClause = 'WHERE ' + conditions.join(' AND '); - } - params.push(limit); - const result = await migrate_1.pool.query(` - SELECT - dcj.id, - dcj.dispensary_id, - COALESCE(d.dba_name, d.name) as dispensary_name, - dcj.job_type, - dcj.status, - dcj.worker_id, - dcj.started_at, - dcj.completed_at, - dcj.products_found, - COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved, - dcj.error_message, - EXTRACT(EPOCH FROM (COALESCE(dcj.completed_at, NOW()) - dcj.started_at)) as duration_seconds - FROM dispensary_crawl_jobs dcj - JOIN dispensaries d ON d.id = dcj.dispensary_id - ${whereClause} - ORDER BY dcj.created_at DESC - 
LIMIT $${paramCount} - `, params); - res.json({ jobs: result.rows }); - } - catch (error) { - console.error('Error fetching recent jobs:', error); - res.status(500).json({ error: 'Failed to fetch recent jobs' }); - } -}); -router.get('/jobs/workers', async (req, res) => { - try { - const { dispensary_id } = req.query; - let whereClause = "WHERE status = 'in_progress' AND worker_id IS NOT NULL"; - const params = []; - if (dispensary_id) { - whereClause += ` AND dispensary_id = $1`; - params.push(dispensary_id); - } - const result = await migrate_1.pool.query(` - SELECT - worker_id, - COUNT(*) as active_jobs, - SUM(products_found) as total_products_found, - SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved, - MIN(started_at) as earliest_start, - MAX(started_at) as latest_start - FROM dispensary_crawl_jobs - ${whereClause} - GROUP BY worker_id - ORDER BY worker_id - `, params); - res.json({ workers: result.rows }); - } - catch (error) { - console.error('Error fetching worker stats:', error); - res.status(500).json({ error: 'Failed to fetch worker stats' }); - } -}); -router.get('/jobs/worker-logs/:workerId', async (req, res) => { - try { - const { workerId } = req.params; - const fs = await Promise.resolve().then(() => __importStar(require('fs/promises'))); - const path = await Promise.resolve().then(() => __importStar(require('path'))); - const logPath = path.join('/tmp', `worker-${workerId}.log`); - try { - const logs = await fs.readFile(logPath, 'utf-8'); - const lines = logs.split('\n'); - // Return last 100 lines - const recentLogs = lines.slice(-100).join('\n'); - res.json({ logs: recentLogs }); - } - catch (fileError) { - res.json({ logs: 'No logs available for this worker yet.' 
}); - } - } - catch (error) { - console.error('Failed to get worker logs:', error); - res.status(500).json({ error: 'Failed to get worker logs' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/settings.js b/backend/dist/routes/settings.js deleted file mode 100644 index efcf4b64..00000000 --- a/backend/dist/routes/settings.js +++ /dev/null @@ -1,118 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const scheduler_1 = require("../services/scheduler"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Get all settings -router.get('/', async (req, res) => { - try { - const result = await migrate_1.pool.query(` - SELECT key, value, description, updated_at - FROM settings - ORDER BY key - `); - res.json({ settings: result.rows }); - } - catch (error) { - console.error('Error fetching settings:', error); - res.status(500).json({ error: 'Failed to fetch settings' }); - } -}); -// Get single setting -router.get('/:key', async (req, res) => { - try { - const { key } = req.params; - const result = await migrate_1.pool.query(` - SELECT key, value, description, updated_at - FROM settings - WHERE key = $1 - `, [key]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Setting not found' }); - } - res.json({ setting: result.rows[0] }); - } - catch (error) { - console.error('Error fetching setting:', error); - res.status(500).json({ error: 'Failed to fetch setting' }); - } -}); -// Update setting -router.put('/:key', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { key } = req.params; - const { value } = req.body; - if (value === undefined) { - return res.status(400).json({ error: 'Value required' }); - } - const result = await migrate_1.pool.query(` - UPDATE settings - SET value = 
$1, updated_at = CURRENT_TIMESTAMP - WHERE key = $2 - RETURNING * - `, [value, key]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Setting not found' }); - } - // Restart scheduler if scrape settings changed - if (key === 'scrape_interval_hours' || key === 'scrape_specials_time') { - console.log('Restarting scheduler due to setting change...'); - await (0, scheduler_1.restartScheduler)(); - } - res.json({ setting: result.rows[0] }); - } - catch (error) { - console.error('Error updating setting:', error); - res.status(500).json({ error: 'Failed to update setting' }); - } -}); -// Update multiple settings at once -router.put('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { settings } = req.body; - if (!settings || !Array.isArray(settings)) { - return res.status(400).json({ error: 'Settings array required' }); - } - const client = await migrate_1.pool.connect(); - try { - await client.query('BEGIN'); - const updated = []; - let needsSchedulerRestart = false; - for (const setting of settings) { - const result = await client.query(` - UPDATE settings - SET value = $1, updated_at = CURRENT_TIMESTAMP - WHERE key = $2 - RETURNING * - `, [setting.value, setting.key]); - if (result.rows.length > 0) { - updated.push(result.rows[0]); - if (setting.key === 'scrape_interval_hours' || setting.key === 'scrape_specials_time') { - needsSchedulerRestart = true; - } - } - } - await client.query('COMMIT'); - if (needsSchedulerRestart) { - console.log('Restarting scheduler due to setting changes...'); - await (0, scheduler_1.restartScheduler)(); - } - res.json({ settings: updated }); - } - catch (error) { - await client.query('ROLLBACK'); - throw error; - } - finally { - client.release(); - } - } - catch (error) { - console.error('Error updating settings:', error); - res.status(500).json({ error: 'Failed to update settings' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/stores.js 
b/backend/dist/routes/stores.js deleted file mode 100644 index 406ca032..00000000 --- a/backend/dist/routes/stores.js +++ /dev/null @@ -1,412 +0,0 @@ -"use strict"; -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const middleware_1 = require("../auth/middleware"); -const migrate_1 = require("../db/migrate"); -const scraper_v2_1 = require("../scraper-v2"); -const router = (0, express_1.Router)(); -router.use(middleware_1.authMiddleware); -// Get all stores -router.get('/', async (req, res) => { - try { - const result = await migrate_1.pool.query(` - SELECT - s.*, - COUNT(DISTINCT p.id) as product_count, - COUNT(DISTINCT c.id) as category_count - FROM stores s - LEFT JOIN 
products p ON s.id = p.store_id - LEFT JOIN categories c ON s.id = c.store_id - GROUP BY s.id - ORDER BY s.name - `); - res.json({ stores: result.rows }); - } - catch (error) { - console.error('Error fetching stores:', error); - res.status(500).json({ error: 'Failed to fetch stores' }); - } -}); -// Freshness threshold in hours -const STALE_THRESHOLD_HOURS = 4; -function calculateFreshness(lastScrapedAt) { - if (!lastScrapedAt) { - return { - last_scraped_at: null, - is_stale: true, - freshness: 'Never scraped', - hours_since_scrape: null - }; - } - const now = new Date(); - const diffMs = now.getTime() - lastScrapedAt.getTime(); - const diffHours = diffMs / (1000 * 60 * 60); - const isStale = diffHours > STALE_THRESHOLD_HOURS; - let freshnessText; - if (diffHours < 1) { - const mins = Math.round(diffHours * 60); - freshnessText = `${mins} minute${mins !== 1 ? 's' : ''} ago`; - } - else if (diffHours < 24) { - const hrs = Math.round(diffHours); - freshnessText = `${hrs} hour${hrs !== 1 ? 's' : ''} ago`; - } - else { - const days = Math.round(diffHours / 24); - freshnessText = `${days} day${days !== 1 ? 
's' : ''} ago`; - } - return { - last_scraped_at: lastScrapedAt.toISOString(), - is_stale: isStale, - freshness: freshnessText, - hours_since_scrape: Math.round(diffHours * 10) / 10 - }; -} -function detectProvider(dutchieUrl) { - if (!dutchieUrl) - return 'unknown'; - if (dutchieUrl.includes('dutchie.com')) - return 'Dutchie'; - if (dutchieUrl.includes('iheartjane.com') || dutchieUrl.includes('jane.co')) - return 'Jane'; - if (dutchieUrl.includes('treez.io')) - return 'Treez'; - if (dutchieUrl.includes('weedmaps.com')) - return 'Weedmaps'; - if (dutchieUrl.includes('leafly.com')) - return 'Leafly'; - return 'Custom'; -} -// Get single store with full details -router.get('/:id', async (req, res) => { - try { - const { id } = req.params; - // Get store with counts and linked dispensary - const result = await migrate_1.pool.query(` - SELECT - s.*, - d.id as dispensary_id, - d.name as dispensary_name, - d.slug as dispensary_slug, - d.state as dispensary_state, - d.city as dispensary_city, - d.address as dispensary_address, - d.menu_provider as dispensary_menu_provider, - COUNT(DISTINCT p.id) as product_count, - COUNT(DISTINCT c.id) as category_count, - COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = true) as in_stock_count, - COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = false) as out_of_stock_count - FROM stores s - LEFT JOIN dispensaries d ON s.dispensary_id = d.id - LEFT JOIN products p ON s.id = p.store_id - LEFT JOIN categories c ON s.id = c.store_id - WHERE s.id = $1 - GROUP BY s.id, d.id, d.name, d.slug, d.state, d.city, d.address, d.menu_provider - `, [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Store not found' }); - } - const store = result.rows[0]; - // Get recent crawl jobs for this store - const jobsResult = await migrate_1.pool.query(` - SELECT - id, status, job_type, trigger_type, - started_at, completed_at, - products_found, products_new, products_updated, - in_stock_count, out_of_stock_count, - error_message - 
FROM crawl_jobs - WHERE store_id = $1 - ORDER BY created_at DESC - LIMIT 10 - `, [id]); - // Get schedule info if exists - const scheduleResult = await migrate_1.pool.query(` - SELECT - enabled, interval_hours, next_run_at, last_run_at - FROM store_crawl_schedule - WHERE store_id = $1 - `, [id]); - // Calculate freshness - const freshness = calculateFreshness(store.last_scraped_at); - // Detect provider from URL - const provider = detectProvider(store.dutchie_url); - // Build response - const response = { - ...store, - provider, - freshness: freshness.freshness, - is_stale: freshness.is_stale, - hours_since_scrape: freshness.hours_since_scrape, - linked_dispensary: store.dispensary_id ? { - id: store.dispensary_id, - name: store.dispensary_name, - slug: store.dispensary_slug, - state: store.dispensary_state, - city: store.dispensary_city, - address: store.dispensary_address, - menu_provider: store.dispensary_menu_provider - } : null, - schedule: scheduleResult.rows[0] || null, - recent_jobs: jobsResult.rows - }; - // Remove redundant dispensary fields from root - delete response.dispensary_name; - delete response.dispensary_slug; - delete response.dispensary_state; - delete response.dispensary_city; - delete response.dispensary_address; - delete response.dispensary_menu_provider; - res.json(response); - } - catch (error) { - console.error('Error fetching store:', error); - res.status(500).json({ error: 'Failed to fetch store' }); - } -}); -// Get store brands -router.get('/:id/brands', async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query(` - SELECT name - FROM brands - WHERE store_id = $1 - ORDER BY name - `, [id]); - const brands = result.rows.map((row) => row.name); - res.json({ brands }); - } - catch (error) { - console.error('Error fetching store brands:', error); - res.status(500).json({ error: 'Failed to fetch store brands' }); - } -}); -// Get store specials -router.get('/:id/specials', async (req, res) => { 
- try { - const { id } = req.params; - const { date } = req.query; - // Use provided date or today's date - const queryDate = date || new Date().toISOString().split('T')[0]; - const result = await migrate_1.pool.query(` - SELECT - s.*, - p.name as product_name, - p.image_url as product_image - FROM specials s - LEFT JOIN products p ON s.product_id = p.id - WHERE s.store_id = $1 AND s.valid_date = $2 - ORDER BY s.name - `, [id, queryDate]); - res.json({ specials: result.rows, date: queryDate }); - } - catch (error) { - console.error('Error fetching store specials:', error); - res.status(500).json({ error: 'Failed to fetch store specials' }); - } -}); -// Create store -router.post('/', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { name, slug, dutchie_url, active, scrape_enabled } = req.body; - const result = await migrate_1.pool.query(` - INSERT INTO stores (name, slug, dutchie_url, active, scrape_enabled) - VALUES ($1, $2, $3, $4, $5) - RETURNING * - `, [name, slug, dutchie_url, active ?? true, scrape_enabled ?? 
true]); - res.status(201).json(result.rows[0]); - } - catch (error) { - console.error('Error creating store:', error); - res.status(500).json({ error: 'Failed to create store' }); - } -}); -// Update store -router.put('/:id', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const { name, slug, dutchie_url, active, scrape_enabled } = req.body; - const result = await migrate_1.pool.query(` - UPDATE stores - SET name = COALESCE($1, name), - slug = COALESCE($2, slug), - dutchie_url = COALESCE($3, dutchie_url), - active = COALESCE($4, active), - scrape_enabled = COALESCE($5, scrape_enabled), - updated_at = CURRENT_TIMESTAMP - WHERE id = $6 - RETURNING * - `, [name, slug, dutchie_url, active, scrape_enabled, id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Store not found' }); - } - res.json(result.rows[0]); - } - catch (error) { - console.error('Error updating store:', error); - res.status(500).json({ error: 'Failed to update store' }); - } -}); -// Delete store -router.delete('/:id', (0, middleware_1.requireRole)('superadmin'), async (req, res) => { - try { - const { id } = req.params; - const result = await migrate_1.pool.query('DELETE FROM stores WHERE id = $1 RETURNING *', [id]); - if (result.rows.length === 0) { - return res.status(404).json({ error: 'Store not found' }); - } - res.json({ message: 'Store deleted successfully' }); - } - catch (error) { - console.error('Error deleting store:', error); - res.status(500).json({ error: 'Failed to delete store' }); - } -}); -// Trigger scrape for a store -router.post('/:id/scrape', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const { parallel = 3, userAgent } = req.body; // Default to 3 parallel scrapers - const storeResult = await migrate_1.pool.query('SELECT id FROM stores WHERE id = $1', [id]); - if (storeResult.rows.length === 0) { - return 
res.status(404).json({ error: 'Store not found' }); - } - (0, scraper_v2_1.scrapeStore)(parseInt(id), parseInt(parallel), userAgent).catch(err => { - console.error('Background scrape error:', err); - }); - res.json({ - message: 'Scrape started', - parallel: parseInt(parallel), - userAgent: userAgent || 'random' - }); - } - catch (error) { - console.error('Error triggering scrape:', error); - res.status(500).json({ error: 'Failed to trigger scrape' }); - } -}); -// Download missing images for a store -router.post('/:id/download-images', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const storeResult = await migrate_1.pool.query('SELECT id, name FROM stores WHERE id = $1', [id]); - if (storeResult.rows.length === 0) { - return res.status(404).json({ error: 'Store not found' }); - } - const store = storeResult.rows[0]; - const productsResult = await migrate_1.pool.query(` - SELECT id, name, image_url - FROM products - WHERE store_id = $1 - AND image_url IS NOT NULL - AND local_image_path IS NULL - `, [id]); - (async () => { - const { uploadImageFromUrl } = await Promise.resolve().then(() => __importStar(require('../utils/minio'))); - let downloaded = 0; - for (const product of productsResult.rows) { - try { - console.log(`📸 Downloading image for: ${product.name}`); - const localPath = await uploadImageFromUrl(product.image_url, product.id); - await migrate_1.pool.query(` - UPDATE products - SET local_image_path = $1 - WHERE id = $2 - `, [localPath, product.id]); - downloaded++; - } - catch (error) { - console.error(`Failed to download image for ${product.name}:`, error); - } - } - console.log(`✅ Downloaded ${downloaded} of ${productsResult.rows.length} missing images for ${store.name}`); - })().catch(err => console.error('Background image download error:', err)); - res.json({ - message: 'Image download started', - total_missing: productsResult.rows.length - }); - } - catch (error) { - 
console.error('Error triggering image download:', error); - res.status(500).json({ error: 'Failed to trigger image download' }); - } -}); -// Discover categories for a store -router.post('/:id/discover-categories', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - const storeResult = await migrate_1.pool.query('SELECT id FROM stores WHERE id = $1', [id]); - if (storeResult.rows.length === 0) { - return res.status(404).json({ error: 'Store not found' }); - } - (0, scraper_v2_1.discoverCategories)(parseInt(id)).catch(err => { - console.error('Background category discovery error:', err); - }); - res.json({ message: 'Category discovery started' }); - } - catch (error) { - console.error('Error triggering category discovery:', error); - res.status(500).json({ error: 'Failed to trigger category discovery' }); - } -}); -// Debug scraper -router.post('/:id/debug-scrape', (0, middleware_1.requireRole)('superadmin', 'admin'), async (req, res) => { - try { - const { id } = req.params; - console.log('Debug scrape triggered for store:', id); - const categoryResult = await migrate_1.pool.query(` - SELECT c.dutchie_url, c.name - FROM categories c - WHERE c.store_id = $1 AND c.slug = 'edibles' - LIMIT 1 - `, [id]); - if (categoryResult.rows.length === 0) { - return res.status(404).json({ error: 'Edibles category not found' }); - } - console.log('Found category:', categoryResult.rows[0]); - const { debugDutchiePage } = await Promise.resolve().then(() => __importStar(require('../services/scraper-debug'))); - debugDutchiePage(categoryResult.rows[0].dutchie_url).catch(err => { - console.error('Debug error:', err); - }); - res.json({ message: 'Debug started, check logs', url: categoryResult.rows[0].dutchie_url }); - } - catch (error) { - console.error('Debug endpoint error:', error); - res.status(500).json({ error: 'Failed to debug' }); - } -}); -exports.default = router; diff --git a/backend/dist/routes/version.js 
b/backend/dist/routes/version.js deleted file mode 100644 index c3f353ea..00000000 --- a/backend/dist/routes/version.js +++ /dev/null @@ -1,24 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const express_1 = require("express"); -const router = (0, express_1.Router)(); -/** - * GET /api/version - * Returns build version information for display in admin UI - */ -router.get('/', async (req, res) => { - try { - const versionInfo = { - build_version: process.env.APP_BUILD_VERSION || 'dev', - git_sha: process.env.APP_GIT_SHA || 'local', - build_time: process.env.APP_BUILD_TIME || new Date().toISOString(), - image_tag: process.env.CONTAINER_IMAGE_TAG || 'local', - }; - res.json(versionInfo); - } - catch (error) { - console.error('Error fetching version info:', error); - res.status(500).json({ error: 'Failed to fetch version info' }); - } -}); -exports.default = router; diff --git a/backend/dist/scraper-v2/downloader.js b/backend/dist/scraper-v2/downloader.js deleted file mode 100644 index 2855a60b..00000000 --- a/backend/dist/scraper-v2/downloader.js +++ /dev/null @@ -1,502 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.Downloader = void 0; -const puppeteer_1 = __importDefault(require("puppeteer")); -const axios_1 = __importDefault(require("axios")); -const types_1 = require("./types"); -const logger_1 = require("../services/logger"); -// Fingerprint profiles for randomization -const SCREEN_RESOLUTIONS = [ - { width: 1920, height: 1080 }, - { width: 1366, height: 768 }, - { width: 1536, height: 864 }, - { width: 1440, height: 900 }, - { width: 1280, height: 720 }, - { width: 2560, height: 1440 }, - { width: 1680, height: 1050 }, - { width: 1600, height: 900 }, -]; -const TIMEZONES = [ - 'America/New_York', - 'America/Chicago', - 'America/Denver', - 'America/Los_Angeles', - 'America/Phoenix', -]; -const LANGUAGES = [ - ['en-US', 'en'], - ['en-US', 'en', 'es'], - ['en-US'], -]; -const PLATFORMS = [ - 'Win32', - 'MacIntel', - 'Linux x86_64', -]; -const WEBGL_VENDORS = [ - 'Google Inc. (NVIDIA)', - 'Google Inc. (Intel)', - 'Google Inc. 
(AMD)', - 'Intel Inc.', - 'NVIDIA Corporation', -]; -const WEBGL_RENDERERS = [ - 'ANGLE (NVIDIA GeForce GTX 1080 Direct3D11 vs_5_0 ps_5_0)', - 'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)', - 'ANGLE (AMD Radeon RX 580 Series Direct3D11 vs_5_0 ps_5_0)', - 'Intel Iris OpenGL Engine', - 'NVIDIA GeForce RTX 3070/PCIe/SSE2', - 'AMD Radeon Pro 5500M OpenGL Engine', -]; -function generateRandomFingerprint() { - return { - screen: SCREEN_RESOLUTIONS[Math.floor(Math.random() * SCREEN_RESOLUTIONS.length)], - timezone: TIMEZONES[Math.floor(Math.random() * TIMEZONES.length)], - languages: LANGUAGES[Math.floor(Math.random() * LANGUAGES.length)], - platform: PLATFORMS[Math.floor(Math.random() * PLATFORMS.length)], - hardwareConcurrency: [4, 8, 12, 16][Math.floor(Math.random() * 4)], - deviceMemory: [4, 8, 16, 32][Math.floor(Math.random() * 4)], - webglVendor: WEBGL_VENDORS[Math.floor(Math.random() * WEBGL_VENDORS.length)], - webglRenderer: WEBGL_RENDERERS[Math.floor(Math.random() * WEBGL_RENDERERS.length)], - }; -} -class Downloader { - browser = null; - page = null; - pageInUse = false; - currentFingerprint = generateRandomFingerprint(); - needsNewFingerprint = false; - /** - * Force new fingerprint on next browser creation - */ - rotateFingerprint() { - this.needsNewFingerprint = true; - logger_1.logger.info('scraper', '🔄 Fingerprint rotation scheduled'); - } - /** - * Initialize browser instance with fingerprint - */ - async getBrowser(forceNew = false) { - // Create new browser if needed for fingerprint rotation - if (forceNew || this.needsNewFingerprint) { - await this.close(); - this.currentFingerprint = generateRandomFingerprint(); - this.needsNewFingerprint = false; - logger_1.logger.info('scraper', `🎭 New fingerprint: ${this.currentFingerprint.screen.width}x${this.currentFingerprint.screen.height}, ${this.currentFingerprint.timezone}, ${this.currentFingerprint.platform}`); - } - if (!this.browser || !this.browser.isConnected()) { - const { screen } = 
this.currentFingerprint; - const launchOptions = { - headless: 'new', - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-blink-features=AutomationControlled', - `--window-size=${screen.width},${screen.height}`, - '--disable-web-security', - '--disable-features=IsolateOrigins,site-per-process', - '--disable-infobars', - '--disable-extensions', - ] - }; - this.browser = await puppeteer_1.default.launch(launchOptions); - logger_1.logger.info('scraper', 'Browser instance created'); - } - return this.browser; - } - /** - * Get or create a page instance with current fingerprint - */ - async getPage(forceNew = false) { - if (!this.page || this.page.isClosed() || forceNew) { - const browser = await this.getBrowser(forceNew); - this.page = await browser.newPage(); - const { screen } = this.currentFingerprint; - await this.page.setViewport({ - width: screen.width, - height: screen.height, - deviceScaleFactor: 1, - }); - // Apply fingerprint - await this.applyFingerprint(this.page); - logger_1.logger.debug('scraper', 'New page created with fingerprint'); - } - return this.page; - } - /** - * Apply full fingerprint to page - */ - async applyFingerprint(page) { - const fp = this.currentFingerprint; - await page.evaluateOnNewDocument((fingerprint) => { - // Hide webdriver - Object.defineProperty(navigator, 'webdriver', { - get: () => false, - }); - // Spoof platform - Object.defineProperty(navigator, 'platform', { - get: () => fingerprint.platform, - }); - // Spoof languages - Object.defineProperty(navigator, 'languages', { - get: () => fingerprint.languages, - }); - // Spoof hardware concurrency - Object.defineProperty(navigator, 'hardwareConcurrency', { - get: () => fingerprint.hardwareConcurrency, - }); - // Spoof device memory - Object.defineProperty(navigator, 'deviceMemory', { - get: () => fingerprint.deviceMemory, - }); - // Spoof plugins (realistic count) - Object.defineProperty(navigator, 'plugins', { - get: () => { - 
const plugins = []; - for (let i = 0; i < 5; i++) { - plugins.push({ - name: `Plugin ${i}`, - filename: `plugin${i}.dll`, - description: `Description ${i}`, - }); - } - plugins.length = 5; - return plugins; - }, - }); - // Chrome object - window.chrome = { - runtime: {}, - loadTimes: () => ({}), - csi: () => ({}), - app: {}, - }; - // Permissions - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => parameters.name === 'notifications' - ? Promise.resolve({ state: 'denied' }) - : originalQuery(parameters); - // WebGL fingerprint spoofing - const getParameterProxyHandler = { - apply: function (target, thisArg, argumentsList) { - const param = argumentsList[0]; - // UNMASKED_VENDOR_WEBGL - if (param === 37445) { - return fingerprint.webglVendor; - } - // UNMASKED_RENDERER_WEBGL - if (param === 37446) { - return fingerprint.webglRenderer; - } - return Reflect.apply(target, thisArg, argumentsList); - } - }; - // Override WebGL - const originalGetContext = HTMLCanvasElement.prototype.getContext; - HTMLCanvasElement.prototype.getContext = function (type, ...args) { - const context = originalGetContext.call(this, type, ...args); - if (context && (type === 'webgl' || type === 'webgl2' || type === 'experimental-webgl')) { - const glContext = context; - const originalGetParameter = glContext.getParameter.bind(glContext); - glContext.getParameter = new Proxy(originalGetParameter, getParameterProxyHandler); - } - return context; - }; - // Canvas fingerprint noise - const originalToDataURL = HTMLCanvasElement.prototype.toDataURL; - HTMLCanvasElement.prototype.toDataURL = function (type) { - const context = this.getContext('2d'); - if (context) { - const imageData = context.getImageData(0, 0, this.width, this.height); - for (let i = 0; i < imageData.data.length; i += 4) { - // Add tiny noise to RGB values - imageData.data[i] = imageData.data[i] ^ (Math.random() > 0.5 ? 
1 : 0); - } - context.putImageData(imageData, 0, 0); - } - return originalToDataURL.call(this, type); - }; - // Screen dimensions - Object.defineProperty(window.screen, 'width', { get: () => fingerprint.screen.width }); - Object.defineProperty(window.screen, 'height', { get: () => fingerprint.screen.height }); - Object.defineProperty(window.screen, 'availWidth', { get: () => fingerprint.screen.width }); - Object.defineProperty(window.screen, 'availHeight', { get: () => fingerprint.screen.height - 40 }); - Object.defineProperty(window, 'innerWidth', { get: () => fingerprint.screen.width }); - Object.defineProperty(window, 'innerHeight', { get: () => fingerprint.screen.height - 140 }); - Object.defineProperty(window, 'outerWidth', { get: () => fingerprint.screen.width }); - Object.defineProperty(window, 'outerHeight', { get: () => fingerprint.screen.height }); - }, fp); - // Set timezone via CDP - const client = await page.target().createCDPSession(); - await client.send('Emulation.setTimezoneOverride', { timezoneId: fp.timezone }); - } - /** - * Apply stealth mode to page (legacy - now uses applyFingerprint) - */ - async makePageStealthy(page) { - // Now handled by applyFingerprint - await this.applyFingerprint(page); - } - /** - * Configure proxy for browser - */ - getProxyArgs(proxy) { - if (proxy.protocol === 'socks5') { - return [`--proxy-server=socks5://${proxy.host}:${proxy.port}`]; - } - else if (proxy.protocol === 'http' || proxy.protocol === 'https') { - return [`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`]; - } - return []; - } - /** - * HTTP-based fetch (lightweight, fast) - */ - async httpFetch(request) { - try { - const config = { - timeout: 30000, - headers: { - 'User-Agent': request.metadata.userAgent || 'Mozilla/5.0', - ...request.metadata.headers - }, - validateStatus: () => true // Don't throw on any status - }; - // Add proxy if available - if (request.metadata.proxy) { - const proxy = request.metadata.proxy; - config.proxy = { 
- host: proxy.host, - port: proxy.port, - protocol: proxy.protocol - }; - if (proxy.username && proxy.password) { - config.proxy.auth = { - username: proxy.username, - password: proxy.password - }; - } - } - const response = await axios_1.default.get(request.url, config); - return { - url: request.url, - statusCode: response.status, - content: response.data, - metadata: { - headers: response.headers, - method: 'http' - }, - request - }; - } - catch (error) { - const scraperError = new Error(error.message); - if (error.code === 'ETIMEDOUT' || error.code === 'ECONNABORTED') { - scraperError.type = types_1.ErrorType.TIMEOUT; - } - else if (error.code === 'ECONNREFUSED' || error.code === 'ENOTFOUND') { - scraperError.type = types_1.ErrorType.NETWORK_ERROR; - } - else { - scraperError.type = types_1.ErrorType.UNKNOWN; - } - scraperError.retryable = true; - scraperError.request = request; - throw scraperError; - } - } - /** - * Browser-based fetch (for JS-heavy sites) - */ - async browserFetch(request) { - // Wait if page is in use - while (this.pageInUse) { - await new Promise(resolve => setTimeout(resolve, 100)); - } - this.pageInUse = true; - try { - const page = await this.getPage(); - // Apply stealth mode if required - if (request.metadata.requiresStealth) { - await this.makePageStealthy(page); - } - // Set user agent - if (request.metadata.userAgent) { - await page.setUserAgent(request.metadata.userAgent); - } - // Navigate to page - use networkidle2 for SPAs like Dutchie - // Increased timeout to 90s - Dutchie pages can take 30-40s to fully load - const navigationPromise = page.goto(request.url, { - waitUntil: 'networkidle2', - timeout: 90000 - }); - const response = await navigationPromise; - if (!response) { - throw new Error('Navigation failed - no response'); - } - // Wait for React to render product content - // Try to wait for products, but don't fail if they don't appear (empty category) - try { - await 
page.waitForSelector('[data-testid="product-list-item"], [data-testid="empty-state"]', { - timeout: 10000 - }); - } - catch { - // Products might not exist in this category - continue anyway - logger_1.logger.debug('scraper', 'No products found within timeout - continuing'); - } - // Additional wait for any lazy-loaded content - await page.waitForTimeout(2000); - // Check for lazy-loaded content - await this.autoScroll(page); - // Get page content - const content = await page.content(); - const statusCode = response.status(); - return { - url: request.url, - statusCode, - content, - metadata: { - method: 'browser', - finalUrl: page.url() - }, - request - }; - } - catch (error) { - const scraperError = new Error(error.message); - if (error.message.includes('timeout') || error.message.includes('Navigation timeout')) { - scraperError.type = types_1.ErrorType.TIMEOUT; - } - else if (error.message.includes('net::')) { - scraperError.type = types_1.ErrorType.NETWORK_ERROR; - } - else if (error.message.includes('404')) { - scraperError.type = types_1.ErrorType.NOT_FOUND; - } - else { - scraperError.type = types_1.ErrorType.UNKNOWN; - } - scraperError.retryable = scraperError.type !== types_1.ErrorType.NOT_FOUND; - scraperError.request = request; - throw scraperError; - } - finally { - this.pageInUse = false; - } - } - /** - * Auto-scroll to load lazy content - */ - async autoScroll(page) { - try { - await page.evaluate(async () => { - await new Promise((resolve) => { - let totalHeight = 0; - const distance = 500; - const maxScrolls = 20; // Prevent infinite scrolling - let scrollCount = 0; - const timer = setInterval(() => { - // @ts-ignore - runs in browser context - const scrollHeight = document.body.scrollHeight; - // @ts-ignore - runs in browser context - window.scrollBy(0, distance); - totalHeight += distance; - scrollCount++; - if (totalHeight >= scrollHeight || scrollCount >= maxScrolls) { - clearInterval(timer); - // Scroll back to top - // @ts-ignore - runs in 
browser context - window.scrollTo(0, 0); - resolve(); - } - }, 200); - }); - }); - // Wait for any lazy-loaded content - await page.waitForTimeout(1000); - } - catch (error) { - logger_1.logger.warn('scraper', `Auto-scroll failed: ${error}`); - } - } - /** - * Main fetch method - tries HTTP first, falls back to browser - */ - async fetch(request) { - const startTime = Date.now(); - try { - // Force browser mode if required - if (request.metadata.requiresBrowser) { - logger_1.logger.debug('scraper', `Browser fetch: ${request.url}`); - const response = await this.browserFetch(request); - logger_1.logger.debug('scraper', `Fetch completed in ${Date.now() - startTime}ms`); - return response; - } - // Try HTTP first (faster) - try { - logger_1.logger.debug('scraper', `HTTP fetch: ${request.url}`); - const response = await this.httpFetch(request); - // Check if we got a meaningful response - if (response.statusCode && response.statusCode >= 200 && response.statusCode < 300) { - logger_1.logger.debug('scraper', `HTTP fetch succeeded in ${Date.now() - startTime}ms`); - return response; - } - // Fall through to browser mode for non-2xx responses - logger_1.logger.debug('scraper', `HTTP got ${response.statusCode || 'unknown'}, trying browser`); - } - catch (httpError) { - logger_1.logger.debug('scraper', `HTTP failed, falling back to browser: ${httpError}`); - } - // Fall back to browser - request.metadata.requiresBrowser = true; - const response = await this.browserFetch(request); - logger_1.logger.debug('scraper', `Browser fetch completed in ${Date.now() - startTime}ms`); - return response; - } - catch (error) { - logger_1.logger.error('scraper', `Fetch failed after ${Date.now() - startTime}ms: ${error}`); - throw error; - } - } - /** - * Evaluate JavaScript in the current page context - */ - async evaluate(fn) { - if (!this.page || this.page.isClosed()) { - throw new Error('No active page for evaluation'); - } - return await this.page.evaluate(fn); - } - /** - * Get the 
current page (for custom operations) - */ - async getCurrentPage() { - return this.page; - } - /** - * Close the browser - */ - async close() { - if (this.page && !this.page.isClosed()) { - await this.page.close(); - this.page = null; - } - if (this.browser && this.browser.isConnected()) { - await this.browser.close(); - this.browser = null; - logger_1.logger.info('scraper', 'Browser closed'); - } - } - /** - * Clean up resources - */ - async cleanup() { - await this.close(); - } -} -exports.Downloader = Downloader; diff --git a/backend/dist/scraper-v2/engine.js b/backend/dist/scraper-v2/engine.js deleted file mode 100644 index e7cf36bf..00000000 --- a/backend/dist/scraper-v2/engine.js +++ /dev/null @@ -1,693 +0,0 @@ -"use strict"; -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? 
(function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -Object.defineProperty(exports, "__esModule", { value: true }); -exports.DutchieSpider = exports.ScraperEngine = void 0; -const scheduler_1 = require("./scheduler"); -const downloader_1 = require("./downloader"); -const middlewares_1 = require("./middlewares"); -const pipelines_1 = require("./pipelines"); -const logger_1 = require("../services/logger"); -const migrate_1 = require("../db/migrate"); -/** - * Main Scraper Engine - orchestrates the entire scraping process - */ -class ScraperEngine { - scheduler; - downloader; - middlewareEngine; - pipelineEngine; - stats; - isRunning = false; - concurrency = 1; // Conservative default - constructor(concurrency = 1) { - this.scheduler = new scheduler_1.RequestScheduler(); - this.downloader = new downloader_1.Downloader(); - this.middlewareEngine = new middlewares_1.MiddlewareEngine(); - this.pipelineEngine = new pipelines_1.PipelineEngine(); - this.concurrency = concurrency; - // Initialize stats - this.stats = { - requestsTotal: 0, - requestsSuccess: 0, - requestsFailed: 0, - itemsScraped: 0, - itemsSaved: 0, - itemsDropped: 0, - errorsCount: 0, - startTime: new Date() - }; - // Setup middlewares - this.setupMiddlewares(); - // Setup pipelines - this.setupPipelines(); - } - /** - * Setup middleware chain - */ - 
setupMiddlewares() { - this.middlewareEngine.use(new middlewares_1.UserAgentMiddleware()); - this.middlewareEngine.use(new middlewares_1.ProxyMiddleware()); - this.middlewareEngine.use(new middlewares_1.RateLimitMiddleware()); - this.middlewareEngine.use(new middlewares_1.RetryMiddleware()); - this.middlewareEngine.use(new middlewares_1.BotDetectionMiddleware()); - this.middlewareEngine.use(new middlewares_1.StealthMiddleware()); - } - /** - * Setup pipeline chain - */ - setupPipelines() { - this.pipelineEngine.use(new pipelines_1.ValidationPipeline()); - this.pipelineEngine.use(new pipelines_1.SanitizationPipeline()); - this.pipelineEngine.use(new pipelines_1.DeduplicationPipeline()); - this.pipelineEngine.use(new pipelines_1.ImagePipeline()); - this.pipelineEngine.use(new pipelines_1.StatsPipeline()); - this.pipelineEngine.use(new pipelines_1.DatabasePipeline()); - } - /** - * Add a request to the queue - */ - enqueue(request) { - this.scheduler.enqueue(request); - } - /** - * Start the scraping engine - */ - async start() { - if (this.isRunning) { - logger_1.logger.warn('scraper', 'Engine is already running'); - return; - } - this.isRunning = true; - this.stats.startTime = new Date(); - logger_1.logger.info('scraper', `🚀 Starting scraper engine (concurrency: ${this.concurrency})`); - // Process queue - await this.processQueue(); - this.isRunning = false; - this.stats.endTime = new Date(); - this.stats.duration = this.stats.endTime.getTime() - this.stats.startTime.getTime(); - logger_1.logger.info('scraper', `✅ Scraper engine finished`); - this.logStats(); - // Cleanup - await this.downloader.cleanup(); - } - /** - * Process the request queue - */ - async processQueue() { - while (!this.scheduler.isEmpty() && this.isRunning) { - const request = this.scheduler.dequeue(); - if (!request) { - // Wait a bit and check again - await new Promise(resolve => setTimeout(resolve, 100)); - continue; - } - try { - await this.processRequest(request); - } - catch (error) { - 
logger_1.logger.error('scraper', `Failed to process request: ${error}`); - } - } - } - /** - * Process a single request - */ - async processRequest(request) { - this.stats.requestsTotal++; - try { - logger_1.logger.debug('scraper', `Processing: ${request.url}`); - // Apply request middlewares - const processedRequest = await this.middlewareEngine.processRequest(request); - // Download - let response = await this.downloader.fetch(processedRequest); - // Apply response middlewares - response = await this.middlewareEngine.processResponse(response); - // Parse response using callback - const parseResult = await request.callback(response); - // Process items through pipeline - if (parseResult.items && parseResult.items.length > 0) { - for (const item of parseResult.items) { - await this.processItem(item, 'default'); - } - } - // Enqueue follow-up requests - if (parseResult.requests && parseResult.requests.length > 0) { - for (const followUpRequest of parseResult.requests) { - this.scheduler.enqueue(followUpRequest); - } - } - this.stats.requestsSuccess++; - this.scheduler.markComplete(request); - } - catch (error) { - this.stats.requestsFailed++; - this.stats.errorsCount++; - logger_1.logger.error('scraper', `Request failed: ${request.url} - ${error.message}`); - // Apply error middlewares - const handledError = await this.middlewareEngine.processError(error, request); - // If error is null, it was handled (e.g., retry) - if (handledError === null) { - this.scheduler.requeueForRetry(request); - } - else { - this.scheduler.markComplete(request); - // Call error handler if provided - if (request.errorHandler) { - await request.errorHandler(error, request); - } - } - } - } - /** - * Process an item through pipelines - */ - async processItem(item, spider) { - this.stats.itemsScraped++; - try { - const processedItem = await this.pipelineEngine.processItem(item, spider); - if (processedItem) { - this.stats.itemsSaved++; - } - else { - this.stats.itemsDropped++; - } - } - 
catch (error) { - logger_1.logger.error('scraper', `Failed to process item: ${error}`); - this.stats.itemsDropped++; - } - } - /** - * Log statistics - */ - logStats() { - logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'); - logger_1.logger.info('scraper', '📊 Scraper Statistics'); - logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'); - logger_1.logger.info('scraper', ` Requests: ${this.stats.requestsSuccess}/${this.stats.requestsTotal} successful`); - logger_1.logger.info('scraper', ` Items: ${this.stats.itemsSaved} saved, ${this.stats.itemsDropped} dropped`); - logger_1.logger.info('scraper', ` Errors: ${this.stats.errorsCount}`); - logger_1.logger.info('scraper', ` Duration: ${Math.round((this.stats.duration || 0) / 1000)}s`); - // Get stats from StatsPipeline - const statsPipeline = this.pipelineEngine.getPipeline('StatsPipeline'); - if (statsPipeline) { - const itemStats = statsPipeline.getStats(); - logger_1.logger.info('scraper', ` Items with images: ${itemStats.withImages}/${itemStats.total}`); - logger_1.logger.info('scraper', ` Items with THC: ${itemStats.withThc}/${itemStats.total}`); - logger_1.logger.info('scraper', ` Items with descriptions: ${itemStats.withDescription}/${itemStats.total}`); - } - logger_1.logger.info('scraper', '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'); - } - /** - * Stop the engine - */ - stop() { - this.isRunning = false; - logger_1.logger.info('scraper', 'Stopping scraper engine...'); - } - /** - * Get current stats - */ - getStats() { - return { ...this.stats }; - } - /** - * Get queue stats - */ - getQueueStats() { - return this.scheduler.getStats(); - } -} -exports.ScraperEngine = ScraperEngine; -/** - * Spider for scraping Dutchie categories - */ -class DutchieSpider { - engine; - constructor(engine) { - this.engine = engine; - } - /** - * Scrape a category - */ - async scrapeCategory(storeId, categoryId) { - logger_1.logger.info('scraper', `Starting category scrape: store=${storeId}, 
category=${categoryId}`); - const scraperId = `scraper-${storeId}-${categoryId}-${Date.now()}`; - let registerScraper, updateScraperStats, completeScraper; - try { - // Import monitoring functions - const monitor = await Promise.resolve().then(() => __importStar(require('../routes/scraper-monitor'))); - registerScraper = monitor.registerScraper; - updateScraperStats = monitor.updateScraperStats; - completeScraper = monitor.completeScraper; - } - catch (e) { - // Monitoring not available - } - try { - // Get category info - const categoryResult = await migrate_1.pool.query(` - SELECT c.*, s.slug as store_slug, s.name as store_name - FROM categories c - JOIN stores s ON c.store_id = s.id - WHERE c.id = $1 - `, [categoryId]); - if (categoryResult.rows.length === 0) { - throw new Error('Category not found'); - } - const category = categoryResult.rows[0]; - logger_1.logger.info('scraper', `Category: ${category.name} (${category.dutchie_url})`); - // Register with monitoring system - if (registerScraper) { - registerScraper(scraperId, storeId, category.store_name, categoryId, category.name); - } - // Mark products as out of stock before scraping - await migrate_1.pool.query(` - UPDATE products - SET in_stock = false - WHERE store_id = $1 AND category_id = $2 - `, [storeId, categoryId]); - if (updateScraperStats) { - updateScraperStats(scraperId, {}, 'Marking products as out of stock'); - } - // Enqueue category page request - this.engine.enqueue({ - url: category.dutchie_url, - priority: 100, - maxRetries: 3, - metadata: { - requiresBrowser: true, - storeId, - categoryId, - categorySlug: category.slug, - storeSlug: category.store_slug - }, - callback: this.parseCategoryPage.bind(this) - }); - // Start the engine - if (updateScraperStats) { - updateScraperStats(scraperId, {}, 'Scraping category page'); - } - await this.engine.start(); - // Update stats from engine - const engineStats = this.engine.getStats(); - if (updateScraperStats) { - updateScraperStats(scraperId, { - 
requestsTotal: engineStats.requestsTotal, - requestsSuccess: engineStats.requestsSuccess, - itemsSaved: engineStats.itemsSaved, - itemsDropped: engineStats.itemsDropped, - errorsCount: engineStats.errorsCount - }, 'Finalizing'); - } - // Update category last_scraped_at - await migrate_1.pool.query(` - UPDATE categories - SET last_scraped_at = CURRENT_TIMESTAMP - WHERE id = $1 - `, [categoryId]); - logger_1.logger.info('scraper', `✅ Category scrape completed: ${category.name}`); - if (completeScraper) { - completeScraper(scraperId); - } - } - catch (error) { - logger_1.logger.error('scraper', `Category scrape failed: ${error}`); - if (completeScraper) { - completeScraper(scraperId, String(error)); - } - throw error; - } - } - /** - * Parse category page (product listing) - */ - async parseCategoryPage(response) { - const page = await this.engine['downloader'].getCurrentPage(); - if (!page) { - throw new Error('No active page'); - } - logger_1.logger.info('scraper', 'Parsing category page...'); - // Extract product cards - const productCards = await page.evaluate(() => { - // @ts-ignore - runs in browser context - const cards = document.querySelectorAll('[data-testid="product-list-item"]'); - const items = []; - cards.forEach((card) => { - try { - const allText = card.textContent || ''; - // Extract name - let name = ''; - const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4']; - for (const sel of nameSelectors) { - const el = card.querySelector(sel); - if (el?.textContent?.trim()) { - name = el.textContent.trim().split('\n')[0].trim(); - break; - } - } - if (!name || name.length < 2) - return; - // Extract price - let price = null; - let originalPrice = null; - const priceMatches = allText.match(/\$(\d+\.?\d*)/g); - if (priceMatches && priceMatches.length > 0) { - price = parseFloat(priceMatches[0].replace('$', '')); - if (priceMatches.length > 1) { - originalPrice = parseFloat(priceMatches[1].replace('$', '')); - } - } - // Extract link - const 
linkEl = card.querySelector('a[href*="/product/"]'); - let href = linkEl?.getAttribute('href') || ''; - if (href && href.startsWith('/')) { - // @ts-ignore - runs in browser context - href = window.location.origin + href; - } - // Extract image URL from product card - let imageUrl = null; - const imgSelectors = [ - 'img[src*="images.dutchie.com"]', - 'img[src*="dutchie"]', - 'img[data-testid*="product"]', - 'img[class*="product"]', - 'img[class*="Product"]', - 'picture img', - 'img' - ]; - for (const sel of imgSelectors) { - const img = card.querySelector(sel); - if (img) { - const src = img.getAttribute('src') || img.getAttribute('data-src') || ''; - if (src && (src.includes('dutchie.com') || src.includes('images.'))) { - imageUrl = src; - break; - } - } - } - items.push({ name, price, originalPrice, href, imageUrl }); - } - catch (err) { - console.error('Error parsing product card:', err); - } - }); - return items; - }); - logger_1.logger.info('scraper', `Found ${productCards.length} products on listing page`); - // Create follow-up requests for each product - const requests = productCards.map((card, index) => ({ - url: card.href, - priority: 50, - maxRetries: 3, - metadata: { - ...response.request.metadata, - productName: card.name, - productPrice: card.price, - productOriginalPrice: card.originalPrice, - productImageUrl: card.imageUrl, // Pass image from category page - requiresBrowser: true - }, - callback: this.parseProductPage.bind(this) - })); - return { items: [], requests }; - } - /** - * Parse individual product page - */ - async parseProductPage(response) { - const page = await this.engine['downloader'].getCurrentPage(); - if (!page) { - throw new Error('No active page'); - } - const productName = response.request.metadata.productName; - logger_1.logger.debug('scraper', `Parsing product: ${productName}`); - // Extract product details - const details = await page.evaluate(() => { - // @ts-ignore - runs in browser context - const allText = 
document.body.textContent || ''; - // Extract image - expanded selectors for better coverage - let fullSizeImage = null; - const mainImageSelectors = [ - 'img[src*="images.dutchie.com"]', - 'img[src*="dutchie"]', - 'img[class*="ProductImage"]', - 'img[class*="product-image"]', - 'img[class*="Product"]', - '[class*="ImageGallery"] img', - '[data-testid*="product"] img', - '[data-testid*="image"] img', - 'picture img', - 'main img' - ]; - for (const sel of mainImageSelectors) { - // @ts-ignore - runs in browser context - const img = document.querySelector(sel); - const src = img?.src || img?.getAttribute('data-src') || ''; - if (src && (src.includes('dutchie.com') || src.includes('images.'))) { - fullSizeImage = src; - break; - } - } - // Extract description - let description = ''; - const descSelectors = [ - '[class*="description"]', - '[class*="Description"]', - '[data-testid*="description"]', - 'p[class*="product"]' - ]; - for (const sel of descSelectors) { - // @ts-ignore - runs in browser context - const el = document.querySelector(sel); - if (el?.textContent?.trim() && el.textContent.length > 20) { - description = el.textContent.trim(); - break; - } - } - // Extract THC/CBD - let thc = null; - const thcPatterns = [ - /THC[:\s]*(\d+\.?\d*)\s*%/i, - /Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i, - /(\d+\.?\d*)\s*%\s+THC/i - ]; - for (const pattern of thcPatterns) { - const match = allText.match(pattern); - if (match) { - thc = parseFloat(match[1]); - break; - } - } - let cbd = null; - const cbdPatterns = [ - /CBD[:\s]*(\d+\.?\d*)\s*%/i, - /Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i, - /(\d+\.?\d*)\s*%\s+CBD/i - ]; - for (const pattern of cbdPatterns) { - const match = allText.match(pattern); - if (match) { - cbd = parseFloat(match[1]); - break; - } - } - // Extract strain type - let strainType = null; - if (allText.match(/\bindica\b/i)) - strainType = 'Indica'; - else if (allText.match(/\bsativa\b/i)) - strainType = 'Sativa'; - else if (allText.match(/\bhybrid\b/i)) - strainType = 
'Hybrid'; - // Extract brand - let brand = null; - const brandSelectors = [ - '[class*="brand"]', - '[class*="Brand"]', - '[data-testid*="brand"]' - ]; - for (const sel of brandSelectors) { - // @ts-ignore - runs in browser context - const el = document.querySelector(sel); - if (el?.textContent?.trim()) { - brand = el.textContent.trim(); - break; - } - } - // Extract metadata - const terpenes = []; - const terpeneNames = ['Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool', 'Humulene']; - terpeneNames.forEach(terp => { - if (allText.match(new RegExp(`\\b${terp}\\b`, 'i'))) { - terpenes.push(terp); - } - }); - const effects = []; - const effectNames = ['Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative', 'Energetic']; - effectNames.forEach(effect => { - if (allText.match(new RegExp(`\\b${effect}\\b`, 'i'))) { - effects.push(effect); - } - }); - return { - fullSizeImage, - description, - thc, - cbd, - strainType, - brand, - terpenes, - effects - }; - }); - // Create product item - // Use image from product page, fallback to category page image - const imageUrl = details.fullSizeImage || response.request.metadata.productImageUrl || undefined; - const product = { - dutchieProductId: `${response.request.metadata.storeSlug}-${response.request.metadata.categorySlug}-${Date.now()}-${Math.random()}`, - name: productName || 'Unknown Product', - description: details.description, - price: response.request.metadata.productPrice, - originalPrice: response.request.metadata.productOriginalPrice, - thcPercentage: details.thc || undefined, - cbdPercentage: details.cbd || undefined, - strainType: details.strainType || undefined, - brand: details.brand || undefined, - imageUrl: imageUrl, - dutchieUrl: response.url, - metadata: { - terpenes: details.terpenes, - effects: details.effects - }, - storeId: response.request.metadata.storeId, - categoryId: response.request.metadata.categoryId - }; - return { items: [product], requests: [] }; - } - /** - * Scrape entire store - */ 
- async scrapeStore(storeId, parallel = 3) { - logger_1.logger.info('scraper', `🏪 Starting store scrape: ${storeId} (${parallel} parallel scrapers)`); - try { - // Check if categories exist, if not, discover them first - const categoryCountResult = await migrate_1.pool.query(` - SELECT COUNT(*) as count - FROM categories - WHERE store_id = $1 - `, [storeId]); - if (parseInt(categoryCountResult.rows[0].count) === 0) { - logger_1.logger.info('scraper', 'No categories found - running discovery first'); - const { discoverCategories } = await Promise.resolve().then(() => __importStar(require('./index'))); - await discoverCategories(storeId); - } - // Get all leaf categories (no children) - const categoriesResult = await migrate_1.pool.query(` - SELECT c.id, c.name - FROM categories c - WHERE c.store_id = $1 - AND c.scrape_enabled = true - AND NOT EXISTS ( - SELECT 1 FROM categories child - WHERE child.parent_id = c.id - ) - ORDER BY c.name - `, [storeId]); - const categories = categoriesResult.rows; - logger_1.logger.info('scraper', `Found ${categories.length} categories to scrape`); - if (parallel === 1) { - // Sequential scraping (original behavior) - for (const category of categories) { - try { - await this.scrapeCategory(storeId, category.id); - await new Promise(resolve => setTimeout(resolve, 3000)); - } - catch (error) { - logger_1.logger.error('scraper', `Failed to scrape category ${category.name}: ${error}`); - } - } - } - else { - // Parallel scraping with concurrency limit - const results = await this.scrapeMultipleCategoriesParallel(storeId, categories, parallel); - const successful = results.filter(r => r.status === 'fulfilled').length; - const failed = results.filter(r => r.status === 'rejected').length; - logger_1.logger.info('scraper', `Parallel scrape results: ${successful} successful, ${failed} failed`); - } - // Update store last_scraped_at - await migrate_1.pool.query(` - UPDATE stores - SET last_scraped_at = CURRENT_TIMESTAMP - WHERE id = $1 - `, 
[storeId]); - logger_1.logger.info('scraper', `🎉 Store scrape completed: ${storeId}`); - } - catch (error) { - logger_1.logger.error('scraper', `Store scrape failed: ${error}`); - throw error; - } - } - /** - * Scrape multiple categories in parallel with concurrency limit - */ - async scrapeMultipleCategoriesParallel(storeId, categories, concurrency) { - const results = []; - // Process categories in batches - for (let i = 0; i < categories.length; i += concurrency) { - const batch = categories.slice(i, i + concurrency); - logger_1.logger.info('scraper', `Scraping batch ${Math.floor(i / concurrency) + 1}: ${batch.map(c => c.name).join(', ')}`); - const batchPromises = batch.map(category => { - // Create a new spider instance for each category - const engine = new ScraperEngine(1); // 1 concurrent request per spider - const spider = new DutchieSpider(engine); - return spider.scrapeCategory(storeId, category.id) - .catch(error => { - logger_1.logger.error('scraper', `Category ${category.name} failed: ${error}`); - throw error; - }); - }); - const batchResults = await Promise.allSettled(batchPromises); - results.push(...batchResults); - // Delay between batches to avoid overwhelming the server - if (i + concurrency < categories.length) { - logger_1.logger.info('scraper', 'Waiting 5s before next batch...'); - await new Promise(resolve => setTimeout(resolve, 5000)); - } - } - return results; - } -} -exports.DutchieSpider = DutchieSpider; diff --git a/backend/dist/scraper-v2/index.js b/backend/dist/scraper-v2/index.js deleted file mode 100644 index 57669863..00000000 --- a/backend/dist/scraper-v2/index.js +++ /dev/null @@ -1,115 +0,0 @@ -"use strict"; -/** - * Scraper V2 - Scrapy-inspired web scraping framework - * - * IMPORTANT: For Dutchie stores, DO NOT USE scrapeStore() from this module. 
- * Dutchie crawling must go through the dutchie-az GraphQL pipeline: - * src/dutchie-az/services/product-crawler.ts - * - * This scraper-v2 module uses DOM-based extraction which is unreliable - * for Dutchie. The new dutchie-az pipeline uses GraphQL directly. - * - * Architecture: - * - Engine: Main orchestrator - * - Scheduler: Priority queue with deduplication - * - Downloader: HTTP + Browser hybrid fetcher - * - Middlewares: Request/response processing chain - * - Pipelines: Item processing and persistence - * - Navigation: Category discovery - */ -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __exportStar = (this && this.__exportStar) || function(m, exports) { - for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p); -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.StatsPipeline = exports.DatabasePipeline = exports.ImagePipeline = exports.DeduplicationPipeline = exports.SanitizationPipeline = exports.ValidationPipeline = exports.PipelineEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = exports.MiddlewareEngine = exports.NavigationDiscovery = exports.Downloader = exports.RequestScheduler = exports.DutchieSpider = exports.ScraperEngine = void 0; -exports.scrapeCategory = scrapeCategory; -exports.scrapeStore = scrapeStore; -exports.discoverCategories = discoverCategories; -var engine_1 = require("./engine"); 
-Object.defineProperty(exports, "ScraperEngine", { enumerable: true, get: function () { return engine_1.ScraperEngine; } }); -Object.defineProperty(exports, "DutchieSpider", { enumerable: true, get: function () { return engine_1.DutchieSpider; } }); -var scheduler_1 = require("./scheduler"); -Object.defineProperty(exports, "RequestScheduler", { enumerable: true, get: function () { return scheduler_1.RequestScheduler; } }); -var downloader_1 = require("./downloader"); -Object.defineProperty(exports, "Downloader", { enumerable: true, get: function () { return downloader_1.Downloader; } }); -var navigation_1 = require("./navigation"); -Object.defineProperty(exports, "NavigationDiscovery", { enumerable: true, get: function () { return navigation_1.NavigationDiscovery; } }); -var middlewares_1 = require("./middlewares"); -Object.defineProperty(exports, "MiddlewareEngine", { enumerable: true, get: function () { return middlewares_1.MiddlewareEngine; } }); -Object.defineProperty(exports, "UserAgentMiddleware", { enumerable: true, get: function () { return middlewares_1.UserAgentMiddleware; } }); -Object.defineProperty(exports, "ProxyMiddleware", { enumerable: true, get: function () { return middlewares_1.ProxyMiddleware; } }); -Object.defineProperty(exports, "RateLimitMiddleware", { enumerable: true, get: function () { return middlewares_1.RateLimitMiddleware; } }); -Object.defineProperty(exports, "RetryMiddleware", { enumerable: true, get: function () { return middlewares_1.RetryMiddleware; } }); -Object.defineProperty(exports, "BotDetectionMiddleware", { enumerable: true, get: function () { return middlewares_1.BotDetectionMiddleware; } }); -Object.defineProperty(exports, "StealthMiddleware", { enumerable: true, get: function () { return middlewares_1.StealthMiddleware; } }); -var pipelines_1 = require("./pipelines"); -Object.defineProperty(exports, "PipelineEngine", { enumerable: true, get: function () { return pipelines_1.PipelineEngine; } }); 
-Object.defineProperty(exports, "ValidationPipeline", { enumerable: true, get: function () { return pipelines_1.ValidationPipeline; } }); -Object.defineProperty(exports, "SanitizationPipeline", { enumerable: true, get: function () { return pipelines_1.SanitizationPipeline; } }); -Object.defineProperty(exports, "DeduplicationPipeline", { enumerable: true, get: function () { return pipelines_1.DeduplicationPipeline; } }); -Object.defineProperty(exports, "ImagePipeline", { enumerable: true, get: function () { return pipelines_1.ImagePipeline; } }); -Object.defineProperty(exports, "DatabasePipeline", { enumerable: true, get: function () { return pipelines_1.DatabasePipeline; } }); -Object.defineProperty(exports, "StatsPipeline", { enumerable: true, get: function () { return pipelines_1.StatsPipeline; } }); -__exportStar(require("./types"), exports); -// Main API functions -const engine_2 = require("./engine"); -const navigation_2 = require("./navigation"); -const downloader_2 = require("./downloader"); -const logger_1 = require("../services/logger"); -/** - * Scrape a single category - */ -async function scrapeCategory(storeId, categoryId) { - const engine = new engine_2.ScraperEngine(1); - const spider = new engine_2.DutchieSpider(engine); - try { - await spider.scrapeCategory(storeId, categoryId); - } - catch (error) { - logger_1.logger.error('scraper', `scrapeCategory failed: ${error}`); - throw error; - } -} -/** - * Scrape an entire store - */ -async function scrapeStore(storeId, parallel = 3, _userAgent) { - const engine = new engine_2.ScraperEngine(1); - const spider = new engine_2.DutchieSpider(engine); - try { - await spider.scrapeStore(storeId, parallel); - } - catch (error) { - logger_1.logger.error('scraper', `scrapeStore failed: ${error}`); - throw error; - } -} -/** - * Discover categories for a store - */ -async function discoverCategories(storeId) { - const downloader = new downloader_2.Downloader(); - const discovery = new 
navigation_2.NavigationDiscovery(downloader); - try { - // Discover categories (uses your existing Dutchie category structure) - await discovery.discoverCategories(storeId); - } - catch (error) { - logger_1.logger.error('scraper', `discoverCategories failed: ${error}`); - throw error; - } - finally { - await downloader.cleanup(); - } -} diff --git a/backend/dist/scraper-v2/middlewares.js b/backend/dist/scraper-v2/middlewares.js deleted file mode 100644 index 5d10ef79..00000000 --- a/backend/dist/scraper-v2/middlewares.js +++ /dev/null @@ -1,351 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.MiddlewareEngine = exports.StealthMiddleware = exports.BotDetectionMiddleware = exports.RetryMiddleware = exports.RateLimitMiddleware = exports.ProxyMiddleware = exports.UserAgentMiddleware = void 0; -const types_1 = require("./types"); -const logger_1 = require("../services/logger"); -const proxy_1 = require("../services/proxy"); -// Diverse, realistic user agents - updated for 2024/2025 -const USER_AGENTS = [ - // Chrome on Windows (most common) - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36', - // Chrome on Mac - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', - // Chrome on Linux - 'Mozilla/5.0 (X11; Linux 
x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', - // Firefox - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0', - // Safari - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15', - // Edge - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0', -]; -function getRandomUserAgent() { - return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)]; -} -function sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); -} -/** - * User Agent Rotation Middleware - rotates UA on each request for better evasion - */ -class UserAgentMiddleware { - name = 'UserAgentMiddleware'; - priority = 100; - lastUserAgent = null; - async processRequest(request) { - // Always rotate UA on retries or bot detection - const forceRotation = request.retryCount > 0 || request.metadata.botDetected; - if (!request.metadata.userAgent || forceRotation) { - // Get a different UA than the last one used - let newUA = getRandomUserAgent(); - let attempts = 0; - while (newUA === this.lastUserAgent && attempts < 5) { - newUA = getRandomUserAgent(); - attempts++; - } - request.metadata.userAgent = newUA; - this.lastUserAgent = newUA; - if (forceRotation) { - logger_1.logger.debug('scraper', `🔄 Rotated User-Agent: ${newUA.substring(0, 50)}...`); - } - } - 
return request; - } -} -exports.UserAgentMiddleware = UserAgentMiddleware; -// Domains that should skip proxy (datacenter IPs are blocked) -const PROXY_SKIP_DOMAINS = [ - 'dutchie.com', -]; -function shouldSkipProxy(url) { - try { - const urlObj = new URL(url); - return PROXY_SKIP_DOMAINS.some(domain => urlObj.hostname.includes(domain)); - } - catch { - return false; - } -} -/** - * Proxy Rotation Middleware - uses the central proxy service with timeout handling - */ -class ProxyMiddleware { - name = 'ProxyMiddleware'; - priority = 90; - currentProxyId = null; - async processRequest(request) { - // Skip proxy for domains that block datacenter IPs - if (shouldSkipProxy(request.url)) { - logger_1.logger.info('scraper', `⏭️ Skipping proxy for ${new URL(request.url).hostname} (datacenter IPs blocked)`); - return request; - } - // Always try to use a proxy from the central proxy service - // The service handles bot detection timeouts automatically - const forceRotation = request.retryCount > 0 || request.metadata.botDetected; - if (!request.metadata.proxy || forceRotation) { - // Get proxy from central service - it handles timeouts automatically - const proxy = await (0, proxy_1.getActiveProxy)(); - if (proxy) { - request.metadata.proxy = { - host: proxy.host, - port: proxy.port, - protocol: proxy.protocol, - username: proxy.username, - password: proxy.password, - }; - request.metadata.proxyId = proxy.id; - this.currentProxyId = proxy.id; - const reason = forceRotation ? 
'rotation' : 'initial'; - logger_1.logger.info('scraper', `🔄 Using proxy (${reason}): ${proxy.protocol}://${proxy.host}:${proxy.port}`); - } - else { - logger_1.logger.warn('scraper', '⚠️ No proxy available - running without proxy'); - } - } - return request; - } - async processResponse(response) { - // If bot detection was triggered, put the proxy in timeout - if (response.request.metadata.botDetected && response.request.metadata.proxyId) { - (0, proxy_1.putProxyInTimeout)(response.request.metadata.proxyId, 'Bot detection triggered'); - logger_1.logger.info('scraper', `🚫 Proxy ${response.request.metadata.proxyId} put in timeout due to bot detection`); - } - return response; - } - async processError(error, request) { - // If bot detection error, put proxy in timeout - if ((0, proxy_1.isBotDetectionError)(error.message) && request.metadata.proxyId) { - (0, proxy_1.putProxyInTimeout)(request.metadata.proxyId, error.message); - logger_1.logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`); - } - return error; - } -} -exports.ProxyMiddleware = ProxyMiddleware; -/** - * Rate Limiting Middleware with Adaptive Delays - */ -class RateLimitMiddleware { - name = 'RateLimitMiddleware'; - priority = 80; - requestTimes = []; - errorCount = 0; - baseDelay = 2000; // 2 seconds base delay - maxDelay = 30000; // 30 seconds max - async processRequest(request) { - await this.waitForNextRequest(); - return request; - } - async processResponse(response) { - // Record success - gradually reduce error count - this.errorCount = Math.max(0, this.errorCount - 1); - return response; - } - async processError(error) { - // Record error - increase delay - this.errorCount++; - return error; - } - async waitForNextRequest() { - // Calculate adaptive delay based on error count - const errorMultiplier = Math.pow(1.5, Math.min(this.errorCount, 5)); - const adaptiveDelay = Math.min(this.baseDelay * errorMultiplier, this.maxDelay); - // Add random jitter 
(±20%) - const jitter = (Math.random() - 0.5) * 0.4 * adaptiveDelay; - const delay = adaptiveDelay + jitter; - const now = Date.now(); - const lastRequest = this.requestTimes[this.requestTimes.length - 1] || 0; - const timeSinceLast = now - lastRequest; - if (timeSinceLast < delay) { - const waitTime = delay - timeSinceLast; - logger_1.logger.debug('scraper', `Rate limiting: waiting ${Math.round(waitTime)}ms`); - await sleep(waitTime); - } - this.requestTimes.push(Date.now()); - this.cleanup(); - } - cleanup() { - // Keep only last minute of requests - const cutoff = Date.now() - 60000; - this.requestTimes = this.requestTimes.filter(t => t > cutoff); - } - setBaseDelay(ms) { - this.baseDelay = ms; - } -} -exports.RateLimitMiddleware = RateLimitMiddleware; -/** - * Retry Middleware with Exponential Backoff - */ -class RetryMiddleware { - name = 'RetryMiddleware'; - priority = 70; - isRetryable(error) { - const retryableErrors = [ - types_1.ErrorType.NETWORK_ERROR, - types_1.ErrorType.TIMEOUT, - types_1.ErrorType.SERVER_ERROR - ]; - if ('type' in error) { - return retryableErrors.includes(error.type); - } - // Check error message for common retryable patterns - const message = error.message.toLowerCase(); - return (message.includes('timeout') || - message.includes('network') || - message.includes('econnreset') || - message.includes('econnrefused') || - message.includes('500') || - message.includes('502') || - message.includes('503')); - } - async processError(error, request) { - if (!this.isRetryable(error)) { - logger_1.logger.warn('scraper', `Non-retryable error for ${request.url}: ${error.message}`); - return error; - } - if (request.retryCount < request.maxRetries) { - // Calculate backoff delay - const backoffDelay = Math.min(1000 * Math.pow(2, request.retryCount), 30000); - logger_1.logger.info('scraper', `Retry ${request.retryCount + 1}/${request.maxRetries} for ${request.url} after ${backoffDelay}ms`); - await sleep(backoffDelay); - // Return null to indicate 
retry should happen - return null; - } - logger_1.logger.error('scraper', `Max retries exceeded for ${request.url}`); - return error; - } -} -exports.RetryMiddleware = RetryMiddleware; -/** - * Bot Detection Middleware - detects bot blocking and triggers fingerprint rotation - */ -class BotDetectionMiddleware { - name = 'BotDetectionMiddleware'; - priority = 60; - detectedCount = 0; - DETECTION_THRESHOLD = 3; - // Export for use by other middlewares - static shouldRotateFingerprint = false; - async processResponse(response) { - const content = typeof response.content === 'string' - ? response.content - : JSON.stringify(response.content); - // Check for bot detection indicators - const botIndicators = [ - /captcha/i, - /cloudflare/i, - /access denied/i, - /you have been blocked/i, - /unusual traffic/i, - /robot/i, - /verify.*human/i, - /security check/i, - /please wait/i, - /checking your browser/i, - /ray id/i - ]; - const detected = botIndicators.some(pattern => pattern.test(content)); - if (detected) { - this.detectedCount++; - BotDetectionMiddleware.shouldRotateFingerprint = true; - logger_1.logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`); - logger_1.logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request'); - // Mark the request for rotation on retry - response.request.metadata.botDetected = true; - response.request.metadata.needsNewBrowser = true; - if (this.detectedCount >= this.DETECTION_THRESHOLD) { - const error = new Error('Bot detection threshold reached - rotating fingerprint'); - error.type = types_1.ErrorType.BOT_DETECTION; - error.retryable = true; - error.request = response.request; - throw error; - } - } - else { - // Gradually decrease detection count on successful requests - this.detectedCount = Math.max(0, this.detectedCount - 0.5); - BotDetectionMiddleware.shouldRotateFingerprint = false; - } - return response; - } - async processError(error, request) { - // 
If bot detection error, flag for rotation and allow retry - if ('type' in error && error.type === types_1.ErrorType.BOT_DETECTION) { - request.metadata.botDetected = true; - request.metadata.needsNewBrowser = true; - logger_1.logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry'); - // Add delay before retry to avoid rate limiting - await sleep(5000 + Math.random() * 5000); - return null; // Return null to trigger retry - } - return error; - } -} -exports.BotDetectionMiddleware = BotDetectionMiddleware; -/** - * Stealth Mode Middleware - */ -class StealthMiddleware { - name = 'StealthMiddleware'; - priority = 95; - async processRequest(request) { - // Flag that this request needs stealth mode - request.metadata.requiresStealth = true; - return request; - } -} -exports.StealthMiddleware = StealthMiddleware; -/** - * Middleware Engine to orchestrate all middlewares - */ -class MiddlewareEngine { - middlewares = []; - use(middleware) { - this.middlewares.push(middleware); - // Sort by priority (higher first) - this.middlewares.sort((a, b) => b.priority - a.priority); - } - async processRequest(request) { - let current = request; - for (const middleware of this.middlewares) { - if (middleware.processRequest) { - current = await middleware.processRequest(current); - } - } - return current; - } - async processResponse(response) { - let current = response; - for (const middleware of this.middlewares) { - if (middleware.processResponse) { - current = await middleware.processResponse(current); - } - } - return current; - } - async processError(error, request) { - let currentError = error; - for (const middleware of this.middlewares) { - if (middleware.processError && currentError) { - currentError = await middleware.processError(currentError, request); - if (currentError === null) { - // Middleware handled the error (e.g., retry) - break; - } - } - } - return currentError; - } -} -exports.MiddlewareEngine = MiddlewareEngine; diff --git 
a/backend/dist/scraper-v2/navigation.js b/backend/dist/scraper-v2/navigation.js deleted file mode 100644 index f7a7a66a..00000000 --- a/backend/dist/scraper-v2/navigation.js +++ /dev/null @@ -1,278 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.NavigationDiscovery = void 0; -const migrate_1 = require("../db/migrate"); -const logger_1 = require("../services/logger"); -/** - * Navigation Discovery - finds and builds category structure - */ -class NavigationDiscovery { - downloader; - constructor(downloader) { - this.downloader = downloader; - } - /** - * Discover categories from a store's main page - */ - async discoverCategories(storeId) { - logger_1.logger.info('categories', `Starting category discovery for store ${storeId}`); - try { - // Get store info - const storeResult = await migrate_1.pool.query(` - SELECT id, name, slug, dutchie_url - FROM stores - WHERE id = $1 - `, [storeId]); - if (storeResult.rows.length === 0) { - throw new Error('Store not found'); - } - const store = storeResult.rows[0]; - const baseUrl = store.dutchie_url; - // Create request to fetch the main page - const request = { - url: baseUrl, - priority: 100, - retryCount: 0, - maxRetries: 3, - metadata: { - requiresBrowser: true, - requiresStealth: true - }, - callback: async () => ({ items: [], requests: [] }) - }; - // Fetch the page - const response = await this.downloader.fetch(request); - // Extract navigation links - const page = await this.downloader.getCurrentPage(); - if (!page) { - throw new Error('No active page for navigation extraction'); - } - const links = await this.extractNavigationLinks(page, baseUrl); - logger_1.logger.info('categories', `Found ${links.length} navigation links`); - // Check if it's a Dutchie menu - const isDutchie = await this.isDutchieMenu(page); - if (isDutchie) { - logger_1.logger.info('categories', 'Detected Dutchie menu - using predefined structure'); - await this.createDutchieCategories(storeId, 
store, links); - } - else { - logger_1.logger.info('categories', 'Custom menu detected - extracting from navigation'); - await this.createCustomCategories(storeId, store, links); - } - logger_1.logger.info('categories', `✅ Category discovery completed for ${store.name}`); - } - catch (error) { - logger_1.logger.error('categories', `Category discovery failed: ${error}`); - throw error; - } - } - /** - * Extract navigation links from page - */ - async extractNavigationLinks(page, baseUrl) { - return await page.evaluate((base) => { - const links = []; - // Look for navigation elements - const navSelectors = [ - 'nav a', - '[role="navigation"] a', - '[class*="nav"] a', - '[class*="menu"] a', - '[class*="category"] a', - 'header a' - ]; - const foundLinks = new Set(); - for (const selector of navSelectors) { - // @ts-ignore - runs in browser context - const elements = document.querySelectorAll(selector); - elements.forEach((el) => { - const text = el.textContent?.trim(); - let href = el.href || el.getAttribute('href'); - if (!text || !href || text.length < 2) - return; - // Normalize href - if (href.startsWith('/')) { - // @ts-ignore - runs in browser context - const url = new URL(base); - href = `${url.origin}${href}`; - } - // Skip external links and anchors - if (!href.includes(base) || href.includes('#')) - return; - // Skip duplicates - const linkKey = `${text}:${href}`; - if (foundLinks.has(linkKey)) - return; - foundLinks.add(linkKey); - // Determine if it's likely a category - const categoryKeywords = [ - 'flower', 'pre-roll', 'vape', 'edible', 'concentrate', - 'topical', 'accessory', 'brand', 'special', 'shop', - 'indica', 'sativa', 'hybrid', 'cbd', 'thc' - ]; - const isCategory = categoryKeywords.some(kw => text.toLowerCase().includes(kw) || - href.toLowerCase().includes(kw)); - links.push({ - text, - href, - isCategory - }); - }); - } - return links; - }, baseUrl); - } - /** - * Check if it's a Dutchie menu - */ - async isDutchieMenu(page) { - return await 
page.evaluate(() => { - // Check for Dutchie markers - // @ts-ignore - runs in browser context - if (window.reactEnv) { - // @ts-ignore - runs in browser context - const env = window.reactEnv; - if (env.adminUrl?.includes('dutchie.com') || - env.apiUrl?.includes('dutchie.com') || - env.consumerUrl?.includes('dutchie.com')) { - return true; - } - } - // @ts-ignore - runs in browser context - const htmlContent = document.documentElement.innerHTML; - return (htmlContent.includes('admin.dutchie.com') || - htmlContent.includes('api.dutchie.com') || - htmlContent.includes('embedded-menu') || - htmlContent.includes('window.reactEnv')); - }); - } - /** - * Create categories for Dutchie menus (predefined structure) - * Uses your existing Dutchie category structure - */ - async createDutchieCategories(storeId, store, discoveredLinks) { - const client = await migrate_1.pool.connect(); - try { - await client.query('BEGIN'); - logger_1.logger.info('categories', `Creating predefined Dutchie category structure`); - const baseUrl = store.dutchie_url; - // Your existing Dutchie categories structure - const DUTCHIE_CATEGORIES = [ - { name: 'Shop', slug: 'shop', parentSlug: undefined }, - { name: 'Flower', slug: 'flower', parentSlug: 'shop' }, - { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' }, - { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' }, - { name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' }, - { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' }, - { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' }, - { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' }, - { name: 'Brands', slug: 'brands', parentSlug: undefined }, - { name: 'Specials', slug: 'specials', parentSlug: undefined } - ]; - for (const category of DUTCHIE_CATEGORIES) { - let categoryUrl; - if (category.parentSlug) { - // Subcategory: /embedded-menu/{slug}/shop/flower - categoryUrl = `${baseUrl}/${category.parentSlug}/${category.slug}`; - } - else { - // 
Top-level: /embedded-menu/{slug}/shop - categoryUrl = `${baseUrl}/${category.slug}`; - } - const path = category.parentSlug ? `${category.parentSlug}/${category.slug}` : category.slug; - if (!category.parentSlug) { - // Create parent category - await client.query(` - INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id) - VALUES ($1, $2, $3, $4, $5, true, NULL) - ON CONFLICT (store_id, slug) - DO UPDATE SET name = $2, dutchie_url = $4, path = $5 - RETURNING id - `, [storeId, category.name, category.slug, categoryUrl, path]); - logger_1.logger.info('categories', `📁 ${category.name}`); - } - else { - // Create subcategory - const parentResult = await client.query(` - SELECT id FROM categories - WHERE store_id = $1 AND slug = $2 - `, [storeId, category.parentSlug]); - if (parentResult.rows.length > 0) { - const parentId = parentResult.rows[0].id; - await client.query(` - INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id) - VALUES ($1, $2, $3, $4, $5, true, $6) - ON CONFLICT (store_id, slug) - DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6 - `, [storeId, category.name, category.slug, categoryUrl, path, parentId]); - logger_1.logger.info('categories', ` └── ${category.name}`); - } - } - } - await client.query('COMMIT'); - logger_1.logger.info('categories', `✅ Created ${DUTCHIE_CATEGORIES.length} Dutchie categories successfully`); - } - catch (error) { - await client.query('ROLLBACK'); - logger_1.logger.error('categories', `Failed to create Dutchie categories: ${error}`); - throw error; - } - finally { - client.release(); - } - } - /** - * Create categories from discovered links (custom menus) - */ - async createCustomCategories(storeId, store, links) { - const client = await migrate_1.pool.connect(); - try { - await client.query('BEGIN'); - // Filter to likely category links - const categoryLinks = links.filter(link => link.isCategory); - let displayOrder = 0; - for 
(const link of categoryLinks) { - // Generate slug from text - const slug = link.text - .toLowerCase() - .replace(/[^a-z0-9]+/g, '-') - .replace(/^-|-$/g, ''); - // Determine path from URL - const url = new URL(link.href); - const path = url.pathname.replace(/^\//, ''); - await client.query(` - INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, display_order) - VALUES ($1, $2, $3, $4, $5, true, $6) - ON CONFLICT (store_id, slug) - DO UPDATE SET name = $2, dutchie_url = $4, path = $5, display_order = $6 - `, [storeId, link.text, slug, link.href, path, displayOrder++]); - logger_1.logger.info('categories', `📁 ${link.text} -> ${link.href}`); - } - await client.query('COMMIT'); - logger_1.logger.info('categories', `✅ Created ${categoryLinks.length} custom categories`); - } - catch (error) { - await client.query('ROLLBACK'); - throw error; - } - finally { - client.release(); - } - } - /** - * Update display_order column in categories table - */ - async ensureDisplayOrderColumn() { - try { - await migrate_1.pool.query(` - ALTER TABLE categories - ADD COLUMN IF NOT EXISTS display_order INTEGER DEFAULT 0 - `); - logger_1.logger.info('categories', 'Ensured display_order column exists'); - } - catch (error) { - logger_1.logger.warn('categories', `Could not add display_order column: ${error}`); - } - } -} -exports.NavigationDiscovery = NavigationDiscovery; diff --git a/backend/dist/scraper-v2/pipelines.js b/backend/dist/scraper-v2/pipelines.js deleted file mode 100644 index ce5c74ff..00000000 --- a/backend/dist/scraper-v2/pipelines.js +++ /dev/null @@ -1,459 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.PipelineEngine = exports.StatsPipeline = exports.DatabasePipeline = exports.ImagePipeline = exports.DeduplicationPipeline = exports.SanitizationPipeline = exports.ValidationPipeline = void 0; -const logger_1 = require("../services/logger"); -const migrate_1 = require("../db/migrate"); -const 
minio_1 = require("../utils/minio"); -const product_normalizer_1 = require("../utils/product-normalizer"); -/** - * Validation Pipeline - ensures data quality - */ -class ValidationPipeline { - name = 'ValidationPipeline'; - priority = 100; - async process(item, spider) { - // Required fields - if (!item.name || item.name.trim().length < 2) { - logger_1.logger.warn('pipeline', `Dropping product: invalid name`); - return null; - } - if (!item.dutchieUrl) { - logger_1.logger.warn('pipeline', `Dropping product ${item.name}: no URL`); - return null; - } - // Validate numeric fields - if (item.price !== undefined && (item.price < 0 || item.price > 10000)) { - logger_1.logger.warn('pipeline', `Invalid price for ${item.name}: ${item.price}`); - item.price = undefined; - } - if (item.thcPercentage !== undefined && (item.thcPercentage < 0 || item.thcPercentage > 100)) { - logger_1.logger.warn('pipeline', `Invalid THC for ${item.name}: ${item.thcPercentage}`); - item.thcPercentage = undefined; - } - if (item.cbdPercentage !== undefined && (item.cbdPercentage < 0 || item.cbdPercentage > 100)) { - logger_1.logger.warn('pipeline', `Invalid CBD for ${item.name}: ${item.cbdPercentage}`); - item.cbdPercentage = undefined; - } - return item; - } -} -exports.ValidationPipeline = ValidationPipeline; -/** - * Sanitization Pipeline - cleans and normalizes data - */ -class SanitizationPipeline { - name = 'SanitizationPipeline'; - priority = 90; - async process(item, spider) { - // Truncate long strings - if (item.name) { - item.name = item.name.substring(0, 500).trim(); - } - if (item.description) { - item.description = item.description.substring(0, 5000).trim(); - } - if (item.brand) { - item.brand = item.brand.substring(0, 255).trim(); - } - if (item.weight) { - item.weight = item.weight.substring(0, 100).trim(); - } - // Normalize strain type - if (item.strainType) { - const normalized = item.strainType.toLowerCase(); - if (normalized.includes('indica')) { - item.strainType = 
'Indica'; - } - else if (normalized.includes('sativa')) { - item.strainType = 'Sativa'; - } - else if (normalized.includes('hybrid')) { - item.strainType = 'Hybrid'; - } - else { - item.strainType = undefined; - } - } - // Clean up metadata - if (item.metadata) { - // Remove empty arrays - Object.keys(item.metadata).forEach(key => { - if (Array.isArray(item.metadata[key]) && item.metadata[key].length === 0) { - delete item.metadata[key]; - } - }); - } - return item; - } -} -exports.SanitizationPipeline = SanitizationPipeline; -/** - * Deduplication Pipeline - prevents duplicate items - */ -class DeduplicationPipeline { - name = 'DeduplicationPipeline'; - priority = 80; - seen = new Set(); - async process(item, spider) { - const fingerprint = `${item.dutchieProductId}`; - if (this.seen.has(fingerprint)) { - logger_1.logger.debug('pipeline', `Duplicate product detected: ${item.name}`); - return null; - } - this.seen.add(fingerprint); - return item; - } - clear() { - this.seen.clear(); - } -} -exports.DeduplicationPipeline = DeduplicationPipeline; -/** - * Image Processing Pipeline - handles image downloads - */ -class ImagePipeline { - name = 'ImagePipeline'; - priority = 70; - extractImageId(url) { - try { - const match = url.match(/images\.dutchie\.com\/([a-f0-9]+)/i); - return match ? 
match[1] : null; - } - catch (e) { - return null; - } - } - getFullSizeImageUrl(imageUrl) { - const imageId = this.extractImageId(imageUrl); - if (!imageId) - return imageUrl; - return `https://images.dutchie.com/${imageId}?auto=format&fit=max&q=95&w=2000&h=2000`; - } - async process(item, spider) { - if (item.imageUrl) { - // Convert to full-size URL - item.imageUrl = this.getFullSizeImageUrl(item.imageUrl); - } - return item; - } -} -exports.ImagePipeline = ImagePipeline; -/** - * Generate a URL-safe slug from a product name - */ -function generateSlug(name) { - return name - .toLowerCase() - .replace(/[^a-z0-9]+/g, '-') - .replace(/^-+|-+$/g, '') - .substring(0, 400); -} -/** - * Database Pipeline - saves items to database with improved matching - * - * MATCHING PRIORITY: - * 1. external_id (dutchie_product_id) - exact match - * 2. normalized name + brand + category - strong match - * 3. normalized name + category - weak match (same product, different/missing brand) - * - * ALWAYS creates a snapshot after upsert for historical tracking. 
- */ -class DatabasePipeline { - name = 'DatabasePipeline'; - priority = 10; // Low priority - runs last - crawlId = null; - setCrawlId(id) { - this.crawlId = id; - } - async process(item, spider) { - const client = await migrate_1.pool.connect(); - try { - // Extract store and category from metadata (set by spider) - const storeId = item.storeId; - const categoryId = item.categoryId; - const dispensaryId = item.dispensaryId; - const categoryName = item.categoryName; - // Generate normalized values for matching - const nameNormalized = (0, product_normalizer_1.normalizeProductName)(item.name); - const brandNormalized = (0, product_normalizer_1.normalizeBrandName)(item.brand); - const slug = generateSlug(item.name); - const externalId = item.dutchieProductId || null; - if (!storeId || !categoryId) { - logger_1.logger.error('pipeline', `Missing storeId or categoryId for ${item.name}`); - return null; - } - let productId = null; - let localImagePath = null; - let isNewProduct = false; - // STEP 1: Try to match by external_id (most reliable) - if (externalId) { - const extMatch = await client.query(` - SELECT id, image_url, local_image_path - FROM products - WHERE store_id = $1 AND (external_id = $2 OR dutchie_product_id = $2) - `, [storeId, externalId]); - if (extMatch.rows.length > 0) { - productId = extMatch.rows[0].id; - localImagePath = extMatch.rows[0].local_image_path; - logger_1.logger.debug('pipeline', `Matched by external_id: ${item.name}`); - } - } - // STEP 2: Try to match by normalized name + brand + category - if (!productId) { - const normMatch = await client.query(` - SELECT id, image_url, local_image_path - FROM products - WHERE store_id = $1 - AND name_normalized = $2 - AND brand_normalized = $3 - AND category_id = $4 - `, [storeId, nameNormalized, brandNormalized, categoryId]); - if (normMatch.rows.length > 0) { - productId = normMatch.rows[0].id; - localImagePath = normMatch.rows[0].local_image_path; - logger_1.logger.debug('pipeline', `Matched by 
normalized name+brand+category: ${item.name}`); - } - } - // STEP 3: Fallback to normalized name + category only (weaker match) - if (!productId) { - const weakMatch = await client.query(` - SELECT id, image_url, local_image_path - FROM products - WHERE store_id = $1 - AND name_normalized = $2 - AND category_id = $3 - LIMIT 1 - `, [storeId, nameNormalized, categoryId]); - if (weakMatch.rows.length === 1) { - productId = weakMatch.rows[0].id; - localImagePath = weakMatch.rows[0].local_image_path; - logger_1.logger.debug('pipeline', `Matched by normalized name+category: ${item.name}`); - } - } - // STEP 4: Final fallback - exact name match (legacy compatibility) - if (!productId) { - const exactMatch = await client.query(` - SELECT id, image_url, local_image_path - FROM products - WHERE store_id = $1 AND name = $2 AND category_id = $3 - `, [storeId, item.name, categoryId]); - if (exactMatch.rows.length > 0) { - productId = exactMatch.rows[0].id; - localImagePath = exactMatch.rows[0].local_image_path; - logger_1.logger.debug('pipeline', `Matched by exact name: ${item.name}`); - } - } - // UPDATE or INSERT - if (productId) { - // Update existing product - await client.query(` - UPDATE products - SET name = $1, description = $2, price = $3, - strain_type = $4, thc_percentage = $5, cbd_percentage = $6, - brand = $7, weight = $8, image_url = COALESCE($9, image_url), dutchie_url = $10, - in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP, - updated_at = CURRENT_TIMESTAMP, dispensary_id = $13, slug = COALESCE(slug, $14), - name_normalized = $15, brand_normalized = $16, - external_id = COALESCE(external_id, $17), source_platform = COALESCE(source_platform, 'dutchie') - WHERE id = $12 - `, [ - item.name, item.description, item.price, - item.strainType, item.thcPercentage, item.cbdPercentage, - item.brand, item.weight, item.imageUrl, item.dutchieUrl, - JSON.stringify(item.metadata || {}), productId, dispensaryId, slug, - nameNormalized, brandNormalized, 
externalId - ]); - logger_1.logger.debug('pipeline', `Updated product: ${item.name}`); - } - else { - // Insert new product - isNewProduct = true; - const insertResult = await client.query(` - INSERT INTO products ( - store_id, category_id, dispensary_id, dutchie_product_id, external_id, - slug, name, name_normalized, description, - price, strain_type, thc_percentage, cbd_percentage, - brand, brand_normalized, weight, image_url, dutchie_url, in_stock, metadata, - source_platform - ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, true, $19, 'dutchie') - RETURNING id - `, [ - storeId, categoryId, dispensaryId, externalId, externalId, - slug, item.name, nameNormalized, item.description, - item.price, item.strainType, item.thcPercentage, item.cbdPercentage, - item.brand, brandNormalized, item.weight, item.imageUrl, item.dutchieUrl, - JSON.stringify(item.metadata || {}) - ]); - productId = insertResult.rows[0].id; - logger_1.logger.debug('pipeline', `Inserted NEW product: ${item.name}`); - } - // ALWAYS create a snapshot for historical tracking - await this.createSnapshot(client, { - productId: productId, - dispensaryId, - externalId, - slug, - item, - categoryName - }); - // Download image if needed (only for new products or missing local image) - if (item.imageUrl && !localImagePath && productId) { - try { - const storeResult = await client.query('SELECT slug FROM stores WHERE id = $1', [storeId]); - const storeSlug = storeResult.rows[0]?.slug || undefined; - const imageSizes = await (0, minio_1.uploadImageFromUrl)(item.imageUrl, productId, storeSlug); - localImagePath = imageSizes.thumbnail; - await client.query(` - UPDATE products SET local_image_path = $1 WHERE id = $2 - `, [imageSizes.thumbnail, productId]); - logger_1.logger.debug('pipeline', `Downloaded image for: ${item.name}`); - } - catch (error) { - logger_1.logger.error('pipeline', `Failed to download image for ${item.name}: ${error}`); - } - } - // Attach metadata 
for stats tracking - item.isNewProduct = isNewProduct; - item.productId = productId; - return item; - } - catch (error) { - logger_1.logger.error('pipeline', `Failed to save product ${item.name}: ${error}`); - return null; - } - finally { - client.release(); - } - } - /** - * Create a snapshot record for historical tracking - */ - async createSnapshot(client, params) { - try { - // Only create snapshots if the table exists (graceful degradation) - const tableExists = await client.query(` - SELECT EXISTS ( - SELECT FROM information_schema.tables - WHERE table_name = 'product_snapshots' - ) - `); - if (!tableExists.rows[0].exists) { - return; // Snapshot table not yet created - } - const crawlId = this.crawlId || crypto.randomUUID(); - const { productId, dispensaryId, externalId, slug, item, categoryName } = params; - await client.query(` - INSERT INTO product_snapshots ( - crawl_id, dispensary_id, external_product_id, product_slug, - name, brand, category, price, original_price, sale_price, - discount_type, discount_value, availability_status, stock_quantity, - thc_percentage, cbd_percentage, strain_type, weight, variant, - description, image_url, effects, terpenes, captured_at - ) VALUES ( - $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, NOW() - ) - `, [ - crawlId, - dispensaryId, - externalId, - slug, - item.name, - item.brand || null, - categoryName || null, - item.price || null, - item.originalPrice || null, - item.metadata?.salePrice || null, - item.metadata?.discountType || null, - item.metadata?.discountValue || null, - 'in_stock', // availability_status - if we scraped it, it's in stock - item.metadata?.stockQuantity || null, - item.thcPercentage || null, - item.cbdPercentage || null, - item.strainType || null, - item.weight || null, - item.metadata?.variant || null, - item.description || null, - item.imageUrl || null, - item.metadata?.effects || null, - item.metadata?.terpenes || null - ]); - } - 
catch (error) { - // Don't fail the whole pipeline if snapshot creation fails - logger_1.logger.warn('pipeline', `Failed to create snapshot for ${params.item.name}: ${error}`); - } - } -} -exports.DatabasePipeline = DatabasePipeline; -/** - * Stats Pipeline - tracks statistics - */ -class StatsPipeline { - name = 'StatsPipeline'; - priority = 50; - stats = { - total: 0, - withImages: 0, - withThc: 0, - withCbd: 0, - withDescription: 0 - }; - async process(item, spider) { - this.stats.total++; - if (item.imageUrl) - this.stats.withImages++; - if (item.thcPercentage) - this.stats.withThc++; - if (item.cbdPercentage) - this.stats.withCbd++; - if (item.description) - this.stats.withDescription++; - return item; - } - getStats() { - return { ...this.stats }; - } - clear() { - this.stats = { - total: 0, - withImages: 0, - withThc: 0, - withCbd: 0, - withDescription: 0 - }; - } -} -exports.StatsPipeline = StatsPipeline; -/** - * Pipeline Engine - orchestrates all pipelines - */ -class PipelineEngine { - pipelines = []; - use(pipeline) { - this.pipelines.push(pipeline); - // Sort by priority (higher first) - this.pipelines.sort((a, b) => b.priority - a.priority); - } - async processItem(item, spider) { - let current = item; - for (const pipeline of this.pipelines) { - try { - current = await pipeline.process(current, spider); - if (!current) { - // Item was filtered out - logger_1.logger.debug('pipeline', `Item filtered by ${pipeline.name}`); - return null; - } - } - catch (error) { - logger_1.logger.error('pipeline', `Error in ${pipeline.name}: ${error}`); - // Continue with other pipelines - } - } - return current; - } - getPipeline(name) { - return this.pipelines.find(p => p.name === name); - } -} -exports.PipelineEngine = PipelineEngine; diff --git a/backend/dist/scraper-v2/scheduler.js b/backend/dist/scraper-v2/scheduler.js deleted file mode 100644 index cb911427..00000000 --- a/backend/dist/scraper-v2/scheduler.js +++ /dev/null @@ -1,136 +0,0 @@ -"use strict"; -var 
__importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.RequestScheduler = void 0; -const logger_1 = require("../services/logger"); -const crypto_1 = __importDefault(require("crypto")); -class RequestScheduler { - queue = []; - inProgress = new Set(); - seen = new Set(); - deduplicationEnabled = true; - constructor(deduplicationEnabled = true) { - this.deduplicationEnabled = deduplicationEnabled; - } - /** - * Generate fingerprint for request deduplication - */ - generateFingerprint(request) { - if (request.fingerprint) { - return request.fingerprint; - } - // Generate fingerprint based on URL and relevant metadata - const data = { - url: request.url, - method: request.metadata?.method || 'GET', - body: request.metadata?.body - }; - return crypto_1.default.createHash('md5').update(JSON.stringify(data)).digest('hex'); - } - /** - * Add a request to the queue - */ - enqueue(partialRequest) { - if (!partialRequest.url) { - logger_1.logger.warn('scraper', 'Cannot enqueue request without URL'); - return false; - } - const fingerprint = this.generateFingerprint(partialRequest); - // Check for duplicates - if (this.deduplicationEnabled && this.seen.has(fingerprint)) { - logger_1.logger.debug('scraper', `Request already seen: ${partialRequest.url}`); - return false; - } - // Create full request with defaults - const request = { - url: partialRequest.url, - priority: partialRequest.priority ?? 0, - retryCount: partialRequest.retryCount ?? 0, - maxRetries: partialRequest.maxRetries ?? 
3, - metadata: partialRequest.metadata || {}, - callback: partialRequest.callback, - errorHandler: partialRequest.errorHandler, - fingerprint - }; - this.queue.push(request); - this.seen.add(fingerprint); - // Sort by priority (higher priority first) - this.queue.sort((a, b) => b.priority - a.priority); - logger_1.logger.debug('scraper', `Enqueued: ${request.url} (priority: ${request.priority})`); - return true; - } - /** - * Get the next request from the queue - */ - dequeue() { - const request = this.queue.shift(); - if (request) { - this.inProgress.add(request.fingerprint); - } - return request || null; - } - /** - * Mark a request as complete - */ - markComplete(request) { - if (request.fingerprint) { - this.inProgress.delete(request.fingerprint); - } - } - /** - * Requeue a failed request (for retry) - */ - requeueForRetry(request) { - if (request.fingerprint) { - this.inProgress.delete(request.fingerprint); - this.seen.delete(request.fingerprint); - } - request.retryCount++; - if (request.retryCount > request.maxRetries) { - logger_1.logger.warn('scraper', `Max retries exceeded for: ${request.url}`); - return false; - } - // Decrease priority for retried requests - request.priority = Math.max(0, request.priority - 1); - return this.enqueue(request); - } - /** - * Get queue stats - */ - getStats() { - return { - pending: this.queue.length, - inProgress: this.inProgress.size, - total: this.seen.size - }; - } - /** - * Check if queue is empty - */ - isEmpty() { - return this.queue.length === 0 && this.inProgress.size === 0; - } - /** - * Clear all queues - */ - clear() { - this.queue = []; - this.inProgress.clear(); - this.seen.clear(); - } - /** - * Get pending requests count - */ - getPendingCount() { - return this.queue.length; - } - /** - * Get in-progress count - */ - getInProgressCount() { - return this.inProgress.size; - } -} -exports.RequestScheduler = RequestScheduler; diff --git a/backend/dist/scraper-v2/types.js b/backend/dist/scraper-v2/types.js 
deleted file mode 100644 index 740be005..00000000 --- a/backend/dist/scraper-v2/types.js +++ /dev/null @@ -1,13 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.ErrorType = void 0; -var ErrorType; -(function (ErrorType) { - ErrorType["NETWORK_ERROR"] = "NETWORK_ERROR"; - ErrorType["TIMEOUT"] = "TIMEOUT"; - ErrorType["PARSE_ERROR"] = "PARSE_ERROR"; - ErrorType["BOT_DETECTION"] = "BOT_DETECTION"; - ErrorType["NOT_FOUND"] = "NOT_FOUND"; - ErrorType["SERVER_ERROR"] = "SERVER_ERROR"; - ErrorType["UNKNOWN"] = "UNKNOWN"; -})(ErrorType || (exports.ErrorType = ErrorType = {})); diff --git a/backend/dist/scrapers/dutchie-graphql-direct.js b/backend/dist/scrapers/dutchie-graphql-direct.js deleted file mode 100644 index d8710717..00000000 --- a/backend/dist/scrapers/dutchie-graphql-direct.js +++ /dev/null @@ -1,360 +0,0 @@ -"use strict"; -// ============================================================================ -// DEPRECATED: This scraper writes to the LEGACY products table. -// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline. -// -// New pipeline location: src/dutchie-az/services/product-crawler.ts -// - Uses fetch-based GraphQL (no Puppeteer needed) -// - Writes to isolated dutchie_az_* tables with snapshot model -// - Tracks stockStatus, isPresentInFeed, missing_from_feed -// ============================================================================ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.fetchAllDutchieProducts = fetchAllDutchieProducts; -exports.upsertProductsDirect = upsertProductsDirect; -exports.scrapeAllDutchieProducts = scrapeAllDutchieProducts; -/** - * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead. - * This scraper writes to the legacy products table, not the new dutchie_az tables. 
- * - * Makes direct GraphQL requests from within the browser context to: - * 1. Bypass Cloudflare (using browser session) - * 2. Fetch ALL products including out-of-stock (Status: null) - * 3. Paginate through complete menu - */ -const puppeteer_extra_1 = __importDefault(require("puppeteer-extra")); -const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); -const dutchie_graphql_1 = require("./dutchie-graphql"); -puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)()); -// GraphQL persisted query hashes -const GRAPHQL_HASHES = { - FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0', - GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b', -}; -/** - * Fetch all products via in-page GraphQL requests - * This includes both in-stock and out-of-stock items - */ -async function fetchAllDutchieProducts(menuUrl, options = {}) { - const { headless = 'new', timeout = 90000, perPage = 100, includeOutOfStock = true, } = options; - let browser; - try { - browser = await puppeteer_extra_1.default.launch({ - headless, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-blink-features=AutomationControlled', - ], - }); - const page = await browser.newPage(); - // Stealth configuration - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); - await page.setViewport({ width: 1920, height: 1080 }); - await page.evaluateOnNewDocument(() => { - Object.defineProperty(navigator, 'webdriver', { get: () => false }); - window.chrome = { runtime: {} }; - }); - // Navigate to menu page to establish session - console.log('[DutchieGraphQL] Loading menu page to establish session...'); - await page.goto(menuUrl, { - waitUntil: 'networkidle2', - timeout, - }); - // Get dispensary ID from page - const dispensaryId = await 
page.evaluate(() => { - const env = window.reactEnv; - return env?.dispensaryId || env?.retailerId || ''; - }); - if (!dispensaryId) { - throw new Error('Could not determine dispensaryId from page'); - } - console.log(`[DutchieGraphQL] Dispensary ID: ${dispensaryId}`); - // Fetch all products via in-page GraphQL requests - const allProducts = []; - let page_num = 0; - let hasMore = true; - while (hasMore) { - console.log(`[DutchieGraphQL] Fetching page ${page_num} (perPage=${perPage})...`); - const result = await page.evaluate(async (dispensaryId, page_num, perPage, includeOutOfStock, hash) => { - const variables = { - includeEnterpriseSpecials: false, - productsFilter: { - dispensaryId, - pricingType: 'rec', - Status: includeOutOfStock ? null : 'Active', // null = include out-of-stock - types: [], - useCache: false, // Don't cache to get fresh data - isDefaultSort: true, - sortBy: 'popularSortIdx', - sortDirection: 1, - bypassOnlineThresholds: true, - isKioskMenu: false, - removeProductsBelowOptionThresholds: false, - }, - page: page_num, - perPage, - }; - const qs = new URLSearchParams({ - operationName: 'FilteredProducts', - variables: JSON.stringify(variables), - extensions: JSON.stringify({ - persistedQuery: { version: 1, sha256Hash: hash }, - }), - }); - const response = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, { - method: 'GET', - headers: { - 'content-type': 'application/json', - 'apollographql-client-name': 'Marketplace (production)', - }, - credentials: 'include', // Include cookies/session - }); - if (!response.ok) { - throw new Error(`HTTP ${response.status}`); - } - return response.json(); - }, dispensaryId, page_num, perPage, includeOutOfStock, GRAPHQL_HASHES.FilteredProducts); - if (result.errors) { - console.error('[DutchieGraphQL] GraphQL errors:', result.errors); - break; - } - const products = result?.data?.filteredProducts?.products || []; - console.log(`[DutchieGraphQL] Page ${page_num}: ${products.length} products`); - if 
(products.length === 0) { - hasMore = false; - } - else { - allProducts.push(...products); - page_num++; - // Safety limit - if (page_num > 50) { - console.log('[DutchieGraphQL] Reached page limit, stopping'); - hasMore = false; - } - } - } - // Count active vs inactive - const activeCount = allProducts.filter((p) => p.Status === 'Active').length; - const inactiveCount = allProducts.filter((p) => p.Status !== 'Active').length; - console.log(`[DutchieGraphQL] Total: ${allProducts.length} products (${activeCount} active, ${inactiveCount} inactive)`); - return { - products: allProducts, - dispensaryId, - totalProducts: allProducts.length, - activeCount, - inactiveCount, - }; - } - finally { - if (browser) { - await browser.close(); - } - } -} -/** - * Upsert products to database - */ -async function upsertProductsDirect(pool, storeId, products) { - const client = await pool.connect(); - let inserted = 0; - let updated = 0; - try { - await client.query('BEGIN'); - for (const product of products) { - const result = await client.query(` - INSERT INTO products ( - store_id, external_id, slug, name, enterprise_product_id, - brand, brand_external_id, brand_logo_url, - subcategory, strain_type, canonical_category, - price, rec_price, med_price, rec_special_price, med_special_price, - is_on_special, special_name, discount_percent, special_data, - sku, inventory_quantity, inventory_available, is_below_threshold, status, - thc_percentage, cbd_percentage, cannabinoids, - weight_mg, net_weight_value, net_weight_unit, options, raw_options, - image_url, additional_images, - is_featured, medical_only, rec_only, - source_created_at, source_updated_at, - description, raw_data, - dutchie_url, last_seen_at, updated_at - ) - VALUES ( - $1, $2, $3, $4, $5, - $6, $7, $8, - $9, $10, $11, - $12, $13, $14, $15, $16, - $17, $18, $19, $20, - $21, $22, $23, $24, $25, - $26, $27, $28, - $29, $30, $31, $32, $33, - $34, $35, - $36, $37, $38, - $39, $40, - $41, $42, - '', NOW(), NOW() - ) - ON 
CONFLICT (store_id, slug) DO UPDATE SET - name = EXCLUDED.name, - enterprise_product_id = EXCLUDED.enterprise_product_id, - brand = EXCLUDED.brand, - brand_external_id = EXCLUDED.brand_external_id, - brand_logo_url = EXCLUDED.brand_logo_url, - subcategory = EXCLUDED.subcategory, - strain_type = EXCLUDED.strain_type, - canonical_category = EXCLUDED.canonical_category, - price = EXCLUDED.price, - rec_price = EXCLUDED.rec_price, - med_price = EXCLUDED.med_price, - rec_special_price = EXCLUDED.rec_special_price, - med_special_price = EXCLUDED.med_special_price, - is_on_special = EXCLUDED.is_on_special, - special_name = EXCLUDED.special_name, - discount_percent = EXCLUDED.discount_percent, - special_data = EXCLUDED.special_data, - sku = EXCLUDED.sku, - inventory_quantity = EXCLUDED.inventory_quantity, - inventory_available = EXCLUDED.inventory_available, - is_below_threshold = EXCLUDED.is_below_threshold, - status = EXCLUDED.status, - thc_percentage = EXCLUDED.thc_percentage, - cbd_percentage = EXCLUDED.cbd_percentage, - cannabinoids = EXCLUDED.cannabinoids, - weight_mg = EXCLUDED.weight_mg, - net_weight_value = EXCLUDED.net_weight_value, - net_weight_unit = EXCLUDED.net_weight_unit, - options = EXCLUDED.options, - raw_options = EXCLUDED.raw_options, - image_url = EXCLUDED.image_url, - additional_images = EXCLUDED.additional_images, - is_featured = EXCLUDED.is_featured, - medical_only = EXCLUDED.medical_only, - rec_only = EXCLUDED.rec_only, - source_created_at = EXCLUDED.source_created_at, - source_updated_at = EXCLUDED.source_updated_at, - description = EXCLUDED.description, - raw_data = EXCLUDED.raw_data, - last_seen_at = NOW(), - updated_at = NOW() - RETURNING (xmax = 0) AS was_inserted - `, [ - storeId, - product.external_id, - product.slug, - product.name, - product.enterprise_product_id, - product.brand, - product.brand_external_id, - product.brand_logo_url, - product.subcategory, - product.strain_type, - product.canonical_category, - product.price, - 
product.rec_price, - product.med_price, - product.rec_special_price, - product.med_special_price, - product.is_on_special, - product.special_name, - product.discount_percent, - product.special_data ? JSON.stringify(product.special_data) : null, - product.sku, - product.inventory_quantity, - product.inventory_available, - product.is_below_threshold, - product.status, - product.thc_percentage, - product.cbd_percentage, - product.cannabinoids ? JSON.stringify(product.cannabinoids) : null, - product.weight_mg, - product.net_weight_value, - product.net_weight_unit, - product.options, - product.raw_options, - product.image_url, - product.additional_images, - product.is_featured, - product.medical_only, - product.rec_only, - product.source_created_at, - product.source_updated_at, - product.description, - product.raw_data ? JSON.stringify(product.raw_data) : null, - ]); - if (result.rows[0]?.was_inserted) { - inserted++; - } - else { - updated++; - } - } - await client.query('COMMIT'); - return { inserted, updated }; - } - catch (error) { - await client.query('ROLLBACK'); - throw error; - } - finally { - client.release(); - } -} -/** - * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead. - * This function is disabled and will throw an error if called. - * Main entry point - scrape all products including out-of-stock - */ -async function scrapeAllDutchieProducts(pool, storeId, menuUrl) { - // DEPRECATED: Throw error to prevent accidental use - throw new Error('DEPRECATED: scrapeAllDutchieProducts() is deprecated. ' + - 'Use src/dutchie-az/services/product-crawler.ts instead. 
' + - 'This scraper writes to the legacy products table.'); - // Original code below is unreachable but kept for reference - try { - console.log(`[DutchieGraphQL] Scraping ALL products (including out-of-stock): ${menuUrl}`); - // Fetch all products via direct GraphQL - const { products, totalProducts, activeCount, inactiveCount } = await fetchAllDutchieProducts(menuUrl, { - includeOutOfStock: true, - perPage: 100, - }); - if (products.length === 0) { - return { - success: false, - totalProducts: 0, - activeCount: 0, - inactiveCount: 0, - inserted: 0, - updated: 0, - error: 'No products returned from GraphQL', - }; - } - // Normalize products - const normalized = products.map(dutchie_graphql_1.normalizeDutchieProduct); - // Upsert to database - const { inserted, updated } = await upsertProductsDirect(pool, storeId, normalized); - console.log(`[DutchieGraphQL] Complete: ${totalProducts} products (${activeCount} active, ${inactiveCount} inactive)`); - console.log(`[DutchieGraphQL] Database: ${inserted} inserted, ${updated} updated`); - return { - success: true, - totalProducts, - activeCount, - inactiveCount, - inserted, - updated, - }; - } - catch (error) { - console.error(`[DutchieGraphQL] Error:`, error.message); - return { - success: false, - totalProducts: 0, - activeCount: 0, - inactiveCount: 0, - inserted: 0, - updated: 0, - error: error.message, - }; - } -} diff --git a/backend/dist/scrapers/dutchie-graphql.js b/backend/dist/scrapers/dutchie-graphql.js deleted file mode 100644 index d1dab343..00000000 --- a/backend/dist/scrapers/dutchie-graphql.js +++ /dev/null @@ -1,446 +0,0 @@ -"use strict"; -// ============================================================================ -// DEPRECATED: This scraper writes to the LEGACY products table. -// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline. 
-// -// New pipeline location: src/dutchie-az/services/product-crawler.ts -// - Uses fetch-based GraphQL (no Puppeteer needed) -// - Writes to isolated dutchie_az_* tables with snapshot model -// - Tracks stockStatus, isPresentInFeed, missing_from_feed -// -// The normalizer functions in this file (normalizeDutchieProduct) may still -// be imported for reference, but do NOT call scrapeDutchieMenu() or upsertProducts(). -// ============================================================================ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.normalizeDutchieProduct = normalizeDutchieProduct; -exports.fetchDutchieMenuViaPuppeteer = fetchDutchieMenuViaPuppeteer; -exports.upsertProducts = upsertProducts; -exports.scrapeDutchieMenu = scrapeDutchieMenu; -/** - * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead. - * This scraper writes to the legacy products table, not the new dutchie_az tables. - * - * Fetches product data via Puppeteer interception of Dutchie's GraphQL API. - * This bypasses Cloudflare by using a real browser to load the menu page. 
- * - * GraphQL Operations: - * - FilteredProducts: Returns paginated product list with full details - * - GetAddressBasedDispensaryData: Resolves dispensary cName to dispensaryId - */ -const puppeteer_extra_1 = __importDefault(require("puppeteer-extra")); -const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); -puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)()); -// ===================================================== -// NORMALIZER: Dutchie GraphQL → DB Schema -// ===================================================== -function normalizeDutchieProduct(product) { - // Extract first special if exists - const saleSpecial = product.specialData?.saleSpecials?.[0]; - // Calculate inventory from POSMetaData children - const children = product.POSMetaData?.children || []; - const totalQuantity = children.reduce((sum, c) => sum + (c.quantity || 0), 0); - const availableQuantity = children.reduce((sum, c) => sum + (c.quantityAvailable || 0), 0); - // Parse timestamps - let sourceCreatedAt; - if (product.createdAt) { - // createdAt is a timestamp string like "1729044510543" - const ts = parseInt(product.createdAt, 10); - if (!isNaN(ts)) { - sourceCreatedAt = new Date(ts); - } - } - let sourceUpdatedAt; - if (product.updatedAt) { - sourceUpdatedAt = new Date(product.updatedAt); - } - return { - // Identity - external_id: product._id || product.id, - slug: product.cName, - name: product.Name, - enterprise_product_id: product.enterpriseProductId, - // Brand - brand: product.brandName || product.brand?.name, - brand_external_id: product.brandId || product.brand?.id, - brand_logo_url: product.brandLogo || product.brand?.imageUrl, - // Category - subcategory: product.subcategory, - strain_type: product.strainType, - canonical_category: product.POSMetaData?.canonicalCategory, - // Pricing - price: product.Prices?.[0], - rec_price: product.recPrices?.[0], - med_price: product.medicalPrices?.[0], - 
rec_special_price: product.recSpecialPrices?.[0], - med_special_price: product.medicalSpecialPrices?.[0], - // Specials - is_on_special: product.special === true, - special_name: saleSpecial?.specialName, - discount_percent: saleSpecial?.percentDiscount ? saleSpecial.discount : undefined, - special_data: product.specialData, - // Inventory - sku: product.POSMetaData?.canonicalSKU, - inventory_quantity: totalQuantity || undefined, - inventory_available: availableQuantity || undefined, - is_below_threshold: product.isBelowThreshold === true, - status: product.Status, - // Potency - thc_percentage: product.THCContent?.range?.[0], - cbd_percentage: product.CBDContent?.range?.[0], - cannabinoids: product.cannabinoidsV2, - // Weight/Options - weight_mg: product.weight, - net_weight_value: product.measurements?.netWeight?.values?.[0], - net_weight_unit: product.measurements?.netWeight?.unit, - options: product.Options, - raw_options: product.rawOptions, - // Images - image_url: product.Image, - additional_images: product.images?.length ? product.images : undefined, - // Flags - is_featured: product.featured === true, - medical_only: product.medicalOnly === true, - rec_only: product.recOnly === true, - // Timestamps - source_created_at: sourceCreatedAt, - source_updated_at: sourceUpdatedAt, - // Description - description: typeof product.description === 'string' ? 
product.description : undefined, - // Raw - raw_data: product, - }; -} -async function fetchDutchieMenuViaPuppeteer(menuUrl, options = {}) { - const { headless = 'new', timeout = 90000, maxScrolls = 30, // Increased for full menu capture - } = options; - let browser; - const capturedProducts = []; - let dispensaryId = ''; - try { - browser = await puppeteer_extra_1.default.launch({ - headless, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-blink-features=AutomationControlled', - ], - }); - const page = await browser.newPage(); - // Stealth configuration - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); - await page.setViewport({ width: 1920, height: 1080 }); - await page.evaluateOnNewDocument(() => { - Object.defineProperty(navigator, 'webdriver', { get: () => false }); - window.chrome = { runtime: {} }; - }); - // Track seen product IDs to avoid duplicates - const seenIds = new Set(); - // Intercept GraphQL responses - page.on('response', async (response) => { - const url = response.url(); - if (!url.includes('graphql')) - return; - try { - const contentType = response.headers()['content-type'] || ''; - if (!contentType.includes('application/json')) - return; - const data = await response.json(); - // Capture dispensary ID - if (data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId) { - dispensaryId = data.data.getAddressBasedDispensaryData.dispensaryData.dispensaryId; - } - // Capture products from FilteredProducts - if (data?.data?.filteredProducts?.products) { - const products = data.data.filteredProducts.products; - for (const product of products) { - if (!seenIds.has(product._id)) { - seenIds.add(product._id); - capturedProducts.push(product); - } - } - } - } - catch { - // Ignore parse errors - } - }); - // Navigate to menu - console.log('[DutchieGraphQL] Loading menu page...'); - await 
page.goto(menuUrl, { - waitUntil: 'networkidle2', - timeout, - }); - // Get dispensary ID from window.reactEnv if not captured - if (!dispensaryId) { - dispensaryId = await page.evaluate(() => { - const env = window.reactEnv; - return env?.dispensaryId || env?.retailerId || ''; - }); - } - // Helper function to scroll through a page until no more products load - async function scrollToLoadAll(maxScrollAttempts = maxScrolls) { - let scrollCount = 0; - let previousCount = 0; - let noNewProductsCount = 0; - while (scrollCount < maxScrollAttempts && noNewProductsCount < 3) { - await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); - await new Promise((r) => setTimeout(r, 1500)); - const currentCount = seenIds.size; - if (currentCount === previousCount) { - noNewProductsCount++; - } - else { - noNewProductsCount = 0; - } - previousCount = currentCount; - scrollCount++; - } - } - // First, scroll through the main page (all products) - console.log('[DutchieGraphQL] Scrolling main page...'); - await scrollToLoadAll(); - console.log(`[DutchieGraphQL] After main page: ${seenIds.size} products`); - // Get category links from the navigation - const categoryLinks = await page.evaluate(() => { - const links = []; - // Look for category navigation links - const navLinks = document.querySelectorAll('a[href*="/products/"]'); - navLinks.forEach((link) => { - const href = link.href; - if (href && !links.includes(href)) { - links.push(href); - } - }); - return links; - }); - console.log(`[DutchieGraphQL] Found ${categoryLinks.length} category links`); - // Visit each category page to capture all products - for (const categoryUrl of categoryLinks) { - try { - console.log(`[DutchieGraphQL] Visiting category: ${categoryUrl.split('/').pop()}`); - await page.goto(categoryUrl, { - waitUntil: 'networkidle2', - timeout: 30000, - }); - await scrollToLoadAll(15); // Fewer scrolls per category - console.log(`[DutchieGraphQL] Total products: ${seenIds.size}`); - } - catch (e) 
{ - console.log(`[DutchieGraphQL] Category error: ${e.message}`); - } - } - // Wait for any final responses - await new Promise((r) => setTimeout(r, 2000)); - return { - products: capturedProducts, - dispensaryId, - menuUrl, - }; - } - finally { - if (browser) { - await browser.close(); - } - } -} -// ===================================================== -// DATABASE OPERATIONS -// ===================================================== -async function upsertProducts(pool, storeId, products) { - const client = await pool.connect(); - let inserted = 0; - let updated = 0; - try { - await client.query('BEGIN'); - for (const product of products) { - // Upsert product - const result = await client.query(` - INSERT INTO products ( - store_id, external_id, slug, name, enterprise_product_id, - brand, brand_external_id, brand_logo_url, - subcategory, strain_type, canonical_category, - price, rec_price, med_price, rec_special_price, med_special_price, - is_on_special, special_name, discount_percent, special_data, - sku, inventory_quantity, inventory_available, is_below_threshold, status, - thc_percentage, cbd_percentage, cannabinoids, - weight_mg, net_weight_value, net_weight_unit, options, raw_options, - image_url, additional_images, - is_featured, medical_only, rec_only, - source_created_at, source_updated_at, - description, raw_data, - dutchie_url, last_seen_at, updated_at - ) - VALUES ( - $1, $2, $3, $4, $5, - $6, $7, $8, - $9, $10, $11, - $12, $13, $14, $15, $16, - $17, $18, $19, $20, - $21, $22, $23, $24, $25, - $26, $27, $28, - $29, $30, $31, $32, $33, - $34, $35, - $36, $37, $38, - $39, $40, - $41, $42, - '', NOW(), NOW() - ) - ON CONFLICT (store_id, slug) DO UPDATE SET - name = EXCLUDED.name, - enterprise_product_id = EXCLUDED.enterprise_product_id, - brand = EXCLUDED.brand, - brand_external_id = EXCLUDED.brand_external_id, - brand_logo_url = EXCLUDED.brand_logo_url, - subcategory = EXCLUDED.subcategory, - strain_type = EXCLUDED.strain_type, - canonical_category = 
EXCLUDED.canonical_category, - price = EXCLUDED.price, - rec_price = EXCLUDED.rec_price, - med_price = EXCLUDED.med_price, - rec_special_price = EXCLUDED.rec_special_price, - med_special_price = EXCLUDED.med_special_price, - is_on_special = EXCLUDED.is_on_special, - special_name = EXCLUDED.special_name, - discount_percent = EXCLUDED.discount_percent, - special_data = EXCLUDED.special_data, - sku = EXCLUDED.sku, - inventory_quantity = EXCLUDED.inventory_quantity, - inventory_available = EXCLUDED.inventory_available, - is_below_threshold = EXCLUDED.is_below_threshold, - status = EXCLUDED.status, - thc_percentage = EXCLUDED.thc_percentage, - cbd_percentage = EXCLUDED.cbd_percentage, - cannabinoids = EXCLUDED.cannabinoids, - weight_mg = EXCLUDED.weight_mg, - net_weight_value = EXCLUDED.net_weight_value, - net_weight_unit = EXCLUDED.net_weight_unit, - options = EXCLUDED.options, - raw_options = EXCLUDED.raw_options, - image_url = EXCLUDED.image_url, - additional_images = EXCLUDED.additional_images, - is_featured = EXCLUDED.is_featured, - medical_only = EXCLUDED.medical_only, - rec_only = EXCLUDED.rec_only, - source_created_at = EXCLUDED.source_created_at, - source_updated_at = EXCLUDED.source_updated_at, - description = EXCLUDED.description, - raw_data = EXCLUDED.raw_data, - last_seen_at = NOW(), - updated_at = NOW() - RETURNING (xmax = 0) AS was_inserted - `, [ - storeId, - product.external_id, - product.slug, - product.name, - product.enterprise_product_id, - product.brand, - product.brand_external_id, - product.brand_logo_url, - product.subcategory, - product.strain_type, - product.canonical_category, - product.price, - product.rec_price, - product.med_price, - product.rec_special_price, - product.med_special_price, - product.is_on_special, - product.special_name, - product.discount_percent, - product.special_data ? 
JSON.stringify(product.special_data) : null, - product.sku, - product.inventory_quantity, - product.inventory_available, - product.is_below_threshold, - product.status, - product.thc_percentage, - product.cbd_percentage, - product.cannabinoids ? JSON.stringify(product.cannabinoids) : null, - product.weight_mg, - product.net_weight_value, - product.net_weight_unit, - product.options, - product.raw_options, - product.image_url, - product.additional_images, - product.is_featured, - product.medical_only, - product.rec_only, - product.source_created_at, - product.source_updated_at, - product.description, - product.raw_data ? JSON.stringify(product.raw_data) : null, - ]); - if (result.rows[0]?.was_inserted) { - inserted++; - } - else { - updated++; - } - } - await client.query('COMMIT'); - return { inserted, updated }; - } - catch (error) { - await client.query('ROLLBACK'); - throw error; - } - finally { - client.release(); - } -} -// ===================================================== -// MAIN ENTRY POINT -// ===================================================== -/** - * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead. - * This function is disabled and will throw an error if called. - */ -async function scrapeDutchieMenu(pool, storeId, menuUrl) { - // DEPRECATED: Throw error to prevent accidental use - throw new Error('DEPRECATED: scrapeDutchieMenu() is deprecated. ' + - 'Use src/dutchie-az/services/product-crawler.ts instead. 
' + - 'This scraper writes to the legacy products table.'); - // Original code below is unreachable but kept for reference - try { - console.log(`[DutchieGraphQL] Scraping: ${menuUrl}`); - // Fetch products via Puppeteer - const { products, dispensaryId } = await fetchDutchieMenuViaPuppeteer(menuUrl); - console.log(`[DutchieGraphQL] Captured ${products.length} products, dispensaryId: ${dispensaryId}`); - if (products.length === 0) { - return { - success: false, - productsFound: 0, - inserted: 0, - updated: 0, - error: 'No products captured from GraphQL responses', - }; - } - // Normalize products - const normalized = products.map(normalizeDutchieProduct); - // Upsert to database - const { inserted, updated } = await upsertProducts(pool, storeId, normalized); - console.log(`[DutchieGraphQL] Upsert complete: ${inserted} inserted, ${updated} updated`); - return { - success: true, - productsFound: products.length, - inserted, - updated, - }; - } - catch (error) { - console.error(`[DutchieGraphQL] Error:`, error.message); - return { - success: false, - productsFound: 0, - inserted: 0, - updated: 0, - error: error.message, - }; - } -} diff --git a/backend/dist/scrapers/templates/dutchie.js b/backend/dist/scrapers/templates/dutchie.js deleted file mode 100644 index 54f1f96d..00000000 --- a/backend/dist/scrapers/templates/dutchie.js +++ /dev/null @@ -1,85 +0,0 @@ -"use strict"; -// ============================================================================ -// DEPRECATED: Dutchie now crawled via GraphQL only (see dutchie-az pipeline) -// DO NOT USE - This HTML scraper is unreliable and targets the legacy products table. 
-// All Dutchie crawling must go through: src/dutchie-az/services/product-crawler.ts -// ============================================================================ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.dutchieTemplate = void 0; -exports.getTemplateForUrl = getTemplateForUrl; -const logger_1 = require("../../services/logger"); -/** - * @deprecated DEPRECATED - Dutchie HTML scraping is no longer supported. - * Use the dutchie-az GraphQL pipeline instead: src/dutchie-az/services/product-crawler.ts - * This template relied on unstable DOM selectors and wrote to legacy tables. - */ -exports.dutchieTemplate = { - name: 'Dutchie Marketplace', - urlPattern: /dutchie\.com\/dispensary\//, - buildCategoryUrl: (baseUrl, category) => { - // Remove trailing slash - const base = baseUrl.replace(/\/$/, ''); - // Convert category name to URL-friendly slug - const categorySlug = category.toLowerCase().replace(/\s+/g, '-'); - return `${base}/products/${categorySlug}`; - }, - extractProducts: async (page) => { - const products = []; - try { - // Wait for product cards to load - await page.waitForSelector('a[data-testid="card-link"]', { timeout: 10000 }).catch(() => { - logger_1.logger.warn('scraper', 'No product cards found with data-testid="card-link"'); - }); - // Get all product card links - const productCards = await page.locator('a[href*="/product/"][data-testid="card-link"]').all(); - logger_1.logger.info('scraper', `Found ${productCards.length} Dutchie product cards`); - for (const card of productCards) { - try { - // Extract all data at once using evaluate for speed - const cardData = await card.evaluate((el) => { - const href = el.getAttribute('href') || ''; - const img = el.querySelector('img'); - const imageUrl = img ? 
img.getAttribute('src') || '' : ''; - // Get all text nodes in order - const textElements = Array.from(el.querySelectorAll('*')) - .filter(el => el.textContent && el.children.length === 0) - .map(el => (el.textContent || '').trim()) - .filter(text => text.length > 0); - const name = textElements[0] || ''; - const brand = textElements[1] || ''; - // Look for price - const priceMatch = el.textContent?.match(/\$(\d+(?:\.\d{2})?)/); - const price = priceMatch ? parseFloat(priceMatch[1]) : undefined; - return { href, imageUrl, name, brand, price }; - }); - if (cardData.name && cardData.href) { - products.push({ - name: cardData.name, - brand: cardData.brand || undefined, - product_url: cardData.href.startsWith('http') ? cardData.href : `https://dutchie.com${cardData.href}`, - image_url: cardData.imageUrl || undefined, - price: cardData.price, - in_stock: true, - }); - } - } - catch (err) { - logger_1.logger.warn('scraper', `Error extracting Dutchie product card: ${err}`); - } - } - } - catch (err) { - logger_1.logger.error('scraper', `Error in Dutchie product extraction: ${err}`); - } - return products; - }, -}; -/** - * Get the appropriate scraper template based on URL - */ -function getTemplateForUrl(url) { - if (exports.dutchieTemplate.urlPattern.test(url)) { - return exports.dutchieTemplate; - } - return null; -} diff --git a/backend/dist/scripts/backfill-store-dispensary.js b/backend/dist/scripts/backfill-store-dispensary.js deleted file mode 100644 index 4a9ea57a..00000000 --- a/backend/dist/scripts/backfill-store-dispensary.js +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env npx tsx -"use strict"; -/** - * Backfill Store-Dispensary Mapping - * - * Links existing stores (scheduler) to dispensaries (master AZDHS directory) - * by matching on name, city, and zip code. 
- * - * Usage: - * npx tsx src/scripts/backfill-store-dispensary.ts # Preview matches - * npx tsx src/scripts/backfill-store-dispensary.ts --apply # Apply matches - * npx tsx src/scripts/backfill-store-dispensary.ts --verbose # Show all match details - */ -Object.defineProperty(exports, "__esModule", { value: true }); -const migrate_1 = require("../db/migrate"); -const logger_1 = require("../services/logger"); -const args = process.argv.slice(2); -const flags = { - apply: args.includes('--apply'), - verbose: args.includes('--verbose'), - help: args.includes('--help') || args.includes('-h'), -}; -/** - * Normalize a store/dispensary name for comparison - * Removes common suffixes, punctuation, and extra whitespace - */ -function normalizeName(name) { - return name - .toLowerCase() - .replace(/\s*[-–—]\s*/g, ' ') // Normalize dashes to spaces - .replace(/\s*(dispensary|cannabis|marijuana|weed|shop|store|llc|inc)\s*/gi, ' ') - .replace(/['']/g, "'") // Normalize apostrophes - .replace(/[^\w\s']/g, '') // Remove other punctuation - .replace(/\s+/g, ' ') // Collapse whitespace - .trim(); -} -/** - * Simple Levenshtein distance for fuzzy matching - */ -function levenshteinDistance(a, b) { - const matrix = []; - for (let i = 0; i <= b.length; i++) { - matrix[i] = [i]; - } - for (let j = 0; j <= a.length; j++) { - matrix[0][j] = j; - } - for (let i = 1; i <= b.length; i++) { - for (let j = 1; j <= a.length; j++) { - if (b.charAt(i - 1) === a.charAt(j - 1)) { - matrix[i][j] = matrix[i - 1][j - 1]; - } - else { - matrix[i][j] = Math.min(matrix[i - 1][j - 1] + 1, // substitution - matrix[i][j - 1] + 1, // insertion - matrix[i - 1][j] + 1 // deletion - ); - } - } - } - return matrix[b.length][a.length]; -} -/** - * Calculate similarity score (0-100) - */ -function similarityScore(a, b) { - const maxLen = Math.max(a.length, b.length); - if (maxLen === 0) - return 100; - const distance = levenshteinDistance(a, b); - return Math.round((1 - distance / maxLen) * 100); -} -/** - * 
Find the best dispensary match for a store - */ -function findBestMatch(store, dispensaries) { - const normalizedStoreName = normalizeName(store.name); - const storeSlug = store.slug.toLowerCase(); - let bestMatch = { - store, - dispensary: null, - matchType: 'none', - score: 0, - }; - for (const disp of dispensaries) { - const normalizedDispName = normalizeName(disp.name); - const normalizedCompanyName = disp.company_name ? normalizeName(disp.company_name) : ''; - const dispSlug = disp.slug.toLowerCase(); - // 1. Exact name match (case-insensitive) - if (store.name.toLowerCase() === disp.name.toLowerCase()) { - return { - store, - dispensary: disp, - matchType: 'exact_name', - score: 100, - }; - } - // 2. Normalized name match - if (normalizedStoreName === normalizedDispName) { - return { - store, - dispensary: disp, - matchType: 'normalized_name', - score: 95, - }; - } - // 3. Store name matches company name - if (normalizedCompanyName && normalizedStoreName === normalizedCompanyName) { - return { - store, - dispensary: disp, - matchType: 'company_name', - score: 90, - }; - } - // 4. Slug match - if (storeSlug === dispSlug) { - return { - store, - dispensary: disp, - matchType: 'slug', - score: 85, - }; - } - // 5. Fuzzy matching (only if score > 70) - const nameScore = similarityScore(normalizedStoreName, normalizedDispName); - const companyScore = normalizedCompanyName - ? similarityScore(normalizedStoreName, normalizedCompanyName) - : 0; - const fuzzyScore = Math.max(nameScore, companyScore); - if (fuzzyScore > bestMatch.score && fuzzyScore >= 70) { - bestMatch = { - store, - dispensary: disp, - matchType: 'fuzzy', - score: fuzzyScore, - }; - } - } - return bestMatch; -} -async function main() { - if (flags.help) { - console.log(` -Backfill Store-Dispensary Mapping - -Links existing stores (scheduler) to dispensaries (master AZDHS directory) -by matching on name, company name, or slug similarity. 
- -USAGE: - npx tsx src/scripts/backfill-store-dispensary.ts [OPTIONS] - -OPTIONS: - --apply Apply the mappings to the database (default: preview only) - --verbose Show detailed match information for all stores - --help, -h Show this help message - -EXAMPLES: - # Preview what would be matched - npx tsx src/scripts/backfill-store-dispensary.ts - - # Apply the mappings - npx tsx src/scripts/backfill-store-dispensary.ts --apply - - # Show verbose output - npx tsx src/scripts/backfill-store-dispensary.ts --verbose -`); - process.exit(0); - } - console.log('\n📦 Backfill Store-Dispensary Mapping'); - console.log('=====================================\n'); - try { - // Fetch all stores without a dispensary_id - const storesResult = await migrate_1.pool.query(` - SELECT id, name, slug, dispensary_id - FROM stores - WHERE dispensary_id IS NULL - ORDER BY name - `); - const unmappedStores = storesResult.rows; - // Fetch all already-mapped stores for context - const mappedResult = await migrate_1.pool.query(` - SELECT id, name, slug, dispensary_id - FROM stores - WHERE dispensary_id IS NOT NULL - ORDER BY name - `); - const mappedStores = mappedResult.rows; - // Fetch all dispensaries - const dispResult = await migrate_1.pool.query(` - SELECT id, name, company_name, city, address, slug - FROM dispensaries - ORDER BY name - `); - const dispensaries = dispResult.rows; - console.log(`📊 Current Status:`); - console.log(` Stores without dispensary_id: ${unmappedStores.length}`); - console.log(` Stores already mapped: ${mappedStores.length}`); - console.log(` Total dispensaries: ${dispensaries.length}\n`); - if (unmappedStores.length === 0) { - console.log('✅ All stores are already mapped to dispensaries!\n'); - await migrate_1.pool.end(); - process.exit(0); - } - // Find matches for each unmapped store - const matches = []; - const noMatches = []; - for (const store of unmappedStores) { - const match = findBestMatch(store, dispensaries); - if (match.dispensary) { - 
matches.push(match); - } - else { - noMatches.push(store); - } - } - // Sort matches by score (highest first) - matches.sort((a, b) => b.score - a.score); - // Display results - console.log(`\n🔗 Matches Found: ${matches.length}`); - console.log('----------------------------------\n'); - if (matches.length > 0) { - // Group by match type - const byType = {}; - for (const m of matches) { - if (!byType[m.matchType]) - byType[m.matchType] = []; - byType[m.matchType].push(m); - } - const typeLabels = { - exact_name: '✅ Exact Name Match', - normalized_name: '✅ Normalized Name Match', - company_name: '🏢 Company Name Match', - slug: '🔗 Slug Match', - fuzzy: '🔍 Fuzzy Match', - }; - for (const [type, results] of Object.entries(byType)) { - console.log(`${typeLabels[type]} (${results.length}):`); - for (const r of results) { - const dispInfo = r.dispensary; - console.log(` • "${r.store.name}" → "${dispInfo.name}" (${dispInfo.city}) [${r.score}%]`); - } - console.log(''); - } - } - if (noMatches.length > 0) { - console.log(`\n❌ No Match Found: ${noMatches.length}`); - console.log('----------------------------------\n'); - for (const store of noMatches) { - console.log(` • "${store.name}" (slug: ${store.slug})`); - } - console.log(''); - } - // Apply if requested - if (flags.apply && matches.length > 0) { - console.log('\n🔧 Applying mappings...\n'); - let updated = 0; - for (const match of matches) { - if (!match.dispensary) - continue; - await migrate_1.pool.query('UPDATE stores SET dispensary_id = $1 WHERE id = $2', [match.dispensary.id, match.store.id]); - updated++; - if (flags.verbose) { - console.log(` ✓ Linked store ${match.store.id} to dispensary ${match.dispensary.id}`); - } - } - console.log(`\n✅ Updated ${updated} stores with dispensary mappings\n`); - logger_1.logger.info('system', `Backfill complete: linked ${updated} stores to dispensaries`); - } - else if (matches.length > 0 && !flags.apply) { - console.log('\n💡 Run with --apply to update the database\n'); - } - 
// Summary - console.log('📈 Summary:'); - console.log(` Would match: ${matches.length} stores`); - console.log(` No match: ${noMatches.length} stores`); - console.log(` Match rate: ${Math.round((matches.length / unmappedStores.length) * 100)}%\n`); - } - catch (error) { - console.error('Error:', error); - process.exit(1); - } - finally { - await migrate_1.pool.end(); - } -} -main().catch(console.error); diff --git a/backend/dist/scripts/bootstrap-discovery.js b/backend/dist/scripts/bootstrap-discovery.js deleted file mode 100644 index eac151f4..00000000 --- a/backend/dist/scripts/bootstrap-discovery.js +++ /dev/null @@ -1,332 +0,0 @@ -#!/usr/bin/env npx tsx -"use strict"; -/** - * Bootstrap Discovery Script - * - * One-time (but reusable) bootstrap command that: - * 1. Ensures every Dispensary has a dispensary_crawl_schedule entry (4h default) - * 2. Optionally runs RunDispensaryOrchestrator for each dispensary - * - * Usage: - * npx tsx src/scripts/bootstrap-discovery.ts # Create schedules only - * npx tsx src/scripts/bootstrap-discovery.ts --run # Create schedules + run orchestrator - * npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10 # Run for first 10 dispensaries - * npx tsx src/scripts/bootstrap-discovery.ts --dry-run # Preview what would happen - * npx tsx src/scripts/bootstrap-discovery.ts --status # Show current status only - */ -Object.defineProperty(exports, "__esModule", { value: true }); -const migrate_1 = require("../db/migrate"); -const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator"); -// Parse command line args -const args = process.argv.slice(2); -const flags = { - run: args.includes('--run'), - dryRun: args.includes('--dry-run'), - status: args.includes('--status'), - help: args.includes('--help') || args.includes('-h'), - limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '0'), - concurrency: parseInt(args.find(a => a.startsWith('--concurrency='))?.split('=')[1] || '3'), - interval: 
parseInt(args.find(a => a.startsWith('--interval='))?.split('=')[1] || '240'), - detectionOnly: args.includes('--detection-only'), - productionOnly: args.includes('--production-only'), - sandboxOnly: args.includes('--sandbox-only'), -}; -async function showHelp() { - console.log(` -Bootstrap Discovery - Initialize Dispensary Crawl System - -USAGE: - npx tsx src/scripts/bootstrap-discovery.ts [OPTIONS] - -OPTIONS: - --run After creating schedules, run the orchestrator for each dispensary - --dry-run Show what would happen without making changes - --status Show current status and exit - --limit=N Limit how many dispensaries to process (0 = all, default: 0) - --concurrency=N How many dispensaries to process in parallel (default: 3) - --interval=M Default interval in minutes for new schedules (default: 240 = 4 hours) - --detection-only Only run detection, don't crawl - --production-only Only run dispensaries in production mode - --sandbox-only Only run dispensaries in sandbox mode - --help, -h Show this help message - -EXAMPLES: - # Create schedule entries for all dispensaries (no crawling) - npx tsx src/scripts/bootstrap-discovery.ts - - # Create schedules and run orchestrator for all dispensaries - npx tsx src/scripts/bootstrap-discovery.ts --run - - # Run orchestrator for first 10 dispensaries - npx tsx src/scripts/bootstrap-discovery.ts --run --limit=10 - - # Run with higher concurrency - npx tsx src/scripts/bootstrap-discovery.ts --run --concurrency=5 - - # Show current status - npx tsx src/scripts/bootstrap-discovery.ts --status - -WHAT IT DOES: - 1. Creates dispensary_crawl_schedule entries for all dispensaries that don't have one - 2. If --run: For each dispensary, runs the orchestrator which: - a. Checks if provider detection is needed (null/unknown/stale/low confidence) - b. Runs detection if needed - c. If Dutchie + production mode: runs production crawl - d. Otherwise: runs sandbox crawl - 3. 
Updates schedule status and job records -`); -} -async function showStatus() { - console.log('\n📊 Current Dispensary Crawl Status\n'); - console.log('═'.repeat(70)); - // Get dispensary counts by provider - const providerStats = await migrate_1.pool.query(` - SELECT - COALESCE(product_provider, 'undetected') as provider, - COUNT(*) as count, - COUNT(*) FILTER (WHERE product_crawler_mode = 'production') as production, - COUNT(*) FILTER (WHERE product_crawler_mode = 'sandbox') as sandbox, - COUNT(*) FILTER (WHERE product_crawler_mode IS NULL) as no_mode - FROM dispensaries - GROUP BY COALESCE(product_provider, 'undetected') - ORDER BY count DESC - `); - console.log('\nProvider Distribution:'); - console.log('-'.repeat(60)); - console.log('Provider'.padEnd(20) + - 'Total'.padStart(8) + - 'Production'.padStart(12) + - 'Sandbox'.padStart(10) + - 'No Mode'.padStart(10)); - console.log('-'.repeat(60)); - for (const row of providerStats.rows) { - console.log(row.provider.padEnd(20) + - row.count.toString().padStart(8) + - row.production.toString().padStart(12) + - row.sandbox.toString().padStart(10) + - row.no_mode.toString().padStart(10)); - } - // Get schedule stats - const scheduleStats = await migrate_1.pool.query(` - SELECT - COUNT(DISTINCT d.id) as total_dispensaries, - COUNT(DISTINCT dcs.id) as with_schedule, - COUNT(DISTINCT d.id) - COUNT(DISTINCT dcs.id) as without_schedule, - COUNT(*) FILTER (WHERE dcs.is_active = TRUE) as active_schedules, - COUNT(*) FILTER (WHERE dcs.last_status = 'success') as last_success, - COUNT(*) FILTER (WHERE dcs.last_status = 'error') as last_error, - COUNT(*) FILTER (WHERE dcs.last_status = 'sandbox_only') as last_sandbox, - COUNT(*) FILTER (WHERE dcs.last_status = 'detection_only') as last_detection, - COUNT(*) FILTER (WHERE dcs.next_run_at <= NOW()) as due_now, - AVG(dcs.interval_minutes)::INTEGER as avg_interval - FROM dispensaries d - LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id - `); - const s = 
scheduleStats.rows[0]; - console.log('\n\nSchedule Status:'); - console.log('-'.repeat(60)); - console.log(` Total Dispensaries: ${s.total_dispensaries}`); - console.log(` With Schedule: ${s.with_schedule}`); - console.log(` Without Schedule: ${s.without_schedule}`); - console.log(` Active Schedules: ${s.active_schedules || 0}`); - console.log(` Average Interval: ${s.avg_interval || 240} minutes`); - console.log('\n Last Run Status:'); - console.log(` - Success: ${s.last_success || 0}`); - console.log(` - Error: ${s.last_error || 0}`); - console.log(` - Sandbox Only: ${s.last_sandbox || 0}`); - console.log(` - Detection Only: ${s.last_detection || 0}`); - console.log(` - Due Now: ${s.due_now || 0}`); - // Get recent job stats - const jobStats = await migrate_1.pool.query(` - SELECT - COUNT(*) as total, - COUNT(*) FILTER (WHERE status = 'completed') as completed, - COUNT(*) FILTER (WHERE status = 'failed') as failed, - COUNT(*) FILTER (WHERE status = 'running') as running, - COUNT(*) FILTER (WHERE status = 'pending') as pending, - COUNT(*) FILTER (WHERE detection_ran = TRUE) as with_detection, - COUNT(*) FILTER (WHERE crawl_ran = TRUE) as with_crawl, - COUNT(*) FILTER (WHERE crawl_type = 'production') as production_crawls, - COUNT(*) FILTER (WHERE crawl_type = 'sandbox') as sandbox_crawls, - SUM(products_found) as total_products_found - FROM dispensary_crawl_jobs - WHERE created_at > NOW() - INTERVAL '24 hours' - `); - const j = jobStats.rows[0]; - console.log('\n\nJobs (Last 24 Hours):'); - console.log('-'.repeat(60)); - console.log(` Total Jobs: ${j.total || 0}`); - console.log(` Completed: ${j.completed || 0}`); - console.log(` Failed: ${j.failed || 0}`); - console.log(` Running: ${j.running || 0}`); - console.log(` Pending: ${j.pending || 0}`); - console.log(` With Detection: ${j.with_detection || 0}`); - console.log(` With Crawl: ${j.with_crawl || 0}`); - console.log(` - Production: ${j.production_crawls || 0}`); - console.log(` - Sandbox: ${j.sandbox_crawls || 
0}`); - console.log(` Products Found: ${j.total_products_found || 0}`); - console.log('\n' + '═'.repeat(70) + '\n'); -} -async function createSchedules() { - console.log('\n📅 Creating Dispensary Schedules...\n'); - if (flags.dryRun) { - // Count how many would be created - const result = await migrate_1.pool.query(` - SELECT COUNT(*) as count - FROM dispensaries d - WHERE NOT EXISTS ( - SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id - ) - `); - const wouldCreate = parseInt(result.rows[0].count); - console.log(` Would create ${wouldCreate} new schedule entries (${flags.interval} minute interval)`); - return { created: wouldCreate, existing: 0 }; - } - const result = await (0, dispensary_orchestrator_1.ensureAllDispensariesHaveSchedules)(flags.interval); - console.log(` ✓ Created ${result.created} new schedule entries`); - console.log(` ✓ ${result.existing} dispensaries already had schedules`); - return result; -} -async function getDispensariesToProcess() { - // Build query based on filters - let whereClause = 'TRUE'; - if (flags.productionOnly) { - whereClause += ` AND d.product_crawler_mode = 'production'`; - } - else if (flags.sandboxOnly) { - whereClause += ` AND d.product_crawler_mode = 'sandbox'`; - } - if (flags.detectionOnly) { - whereClause += ` AND (d.product_provider IS NULL OR d.product_provider = 'unknown' OR d.product_confidence < 50)`; - } - const limitClause = flags.limit > 0 ? 
`LIMIT ${flags.limit}` : ''; - const query = ` - SELECT d.id, d.name, d.product_provider, d.product_crawler_mode - FROM dispensaries d - LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id - WHERE ${whereClause} - ORDER BY - COALESCE(dcs.priority, 0) DESC, - dcs.last_run_at ASC NULLS FIRST, - d.id ASC - ${limitClause} - `; - const result = await migrate_1.pool.query(query); - return result.rows.map(row => row.id); -} -async function runOrchestrator() { - console.log('\n🚀 Running Dispensary Orchestrator...\n'); - const dispensaryIds = await getDispensariesToProcess(); - if (dispensaryIds.length === 0) { - console.log(' No dispensaries to process.'); - return; - } - console.log(` Found ${dispensaryIds.length} dispensaries to process`); - console.log(` Concurrency: ${flags.concurrency}`); - if (flags.dryRun) { - console.log('\n Would process these dispensaries:'); - const details = await migrate_1.pool.query(`SELECT id, name, product_provider, product_crawler_mode - FROM dispensaries WHERE id = ANY($1) ORDER BY id`, [dispensaryIds]); - for (const row of details.rows.slice(0, 20)) { - console.log(` - [${row.id}] ${row.name} (${row.product_provider || 'undetected'}, ${row.product_crawler_mode || 'no mode'})`); - } - if (details.rows.length > 20) { - console.log(` ... 
and ${details.rows.length - 20} more`); - } - return; - } - console.log('\n Starting batch processing...\n'); - const results = await (0, dispensary_orchestrator_1.runBatchDispensaryOrchestrator)(dispensaryIds, flags.concurrency); - // Summarize results - const summary = { - total: results.length, - success: results.filter(r => r.status === 'success').length, - sandboxOnly: results.filter(r => r.status === 'sandbox_only').length, - detectionOnly: results.filter(r => r.status === 'detection_only').length, - error: results.filter(r => r.status === 'error').length, - detectionsRan: results.filter(r => r.detectionRan).length, - crawlsRan: results.filter(r => r.crawlRan).length, - productionCrawls: results.filter(r => r.crawlType === 'production').length, - sandboxCrawls: results.filter(r => r.crawlType === 'sandbox').length, - totalProducts: results.reduce((sum, r) => sum + (r.productsFound || 0), 0), - totalDuration: results.reduce((sum, r) => sum + r.durationMs, 0), - }; - console.log('\n' + '═'.repeat(70)); - console.log(' Orchestrator Results'); - console.log('═'.repeat(70)); - console.log(` - Total Processed: ${summary.total} - - Status: - - Success: ${summary.success} - - Sandbox Only: ${summary.sandboxOnly} - - Detection Only: ${summary.detectionOnly} - - Error: ${summary.error} - - Operations: - - Detections Ran: ${summary.detectionsRan} - - Crawls Ran: ${summary.crawlsRan} - - Production: ${summary.productionCrawls} - - Sandbox: ${summary.sandboxCrawls} - - Results: - - Products Found: ${summary.totalProducts} - - Total Duration: ${(summary.totalDuration / 1000).toFixed(1)}s - - Avg per Dispensary: ${(summary.totalDuration / summary.total / 1000).toFixed(1)}s -`); - console.log('═'.repeat(70) + '\n'); - // Show errors if any - const errors = results.filter(r => r.status === 'error'); - if (errors.length > 0) { - console.log('\n⚠️ Errors encountered:'); - for (const err of errors.slice(0, 10)) { - console.log(` - [${err.dispensaryId}] ${err.dispensaryName}: 
${err.error}`); - } - if (errors.length > 10) { - console.log(` ... and ${errors.length - 10} more errors`); - } - } -} -async function main() { - if (flags.help) { - await showHelp(); - process.exit(0); - } - console.log('\n' + '═'.repeat(70)); - console.log(' Dispensary Crawl Bootstrap Discovery'); - console.log('═'.repeat(70)); - if (flags.dryRun) { - console.log('\n🔍 DRY RUN MODE - No changes will be made'); - } - try { - // Always show status first - await showStatus(); - if (flags.status) { - // Status-only mode, we're done - await migrate_1.pool.end(); - process.exit(0); - } - // Step 1: Create schedule entries - await createSchedules(); - // Step 2: Optionally run orchestrator - if (flags.run) { - await runOrchestrator(); - } - else { - console.log('\n💡 Tip: Use --run to also run the orchestrator for each dispensary'); - } - // Show final status - if (!flags.dryRun) { - await showStatus(); - } - } - catch (error) { - console.error('\n❌ Fatal error:', error.message); - console.error(error.stack); - process.exit(1); - } - finally { - await migrate_1.pool.end(); - } -} -main(); diff --git a/backend/dist/scripts/bootstrap-stores-for-dispensaries.js b/backend/dist/scripts/bootstrap-stores-for-dispensaries.js deleted file mode 100644 index d05098a5..00000000 --- a/backend/dist/scripts/bootstrap-stores-for-dispensaries.js +++ /dev/null @@ -1,65 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const pg_1 = require("pg"); -const pool = new pg_1.Pool({ connectionString: process.env.DATABASE_URL }); -/** - * Creates `stores` table records for all dispensaries that: - * 1. Have menu_type = 'dutchie' AND platform_dispensary_id (ready for GraphQL crawl) - * 2. 
Don't already have a linked stores record - * - * The stores table is required by the scraper engine (scrapeStore function) - */ -async function bootstrapStores() { - console.log('=== Bootstrapping stores for Dutchie dispensaries ===\n'); - // Find all dutchie dispensaries without linked stores - const result = await pool.query(` - SELECT d.id, d.name, d.slug, d.menu_type, d.platform_dispensary_id, d.menu_url - FROM dispensaries d - LEFT JOIN stores s ON s.dispensary_id = d.id - WHERE d.menu_type = 'dutchie' - AND d.platform_dispensary_id IS NOT NULL - AND s.id IS NULL - ORDER BY d.id - `); - console.log(`Found ${result.rows.length} dispensaries needing store records\n`); - let created = 0; - let errors = 0; - for (const d of result.rows) { - try { - // Insert store record linking to dispensary - // Note: stores table only has basic fields: name, slug, dispensary_id, dutchie_url - // The platform_dispensary_id for GraphQL crawling lives in the dispensaries table - const insertResult = await pool.query(` - INSERT INTO stores ( - name, - slug, - dispensary_id, - active, - scrape_enabled, - created_at, - updated_at - ) VALUES ($1, $2, $3, true, true, NOW(), NOW()) - RETURNING id - `, [ - d.name, - d.slug || d.name.toLowerCase().replace(/[^a-z0-9]+/g, '-'), - d.id - ]); - console.log(`[CREATED] Store ${insertResult.rows[0].id} for dispensary ${d.id}: ${d.name}`); - created++; - } - catch (e) { - console.error(`[ERROR] Dispensary ${d.id} (${d.name}): ${e.message}`); - errors++; - } - } - console.log('\n=== Bootstrap Summary ==='); - console.log(`Created: ${created}`); - console.log(`Errors: ${errors}`); - console.log(`Total needing stores: ${result.rows.length}`); - await pool.end(); -} -bootstrapStores().catch(e => { - console.error('Fatal error:', e.message); - process.exit(1); -}); diff --git a/backend/dist/scripts/capture-dutchie-schema.js b/backend/dist/scripts/capture-dutchie-schema.js deleted file mode 100644 index a0960547..00000000 --- 
a/backend/dist/scripts/capture-dutchie-schema.js +++ /dev/null @@ -1,236 +0,0 @@ -"use strict"; -/** - * Capture Dutchie GraphQL response structure via Puppeteer interception - * This script navigates to a Dutchie menu page and captures the GraphQL responses - * to understand the exact product data structure - */ -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const puppeteer_extra_1 = __importDefault(require("puppeteer-extra")); -const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); -const fs = __importStar(require("fs")); -puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)()); -async function captureSchema(menuUrl) { - let browser; - const capturedResponses = []; - try { - console.log('='.repeat(80)); - console.log('DUTCHIE GRAPHQL SCHEMA CAPTURE'); - console.log('='.repeat(80)); - console.log(`\nTarget URL: ${menuUrl}\n`); - browser = await puppeteer_extra_1.default.launch({ - headless: 'new', - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-blink-features=AutomationControlled', - ] - }); - const page = await browser.newPage(); - // Use a realistic user agent - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); - // Set viewport to desktop size - await page.setViewport({ width: 1920, height: 1080 }); - // Hide webdriver flag - await page.evaluateOnNewDocument(() => { - Object.defineProperty(navigator, 'webdriver', { get: () => false }); - window.chrome = { runtime: {} }; - }); - // Intercept all GraphQL responses - page.on('response', async (response) => { - const url = response.url(); - // Only capture GraphQL responses - if (!url.includes('graphql')) - return; - try { - const contentType = response.headers()['content-type'] || ''; - if (!contentType.includes('application/json')) - return; - const data = await response.json(); - // Extract operation name from URL if possible - const urlParams = new URLSearchParams(url.split('?')[1] || ''); - const operationName = urlParams.get('operationName') || 'Unknown'; - capturedResponses.push({ - operationName, - url: url.substring(0, 200), - data, - timestamp: new Date() - 
}); - console.log(`📡 Captured: ${operationName}`); - // Check for product data - if (data?.data?.filteredProducts?.products) { - const products = data.data.filteredProducts.products; - console.log(` Found ${products.length} products`); - } - } - catch (e) { - // Ignore parse errors - } - }); - console.log('Navigating to page...'); - await page.goto(menuUrl, { - waitUntil: 'networkidle2', - timeout: 90000 - }); - // Check if it's a Dutchie menu - const isDutchie = await page.evaluate(() => { - return typeof window.reactEnv !== 'undefined'; - }); - if (isDutchie) { - console.log('✅ Dutchie menu detected\n'); - // Get environment info - const reactEnv = await page.evaluate(() => window.reactEnv); - console.log('Dutchie Environment:'); - console.log(` dispensaryId: ${reactEnv?.dispensaryId}`); - console.log(` retailerId: ${reactEnv?.retailerId}`); - console.log(` chainId: ${reactEnv?.chainId}`); - } - // Scroll to trigger lazy loading - console.log('\nScrolling to load more products...'); - await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); - await new Promise(r => setTimeout(r, 3000)); - // Click on a category to trigger more loads - const categoryLinks = await page.$$('a[href*="/products/"]'); - if (categoryLinks.length > 0) { - console.log(`Found ${categoryLinks.length} category links, clicking first one...`); - try { - await categoryLinks[0].click(); - await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 }); - } - catch (e) { - console.log('Category navigation failed, continuing...'); - } - } - // Wait a bit more for any final responses - await new Promise(r => setTimeout(r, 2000)); - console.log(`\n${'='.repeat(80)}`); - console.log(`CAPTURED ${capturedResponses.length} GRAPHQL RESPONSES`); - console.log('='.repeat(80)); - // Find product data - let productSchema = null; - let sampleProduct = null; - for (const resp of capturedResponses) { - console.log(`\n${resp.operationName}:`); - console.log(` URL: 
${resp.url.substring(0, 100)}...`); - if (resp.data?.data?.filteredProducts?.products) { - const products = resp.data.data.filteredProducts.products; - console.log(` ✅ Contains ${products.length} products`); - if (products.length > 0 && !sampleProduct) { - sampleProduct = products[0]; - productSchema = extractSchema(products[0]); - } - } - // Show top-level data keys - if (resp.data?.data) { - console.log(` Data keys: ${Object.keys(resp.data.data).join(', ')}`); - } - } - // Output the product schema - if (productSchema) { - console.log('\n' + '='.repeat(80)); - console.log('PRODUCT SCHEMA (from first product):'); - console.log('='.repeat(80)); - console.log(JSON.stringify(productSchema, null, 2)); - console.log('\n' + '='.repeat(80)); - console.log('SAMPLE PRODUCT:'); - console.log('='.repeat(80)); - console.log(JSON.stringify(sampleProduct, null, 2)); - // Save to file - const outputData = { - capturedAt: new Date().toISOString(), - menuUrl, - schema: productSchema, - sampleProduct, - allResponses: capturedResponses.map(r => ({ - operationName: r.operationName, - dataKeys: r.data?.data ? 
Object.keys(r.data.data) : [], - productCount: r.data?.data?.filteredProducts?.products?.length || 0 - })) - }; - const outputPath = '/tmp/dutchie-schema-capture.json'; - fs.writeFileSync(outputPath, JSON.stringify(outputData, null, 2)); - console.log(`\nSaved capture to: ${outputPath}`); - } - else { - console.log('\n❌ No product data captured'); - // Debug: show all responses - console.log('\nAll captured responses:'); - for (const resp of capturedResponses) { - console.log(`\n${resp.operationName}:`); - console.log(JSON.stringify(resp.data, null, 2).substring(0, 500)); - } - } - } - catch (error) { - console.error('Error:', error.message); - } - finally { - if (browser) { - await browser.close(); - } - } -} -/** - * Extract schema from an object (field names + types) - */ -function extractSchema(obj, prefix = '') { - if (obj === null) - return { type: 'null' }; - if (obj === undefined) - return { type: 'undefined' }; - if (Array.isArray(obj)) { - if (obj.length === 0) - return { type: 'array', items: 'unknown' }; - return { - type: 'array', - items: extractSchema(obj[0], prefix + '[]') - }; - } - if (typeof obj === 'object') { - const schema = { type: 'object', properties: {} }; - for (const [key, value] of Object.entries(obj)) { - schema.properties[key] = extractSchema(value, prefix ? 
`${prefix}.${key}` : key); - } - return schema; - } - return { type: typeof obj, example: String(obj).substring(0, 100) }; -} -// Run -const url = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted'; -captureSchema(url).catch(console.error); diff --git a/backend/dist/scripts/check-store-linking.js b/backend/dist/scripts/check-store-linking.js deleted file mode 100644 index bbdd2e41..00000000 --- a/backend/dist/scripts/check-store-linking.js +++ /dev/null @@ -1,31 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const pg_1 = require("pg"); -const pool = new pg_1.Pool({ connectionString: process.env.DATABASE_URL }); -async function check() { - // Check which dispensaries have linked stores - const result = await pool.query(` - SELECT d.id as disp_id, d.name, d.menu_type, d.platform_dispensary_id, - s.id as store_id, s.name as store_name - FROM dispensaries d - LEFT JOIN stores s ON s.dispensary_id = d.id - WHERE d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL - LIMIT 15 - `); - console.log('Dispensaries with linked stores:'); - result.rows.forEach(r => { - console.log(` [${r.disp_id}] ${r.name} -> store ${r.store_id || 'NONE'} (${r.store_name || 'NOT LINKED'})`); - }); - // Count how many have linked stores - const countResult = await pool.query(` - SELECT - COUNT(*) FILTER (WHERE s.id IS NOT NULL) as with_store, - COUNT(*) FILTER (WHERE s.id IS NULL) as without_store - FROM dispensaries d - LEFT JOIN stores s ON s.dispensary_id = d.id - WHERE d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL - `); - console.log('\nSummary:', countResult.rows[0]); - await pool.end(); -} -check(); diff --git a/backend/dist/scripts/crawl-all-dutchie.js b/backend/dist/scripts/crawl-all-dutchie.js deleted file mode 100644 index 96378479..00000000 --- a/backend/dist/scripts/crawl-all-dutchie.js +++ /dev/null @@ -1,56 +0,0 @@ -"use strict"; -/** - * Seed crawl: trigger dutchie crawls for all 
dispensaries with menu_type='dutchie' - * and a resolved platform_dispensary_id. This uses the AZ orchestrator endpoint logic. - * - * Usage (local): - * node dist/scripts/crawl-all-dutchie.js - * - * Requires: - * - DATABASE_URL/CRAWLSY_DATABASE_URL pointing to the consolidated DB - * - Dispensaries table populated with menu_type and platform_dispensary_id - */ -Object.defineProperty(exports, "__esModule", { value: true }); -const connection_1 = require("../dutchie-az/db/connection"); -const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator"); -async function main() { - const { rows } = await (0, connection_1.query)(` - SELECT id, name, slug, platform_dispensary_id - FROM dispensaries - WHERE menu_type = 'dutchie' - AND platform_dispensary_id IS NOT NULL - ORDER BY id - `); - if (!rows.length) { - console.log('No dutchie dispensaries with resolved platform_dispensary_id found.'); - process.exit(0); - } - console.log(`Found ${rows.length} dutchie dispensaries with resolved IDs. Triggering crawls...`); - let success = 0; - let failed = 0; - for (const row of rows) { - try { - console.log(`Crawling ${row.id} (${row.name})...`); - const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(row.id); - const ok = result.status === 'success' || - result.status === 'sandbox_only' || - result.status === 'detection_only'; - if (ok) { - success++; - } - else { - failed++; - console.warn(`Crawl returned status ${result.status} for ${row.id} (${row.name})`); - } - } - catch (err) { - failed++; - console.error(`Failed crawl for ${row.id} (${row.name}): ${err.message}`); - } - } - console.log(`Completed. 
Success: ${success}, Failed: ${failed}`); -} -main().catch((err) => { - console.error('Fatal:', err); - process.exit(1); -}); diff --git a/backend/dist/scripts/crawl-five-sequential.js b/backend/dist/scripts/crawl-five-sequential.js deleted file mode 100644 index db5c0f4c..00000000 --- a/backend/dist/scripts/crawl-five-sequential.js +++ /dev/null @@ -1,44 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const dispensary_orchestrator_1 = require("../services/dispensary-orchestrator"); -// All 57 dutchie stores with platform_dispensary_id (as of 2024-12) -const ALL_DISPENSARY_IDS = [ - 72, 74, 75, 76, 77, 78, 81, 82, 85, 87, 91, 92, 97, 101, 106, 108, 110, 112, - 115, 120, 123, 125, 128, 131, 135, 139, 140, 143, 144, 145, 152, 153, 161, - 168, 176, 177, 180, 181, 189, 195, 196, 199, 200, 201, 205, 206, 207, 213, - 214, 224, 225, 227, 232, 235, 248, 252, 281 -]; -const BATCH_SIZE = 5; -async function run() { - const totalBatches = Math.ceil(ALL_DISPENSARY_IDS.length / BATCH_SIZE); - console.log(`Starting crawl of ${ALL_DISPENSARY_IDS.length} stores in ${totalBatches} batches of ${BATCH_SIZE}...`); - let successCount = 0; - let errorCount = 0; - for (let i = 0; i < ALL_DISPENSARY_IDS.length; i += BATCH_SIZE) { - const batch = ALL_DISPENSARY_IDS.slice(i, i + BATCH_SIZE); - const batchNum = Math.floor(i / BATCH_SIZE) + 1; - console.log(`\n========== BATCH ${batchNum}/${totalBatches} (IDs: ${batch.join(', ')}) ==========`); - for (const id of batch) { - console.log(`\n--- Crawling dispensary ${id} ---`); - try { - const result = await (0, dispensary_orchestrator_1.runDispensaryOrchestrator)(id); - console.log(` Status: ${result.status}`); - console.log(` Summary: ${result.summary}`); - if (result.productsFound) { - console.log(` Products: ${result.productsFound} found, ${result.productsNew} new, ${result.productsUpdated} updated`); - } - successCount++; - } - catch (e) { - console.log(` ERROR: ${e.message}`); - errorCount++; - } - } 
- console.log(`\n--- Batch ${batchNum} complete. Progress: ${Math.min(i + BATCH_SIZE, ALL_DISPENSARY_IDS.length)}/${ALL_DISPENSARY_IDS.length} ---`); - } - console.log('\n========================================'); - console.log(`=== ALL CRAWLS COMPLETE ===`); - console.log(`Success: ${successCount}, Errors: ${errorCount}`); - console.log('========================================'); -} -run().catch(e => console.log('Fatal:', e.message)); diff --git a/backend/dist/scripts/detect-all.js b/backend/dist/scripts/detect-all.js deleted file mode 100644 index 0d014f89..00000000 --- a/backend/dist/scripts/detect-all.js +++ /dev/null @@ -1,111 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const pg_1 = require("pg"); -const pool = new pg_1.Pool({ connectionString: process.env.DATABASE_URL }); -// Simple fetch with timeout -async function fetchWithTimeout(url, timeout = 10000) { - const controller = new AbortController(); - const id = setTimeout(() => controller.abort(), timeout); - try { - const resp = await fetch(url, { - signal: controller.signal, - headers: { - 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - }, - redirect: 'follow', - }); - clearTimeout(id); - return await resp.text(); - } - catch (e) { - clearTimeout(id); - throw e; - } -} -// Check for dutchie patterns in HTML -function detectDutchie(html) { - // Check for reactEnv.dispensaryId (Curaleaf/Sol pattern) - const reactEnvMatch = html.match(/"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i); - if (reactEnvMatch) { - return { provider: 'dutchie', platformId: reactEnvMatch[1] }; - } - // Check for Dutchie embedded-menu script (Trulieve pattern) - // Look for: embedded-menu/5eaf48fc972e6200b1303b97.js - const embedMatch = html.match(/embedded-menu\/([a-f0-9]{24})(?:\.js)?/i); - if (embedMatch) { - return { provider: 'dutchie', platformId: embedMatch[1] }; - 
} - // Check for dutchie.com links - const dutchieLink = html.match(/https?:\/\/(?:www\.)?dutchie\.com\/(?:dispensary|embedded-menu|stores)\/([a-zA-Z0-9-]+)/i); - if (dutchieLink) { - return { provider: 'dutchie', menuUrl: dutchieLink[0] }; - } - // Check for jane - if (html.includes('iheartjane.com') || html.includes('jane.co')) { - const janeMatch = html.match(/https?:\/\/(?:www\.)?(?:iheartjane\.com|jane\.co)\/[^"\s]+/i); - return { provider: 'jane', menuUrl: janeMatch?.[0] }; - } - // Check for treez - if (html.includes('.treez.io')) { - const treezMatch = html.match(/https?:\/\/[a-zA-Z0-9-]+\.treez\.io[^"\s]*/i); - return { provider: 'treez', menuUrl: treezMatch?.[0] }; - } - // Check for leafly - if (html.includes('leafly.com/dispensary')) { - return { provider: 'leafly' }; - } - return { provider: 'unknown' }; -} -async function main() { - const { rows: stores } = await pool.query(` - SELECT id, name, website - FROM dispensaries - WHERE platform_dispensary_id IS NULL - AND website IS NOT NULL - AND website NOT LIKE '%example%' - ORDER BY id - LIMIT 150 - `); - console.log('Checking ' + stores.length + ' stores...\n'); - let dutchieCount = 0; - let otherCount = 0; - let errorCount = 0; - for (const store of stores) { - try { - const html = await fetchWithTimeout(store.website); - const result = detectDutchie(html); - if (result.provider === 'dutchie') { - if (result.platformId) { - await pool.query('UPDATE dispensaries SET menu_type = $1, platform_dispensary_id = $2, updated_at = NOW() WHERE id = $3', ['dutchie', result.platformId, store.id]); - console.log('[' + store.id + '] ' + store.name + ' => DUTCHIE (ID: ' + result.platformId + ')'); - dutchieCount++; - } - else if (result.menuUrl) { - await pool.query('UPDATE dispensaries SET menu_type = $1, menu_url = $2, updated_at = NOW() WHERE id = $3', ['dutchie', result.menuUrl, store.id]); - console.log('[' + store.id + '] ' + store.name + ' => DUTCHIE (URL: ' + result.menuUrl.slice(0, 60) + ')'); - 
dutchieCount++; - } - } - else if (result.provider !== 'unknown') { - await pool.query('UPDATE dispensaries SET menu_type = $1, menu_url = COALESCE($2, menu_url), updated_at = NOW() WHERE id = $3', [result.provider, result.menuUrl, store.id]); - console.log('[' + store.id + '] ' + store.name + ' => ' + result.provider.toUpperCase()); - otherCount++; - } - else { - console.log('[' + store.id + '] ' + store.name + ' => no menu found'); - } - } - catch (err) { - const errMsg = err.name === 'AbortError' ? 'timeout' : err.message?.slice(0, 40) || 'error'; - console.log('[' + store.id + '] ' + store.name + ' => ERROR: ' + errMsg); - errorCount++; - } - } - console.log('\n=== Summary ==='); - console.log('Dutchie detected: ' + dutchieCount); - console.log('Other providers: ' + otherCount); - console.log('Errors: ' + errorCount); - await pool.end(); -} -main().catch(console.error); diff --git a/backend/dist/scripts/export-dispensaries.js b/backend/dist/scripts/export-dispensaries.js deleted file mode 100644 index 13f2c868..00000000 --- a/backend/dist/scripts/export-dispensaries.js +++ /dev/null @@ -1,18 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const pg_1 = require("pg"); -const pool = new pg_1.Pool({ connectionString: process.env.DATABASE_URL }); -async function exportDispensaries() { - const { rows } = await pool.query(` - SELECT id, name, dba_name, company_name, slug, - address, city, state, zip, latitude, longitude, - website, menu_type, menu_url, platform_dispensary_id, - created_at, updated_at - FROM dispensaries - WHERE menu_type IS NOT NULL - ORDER BY id - `); - console.log(JSON.stringify(rows, null, 2)); - await pool.end(); -} -exportDispensaries(); diff --git a/backend/dist/scripts/extract-platform-ids.js b/backend/dist/scripts/extract-platform-ids.js deleted file mode 100644 index 06bbcad0..00000000 --- a/backend/dist/scripts/extract-platform-ids.js +++ /dev/null @@ -1,240 +0,0 @@ -"use strict"; 
-Object.defineProperty(exports, "__esModule", { value: true }); -const playwright_1 = require("playwright"); -const pg_1 = require("pg"); -const pool = new pg_1.Pool({ - connectionString: process.env.DATABASE_URL -}); -async function extractPlatformId(browser, dispensary) { - let capturedId = null; - const context = await browser.newContext(); - const page = await context.newPage(); - // Intercept network requests to find retailer IDs - page.on('request', (request) => { - const url = request.url(); - if (url.includes('dutchie') || url.includes('plus.dutchie') || url.includes('api.dutchie')) { - // Check URL for retailer ID - const urlMatch = url.match(/[\/=]([a-f0-9]{24})(?:[\/\?&]|$)/i); - if (urlMatch && !capturedId) { - capturedId = urlMatch[1]; - console.log(` Captured from URL: ${capturedId}`); - } - const postData = request.postData(); - if (postData) { - // Look for retailerId in GraphQL variables - const match = postData.match(/["']?retailerId["']?\s*:\s*["']([a-f0-9]{24})["']/i); - if (match && !capturedId) { - capturedId = match[1]; - console.log(` Captured retailerId: ${capturedId}`); - } - // Also look for dispensaryId - const dispMatch = postData.match(/["']?dispensaryId["']?\s*:\s*["']([a-f0-9]{24})["']/i); - if (dispMatch && !capturedId) { - capturedId = dispMatch[1]; - console.log(` Captured dispensaryId: ${capturedId}`); - } - } - } - }); - try { - console.log(`\nLoading ${dispensary.name}: ${dispensary.website}`); - await page.goto(dispensary.website, { waitUntil: 'domcontentloaded', timeout: 30000 }); - // Wait for initial load - await page.waitForTimeout(2000); - // Check page content for retailerId - const content = await page.content(); - // Try various patterns in page content - const patterns = [ - /["']retailerId["']\s*:\s*["']([a-f0-9]{24})["']/i, - /dispensaryId["']\s*:\s*["']([a-f0-9]{24})["']/i, - /retailer["']?\s*:\s*["']([a-f0-9]{24})["']/i, - /dutchie\.com\/embedded-menu\/([a-f0-9]{24})/i, - 
/dutchie\.com\/dispensary\/([a-f0-9]{24})/i, - /plus\.dutchie\.com\/plus\/([a-f0-9]{24})/i, - /retailerId=([a-f0-9]{24})/i, - ]; - for (const pattern of patterns) { - const match = content.match(pattern); - if (match && !capturedId) { - capturedId = match[1]; - console.log(` Found in content: ${capturedId}`); - break; - } - } - // Check __NEXT_DATA__ if present - if (!capturedId) { - const nextData = await page.evaluate(() => { - const el = document.getElementById('__NEXT_DATA__'); - return el?.textContent || null; - }); - if (nextData) { - for (const pattern of patterns) { - const match = nextData.match(pattern); - if (match) { - capturedId = match[1]; - console.log(` Found in __NEXT_DATA__: ${capturedId}`); - break; - } - } - } - } - // Look for iframes that might contain dutchie embed - if (!capturedId) { - const iframes = await page.evaluate(() => { - return Array.from(document.querySelectorAll('iframe')).map(f => f.src); - }); - for (const src of iframes) { - if (src.includes('dutchie')) { - const match = src.match(/([a-f0-9]{24})/i); - if (match) { - capturedId = match[1]; - console.log(` Found in iframe: ${capturedId}`); - break; - } - } - } - } - // If still not found, try clicking on "Shop" or "Menu" links - if (!capturedId) { - const menuSelectors = [ - 'a:has-text("Shop")', - 'a:has-text("Menu")', - 'a:has-text("Order")', - 'a[href*="menu"]', - 'a[href*="shop"]', - 'a[href*="order"]', - 'button:has-text("Shop")', - 'button:has-text("Menu")', - ]; - for (const selector of menuSelectors) { - try { - const element = page.locator(selector).first(); - const isVisible = await element.isVisible({ timeout: 500 }); - if (isVisible) { - const href = await element.getAttribute('href'); - // If it's an internal link, click it - if (href && !href.startsWith('http')) { - console.log(` Clicking ${selector}...`); - await element.click(); - await page.waitForTimeout(3000); - // Check new page content - const newContent = await page.content(); - for (const pattern of 
patterns) { - const match = newContent.match(pattern); - if (match && !capturedId) { - capturedId = match[1]; - console.log(` Found after navigation: ${capturedId}`); - break; - } - } - // Check iframes on new page - if (!capturedId) { - const newIframes = await page.evaluate(() => { - return Array.from(document.querySelectorAll('iframe')).map(f => f.src); - }); - for (const src of newIframes) { - if (src.includes('dutchie')) { - const match = src.match(/([a-f0-9]{24})/i); - if (match) { - capturedId = match[1]; - console.log(` Found in iframe after nav: ${capturedId}`); - break; - } - } - } - } - if (capturedId) - break; - } - } - } - catch (e) { - // Continue to next selector - } - } - } - // If still not found, wait longer for async dutchie widget to load - if (!capturedId) { - console.log(` Waiting for async content...`); - await page.waitForTimeout(5000); - // Check for dutchie script tags - const scripts = await page.evaluate(() => { - return Array.from(document.querySelectorAll('script')).map(s => s.src || s.innerHTML?.substring(0, 500)); - }); - for (const script of scripts) { - if (script && script.includes('dutchie')) { - for (const pattern of patterns) { - const match = script.match(pattern); - if (match && !capturedId) { - capturedId = match[1]; - console.log(` Found in script: ${capturedId}`); - break; - } - } - if (capturedId) - break; - } - } - // Final check of iframes after wait - if (!capturedId) { - const finalIframes = await page.evaluate(() => { - return Array.from(document.querySelectorAll('iframe')).map(f => f.src); - }); - for (const src of finalIframes) { - if (src.includes('dutchie')) { - const match = src.match(/([a-f0-9]{24})/i); - if (match) { - capturedId = match[1]; - console.log(` Found in iframe (delayed): ${capturedId}`); - break; - } - } - } - } - } - } - catch (e) { - console.log(` Error: ${e.message.substring(0, 80)}`); - } - finally { - await context.close(); - } - return capturedId; -} -async function main() { - // Get 
dispensaries missing platform IDs - const result = await pool.query(` - SELECT id, name, website - FROM dispensaries - WHERE state = 'AZ' - AND menu_type = 'dutchie' - AND (platform_dispensary_id IS NULL OR platform_dispensary_id = '') - AND website IS NOT NULL AND website != '' - ORDER BY name - `); - console.log(`Found ${result.rows.length} dispensaries to process\n`); - const browser = await playwright_1.chromium.launch({ headless: true }); - const results = []; - for (const dispensary of result.rows) { - const platformId = await extractPlatformId(browser, dispensary); - results.push({ id: dispensary.id, name: dispensary.name, platformId }); - if (platformId) { - // Update database - await pool.query('UPDATE dispensaries SET platform_dispensary_id = $1 WHERE id = $2', [platformId, dispensary.id]); - console.log(` Updated database with ${platformId}`); - } - } - await browser.close(); - console.log('\n=== SUMMARY ==='); - const found = results.filter(r => r.platformId); - const notFound = results.filter(r => !r.platformId); - console.log(`\nFound (${found.length}):`); - found.forEach(r => console.log(` ${r.id}: ${r.name} -> ${r.platformId}`)); - console.log(`\nNot Found (${notFound.length}):`); - notFound.forEach(r => console.log(` ${r.id}: ${r.name}`)); - await pool.end(); -} -main().catch(e => { - console.error('Error:', e); - process.exit(1); -}); diff --git a/backend/dist/scripts/import-dispensaries.js b/backend/dist/scripts/import-dispensaries.js deleted file mode 100644 index c4cc3a4f..00000000 --- a/backend/dist/scripts/import-dispensaries.js +++ /dev/null @@ -1,108 +0,0 @@ -"use strict"; -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? 
!m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -Object.defineProperty(exports, "__esModule", { value: true }); -const pg_1 = require("pg"); -const fs = __importStar(require("fs")); -const pool = new pg_1.Pool({ connectionString: process.env.DATABASE_URL }); -async function importDispensaries(filePath) { - const data = JSON.parse(fs.readFileSync(filePath, 'utf-8')); - console.log(`Importing ${data.length} dispensaries...`); - let inserted = 0; - let updated = 0; - let errors = 0; - for (const d of data) { - try { - // Check if dispensary exists by name and city - const { rows: existing } = await pool.query(`SELECT id FROM dispensaries WHERE name = $1 AND city = $2`, [d.name, d.city]); - if (existing.length > 0) { - // Update existing - await pool.query(` - UPDATE dispensaries SET - dba_name = COALESCE($1, dba_name), - company_name = COALESCE($2, company_name), - slug = COALESCE($3, slug), - address = COALESCE($4, address), - state = COALESCE($5, state), - zip = COALESCE($6, zip), - 
latitude = COALESCE($7, latitude), - longitude = COALESCE($8, longitude), - website = COALESCE($9, website), - menu_type = COALESCE($10, menu_type), - menu_url = COALESCE($11, menu_url), - platform_dispensary_id = COALESCE($12, platform_dispensary_id), - updated_at = NOW() - WHERE id = $13 - `, [ - d.dba_name, d.company_name, d.slug, - d.address, d.state, d.zip, - d.latitude, d.longitude, d.website, - d.menu_type, d.menu_url, d.platform_dispensary_id, - existing[0].id - ]); - console.log(`Updated: [${existing[0].id}] ${d.name} (${d.city})`); - updated++; - } - else { - // Insert new - const { rows: newRow } = await pool.query(` - INSERT INTO dispensaries ( - name, dba_name, company_name, slug, - address, city, state, zip, latitude, longitude, - website, menu_type, menu_url, platform_dispensary_id, - created_at, updated_at - ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, NOW(), NOW()) - RETURNING id - `, [ - d.name, d.dba_name, d.company_name, d.slug, - d.address, d.city, d.state, d.zip, d.latitude, d.longitude, - d.website, d.menu_type, d.menu_url, d.platform_dispensary_id - ]); - console.log(`Inserted: [${newRow[0].id}] ${d.name} (${d.city})`); - inserted++; - } - } - catch (err) { - console.error(`Error for ${d.name}: ${err.message}`); - errors++; - } - } - console.log(`\n=== Import Summary ===`); - console.log(`Inserted: ${inserted}`); - console.log(`Updated: ${updated}`); - console.log(`Errors: ${errors}`); - await pool.end(); -} -const filePath = process.argv[2] || '/tmp/dispensaries-export.json'; -importDispensaries(filePath).catch(console.error); diff --git a/backend/dist/scripts/jars-az-extractor.js b/backend/dist/scripts/jars-az-extractor.js deleted file mode 100644 index 2df24136..00000000 --- a/backend/dist/scripts/jars-az-extractor.js +++ /dev/null @@ -1,118 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const playwright_1 = require("playwright"); -async function extractJarsAzStoreIds() { 
- const browser = await playwright_1.chromium.launch({ headless: true }); - const page = await browser.newPage(); - const results = []; - const capturedIds = []; - const allRequests = []; - // Intercept network requests to find Dutchie Plus API calls - page.on('request', (request) => { - const url = request.url(); - allRequests.push(url.substring(0, 100)); - if (url.includes('dutchie') || url.includes('graphql')) { - const postData = request.postData(); - console.log('Dutchie request to:', url.substring(0, 80)); - if (postData) { - // Look for retailerId in GraphQL variables - const match = postData.match(/"retailerId"\s*:\s*"([a-f0-9-]{36})"/i); - if (match) { - const id = match[1]; - if (capturedIds.indexOf(id) === -1) { - capturedIds.push(id); - console.log('Captured retailerId from request:', id); - } - } - } - } - }); - try { - // Just load one page first and thoroughly debug it - console.log('Loading Mesa store with full network debugging...'); - await page.goto('https://jarscannabis.com/shop/mesa-az/', { - waitUntil: 'networkidle', - timeout: 60000 - }); - console.log('\nWaiting 5 seconds for dynamic content...'); - await page.waitForTimeout(5000); - // Get page title and content - const title = await page.title(); - console.log('Page title:', title); - const content = await page.content(); - console.log('Page content length:', content.length); - // Save screenshot - await page.screenshot({ path: '/tmp/jars-mesa-debug.png', fullPage: true }); - console.log('Screenshot saved to /tmp/jars-mesa-debug.png'); - // Look for all UUIDs in content - const uuidPattern = /[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}/gi; - const uuids = content.match(uuidPattern); - if (uuids) { - const uniqueUuids = [...new Set(uuids)]; - console.log('\n=== All UUIDs found on page ==='); - uniqueUuids.forEach(u => console.log(u)); - } - // Look for all iframes - const iframes = await page.evaluate(() => { - return Array.from(document.querySelectorAll('iframe')).map(f => 
({ - src: f.src, - id: f.id, - name: f.name, - className: f.className - })); - }); - console.log('\n=== Iframes ==='); - console.log(JSON.stringify(iframes, null, 2)); - // Look for any elements with dutchie - const dutchieElements = await page.evaluate(() => { - const elements = document.body.innerHTML.match(/dutchie[^<>]*\"/gi) || []; - return elements.slice(0, 20); - }); - console.log('\n=== Dutchie mentions ==='); - dutchieElements.forEach(e => console.log(e)); - // Look for script src containing dutchie - const scripts = await page.evaluate(() => { - return Array.from(document.querySelectorAll('script[src]')) - .map(s => s.getAttribute('src')) - .filter(src => src && (src.includes('dutchie') || src.includes('embed'))); - }); - console.log('\n=== Relevant scripts ==='); - scripts.forEach(s => console.log(s)); - // Look for __NEXT_DATA__ - const nextData = await page.evaluate(() => { - const el = document.getElementById('__NEXT_DATA__'); - return el ? el.textContent : null; - }); - if (nextData) { - console.log('\n=== __NEXT_DATA__ found ==='); - const data = JSON.parse(nextData); - // Look for retailer in various places - const propsStr = JSON.stringify(data, null, 2); - // Find all UUID patterns in the props - const propsUuids = propsStr.match(/[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}/gi); - if (propsUuids) { - console.log('UUIDs in __NEXT_DATA__:', [...new Set(propsUuids)]); - } - } - else { - console.log('\nNo __NEXT_DATA__ found'); - } - // Look for specific Dutchie embed patterns - const embedPatterns = content.match(/https:\/\/[^"'\s]*dutchie[^"'\s]*/gi); - if (embedPatterns) { - console.log('\n=== Dutchie embed URLs ==='); - [...new Set(embedPatterns)].forEach(u => console.log(u)); - } - console.log('\n=== Network requests summary ==='); - console.log('Total requests:', allRequests.length); - const dutchieRequests = allRequests.filter(r => r.includes('dutchie')); - console.log('Dutchie requests:', dutchieRequests.length); - 
dutchieRequests.forEach(r => console.log(r)); - console.log('\n=== CAPTURED IDS ==='); - console.log(capturedIds); - } - finally { - await browser.close(); - } -} -extractJarsAzStoreIds().catch(e => console.error('Error:', e.message)); diff --git a/backend/dist/scripts/jars-az-finder.js b/backend/dist/scripts/jars-az-finder.js deleted file mode 100644 index 625d2405..00000000 --- a/backend/dist/scripts/jars-az-finder.js +++ /dev/null @@ -1,177 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -const playwright_1 = require("playwright"); -async function findJarsAzStores() { - const browser = await playwright_1.chromium.launch({ headless: true }); - const page = await browser.newPage(); - const capturedRetailerIds = []; - const allApiCalls = []; - // Intercept ALL requests to find retailer IDs - page.on('request', (request) => { - const url = request.url(); - // Log Buddy API calls - if (url.includes('buddyapi') || url.includes('dutchie') || url.includes('graphql')) { - allApiCalls.push(url); - const postData = request.postData(); - if (postData) { - // Look for retailerId in various formats - const match = postData.match(/retailerId['":\s]+([a-f0-9-]{36})/i); - if (match) { - capturedRetailerIds.push({ url, retailerId: match[1] }); - } - } - // Also check URL params - const urlMatch = url.match(/retailerId=([a-f0-9-]{36})/i); - if (urlMatch) { - capturedRetailerIds.push({ url, retailerId: urlMatch[1] }); - } - } - }); - try { - // First, let's try to find the actual Arizona menu URLs - console.log('Loading JARS find-a-dispensary page...'); - await page.goto('https://jarscannabis.com/find-a-dispensary', { - waitUntil: 'networkidle', - timeout: 30000 - }); - await page.waitForTimeout(3000); - // Take screenshot - await page.screenshot({ path: '/tmp/jars-find-dispensary.png', fullPage: true }); - console.log('Screenshot saved to /tmp/jars-find-dispensary.png'); - // Try to find state selector and click Arizona - 
console.log('\nLooking for state selector...'); - // Try various ways to select Arizona - const stateSelectors = [ - 'select[name*="state"]', - '[class*="state"] select', - 'select option[value="AZ"]', - 'button:has-text("Arizona")', - 'a:has-text("Arizona")', - '[data-state="AZ"]', - 'div:has-text("Arizona")', - ]; - for (const selector of stateSelectors) { - try { - const element = page.locator(selector).first(); - const isVisible = await element.isVisible({ timeout: 1000 }); - if (isVisible) { - console.log(`Found element with selector: ${selector}`); - await element.click(); - await page.waitForTimeout(2000); - } - } - catch (e) { - // Continue to next selector - } - } - // Get all links on the page - const links = await page.evaluate(() => { - return Array.from(document.querySelectorAll('a')).map(a => ({ - href: a.href, - text: a.textContent?.trim() - })).filter(l => l.href.includes('/shop') || l.href.includes('menu') || l.href.includes('arizona') || l.href.includes('-az')); - }); - console.log('\n=== Shop/Menu Links Found ==='); - links.forEach(l => console.log(`${l.text}: ${l.href}`)); - // Look for __NEXT_DATA__ which might have location data - const nextData = await page.evaluate(() => { - const el = document.getElementById('__NEXT_DATA__'); - return el?.textContent || null; - }); - if (nextData) { - console.log('\n=== Analyzing __NEXT_DATA__ ==='); - const data = JSON.parse(nextData); - const dataStr = JSON.stringify(data); - // Look for Arizona references - if (dataStr.includes('Arizona') || dataStr.includes('AZ')) { - console.log('Found Arizona references in __NEXT_DATA__'); - // Extract all objects that might be Arizona stores - const findArizonaStores = (obj, path = '') => { - const results = []; - if (!obj || typeof obj !== 'object') - return results; - if (Array.isArray(obj)) { - obj.forEach((item, i) => { - results.push(...findArizonaStores(item, `${path}[${i}]`)); - }); - } - else { - // Check if this object looks like an AZ store - if (obj.state 
=== 'AZ' || obj.state === 'Arizona' || - obj.stateCode === 'AZ' || obj.region === 'Arizona' || - (obj.city && ['Mesa', 'Phoenix', 'Peoria', 'Payson', 'Globe', 'Safford', 'Somerton', 'Prescott Valley'].includes(obj.city))) { - results.push({ path, data: obj }); - } - for (const key of Object.keys(obj)) { - results.push(...findArizonaStores(obj[key], `${path}.${key}`)); - } - } - return results; - }; - const azStores = findArizonaStores(data); - console.log(`Found ${azStores.length} Arizona store objects`); - azStores.forEach(s => { - console.log('\n---'); - console.log('Path:', s.path); - console.log(JSON.stringify(s.data, null, 2)); - }); - } - // Also look for retailer IDs - const retailerMatches = dataStr.match(/"retailerId"\s*:\s*"([a-f0-9-]{36})"/gi); - if (retailerMatches) { - console.log('\n=== RetailerIds in __NEXT_DATA__ ==='); - const uniqueIds = [...new Set(retailerMatches.map(m => { - const match = m.match(/([a-f0-9-]{36})/i); - return match ? match[1] : null; - }).filter(Boolean))]; - uniqueIds.forEach(id => console.log(id)); - } - } - // Try loading a known store URL pattern - const testUrls = [ - 'https://jarscannabis.com/arizona/', - 'https://jarscannabis.com/az/', - 'https://jarscannabis.com/stores/arizona/', - 'https://jarscannabis.com/locations/arizona/', - 'https://jarscannabis.com/shop/arizona/', - 'https://az.jarscannabis.com/', - ]; - console.log('\n=== Testing Arizona URLs ==='); - for (const testUrl of testUrls) { - try { - const response = await page.goto(testUrl, { waitUntil: 'domcontentloaded', timeout: 10000 }); - const status = response?.status(); - console.log(`${testUrl}: ${status}`); - if (status === 200) { - const title = await page.title(); - console.log(` Title: ${title}`); - // If we found a working page, extract store links - const storeLinks = await page.evaluate(() => { - return Array.from(document.querySelectorAll('a')).map(a => ({ - href: a.href, - text: a.textContent?.trim() - })).filter(l => l.href.includes('shop') || 
l.href.includes('menu')); - }); - if (storeLinks.length > 0) { - console.log(' Store links:'); - storeLinks.forEach(l => console.log(` ${l.text}: ${l.href}`)); - } - } - } - catch (e) { - console.log(`${testUrl}: Error - ${e.message.substring(0, 50)}`); - } - } - console.log('\n=== Captured Retailer IDs from API calls ==='); - const uniqueRetailerIds = [...new Map(capturedRetailerIds.map(r => [r.retailerId, r])).values()]; - uniqueRetailerIds.forEach(r => { - console.log(`${r.retailerId} (from: ${r.url.substring(0, 60)}...)`); - }); - console.log('\n=== All API calls ==='); - allApiCalls.forEach(url => console.log(url.substring(0, 100))); - } - finally { - await browser.close(); - } -} -findJarsAzStores().catch(e => console.error('Error:', e.message)); diff --git a/backend/dist/scripts/parallel-scrape.js b/backend/dist/scripts/parallel-scrape.js deleted file mode 100644 index a13dff89..00000000 --- a/backend/dist/scripts/parallel-scrape.js +++ /dev/null @@ -1,181 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const migrate_1 = require("../db/migrate"); -const proxy_1 = require("../services/proxy"); -const puppeteer_extra_1 = __importDefault(require("puppeteer-extra")); -const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); -puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)()); -const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'; -const NUM_WORKERS = parseInt(process.argv[2] || '15'); -const DISPENSARY_NAME = process.argv[3] || 'Deeply Rooted'; -const USE_PROXIES = process.argv[4] !== 'no-proxy'; -async function getStore(name) { - const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url FROM stores WHERE name ILIKE $1 LIMIT 1`, [`%${name}%`]); - return result.rows[0] || null; -} -async function getCategories(storeId) { - const result = await migrate_1.pool.query(`SELECT id, name, slug, dutchie_url as url FROM categories WHERE store_id = $1 AND scrape_enabled = true`, [storeId]); - return result.rows; -} -async function scrapeWithProxy(workerId, store, category) { - let browser = null; - let proxyId = null; - try { - // Get a proxy (if enabled) - let proxy = null; - if (USE_PROXIES) { - proxy = await (0, proxy_1.getActiveProxy)(); - if (proxy) { - proxyId = proxy.id; - console.log(`[Worker ${workerId}] Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`); - } - else { - console.log(`[Worker ${workerId}] No proxy available, using direct connection`); - } - } - else { - console.log(`[Worker ${workerId}] Direct connection (proxies disabled)`); - } - // Build browser args - const args = [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-accelerated-2d-canvas', - '--disable-gpu', - '--window-size=1920,1080', - ]; - if (proxy) { - if (proxy.protocol === 'socks5' || proxy.protocol === 
'socks') { - args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`); - } - else { - args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`); - } - } - browser = await puppeteer_extra_1.default.launch({ - headless: true, - args, - executablePath: process.env.PUPPETEER_EXECUTABLE_PATH, - }); - const page = await browser.newPage(); - await page.setUserAgent(FIREFOX_USER_AGENT); - await page.setViewport({ width: 1920, height: 1080 }); - // Handle proxy auth if needed - if (proxy?.username && proxy?.password) { - await page.authenticate({ - username: proxy.username, - password: proxy.password, - }); - } - console.log(`[Worker ${workerId}] Scraping category: ${category.name} (${category.url})`); - // Navigate to the category page - const response = await page.goto(category.url, { - waitUntil: 'networkidle2', - timeout: 60000, - }); - if (!response || !response.ok()) { - throw new Error(`Failed to load page: ${response?.status()}`); - } - // Wait for products to load - await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', { - timeout: 30000, - }).catch(() => { - console.log(`[Worker ${workerId}] No products found on page`); - }); - // Extract products - const products = await page.evaluate(() => { - // Try data-testid first, then fall back to product links - const listItems = document.querySelectorAll('[data-testid="product-list-item"]'); - if (listItems.length > 0) - return listItems.length; - return document.querySelectorAll('a[href*="/product/"]').length; - }); - console.log(`[Worker ${workerId}] Found ${products} products in ${category.name}`); - await browser.close(); - return { success: true, products }; - } - catch (error) { - console.error(`[Worker ${workerId}] Error:`, error.message); - // Check for bot detection - if (proxyId && (0, proxy_1.isBotDetectionError)(error.message)) { - (0, proxy_1.putProxyInTimeout)(proxyId, error.message); - } - if (browser) { - await browser.close().catch(() => { }); - } 
- return { success: false, products: 0, error: error.message }; - } -} -async function worker(workerId, store, categories, categoryIndex) { - while (categoryIndex.current < categories.length) { - const idx = categoryIndex.current++; - const category = categories[idx]; - if (!category) - break; - console.log(`[Worker ${workerId}] Starting category ${idx + 1}/${categories.length}: ${category.name}`); - const result = await scrapeWithProxy(workerId, store, category); - if (result.success) { - console.log(`[Worker ${workerId}] Completed ${category.name}: ${result.products} products`); - } - else { - console.log(`[Worker ${workerId}] Failed ${category.name}: ${result.error}`); - } - // Small delay between requests - await new Promise(resolve => setTimeout(resolve, 2000 + Math.random() * 3000)); - } - console.log(`[Worker ${workerId}] Finished all assigned work`); -} -async function main() { - console.log(`\n${'='.repeat(60)}`); - console.log(`Parallel Scraper - ${NUM_WORKERS} workers`); - console.log(`Target: ${DISPENSARY_NAME}`); - console.log(`User Agent: Firefox`); - console.log(`Proxies: ${USE_PROXIES ? 
'Enabled' : 'Disabled'}`); - console.log(`${'='.repeat(60)}\n`); - // Find the store - const store = await getStore(DISPENSARY_NAME); - if (!store) { - console.error(`Store not found: ${DISPENSARY_NAME}`); - process.exit(1); - } - console.log(`Found store: ${store.name} (ID: ${store.id})`); - // Get categories - const categories = await getCategories(store.id); - if (categories.length === 0) { - console.error('No categories found for this store'); - process.exit(1); - } - console.log(`Found ${categories.length} categories to scrape`); - console.log(`Categories: ${categories.map(c => c.name).join(', ')}\n`); - // Check proxies - const proxyResult = await migrate_1.pool.query('SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE active = true) as active FROM proxies'); - console.log(`Proxies: ${proxyResult.rows[0].active} active / ${proxyResult.rows[0].total} total\n`); - // Shared index for work distribution - const categoryIndex = { current: 0 }; - // For a store with few categories, we'll run multiple passes - // Expand the work by duplicating categories for parallel workers - const expandedCategories = []; - const passes = Math.ceil(NUM_WORKERS / Math.max(categories.length, 1)); - for (let i = 0; i < passes; i++) { - expandedCategories.push(...categories); - } - console.log(`Running ${NUM_WORKERS} workers across ${expandedCategories.length} category scrapes\n`); - // Start workers - const workers = []; - for (let i = 0; i < NUM_WORKERS; i++) { - workers.push(worker(i + 1, store, expandedCategories, categoryIndex)); - // Stagger worker starts - await new Promise(resolve => setTimeout(resolve, 500)); - } - // Wait for all workers - await Promise.all(workers); - console.log(`\n${'='.repeat(60)}`); - console.log('All workers completed!'); - console.log(`${'='.repeat(60)}\n`); - await migrate_1.pool.end(); -} -main().catch(console.error); diff --git a/backend/dist/scripts/platform-id-extractor.js b/backend/dist/scripts/platform-id-extractor.js deleted file mode 100644 
index 9584d975..00000000 --- a/backend/dist/scripts/platform-id-extractor.js +++ /dev/null @@ -1,301 +0,0 @@ -"use strict"; -/** - * Platform ID Extractor - Standalone script for extracting Dutchie platform IDs - * - * This script visits dispensary websites to capture their Dutchie retailerId - * by intercepting network requests to the Dutchie GraphQL API. - * - * It does NOT use the main orchestrator - it's a standalone browser-based tool. - */ -Object.defineProperty(exports, "__esModule", { value: true }); -const playwright_1 = require("playwright"); -const pg_1 = require("pg"); -const pool = new pg_1.Pool({ - connectionString: process.env.DATABASE_URL -}); -async function extractPlatformId(browser, dispensary) { - let capturedId = null; - let captureSource = null; - let errorMsg = null; - const context = await browser.newContext({ - userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' - }); - const page = await context.newPage(); - // Patterns to match retailer IDs in various formats - const idPatterns = [ - /["']retailerId["']\s*:\s*["']([a-f0-9]{24})["']/i, - /["']dispensaryId["']\s*:\s*["']([a-f0-9]{24})["']/i, - /retailer["']?\s*:\s*["']([a-f0-9]{24})["']/i, - /dutchie\.com\/embedded-menu\/([a-f0-9]{24})/i, - /dutchie\.com\/dispensary\/([a-f0-9]{24})/i, - /plus\.dutchie\.com\/plus\/([a-f0-9]{24})/i, - /retailerId=([a-f0-9]{24})/i, - /\/([a-f0-9]{24})(?:\/|\?|$)/i, // Generic ID in URL path - ]; - // Intercept network requests - page.on('request', (request) => { - if (capturedId) - return; - const url = request.url(); - if (url.includes('dutchie') || url.includes('api.dutchie')) { - // Check URL for retailer ID - for (const pattern of idPatterns) { - const match = url.match(pattern); - if (match && match[1] && match[1].length === 24) { - capturedId = match[1]; - captureSource = 'request_url'; - break; - } - } - // Check POST data - const postData = request.postData(); - if (postData && 
!capturedId) { - for (const pattern of idPatterns) { - const match = postData.match(pattern); - if (match && match[1] && match[1].length === 24) { - capturedId = match[1]; - captureSource = 'request_body'; - break; - } - } - } - } - }); - try { - console.log(`\n[${dispensary.id}] ${dispensary.name}: ${dispensary.website}`); - // Load main page - await page.goto(dispensary.website, { - waitUntil: 'domcontentloaded', - timeout: 25000 - }); - await page.waitForTimeout(2000); - // Check page content - if (!capturedId) { - const content = await page.content(); - for (const pattern of idPatterns) { - const match = content.match(pattern); - if (match && match[1] && match[1].length === 24) { - capturedId = match[1]; - captureSource = 'page_content'; - break; - } - } - } - // Check __NEXT_DATA__ - if (!capturedId) { - const nextData = await page.evaluate(() => { - const el = document.getElementById('__NEXT_DATA__'); - return el?.textContent || null; - }); - if (nextData) { - for (const pattern of idPatterns) { - const match = nextData.match(pattern); - if (match && match[1] && match[1].length === 24) { - capturedId = match[1]; - captureSource = '__NEXT_DATA__'; - break; - } - } - } - } - // Check iframes - if (!capturedId) { - const iframes = await page.evaluate(() => { - return Array.from(document.querySelectorAll('iframe')).map(f => f.src); - }); - for (const src of iframes) { - if (src.includes('dutchie')) { - const match = src.match(/([a-f0-9]{24})/i); - if (match) { - capturedId = match[1]; - captureSource = 'iframe_src'; - break; - } - } - } - } - // Check scripts - if (!capturedId) { - const scripts = await page.evaluate(() => { - return Array.from(document.querySelectorAll('script')) - .map(s => s.src || s.innerHTML?.substring(0, 1000)) - .filter(Boolean); - }); - for (const script of scripts) { - if (script && (script.includes('dutchie') || script.includes('retailerId'))) { - for (const pattern of idPatterns) { - const match = script.match(pattern); - if (match && 
match[1] && match[1].length === 24) { - capturedId = match[1]; - captureSource = 'script'; - break; - } - } - if (capturedId) - break; - } - } - } - // Try navigating to menu/shop page - if (!capturedId) { - const menuLink = await page.evaluate(() => { - const links = Array.from(document.querySelectorAll('a')); - for (const link of links) { - const href = link.href?.toLowerCase() || ''; - const text = link.textContent?.toLowerCase() || ''; - if (href.includes('menu') || href.includes('shop') || href.includes('order') || - text.includes('menu') || text.includes('shop') || text.includes('order')) { - return link.href; - } - } - return null; - }); - if (menuLink && !menuLink.startsWith('javascript:')) { - try { - console.log(` -> Following menu link: ${menuLink.substring(0, 60)}...`); - await page.goto(menuLink, { waitUntil: 'domcontentloaded', timeout: 20000 }); - await page.waitForTimeout(3000); - // Recheck all sources on new page - const newContent = await page.content(); - for (const pattern of idPatterns) { - const match = newContent.match(pattern); - if (match && match[1] && match[1].length === 24) { - capturedId = match[1]; - captureSource = 'menu_page_content'; - break; - } - } - // Check iframes on new page - if (!capturedId) { - const newIframes = await page.evaluate(() => { - return Array.from(document.querySelectorAll('iframe')).map(f => f.src); - }); - for (const src of newIframes) { - if (src.includes('dutchie')) { - const match = src.match(/([a-f0-9]{24})/i); - if (match) { - capturedId = match[1]; - captureSource = 'menu_page_iframe'; - break; - } - } - } - } - } - catch (navError) { - // Menu navigation failed, continue - } - } - } - // Final wait for async content - if (!capturedId) { - await page.waitForTimeout(3000); - // Final iframe check - const finalIframes = await page.evaluate(() => { - return Array.from(document.querySelectorAll('iframe')).map(f => f.src); - }); - for (const src of finalIframes) { - if (src.includes('dutchie')) { - const 
match = src.match(/([a-f0-9]{24})/i); - if (match) { - capturedId = match[1]; - captureSource = 'delayed_iframe'; - break; - } - } - } - } - if (capturedId) { - console.log(` ✓ Found: ${capturedId} (${captureSource})`); - } - else { - console.log(` ✗ Not found`); - } - } - catch (e) { - errorMsg = e.message.substring(0, 100); - console.log(` ✗ Error: ${errorMsg}`); - } - finally { - await context.close(); - } - return { - id: dispensary.id, - name: dispensary.name, - website: dispensary.website, - platformId: capturedId, - source: captureSource, - error: errorMsg - }; -} -async function main() { - // Get specific dispensary ID from command line, or process all missing - const targetId = process.argv[2] ? parseInt(process.argv[2], 10) : null; - let query; - let params = []; - if (targetId) { - query = ` - SELECT id, name, website - FROM dispensaries - WHERE id = $1 - AND website IS NOT NULL AND website != '' - `; - params = [targetId]; - } - else { - query = ` - SELECT id, name, website - FROM dispensaries - WHERE state = 'AZ' - AND menu_type = 'dutchie' - AND (platform_dispensary_id IS NULL OR platform_dispensary_id = '') - AND website IS NOT NULL AND website != '' - ORDER BY name - `; - } - const result = await pool.query(query, params); - if (result.rows.length === 0) { - console.log('No dispensaries to process'); - await pool.end(); - return; - } - console.log(`\n=== Platform ID Extractor ===`); - console.log(`Processing ${result.rows.length} dispensaries...\n`); - const browser = await playwright_1.chromium.launch({ - headless: true, - args: ['--no-sandbox', '--disable-setuid-sandbox'] - }); - const results = []; - for (const dispensary of result.rows) { - const extractionResult = await extractPlatformId(browser, dispensary); - results.push(extractionResult); - // Update database immediately if found - if (extractionResult.platformId) { - await pool.query('UPDATE dispensaries SET platform_dispensary_id = $1 WHERE id = $2', [extractionResult.platformId, 
extractionResult.id]); - } - } - await browser.close(); - // Summary - console.log('\n' + '='.repeat(60)); - console.log('SUMMARY'); - console.log('='.repeat(60)); - const found = results.filter(r => r.platformId); - const notFound = results.filter(r => !r.platformId); - console.log(`\nFound: ${found.length}/${results.length}`); - if (found.length > 0) { - console.log('\nSuccessful extractions:'); - found.forEach(r => console.log(` [${r.id}] ${r.name} -> ${r.platformId} (${r.source})`)); - } - if (notFound.length > 0) { - console.log(`\nNot found: ${notFound.length}`); - notFound.forEach(r => { - const reason = r.error || 'No Dutchie ID detected'; - console.log(` [${r.id}] ${r.name}: ${reason}`); - }); - } - await pool.end(); -} -main().catch(e => { - console.error('Fatal error:', e); - process.exit(1); -}); diff --git a/backend/dist/scripts/queue-dispensaries.js b/backend/dist/scripts/queue-dispensaries.js deleted file mode 100644 index 4dc7f5b8..00000000 --- a/backend/dist/scripts/queue-dispensaries.js +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/env npx tsx -"use strict"; -/** - * Queue Dispensaries Script - * - * Orchestrates the multi-provider crawler system: - * 1. Queue dispensaries that need provider detection - * 2. Queue Dutchie dispensaries for production crawl - * 3. 
Queue sandbox dispensaries for learning crawls - * - * Usage: - * npx tsx src/scripts/queue-dispensaries.ts [--detection] [--production] [--sandbox] [--all] - * npx tsx src/scripts/queue-dispensaries.ts --dry-run - * npx tsx src/scripts/queue-dispensaries.ts --process # Process queued jobs - */ -Object.defineProperty(exports, "__esModule", { value: true }); -const migrate_1 = require("../db/migrate"); -const crawler_jobs_1 = require("../services/crawler-jobs"); -// Parse command line args -const args = process.argv.slice(2); -const flags = { - detection: args.includes('--detection') || args.includes('--all'), - production: args.includes('--production') || args.includes('--all'), - sandbox: args.includes('--sandbox') || args.includes('--all'), - dryRun: args.includes('--dry-run'), - process: args.includes('--process'), - help: args.includes('--help') || args.includes('-h'), - limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'), -}; -// If no specific flags, default to all -if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) { - flags.detection = true; - flags.production = true; - flags.sandbox = true; -} -async function showHelp() { - console.log(` -Queue Dispensaries - Multi-Provider Crawler Orchestration - -USAGE: - npx tsx src/scripts/queue-dispensaries.ts [OPTIONS] - -OPTIONS: - --detection Queue dispensaries that need provider detection - --production Queue Dutchie production crawls - --sandbox Queue sandbox/learning crawls - --all Queue all job types (default if no specific flag) - --process Process queued jobs instead of just queuing - --dry-run Show what would be queued without making changes - --limit=N Maximum dispensaries to queue per type (default: 10) - --help, -h Show this help message - -EXAMPLES: - # Queue all dispensaries for appropriate jobs - npx tsx src/scripts/queue-dispensaries.ts - - # Only queue detection jobs - npx tsx src/scripts/queue-dispensaries.ts --detection --limit=20 - - # Dry 
run to see what would be queued - npx tsx src/scripts/queue-dispensaries.ts --dry-run - - # Process sandbox jobs - npx tsx src/scripts/queue-dispensaries.ts --process -`); -} -async function queueDetectionJobs() { - console.log('\n📡 Queueing Detection Jobs...'); - // Find dispensaries that need provider detection: - // - menu_provider is null OR - // - menu_provider_confidence < 70 AND - // - crawler_status is idle (not already queued/running) - // - has a website URL - const query = ` - SELECT id, name, website, menu_url, menu_provider, menu_provider_confidence - FROM dispensaries - WHERE (website IS NOT NULL OR menu_url IS NOT NULL) - AND crawler_status = 'idle' - AND (menu_provider IS NULL OR menu_provider_confidence < 70) - ORDER BY - CASE WHEN menu_provider IS NULL THEN 0 ELSE 1 END, - menu_provider_confidence ASC - LIMIT $1 - `; - const result = await migrate_1.pool.query(query, [flags.limit]); - if (flags.dryRun) { - console.log(` Would queue ${result.rows.length} dispensaries for detection:`); - for (const row of result.rows) { - console.log(` - [${row.id}] ${row.name} (current: ${row.menu_provider || 'unknown'}, confidence: ${row.menu_provider_confidence}%)`); - } - return result.rows.length; - } - let queued = 0; - for (const dispensary of result.rows) { - try { - // Update status to queued - await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`, [dispensary.id]); - // Create sandbox job for detection - await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority) - VALUES ($1, 'detection', 'pending', 10)`, [dispensary.id]); - console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`); - queued++; - } - catch (error) { - console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`); - } - } - return queued; -} -async function queueProductionCrawls() { - console.log('\n🏭 Queueing Production Dutchie Crawls...'); - // Find 
Dutchie dispensaries ready for production crawl: - // - menu_provider = 'dutchie' - // - crawler_mode = 'production' - // - crawler_status is idle - // - last_menu_scrape is old or null - const query = ` - SELECT d.id, d.name, d.last_menu_scrape, d.menu_url - FROM dispensaries d - WHERE d.menu_provider = 'dutchie' - AND d.crawler_mode = 'production' - AND d.crawler_status = 'idle' - AND (d.last_menu_scrape IS NULL OR d.last_menu_scrape < NOW() - INTERVAL '4 hours') - ORDER BY - CASE WHEN d.last_menu_scrape IS NULL THEN 0 ELSE 1 END, - d.last_menu_scrape ASC - LIMIT $1 - `; - const result = await migrate_1.pool.query(query, [flags.limit]); - if (flags.dryRun) { - console.log(` Would queue ${result.rows.length} Dutchie dispensaries for production crawl:`); - for (const row of result.rows) { - const lastScrape = row.last_menu_scrape ? new Date(row.last_menu_scrape).toISOString() : 'never'; - console.log(` - [${row.id}] ${row.name} (last scrape: ${lastScrape})`); - } - return result.rows.length; - } - let queued = 0; - for (const dispensary of result.rows) { - try { - // Update status to queued - await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`, [dispensary.id]); - // Create crawl job in the main crawl_jobs table (production queue) - await migrate_1.pool.query(`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata) - SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50, - jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries') - FROM stores s - JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%') - WHERE d.id = $1 - LIMIT 1`, [dispensary.id]); - console.log(` ✓ Queued production crawl: [${dispensary.id}] ${dispensary.name}`); - queued++; - } - catch (error) { - console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`); - } - } - return queued; -} -async function queueSandboxCrawls() { - console.log('\n🧪 
Queueing Sandbox Crawls...'); - // Find sandbox dispensaries needing crawls: - // - crawler_mode = 'sandbox' - // - crawler_status in (idle, error_needs_review) - // - No recent sandbox job - const query = ` - SELECT d.id, d.name, d.menu_provider, d.crawler_status, d.website - FROM dispensaries d - WHERE d.crawler_mode = 'sandbox' - AND d.crawler_status IN ('idle', 'error_needs_review') - AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL) - AND NOT EXISTS ( - SELECT 1 FROM sandbox_crawl_jobs sj - WHERE sj.dispensary_id = d.id - AND sj.status IN ('pending', 'running') - ) - ORDER BY d.updated_at ASC - LIMIT $1 - `; - const result = await migrate_1.pool.query(query, [flags.limit]); - if (flags.dryRun) { - console.log(` Would queue ${result.rows.length} dispensaries for sandbox crawl:`); - for (const row of result.rows) { - console.log(` - [${row.id}] ${row.name} (provider: ${row.menu_provider || 'unknown'}, status: ${row.crawler_status})`); - } - return result.rows.length; - } - let queued = 0; - for (const dispensary of result.rows) { - try { - // Update status - await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_crawl', updated_at = NOW() WHERE id = $1`, [dispensary.id]); - // Create sandbox job - await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority) - VALUES ($1, 'deep_crawl', 'pending', 5)`, [dispensary.id]); - console.log(` ✓ Queued sandbox crawl: [${dispensary.id}] ${dispensary.name}`); - queued++; - } - catch (error) { - console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`); - } - } - return queued; -} -async function processJobs() { - console.log('\n⚙️ Processing Queued Jobs...\n'); - // Process sandbox jobs (detection + sandbox crawls) - const sandboxJobs = await migrate_1.pool.query(`SELECT * FROM sandbox_crawl_jobs - WHERE status = 'pending' - ORDER BY priority DESC, scheduled_at ASC - LIMIT $1`, [flags.limit]); - console.log(`Found ${sandboxJobs.rows.length} 
pending sandbox jobs\n`); - for (const job of sandboxJobs.rows) { - console.log(`Processing job ${job.id} (${job.job_type}) for dispensary ${job.dispensary_id}...`); - try { - // Mark as running - await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW() WHERE id = $1`, [job.id]); - let result; - if (job.job_type === 'detection') { - result = await (0, crawler_jobs_1.runDetectMenuProviderJob)(job.dispensary_id); - } - else { - result = await (0, crawler_jobs_1.runSandboxCrawlJob)(job.dispensary_id, job.sandbox_id); - } - // Update job status - await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs - SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3 - WHERE id = $4`, [ - result.success ? 'completed' : 'failed', - JSON.stringify(result.data || {}), - result.success ? null : result.message, - job.id, - ]); - console.log(` ${result.success ? '✓' : '✗'} ${result.message}\n`); - } - catch (error) { - await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]); - console.log(` ✗ Error: ${error.message}\n`); - } - } -} -async function showStats() { - console.log('\n📊 Current Stats:'); - // Dispensary stats - const stats = await migrate_1.pool.query(` - SELECT - COUNT(*) as total, - COUNT(*) FILTER (WHERE menu_provider IS NULL) as no_provider, - COUNT(*) FILTER (WHERE menu_provider = 'dutchie') as dutchie, - COUNT(*) FILTER (WHERE menu_provider NOT IN ('dutchie', 'unknown') AND menu_provider IS NOT NULL) as other_providers, - COUNT(*) FILTER (WHERE menu_provider = 'unknown') as unknown, - COUNT(*) FILTER (WHERE crawler_mode = 'production') as production_mode, - COUNT(*) FILTER (WHERE crawler_mode = 'sandbox') as sandbox_mode, - COUNT(*) FILTER (WHERE crawler_status = 'idle') as idle, - COUNT(*) FILTER (WHERE crawler_status LIKE 'queued%') as queued, - COUNT(*) FILTER (WHERE crawler_status = 'running') as running, - COUNT(*) FILTER 
(WHERE crawler_status = 'ok') as ok, - COUNT(*) FILTER (WHERE crawler_status = 'error_needs_review') as needs_review - FROM dispensaries - `); - const s = stats.rows[0]; - console.log(` - Dispensaries: ${s.total} - - No provider detected: ${s.no_provider} - - Dutchie: ${s.dutchie} - - Other providers: ${s.other_providers} - - Unknown: ${s.unknown} - - Crawler Mode: - - Production: ${s.production_mode} - - Sandbox: ${s.sandbox_mode} - - Status: - - Idle: ${s.idle} - - Queued: ${s.queued} - - Running: ${s.running} - - OK: ${s.ok} - - Needs Review: ${s.needs_review} -`); - // Job stats - const jobStats = await migrate_1.pool.query(` - SELECT - COUNT(*) FILTER (WHERE status = 'pending') as pending, - COUNT(*) FILTER (WHERE status = 'running') as running, - COUNT(*) FILTER (WHERE status = 'completed') as completed, - COUNT(*) FILTER (WHERE status = 'failed') as failed - FROM sandbox_crawl_jobs - `); - const j = jobStats.rows[0]; - console.log(` Sandbox Jobs: - - Pending: ${j.pending} - - Running: ${j.running} - - Completed: ${j.completed} - - Failed: ${j.failed} -`); -} -async function main() { - if (flags.help) { - await showHelp(); - process.exit(0); - } - console.log('═══════════════════════════════════════════════════════'); - console.log(' Multi-Provider Crawler Queue Manager'); - console.log('═══════════════════════════════════════════════════════'); - if (flags.dryRun) { - console.log('\n🔍 DRY RUN MODE - No changes will be made\n'); - } - try { - // Show current stats first - await showStats(); - if (flags.process) { - // Process mode - run jobs instead of queuing - await processJobs(); - } - else { - // Queuing mode - let totalQueued = 0; - if (flags.detection) { - totalQueued += await queueDetectionJobs(); - } - if (flags.production) { - totalQueued += await queueProductionCrawls(); - } - if (flags.sandbox) { - totalQueued += await queueSandboxCrawls(); - } - console.log('\n═══════════════════════════════════════════════════════'); - console.log(` Total 
dispensaries queued: ${totalQueued}`); - console.log('═══════════════════════════════════════════════════════\n'); - } - // Show updated stats - if (!flags.dryRun) { - await showStats(); - } - } - catch (error) { - console.error('Fatal error:', error); - process.exit(1); - } - finally { - await migrate_1.pool.end(); - } -} -main(); diff --git a/backend/dist/scripts/queue-intelligence.js b/backend/dist/scripts/queue-intelligence.js deleted file mode 100644 index 7a07f115..00000000 --- a/backend/dist/scripts/queue-intelligence.js +++ /dev/null @@ -1,473 +0,0 @@ -#!/usr/bin/env npx tsx -"use strict"; -/** - * Queue Intelligence Script - * - * Orchestrates the multi-category intelligence crawler system: - * 1. Queue dispensaries that need provider detection (all 4 categories) - * 2. Queue per-category production crawls (Dutchie products only for now) - * 3. Queue per-category sandbox crawls (all providers) - * - * Each category (product, specials, brand, metadata) is handled independently. - * A failure in one category does NOT affect other categories. 
- * - * Usage: - * npx tsx src/scripts/queue-intelligence.ts [--detection] [--production] [--sandbox] [--all] - * npx tsx src/scripts/queue-intelligence.ts --category=product --sandbox - * npx tsx src/scripts/queue-intelligence.ts --process --category=product - * npx tsx src/scripts/queue-intelligence.ts --dry-run - */ -Object.defineProperty(exports, "__esModule", { value: true }); -const migrate_1 = require("../db/migrate"); -const intelligence_detector_1 = require("../services/intelligence-detector"); -const category_crawler_jobs_1 = require("../services/category-crawler-jobs"); -// Parse command line args -const args = process.argv.slice(2); -const flags = { - detection: args.includes('--detection') || args.includes('--all'), - production: args.includes('--production') || args.includes('--all'), - sandbox: args.includes('--sandbox') || args.includes('--all'), - dryRun: args.includes('--dry-run'), - process: args.includes('--process'), - help: args.includes('--help') || args.includes('-h'), - limit: parseInt(args.find(a => a.startsWith('--limit='))?.split('=')[1] || '10'), - category: args.find(a => a.startsWith('--category='))?.split('=')[1], - dispensary: parseInt(args.find(a => a.startsWith('--dispensary='))?.split('=')[1] || '0'), -}; -// If no specific flags, default to all -if (!flags.detection && !flags.production && !flags.sandbox && !flags.process) { - flags.detection = true; - flags.production = true; - flags.sandbox = true; -} -const CATEGORIES = ['product', 'specials', 'brand', 'metadata']; -async function showHelp() { - console.log(` -Queue Intelligence - Multi-Category Crawler Orchestration - -USAGE: - npx tsx src/scripts/queue-intelligence.ts [OPTIONS] - -OPTIONS: - --detection Queue dispensaries that need multi-category detection - --production Queue per-category production crawls - --sandbox Queue per-category sandbox crawls - --all Queue all job types (default if no specific flag) - --process Process queued jobs instead of just queuing - 
--category=CATEGORY Filter to specific category (product|specials|brand|metadata) - --dispensary=ID Process only a specific dispensary - --dry-run Show what would be queued without making changes - --limit=N Maximum dispensaries to queue per type (default: 10) - --help, -h Show this help message - -CATEGORIES: - product - Product/menu data (Dutchie=production, others=sandbox) - specials - Deals and specials (all sandbox for now) - brand - Brand intelligence (all sandbox for now) - metadata - Categories/taxonomy (all sandbox for now) - -EXAMPLES: - # Queue all dispensaries for appropriate jobs - npx tsx src/scripts/queue-intelligence.ts - - # Only queue product detection jobs - npx tsx src/scripts/queue-intelligence.ts --detection --category=product - - # Process sandbox jobs for specials category - npx tsx src/scripts/queue-intelligence.ts --process --category=specials --limit=5 - - # Run full detection for a specific dispensary - npx tsx src/scripts/queue-intelligence.ts --process --detection --dispensary=123 - - # Dry run to see what would be queued - npx tsx src/scripts/queue-intelligence.ts --dry-run -`); -} -async function queueMultiCategoryDetection() { - console.log('\n📡 Queueing Multi-Category Detection Jobs...'); - // Find dispensaries that need provider detection for any category: - // - Any *_provider is null OR - // - Any *_confidence < 70 - // - has a website URL - const query = ` - SELECT id, name, website, menu_url, - product_provider, product_confidence, product_crawler_mode, - specials_provider, specials_confidence, specials_crawler_mode, - brand_provider, brand_confidence, brand_crawler_mode, - metadata_provider, metadata_confidence, metadata_crawler_mode - FROM dispensaries - WHERE (website IS NOT NULL OR menu_url IS NOT NULL) - AND ( - product_provider IS NULL OR product_confidence < 70 OR - specials_provider IS NULL OR specials_confidence < 70 OR - brand_provider IS NULL OR brand_confidence < 70 OR - metadata_provider IS NULL OR 
metadata_confidence < 70 - ) - ORDER BY - CASE WHEN product_provider IS NULL THEN 0 ELSE 1 END, - product_confidence ASC - LIMIT $1 - `; - const result = await migrate_1.pool.query(query, [flags.limit]); - if (flags.dryRun) { - console.log(` Would queue ${result.rows.length} dispensaries for multi-category detection:`); - for (const row of result.rows) { - const needsDetection = []; - if (!row.product_provider || row.product_confidence < 70) - needsDetection.push('product'); - if (!row.specials_provider || row.specials_confidence < 70) - needsDetection.push('specials'); - if (!row.brand_provider || row.brand_confidence < 70) - needsDetection.push('brand'); - if (!row.metadata_provider || row.metadata_confidence < 70) - needsDetection.push('metadata'); - console.log(` - [${row.id}] ${row.name} (needs: ${needsDetection.join(', ')})`); - } - return result.rows.length; - } - let queued = 0; - for (const dispensary of result.rows) { - try { - // Create detection jobs for each category that needs it - for (const category of CATEGORIES) { - const provider = dispensary[`${category}_provider`]; - const confidence = dispensary[`${category}_confidence`]; - if (!provider || confidence < 70) { - await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, category, job_type, status, priority) - VALUES ($1, $2, 'detection', 'pending', 10) - ON CONFLICT DO NOTHING`, [dispensary.id, category]); - } - } - console.log(` ✓ Queued detection: [${dispensary.id}] ${dispensary.name}`); - queued++; - } - catch (error) { - console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`); - } - } - return queued; -} -async function queueCategoryProductionCrawls(category) { - const categories = category ? 
[category] : CATEGORIES; - let totalQueued = 0; - for (const cat of categories) { - console.log(`\n🏭 Queueing Production ${cat.toUpperCase()} Crawls...`); - // For now, only products have production-ready crawlers (Dutchie only) - if (cat !== 'product') { - console.log(` ⏭️ No production crawler for ${cat} yet - skipping`); - continue; - } - // Find dispensaries ready for production crawl - const query = ` - SELECT id, name, ${cat}_provider as provider, last_${cat}_scan_at as last_scan - FROM dispensaries - WHERE ${cat}_provider = 'dutchie' - AND ${cat}_crawler_mode = 'production' - AND ${cat}_confidence >= 70 - AND (last_${cat}_scan_at IS NULL OR last_${cat}_scan_at < NOW() - INTERVAL '4 hours') - ORDER BY - CASE WHEN last_${cat}_scan_at IS NULL THEN 0 ELSE 1 END, - last_${cat}_scan_at ASC - LIMIT $1 - `; - const result = await migrate_1.pool.query(query, [flags.limit]); - if (flags.dryRun) { - console.log(` Would queue ${result.rows.length} dispensaries for ${cat} production crawl:`); - for (const row of result.rows) { - const lastScan = row.last_scan ? 
new Date(row.last_scan).toISOString() : 'never'; - console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, last: ${lastScan})`); - } - totalQueued += result.rows.length; - continue; - } - for (const dispensary of result.rows) { - try { - // For products, use the existing crawl_jobs table for production - await migrate_1.pool.query(`INSERT INTO crawl_jobs (store_id, job_type, trigger_type, status, priority, metadata) - SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50, - jsonb_build_object('dispensary_id', $1, 'category', $2, 'source', 'queue-intelligence') - FROM stores s - JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%') - WHERE d.id = $1 - LIMIT 1`, [dispensary.id, cat]); - console.log(` ✓ Queued ${cat} production: [${dispensary.id}] ${dispensary.name}`); - totalQueued++; - } - catch (error) { - console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`); - } - } - } - return totalQueued; -} -async function queueCategorySandboxCrawls(category) { - const categories = category ? 
[category] : CATEGORIES; - let totalQueued = 0; - for (const cat of categories) { - console.log(`\n🧪 Queueing Sandbox ${cat.toUpperCase()} Crawls...`); - // Find dispensaries in sandbox mode for this category - const query = ` - SELECT d.id, d.name, d.${cat}_provider as provider, d.${cat}_confidence as confidence, - d.website, d.menu_url - FROM dispensaries d - WHERE d.${cat}_crawler_mode = 'sandbox' - AND d.${cat}_provider IS NOT NULL - AND (d.website IS NOT NULL OR d.menu_url IS NOT NULL) - AND NOT EXISTS ( - SELECT 1 FROM sandbox_crawl_jobs sj - WHERE sj.dispensary_id = d.id - AND sj.category = $1 - AND sj.status IN ('pending', 'running') - ) - ORDER BY d.${cat}_confidence DESC, d.updated_at ASC - LIMIT $2 - `; - const result = await migrate_1.pool.query(query, [cat, flags.limit]); - if (flags.dryRun) { - console.log(` Would queue ${result.rows.length} dispensaries for ${cat} sandbox crawl:`); - for (const row of result.rows) { - console.log(` - [${row.id}] ${row.name} (provider: ${row.provider}, confidence: ${row.confidence}%)`); - } - totalQueued += result.rows.length; - continue; - } - for (const dispensary of result.rows) { - try { - // Create sandbox entry if needed - const sandboxResult = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, mode, status) - VALUES ($1, $2, $3, 'template_learning', 'pending') - ON CONFLICT (dispensary_id, category) WHERE status NOT IN ('moved_to_production', 'failed') - DO UPDATE SET updated_at = NOW() - RETURNING id`, [dispensary.id, cat, dispensary.provider]); - const sandboxId = sandboxResult.rows[0]?.id; - // Create sandbox job - await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, job_type, status, priority) - VALUES ($1, $2, $3, 'crawl', 'pending', 5)`, [dispensary.id, sandboxId, cat]); - console.log(` ✓ Queued ${cat} sandbox: [${dispensary.id}] ${dispensary.name} (${dispensary.provider})`); - totalQueued++; - } - catch 
(error) { - console.error(` ✗ Failed to queue [${dispensary.id}]: ${error.message}`); - } - } - } - return totalQueued; -} -async function processDetectionJobs() { - console.log('\n🔍 Processing Detection Jobs...'); - // Get pending detection jobs - const jobs = await migrate_1.pool.query(`SELECT DISTINCT dispensary_id - FROM sandbox_crawl_jobs - WHERE job_type = 'detection' AND status = 'pending' - ${flags.category ? `AND category = $2` : ''} - ${flags.dispensary ? `AND dispensary_id = $${flags.category ? '3' : '2'}` : ''} - LIMIT $1`, flags.category - ? (flags.dispensary ? [flags.limit, flags.category, flags.dispensary] : [flags.limit, flags.category]) - : (flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit])); - for (const job of jobs.rows) { - console.log(`\nProcessing detection for dispensary ${job.dispensary_id}...`); - try { - // Get dispensary info - const dispResult = await migrate_1.pool.query('SELECT id, name, website, menu_url FROM dispensaries WHERE id = $1', [job.dispensary_id]); - const dispensary = dispResult.rows[0]; - if (!dispensary) { - console.log(` ✗ Dispensary not found`); - continue; - } - const websiteUrl = dispensary.website || dispensary.menu_url; - if (!websiteUrl) { - console.log(` ✗ No website URL`); - continue; - } - // Mark jobs as running - await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'running', started_at = NOW() - WHERE dispensary_id = $1 AND job_type = 'detection' AND status = 'pending'`, [job.dispensary_id]); - // Run multi-category detection - console.log(` Detecting providers for ${dispensary.name}...`); - const detection = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl, { timeout: 45000 }); - // Update all categories - await (0, intelligence_detector_1.updateAllCategoryProviders)(job.dispensary_id, detection); - // Mark jobs as completed - await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'completed', completed_at = NOW(), - result_summary = $1 - 
WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`, [JSON.stringify({ - product: { provider: detection.product.provider, confidence: detection.product.confidence }, - specials: { provider: detection.specials.provider, confidence: detection.specials.confidence }, - brand: { provider: detection.brand.provider, confidence: detection.brand.confidence }, - metadata: { provider: detection.metadata.provider, confidence: detection.metadata.confidence }, - }), job.dispensary_id]); - console.log(` ✓ Detection complete:`); - console.log(` Product: ${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`); - console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`); - console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`); - console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`); - } - catch (error) { - console.log(` ✗ Error: ${error.message}`); - await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 - WHERE dispensary_id = $2 AND job_type = 'detection' AND status = 'running'`, [error.message, job.dispensary_id]); - } - } -} -async function processCrawlJobs() { - const categories = flags.category ? 
[flags.category] : CATEGORIES; - for (const cat of categories) { - console.log(`\n⚙️ Processing ${cat.toUpperCase()} Crawl Jobs...\n`); - // Process sandbox jobs for this category - if (flags.sandbox || !flags.production) { - await (0, category_crawler_jobs_1.processCategorySandboxJobs)(cat, flags.limit); - } - // Process production jobs for this category - if (flags.production && cat === 'product') { - // Get pending production crawls - const prodJobs = await migrate_1.pool.query(`SELECT d.id - FROM dispensaries d - WHERE d.product_provider = 'dutchie' - AND d.product_crawler_mode = 'production' - AND d.product_confidence >= 70 - ${flags.dispensary ? 'AND d.id = $2' : ''} - LIMIT $1`, flags.dispensary ? [flags.limit, flags.dispensary] : [flags.limit]); - for (const job of prodJobs.rows) { - console.log(`Processing production ${cat} crawl for dispensary ${job.id}...`); - const result = await (0, category_crawler_jobs_1.runCrawlProductsJob)(job.id); - console.log(` ${result.success ? '✓' : '✗'} ${result.message}`); - } - } - } -} -async function processSpecificDispensary() { - if (!flags.dispensary) - return; - console.log(`\n🎯 Processing Dispensary ${flags.dispensary}...\n`); - const dispResult = await migrate_1.pool.query('SELECT * FROM dispensaries WHERE id = $1', [flags.dispensary]); - if (dispResult.rows.length === 0) { - console.log('Dispensary not found'); - return; - } - const dispensary = dispResult.rows[0]; - console.log(`Name: ${dispensary.name}`); - console.log(`Website: ${dispensary.website || dispensary.menu_url || 'none'}`); - console.log(''); - if (flags.detection) { - console.log('Running multi-category detection...'); - const websiteUrl = dispensary.website || dispensary.menu_url; - if (websiteUrl) { - const detection = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl); - await (0, intelligence_detector_1.updateAllCategoryProviders)(flags.dispensary, detection); - console.log('Detection results:'); - console.log(` Product: 
${detection.product.provider} (${detection.product.confidence}%) -> ${detection.product.mode}`); - console.log(` Specials: ${detection.specials.provider} (${detection.specials.confidence}%) -> ${detection.specials.mode}`); - console.log(` Brand: ${detection.brand.provider} (${detection.brand.confidence}%) -> ${detection.brand.mode}`); - console.log(` Metadata: ${detection.metadata.provider} (${detection.metadata.confidence}%) -> ${detection.metadata.mode}`); - } - } - if (flags.production) { - console.log('\nRunning production crawls...'); - const results = await (0, category_crawler_jobs_1.runAllCategoryProductionCrawls)(flags.dispensary); - console.log(` ${results.summary}`); - } - if (flags.sandbox) { - console.log('\nRunning sandbox crawls...'); - const results = await (0, category_crawler_jobs_1.runAllCategorySandboxCrawls)(flags.dispensary); - console.log(` ${results.summary}`); - } -} -async function showStats() { - console.log('\n📊 Multi-Category Intelligence Stats:'); - // Per-category stats - for (const cat of CATEGORIES) { - const stats = await migrate_1.pool.query(` - SELECT - COUNT(*) as total, - COUNT(*) FILTER (WHERE ${cat}_provider IS NULL) as no_provider, - COUNT(*) FILTER (WHERE ${cat}_provider = 'dutchie') as dutchie, - COUNT(*) FILTER (WHERE ${cat}_provider = 'treez') as treez, - COUNT(*) FILTER (WHERE ${cat}_provider NOT IN ('dutchie', 'treez', 'unknown') AND ${cat}_provider IS NOT NULL) as other, - COUNT(*) FILTER (WHERE ${cat}_provider = 'unknown') as unknown, - COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'production') as production, - COUNT(*) FILTER (WHERE ${cat}_crawler_mode = 'sandbox') as sandbox, - AVG(${cat}_confidence) as avg_confidence - FROM dispensaries - `); - const s = stats.rows[0]; - console.log(` - ${cat.toUpperCase()}: - Providers: Dutchie=${s.dutchie}, Treez=${s.treez}, Other=${s.other}, Unknown=${s.unknown}, None=${s.no_provider} - Modes: Production=${s.production}, Sandbox=${s.sandbox} - Avg Confidence: 
${Math.round(s.avg_confidence || 0)}%`); - } - // Job stats per category - console.log('\n Sandbox Jobs by Category:'); - const jobStats = await migrate_1.pool.query(` - SELECT - category, - COUNT(*) FILTER (WHERE status = 'pending') as pending, - COUNT(*) FILTER (WHERE status = 'running') as running, - COUNT(*) FILTER (WHERE status = 'completed') as completed, - COUNT(*) FILTER (WHERE status = 'failed') as failed - FROM sandbox_crawl_jobs - GROUP BY category - ORDER BY category - `); - for (const row of jobStats.rows) { - console.log(` ${row.category}: pending=${row.pending}, running=${row.running}, completed=${row.completed}, failed=${row.failed}`); - } -} -async function main() { - if (flags.help) { - await showHelp(); - process.exit(0); - } - console.log('═══════════════════════════════════════════════════════'); - console.log(' Multi-Category Intelligence Queue Manager'); - console.log('═══════════════════════════════════════════════════════'); - if (flags.dryRun) { - console.log('\n🔍 DRY RUN MODE - No changes will be made\n'); - } - if (flags.category) { - console.log(`\n📌 Filtering to category: ${flags.category}\n`); - } - try { - // Show current stats first - await showStats(); - // If specific dispensary specified, process it directly - if (flags.dispensary && flags.process) { - await processSpecificDispensary(); - } - else if (flags.process) { - // Process mode - run jobs - if (flags.detection) { - await processDetectionJobs(); - } - await processCrawlJobs(); - } - else { - // Queuing mode - let totalQueued = 0; - if (flags.detection) { - totalQueued += await queueMultiCategoryDetection(); - } - if (flags.production) { - totalQueued += await queueCategoryProductionCrawls(flags.category); - } - if (flags.sandbox) { - totalQueued += await queueCategorySandboxCrawls(flags.category); - } - console.log('\n═══════════════════════════════════════════════════════'); - console.log(` Total queued: ${totalQueued}`); - 
console.log('═══════════════════════════════════════════════════════\n'); - } - // Show updated stats - if (!flags.dryRun) { - await showStats(); - } - } - catch (error) { - console.error('Fatal error:', error); - process.exit(1); - } - finally { - await migrate_1.pool.end(); - } -} -main(); diff --git a/backend/dist/scripts/run-dutchie-scrape.js b/backend/dist/scripts/run-dutchie-scrape.js deleted file mode 100644 index c2c8ca98..00000000 --- a/backend/dist/scripts/run-dutchie-scrape.js +++ /dev/null @@ -1,125 +0,0 @@ -"use strict"; -/** - * Run Dutchie GraphQL Scrape - * - * This script demonstrates the full pipeline: - * 1. Puppeteer navigates to Dutchie menu - * 2. GraphQL responses are intercepted - * 3. Products are normalized to our schema - * 4. Products are upserted to database - * 5. Derived views (brands, categories, specials) are automatically updated - */ -Object.defineProperty(exports, "__esModule", { value: true }); -const pg_1 = require("pg"); -const dutchie_graphql_1 = require("../scrapers/dutchie-graphql"); -const DATABASE_URL = process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus'; -async function main() { - const pool = new pg_1.Pool({ connectionString: DATABASE_URL }); - try { - console.log('='.repeat(80)); - console.log('DUTCHIE GRAPHQL SCRAPER - FULL PIPELINE TEST'); - console.log('='.repeat(80)); - console.log(`Database: ${DATABASE_URL.replace(/:[^:@]+@/, ':***@')}`); - // Configuration - const storeId = 1; // Deeply Rooted - const menuUrl = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted'; - console.log(`\nStore ID: ${storeId}`); - console.log(`Menu URL: ${menuUrl}`); - console.log('\n' + '-'.repeat(80)); - // Run the scrape - console.log('\n🚀 Starting scrape...\n'); - const result = await (0, dutchie_graphql_1.scrapeDutchieMenu)(pool, storeId, menuUrl); - console.log('\n' + '-'.repeat(80)); - console.log('📊 SCRAPE RESULTS:'); - console.log('-'.repeat(80)); - console.log(` Success: 
${result.success}`); - console.log(` Products Found: ${result.productsFound}`); - console.log(` Inserted: ${result.inserted}`); - console.log(` Updated: ${result.updated}`); - if (result.error) { - console.log(` Error: ${result.error}`); - } - // Query derived views to show the result - if (result.success) { - console.log('\n' + '-'.repeat(80)); - console.log('📈 DERIVED DATA (from products table):'); - console.log('-'.repeat(80)); - // Brands - const brandsResult = await pool.query(` - SELECT brand_name, product_count, min_price, max_price - FROM derived_brands - WHERE store_id = $1 - ORDER BY product_count DESC - LIMIT 5 - `, [storeId]); - console.log('\nTop 5 Brands:'); - brandsResult.rows.forEach(row => { - console.log(` - ${row.brand_name}: ${row.product_count} products ($${row.min_price} - $${row.max_price})`); - }); - // Specials - const specialsResult = await pool.query(` - SELECT name, brand, rec_price, rec_special_price, discount_percent - FROM current_specials - WHERE store_id = $1 - LIMIT 5 - `, [storeId]); - console.log('\nTop 5 Specials:'); - if (specialsResult.rows.length === 0) { - console.log(' (No specials found - is_on_special may not be populated yet)'); - } - else { - specialsResult.rows.forEach(row => { - console.log(` - ${row.name} (${row.brand}): $${row.rec_price} → $${row.rec_special_price} (${row.discount_percent}% off)`); - }); - } - // Categories - const categoriesResult = await pool.query(` - SELECT category_name, product_count - FROM derived_categories - WHERE store_id = $1 - ORDER BY product_count DESC - LIMIT 5 - `, [storeId]); - console.log('\nTop 5 Categories:'); - if (categoriesResult.rows.length === 0) { - console.log(' (No categories found - subcategory may not be populated yet)'); - } - else { - categoriesResult.rows.forEach(row => { - console.log(` - ${row.category_name}: ${row.product_count} products`); - }); - } - // Sample product - const sampleResult = await pool.query(` - SELECT name, brand, subcategory, rec_price, 
rec_special_price, is_on_special, thc_percentage, status - FROM products - WHERE store_id = $1 AND subcategory IS NOT NULL - ORDER BY updated_at DESC - LIMIT 1 - `, [storeId]); - if (sampleResult.rows.length > 0) { - const sample = sampleResult.rows[0]; - console.log('\nSample Product (with new fields):'); - console.log(` Name: ${sample.name}`); - console.log(` Brand: ${sample.brand}`); - console.log(` Category: ${sample.subcategory}`); - console.log(` Price: $${sample.rec_price}`); - console.log(` Sale Price: ${sample.rec_special_price ? `$${sample.rec_special_price}` : 'N/A'}`); - console.log(` On Special: ${sample.is_on_special}`); - console.log(` THC: ${sample.thc_percentage}%`); - console.log(` Status: ${sample.status}`); - } - } - console.log('\n' + '='.repeat(80)); - console.log('✅ SCRAPE COMPLETE'); - console.log('='.repeat(80)); - } - catch (error) { - console.error('\n❌ Error:', error.message); - throw error; - } - finally { - await pool.end(); - } -} -main().catch(console.error); diff --git a/backend/dist/scripts/scrape-all-active.js b/backend/dist/scripts/scrape-all-active.js deleted file mode 100644 index fb55b0d6..00000000 --- a/backend/dist/scripts/scrape-all-active.js +++ /dev/null @@ -1,279 +0,0 @@ -"use strict"; -/** - * Scrape ALL active products via direct GraphQL pagination - * This is more reliable than category navigation - */ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const puppeteer_extra_1 = __importDefault(require("puppeteer-extra")); -const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); -const pg_1 = require("pg"); -const dutchie_graphql_1 = require("../scrapers/dutchie-graphql"); -puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)()); -const DATABASE_URL = process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus'; -const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0'; -async function scrapeAllProducts(menuUrl, storeId) { - const pool = new pg_1.Pool({ connectionString: DATABASE_URL }); - const browser = await puppeteer_extra_1.default.launch({ - headless: 'new', - args: ['--no-sandbox', '--disable-setuid-sandbox'], - }); - try { - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36'); - console.log('Loading menu to establish session...'); - await page.goto(menuUrl, { - waitUntil: 'networkidle2', - timeout: 60000, - }); - await new Promise((r) => setTimeout(r, 3000)); - const dispensaryId = await page.evaluate(() => window.reactEnv?.dispensaryId); - console.log('Dispensary ID:', dispensaryId); - // Paginate through all products - const allProducts = []; - let pageNum = 0; - const perPage = 100; - console.log('\nFetching all products via paginated GraphQL...'); - while (true) { - const result = await page.evaluate(async (dispId, hash, page, perPage) => { - const variables = { - includeEnterpriseSpecials: false, - productsFilter: { - dispensaryId: dispId, - pricingType: 'rec', - Status: 'Active', - types: [], - useCache: false, - isDefaultSort: true, - sortBy: 'popularSortIdx', - sortDirection: 1, - bypassOnlineThresholds: true, - isKioskMenu: false, - 
removeProductsBelowOptionThresholds: false, - }, - page, - perPage, - }; - const qs = new URLSearchParams({ - operationName: 'FilteredProducts', - variables: JSON.stringify(variables), - extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }), - }); - const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, { - method: 'GET', - headers: { - 'content-type': 'application/json', - 'apollographql-client-name': 'Marketplace (production)', - }, - credentials: 'include', - }); - const json = await resp.json(); - return { - products: json?.data?.filteredProducts?.products || [], - totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount, - }; - }, dispensaryId, GRAPHQL_HASH, pageNum, perPage); - if (result.products.length === 0) { - break; - } - allProducts.push(...result.products); - console.log(`Page ${pageNum}: ${result.products.length} products (total so far: ${allProducts.length}/${result.totalCount})`); - pageNum++; - // Safety limit - if (pageNum > 50) { - console.log('Reached page limit'); - break; - } - } - console.log(`\nTotal products fetched: ${allProducts.length}`); - // Normalize and upsert - console.log('\nNormalizing and upserting to database...'); - const normalized = allProducts.map(dutchie_graphql_1.normalizeDutchieProduct); - const client = await pool.connect(); - let inserted = 0; - let updated = 0; - try { - await client.query('BEGIN'); - for (const product of normalized) { - const result = await client.query(` - INSERT INTO products ( - store_id, external_id, slug, name, enterprise_product_id, - brand, brand_external_id, brand_logo_url, - subcategory, strain_type, canonical_category, - price, rec_price, med_price, rec_special_price, med_special_price, - is_on_special, special_name, discount_percent, special_data, - sku, inventory_quantity, inventory_available, is_below_threshold, status, - thc_percentage, cbd_percentage, cannabinoids, - weight_mg, net_weight_value, net_weight_unit, options, raw_options, 
- image_url, additional_images, - is_featured, medical_only, rec_only, - source_created_at, source_updated_at, - description, raw_data, - dutchie_url, last_seen_at, updated_at - ) - VALUES ( - $1, $2, $3, $4, $5, - $6, $7, $8, - $9, $10, $11, - $12, $13, $14, $15, $16, - $17, $18, $19, $20, - $21, $22, $23, $24, $25, - $26, $27, $28, - $29, $30, $31, $32, $33, - $34, $35, - $36, $37, $38, - $39, $40, - $41, $42, - '', NOW(), NOW() - ) - ON CONFLICT (store_id, slug) DO UPDATE SET - name = EXCLUDED.name, - enterprise_product_id = EXCLUDED.enterprise_product_id, - brand = EXCLUDED.brand, - brand_external_id = EXCLUDED.brand_external_id, - brand_logo_url = EXCLUDED.brand_logo_url, - subcategory = EXCLUDED.subcategory, - strain_type = EXCLUDED.strain_type, - canonical_category = EXCLUDED.canonical_category, - price = EXCLUDED.price, - rec_price = EXCLUDED.rec_price, - med_price = EXCLUDED.med_price, - rec_special_price = EXCLUDED.rec_special_price, - med_special_price = EXCLUDED.med_special_price, - is_on_special = EXCLUDED.is_on_special, - special_name = EXCLUDED.special_name, - discount_percent = EXCLUDED.discount_percent, - special_data = EXCLUDED.special_data, - sku = EXCLUDED.sku, - inventory_quantity = EXCLUDED.inventory_quantity, - inventory_available = EXCLUDED.inventory_available, - is_below_threshold = EXCLUDED.is_below_threshold, - status = EXCLUDED.status, - thc_percentage = EXCLUDED.thc_percentage, - cbd_percentage = EXCLUDED.cbd_percentage, - cannabinoids = EXCLUDED.cannabinoids, - weight_mg = EXCLUDED.weight_mg, - net_weight_value = EXCLUDED.net_weight_value, - net_weight_unit = EXCLUDED.net_weight_unit, - options = EXCLUDED.options, - raw_options = EXCLUDED.raw_options, - image_url = EXCLUDED.image_url, - additional_images = EXCLUDED.additional_images, - is_featured = EXCLUDED.is_featured, - medical_only = EXCLUDED.medical_only, - rec_only = EXCLUDED.rec_only, - source_created_at = EXCLUDED.source_created_at, - source_updated_at = 
EXCLUDED.source_updated_at, - description = EXCLUDED.description, - raw_data = EXCLUDED.raw_data, - last_seen_at = NOW(), - updated_at = NOW() - RETURNING (xmax = 0) AS was_inserted - `, [ - storeId, - product.external_id, - product.slug, - product.name, - product.enterprise_product_id, - product.brand, - product.brand_external_id, - product.brand_logo_url, - product.subcategory, - product.strain_type, - product.canonical_category, - product.price, - product.rec_price, - product.med_price, - product.rec_special_price, - product.med_special_price, - product.is_on_special, - product.special_name, - product.discount_percent, - product.special_data ? JSON.stringify(product.special_data) : null, - product.sku, - product.inventory_quantity, - product.inventory_available, - product.is_below_threshold, - product.status, - product.thc_percentage, - product.cbd_percentage, - product.cannabinoids ? JSON.stringify(product.cannabinoids) : null, - product.weight_mg, - product.net_weight_value, - product.net_weight_unit, - product.options, - product.raw_options, - product.image_url, - product.additional_images, - product.is_featured, - product.medical_only, - product.rec_only, - product.source_created_at, - product.source_updated_at, - product.description, - product.raw_data ? 
JSON.stringify(product.raw_data) : null, - ]); - if (result.rows[0]?.was_inserted) { - inserted++; - } - else { - updated++; - } - } - await client.query('COMMIT'); - } - catch (error) { - await client.query('ROLLBACK'); - throw error; - } - finally { - client.release(); - } - console.log(`\nDatabase: ${inserted} inserted, ${updated} updated`); - // Show summary stats - const stats = await pool.query(` - SELECT - COUNT(*) as total, - COUNT(*) FILTER (WHERE is_on_special) as specials, - COUNT(DISTINCT brand) as brands, - COUNT(DISTINCT subcategory) as categories - FROM products WHERE store_id = $1 - `, [storeId]); - console.log('\nStore summary:'); - console.log(` Total products: ${stats.rows[0].total}`); - console.log(` On special: ${stats.rows[0].specials}`); - console.log(` Unique brands: ${stats.rows[0].brands}`); - console.log(` Categories: ${stats.rows[0].categories}`); - return { - success: true, - totalProducts: allProducts.length, - inserted, - updated, - }; - } - finally { - await browser.close(); - await pool.end(); - } -} -// Run -const menuUrl = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted'; -const storeId = parseInt(process.argv[3] || '1', 10); -console.log('='.repeat(60)); -console.log('DUTCHIE GRAPHQL FULL SCRAPE'); -console.log('='.repeat(60)); -console.log(`Menu URL: ${menuUrl}`); -console.log(`Store ID: ${storeId}`); -console.log(''); -scrapeAllProducts(menuUrl, storeId) - .then((result) => { - console.log('\n' + '='.repeat(60)); - console.log('COMPLETE'); - console.log(JSON.stringify(result, null, 2)); -}) - .catch((error) => { - console.error('Error:', error.message); - process.exit(1); -}); diff --git a/backend/dist/scripts/test-dutchie-e2e.js b/backend/dist/scripts/test-dutchie-e2e.js deleted file mode 100644 index 63bb215a..00000000 --- a/backend/dist/scripts/test-dutchie-e2e.js +++ /dev/null @@ -1,169 +0,0 @@ -"use strict"; -/** - * Test script: End-to-end Dutchie GraphQL → DB → Dashboard flow - * - * This 
demonstrates the complete data pipeline: - * 1. Fetch one product from Dutchie GraphQL via Puppeteer - * 2. Normalize it to our schema - * 3. Show the mapping - */ -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -Object.defineProperty(exports, "__esModule", { value: true }); -const dutchie_graphql_1 = require("../scrapers/dutchie-graphql"); -const fs = __importStar(require("fs")); -// Load the captured sample product from schema capture -const capturedData = JSON.parse(fs.readFileSync('/tmp/dutchie-schema-capture.json', 'utf-8')); -const sampleProduct = capturedData.sampleProduct; -console.log('='.repeat(80)); -console.log('DUTCHIE GRAPHQL → DATABASE MAPPING DEMONSTRATION'); -console.log('='.repeat(80)); -console.log('\n📥 RAW DUTCHIE GRAPHQL PRODUCT:'); 
-console.log('-'.repeat(80)); -// Show key fields from raw product -const keyRawFields = { - '_id': sampleProduct._id, - 'Name': sampleProduct.Name, - 'cName': sampleProduct.cName, - 'brandName': sampleProduct.brandName, - 'brand.id': sampleProduct.brand?.id, - 'type': sampleProduct.type, - 'subcategory': sampleProduct.subcategory, - 'strainType': sampleProduct.strainType, - 'Prices': sampleProduct.Prices, - 'recPrices': sampleProduct.recPrices, - 'recSpecialPrices': sampleProduct.recSpecialPrices, - 'special': sampleProduct.special, - 'specialData.saleSpecials[0].specialName': sampleProduct.specialData?.saleSpecials?.[0]?.specialName, - 'specialData.saleSpecials[0].discount': sampleProduct.specialData?.saleSpecials?.[0]?.discount, - 'THCContent.range[0]': sampleProduct.THCContent?.range?.[0], - 'CBDContent.range[0]': sampleProduct.CBDContent?.range?.[0], - 'Status': sampleProduct.Status, - 'Image': sampleProduct.Image, - 'POSMetaData.canonicalSKU': sampleProduct.POSMetaData?.canonicalSKU, - 'POSMetaData.children[0].quantity': sampleProduct.POSMetaData?.children?.[0]?.quantity, - 'POSMetaData.children[0].quantityAvailable': sampleProduct.POSMetaData?.children?.[0]?.quantityAvailable, -}; -Object.entries(keyRawFields).forEach(([key, value]) => { - console.log(` ${key}: ${JSON.stringify(value)}`); -}); -console.log('\n📤 NORMALIZED DATABASE ROW:'); -console.log('-'.repeat(80)); -// Normalize the product -const normalized = (0, dutchie_graphql_1.normalizeDutchieProduct)(sampleProduct); -// Show the normalized result (excluding raw_data for readability) -const { raw_data, cannabinoids, special_data, ...displayFields } = normalized; -Object.entries(displayFields).forEach(([key, value]) => { - if (value !== undefined && value !== null) { - console.log(` ${key}: ${JSON.stringify(value)}`); - } -}); -console.log('\n🔗 FIELD MAPPING:'); -console.log('-'.repeat(80)); -const fieldMappings = [ - ['_id / id', 'external_id', sampleProduct._id, normalized.external_id], - ['Name', 
'name', sampleProduct.Name, normalized.name], - ['cName', 'slug', sampleProduct.cName, normalized.slug], - ['brandName', 'brand', sampleProduct.brandName, normalized.brand], - ['brand.id', 'brand_external_id', sampleProduct.brand?.id, normalized.brand_external_id], - ['subcategory', 'subcategory', sampleProduct.subcategory, normalized.subcategory], - ['strainType', 'strain_type', sampleProduct.strainType, normalized.strain_type], - ['recPrices[0]', 'rec_price', sampleProduct.recPrices?.[0], normalized.rec_price], - ['recSpecialPrices[0]', 'rec_special_price', sampleProduct.recSpecialPrices?.[0], normalized.rec_special_price], - ['special', 'is_on_special', sampleProduct.special, normalized.is_on_special], - ['specialData...specialName', 'special_name', sampleProduct.specialData?.saleSpecials?.[0]?.specialName?.substring(0, 40) + '...', normalized.special_name?.substring(0, 40) + '...'], - ['THCContent.range[0]', 'thc_percentage', sampleProduct.THCContent?.range?.[0], normalized.thc_percentage], - ['CBDContent.range[0]', 'cbd_percentage', sampleProduct.CBDContent?.range?.[0], normalized.cbd_percentage], - ['Status', 'status', sampleProduct.Status, normalized.status], - ['Image', 'image_url', sampleProduct.Image?.substring(0, 50) + '...', normalized.image_url?.substring(0, 50) + '...'], - ['POSMetaData.canonicalSKU', 'sku', sampleProduct.POSMetaData?.canonicalSKU, normalized.sku], -]; -console.log(' GraphQL Field → DB Column | Value'); -console.log(' ' + '-'.repeat(75)); -fieldMappings.forEach(([gqlField, dbCol, gqlVal, dbVal]) => { - const gqlStr = String(gqlField).padEnd(30); - const dbStr = String(dbCol).padEnd(20); - console.log(` ${gqlStr} → ${dbStr} | ${JSON.stringify(dbVal)}`); -}); -console.log('\n📊 SQL INSERT STATEMENT:'); -console.log('-'.repeat(80)); -// Generate example SQL -const sqlExample = ` -INSERT INTO products ( - store_id, external_id, slug, name, - brand, brand_external_id, - subcategory, strain_type, - rec_price, rec_special_price, - 
is_on_special, special_name, discount_percent, - thc_percentage, cbd_percentage, - status, image_url, sku -) VALUES ( - 1, -- store_id (Deeply Rooted) - '${normalized.external_id}', -- external_id - '${normalized.slug}', -- slug - '${normalized.name}', -- name - '${normalized.brand}', -- brand - '${normalized.brand_external_id}', -- brand_external_id - '${normalized.subcategory}', -- subcategory - '${normalized.strain_type}', -- strain_type - ${normalized.rec_price}, -- rec_price - ${normalized.rec_special_price}, -- rec_special_price - ${normalized.is_on_special}, -- is_on_special - '${normalized.special_name?.substring(0, 50)}...', -- special_name - ${normalized.discount_percent || 'NULL'}, -- discount_percent - ${normalized.thc_percentage}, -- thc_percentage - ${normalized.cbd_percentage}, -- cbd_percentage - '${normalized.status}', -- status - '${normalized.image_url}', -- image_url - '${normalized.sku}' -- sku -) -ON CONFLICT (store_id, slug) DO UPDATE SET ...; -`; -console.log(sqlExample); -console.log('\n✅ SUMMARY:'); -console.log('-'.repeat(80)); -console.log(` Product: ${normalized.name}`); -console.log(` Brand: ${normalized.brand}`); -console.log(` Category: ${normalized.subcategory}`); -console.log(` Price: $${normalized.rec_price} → $${normalized.rec_special_price} (${normalized.discount_percent}% off)`); -console.log(` THC: ${normalized.thc_percentage}%`); -console.log(` Status: ${normalized.status}`); -console.log(` On Special: ${normalized.is_on_special}`); -console.log(` SKU: ${normalized.sku}`); -console.log('\n🎯 DERIVED VIEWS (computed from products table):'); -console.log('-'.repeat(80)); -console.log(' - current_specials: Products where is_on_special = true'); -console.log(' - derived_brands: Aggregated by brand name with counts/prices'); -console.log(' - derived_categories: Aggregated by subcategory'); -console.log('\nAll views are computed from the single products table - no separate tables needed!'); diff --git 
a/backend/dist/scripts/test-dutchie-graphql.js b/backend/dist/scripts/test-dutchie-graphql.js deleted file mode 100644 index 8cf8962f..00000000 --- a/backend/dist/scripts/test-dutchie-graphql.js +++ /dev/null @@ -1,179 +0,0 @@ -"use strict"; -/** - * Test script to validate Dutchie GraphQL API access and capture response structure - */ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -// @ts-ignore - node-fetch type declaration not installed -const node_fetch_1 = __importDefault(require("node-fetch")); -const GRAPHQL_HASHES = { - ConsumerDispensaries: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b', - GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b', - FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0', - MenuFiltersV2: '2f0b3233b8a2426b391649ca3f0f7a5d43b9aefd683f6286d7261a2517e3568e', - FilteredSpecials: '0dfb85a4fc138c55a076d4d11bf6d1a25f7cbd511428e1cf5a5b863b3eb23f25', -}; -async function fetchProducts(dispensaryId, page = 0, perPage = 25) { - const session = 'crawlsy-session-' + Date.now(); - const variables = { - includeEnterpriseSpecials: false, - productsFilter: { - dispensaryId, - pricingType: 'rec', - Status: null, // null to include all (in-stock and out-of-stock) - types: [], - useCache: true, - isDefaultSort: true, - sortBy: 'popularSortIdx', - sortDirection: 1, - bypassOnlineThresholds: true, - isKioskMenu: false, - removeProductsBelowOptionThresholds: false - }, - page, - perPage - }; - const qs = new URLSearchParams({ - operationName: 'FilteredProducts', - variables: JSON.stringify(variables), - extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.FilteredProducts } }) - }); - const res = await (0, 
node_fetch_1.default)(`https://dutchie.com/api-3/graphql?${qs.toString()}`, { - headers: { - 'x-dutchie-session': session, - 'apollographql-client-name': 'Marketplace (production)', - 'content-type': 'application/json', - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' - } - }); - if (!res.ok) { - const text = await res.text(); - console.error('HTTP Status:', res.status); - console.error('Response:', text.substring(0, 500)); - throw new Error(`HTTP ${res.status}: ${text.substring(0, 200)}`); - } - return res.json(); -} -async function resolveDispensaryId(cName) { - const session = 'crawlsy-session-' + Date.now(); - const variables = { input: { dispensaryId: cName } }; - const qs = new URLSearchParams({ - operationName: 'GetAddressBasedDispensaryData', - variables: JSON.stringify(variables), - extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.GetAddressBasedDispensaryData } }) - }); - const res = await (0, node_fetch_1.default)(`https://dutchie.com/graphql?${qs.toString()}`, { - headers: { - 'x-dutchie-session': session, - 'apollographql-client-name': 'Marketplace (production)', - 'content-type': 'application/json', - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' - } - }); - if (!res.ok) { - console.error('Failed to resolve dispensary ID:', res.status); - return null; - } - const data = await res.json(); - return data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId || null; -} -function enumerateFields(obj, prefix = '') { - const fields = []; - for (const [key, value] of Object.entries(obj)) { - const path = prefix ? 
`${prefix}.${key}` : key; - if (value === null) { - fields.push(`${path}: null`); - } - else if (Array.isArray(value)) { - fields.push(`${path}: Array[${value.length}]`); - if (value.length > 0 && typeof value[0] === 'object') { - const subFields = enumerateFields(value[0], `${path}[0]`); - fields.push(...subFields); - } - } - else if (typeof value === 'object') { - fields.push(`${path}: Object`); - const subFields = enumerateFields(value, path); - fields.push(...subFields); - } - else { - const typeStr = typeof value; - const preview = String(value).substring(0, 50); - fields.push(`${path}: ${typeStr} = "${preview}"`); - } - } - return fields; -} -async function main() { - console.log('='.repeat(80)); - console.log('DUTCHIE GRAPHQL API TEST'); - console.log('='.repeat(80)); - const cName = 'AZ-Deeply-Rooted'; - // Step 1: Resolve dispensary ID - console.log(`\n1. Resolving dispensary ID for "${cName}"...`); - const dispensaryId = await resolveDispensaryId(cName); - const finalDispensaryId = dispensaryId || '6405ef617056e8014d79101b'; // Fallback to known ID - if (!dispensaryId) { - console.log(' Failed to resolve via API, using hardcoded ID: 6405ef617056e8014d79101b'); - } - console.log(` Final ID: ${finalDispensaryId}`); - // Step 2: Fetch first page of products - console.log('\n2. Fetching products (page 0, perPage 5)...'); - const result = await fetchProducts(finalDispensaryId, 0, 5); - if (result.errors) { - console.error('\nGraphQL Errors:'); - console.error(JSON.stringify(result.errors, null, 2)); - return; - } - const products = result?.data?.filteredProducts?.products || []; - console.log(` Found ${products.length} products in this page`); - if (products.length === 0) { - console.log('No products returned. Full response:'); - console.log(JSON.stringify(result, null, 2)); - return; - } - // Step 3: Enumerate all fields from first product - console.log('\n3. 
PRODUCT FIELD STRUCTURE (from first product):'); - console.log('-'.repeat(80)); - const product = products[0]; - const fields = enumerateFields(product); - fields.forEach(f => console.log(` ${f}`)); - // Step 4: Show full sample product JSON - console.log('\n4. FULL SAMPLE PRODUCT JSON:'); - console.log('-'.repeat(80)); - console.log(JSON.stringify(product, null, 2)); - // Step 5: Summary of key fields for schema design - console.log('\n5. KEY FIELDS FOR SCHEMA DESIGN:'); - console.log('-'.repeat(80)); - const keyFields = [ - { field: 'id', value: product.id }, - { field: 'name', value: product.name }, - { field: 'slug', value: product.slug }, - { field: 'brand', value: product.brand }, - { field: 'brandId', value: product.brandId }, - { field: 'type', value: product.type }, - { field: 'category', value: product.category }, - { field: 'subcategory', value: product.subcategory }, - { field: 'strainType', value: product.strainType }, - { field: 'THCContent', value: product.THCContent }, - { field: 'CBDContent', value: product.CBDContent }, - { field: 'description', value: product.description?.substring(0, 100) + '...' }, - { field: 'image', value: product.image }, - { field: 'options.length', value: product.options?.length }, - { field: 'pricing', value: product.pricing }, - { field: 'terpenes.length', value: product.terpenes?.length }, - { field: 'effects.length', value: product.effects?.length }, - ]; - keyFields.forEach(({ field, value }) => { - console.log(` ${field}: ${JSON.stringify(value)}`); - }); - // Step 6: Show an option (variant) if available - if (product.options && product.options.length > 0) { - console.log('\n6. 
SAMPLE OPTION/VARIANT:'); - console.log('-'.repeat(80)); - console.log(JSON.stringify(product.options[0], null, 2)); - } -} -main().catch(console.error); diff --git a/backend/dist/scripts/test-jane-scraper.js b/backend/dist/scripts/test-jane-scraper.js deleted file mode 100644 index 3477a724..00000000 --- a/backend/dist/scripts/test-jane-scraper.js +++ /dev/null @@ -1,255 +0,0 @@ -"use strict"; -/** - * Test script for iHeartJane menu scraping via Playwright - * Intercepts API/Algolia calls made by the browser - */ -Object.defineProperty(exports, "__esModule", { value: true }); -const playwright_1 = require("playwright"); -async function scrapeJaneMenu(urlOrStoreId) { - // Handle either a full URL or just a store ID - const menuUrl = urlOrStoreId.startsWith('http') - ? urlOrStoreId - : `https://www.iheartjane.com/embed/stores/${urlOrStoreId}/menu`; - console.log(`Starting Playwright scrape for iHeartJane: ${menuUrl}`); - const browser = await playwright_1.chromium.launch({ - headless: true, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-blink-features=AutomationControlled' - ] - }); - const context = await browser.newContext({ - userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - viewport: { width: 1920, height: 1080 }, - locale: 'en-US', - timezoneId: 'America/Chicago' - }); - // Add stealth scripts to avoid detection - await context.addInitScript(() => { - Object.defineProperty(navigator, 'webdriver', { get: () => false }); - window.chrome = { runtime: {} }; - }); - const page = await context.newPage(); - const products = []; - const apiResponses = []; - const capturedCredentials = {}; - // Intercept ALL network requests to capture API/Algolia data and credentials - page.on('request', (request) => { - const url = request.url(); - const headers = request.headers(); - // Capture Algolia credentials from request headers - if (url.includes('algolia')) { - const appId = 
headers['x-algolia-application-id']; - const apiKey = headers['x-algolia-api-key']; - if (appId && apiKey) { - capturedCredentials.algolia = { appId, apiKey }; - console.log(`Captured Algolia credentials: App=${appId}, Key=${apiKey.substring(0, 10)}...`); - } - } - }); - page.on('response', async (response) => { - const url = response.url(); - // Capture Algolia search results - if (url.includes('algolia.net') || url.includes('algolianet.com')) { - try { - const data = await response.json(); - if (data.results && data.results[0] && data.results[0].hits) { - console.log(`Captured ${data.results[0].hits.length} products from Algolia`); - apiResponses.push({ type: 'algolia', data: data.results[0] }); - } - } - catch (e) { - // Not JSON or error parsing - } - } - // Capture Jane API responses - if (url.includes('api.iheartjane.com') && url.includes('products')) { - try { - const data = await response.json(); - console.log(`Captured Jane API response: ${url}`); - apiResponses.push({ type: 'jane-api', url, data }); - } - catch (e) { - // Not JSON or error parsing - } - } - }); - try { - console.log(`Navigating to: ${menuUrl}`); - await page.goto(menuUrl, { - waitUntil: 'domcontentloaded', - timeout: 60000 - }); - // Wait for page to settle - await page.waitForTimeout(2000); - // Handle age gate - use Playwright locator with force click - console.log('Looking for age gate...'); - try { - let clicked = false; - // Method 1: Use Playwright locator with exact text match - try { - const yesButton = page.locator('button:has-text("Yes")').first(); - await yesButton.waitFor({ state: 'visible', timeout: 5000 }); - await yesButton.click({ force: true }); - clicked = true; - console.log('Clicked age gate via Playwright locator'); - await page.waitForTimeout(5000); - } - catch (e) { - console.log('Playwright locator failed:', e.message); - } - // Method 2: Try clicking by visible bounding box - if (!clicked) { - try { - const box = await 
page.locator('button:has-text("Yes")').first().boundingBox(); - if (box) { - await page.mouse.click(box.x + box.width / 2, box.y + box.height / 2); - clicked = true; - console.log(`Clicked age gate at coordinates: ${box.x + box.width / 2}, ${box.y + box.height / 2}`); - await page.waitForTimeout(5000); - } - } - catch (e) { - console.log('Bounding box click failed'); - } - } - // Method 3: Try JavaScript click - if (!clicked) { - const jsClickResult = await page.evaluate(() => { - const buttons = Array.from(document.querySelectorAll('button')); - for (const btn of buttons) { - if (btn.textContent?.includes('Yes')) { - btn.click(); - return { success: true, buttonText: btn.textContent }; - } - } - return { success: false }; - }); - if (jsClickResult.success) { - clicked = true; - console.log(`Clicked via JS: ${jsClickResult.buttonText}`); - await page.waitForTimeout(5000); - } - } - // Method 4: Click element containing "Yes" with dispatchEvent - if (!clicked) { - const dispatchResult = await page.evaluate(() => { - const buttons = Array.from(document.querySelectorAll('button')); - for (const btn of buttons) { - if (btn.textContent?.includes('Yes')) { - btn.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true })); - return true; - } - } - return false; - }); - if (dispatchResult) { - clicked = true; - console.log('Clicked via dispatchEvent'); - await page.waitForTimeout(5000); - } - } - // Log button info for debugging - const buttonInfo = await page.evaluate(() => { - const buttons = Array.from(document.querySelectorAll('button')); - return buttons.map(b => ({ - text: b.textContent?.trim(), - visible: b.offsetParent !== null, - rect: b.getBoundingClientRect() - })); - }); - console.log('Buttons found:', JSON.stringify(buttonInfo, null, 2)); - } - catch (e) { - console.log('Age gate handling error:', e); - } - // Wait for content to load after age gate - await page.waitForTimeout(3000); - // Try to scroll to trigger more product loads - 
console.log('Scrolling to load more products...'); - for (let i = 0; i < 3; i++) { - await page.evaluate(() => window.scrollBy(0, 1000)); - await page.waitForTimeout(1000); - } - // Extract products from the page DOM as backup - const domProducts = await page.evaluate(() => { - const items = []; - // Try various selectors that Jane might use - const productCards = document.querySelectorAll('[data-testid*="product"], [class*="ProductCard"], [class*="product-card"], .product-tile'); - productCards.forEach((card) => { - const name = card.querySelector('[class*="name"], [class*="title"], h3, h4')?.textContent?.trim(); - const brand = card.querySelector('[class*="brand"]')?.textContent?.trim(); - const price = card.querySelector('[class*="price"]')?.textContent?.trim(); - const image = card.querySelector('img')?.getAttribute('src'); - if (name) { - items.push({ name, brand, price, image, source: 'dom' }); - } - }); - return items; - }); - console.log(`Extracted ${domProducts.length} products from DOM`); - // Check for __NEXT_DATA__ or similar embedded data - const embeddedData = await page.evaluate(() => { - // Check for Next.js data - const nextData = document.getElementById('__NEXT_DATA__'); - if (nextData) { - return { type: 'next', data: JSON.parse(nextData.textContent || '{}') }; - } - // Check for any window-level product data - const win = window; - if (win.__INITIAL_STATE__) - return { type: 'initial_state', data: win.__INITIAL_STATE__ }; - if (win.__PRELOADED_STATE__) - return { type: 'preloaded', data: win.__PRELOADED_STATE__ }; - if (win.products) - return { type: 'products', data: win.products }; - return null; - }); - if (embeddedData) { - console.log(`Found embedded data: ${embeddedData.type}`); - apiResponses.push(embeddedData); - } - // Take a screenshot for debugging - const screenshotPath = `/tmp/jane-scrape-${Date.now()}.png`; - await page.screenshot({ path: screenshotPath, fullPage: true }); - console.log(`Screenshot saved to ${screenshotPath}`); - 
// Process captured API responses - console.log('\n=== API Responses Summary ==='); - for (const resp of apiResponses) { - console.log(`Type: ${resp.type}`); - if (resp.type === 'algolia' && resp.data.hits) { - console.log(` Hits: ${resp.data.hits.length}`); - console.log(` Total: ${resp.data.nbHits}`); - if (resp.data.hits[0]) { - console.log(` Sample product:`, JSON.stringify(resp.data.hits[0], null, 2).substring(0, 1000)); - } - } - } - console.log('\n=== DOM Products Sample ==='); - console.log(JSON.stringify(domProducts.slice(0, 3), null, 2)); - console.log('\n=== Captured Credentials ==='); - console.log(JSON.stringify(capturedCredentials, null, 2)); - return { - apiResponses, - domProducts, - embeddedData, - capturedCredentials - }; - } - finally { - await browser.close(); - } -} -// Main execution -const urlOrStoreId = process.argv[2] || 'https://iheartjane.com/aly2djS2yXoTGnR0/DBeqE6HSSwijog9l'; // Default to The Flower Shop Az -scrapeJaneMenu(urlOrStoreId) - .then((result) => { - console.log('\n=== Scrape Complete ==='); - console.log(`Total API responses captured: ${result.apiResponses.length}`); - console.log(`Total DOM products: ${result.domProducts.length}`); -}) - .catch((err) => { - console.error('Scrape failed:', err); - process.exit(1); -}); diff --git a/backend/dist/scripts/test-status-filter.js b/backend/dist/scripts/test-status-filter.js deleted file mode 100644 index 86a663c0..00000000 --- a/backend/dist/scripts/test-status-filter.js +++ /dev/null @@ -1,84 +0,0 @@ -"use strict"; -/** - * Test different Status filter values in Dutchie GraphQL - */ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const puppeteer_extra_1 = __importDefault(require("puppeteer-extra")); -const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); -puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)()); -const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0'; -async function main() { - const browser = await puppeteer_extra_1.default.launch({ - headless: 'new', - args: ['--no-sandbox', '--disable-setuid-sandbox'], - }); - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36'); - console.log('Loading menu...'); - await page.goto('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted', { - waitUntil: 'networkidle2', - timeout: 60000, - }); - await new Promise((r) => setTimeout(r, 3000)); - const dispensaryId = await page.evaluate(() => window.reactEnv?.dispensaryId); - console.log('Dispensary ID:', dispensaryId); - // Test different status values - const testCases = [ - { label: 'Active', status: 'Active', includeStatus: true }, - { label: 'Inactive', status: 'Inactive', includeStatus: true }, - { label: 'null', status: null, includeStatus: true }, - { label: 'omitted', status: null, includeStatus: false }, - ]; - for (const testCase of testCases) { - const result = await page.evaluate(async (dispId, hash, status, includeStatus) => { - const filter = { - dispensaryId: dispId, - pricingType: 'rec', - types: [], - useCache: false, - isDefaultSort: true, - sortBy: 'popularSortIdx', - sortDirection: 1, - bypassOnlineThresholds: true, - isKioskMenu: false, - removeProductsBelowOptionThresholds: false, - }; - if (includeStatus) { - filter.Status = status; - } - const variables = { - includeEnterpriseSpecials: false, - productsFilter: filter, - page: 0, - perPage: 100, - }; - const qs = new 
URLSearchParams({ - operationName: 'FilteredProducts', - variables: JSON.stringify(variables), - extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }), - }); - const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, { - method: 'GET', - headers: { - 'content-type': 'application/json', - 'apollographql-client-name': 'Marketplace (production)', - }, - credentials: 'include', - }); - const json = await resp.json(); - const products = json?.data?.filteredProducts?.products || []; - return { - count: products.length, - totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount, - sampleStatus: products[0]?.Status, - statuses: [...new Set(products.map((p) => p.Status))], - }; - }, dispensaryId, GRAPHQL_HASH, testCase.status, testCase.includeStatus); - console.log(`Status ${testCase.label}: Products=${result.count}, Total=${result.totalCount}, Statuses=${JSON.stringify(result.statuses)}`); - } - await browser.close(); -} -main().catch(console.error); diff --git a/backend/dist/services/availability.js b/backend/dist/services/availability.js deleted file mode 100644 index 001c5917..00000000 --- a/backend/dist/services/availability.js +++ /dev/null @@ -1,201 +0,0 @@ -"use strict"; -/** - * Availability Service - * - * Normalizes product availability from various menu providers and tracks - * state transitions for inventory analytics. 
- */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.normalizeAvailability = normalizeAvailability; -exports.extractAvailabilityHints = extractAvailabilityHints; -exports.hintsToAvailability = hintsToAvailability; -exports.aggregateAvailability = aggregateAvailability; -// Threshold for considering stock as "limited" -const LIMITED_THRESHOLD = 5; -/** - * Normalize availability from a Dutchie product - * - * Dutchie products can have various availability indicators: - * - potencyAmount.quantity: explicit stock count - * - status: sometimes includes stock status - * - variants[].quantity: stock per variant - * - isInStock / inStock: boolean flags - */ -function normalizeAvailability(dutchieProduct) { - const raw = {}; - // Collect raw availability data for debugging - if (dutchieProduct.potencyAmount?.quantity !== undefined) { - raw.potencyQuantity = dutchieProduct.potencyAmount.quantity; - } - if (dutchieProduct.status !== undefined) { - raw.status = dutchieProduct.status; - } - if (dutchieProduct.isInStock !== undefined) { - raw.isInStock = dutchieProduct.isInStock; - } - if (dutchieProduct.inStock !== undefined) { - raw.inStock = dutchieProduct.inStock; - } - if (dutchieProduct.variants?.length) { - const variantQuantities = dutchieProduct.variants - .filter((v) => v.quantity !== undefined) - .map((v) => ({ option: v.option, quantity: v.quantity })); - if (variantQuantities.length) { - raw.variantQuantities = variantQuantities; - } - } - // Try to extract quantity - let quantity = null; - // Check potencyAmount.quantity first (most reliable for Dutchie) - if (typeof dutchieProduct.potencyAmount?.quantity === 'number') { - quantity = dutchieProduct.potencyAmount.quantity; - } - // Sum variant quantities if available - else if (dutchieProduct.variants?.length) { - const totalVariantQty = dutchieProduct.variants.reduce((sum, v) => { - return sum + (typeof v.quantity === 'number' ? 
v.quantity : 0); - }, 0); - if (totalVariantQty > 0) { - quantity = totalVariantQty; - } - } - // Determine status - let status = 'unknown'; - // Explicit boolean flags take precedence - if (dutchieProduct.isInStock === false || dutchieProduct.inStock === false) { - status = 'out_of_stock'; - } - else if (dutchieProduct.isInStock === true || dutchieProduct.inStock === true) { - status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock'; - } - // Check status string - else if (typeof dutchieProduct.status === 'string') { - const statusLower = dutchieProduct.status.toLowerCase(); - if (statusLower.includes('out') || statusLower.includes('unavailable')) { - status = 'out_of_stock'; - } - else if (statusLower.includes('limited') || statusLower.includes('low')) { - status = 'limited'; - } - else if (statusLower.includes('in') || statusLower.includes('available')) { - status = 'in_stock'; - } - } - // Infer from quantity - else if (quantity !== null) { - if (quantity === 0) { - status = 'out_of_stock'; - } - else if (quantity <= LIMITED_THRESHOLD) { - status = 'limited'; - } - else { - status = 'in_stock'; - } - } - return { status, quantity, raw }; -} -/** - * Extract availability hints from page content or product card HTML - * - * Used for sandbox provider scraping where we don't have structured data - */ -function extractAvailabilityHints(pageContent, productElement) { - const hints = {}; - const content = (productElement || pageContent).toLowerCase(); - // Check for out-of-stock indicators - const oosPatterns = [ - 'out of stock', - 'out-of-stock', - 'sold out', - 'soldout', - 'unavailable', - 'not available', - 'coming soon', - 'notify me' - ]; - hints.hasOutOfStockBadge = oosPatterns.some(p => content.includes(p)); - // Check for limited stock indicators - const limitedPatterns = [ - 'limited stock', - 'limited quantity', - 'low stock', - 'only \\d+ left', - 'few remaining', - 'almost gone', - 'selling fast' - ]; - hints.hasLimitedBadge = 
limitedPatterns.some(p => { - if (p.includes('\\d')) { - return new RegExp(p, 'i').test(content); - } - return content.includes(p); - }); - // Check for in-stock indicators - const inStockPatterns = [ - 'in stock', - 'in-stock', - 'add to cart', - 'add to bag', - 'buy now', - 'available' - ]; - hints.hasInStockBadge = inStockPatterns.some(p => content.includes(p)); - // Try to extract quantity text - const qtyMatch = content.match(/(\d+)\s*(left|remaining|in stock|available)/i); - if (qtyMatch) { - hints.quantityText = qtyMatch[0]; - } - // Look for explicit stock text - const stockTextMatch = content.match(/(out of stock|in stock|low stock|limited|sold out)[^<]*/i); - if (stockTextMatch) { - hints.stockText = stockTextMatch[0].trim(); - } - return hints; -} -/** - * Convert availability hints to normalized availability - */ -function hintsToAvailability(hints) { - let status = 'unknown'; - let quantity = null; - // Extract quantity if present - if (hints.quantityText) { - const match = hints.quantityText.match(/(\d+)/); - if (match) { - quantity = parseInt(match[1], 10); - } - } - // Determine status from hints - if (hints.hasOutOfStockBadge) { - status = 'out_of_stock'; - } - else if (hints.hasLimitedBadge) { - status = 'limited'; - } - else if (hints.hasInStockBadge) { - status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 
'limited' : 'in_stock'; - } - return { - status, - quantity, - raw: hints - }; -} -function aggregateAvailability(products) { - const counts = { - in_stock: 0, - out_of_stock: 0, - limited: 0, - unknown: 0, - changed: 0 - }; - for (const product of products) { - const status = product.availability_status || 'unknown'; - counts[status]++; - if (product.previous_status && product.previous_status !== status) { - counts.changed++; - } - } - return counts; -} diff --git a/backend/dist/services/category-crawler-jobs.js b/backend/dist/services/category-crawler-jobs.js deleted file mode 100644 index b6f0d5d9..00000000 --- a/backend/dist/services/category-crawler-jobs.js +++ /dev/null @@ -1,1107 +0,0 @@ -"use strict"; -/** - * Category-Specific Crawler Jobs - * - * Handles crawl jobs for each intelligence category independently: - * - CrawlProductsJob - Production product crawling (Dutchie only) - * - CrawlSpecialsJob - Production specials crawling - * - CrawlBrandIntelligenceJob - Production brand intelligence crawling - * - CrawlMetadataJob - Production metadata crawling - * - SandboxProductsJob - Sandbox product crawling (all providers) - * - SandboxSpecialsJob - Sandbox specials crawling - * - SandboxBrandJob - Sandbox brand crawling - * - SandboxMetadataJob - Sandbox metadata crawling - */ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.runCrawlProductsJob = runCrawlProductsJob; -exports.runCrawlSpecialsJob = runCrawlSpecialsJob; -exports.runCrawlBrandIntelligenceJob = runCrawlBrandIntelligenceJob; -exports.runCrawlMetadataJob = runCrawlMetadataJob; -exports.runSandboxProductsJob = runSandboxProductsJob; -exports.runSandboxSpecialsJob = runSandboxSpecialsJob; -exports.runSandboxBrandJob = runSandboxBrandJob; -exports.runSandboxMetadataJob = runSandboxMetadataJob; -exports.processCategorySandboxJobs = processCategorySandboxJobs; -exports.runAllCategoryProductionCrawls = runAllCategoryProductionCrawls; -exports.runAllCategorySandboxCrawls = runAllCategorySandboxCrawls; -const migrate_1 = require("../db/migrate"); -const crawler_logger_1 = require("./crawler-logger"); -// Note: scrapeStore from scraper-v2 is NOT used for Dutchie - we use GraphQL API directly -const product_crawler_1 = require("../dutchie-az/services/product-crawler"); -const puppeteer_1 = __importDefault(require("puppeteer")); -const WORKER_ID = `crawler-${process.pid}-${Date.now()}`; -// ======================================== -// Helper Functions -// ======================================== -async function getDispensaryWithCategories(dispensaryId) { - const result = await migrate_1.pool.query(`SELECT id, name, website, menu_url, menu_type, platform_dispensary_id, - product_provider, product_confidence, product_crawler_mode, last_product_scan_at, - specials_provider, specials_confidence, specials_crawler_mode, last_specials_scan_at, - brand_provider, brand_confidence, brand_crawler_mode, last_brand_scan_at, - metadata_provider, metadata_confidence, metadata_crawler_mode, last_metadata_scan_at, - crawler_status, scraper_template - FROM dispensaries WHERE id = $1`, [dispensaryId]); - return result.rows[0] || null; -} -async function updateCategoryScanTime(dispensaryId, category) { - const column = `last_${category}_scan_at`; - await 
migrate_1.pool.query(`UPDATE dispensaries SET ${column} = NOW(), updated_at = NOW() WHERE id = $1`, [dispensaryId]); -} -async function getStoreIdForDispensary(dispensaryId) { - // First check if dispensary has menu_url - if so, try to match with stores.dutchie_url - const result = await migrate_1.pool.query(`SELECT s.id FROM stores s - JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%' - WHERE d.id = $1 - LIMIT 1`, [dispensaryId]); - if (result.rows.length > 0) { - return result.rows[0].id; - } - // Try matching by slug - const result2 = await migrate_1.pool.query(`SELECT s.id FROM stores s - JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%' - WHERE d.id = $1 - LIMIT 1`, [dispensaryId]); - return result2.rows[0]?.id || null; -} -async function createCategorySandboxEntry(dispensaryId, category, suspectedProvider, templateName, detectionSignals) { - // Check for existing sandbox for this category - const existing = await migrate_1.pool.query(`SELECT id FROM crawler_sandboxes - WHERE dispensary_id = $1 AND category = $2 AND status NOT IN ('moved_to_production', 'failed')`, [dispensaryId, category]); - if (existing.rows.length > 0) { - await migrate_1.pool.query(`UPDATE crawler_sandboxes - SET suspected_menu_provider = $2, template_name = $3, detection_signals = COALESCE($4, detection_signals), updated_at = NOW() - WHERE id = $1`, [existing.rows[0].id, suspectedProvider, templateName, detectionSignals ? JSON.stringify(detectionSignals) : null]); - return existing.rows[0].id; - } - const result = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, category, suspected_menu_provider, template_name, mode, detection_signals, status) - VALUES ($1, $2, $3, $4, 'template_learning', $5, 'pending') - RETURNING id`, [dispensaryId, category, suspectedProvider, templateName, detectionSignals ? 
JSON.stringify(detectionSignals) : '{}']); - return result.rows[0].id; -} -async function createCategorySandboxJob(dispensaryId, sandboxId, category, templateName, jobType = 'crawl', priority = 0) { - const result = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, category, template_name, job_type, status, priority) - VALUES ($1, $2, $3, $4, $5, 'pending', $6) - RETURNING id`, [dispensaryId, sandboxId, category, templateName, jobType, priority]); - return result.rows[0].id; -} -async function updateSandboxQuality(sandboxId, metrics) { - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET - quality_score = $1, - products_extracted = $2, - fields_missing = $3, - error_count = $4, - analysis_json = COALESCE(analysis_json, '{}'::jsonb) || $5::jsonb, - analyzed_at = NOW(), - updated_at = NOW() - WHERE id = $6`, [ - metrics.quality_score, - metrics.items_extracted, - metrics.fields_missing, - metrics.error_count, - JSON.stringify({ sample_data: metrics.sample_data }), - sandboxId, - ]); -} -async function getCrawlerTemplate(provider, category, environment) { - const result = await migrate_1.pool.query(`SELECT id, name, selector_config, navigation_config - FROM crawler_templates - WHERE provider = $1 AND environment = $2 AND is_active = true - ORDER BY is_default_for_provider DESC, version DESC - LIMIT 1`, [provider, environment]); - return result.rows[0] || null; -} -// ======================================== -// Production Crawl Jobs -// ======================================== -/** - * CrawlProductsJob - Production product crawling - * Uses Dutchie GraphQL API directly (NOT browser-based scraping) - * - * IMPORTANT: This function calls crawlDispensaryProducts() from dutchie-az - * which uses the GraphQL API. The GraphQL response includes categories directly, - * so no browser-based category discovery is needed. 
- */ -async function runCrawlProductsJob(dispensaryId) { - const category = 'product'; - const startTime = Date.now(); - const dispensary = await getDispensaryWithCategories(dispensaryId); - if (!dispensary) { - return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; - } - // Verify production eligibility - accept either: - // 1. product_provider = 'dutchie' with product_crawler_mode = 'production', OR - // 2. menu_type = 'dutchie' with platform_dispensary_id (known Dutchie store) - const isDutchieProduction = (dispensary.product_provider === 'dutchie' && dispensary.product_crawler_mode === 'production') || - (dispensary.menu_type === 'dutchie' && dispensary.platform_dispensary_id); - if (!isDutchieProduction) { - return { success: false, category, message: 'Not a Dutchie dispensary for products' }; - } - if (!dispensary.platform_dispensary_id) { - return { success: false, category, message: 'Missing platform_dispensary_id for GraphQL crawl' }; - } - // Log job start - crawler_logger_1.crawlerLogger.jobStarted({ - job_id: 0, // Category jobs don't have traditional job IDs - store_id: dispensaryId, // Use dispensary ID since we're not using stores table - store_name: dispensary.name, - job_type: 'CrawlProductsJob', - trigger_type: 'category_crawl', - provider: 'dutchie', - }); - try { - // Build Dispensary object for GraphQL crawler - // The crawler uses platformDispensaryId to call the Dutchie GraphQL API directly - const dispensaryForCrawl = { - id: dispensary.id, - platform: 'dutchie', - name: dispensary.name, - slug: dispensary.name.toLowerCase().replace(/[^a-z0-9]+/g, '-'), - city: '', - state: 'AZ', - menuType: dispensary.menu_type || 'dutchie', - menuUrl: dispensary.menu_url || undefined, - platformDispensaryId: dispensary.platform_dispensary_id || undefined, - website: dispensary.website || undefined, - createdAt: new Date(), - updatedAt: new Date(), - }; - // Use GraphQL crawler directly - this calls the Dutchie API, not 
browser scraping - const crawlResult = await (0, product_crawler_1.crawlDispensaryProducts)(dispensaryForCrawl, 'rec', // Default to recreational pricing - { useBothModes: true, downloadImages: true }); - // Update scan time - await updateCategoryScanTime(dispensaryId, category); - const durationMs = Date.now() - startTime; - if (crawlResult.success) { - // Log job completion with summary - crawler_logger_1.crawlerLogger.jobCompleted({ - job_id: 0, - store_id: dispensaryId, - store_name: dispensary.name, - duration_ms: durationMs, - products_found: crawlResult.productsFound, - products_new: 0, // GraphQL crawler doesn't track new vs updated separately - products_updated: crawlResult.productsUpserted, - provider: 'dutchie', - }); - return { - success: true, - category, - message: `GraphQL crawl completed: ${crawlResult.productsUpserted} products, ${crawlResult.snapshotsCreated} snapshots`, - data: { - dispensaryId, - provider: 'dutchie', - durationMs, - productsFound: crawlResult.productsFound, - productsUpserted: crawlResult.productsUpserted, - snapshotsCreated: crawlResult.snapshotsCreated, - modeAProducts: crawlResult.modeAProducts, - modeBProducts: crawlResult.modeBProducts, - }, - }; - } - else { - // Log job failure - crawler_logger_1.crawlerLogger.jobFailed({ - job_id: 0, - store_id: dispensaryId, - store_name: dispensary.name, - duration_ms: durationMs, - error_message: crawlResult.errorMessage || 'Unknown error', - provider: 'dutchie', - }); - return { success: false, category, message: crawlResult.errorMessage || 'GraphQL crawl failed' }; - } - } - catch (error) { - const durationMs = Date.now() - startTime; - // Log job failure - crawler_logger_1.crawlerLogger.jobFailed({ - job_id: 0, - store_id: dispensaryId, - store_name: dispensary.name, - duration_ms: durationMs, - error_message: error.message, - provider: 'dutchie', - }); - return { success: false, category, message: error.message }; - } -} -/** - * CrawlSpecialsJob - Production specials crawling - * 
Currently no production-ready providers, so always returns false - */ -async function runCrawlSpecialsJob(dispensaryId) { - const category = 'specials'; - const dispensary = await getDispensaryWithCategories(dispensaryId); - if (!dispensary) { - return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; - } - // No production-ready providers for specials yet - if (dispensary.specials_crawler_mode !== 'production') { - return { success: false, category, message: 'Specials not in production mode' }; - } - // Would implement provider-specific specials crawling here - // For now, no providers are production-ready - return { - success: false, - category, - message: `No production crawler for specials provider: ${dispensary.specials_provider}`, - }; -} -/** - * CrawlBrandIntelligenceJob - Production brand intelligence crawling - * Currently no production-ready providers - */ -async function runCrawlBrandIntelligenceJob(dispensaryId) { - const category = 'brand'; - const dispensary = await getDispensaryWithCategories(dispensaryId); - if (!dispensary) { - return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; - } - if (dispensary.brand_crawler_mode !== 'production') { - return { success: false, category, message: 'Brand not in production mode' }; - } - return { - success: false, - category, - message: `No production crawler for brand provider: ${dispensary.brand_provider}`, - }; -} -/** - * CrawlMetadataJob - Production metadata crawling - * Currently no production-ready providers - */ -async function runCrawlMetadataJob(dispensaryId) { - const category = 'metadata'; - const dispensary = await getDispensaryWithCategories(dispensaryId); - if (!dispensary) { - return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; - } - if (dispensary.metadata_crawler_mode !== 'production') { - return { success: false, category, message: 'Metadata not in production mode' }; - } - return { - success: 
false, - category, - message: `No production crawler for metadata provider: ${dispensary.metadata_provider}`, - }; -} -// ======================================== -// Sandbox Crawl Jobs -// ======================================== -/** - * SandboxProductsJob - Sandbox product crawling - * Works with any provider including Treez - */ -async function runSandboxProductsJob(dispensaryId, sandboxId) { - const category = 'product'; - const startTime = Date.now(); - const dispensary = await getDispensaryWithCategories(dispensaryId); - if (!dispensary) { - return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; - } - // Get or create sandbox entry - let sandbox; - if (sandboxId) { - const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]); - sandbox = result.rows[0]; - } - else { - const result = await migrate_1.pool.query(`SELECT * FROM crawler_sandboxes - WHERE dispensary_id = $1 AND category = $2 AND status NOT IN ('moved_to_production', 'failed') - ORDER BY created_at DESC LIMIT 1`, [dispensaryId, category]); - sandbox = result.rows[0]; - if (!sandbox) { - const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.product_provider, null); - const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]); - sandbox = result.rows[0]; - } - } - const websiteUrl = dispensary.menu_url || dispensary.website; - if (!websiteUrl) { - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = 'No website URL' WHERE id = $1`, [sandbox.id]); - return { success: false, category, message: 'No website URL available' }; - } - let browser = null; - try { - // Update status - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]); - browser = await puppeteer_1.default.launch({ - headless: true, - args: ['--no-sandbox', 
'--disable-setuid-sandbox'], - }); - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); - // Get provider-specific template if available - const provider = dispensary.product_provider || 'unknown'; - const template = await getCrawlerTemplate(provider, category, 'sandbox'); - let products = []; - let metrics = { - quality_score: 0, - items_extracted: 0, - fields_missing: 0, - error_count: 0, - }; - // Provider-specific extraction logic - if (provider === 'treez' && template) { - // Use Treez-specific extraction - const treezResult = await extractTreezProducts(page, websiteUrl); - products = treezResult.products; - metrics = treezResult.metrics; - } - else { - // Generic product extraction - const genericResult = await extractGenericProducts(page, websiteUrl); - products = genericResult.products; - metrics = genericResult.metrics; - } - // Update sandbox with results - metrics.sample_data = products.slice(0, 5); - await updateSandboxQuality(sandbox.id, metrics); - // Determine final status based on quality - const status = metrics.quality_score >= 70 ? 'ready_for_review' : - metrics.quality_score >= 40 ? 'needs_human_review' : 'pending'; - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET - status = $1, - urls_tested = $2, - updated_at = NOW() - WHERE id = $3`, [status, JSON.stringify([websiteUrl]), sandbox.id]); - // Update scan time - await updateCategoryScanTime(dispensaryId, category); - // Log sandbox completion - crawler_logger_1.crawlerLogger.sandboxEvent({ - event: 'sandbox_completed', - dispensary_id: dispensaryId, - dispensary_name: dispensary.name, - template_name: provider, - category: 'product', - quality_score: metrics.quality_score, - products_extracted: products.length, - fields_missing: metrics.fields_missing, - provider: provider, - }); - return { - success: true, - category, - message: `Sandbox crawl completed. 
${products.length} products extracted, quality score ${metrics.quality_score}`, - data: { - sandboxId: sandbox.id, - productsExtracted: products.length, - qualityScore: metrics.quality_score, - status, - }, - }; - } - catch (error) { - // Log sandbox failure - crawler_logger_1.crawlerLogger.sandboxEvent({ - event: 'sandbox_failed', - dispensary_id: dispensaryId, - dispensary_name: dispensary.name, - template_name: dispensary.product_provider || 'unknown', - category: 'product', - error_message: error.message, - }); - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1, error_count = error_count + 1 WHERE id = $2`, [error.message, sandbox.id]); - return { success: false, category, message: error.message }; - } - finally { - if (browser) - await browser.close(); - } -} -/** - * SandboxSpecialsJob - Sandbox specials crawling - */ -async function runSandboxSpecialsJob(dispensaryId, sandboxId) { - const category = 'specials'; - const dispensary = await getDispensaryWithCategories(dispensaryId); - if (!dispensary) { - return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; - } - let sandbox; - if (sandboxId) { - const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]); - sandbox = result.rows[0]; - } - else { - const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.specials_provider, null); - const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]); - sandbox = result.rows[0]; - } - const websiteUrl = dispensary.website; - if (!websiteUrl) { - return { success: false, category, message: 'No website URL available' }; - } - let browser = null; - try { - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]); - browser = await puppeteer_1.default.launch({ - headless: true, - args: ['--no-sandbox', 
'--disable-setuid-sandbox'], - }); - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); - const result = await extractSpecials(page, websiteUrl); - await updateSandboxQuality(sandbox.id, { - ...result.metrics, - sample_data: result.specials.slice(0, 5), - }); - const status = result.metrics.quality_score >= 70 ? 'ready_for_review' : - result.metrics.quality_score >= 40 ? 'needs_human_review' : 'pending'; - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]); - await updateCategoryScanTime(dispensaryId, category); - return { - success: true, - category, - message: `Sandbox specials crawl completed. ${result.specials.length} specials found.`, - data: { sandboxId: sandbox.id, specialsCount: result.specials.length }, - }; - } - catch (error) { - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]); - return { success: false, category, message: error.message }; - } - finally { - if (browser) - await browser.close(); - } -} -/** - * SandboxBrandJob - Sandbox brand intelligence crawling - */ -async function runSandboxBrandJob(dispensaryId, sandboxId) { - const category = 'brand'; - const dispensary = await getDispensaryWithCategories(dispensaryId); - if (!dispensary) { - return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; - } - let sandbox; - if (sandboxId) { - const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]); - sandbox = result.rows[0]; - } - else { - const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.brand_provider, null); - const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]); - sandbox = result.rows[0]; - } - const websiteUrl = dispensary.website; - if 
(!websiteUrl) { - return { success: false, category, message: 'No website URL available' }; - } - let browser = null; - try { - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]); - browser = await puppeteer_1.default.launch({ - headless: true, - args: ['--no-sandbox', '--disable-setuid-sandbox'], - }); - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); - const result = await extractBrands(page, websiteUrl); - await updateSandboxQuality(sandbox.id, { - ...result.metrics, - sample_data: result.brands.slice(0, 10), - }); - const status = result.metrics.quality_score >= 70 ? 'ready_for_review' : 'pending'; - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]); - await updateCategoryScanTime(dispensaryId, category); - return { - success: true, - category, - message: `Sandbox brand crawl completed. 
${result.brands.length} brands found.`, - data: { sandboxId: sandbox.id, brandsCount: result.brands.length }, - }; - } - catch (error) { - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]); - return { success: false, category, message: error.message }; - } - finally { - if (browser) - await browser.close(); - } -} -/** - * SandboxMetadataJob - Sandbox metadata crawling - */ -async function runSandboxMetadataJob(dispensaryId, sandboxId) { - const category = 'metadata'; - const dispensary = await getDispensaryWithCategories(dispensaryId); - if (!dispensary) { - return { success: false, category, message: `Dispensary ${dispensaryId} not found` }; - } - let sandbox; - if (sandboxId) { - const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]); - sandbox = result.rows[0]; - } - else { - const newSandboxId = await createCategorySandboxEntry(dispensaryId, category, dispensary.metadata_provider, null); - const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]); - sandbox = result.rows[0]; - } - const websiteUrl = dispensary.website; - if (!websiteUrl) { - return { success: false, category, message: 'No website URL available' }; - } - let browser = null; - try { - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]); - browser = await puppeteer_1.default.launch({ - headless: true, - args: ['--no-sandbox', '--disable-setuid-sandbox'], - }); - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); - const result = await extractMetadata(page, websiteUrl); - await updateSandboxQuality(sandbox.id, { - ...result.metrics, - sample_data: result.categories.slice(0, 20), - }); - const status = result.metrics.quality_score >= 70 ? 
'ready_for_review' : 'pending'; - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = $1, updated_at = NOW() WHERE id = $2`, [status, sandbox.id]); - await updateCategoryScanTime(dispensaryId, category); - return { - success: true, - category, - message: `Sandbox metadata crawl completed. ${result.categories.length} categories found.`, - data: { sandboxId: sandbox.id, categoriesCount: result.categories.length }, - }; - } - catch (error) { - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]); - return { success: false, category, message: error.message }; - } - finally { - if (browser) - await browser.close(); - } -} -// ======================================== -// Extraction Functions -// ======================================== -/** - * Extract products from Treez-powered sites - */ -async function extractTreezProducts(page, baseUrl) { - const products = []; - let errorCount = 0; - let fieldsMissing = 0; - try { - // Navigate to menu - const menuUrls = ['/menu', '/shop', '/products', '/order']; - let menuUrl = baseUrl; - for (const path of menuUrls) { - try { - const testUrl = new URL(path, baseUrl).toString(); - await page.goto(testUrl, { waitUntil: 'networkidle2', timeout: 20000 }); - const hasProducts = await page.evaluate(() => { - const text = document.body.innerText.toLowerCase(); - return text.includes('add to cart') || text.includes('thc') || text.includes('indica'); - }); - if (hasProducts) { - menuUrl = testUrl; - break; - } - } - catch { - // Try next URL - } - } - await page.goto(menuUrl, { waitUntil: 'networkidle2', timeout: 30000 }); - await new Promise(r => setTimeout(r, 3000)); // Wait for dynamic content - // Look for Treez API data in network requests or page content - const pageProducts = await page.evaluate(() => { - const extractedProducts = []; - // Try common Treez selectors - const selectors = [ - '.product-card', - '.menu-item', - 
'[data-product]', - '.product-tile', - '.menu-product', - ]; - for (const selector of selectors) { - const elements = document.querySelectorAll(selector); - if (elements.length > 3) { - elements.forEach((el) => { - const nameEl = el.querySelector('h2, h3, .product-name, .name, [class*="name"]'); - const priceEl = el.querySelector('.price, [class*="price"]'); - const thcEl = el.querySelector('[class*="thc"], [class*="potency"]'); - if (nameEl) { - extractedProducts.push({ - name: nameEl.textContent?.trim(), - price: priceEl?.textContent?.trim(), - thc: thcEl?.textContent?.trim(), - html: el.outerHTML.slice(0, 500), - }); - } - }); - break; - } - } - return extractedProducts; - }); - products.push(...pageProducts); - // Calculate quality metrics - for (const product of products) { - if (!product.name) - fieldsMissing++; - if (!product.price) - fieldsMissing++; - } - } - catch (error) { - // Error tracked via errorCount - logged at job level - errorCount++; - } - const qualityScore = products.length > 0 - ? 
Math.min(100, Math.max(0, 100 - (fieldsMissing * 5) - (errorCount * 10))) - : 0; - return { - products, - metrics: { - quality_score: qualityScore, - items_extracted: products.length, - fields_missing: fieldsMissing, - error_count: errorCount, - }, - }; -} -/** - * Extract products using generic selectors - */ -async function extractGenericProducts(page, baseUrl) { - const products = []; - let errorCount = 0; - let fieldsMissing = 0; - try { - // Try common menu paths - const menuPaths = ['/menu', '/shop', '/products', '/order']; - let foundMenu = false; - for (const path of menuPaths) { - try { - const fullUrl = new URL(path, baseUrl).toString(); - await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 20000 }); - const hasProducts = await page.evaluate(() => { - const text = document.body.innerText.toLowerCase(); - return text.includes('add to cart') || text.includes('thc') || text.includes('gram'); - }); - if (hasProducts) { - foundMenu = true; - break; - } - } - catch { - continue; - } - } - if (!foundMenu) { - await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 30000 }); - } - await new Promise(r => setTimeout(r, 2000)); - // Generic product extraction - const pageProducts = await page.evaluate(() => { - const extractedProducts = []; - const selectors = [ - '.product', - '.product-card', - '.menu-item', - '.item-card', - '[data-product]', - '.strain', - '.listing', - ]; - for (const selector of selectors) { - const elements = document.querySelectorAll(selector); - if (elements.length > 3) { - elements.forEach((el) => { - const nameEl = el.querySelector('h2, h3, h4, .name, .title, [class*="name"]'); - const priceEl = el.querySelector('.price, [class*="price"]'); - const brandEl = el.querySelector('.brand, [class*="brand"]'); - const categoryEl = el.querySelector('.category, [class*="category"], [class*="type"]'); - if (nameEl?.textContent?.trim()) { - extractedProducts.push({ - name: nameEl.textContent.trim(), - price: 
priceEl?.textContent?.trim(), - brand: brandEl?.textContent?.trim(), - category: categoryEl?.textContent?.trim(), - }); - } - }); - break; - } - } - return extractedProducts; - }); - products.push(...pageProducts); - // Calculate missing fields - for (const product of products) { - if (!product.name) - fieldsMissing++; - if (!product.price) - fieldsMissing++; - } - } - catch (error) { - // Error tracked via errorCount - logged at job level - errorCount++; - } - const qualityScore = products.length > 0 - ? Math.min(100, Math.max(0, 80 - (fieldsMissing * 3) - (errorCount * 10))) - : 0; - return { - products, - metrics: { - quality_score: qualityScore, - items_extracted: products.length, - fields_missing: fieldsMissing, - error_count: errorCount, - }, - }; -} -/** - * Extract specials/deals - */ -async function extractSpecials(page, baseUrl) { - const specials = []; - let errorCount = 0; - let fieldsMissing = 0; - try { - const specialsPaths = ['/specials', '/deals', '/promotions', '/offers', '/sale']; - for (const path of specialsPaths) { - try { - const fullUrl = new URL(path, baseUrl).toString(); - await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 20000 }); - const pageSpecials = await page.evaluate(() => { - const extracted = []; - const selectors = [ - '.special', - '.deal', - '.promotion', - '.offer', - '[class*="special"]', - '[class*="deal"]', - ]; - for (const selector of selectors) { - const elements = document.querySelectorAll(selector); - elements.forEach((el) => { - const titleEl = el.querySelector('h2, h3, h4, .title, .name'); - const descEl = el.querySelector('p, .description, .details'); - const discountEl = el.querySelector('.discount, .savings, [class*="percent"]'); - if (titleEl?.textContent?.trim()) { - extracted.push({ - title: titleEl.textContent.trim(), - description: descEl?.textContent?.trim(), - discount: discountEl?.textContent?.trim(), - }); - } - }); - } - return extracted; - }); - specials.push(...pageSpecials); - if 
(specials.length > 0) - break; - } - catch { - continue; - } - } - for (const special of specials) { - if (!special.title) - fieldsMissing++; - if (!special.description && !special.discount) - fieldsMissing++; - } - } - catch (error) { - // Error tracked via errorCount - logged at job level - errorCount++; - } - const qualityScore = specials.length > 0 - ? Math.min(100, Math.max(0, 70 - (fieldsMissing * 5) - (errorCount * 10))) - : 0; - return { - specials, - metrics: { - quality_score: qualityScore, - items_extracted: specials.length, - fields_missing: fieldsMissing, - error_count: errorCount, - }, - }; -} -/** - * Extract brand information - */ -async function extractBrands(page, baseUrl) { - const brands = []; - let errorCount = 0; - let fieldsMissing = 0; - try { - const brandPaths = ['/brands', '/vendors', '/producers', '/menu']; - for (const path of brandPaths) { - try { - const fullUrl = new URL(path, baseUrl).toString(); - await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 20000 }); - const pageBrands = await page.evaluate(() => { - const extracted = []; - const brandNames = new Set(); - // Look for brand elements - const selectors = [ - '.brand', - '[class*="brand"]', - '.vendor', - '.producer', - ]; - for (const selector of selectors) { - document.querySelectorAll(selector).forEach((el) => { - const name = el.textContent?.trim(); - if (name && name.length > 1 && name.length < 100 && !brandNames.has(name)) { - brandNames.add(name); - extracted.push({ name }); - } - }); - } - // Also extract from filter dropdowns - document.querySelectorAll('select option, [role="option"]').forEach((el) => { - const name = el.textContent?.trim(); - if (name && name.length > 1 && name.length < 100 && !brandNames.has(name)) { - const lowerName = name.toLowerCase(); - if (!['all', 'any', 'select', 'choose', '--'].some(skip => lowerName.includes(skip))) { - brandNames.add(name); - extracted.push({ name, source: 'filter' }); - } - } - }); - return extracted; - }); - 
brands.push(...pageBrands); - if (brands.length > 5) - break; - } - catch { - continue; - } - } - } - catch (error) { - // Error tracked via errorCount - logged at job level - errorCount++; - } - const qualityScore = brands.length > 0 - ? Math.min(100, Math.max(0, 60 + Math.min(30, brands.length * 2) - (errorCount * 10))) - : 0; - return { - brands, - metrics: { - quality_score: qualityScore, - items_extracted: brands.length, - fields_missing: fieldsMissing, - error_count: errorCount, - }, - }; -} -/** - * Extract metadata (categories, taxonomy) - */ -async function extractMetadata(page, baseUrl) { - const categories = []; - let errorCount = 0; - let fieldsMissing = 0; - try { - await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 30000 }); - const menuPaths = ['/menu', '/shop', '/products']; - for (const path of menuPaths) { - try { - await page.goto(new URL(path, baseUrl).toString(), { waitUntil: 'networkidle2', timeout: 15000 }); - break; - } - catch { - continue; - } - } - const pageCategories = await page.evaluate(() => { - const extracted = []; - const categoryNames = new Set(); - // Navigation/tab categories - const navSelectors = [ - 'nav a', - '.category-nav a', - '.menu-categories a', - '[class*="category"] a', - '.tabs button', - '.tab-list button', - ]; - for (const selector of navSelectors) { - document.querySelectorAll(selector).forEach((el) => { - const name = el.textContent?.trim(); - if (name && name.length > 1 && name.length < 50 && !categoryNames.has(name)) { - const lowerName = name.toLowerCase(); - const categoryKeywords = ['flower', 'edible', 'concentrate', 'vape', 'preroll', 'tincture', 'topical', 'accessory', 'indica', 'sativa', 'hybrid']; - if (categoryKeywords.some(kw => lowerName.includes(kw)) || el.closest('[class*="category"], [class*="menu"]')) { - categoryNames.add(name); - extracted.push({ name, type: 'navigation' }); - } - } - }); - } - // Filter categories - document.querySelectorAll('select, 
[role="listbox"]').forEach((select) => { - const label = select.getAttribute('aria-label') || select.previousElementSibling?.textContent?.trim(); - if (label?.toLowerCase().includes('category') || label?.toLowerCase().includes('type')) { - select.querySelectorAll('option, [role="option"]').forEach((opt) => { - const name = opt.textContent?.trim(); - if (name && name.length > 1 && !categoryNames.has(name)) { - const lowerName = name.toLowerCase(); - if (!['all', 'any', 'select', 'choose'].some(skip => lowerName.includes(skip))) { - categoryNames.add(name); - extracted.push({ name, type: 'filter' }); - } - } - }); - } - }); - return extracted; - }); - categories.push(...pageCategories); - } - catch (error) { - // Error tracked via errorCount - logged at job level - errorCount++; - } - const qualityScore = categories.length > 0 - ? Math.min(100, Math.max(0, 50 + Math.min(40, categories.length * 3) - (errorCount * 10))) - : 0; - return { - categories, - metrics: { - quality_score: qualityScore, - items_extracted: categories.length, - fields_missing: fieldsMissing, - error_count: errorCount, - }, - }; -} -// ======================================== -// Queue Processing Functions -// ======================================== -/** - * Process pending category-specific sandbox jobs - */ -async function processCategorySandboxJobs(category, limit = 5) { - const jobs = await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs - SET status = 'running', worker_id = $1, started_at = NOW() - WHERE id IN ( - SELECT id FROM sandbox_crawl_jobs - WHERE status = 'pending' AND category = $2 AND scheduled_at <= NOW() - ORDER BY priority DESC, scheduled_at ASC - LIMIT $3 - FOR UPDATE SKIP LOCKED - ) - RETURNING *`, [WORKER_ID, category, limit]); - for (const job of jobs.rows) { - try { - let result; - switch (category) { - case 'product': - result = await runSandboxProductsJob(job.dispensary_id, job.sandbox_id); - break; - case 'specials': - result = await 
runSandboxSpecialsJob(job.dispensary_id, job.sandbox_id); - break; - case 'brand': - result = await runSandboxBrandJob(job.dispensary_id, job.sandbox_id); - break; - case 'metadata': - result = await runSandboxMetadataJob(job.dispensary_id, job.sandbox_id); - break; - default: - result = { success: false, category, message: `Unknown category: ${category}` }; - } - await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs - SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3 - WHERE id = $4`, [ - result.success ? 'completed' : 'failed', - JSON.stringify(result.data || {}), - result.success ? null : result.message, - job.id, - ]); - } - catch (error) { - await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]); - } - } -} -/** - * Run all category production crawls for a dispensary - * Each category runs independently - failures don't affect others - */ -async function runAllCategoryProductionCrawls(dispensaryId) { - const results = []; - // Run all categories in parallel - independent failures - const [productResult, specialsResult, brandResult, metadataResult] = await Promise.allSettled([ - runCrawlProductsJob(dispensaryId), - runCrawlSpecialsJob(dispensaryId), - runCrawlBrandIntelligenceJob(dispensaryId), - runCrawlMetadataJob(dispensaryId), - ]); - if (productResult.status === 'fulfilled') - results.push(productResult.value); - else - results.push({ success: false, category: 'product', message: productResult.reason?.message || 'Unknown error' }); - if (specialsResult.status === 'fulfilled') - results.push(specialsResult.value); - else - results.push({ success: false, category: 'specials', message: specialsResult.reason?.message || 'Unknown error' }); - if (brandResult.status === 'fulfilled') - results.push(brandResult.value); - else - results.push({ success: false, category: 'brand', message: brandResult.reason?.message || 'Unknown error' }); - if 
(metadataResult.status === 'fulfilled') - results.push(metadataResult.value); - else - results.push({ success: false, category: 'metadata', message: metadataResult.reason?.message || 'Unknown error' }); - const successCount = results.filter(r => r.success).length; - const summary = `${successCount}/4 categories succeeded: ${results.map(r => `${r.category}:${r.success ? 'ok' : 'fail'}`).join(', ')}`; - // Individual category jobs log their own completion via crawlerLogger - return { results, summary }; -} -/** - * Run all category sandbox crawls for a dispensary - */ -async function runAllCategorySandboxCrawls(dispensaryId) { - const results = []; - const [productResult, specialsResult, brandResult, metadataResult] = await Promise.allSettled([ - runSandboxProductsJob(dispensaryId), - runSandboxSpecialsJob(dispensaryId), - runSandboxBrandJob(dispensaryId), - runSandboxMetadataJob(dispensaryId), - ]); - if (productResult.status === 'fulfilled') - results.push(productResult.value); - else - results.push({ success: false, category: 'product', message: productResult.reason?.message || 'Unknown error' }); - if (specialsResult.status === 'fulfilled') - results.push(specialsResult.value); - else - results.push({ success: false, category: 'specials', message: specialsResult.reason?.message || 'Unknown error' }); - if (brandResult.status === 'fulfilled') - results.push(brandResult.value); - else - results.push({ success: false, category: 'brand', message: brandResult.reason?.message || 'Unknown error' }); - if (metadataResult.status === 'fulfilled') - results.push(metadataResult.value); - else - results.push({ success: false, category: 'metadata', message: metadataResult.reason?.message || 'Unknown error' }); - const successCount = results.filter(r => r.success).length; - const summary = `${successCount}/4 sandbox crawls: ${results.map(r => `${r.category}:${r.success ? 
'ok' : 'fail'}`).join(', ')}`; - // Individual sandbox jobs log their own completion via crawlerLogger - return { results, summary }; -} diff --git a/backend/dist/services/category-discovery.js b/backend/dist/services/category-discovery.js deleted file mode 100644 index ce53f818..00000000 --- a/backend/dist/services/category-discovery.js +++ /dev/null @@ -1,246 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.discoverCategories = discoverCategories; -const puppeteer_extra_1 = __importDefault(require("puppeteer-extra")); -const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); -const migrate_1 = require("../db/migrate"); -const logger_1 = require("./logger"); -const age_gate_1 = require("../utils/age-gate"); -const dutchie_1 = require("../scrapers/templates/dutchie"); -// Apply stealth plugin -puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)()); -const DUTCHIE_CATEGORIES = [ - { name: 'Shop', slug: 'shop' }, - { name: 'Flower', slug: 'flower', parentSlug: 'shop' }, - { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' }, - { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' }, - { name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' }, - { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' }, - { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' }, - { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' }, - { name: 'Brands', slug: 'brands' }, - { name: 'Specials', slug: 'specials' } -]; -const CURALEAF_CATEGORIES = [ - { name: 'Shop', slug: 'shop' }, - { name: 'Flower', slug: 'flower', parentSlug: 'shop' }, - { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' }, - { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' }, - { name: 'Concentrates', slug: 
'concentrates', parentSlug: 'shop' }, - { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' }, - { name: 'Tinctures', slug: 'tinctures', parentSlug: 'shop' }, - { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' }, - { name: 'Capsules', slug: 'capsules', parentSlug: 'shop' }, - { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' } -]; -async function makePageStealthy(page) { - await page.evaluateOnNewDocument(() => { - Object.defineProperty(navigator, 'webdriver', { get: () => false }); - Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); - Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); - window.chrome = { runtime: {} }; - }); -} -async function isDutchieMenu(page) { - try { - // Check page source for Dutchie markers - const isDutchie = await page.evaluate(() => { - // Check for window.reactEnv with dutchie URLs - if (window.reactEnv) { - const env = window.reactEnv; - if (env.adminUrl?.includes('dutchie.com') || - env.apiUrl?.includes('dutchie.com') || - env.consumerUrl?.includes('dutchie.com')) { - return true; - } - } - // Check HTML source for dutchie references - const htmlContent = document.documentElement.innerHTML; - if (htmlContent.includes('admin.dutchie.com') || - htmlContent.includes('api.dutchie.com') || - htmlContent.includes('embedded-menu') || - htmlContent.includes('window.reactEnv')) { - return true; - } - return false; - }); - return isDutchie; - } - catch (error) { - logger_1.logger.warn('categories', `Error detecting Dutchie menu: ${error}`); - return false; - } -} -async function discoverCategories(storeId) { - let browser = null; - try { - logger_1.logger.info('categories', `Discovering categories for store ID: ${storeId}`); - const storeResult = await migrate_1.pool.query(` - SELECT id, name, slug, dutchie_url - FROM stores - WHERE id = $1 - `, [storeId]); - if (storeResult.rows.length === 0) { - throw new Error('Store not found'); - } - const store = 
storeResult.rows[0]; - const baseUrl = store.dutchie_url; - // Launch browser to check page source - browser = await puppeteer_extra_1.default.launch({ - headless: 'new', - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-blink-features=AutomationControlled' - ] - }); - const page = await browser.newPage(); - await makePageStealthy(page); - await page.setViewport({ width: 1920, height: 1080 }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); - // Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites) - const state = (0, age_gate_1.detectStateFromUrl)(baseUrl); - await (0, age_gate_1.setAgeGateCookies)(page, baseUrl, state); - logger_1.logger.info('categories', `Loading page to detect menu type: ${baseUrl}`); - await page.goto(baseUrl, { waitUntil: 'domcontentloaded', timeout: 60000 }); - await page.waitForTimeout(3000); - // If age gate still appears, try to bypass it - await (0, age_gate_1.bypassAgeGate)(page, state); - // Detect if it's a Dutchie menu by inspecting page source - const isDutchie = await isDutchieMenu(page); - await browser.close(); - browser = null; - if (isDutchie) { - logger_1.logger.info('categories', `✅ Detected Dutchie menu for ${store.name}`); - await createDutchieCategories(storeId, store); - } - else { - // Fallback: Use standard cannabis categories for non-Dutchie sites - logger_1.logger.info('categories', `Non-Dutchie menu detected, using standard cannabis categories for ${store.name}`); - await createCuraleafCategories(storeId, store); - } - } - catch (error) { - logger_1.logger.error('categories', `Category discovery error: ${error}`); - if (browser) - await browser.close(); - throw error; - } -} -async function createDutchieCategories(storeId, store) { - const client = await migrate_1.pool.connect(); - try { - await client.query('BEGIN'); - 
logger_1.logger.info('categories', `Creating predefined Dutchie category structure`); - const baseUrl = store.dutchie_url; - for (const category of DUTCHIE_CATEGORIES) { - let categoryUrl; - // Use Dutchie template to build correct category URLs - if (category.parentSlug) { - // Subcategory: Use template's buildCategoryUrl (e.g., /products/flower) - categoryUrl = dutchie_1.dutchieTemplate.buildCategoryUrl(baseUrl, category.name); - } - else { - // Top-level: Use base URL with slug - categoryUrl = `${baseUrl}/${category.slug}`; - } - if (!category.parentSlug) { - // Create parent category - await client.query(` - INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled) - VALUES ($1, $2, $3, $4, true) - ON CONFLICT (store_id, slug) - DO UPDATE SET name = $2, dutchie_url = $4 - RETURNING id - `, [storeId, category.name, category.slug, categoryUrl]); - logger_1.logger.info('categories', `📁 ${category.name}`); - } - else { - // Create subcategory - const parentResult = await client.query(` - SELECT id FROM categories - WHERE store_id = $1 AND slug = $2 - `, [storeId, category.parentSlug]); - if (parentResult.rows.length > 0) { - await client.query(` - INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled) - VALUES ($1, $2, $3, $4, true) - ON CONFLICT (store_id, slug) - DO UPDATE SET name = $2, dutchie_url = $4 - `, [storeId, category.name, category.slug, categoryUrl]); - logger_1.logger.info('categories', ` └── ${category.name}`); - } - } - } - await client.query('COMMIT'); - logger_1.logger.info('categories', `✅ Created ${DUTCHIE_CATEGORIES.length} Dutchie categories successfully`); - } - catch (error) { - await client.query('ROLLBACK'); - logger_1.logger.error('categories', `Failed to create Dutchie categories: ${error}`); - throw error; - } - finally { - client.release(); - } -} -async function createCuraleafCategories(storeId, store) { - const client = await migrate_1.pool.connect(); - try { - await client.query('BEGIN'); - 
logger_1.logger.info('categories', `Creating predefined Curaleaf category structure`); - const baseUrl = store.dutchie_url; - for (const category of CURALEAF_CATEGORIES) { - let categoryUrl; - if (category.parentSlug) { - // Subcategory URL - Curaleaf uses pattern like: /stores/{store-slug}/{category} - categoryUrl = `${baseUrl}?category=${category.slug}`; - } - else { - // Top-level category - categoryUrl = baseUrl; - } - if (!category.parentSlug) { - // Create parent category - await client.query(` - INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled) - VALUES ($1, $2, $3, $4, true) - ON CONFLICT (store_id, slug) - DO UPDATE SET name = $2, dutchie_url = $4 - RETURNING id - `, [storeId, category.name, category.slug, categoryUrl]); - logger_1.logger.info('categories', `📁 ${category.name}`); - } - else { - // Create subcategory - const parentResult = await client.query(` - SELECT id FROM categories - WHERE store_id = $1 AND slug = $2 - `, [storeId, category.parentSlug]); - if (parentResult.rows.length > 0) { - await client.query(` - INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled) - VALUES ($1, $2, $3, $4, true) - ON CONFLICT (store_id, slug) - DO UPDATE SET name = $2, dutchie_url = $4 - `, [storeId, category.name, category.slug, categoryUrl]); - logger_1.logger.info('categories', ` └── ${category.name}`); - } - } - } - await client.query('COMMIT'); - logger_1.logger.info('categories', `✅ Created ${CURALEAF_CATEGORIES.length} Curaleaf categories successfully`); - } - catch (error) { - await client.query('ROLLBACK'); - logger_1.logger.error('categories', `Failed to create Curaleaf categories: ${error}`); - throw error; - } - finally { - client.release(); - } -} diff --git a/backend/dist/services/crawl-scheduler.js b/backend/dist/services/crawl-scheduler.js deleted file mode 100644 index 271609bc..00000000 --- a/backend/dist/services/crawl-scheduler.js +++ /dev/null @@ -1,536 +0,0 @@ -"use strict"; -/** - * Crawl 
Scheduler Service - * - * This service manages crawl scheduling using a job queue approach. - * It does NOT modify the crawler - it only TRIGGERS the existing crawler. - * - * Features: - * - Global schedule: crawl all stores every N hours - * - Daily special run: 12:01 AM local store time - * - Per-store schedule overrides - * - Job queue for tracking pending/running crawls - */ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.getGlobalSchedule = getGlobalSchedule; -exports.updateGlobalSchedule = updateGlobalSchedule; -exports.getStoreScheduleStatuses = getStoreScheduleStatuses; -exports.getStoreSchedule = getStoreSchedule; -exports.updateStoreSchedule = updateStoreSchedule; -exports.createCrawlJob = createCrawlJob; -exports.getPendingJobs = getPendingJobs; -exports.claimJob = claimJob; -exports.completeJob = completeJob; -exports.getRecentJobs = getRecentJobs; -exports.getAllRecentJobs = getAllRecentJobs; -exports.checkAndCreateScheduledJobs = checkAndCreateScheduledJobs; -exports.checkAndCreateDailySpecialJobs = checkAndCreateDailySpecialJobs; -exports.processJobs = processJobs; -exports.processOrchestrator = processOrchestrator; -exports.setSchedulerMode = setSchedulerMode; -exports.getSchedulerMode = getSchedulerMode; -exports.startCrawlScheduler = startCrawlScheduler; -exports.stopCrawlScheduler = stopCrawlScheduler; -exports.restartCrawlScheduler = restartCrawlScheduler; -exports.triggerManualCrawl = triggerManualCrawl; -exports.triggerAllStoresCrawl = triggerAllStoresCrawl; -exports.cancelJob = cancelJob; -const node_cron_1 = __importDefault(require("node-cron")); -const migrate_1 = require("../db/migrate"); -const scraper_v2_1 = require("../scraper-v2"); -const store_crawl_orchestrator_1 = require("./store-crawl-orchestrator"); -// Worker identification -const WORKER_ID = 
`worker-${process.pid}-${Date.now()}`; -let schedulerCronJob = null; -let jobProcessorRunning = false; -let orchestratorProcessorRunning = false; -// Scheduler mode: 'legacy' uses job queue, 'orchestrator' uses intelligent orchestration -let schedulerMode = 'orchestrator'; -// ============================================ -// Schedule Management -// ============================================ -/** - * Get global schedule settings - */ -async function getGlobalSchedule() { - const result = await migrate_1.pool.query(` - SELECT * FROM crawler_schedule ORDER BY id - `); - return result.rows; -} -/** - * Update global schedule setting - */ -async function updateGlobalSchedule(scheduleType, updates) { - const setClauses = []; - const values = []; - let paramIndex = 1; - if (updates.enabled !== undefined) { - setClauses.push(`enabled = $${paramIndex++}`); - values.push(updates.enabled); - } - if (updates.interval_hours !== undefined) { - setClauses.push(`interval_hours = $${paramIndex++}`); - values.push(updates.interval_hours); - } - if (updates.run_time !== undefined) { - setClauses.push(`run_time = $${paramIndex++}`); - values.push(updates.run_time); - } - values.push(scheduleType); - const result = await migrate_1.pool.query(` - UPDATE crawler_schedule - SET ${setClauses.join(', ')} - WHERE schedule_type = $${paramIndex} - RETURNING * - `, values); - return result.rows[0]; -} -/** - * Get all store schedule statuses - */ -async function getStoreScheduleStatuses() { - const result = await migrate_1.pool.query(`SELECT * FROM crawl_schedule_status ORDER BY priority DESC, store_name`); - return result.rows; -} -/** - * Get or create per-store schedule override - */ -async function getStoreSchedule(storeId) { - const result = await migrate_1.pool.query(` - SELECT * FROM store_crawl_schedule WHERE store_id = $1 - `, [storeId]); - if (result.rows.length > 0) { - return result.rows[0]; - } - // Return default (use global) - return { - store_id: storeId, - enabled: true, - 
interval_hours: null, - daily_special_enabled: true, - daily_special_time: null, - priority: 0 - }; -} -/** - * Update per-store schedule override - */ -async function updateStoreSchedule(storeId, updates) { - const result = await migrate_1.pool.query(` - INSERT INTO store_crawl_schedule (store_id, enabled, interval_hours, daily_special_enabled, daily_special_time, priority) - VALUES ($1, $2, $3, $4, $5, $6) - ON CONFLICT (store_id) DO UPDATE SET - enabled = COALESCE(EXCLUDED.enabled, store_crawl_schedule.enabled), - interval_hours = EXCLUDED.interval_hours, - daily_special_enabled = COALESCE(EXCLUDED.daily_special_enabled, store_crawl_schedule.daily_special_enabled), - daily_special_time = EXCLUDED.daily_special_time, - priority = COALESCE(EXCLUDED.priority, store_crawl_schedule.priority), - updated_at = NOW() - RETURNING * - `, [ - storeId, - updates.enabled ?? true, - updates.interval_hours ?? null, - updates.daily_special_enabled ?? true, - updates.daily_special_time ?? null, - updates.priority ?? 
0 - ]); - return result.rows[0]; -} -// ============================================ -// Job Queue Management -// ============================================ -/** - * Create a new crawl job - */ -async function createCrawlJob(storeId, jobType = 'full_crawl', triggerType = 'scheduled', scheduledAt = new Date(), priority = 0) { - // Check if there's already a pending or running job for this store - const existing = await migrate_1.pool.query(` - SELECT id FROM crawl_jobs - WHERE store_id = $1 AND status IN ('pending', 'running') - LIMIT 1 - `, [storeId]); - if (existing.rows.length > 0) { - console.log(`Skipping job creation for store ${storeId} - already has pending/running job`); - return existing.rows[0]; - } - const result = await migrate_1.pool.query(` - INSERT INTO crawl_jobs (store_id, job_type, trigger_type, scheduled_at, priority, status) - VALUES ($1, $2, $3, $4, $5, 'pending') - RETURNING * - `, [storeId, jobType, triggerType, scheduledAt, priority]); - console.log(`Created crawl job ${result.rows[0].id} for store ${storeId} (${triggerType})`); - return result.rows[0]; -} -/** - * Get pending jobs ready to run - */ -async function getPendingJobs(limit = 5) { - const result = await migrate_1.pool.query(` - SELECT cj.*, s.name as store_name - FROM crawl_jobs cj - JOIN stores s ON s.id = cj.store_id - WHERE cj.status = 'pending' - AND cj.scheduled_at <= NOW() - ORDER BY cj.priority DESC, cj.scheduled_at ASC - LIMIT $1 - `, [limit]); - return result.rows; -} -/** - * Claim a job for processing - */ -async function claimJob(jobId) { - const result = await migrate_1.pool.query(` - UPDATE crawl_jobs - SET status = 'running', started_at = NOW(), worker_id = $2 - WHERE id = $1 AND status = 'pending' - RETURNING id - `, [jobId, WORKER_ID]); - return result.rows.length > 0; -} -/** - * Complete a job - */ -async function completeJob(jobId, success, results) { - await migrate_1.pool.query(` - UPDATE crawl_jobs - SET - status = $2, - completed_at = NOW(), - 
products_found = $3, - error_message = $4 - WHERE id = $1 - `, [ - jobId, - success ? 'completed' : 'failed', - results?.products_found ?? null, - results?.error_message ?? null - ]); -} -/** - * Get recent jobs for a store - */ -async function getRecentJobs(storeId, limit = 10) { - const result = await migrate_1.pool.query(` - SELECT * FROM crawl_jobs - WHERE store_id = $1 - ORDER BY created_at DESC - LIMIT $2 - `, [storeId, limit]); - return result.rows; -} -/** - * Get all recent jobs - */ -async function getAllRecentJobs(limit = 50) { - const result = await migrate_1.pool.query(` - SELECT cj.*, s.name as store_name, s.slug as store_slug - FROM crawl_jobs cj - JOIN stores s ON s.id = cj.store_id - ORDER BY cj.created_at DESC - LIMIT $1 - `, [limit]); - return result.rows; -} -// ============================================ -// Scheduler Logic -// ============================================ -/** - * Check which stores are due for a crawl and create jobs - */ -async function checkAndCreateScheduledJobs() { - console.log('Checking for stores due for crawl...'); - // Get global schedule settings - const globalSchedule = await migrate_1.pool.query(` - SELECT * FROM crawler_schedule WHERE schedule_type = 'global_interval' - `); - if (globalSchedule.rows.length === 0 || !globalSchedule.rows[0].enabled) { - console.log('Global scheduler is disabled'); - return 0; - } - const intervalHours = globalSchedule.rows[0].interval_hours || 4; - // Find stores due for crawl - const result = await migrate_1.pool.query(` - SELECT - s.id, - s.name, - s.timezone, - s.last_scraped_at, - COALESCE(scs.enabled, TRUE) as schedule_enabled, - COALESCE(scs.interval_hours, $1) as interval_hours, - COALESCE(scs.priority, 0) as priority - FROM stores s - LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id - WHERE s.active = TRUE - AND s.scrape_enabled = TRUE - AND COALESCE(scs.enabled, TRUE) = TRUE - AND ( - s.last_scraped_at IS NULL - OR s.last_scraped_at < NOW() - 
(COALESCE(scs.interval_hours, $1) || ' hours')::INTERVAL - ) - AND NOT EXISTS ( - SELECT 1 FROM crawl_jobs cj - WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running') - ) - ORDER BY COALESCE(scs.priority, 0) DESC, s.last_scraped_at ASC NULLS FIRST - `, [intervalHours]); - let jobsCreated = 0; - for (const store of result.rows) { - try { - await createCrawlJob(store.id, 'full_crawl', 'scheduled', new Date(), store.priority); - jobsCreated++; - console.log(`Scheduled crawl job for: ${store.name}`); - } - catch (error) { - console.error(`Failed to create job for store ${store.name}:`, error); - } - } - console.log(`Created ${jobsCreated} scheduled crawl jobs`); - return jobsCreated; -} -/** - * Check for daily special runs (12:01 AM local time) - */ -async function checkAndCreateDailySpecialJobs() { - console.log('Checking for daily special runs...'); - // Get daily special schedule - const dailySchedule = await migrate_1.pool.query(` - SELECT * FROM crawler_schedule WHERE schedule_type = 'daily_special' - `); - if (dailySchedule.rows.length === 0 || !dailySchedule.rows[0].enabled) { - console.log('Daily special scheduler is disabled'); - return 0; - } - const targetTime = dailySchedule.rows[0].run_time || '00:01'; - // Find stores where it's currently the target time in their local timezone - // and they haven't had a daily special run today - const result = await migrate_1.pool.query(` - SELECT - s.id, - s.name, - s.timezone, - COALESCE(scs.daily_special_enabled, TRUE) as daily_special_enabled, - COALESCE(scs.daily_special_time, $1::TIME) as daily_special_time, - COALESCE(scs.priority, 0) as priority - FROM stores s - LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id - WHERE s.active = TRUE - AND s.scrape_enabled = TRUE - AND COALESCE(scs.daily_special_enabled, TRUE) = TRUE - -- Check if current time in store timezone matches the target time (within 2 minutes) - AND ABS( - EXTRACT(EPOCH FROM ( - (NOW() AT TIME ZONE COALESCE(s.timezone, 
'America/Phoenix'))::TIME - - COALESCE(scs.daily_special_time, $1::TIME) - )) - ) < 120 -- within 2 minutes - -- Ensure we haven't already created a daily_special job today for this store - AND NOT EXISTS ( - SELECT 1 FROM crawl_jobs cj - WHERE cj.store_id = s.id - AND cj.trigger_type = 'daily_special' - AND cj.created_at > (NOW() AT TIME ZONE COALESCE(s.timezone, 'America/Phoenix'))::DATE - ) - AND NOT EXISTS ( - SELECT 1 FROM crawl_jobs cj - WHERE cj.store_id = s.id AND cj.status IN ('pending', 'running') - ) - ORDER BY COALESCE(scs.priority, 0) DESC - `, [targetTime]); - let jobsCreated = 0; - for (const store of result.rows) { - try { - await createCrawlJob(store.id, 'full_crawl', 'daily_special', new Date(), store.priority + 10); - jobsCreated++; - console.log(`Created daily special job for: ${store.name} (${store.timezone})`); - } - catch (error) { - console.error(`Failed to create daily special job for store ${store.name}:`, error); - } - } - if (jobsCreated > 0) { - console.log(`Created ${jobsCreated} daily special crawl jobs`); - } - return jobsCreated; -} -/** - * Process pending jobs - */ -async function processJobs() { - if (jobProcessorRunning) { - console.log('Job processor already running, skipping...'); - return; - } - jobProcessorRunning = true; - try { - const jobs = await getPendingJobs(1); // Process one at a time for safety - for (const job of jobs) { - console.log(`Processing job ${job.id} for store: ${job.store_name}`); - const claimed = await claimJob(job.id); - if (!claimed) { - console.log(`Job ${job.id} already claimed by another worker`); - continue; - } - try { - // Call the existing scraper - DO NOT MODIFY SCRAPER LOGIC - await (0, scraper_v2_1.scrapeStore)(job.store_id); - // Update store's last_scraped_at - await migrate_1.pool.query(` - UPDATE stores SET last_scraped_at = NOW() WHERE id = $1 - `, [job.store_id]); - await completeJob(job.id, true, {}); - console.log(`Job ${job.id} completed successfully`); - } - catch (error) { - 
console.error(`Job ${job.id} failed:`, error); - await completeJob(job.id, false, { error_message: error.message }); - } - } - } - finally { - jobProcessorRunning = false; - } -} -/** - * Process stores using the intelligent orchestrator - * This replaces the simple job queue approach with intelligent provider detection - */ -async function processOrchestrator() { - if (orchestratorProcessorRunning) { - console.log('Orchestrator processor already running, skipping...'); - return; - } - orchestratorProcessorRunning = true; - try { - // Get stores due for orchestration (respects schedule, intervals, etc.) - const storeIds = await (0, store_crawl_orchestrator_1.getStoresDueForOrchestration)(3); // Process up to 3 at a time - if (storeIds.length === 0) { - return; - } - console.log(`Orchestrator: Processing ${storeIds.length} stores due for crawl`); - // Process each store through the orchestrator - for (const storeId of storeIds) { - try { - console.log(`Orchestrator: Starting crawl for store ${storeId}`); - const result = await (0, store_crawl_orchestrator_1.runStoreCrawlOrchestrator)(storeId); - console.log(`Orchestrator: Store ${storeId} completed - ${result.summary}`); - } - catch (error) { - console.error(`Orchestrator: Store ${storeId} failed - ${error.message}`); - } - } - console.log(`Orchestrator: Finished processing ${storeIds.length} stores`); - } - finally { - orchestratorProcessorRunning = false; - } -} -// ============================================ -// Scheduler Control -// ============================================ -/** - * Set scheduler mode - */ -function setSchedulerMode(mode) { - schedulerMode = mode; - console.log(`Scheduler mode set to: ${mode}`); -} -/** - * Get current scheduler mode - */ -function getSchedulerMode() { - return schedulerMode; -} -/** - * Start the scheduler (runs every minute to check for due jobs) - */ -async function startCrawlScheduler() { - stopCrawlScheduler(); - console.log(`Starting crawl scheduler in ${schedulerMode} 
mode...`); - // Run every minute - schedulerCronJob = node_cron_1.default.schedule('* * * * *', async () => { - try { - if (schedulerMode === 'orchestrator') { - // Use intelligent orchestrator (handles detection + crawl) - await processOrchestrator(); - } - else { - // Legacy mode: job queue approach - // Check for interval-based scheduled jobs - await checkAndCreateScheduledJobs(); - // Check for daily special runs - await checkAndCreateDailySpecialJobs(); - // Process any pending jobs - await processJobs(); - } - } - catch (error) { - console.error('Scheduler tick error:', error); - } - }); - console.log(`Crawl scheduler started in ${schedulerMode} mode (checking every minute)`); -} -/** - * Stop the scheduler - */ -function stopCrawlScheduler() { - if (schedulerCronJob) { - schedulerCronJob.stop(); - schedulerCronJob = null; - console.log('Crawl scheduler stopped'); - } -} -/** - * Restart the scheduler - */ -async function restartCrawlScheduler() { - await startCrawlScheduler(); -} -// ============================================ -// Manual Triggers -// ============================================ -/** - * Manually trigger a crawl for a specific store (creates a job immediately) - */ -async function triggerManualCrawl(storeId) { - console.log(`Manual crawl triggered for store ID: ${storeId}`); - return await createCrawlJob(storeId, 'full_crawl', 'manual', new Date(), 100); // High priority -} -/** - * Manually trigger crawls for all stores - */ -async function triggerAllStoresCrawl() { - console.log('Manual crawl triggered for all stores'); - const result = await migrate_1.pool.query(` - SELECT id, name FROM stores - WHERE active = TRUE AND scrape_enabled = TRUE - AND NOT EXISTS ( - SELECT 1 FROM crawl_jobs cj - WHERE cj.store_id = stores.id AND cj.status IN ('pending', 'running') - ) - `); - let jobsCreated = 0; - for (const store of result.rows) { - await createCrawlJob(store.id, 'full_crawl', 'manual', new Date(), 50); - jobsCreated++; - } - 
console.log(`Created ${jobsCreated} manual crawl jobs`); - return jobsCreated; -} -/** - * Cancel a pending job - */ -async function cancelJob(jobId) { - const result = await migrate_1.pool.query(` - UPDATE crawl_jobs - SET status = 'cancelled' - WHERE id = $1 AND status = 'pending' - RETURNING id - `, [jobId]); - return result.rows.length > 0; -} diff --git a/backend/dist/services/crawler-jobs.js b/backend/dist/services/crawler-jobs.js deleted file mode 100644 index 6bf28e3f..00000000 --- a/backend/dist/services/crawler-jobs.js +++ /dev/null @@ -1,476 +0,0 @@ -"use strict"; -/** - * Crawler Jobs Service - * - * Handles three types of jobs: - * 1. DetectMenuProviderJob - Detect menu provider for a dispensary - * 2. DutchieMenuCrawlJob - Production Dutchie crawl - * 3. SandboxCrawlJob - Learning/testing crawl for unknown providers - */ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.runDetectMenuProviderJob = runDetectMenuProviderJob; -exports.runDutchieMenuCrawlJob = runDutchieMenuCrawlJob; -exports.runSandboxCrawlJob = runSandboxCrawlJob; -exports.processSandboxJobs = processSandboxJobs; -const migrate_1 = require("../db/migrate"); -const logger_1 = require("./logger"); -const menu_provider_detector_1 = require("./menu-provider-detector"); -const scraper_v2_1 = require("../scraper-v2"); -const puppeteer_1 = __importDefault(require("puppeteer")); -const fs_1 = require("fs"); -const path_1 = __importDefault(require("path")); -const availability_1 = require("./availability"); -const WORKER_ID = `crawler-${process.pid}-${Date.now()}`; -// ======================================== -// Helper Functions -// ======================================== -async function getDispensary(dispensaryId) { - const result = await migrate_1.pool.query(`SELECT id, name, website, menu_url, menu_provider, 
menu_provider_confidence, - crawler_mode, crawler_status, scraper_template - FROM dispensaries WHERE id = $1`, [dispensaryId]); - return result.rows[0] || null; -} -async function updateDispensary(dispensaryId, updates) { - const setClauses = []; - const values = []; - let paramIndex = 1; - for (const [key, value] of Object.entries(updates)) { - setClauses.push(`${key} = $${paramIndex}`); - values.push(value); - paramIndex++; - } - setClauses.push(`updated_at = NOW()`); - values.push(dispensaryId); - await migrate_1.pool.query(`UPDATE dispensaries SET ${setClauses.join(', ')} WHERE id = $${paramIndex}`, values); -} -async function createSandboxEntry(dispensaryId, suspectedProvider, mode, detectionSignals) { - // First, check if there's an existing active sandbox - const existing = await migrate_1.pool.query(`SELECT id FROM crawler_sandboxes - WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed')`, [dispensaryId]); - if (existing.rows.length > 0) { - // Update existing - await migrate_1.pool.query(`UPDATE crawler_sandboxes - SET suspected_menu_provider = $2, mode = $3, detection_signals = COALESCE($4, detection_signals), updated_at = NOW() - WHERE id = $1`, [existing.rows[0].id, suspectedProvider, mode, detectionSignals ? JSON.stringify(detectionSignals) : null]); - return existing.rows[0].id; - } - // Create new - const result = await migrate_1.pool.query(`INSERT INTO crawler_sandboxes (dispensary_id, suspected_menu_provider, mode, detection_signals, status) - VALUES ($1, $2, $3, $4, 'pending') - RETURNING id`, [dispensaryId, suspectedProvider, mode, detectionSignals ? 
JSON.stringify(detectionSignals) : '{}']); - return result.rows[0].id; -} -async function createSandboxJob(dispensaryId, sandboxId, jobType, priority = 0) { - const result = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority) - VALUES ($1, $2, $3, 'pending', $4) - RETURNING id`, [dispensaryId, sandboxId, jobType, priority]); - return result.rows[0].id; -} -// Get linked store ID for a dispensary (for using existing scraper) -async function getStoreIdForDispensary(dispensaryId) { - // Check if there's a stores entry linked to this dispensary - const result = await migrate_1.pool.query(`SELECT s.id FROM stores s - JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%' - WHERE d.id = $1 - LIMIT 1`, [dispensaryId]); - if (result.rows.length > 0) { - return result.rows[0].id; - } - // Try to find by website - const result2 = await migrate_1.pool.query(`SELECT s.id FROM stores s - JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%' - WHERE d.id = $1 - LIMIT 1`, [dispensaryId]); - return result2.rows[0]?.id || null; -} -// ======================================== -// Job 1: Detect Menu Provider -// ======================================== -async function runDetectMenuProviderJob(dispensaryId) { - logger_1.logger.info('crawler-jobs', `Starting menu provider detection for dispensary ${dispensaryId}`); - const dispensary = await getDispensary(dispensaryId); - if (!dispensary) { - return { success: false, message: `Dispensary ${dispensaryId} not found` }; - } - // Check for website URL - const websiteUrl = dispensary.website || dispensary.menu_url; - if (!websiteUrl) { - await updateDispensary(dispensaryId, { - crawler_status: 'error_needs_review', - last_menu_error_at: new Date(), - last_error_message: 'No website URL available for detection', - }); - return { success: false, message: 'No website URL available' }; - } - try { - // Run detection - const detection = await (0, 
menu_provider_detector_1.detectMenuProvider)(websiteUrl, { - checkMenuPaths: true, - timeout: 30000, - }); - // Update dispensary with results - const updates = { - menu_provider: detection.provider, - menu_provider_confidence: detection.confidence, - provider_detection_data: JSON.stringify({ - signals: detection.signals, - urlsTested: detection.urlsTested, - menuEntryPoints: detection.menuEntryPoints, - rawSignals: detection.rawSignals, - detectedAt: new Date().toISOString(), - }), - crawler_status: 'idle', - }; - // Decide crawler mode based on provider - if (detection.provider === 'dutchie' && detection.confidence >= 70) { - // Dutchie with high confidence -> production - updates.crawler_mode = 'production'; - logger_1.logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as Dutchie (${detection.confidence}%), setting to production`); - } - else { - // Unknown or non-Dutchie -> sandbox - updates.crawler_mode = 'sandbox'; - // Create sandbox entry for further analysis - const sandboxId = await createSandboxEntry(dispensaryId, detection.provider, 'detection', { - signals: detection.signals, - rawSignals: detection.rawSignals, - }); - // Queue sandbox crawl job - await createSandboxJob(dispensaryId, sandboxId, 'detection'); - logger_1.logger.info('crawler-jobs', `Dispensary ${dispensaryId} detected as ${detection.provider} (${detection.confidence}%), setting to sandbox`); - } - // Update menu entry points if found - if (detection.menuEntryPoints.length > 0 && !dispensary.menu_url) { - updates.menu_url = detection.menuEntryPoints[0]; - } - await updateDispensary(dispensaryId, updates); - return { - success: true, - message: `Detected provider: ${detection.provider} (${detection.confidence}%)`, - data: { - provider: detection.provider, - confidence: detection.confidence, - mode: updates.crawler_mode, - menuEntryPoints: detection.menuEntryPoints, - }, - }; - } - catch (error) { - logger_1.logger.error('crawler-jobs', `Detection failed for dispensary 
${dispensaryId}: ${error.message}`); - await updateDispensary(dispensaryId, { - crawler_status: 'error_needs_review', - last_menu_error_at: new Date(), - last_error_message: `Detection failed: ${error.message}`, - }); - return { success: false, message: error.message }; - } -} -// ======================================== -// Job 2: Dutchie Menu Crawl (Production) -// ======================================== -async function runDutchieMenuCrawlJob(dispensaryId) { - logger_1.logger.info('crawler-jobs', `Starting Dutchie production crawl for dispensary ${dispensaryId}`); - const dispensary = await getDispensary(dispensaryId); - if (!dispensary) { - return { success: false, message: `Dispensary ${dispensaryId} not found` }; - } - // Verify it's a Dutchie production dispensary - if (dispensary.menu_provider !== 'dutchie') { - logger_1.logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not Dutchie, skipping production crawl`); - return { success: false, message: 'Not a Dutchie dispensary' }; - } - if (dispensary.crawler_mode !== 'production') { - logger_1.logger.warn('crawler-jobs', `Dispensary ${dispensaryId} is not in production mode, skipping`); - return { success: false, message: 'Not in production mode' }; - } - // Find linked store ID - const storeId = await getStoreIdForDispensary(dispensaryId); - if (!storeId) { - // Need to create a store entry or handle differently - logger_1.logger.warn('crawler-jobs', `No linked store found for dispensary ${dispensaryId}`); - return { success: false, message: 'No linked store found - needs setup' }; - } - try { - // Update status to running - await updateDispensary(dispensaryId, { crawler_status: 'running' }); - // Run the existing Dutchie scraper - await (0, scraper_v2_1.scrapeStore)(storeId, 3); // 3 parallel workers - // Update success status - await updateDispensary(dispensaryId, { - crawler_status: 'ok', - last_menu_scrape: new Date(), - menu_scrape_status: 'active', - }); - logger_1.logger.info('crawler-jobs', 
`Dutchie crawl completed for dispensary ${dispensaryId}`); - return { - success: true, - message: 'Dutchie crawl completed successfully', - data: { storeId }, - }; - } - catch (error) { - logger_1.logger.error('crawler-jobs', `Dutchie crawl failed for dispensary ${dispensaryId}: ${error.message}`); - // Check if this might be a provider change - let providerChanged = false; - try { - const browser = await puppeteer_1.default.launch({ headless: true, args: ['--no-sandbox'] }); - const page = await browser.newPage(); - const url = dispensary.menu_url || dispensary.website; - if (url) { - await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 }); - const changeResult = await (0, menu_provider_detector_1.detectProviderChange)(page, 'dutchie'); - providerChanged = changeResult.changed; - if (providerChanged) { - // Provider changed - move to sandbox - await updateDispensary(dispensaryId, { - crawler_mode: 'sandbox', - crawler_status: 'error_needs_review', - last_menu_error_at: new Date(), - last_error_message: `Provider appears to have changed from Dutchie to ${changeResult.newProvider}`, - }); - const sandboxId = await createSandboxEntry(dispensaryId, changeResult.newProvider || 'unknown', 'detection', { providerChangeDetected: true, previousProvider: 'dutchie' }); - await createSandboxJob(dispensaryId, sandboxId, 'detection'); - logger_1.logger.warn('crawler-jobs', `Provider change detected for dispensary ${dispensaryId}: Dutchie -> ${changeResult.newProvider}`); - } - } - await browser.close(); - } - catch { - // Ignore detection errors during failure handling - } - if (!providerChanged) { - await updateDispensary(dispensaryId, { - crawler_status: 'error_needs_review', - last_menu_error_at: new Date(), - last_error_message: error.message, - }); - } - return { success: false, message: error.message }; - } -} -// ======================================== -// Job 3: Sandbox Crawl (Learning Mode) -// ======================================== -async function 
runSandboxCrawlJob(dispensaryId, sandboxId) { - logger_1.logger.info('crawler-jobs', `Starting sandbox crawl for dispensary ${dispensaryId}`); - const dispensary = await getDispensary(dispensaryId); - if (!dispensary) { - return { success: false, message: `Dispensary ${dispensaryId} not found` }; - } - // Get or create sandbox entry - let sandbox; - if (sandboxId) { - const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]); - sandbox = result.rows[0]; - } - else { - const result = await migrate_1.pool.query(`SELECT * FROM crawler_sandboxes - WHERE dispensary_id = $1 AND status NOT IN ('moved_to_production', 'failed') - ORDER BY created_at DESC LIMIT 1`, [dispensaryId]); - sandbox = result.rows[0]; - if (!sandbox) { - const newSandboxId = await createSandboxEntry(dispensaryId, dispensary.menu_provider, 'template_learning'); - const result = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [newSandboxId]); - sandbox = result.rows[0]; - } - } - const websiteUrl = dispensary.menu_url || dispensary.website; - if (!websiteUrl) { - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = 'No website URL' WHERE id = $1`, [sandbox.id]); - return { success: false, message: 'No website URL available' }; - } - let browser = null; - try { - // Update status - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'analyzing', updated_at = NOW() WHERE id = $1`, [sandbox.id]); - await updateDispensary(dispensaryId, { crawler_status: 'running' }); - // Launch browser - browser = await puppeteer_1.default.launch({ - headless: true, - args: ['--no-sandbox', '--disable-setuid-sandbox'], - }); - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); - // URLs to crawl (limited depth for sandbox) - const urlsToVisit = [websiteUrl]; - const 
menuPaths = ['/menu', '/shop', '/products', '/order']; - for (const path of menuPaths) { - const baseUrl = new URL(websiteUrl).origin; - urlsToVisit.push(`${baseUrl}${path}`); - } - const urlsTested = []; - const menuEntryPoints = []; - const capturedHtml = []; - const analysisData = { - provider_signals: {}, - selector_candidates: [], - page_structures: [], - }; - // Crawl each URL - for (const url of urlsToVisit) { - try { - urlsTested.push(url); - await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 }); - await new Promise(r => setTimeout(r, 2000)); // Wait for dynamic content - // Get page HTML - const html = await page.content(); - // Check if this looks like a menu page - const hasMenuContent = await page.evaluate(() => { - const text = document.body.innerText.toLowerCase(); - return (text.includes('add to cart') || - text.includes('thc') || - text.includes('indica') || - text.includes('sativa')); - }); - if (hasMenuContent) { - menuEntryPoints.push(url); - capturedHtml.push({ url, html }); - // Analyze page structure for selector candidates - const structure = await page.evaluate(() => { - const candidates = []; - // Look for product-like containers - const productSelectors = [ - '.product', '.product-card', '.menu-item', '.item-card', - '[data-product]', '[data-item]', '.strain', '.listing', - ]; - for (const selector of productSelectors) { - const els = document.querySelectorAll(selector); - if (els.length > 3) { // Likely a list - candidates.push({ - selector, - count: els.length, - type: 'product_container', - }); - } - } - // Look for price patterns - const pricePattern = /\$\d+(\.\d{2})?/; - const textNodes = document.body.innerText; - const priceMatches = textNodes.match(/\$\d+(\.\d{2})?/g); - return { - candidates, - priceCount: priceMatches?.length || 0, - hasAddToCart: textNodes.toLowerCase().includes('add to cart'), - }; - }); - // Extract availability hints from page content - const availabilityHints = (0, 
availability_1.extractAvailabilityHints)(html); - analysisData.page_structures.push({ - url, - ...structure, - availabilityHints, - }); - } - } - catch (pageError) { - if (!pageError.message.includes('404')) { - logger_1.logger.warn('crawler-jobs', `Sandbox crawl error for ${url}: ${pageError.message}`); - } - } - } - // Save HTML to storage (local for now, S3 later) - let rawHtmlLocation = null; - if (capturedHtml.length > 0) { - const htmlDir = path_1.default.join(process.cwd(), 'sandbox-data', `dispensary-${dispensaryId}`); - await fs_1.promises.mkdir(htmlDir, { recursive: true }); - for (const { url, html } of capturedHtml) { - const filename = `${Date.now()}-${url.replace(/[^a-z0-9]/gi, '_')}.html`; - await fs_1.promises.writeFile(path_1.default.join(htmlDir, filename), html); - } - rawHtmlLocation = htmlDir; - } - // Update sandbox with results - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET - status = $1, - urls_tested = $2, - menu_entry_points = $3, - raw_html_location = $4, - analysis_json = $5, - confidence_score = $6, - analyzed_at = NOW(), - updated_at = NOW() - WHERE id = $7`, [ - menuEntryPoints.length > 0 ? 'needs_human_review' : 'pending', - JSON.stringify(urlsTested), - JSON.stringify(menuEntryPoints), - rawHtmlLocation, - JSON.stringify(analysisData), - menuEntryPoints.length > 0 ? 50 : 20, - sandbox.id, - ]); - // Update dispensary status - await updateDispensary(dispensaryId, { - crawler_status: 'error_needs_review', // Sandbox results need review - }); - logger_1.logger.info('crawler-jobs', `Sandbox crawl completed for dispensary ${dispensaryId}: ${menuEntryPoints.length} menu pages found`); - return { - success: true, - message: `Sandbox crawl completed. 
Found ${menuEntryPoints.length} menu entry points.`, - data: { - sandboxId: sandbox.id, - urlsTested: urlsTested.length, - menuEntryPoints, - analysisData, - }, - }; - } - catch (error) { - logger_1.logger.error('crawler-jobs', `Sandbox crawl failed for dispensary ${dispensaryId}: ${error.message}`); - await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'failed', failure_reason = $1 WHERE id = $2`, [error.message, sandbox.id]); - await updateDispensary(dispensaryId, { - crawler_status: 'error_needs_review', - last_menu_error_at: new Date(), - last_error_message: `Sandbox crawl failed: ${error.message}`, - }); - return { success: false, message: error.message }; - } - finally { - if (browser) { - await browser.close(); - } - } -} -// ======================================== -// Queue Processing Functions -// ======================================== -/** - * Process pending sandbox jobs - */ -async function processSandboxJobs(limit = 5) { - // Claim pending jobs - const jobs = await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs - SET status = 'running', worker_id = $1, started_at = NOW() - WHERE id IN ( - SELECT id FROM sandbox_crawl_jobs - WHERE status = 'pending' AND scheduled_at <= NOW() - ORDER BY priority DESC, scheduled_at ASC - LIMIT $2 - FOR UPDATE SKIP LOCKED - ) - RETURNING *`, [WORKER_ID, limit]); - for (const job of jobs.rows) { - try { - let result; - if (job.job_type === 'detection') { - result = await runDetectMenuProviderJob(job.dispensary_id); - } - else { - result = await runSandboxCrawlJob(job.dispensary_id, job.sandbox_id); - } - await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs - SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3 - WHERE id = $4`, [ - result.success ? 'completed' : 'failed', - JSON.stringify(result.data || {}), - result.success ? 
null : result.message, - job.id, - ]); - } - catch (error) { - await migrate_1.pool.query(`UPDATE sandbox_crawl_jobs SET status = 'failed', error_message = $1 WHERE id = $2`, [error.message, job.id]); - } - } -} diff --git a/backend/dist/services/crawler-logger.js b/backend/dist/services/crawler-logger.js deleted file mode 100644 index 72c0fcbe..00000000 --- a/backend/dist/services/crawler-logger.js +++ /dev/null @@ -1,202 +0,0 @@ -"use strict"; -/** - * CrawlerLogger - Structured logging for crawler operations - * - * High-signal, low-noise logging with JSON output for: - * - Job lifecycle (one summary per job) - * - Provider/mode changes - * - Sandbox events - * - Queue failures - * - * NO per-product logging - that's too noisy. - */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.crawlerLogger = void 0; -class CrawlerLoggerService { - formatLog(payload) { - return JSON.stringify(payload); - } - log(payload) { - const formatted = this.formatLog(payload); - switch (payload.level) { - case 'error': - console.error(`[CRAWLER] ${formatted}`); - break; - case 'warn': - console.warn(`[CRAWLER] ${formatted}`); - break; - case 'debug': - console.debug(`[CRAWLER] ${formatted}`); - break; - default: - console.log(`[CRAWLER] ${formatted}`); - } - } - /** - * Log when a crawl job starts - */ - jobStarted(params) { - this.log({ - timestamp: new Date().toISOString(), - level: 'info', - event: 'job_started', - job_id: params.job_id, - store_id: params.store_id, - store_name: params.store_name, - job_type: params.job_type, - trigger_type: params.trigger_type, - provider: params.provider, - }); - } - /** - * Log when a crawl job completes successfully - */ - jobCompleted(params) { - this.log({ - timestamp: new Date().toISOString(), - level: 'info', - event: 'job_completed', - job_id: params.job_id, - store_id: params.store_id, - store_name: params.store_name, - duration_ms: params.duration_ms, - products_found: params.products_found, - products_new: 
params.products_new, - products_updated: params.products_updated, - products_marked_oos: params.products_marked_oos, - provider: params.provider, - }); - } - /** - * Log when a crawl job fails - */ - jobFailed(params) { - this.log({ - timestamp: new Date().toISOString(), - level: 'error', - event: 'job_failed', - job_id: params.job_id, - store_id: params.store_id, - store_name: params.store_name, - duration_ms: params.duration_ms, - error_message: params.error_message, - error_code: params.error_code, - provider: params.provider, - }); - } - /** - * Log when a provider is detected for a dispensary - */ - providerDetected(params) { - this.log({ - timestamp: new Date().toISOString(), - level: 'info', - event: 'provider_detected', - dispensary_id: params.dispensary_id, - dispensary_name: params.dispensary_name, - detected_provider: params.detected_provider, - confidence: params.confidence, - detection_method: params.detection_method, - menu_url: params.menu_url, - category: params.category, - }); - } - /** - * Log when a dispensary's provider changes - */ - providerChanged(params) { - this.log({ - timestamp: new Date().toISOString(), - level: 'info', - event: 'provider_changed', - dispensary_id: params.dispensary_id, - dispensary_name: params.dispensary_name, - old_provider: params.old_provider, - new_provider: params.new_provider, - old_confidence: params.old_confidence, - new_confidence: params.new_confidence, - category: params.category, - }); - } - /** - * Log when a dispensary's crawler mode changes (sandbox -> production, etc.) 
- */ - modeChanged(params) { - this.log({ - timestamp: new Date().toISOString(), - level: 'info', - event: 'mode_changed', - dispensary_id: params.dispensary_id, - dispensary_name: params.dispensary_name, - old_mode: params.old_mode, - new_mode: params.new_mode, - reason: params.reason, - category: params.category, - provider: params.provider, - }); - } - /** - * Log sandbox crawl events - */ - sandboxEvent(params) { - const level = params.event === 'sandbox_failed' ? 'error' : 'info'; - this.log({ - timestamp: new Date().toISOString(), - level, - event: params.event, - dispensary_id: params.dispensary_id, - dispensary_name: params.dispensary_name, - template_name: params.template_name, - category: params.category, - quality_score: params.quality_score, - products_extracted: params.products_extracted, - fields_missing: params.fields_missing, - error_message: params.error_message, - provider: params.provider, - }); - } - /** - * Log queue processing failures - */ - queueFailure(params) { - this.log({ - timestamp: new Date().toISOString(), - level: 'error', - event: 'queue_failure', - queue_type: params.queue_type, - error_message: params.error_message, - affected_items: params.affected_items, - }); - } - /** - * Log detection scan summary - */ - detectionScan(params) { - this.log({ - timestamp: new Date().toISOString(), - level: 'info', - event: 'detection_scan', - total_scanned: params.total_scanned, - detected: params.detected, - failed: params.failed, - skipped: params.skipped, - duration_ms: params.duration_ms, - }); - } - /** - * Log intelligence run summary - */ - intelligenceRun(params) { - this.log({ - timestamp: new Date().toISOString(), - level: 'info', - event: 'intelligence_run', - run_type: params.run_type, - dispensaries_processed: params.dispensaries_processed, - jobs_queued: params.jobs_queued, - duration_ms: params.duration_ms, - }); - } -} -// Export singleton instance -exports.crawlerLogger = new CrawlerLoggerService(); diff --git 
a/backend/dist/services/dispensary-orchestrator.js b/backend/dist/services/dispensary-orchestrator.js deleted file mode 100644 index 69b92245..00000000 --- a/backend/dist/services/dispensary-orchestrator.js +++ /dev/null @@ -1,394 +0,0 @@ -"use strict"; -/** - * Dispensary Crawl Orchestrator - * - * Orchestrates the complete crawl workflow for a dispensary: - * 1. Load dispensary data - * 2. Check if provider detection is needed - * 3. Run provider detection if needed - * 4. Queue appropriate crawl jobs based on provider/mode - * 5. Update dispensary_crawl_schedule with meaningful status - * - * This works DIRECTLY with dispensaries (not through stores table). - */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.runDispensaryOrchestrator = runDispensaryOrchestrator; -exports.runBatchDispensaryOrchestrator = runBatchDispensaryOrchestrator; -exports.getDispensariesDueForOrchestration = getDispensariesDueForOrchestration; -exports.ensureAllDispensariesHaveSchedules = ensureAllDispensariesHaveSchedules; -exports.processDispensaryScheduler = processDispensaryScheduler; -const uuid_1 = require("uuid"); -const migrate_1 = require("../db/migrate"); -const crawler_logger_1 = require("./crawler-logger"); -const intelligence_detector_1 = require("./intelligence-detector"); -const category_crawler_jobs_1 = require("./category-crawler-jobs"); -// ======================================== -// Main Orchestrator Function -// ======================================== -/** - * Run the complete crawl orchestration for a dispensary - * - * Behavior: - * 1. Load the dispensary info - * 2. If product_provider is missing or stale (>7 days), run detection - * 3. After detection: - * - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl - * - Otherwise: Run sandbox crawl - * 4. 
Update dispensary_crawl_schedule with status/summary - */ -async function runDispensaryOrchestrator(dispensaryId, scheduleId) { - const startTime = Date.now(); - const runId = (0, uuid_1.v4)(); - let result = { - status: 'pending', - summary: '', - runId, - dispensaryId, - dispensaryName: '', - detectionRan: false, - crawlRan: false, - durationMs: 0, - }; - try { - // Mark schedule as running - await updateScheduleStatus(dispensaryId, 'running', 'Starting orchestrator...', null, runId); - // 1. Load dispensary info - const dispensary = await getDispensaryInfo(dispensaryId); - if (!dispensary) { - throw new Error(`Dispensary ${dispensaryId} not found`); - } - result.dispensaryName = dispensary.name; - // 2. Check if provider detection is needed - const needsDetection = await checkNeedsDetection(dispensary); - if (needsDetection) { - // Run provider detection - const websiteUrl = dispensary.menu_url || dispensary.website; - if (!websiteUrl) { - result.status = 'error'; - result.summary = 'No website URL available for detection'; - result.error = 'Dispensary has no menu_url or website configured'; - await updateScheduleStatus(dispensaryId, 'error', result.summary, result.error, runId); - result.durationMs = Date.now() - startTime; - await createJobRecord(dispensaryId, scheduleId, result); - return result; - } - await updateScheduleStatus(dispensaryId, 'running', 'Running provider detection...', null, runId); - const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl); - result.detectionRan = true; - result.detectionResult = detectionResult; - // Save detection results to dispensary - await (0, intelligence_detector_1.updateAllCategoryProviders)(dispensaryId, detectionResult); - crawler_logger_1.crawlerLogger.providerDetected({ - dispensary_id: dispensaryId, - dispensary_name: dispensary.name, - detected_provider: detectionResult.product.provider, - confidence: detectionResult.product.confidence, - detection_method: 
'dispensary_orchestrator', - menu_url: websiteUrl, - category: 'product', - }); - // Refresh dispensary info after detection - const updatedDispensary = await getDispensaryInfo(dispensaryId); - if (updatedDispensary) { - Object.assign(dispensary, updatedDispensary); - } - } - // 3. Determine crawl type and run - // Use product_provider if available, otherwise fall back to menu_type - const provider = dispensary.product_provider || dispensary.menu_type; - const mode = dispensary.product_crawler_mode; - // Run production Dutchie crawl if: - // 1. product_provider is 'dutchie' with production mode, OR - // 2. menu_type is 'dutchie' with platform_dispensary_id (known Dutchie store) - const isDutchieProduction = (provider === 'dutchie' && mode === 'production') || - (dispensary.menu_type === 'dutchie' && dispensary.platform_dispensary_id); - if (isDutchieProduction) { - // Production Dutchie crawl - await updateScheduleStatus(dispensaryId, 'running', 'Running Dutchie production crawl...', null, runId); - try { - // Run the category-specific crawl job - const crawlResult = await (0, category_crawler_jobs_1.runCrawlProductsJob)(dispensaryId); - result.crawlRan = true; - result.crawlType = 'production'; - if (crawlResult.success) { - result.productsFound = crawlResult.data?.productsFound || 0; - const detectionPart = result.detectionRan ? 
'Detection + ' : ''; - result.summary = `${detectionPart}Dutchie products crawl completed`; - result.status = 'success'; - crawler_logger_1.crawlerLogger.jobCompleted({ - job_id: 0, - store_id: 0, - store_name: dispensary.name, - duration_ms: Date.now() - startTime, - products_found: result.productsFound || 0, - products_new: 0, - products_updated: 0, - provider: 'dutchie', - }); - } - else { - result.status = 'error'; - result.error = crawlResult.message; - result.summary = `Dutchie crawl failed: ${crawlResult.message.slice(0, 100)}`; - } - } - catch (crawlError) { - result.status = 'error'; - result.error = crawlError.message; - result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`; - result.crawlRan = true; - result.crawlType = 'production'; - crawler_logger_1.crawlerLogger.jobFailed({ - job_id: 0, - store_id: 0, - store_name: dispensary.name, - duration_ms: Date.now() - startTime, - error_message: crawlError.message, - provider: 'dutchie', - }); - } - } - else if (provider && provider !== 'unknown') { - // Sandbox crawl for non-Dutchie or sandbox mode - await updateScheduleStatus(dispensaryId, 'running', `Running ${provider} sandbox crawl...`, null, runId); - try { - const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(dispensaryId); - result.crawlRan = true; - result.crawlType = 'sandbox'; - result.productsFound = sandboxResult.data?.productsExtracted || 0; - const detectionPart = result.detectionRan ? 
'Detection + ' : ''; - if (sandboxResult.success) { - result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`; - result.status = 'sandbox_only'; - } - else { - result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`; - result.status = 'error'; - result.error = sandboxResult.message; - } - } - catch (sandboxError) { - result.status = 'error'; - result.error = sandboxError.message; - result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`; - result.crawlRan = true; - result.crawlType = 'sandbox'; - } - } - else { - // No provider detected - detection only - if (result.detectionRan) { - result.summary = `Detection complete: provider=${dispensary.product_provider || 'unknown'}, confidence=${dispensary.product_confidence || 0}%`; - result.status = 'detection_only'; - } - else { - result.summary = 'No provider detected and no crawl possible'; - result.status = 'error'; - result.error = 'Could not determine menu provider'; - } - } - } - catch (error) { - result.status = 'error'; - result.error = error.message; - result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`; - crawler_logger_1.crawlerLogger.queueFailure({ - queue_type: 'dispensary_orchestrator', - error_message: error.message, - }); - } - result.durationMs = Date.now() - startTime; - // Update final schedule status - await updateScheduleStatus(dispensaryId, result.status, result.summary, result.error || null, runId); - // Create job record - await createJobRecord(dispensaryId, scheduleId, result); - return result; -} -// ======================================== -// Helper Functions -// ======================================== -async function getDispensaryInfo(dispensaryId) { - const result = await migrate_1.pool.query(`SELECT id, name, city, website, menu_url, menu_type, platform_dispensary_id, - product_provider, product_confidence, product_crawler_mode, 
last_product_scan_at - FROM dispensaries - WHERE id = $1`, [dispensaryId]); - return result.rows[0] || null; -} -async function checkNeedsDetection(dispensary) { - // If menu_type is already 'dutchie' and we have platform_dispensary_id, skip detection entirely - // This avoids wasteful detection timeouts for known Dutchie stores - if (dispensary.menu_type === 'dutchie' && dispensary.platform_dispensary_id) { - return false; - } - // No provider = definitely needs detection - if (!dispensary.product_provider) - return true; - // Unknown provider = needs detection - if (dispensary.product_provider === 'unknown') - return true; - // Low confidence = needs re-detection - if (dispensary.product_confidence !== null && dispensary.product_confidence < 50) - return true; - // Stale detection (> 7 days) = needs refresh - if (dispensary.last_product_scan_at) { - const daysSince = (Date.now() - new Date(dispensary.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24); - if (daysSince > 7) - return true; - } - return false; -} -async function updateScheduleStatus(dispensaryId, status, summary, error, runId) { - await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, last_status, last_summary, last_error, last_run_at, updated_at) - VALUES ($1, $2, $3, $4, NOW(), NOW()) - ON CONFLICT (dispensary_id) DO UPDATE SET - last_status = $2, - last_summary = $3, - last_error = $4, - last_run_at = NOW(), - updated_at = NOW()`, [dispensaryId, status, summary, error]); -} -async function createJobRecord(dispensaryId, scheduleId, result) { - await migrate_1.pool.query(`INSERT INTO dispensary_crawl_jobs ( - dispensary_id, schedule_id, job_type, trigger_type, status, priority, - scheduled_at, started_at, completed_at, duration_ms, - detection_ran, crawl_ran, crawl_type, - products_found, products_new, products_updated, - detected_provider, detected_confidence, detected_mode, - error_message, run_id - ) VALUES ( - $1, $2, 'orchestrator', 'manual', $3, 100, - NOW(), 
NOW(), NOW(), $4, - $5, $6, $7, - $8, $9, $10, - $11, $12, $13, - $14, $15 - )`, [ - dispensaryId, - scheduleId || null, - result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed', - result.durationMs, - result.detectionRan, - result.crawlRan, - result.crawlType || null, - result.productsFound || null, - result.productsNew || null, - result.productsUpdated || null, - result.detectionResult?.product.provider || null, - result.detectionResult?.product.confidence || null, - result.detectionResult?.product.mode || null, - result.error || null, - result.runId, - ]); - // Update schedule stats - if (result.status === 'success' || result.status === 'sandbox_only' || result.status === 'detection_only') { - await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET - total_runs = COALESCE(total_runs, 0) + 1, - successful_runs = COALESCE(successful_runs, 0) + 1, - consecutive_failures = 0, - next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL, - last_duration_ms = $2 - WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]); - } - else if (result.status === 'error') { - await migrate_1.pool.query(`UPDATE dispensary_crawl_schedule SET - total_runs = COALESCE(total_runs, 0) + 1, - consecutive_failures = COALESCE(consecutive_failures, 0) + 1, - next_run_at = NOW() + (interval_minutes || ' minutes')::INTERVAL, - last_duration_ms = $2 - WHERE dispensary_id = $1`, [dispensaryId, result.durationMs]); - } -} -// ======================================== -// Batch Processing -// ======================================== -/** - * Run orchestrator for multiple dispensaries - */ -async function runBatchDispensaryOrchestrator(dispensaryIds, concurrency = 3) { - const results = []; - // Process in batches - for (let i = 0; i < dispensaryIds.length; i += concurrency) { - const batch = dispensaryIds.slice(i, i + concurrency); - console.log(`Processing batch ${Math.floor(i / concurrency) + 1}: dispensaries ${batch.join(', ')}`); - 
const batchResults = await Promise.all(batch.map(id => runDispensaryOrchestrator(id))); - results.push(...batchResults); - // Small delay between batches to avoid overwhelming the system - if (i + concurrency < dispensaryIds.length) { - await new Promise(r => setTimeout(r, 1000)); - } - } - return results; -} -/** - * Get dispensaries that are due for orchestration - */ -async function getDispensariesDueForOrchestration(limit = 10) { - const result = await migrate_1.pool.query(`SELECT d.id - FROM dispensaries d - LEFT JOIN dispensary_crawl_schedule dcs ON dcs.dispensary_id = d.id - WHERE COALESCE(dcs.is_active, TRUE) = TRUE - AND ( - dcs.next_run_at IS NULL - OR dcs.next_run_at <= NOW() - ) - AND (dcs.last_status IS NULL OR dcs.last_status NOT IN ('running', 'pending')) - ORDER BY COALESCE(dcs.priority, 0) DESC, dcs.last_run_at ASC NULLS FIRST - LIMIT $1`, [limit]); - return result.rows.map(row => row.id); -} -/** - * Ensure all dispensaries have schedule entries - */ -async function ensureAllDispensariesHaveSchedules(intervalMinutes = 240) { - // Get all dispensary IDs that don't have a schedule - const result = await migrate_1.pool.query(`INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority) - SELECT d.id, TRUE, $1, 0 - FROM dispensaries d - WHERE NOT EXISTS ( - SELECT 1 FROM dispensary_crawl_schedule dcs WHERE dcs.dispensary_id = d.id - ) - RETURNING id`, [intervalMinutes]); - const existingCount = await migrate_1.pool.query('SELECT COUNT(*) FROM dispensary_crawl_schedule'); - return { - created: result.rowCount || 0, - existing: parseInt(existingCount.rows[0].count) - (result.rowCount || 0), - }; -} -// ======================================== -// Scheduler Integration -// ======================================== -let dispensarySchedulerRunning = false; -/** - * Process dispensaries using the intelligent orchestrator - * Called periodically by the scheduler - */ -async function processDispensaryScheduler() { - if 
(dispensarySchedulerRunning) { - console.log('Dispensary scheduler already running, skipping...'); - return; - } - dispensarySchedulerRunning = true; - try { - // Get dispensaries due for orchestration - const dispensaryIds = await getDispensariesDueForOrchestration(3); - if (dispensaryIds.length === 0) { - return; - } - console.log(`Dispensary Scheduler: Processing ${dispensaryIds.length} dispensaries due for crawl`); - // Process each dispensary through the orchestrator - for (const dispensaryId of dispensaryIds) { - try { - console.log(`Dispensary Scheduler: Starting crawl for dispensary ${dispensaryId}`); - const result = await runDispensaryOrchestrator(dispensaryId); - console.log(`Dispensary Scheduler: Dispensary ${dispensaryId} completed - ${result.summary}`); - } - catch (error) { - console.error(`Dispensary Scheduler: Dispensary ${dispensaryId} failed - ${error.message}`); - } - } - console.log(`Dispensary Scheduler: Finished processing ${dispensaryIds.length} dispensaries`); - } - finally { - dispensarySchedulerRunning = false; - } -} diff --git a/backend/dist/services/geolocation.js b/backend/dist/services/geolocation.js deleted file mode 100644 index 32917440..00000000 --- a/backend/dist/services/geolocation.js +++ /dev/null @@ -1,125 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.lookupProxyLocation = lookupProxyLocation; -exports.updateProxyLocation = updateProxyLocation; -exports.updateAllProxyLocations = updateAllProxyLocations; -exports.queueProxyLocationUpdate = queueProxyLocationUpdate; -const axios_1 = __importDefault(require("axios")); -const migrate_1 = require("../db/migrate"); -// Free API - 45 requests/minute limit -const GEOLOCATION_API = 'http://ip-api.com/json/'; -async function lookupProxyLocation(host) { - try { - const response = await axios_1.default.get(`${GEOLOCATION_API}${host}?fields=status,message,country,countryCode,regionName,city,query`); - const data = response.data; - if (data.status === 'fail') { - console.log(`❌ Geolocation lookup failed for ${host}: ${data.message}`); - return null; - } - return data; - } - catch (error) { - console.error(`❌ Error looking up location for ${host}:`, error.message); - return null; - } -} -async function updateProxyLocation(proxyId, location) { - await migrate_1.pool.query(` - UPDATE proxies - SET city = $1, - state = $2, - country = $3, - country_code = $4, - location_updated_at = CURRENT_TIMESTAMP - WHERE id = $5 - `, [ - location.city, - location.regionName, - location.country, - location.countryCode, - proxyId - ]); -} -async function updateAllProxyLocations(batchSize = 45) { - console.log('🌍 Starting proxy location update job...'); - // Get all proxies without location data - const result = await migrate_1.pool.query(` - SELECT id, host - FROM proxies - WHERE location_updated_at IS NULL - OR location_updated_at < CURRENT_TIMESTAMP - INTERVAL '30 days' - ORDER BY id - `); - const proxies = result.rows; - console.log(`📊 Found ${proxies.length} proxies to update`); - let updated = 0; - let failed = 0; - // Process in batches to respect rate limit (45 req/min) - for (let i = 0; i < proxies.length; i += batchSize) { - const batch = proxies.slice(i, i + batchSize); - 
console.log(`🔄 Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(proxies.length / batchSize)} (${batch.length} proxies)`); - // Process batch - for (const proxy of batch) { - const location = await lookupProxyLocation(proxy.host); - if (location) { - await updateProxyLocation(proxy.id, location); - console.log(`✅ Updated ${proxy.id}: ${location.city}, ${location.regionName} - ${location.country}`); - updated++; - } - else { - console.log(`⚠️ Failed to get location for proxy ${proxy.id} (${proxy.host})`); - failed++; - } - // Small delay between requests - await new Promise(resolve => setTimeout(resolve, 100)); - } - // Wait 60 seconds before next batch to respect rate limit - if (i + batchSize < proxies.length) { - console.log(`⏳ Waiting 60s before next batch (rate limit: 45 req/min)...`); - await new Promise(resolve => setTimeout(resolve, 60000)); - } - } - console.log(`✅ Proxy location update complete!`); - console.log(` Updated: ${updated}`); - console.log(` Failed: ${failed}`); -} -// Queue for background processing -const locationUpdateQueue = new Set(); -let isProcessing = false; -function queueProxyLocationUpdate(proxyId) { - locationUpdateQueue.add(proxyId); - processLocationQueue(); -} -async function processLocationQueue() { - if (isProcessing || locationUpdateQueue.size === 0) - return; - isProcessing = true; - try { - const proxyIds = Array.from(locationUpdateQueue); - locationUpdateQueue.clear(); - console.log(`🌍 Processing ${proxyIds.length} proxy location updates from queue`); - for (const proxyId of proxyIds) { - const result = await migrate_1.pool.query('SELECT host FROM proxies WHERE id = $1', [proxyId]); - if (result.rows.length === 0) - continue; - const host = result.rows[0].host; - const location = await lookupProxyLocation(host); - if (location) { - await updateProxyLocation(proxyId, location); - console.log(`✅ Queue: Updated ${proxyId}: ${location.city}, ${location.regionName} - ${location.country}`); - } - // Respect rate limit 
- await new Promise(resolve => setTimeout(resolve, 1500)); // ~40 req/min - } - } - finally { - isProcessing = false; - // Process any new items that were added while we were processing - if (locationUpdateQueue.size > 0) { - processLocationQueue(); - } - } -} diff --git a/backend/dist/services/intelligence-detector.js b/backend/dist/services/intelligence-detector.js deleted file mode 100644 index 0f5993b6..00000000 --- a/backend/dist/services/intelligence-detector.js +++ /dev/null @@ -1,493 +0,0 @@ -"use strict"; -/** - * Multi-Category Intelligence Detector - * - * Detects providers for each intelligence category independently: - * - Products: Which provider serves product data - * - Specials: Which provider serves deals/specials - * - Brand: Which provider serves brand information - * - Metadata: Which provider serves taxonomy/category data - */ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.detectMultiCategoryProviders = detectMultiCategoryProviders; -exports.detectCategoryProviderChange = detectCategoryProviderChange; -exports.updateDispensaryCategoryProvider = updateDispensaryCategoryProvider; -exports.updateAllCategoryProviders = updateAllCategoryProviders; -exports.moveCategoryToSandbox = moveCategoryToSandbox; -const migrate_1 = require("../db/migrate"); -const logger_1 = require("./logger"); -const puppeteer_1 = __importDefault(require("puppeteer")); -// Production-ready providers per category -// Only these combinations can be set to production mode -const PRODUCTION_READY = { - product: ['dutchie'], // Only Dutchie products are production-ready - specials: [], // None yet - brand: [], // None yet - metadata: [], // None yet -}; -// Provider detection patterns -const PROVIDER_PATTERNS = { - dutchie: { - scripts: [ - /dutchie\.com/i, - /dutchie-plus/i, - /dutchie\.js/i, - /__DUTCHIE__/i, - 
/dutchie-embed/i, - ], - iframes: [ - /dutchie\.com/i, - /dutchie-plus\.com/i, - /embed\.dutchie/i, - ], - html: [ - /class="dutchie/i, - /id="dutchie/i, - /data-dutchie/i, - /"menuType":\s*"dutchie"/i, - ], - apiEndpoints: [ - /dutchie\.com\/graphql/i, - /plus\.dutchie\.com/i, - ], - metaTags: [ - /dutchie/i, - ], - }, - treez: { - scripts: [ - /treez\.io/i, - /treez-ecommerce/i, - /treez\.js/i, - ], - iframes: [ - /treez\.io/i, - /shop\.treez/i, - ], - html: [ - /class="treez/i, - /data-treez/i, - /treez-menu/i, - ], - apiEndpoints: [ - /api\.treez\.io/i, - /treez\.io\/api/i, - ], - metaTags: [], - }, - jane: { - scripts: [ - /jane\.co/i, - /iheartjane\.com/i, - /jane-frame/i, - /jane\.js/i, - ], - iframes: [ - /jane\.co/i, - /iheartjane\.com/i, - /embed\.iheartjane/i, - ], - html: [ - /class="jane/i, - /data-jane/i, - /jane-embed/i, - ], - apiEndpoints: [ - /api\.iheartjane/i, - /jane\.co\/api/i, - ], - metaTags: [], - }, - weedmaps: { - scripts: [ - /weedmaps\.com/i, - /wm-menu/i, - ], - iframes: [ - /weedmaps\.com/i, - /menu\.weedmaps/i, - ], - html: [ - /data-weedmaps/i, - /wm-menu/i, - ], - apiEndpoints: [ - /api-g\.weedmaps/i, - /weedmaps\.com\/api/i, - ], - metaTags: [], - }, - leafly: { - scripts: [ - /leafly\.com/i, - /leafly-menu/i, - ], - iframes: [ - /leafly\.com/i, - /order\.leafly/i, - ], - html: [ - /data-leafly/i, - /leafly-embed/i, - ], - apiEndpoints: [ - /api\.leafly/i, - ], - metaTags: [], - }, -}; -// Category-specific detection signals -const CATEGORY_SIGNALS = { - product: { - urlPatterns: [/\/menu/i, /\/products/i, /\/shop/i, /\/order/i], - htmlPatterns: [/product-card/i, /menu-item/i, /product-list/i, /product-grid/i], - jsonKeys: ['products', 'menuItems', 'items', 'inventory'], - }, - specials: { - urlPatterns: [/\/specials/i, /\/deals/i, /\/promotions/i, /\/offers/i], - htmlPatterns: [/special/i, /deal/i, /promotion/i, /discount/i, /sale/i], - jsonKeys: ['specials', 'deals', 'promotions', 'offers'], - }, - brand: { - urlPatterns: 
[/\/brands/i, /\/vendors/i, /\/producers/i], - htmlPatterns: [/brand-list/i, /vendor/i, /producer/i, /manufacturer/i], - jsonKeys: ['brands', 'vendors', 'producers', 'manufacturers'], - }, - metadata: { - urlPatterns: [/\/categories/i, /\/taxonomy/i], - htmlPatterns: [/category-nav/i, /menu-categories/i, /filter-category/i], - jsonKeys: ['categories', 'taxonomy', 'filters', 'types'], - }, -}; -// ======================================== -// Main Detection Function -// ======================================== -async function detectMultiCategoryProviders(websiteUrl, options = {}) { - const { timeout = 30000, headless = true, existingBrowser } = options; - let browser = null; - let page = null; - const urlsTested = []; - const rawSignals = {}; - try { - browser = existingBrowser || await puppeteer_1.default.launch({ - headless, - args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'], - }); - page = await browser.newPage(); - await page.setViewport({ width: 1920, height: 1080 }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); - // Navigate to main site - const baseUrl = normalizeUrl(websiteUrl); - urlsTested.push(baseUrl); - await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout }); - // Collect signals from main page - const mainPageSignals = await collectPageSignals(page); - rawSignals.mainPage = mainPageSignals; - // Try common menu URLs - const menuUrls = ['/menu', '/shop', '/products', '/order', '/specials', '/deals', '/brands']; - for (const path of menuUrls) { - try { - const fullUrl = new URL(path, baseUrl).toString(); - urlsTested.push(fullUrl); - await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 15000 }); - const signals = await collectPageSignals(page); - rawSignals[path] = signals; - } - catch { - // URL doesn't exist or timed out - } - } - // Analyze signals for each category - const result = { - product: analyzeCategorySignals('product', rawSignals), - specials: 
analyzeCategorySignals('specials', rawSignals), - brand: analyzeCategorySignals('brand', rawSignals), - metadata: analyzeCategorySignals('metadata', rawSignals), - urlsTested, - rawSignals, - }; - logger_1.logger.info('provider-detection', `Multi-category detection complete for ${websiteUrl}`); - return result; - } - catch (error) { - logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`); - // Return unknown results for all categories - return { - product: createUnknownResult(), - specials: createUnknownResult(), - brand: createUnknownResult(), - metadata: createUnknownResult(), - urlsTested, - rawSignals: { error: error.message }, - }; - } - finally { - if (page) - await page.close().catch(() => { }); - if (browser && !existingBrowser) - await browser.close().catch(() => { }); - } -} -// ======================================== -// Helper Functions -// ======================================== -function normalizeUrl(url) { - if (!url.startsWith('http')) { - url = 'https://' + url; - } - return url.replace(/\/$/, ''); -} -async function collectPageSignals(page) { - return page.evaluate(() => { - const signals = { - scripts: [], - iframes: [], - links: [], - metaTags: [], - bodyClasses: document.body?.className || '', - bodyId: document.body?.id || '', - htmlSnippet: document.documentElement.outerHTML.slice(0, 10000), - }; - // Collect script sources - document.querySelectorAll('script[src]').forEach((el) => { - signals.scripts.push(el.src); - }); - // Collect inline scripts - document.querySelectorAll('script:not([src])').forEach((el) => { - const content = el.textContent || ''; - if (content.length < 5000) { - signals.scripts.push(`inline:${content.slice(0, 500)}`); - } - }); - // Collect iframes - document.querySelectorAll('iframe').forEach((el) => { - signals.iframes.push(el.src); - }); - // Collect links - document.querySelectorAll('a[href]').forEach((el) => { - signals.links.push(el.href); - }); - // Collect meta 
tags - document.querySelectorAll('meta').forEach((el) => { - const content = el.getAttribute('content') || ''; - const name = el.getAttribute('name') || el.getAttribute('property') || ''; - if (content || name) { - signals.metaTags.push(`${name}:${content}`); - } - }); - // Look for JSON data - const jsonBlocks = []; - document.querySelectorAll('script[type="application/json"]').forEach((el) => { - jsonBlocks.push(el.textContent?.slice(0, 2000) || ''); - }); - signals.jsonBlocks = jsonBlocks; - return signals; - }); -} -function analyzeCategorySignals(category, allSignals) { - const providerScores = {}; - const detectedSignals = {}; - // Initialize scores - for (const provider of Object.keys(PROVIDER_PATTERNS)) { - providerScores[provider] = 0; - } - // Analyze each page's signals - for (const [pagePath, signals] of Object.entries(allSignals)) { - if (!signals || typeof signals !== 'object') - continue; - // Check for provider-specific patterns - for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) { - let score = 0; - // Check scripts - if (signals.scripts) { - for (const script of signals.scripts) { - for (const pattern of patterns.scripts) { - if (pattern.test(script)) { - score += 20; - detectedSignals[`${provider}_script_${pagePath}`] = script; - } - } - } - } - // Check iframes - if (signals.iframes) { - for (const iframe of signals.iframes) { - for (const pattern of patterns.iframes) { - if (pattern.test(iframe)) { - score += 25; - detectedSignals[`${provider}_iframe_${pagePath}`] = iframe; - } - } - } - } - // Check HTML content - if (signals.htmlSnippet) { - for (const pattern of patterns.html) { - if (pattern.test(signals.htmlSnippet)) { - score += 15; - detectedSignals[`${provider}_html_${pagePath}`] = true; - } - } - } - providerScores[provider] += score; - } - // Check for category-specific signals on relevant pages - const categorySignals = CATEGORY_SIGNALS[category]; - const isRelevantPage = categorySignals.urlPatterns.some((p) => 
p.test(pagePath)); - if (isRelevantPage && signals.htmlSnippet) { - for (const pattern of categorySignals.htmlPatterns) { - if (pattern.test(signals.htmlSnippet)) { - detectedSignals[`${category}_html_pattern`] = true; - } - } - } - // Check JSON blocks for category data - if (signals.jsonBlocks) { - for (const json of signals.jsonBlocks) { - for (const key of categorySignals.jsonKeys) { - if (json.toLowerCase().includes(`"${key}"`)) { - detectedSignals[`${category}_json_key_${key}`] = true; - } - } - } - } - } - // Determine winning provider - let bestProvider = 'unknown'; - let bestScore = 0; - for (const [provider, score] of Object.entries(providerScores)) { - if (score > bestScore) { - bestScore = score; - bestProvider = provider; - } - } - // Calculate confidence (0-100) - const confidence = Math.min(100, bestScore); - // Determine mode based on provider and confidence - const isProductionReady = PRODUCTION_READY[category].includes(bestProvider); - const mode = isProductionReady && confidence >= 70 - ? 
'production' - : 'sandbox'; - // Get template name if available - let templateName; - if (bestProvider === 'dutchie' && category === 'product') { - templateName = 'dutchie_standard'; - } - else if (bestProvider === 'treez') { - templateName = 'treez_products_v0'; - } - return { - provider: bestProvider, - confidence, - mode, - signals: detectedSignals, - templateName, - }; -} -function createUnknownResult() { - return { - provider: 'unknown', - confidence: 0, - mode: 'sandbox', - signals: {}, - }; -} -// ======================================== -// Lightweight Per-Category Change Detection -// ======================================== -async function detectCategoryProviderChange(page, category, expectedProvider) { - try { - const signals = await collectPageSignals(page); - const result = analyzeCategorySignals(category, { currentPage: signals }); - if (result.provider !== expectedProvider && result.confidence > 50) { - logger_1.logger.warn('provider-detection', `Provider change detected for ${category}: ${expectedProvider} -> ${result.provider}`); - return { - changed: true, - newProvider: result.provider, - confidence: result.confidence, - }; - } - return { changed: false }; - } - catch (error) { - logger_1.logger.error('provider-detection', `Change detection failed: ${error.message}`); - return { changed: false }; - } -} -// ======================================== -// Database Operations -// ======================================== -async function updateDispensaryCategoryProvider(dispensaryId, category, result) { - const columnPrefix = category === 'product' ? 'product' : - category === 'specials' ? 'specials' : - category === 'brand' ? 
'brand' : 'metadata'; - await migrate_1.pool.query(`UPDATE dispensaries SET - ${columnPrefix}_provider = $1, - ${columnPrefix}_confidence = $2, - ${columnPrefix}_crawler_mode = $3, - ${columnPrefix}_detection_data = $4, - updated_at = NOW() - WHERE id = $5`, [ - result.provider, - result.confidence, - result.mode, - JSON.stringify(result.signals), - dispensaryId, - ]); -} -async function updateAllCategoryProviders(dispensaryId, result) { - await migrate_1.pool.query(`UPDATE dispensaries SET - product_provider = $1, - product_confidence = $2, - product_crawler_mode = $3, - product_detection_data = $4, - specials_provider = $5, - specials_confidence = $6, - specials_crawler_mode = $7, - specials_detection_data = $8, - brand_provider = $9, - brand_confidence = $10, - brand_crawler_mode = $11, - brand_detection_data = $12, - metadata_provider = $13, - metadata_confidence = $14, - metadata_crawler_mode = $15, - metadata_detection_data = $16, - updated_at = NOW() - WHERE id = $17`, [ - result.product.provider, - result.product.confidence, - result.product.mode, - JSON.stringify(result.product.signals), - result.specials.provider, - result.specials.confidence, - result.specials.mode, - JSON.stringify(result.specials.signals), - result.brand.provider, - result.brand.confidence, - result.brand.mode, - JSON.stringify(result.brand.signals), - result.metadata.provider, - result.metadata.confidence, - result.metadata.mode, - JSON.stringify(result.metadata.signals), - dispensaryId, - ]); -} -async function moveCategoryToSandbox(dispensaryId, category, reason) { - const columnPrefix = category === 'product' ? 'product' : - category === 'specials' ? 'specials' : - category === 'brand' ? 
'brand' : 'metadata'; - await migrate_1.pool.query(`UPDATE dispensaries SET - ${columnPrefix}_crawler_mode = 'sandbox', - ${columnPrefix}_detection_data = ${columnPrefix}_detection_data || $1::jsonb, - updated_at = NOW() - WHERE id = $2`, [ - JSON.stringify({ sandbox_reason: reason, sandbox_at: new Date().toISOString() }), - dispensaryId, - ]); - logger_1.logger.info('provider-detection', `Moved dispensary ${dispensaryId} ${category} to sandbox: ${reason}`); -} diff --git a/backend/dist/services/logger.js b/backend/dist/services/logger.js deleted file mode 100644 index da69295c..00000000 --- a/backend/dist/services/logger.js +++ /dev/null @@ -1,56 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.logger = void 0; -class LogService { - logs = []; - maxLogs = 1000; - log(level, category, message) { - const entry = { - timestamp: new Date(), - level, - category, - message - }; - this.logs.unshift(entry); - if (this.logs.length > this.maxLogs) { - this.logs = this.logs.slice(0, this.maxLogs); - } - const timestamp = entry.timestamp.toISOString(); - const prefix = `[${timestamp}] [${category.toUpperCase()}] [${level.toUpperCase()}]`; - if (level === 'error') { - console.error(prefix, message); - } - else if (level === 'warn') { - console.warn(prefix, message); - } - else { - console.log(prefix, message); - } - } - info(category, message) { - this.log('info', category, message); - } - error(category, message) { - this.log('error', category, message); - } - warn(category, message) { - this.log('warn', category, message); - } - debug(category, message) { - this.log('debug', category, message); - } - getLogs(limit = 100, level, category) { - let filtered = this.logs; - if (level) { - filtered = filtered.filter(log => log.level === level); - } - if (category) { - filtered = filtered.filter(log => log.category === category); - } - return filtered.slice(0, limit); - } - clear() { - this.logs = []; - } -} -exports.logger = new 
LogService(); diff --git a/backend/dist/services/menu-provider-detector.js b/backend/dist/services/menu-provider-detector.js deleted file mode 100644 index f3faa9a9..00000000 --- a/backend/dist/services/menu-provider-detector.js +++ /dev/null @@ -1,612 +0,0 @@ -"use strict"; -/** - * Menu Provider Detection Service - * - * Detects which menu platform a dispensary is using by analyzing: - * - HTML content patterns (scripts, iframes, classes) - * - URL patterns (embedded menu paths) - * - API endpoint signatures - * - Meta tags and headers - */ -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.detectMenuProvider = detectMenuProvider; -exports.quickDutchieCheck = quickDutchieCheck; -exports.detectProviderChange = detectProviderChange; -const puppeteer_1 = __importDefault(require("puppeteer")); -const logger_1 = require("./logger"); -// Provider detection patterns -const PROVIDER_PATTERNS = { - dutchie: { - scripts: [ - /dutchie/i, - /dutchie-plus/i, - /dutchie\.com/i, - /dutchie-embed/i, - ], - iframes: [ - /dutchie\.com/i, - /embed\.dutchie/i, - /iframe\.dutchie/i, - ], - classes: [ - /dutchie-/i, - /DutchieEmbed/i, - ], - urls: [ - /dutchie\.com/i, - /\.dutchie\./i, - ], - meta: [ - /dutchie/i, - ], - apiEndpoints: [ - /graphql.*dutchie/i, - /api\.dutchie/i, - ], - htmlPatterns: [ - /data-dutchie/i, - /__DUTCHIE__/i, - /dutchie-plus-iframe/i, - ], - }, - treez: { - scripts: [ - /treez/i, - /treez\.io/i, - /treezpay/i, - ], - iframes: [ - /treez\.io/i, - /menu\.treez/i, - ], - classes: [ - /treez-/i, - ], - urls: [ - /treez\.io/i, - /\.treez\./i, - ], - meta: [ - /treez/i, - ], - apiEndpoints: [ - /api\.treez/i, - ], - htmlPatterns: [ - /data-treez/i, - /treez-embed/i, - ], - }, - jane: { - scripts: [ - /jane\.co/i, - /iheartjane/i, - /jane-embed/i, - /janetechnologies/i, - ], - iframes: [ - /jane\.co/i, - 
/iheartjane\.com/i, - /menu\.jane/i, - ], - classes: [ - /jane-/i, - /iheartjane/i, - ], - urls: [ - /jane\.co/i, - /iheartjane\.com/i, - ], - meta: [ - /jane/i, - /iheartjane/i, - ], - apiEndpoints: [ - /api\.iheartjane/i, - /api\.jane\.co/i, - ], - htmlPatterns: [ - /data-jane/i, - /jane-root/i, - /jane-embed/i, - ], - }, - weedmaps: { - scripts: [ - /weedmaps/i, - /wm\.com/i, - ], - iframes: [ - /weedmaps\.com/i, - /menu\.weedmaps/i, - ], - classes: [ - /weedmaps-/i, - /wm-/i, - ], - urls: [ - /weedmaps\.com/i, - ], - meta: [ - /weedmaps/i, - ], - apiEndpoints: [ - /api.*weedmaps/i, - ], - htmlPatterns: [ - /data-weedmaps/i, - ], - }, - leafly: { - scripts: [ - /leafly/i, - /leafly\.com/i, - ], - iframes: [ - /leafly\.com/i, - /menu\.leafly/i, - ], - classes: [ - /leafly-/i, - ], - urls: [ - /leafly\.com/i, - ], - meta: [ - /leafly/i, - ], - apiEndpoints: [ - /api\.leafly/i, - ], - htmlPatterns: [ - /data-leafly/i, - ], - }, - meadow: { - scripts: [ - /meadow/i, - /getmeadow/i, - ], - iframes: [ - /getmeadow\.com/i, - ], - classes: [ - /meadow-/i, - ], - urls: [ - /getmeadow\.com/i, - ], - meta: [], - apiEndpoints: [ - /api\.getmeadow/i, - ], - htmlPatterns: [], - }, - greenlight: { - scripts: [ - /greenlight/i, - /greenlightmenu/i, - ], - iframes: [ - /greenlight/i, - ], - classes: [ - /greenlight-/i, - ], - urls: [ - /greenlight/i, - ], - meta: [], - apiEndpoints: [], - htmlPatterns: [], - }, - blaze: { - scripts: [ - /blaze\.me/i, - /blazepos/i, - ], - iframes: [ - /blaze\.me/i, - ], - classes: [ - /blaze-/i, - ], - urls: [ - /blaze\.me/i, - ], - meta: [], - apiEndpoints: [ - /api\.blaze/i, - ], - htmlPatterns: [], - }, - flowhub: { - scripts: [ - /flowhub/i, - ], - iframes: [ - /flowhub\.com/i, - ], - classes: [ - /flowhub-/i, - ], - urls: [ - /flowhub\.com/i, - ], - meta: [], - apiEndpoints: [], - htmlPatterns: [], - }, - dispense: { - scripts: [ - /dispenseapp/i, - ], - iframes: [ - /dispenseapp\.com/i, - ], - classes: [ - /dispense-/i, - ], - urls: [ - 
/dispenseapp\.com/i, - ], - meta: [], - apiEndpoints: [], - htmlPatterns: [], - }, - cova: { - scripts: [ - /covasoftware/i, - /cova\.software/i, - ], - iframes: [ - /cova/i, - ], - classes: [ - /cova-/i, - ], - urls: [ - /cova/i, - ], - meta: [], - apiEndpoints: [], - htmlPatterns: [], - }, -}; -// Common menu URL paths to check -const MENU_PATHS = [ - '/menu', - '/shop', - '/products', - '/order', - '/store', - '/dispensary-menu', - '/online-menu', - '/shop-all', - '/browse', - '/catalog', -]; -/** - * Analyze a single page for provider signals - */ -async function analyzePageForProviders(page, url) { - const signals = []; - try { - // Get page HTML - const html = await page.content(); - const lowerHtml = html.toLowerCase(); - // Check each provider's patterns - for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) { - // Check script sources - const scripts = await page.$$eval('script[src]', els => els.map(el => el.getAttribute('src') || '')); - for (const script of scripts) { - for (const pattern of patterns.scripts) { - if (pattern.test(script)) { - signals.push({ - provider: provider, - confidence: 90, - source: 'script_src', - details: script, - }); - } - } - } - // Check inline scripts - const inlineScripts = await page.$$eval('script:not([src])', els => els.map(el => el.textContent || '')); - for (const scriptContent of inlineScripts) { - for (const pattern of patterns.scripts) { - if (pattern.test(scriptContent)) { - signals.push({ - provider: provider, - confidence: 70, - source: 'inline_script', - details: `Pattern: ${pattern}`, - }); - } - } - } - // Check iframes - const iframes = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || '')); - for (const iframe of iframes) { - for (const pattern of patterns.iframes) { - if (pattern.test(iframe)) { - signals.push({ - provider: provider, - confidence: 95, - source: 'iframe_src', - details: iframe, - }); - } - } - } - // Check HTML patterns - for (const pattern of 
patterns.htmlPatterns) { - if (pattern.test(html)) { - signals.push({ - provider: provider, - confidence: 85, - source: 'html_pattern', - details: `Pattern: ${pattern}`, - }); - } - } - // Check CSS classes - for (const pattern of patterns.classes) { - if (pattern.test(html)) { - signals.push({ - provider: provider, - confidence: 60, - source: 'css_class', - details: `Pattern: ${pattern}`, - }); - } - } - // Check meta tags - const metaTags = await page.$$eval('meta', els => els.map(el => `${el.getAttribute('name')} ${el.getAttribute('content')}`)); - for (const meta of metaTags) { - for (const pattern of patterns.meta) { - if (pattern.test(meta)) { - signals.push({ - provider: provider, - confidence: 80, - source: 'meta_tag', - details: meta, - }); - } - } - } - } - // Check for network requests (if we intercepted them) - // This would be enhanced with request interception - } - catch (error) { - logger_1.logger.error('provider-detection', `Error analyzing page ${url}: ${error}`); - } - return signals; -} -/** - * Aggregate signals into a final detection result - */ -function aggregateSignals(signals) { - if (signals.length === 0) { - return { provider: 'unknown', confidence: 0 }; - } - // Group signals by provider - const providerScores = {}; - for (const signal of signals) { - if (!providerScores[signal.provider]) { - providerScores[signal.provider] = []; - } - providerScores[signal.provider].push(signal.confidence); - } - // Calculate weighted score for each provider - const scores = []; - for (const [provider, confidences] of Object.entries(providerScores)) { - // Use max confidence + bonus for multiple signals - const maxConf = Math.max(...confidences); - const multiSignalBonus = Math.min(10, (confidences.length - 1) * 3); - const score = Math.min(100, maxConf + multiSignalBonus); - scores.push({ provider: provider, score }); - } - // Sort by score descending - scores.sort((a, b) => b.score - a.score); - const best = scores[0]; - // If there's a clear winner 
(20+ point lead), use it - if (scores.length === 1 || best.score - scores[1].score >= 20) { - return { provider: best.provider, confidence: best.score }; - } - // Multiple contenders - reduce confidence - return { provider: best.provider, confidence: Math.max(50, best.score - 20) }; -} -/** - * Detect the menu provider for a dispensary - */ -async function detectMenuProvider(websiteUrl, options = {}) { - const { checkMenuPaths = true, timeout = 30000 } = options; - const result = { - provider: 'unknown', - confidence: 0, - signals: [], - urlsTested: [], - menuEntryPoints: [], - rawSignals: {}, - }; - let browser = null; - try { - // Normalize URL - let baseUrl = websiteUrl.trim(); - if (!baseUrl.startsWith('http')) { - baseUrl = `https://${baseUrl}`; - } - baseUrl = baseUrl.replace(/\/$/, ''); // Remove trailing slash - // Launch browser - browser = await puppeteer_1.default.launch({ - headless: true, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - ], - }); - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); - // Track network requests for API detection - const apiRequests = []; - await page.setRequestInterception(true); - page.on('request', (request) => { - const url = request.url(); - if (url.includes('api') || url.includes('graphql')) { - apiRequests.push(url); - } - request.continue(); - }); - // URLs to check - const urlsToCheck = [baseUrl]; - if (checkMenuPaths) { - for (const path of MENU_PATHS) { - urlsToCheck.push(`${baseUrl}${path}`); - } - } - // Check each URL - for (const url of urlsToCheck) { - try { - result.urlsTested.push(url); - await page.goto(url, { - waitUntil: 'networkidle2', - timeout, - }); - // Wait a bit for dynamic content - await new Promise(r => setTimeout(r, 2000)); - // Analyze page - const pageSignals = await analyzePageForProviders(page, url); - 
result.signals.push(...pageSignals); - // Track if this URL has menu content - const hasMenuContent = await page.evaluate(() => { - const text = document.body.innerText.toLowerCase(); - return (text.includes('add to cart') || - text.includes('add to bag') || - text.includes('product') || - text.includes('indica') || - text.includes('sativa') || - text.includes('hybrid') || - text.includes('thc') || - text.includes('cbd')); - }); - if (hasMenuContent && url !== baseUrl) { - result.menuEntryPoints.push(url); - } - } - catch (pageError) { - // 404s are fine, just skip - if (!pageError.message?.includes('404')) { - logger_1.logger.warn('provider-detection', `Could not load ${url}: ${pageError.message}`); - } - } - } - // Check API requests for provider hints - for (const apiUrl of apiRequests) { - for (const [provider, patterns] of Object.entries(PROVIDER_PATTERNS)) { - for (const pattern of patterns.apiEndpoints) { - if (pattern.test(apiUrl)) { - result.signals.push({ - provider: provider, - confidence: 95, - source: 'api_request', - details: apiUrl, - }); - } - } - } - } - // Record raw signals - result.rawSignals = { - apiRequestsFound: apiRequests.length, - menuEntryPointsFound: result.menuEntryPoints.length, - totalSignals: result.signals.length, - uniqueProviders: [...new Set(result.signals.map(s => s.provider))].length, - }; - // Aggregate signals into final result - const aggregated = aggregateSignals(result.signals); - result.provider = aggregated.provider; - result.confidence = aggregated.confidence; - } - catch (error) { - result.error = error.message; - logger_1.logger.error('provider-detection', `Detection failed for ${websiteUrl}: ${error.message}`); - } - finally { - if (browser) { - await browser.close(); - } - } - return result; -} -/** - * Quick check if a site has Dutchie - used during production crawls - */ -async function quickDutchieCheck(page) { - try { - const html = await page.content(); - // Check for Dutchie-specific patterns - const 
dutchiePatterns = [ - /dutchie/i, - /dutchie-plus/i, - /__DUTCHIE__/i, - /data-dutchie/i, - /embed\.dutchie/i, - ]; - for (const pattern of dutchiePatterns) { - if (pattern.test(html)) { - return true; - } - } - // Check iframes - const iframes = await page.$$eval('iframe', els => els.map(el => el.getAttribute('src') || '')); - for (const iframe of iframes) { - if (/dutchie/i.test(iframe)) { - return true; - } - } - return false; - } - catch { - return false; - } -} -/** - * Check if provider has changed from expected - */ -async function detectProviderChange(page, expectedProvider) { - try { - const signals = await analyzePageForProviders(page, page.url()); - const aggregated = aggregateSignals(signals); - // If we expected Dutchie but found something else with high confidence - if (expectedProvider === 'dutchie' && aggregated.provider !== 'dutchie' && aggregated.confidence >= 70) { - return { - changed: true, - newProvider: aggregated.provider, - confidence: aggregated.confidence, - }; - } - // If we expected Dutchie and found nothing/low confidence, might have switched - if (expectedProvider === 'dutchie' && aggregated.confidence < 30) { - // Check if Dutchie is definitely NOT present - const hasDutchie = await quickDutchieCheck(page); - if (!hasDutchie) { - return { - changed: true, - newProvider: aggregated.provider !== 'unknown' ? aggregated.provider : 'other', - confidence: Math.max(30, aggregated.confidence), - }; - } - } - return { changed: false }; - } - catch { - return { changed: false }; - } -} diff --git a/backend/dist/services/proxy.js b/backend/dist/services/proxy.js deleted file mode 100644 index 0989c314..00000000 --- a/backend/dist/services/proxy.js +++ /dev/null @@ -1,323 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.isBotDetectionError = isBotDetectionError; -exports.putProxyInTimeout = putProxyInTimeout; -exports.isProxyInTimeout = isProxyInTimeout; -exports.getActiveProxy = getActiveProxy; -exports.testProxy = testProxy; -exports.saveProxyTestResult = saveProxyTestResult; -exports.testAllProxies = testAllProxies; -exports.addProxy = addProxy; -exports.addProxiesFromList = addProxiesFromList; -exports.moveProxyToFailed = moveProxyToFailed; -exports.incrementProxyFailure = incrementProxyFailure; -const axios_1 = __importDefault(require("axios")); -const socks_proxy_agent_1 = require("socks-proxy-agent"); -const https_proxy_agent_1 = require("https-proxy-agent"); -const migrate_1 = require("../db/migrate"); -// In-memory proxy timeout tracking -// Maps proxy ID to timestamp when timeout expires -const proxyTimeouts = new Map(); -const PROXY_TIMEOUT_MS = 35000; // 35 seconds timeout for bot-detected proxies -// Check if error message indicates bot detection -function isBotDetectionError(errorMsg) { - const botPatterns = [ - /bot detection/i, - /captcha/i, - /challenge/i, - /cloudflare/i, - /access denied/i, - /rate limit/i, - /too many requests/i, - /temporarily blocked/i, - /suspicious activity/i, - ]; - return botPatterns.some(pattern => pattern.test(errorMsg)); -} -// Put proxy in timeout (bot detection cooldown) -function putProxyInTimeout(proxyId, reason) { - const timeoutUntil = Date.now() + PROXY_TIMEOUT_MS; - proxyTimeouts.set(proxyId, timeoutUntil); - console.log(`🚫 Proxy ${proxyId} in timeout for ${PROXY_TIMEOUT_MS / 1000}s: ${reason}`); -} -// Check if proxy is currently in timeout -function isProxyInTimeout(proxyId) { - const timeoutUntil = proxyTimeouts.get(proxyId); - if (!timeoutUntil) - return false; - if (Date.now() >= timeoutUntil) { - // Timeout expired, remove it - proxyTimeouts.delete(proxyId); - console.log(`✅ Proxy ${proxyId} timeout expired, back in 
rotation`); - return false; - } - return true; -} -// Get active proxy that's not in timeout -async function getActiveProxy() { - const result = await migrate_1.pool.query(` - SELECT id, host, port, protocol, username, password - FROM proxies - WHERE active = true - ORDER BY RANDOM() - `); - // Filter out proxies in timeout - for (const proxy of result.rows) { - if (!isProxyInTimeout(proxy.id)) { - return proxy; - } - } - // All proxies are in timeout, wait for first one to expire - if (proxyTimeouts.size > 0) { - const nextAvailable = Math.min(...Array.from(proxyTimeouts.values())); - const waitTime = Math.max(0, nextAvailable - Date.now()); - console.log(`⏳ All proxies in timeout, waiting ${Math.ceil(waitTime / 1000)}s for next available...`); - await new Promise(resolve => setTimeout(resolve, waitTime)); - // Try again after waiting - return getActiveProxy(); - } - console.log('⚠️ No active proxies available'); - return null; -} -async function getSettings() { - const result = await migrate_1.pool.query(` - SELECT key, value FROM settings - WHERE key IN ('proxy_timeout_ms', 'proxy_test_url') - `); - const settings = {}; - result.rows.forEach((row) => { - settings[row.key] = row.value; - }); - return { - timeout: parseInt(settings.proxy_timeout_ms || '3000'), - testUrl: settings.proxy_test_url || 'https://httpbin.org/ip' - }; -} -async function testProxy(host, port, protocol, username, password) { - try { - const { timeout, testUrl } = await getSettings(); - const startTime = Date.now(); - // Construct proxy URL - let proxyUrl; - if (username && password) { - proxyUrl = `${protocol}://${username}:${password}@${host}:${port}`; - } - else { - proxyUrl = `${protocol}://${host}:${port}`; - } - // Create appropriate agent based on protocol - let agent; - if (protocol === 'socks5' || protocol === 'socks') { - agent = new socks_proxy_agent_1.SocksProxyAgent(proxyUrl); - } - else if (protocol === 'http' || protocol === 'https') { - agent = new 
https_proxy_agent_1.HttpsProxyAgent(proxyUrl); - } - else { - return { - success: false, - error: `Unsupported protocol: ${protocol}` - }; - } - // Make test request - const response = await axios_1.default.get(testUrl, { - httpAgent: agent, - httpsAgent: agent, - timeout, - }); - const responseTimeMs = Date.now() - startTime; - // Check anonymity - the test URL should return our IP - // If it returns the proxy's IP, we're anonymous - let isAnonymous = false; - if (response.data && response.data.origin) { - // If the returned IP is different from our actual IP, the proxy is working - // For simplicity, we'll consider it anonymous if we get a response - isAnonymous = true; - } - return { - success: true, - responseTimeMs, - isAnonymous - }; - } - catch (error) { - return { - success: false, - error: error.message || 'Unknown error' - }; - } -} -async function saveProxyTestResult(proxyId, result) { - await migrate_1.pool.query(` - UPDATE proxies - SET last_tested_at = CURRENT_TIMESTAMP, - test_result = $1, - response_time_ms = $2, - is_anonymous = $3, - active = $4, - updated_at = CURRENT_TIMESTAMP - WHERE id = $5 - `, [ - result.success ? 
'success' : 'failed', - result.responseTimeMs || null, - result.isAnonymous || false, - result.success, - proxyId - ]); -} -async function testAllProxies() { - console.log('🔍 Testing all proxies...'); - const result = await migrate_1.pool.query(` - SELECT id, host, port, protocol, username, password - FROM proxies - `); - for (const proxy of result.rows) { - console.log(`Testing proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`); - const testResult = await testProxy(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password); - await saveProxyTestResult(proxy.id, testResult); - if (testResult.success) { - console.log(`✅ Proxy OK (${testResult.responseTimeMs}ms, anonymous: ${testResult.isAnonymous})`); - } - else { - console.log(`❌ Proxy failed: ${testResult.error}`); - } - // Small delay between tests - await new Promise(resolve => setTimeout(resolve, 500)); - } - console.log('✅ Proxy testing complete'); -} -async function addProxy(host, port, protocol, username, password) { - // Test the proxy first - const testResult = await testProxy(host, port, protocol, username, password); - if (!testResult.success) { - throw new Error(`Proxy test failed: ${testResult.error}`); - } - // Insert into database - const result = await migrate_1.pool.query(` - INSERT INTO proxies (host, port, protocol, username, password, active, is_anonymous, test_result, response_time_ms, last_tested_at) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, CURRENT_TIMESTAMP) - RETURNING id - `, [ - host, - port, - protocol, - username, - password, - testResult.success, - testResult.isAnonymous, - 'success', - testResult.responseTimeMs - ]); - return result.rows[0].id; -} -async function addProxiesFromList(proxies) { - let added = 0; - let failed = 0; - let duplicates = 0; - const errors = []; - console.log(`📥 Importing ${proxies.length} proxies without testing...`); - for (const proxy of proxies) { - try { - // Insert without testing first - await migrate_1.pool.query(` - INSERT INTO 
proxies (host, port, protocol, username, password, active) - VALUES ($1, $2, $3, $4, $5, false) - ON CONFLICT (host, port, protocol) DO NOTHING - `, [ - proxy.host, - proxy.port, - proxy.protocol, - proxy.username, - proxy.password - ]); - // Check if it was actually inserted - const result = await migrate_1.pool.query(` - SELECT id FROM proxies - WHERE host = $1 AND port = $2 AND protocol = $3 - `, [proxy.host, proxy.port, proxy.protocol]); - if (result.rows.length > 0) { - // Check if it was just inserted (no last_tested_at means new) - const checkResult = await migrate_1.pool.query(` - SELECT last_tested_at FROM proxies - WHERE host = $1 AND port = $2 AND protocol = $3 - `, [proxy.host, proxy.port, proxy.protocol]); - if (checkResult.rows[0].last_tested_at === null) { - added++; - if (added % 100 === 0) { - console.log(`📥 Imported ${added} proxies...`); - } - } - else { - duplicates++; - } - } - } - catch (error) { - failed++; - const errorMsg = `${proxy.host}:${proxy.port} - ${error.message}`; - errors.push(errorMsg); - console.log(`❌ Failed to add proxy: ${errorMsg}`); - } - } - console.log(`✅ Import complete: ${added} added, ${duplicates} duplicates, ${failed} failed`); - return { added, failed, duplicates, errors }; -} -async function moveProxyToFailed(proxyId, errorMsg) { - // Get proxy details - const proxyResult = await migrate_1.pool.query(` - SELECT host, port, protocol, username, password, failure_count - FROM proxies - WHERE id = $1 - `, [proxyId]); - if (proxyResult.rows.length === 0) { - return; - } - const proxy = proxyResult.rows[0]; - // Insert into failed_proxies table - await migrate_1.pool.query(` - INSERT INTO failed_proxies (host, port, protocol, username, password, failure_count, last_error) - VALUES ($1, $2, $3, $4, $5, $6, $7) - ON CONFLICT (host, port, protocol) - DO UPDATE SET - failure_count = $6, - last_error = $7, - failed_at = CURRENT_TIMESTAMP - `, [ - proxy.host, - proxy.port, - proxy.protocol, - proxy.username, - proxy.password, 
- proxy.failure_count, - errorMsg - ]); - // Delete from active proxies - await migrate_1.pool.query(`DELETE FROM proxies WHERE id = $1`, [proxyId]); - console.log(`🔴 Moved proxy to failed: ${proxy.protocol}://${proxy.host}:${proxy.port} (${proxy.failure_count} failures)`); -} -async function incrementProxyFailure(proxyId, errorMsg) { - // Increment failure count - const result = await migrate_1.pool.query(` - UPDATE proxies - SET failure_count = failure_count + 1, - active = false, - updated_at = CURRENT_TIMESTAMP - WHERE id = $1 - RETURNING failure_count, host, port, protocol - `, [proxyId]); - if (result.rows.length === 0) { - return false; - } - const proxy = result.rows[0]; - const failureCount = proxy.failure_count; - console.log(`⚠️ Proxy failure #${failureCount}: ${proxy.protocol}://${proxy.host}:${proxy.port}`); - // If failed 3 times, move to failed table - if (failureCount >= 3) { - await moveProxyToFailed(proxyId, errorMsg); - return true; // Moved to failed - } - return false; // Still in active proxies -} diff --git a/backend/dist/services/proxyTestQueue.js b/backend/dist/services/proxyTestQueue.js deleted file mode 100644 index e79c5735..00000000 --- a/backend/dist/services/proxyTestQueue.js +++ /dev/null @@ -1,174 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.cleanupOrphanedJobs = cleanupOrphanedJobs; -exports.createProxyTestJob = createProxyTestJob; -exports.getProxyTestJob = getProxyTestJob; -exports.getActiveProxyTestJob = getActiveProxyTestJob; -exports.cancelProxyTestJob = cancelProxyTestJob; -const migrate_1 = require("../db/migrate"); -const proxy_1 = require("./proxy"); -// Simple in-memory queue - could be replaced with Bull/Bee-Queue for production -const activeJobs = new Map(); -// Clean up orphaned jobs on server startup -async function cleanupOrphanedJobs() { - try { - const result = await migrate_1.pool.query(` - UPDATE proxy_test_jobs - SET status = 'cancelled', - completed_at = 
CURRENT_TIMESTAMP, - updated_at = CURRENT_TIMESTAMP - WHERE status IN ('pending', 'running') - RETURNING id - `); - if (result.rows.length > 0) { - console.log(`🧹 Cleaned up ${result.rows.length} orphaned proxy test jobs`); - } - } - catch (error) { - console.error('Error cleaning up orphaned jobs:', error); - } -} -async function createProxyTestJob() { - // Check for existing running jobs first - const existingJob = await getActiveProxyTestJob(); - if (existingJob) { - throw new Error('A proxy test job is already running. Please cancel it first.'); - } - const result = await migrate_1.pool.query(` - SELECT COUNT(*) as count FROM proxies - `); - const totalProxies = parseInt(result.rows[0].count); - const jobResult = await migrate_1.pool.query(` - INSERT INTO proxy_test_jobs (status, total_proxies) - VALUES ('pending', $1) - RETURNING id - `, [totalProxies]); - const jobId = jobResult.rows[0].id; - // Start job in background - runProxyTestJob(jobId).catch(err => { - console.error(`❌ Proxy test job ${jobId} failed:`, err); - }); - return jobId; -} -async function getProxyTestJob(jobId) { - const result = await migrate_1.pool.query(` - SELECT id, status, total_proxies, tested_proxies, passed_proxies, failed_proxies - FROM proxy_test_jobs - WHERE id = $1 - `, [jobId]); - if (result.rows.length === 0) { - return null; - } - return result.rows[0]; -} -async function getActiveProxyTestJob() { - const result = await migrate_1.pool.query(` - SELECT id, status, total_proxies, tested_proxies, passed_proxies, failed_proxies - FROM proxy_test_jobs - WHERE status IN ('pending', 'running') - ORDER BY created_at DESC - LIMIT 1 - `); - if (result.rows.length === 0) { - return null; - } - return result.rows[0]; -} -async function cancelProxyTestJob(jobId) { - // Try to cancel in-memory job first - const jobControl = activeJobs.get(jobId); - if (jobControl) { - jobControl.cancelled = true; - } - // Always update database to handle orphaned jobs - const result = await 
migrate_1.pool.query(` - UPDATE proxy_test_jobs - SET status = 'cancelled', - completed_at = CURRENT_TIMESTAMP, - updated_at = CURRENT_TIMESTAMP - WHERE id = $1 AND status IN ('pending', 'running') - RETURNING id - `, [jobId]); - return result.rows.length > 0; -} -async function runProxyTestJob(jobId) { - // Register job as active - activeJobs.set(jobId, { cancelled: false }); - try { - // Update status to running - await migrate_1.pool.query(` - UPDATE proxy_test_jobs - SET status = 'running', - started_at = CURRENT_TIMESTAMP, - updated_at = CURRENT_TIMESTAMP - WHERE id = $1 - `, [jobId]); - console.log(`🔍 Starting proxy test job ${jobId}...`); - // Get all proxies - const result = await migrate_1.pool.query(` - SELECT id, host, port, protocol, username, password - FROM proxies - ORDER BY id - `); - let tested = 0; - let passed = 0; - let failed = 0; - for (const proxy of result.rows) { - // Check if job was cancelled - const jobControl = activeJobs.get(jobId); - if (jobControl?.cancelled) { - console.log(`⏸️ Proxy test job ${jobId} cancelled`); - break; - } - // Test the proxy - const testResult = await (0, proxy_1.testProxy)(proxy.host, proxy.port, proxy.protocol, proxy.username, proxy.password); - // Save result - await (0, proxy_1.saveProxyTestResult)(proxy.id, testResult); - tested++; - if (testResult.success) { - passed++; - } - else { - failed++; - } - // Update job progress - await migrate_1.pool.query(` - UPDATE proxy_test_jobs - SET tested_proxies = $1, - passed_proxies = $2, - failed_proxies = $3, - updated_at = CURRENT_TIMESTAMP - WHERE id = $4 - `, [tested, passed, failed, jobId]); - // Log progress every 10 proxies - if (tested % 10 === 0) { - console.log(`📊 Job ${jobId}: ${tested}/${result.rows.length} proxies tested (${passed} passed, ${failed} failed)`); - } - } - // Mark job as completed - const jobControl = activeJobs.get(jobId); - const finalStatus = jobControl?.cancelled ? 
'cancelled' : 'completed'; - await migrate_1.pool.query(` - UPDATE proxy_test_jobs - SET status = $1, - completed_at = CURRENT_TIMESTAMP, - updated_at = CURRENT_TIMESTAMP - WHERE id = $2 - `, [finalStatus, jobId]); - console.log(`✅ Proxy test job ${jobId} ${finalStatus}: ${tested} tested, ${passed} passed, ${failed} failed`); - } - catch (error) { - console.error(`❌ Proxy test job ${jobId} error:`, error); - await migrate_1.pool.query(` - UPDATE proxy_test_jobs - SET status = 'failed', - completed_at = CURRENT_TIMESTAMP, - updated_at = CURRENT_TIMESTAMP - WHERE id = $1 - `, [jobId]); - } - finally { - // Remove from active jobs - activeJobs.delete(jobId); - } -} diff --git a/backend/dist/services/scheduler.js b/backend/dist/services/scheduler.js deleted file mode 100644 index dfa670a4..00000000 --- a/backend/dist/services/scheduler.js +++ /dev/null @@ -1,104 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.startScheduler = startScheduler; -exports.stopScheduler = stopScheduler; -exports.restartScheduler = restartScheduler; -exports.triggerStoreScrape = triggerStoreScrape; -exports.triggerAllStoresScrape = triggerAllStoresScrape; -const node_cron_1 = __importDefault(require("node-cron")); -const migrate_1 = require("../db/migrate"); -const scraper_v2_1 = require("../scraper-v2"); -let scheduledJobs = []; -async function getSettings() { - const result = await migrate_1.pool.query(` - SELECT key, value FROM settings - WHERE key IN ('scrape_interval_hours', 'scrape_specials_time') - `); - const settings = {}; - result.rows.forEach((row) => { - settings[row.key] = row.value; - }); - return { - scrapeIntervalHours: parseInt(settings.scrape_interval_hours || '4'), - scrapeSpecialsTime: settings.scrape_specials_time || '00:01' - }; -} -async function scrapeAllStores() { - console.log('🔄 Starting scheduled scrape for all stores...'); - const result = await migrate_1.pool.query(` - SELECT id, name FROM stores WHERE active = true AND scrape_enabled = true - `); - for (const store of result.rows) { - try { - console.log(`Scraping store: ${store.name}`); - await (0, scraper_v2_1.scrapeStore)(store.id); - } - catch (error) { - console.error(`Failed to scrape store ${store.name}:`, error); - } - } - console.log('✅ Scheduled scrape completed'); -} -async function scrapeSpecials() { - console.log('🌟 Starting scheduled specials scrape...'); - const result = await migrate_1.pool.query(` - SELECT s.id, s.name, c.id as category_id - FROM stores s - JOIN categories c ON c.store_id = s.id - WHERE s.active = true AND s.scrape_enabled = true - AND c.slug = 'specials' AND c.scrape_enabled = true - `); - for (const row of result.rows) { - try { - console.log(`Scraping specials for: ${row.name}`); - await (0, scraper_v2_1.scrapeCategory)(row.id, row.category_id); - } - catch (error) { - 
console.error(`Failed to scrape specials for ${row.name}:`, error); - } - } - console.log('✅ Specials scrape completed'); -} -async function startScheduler() { - // Stop any existing jobs - stopScheduler(); - const settings = await getSettings(); - // Schedule regular store scrapes (every N hours) - const scrapeIntervalCron = `0 */${settings.scrapeIntervalHours} * * *`; - const storeJob = node_cron_1.default.schedule(scrapeIntervalCron, scrapeAllStores); - scheduledJobs.push(storeJob); - console.log(`📅 Scheduled store scraping: every ${settings.scrapeIntervalHours} hours`); - // Schedule specials scraping (daily at specified time) - const [hours, minutes] = settings.scrapeSpecialsTime.split(':'); - const specialsCron = `${minutes} ${hours} * * *`; - const specialsJob = node_cron_1.default.schedule(specialsCron, scrapeSpecials); - scheduledJobs.push(specialsJob); - console.log(`📅 Scheduled specials scraping: daily at ${settings.scrapeSpecialsTime}`); - // Initial scrape on startup (after 10 seconds) - setTimeout(() => { - console.log('🚀 Running initial scrape...'); - scrapeAllStores().catch(console.error); - }, 10000); -} -function stopScheduler() { - scheduledJobs.forEach(job => job.stop()); - scheduledJobs = []; - console.log('🛑 Scheduler stopped'); -} -async function restartScheduler() { - console.log('🔄 Restarting scheduler...'); - stopScheduler(); - await startScheduler(); -} -// Manual trigger functions for admin -async function triggerStoreScrape(storeId) { - console.log(`🔧 Manual scrape triggered for store ID: ${storeId}`); - await (0, scraper_v2_1.scrapeStore)(storeId); -} -async function triggerAllStoresScrape() { - console.log('🔧 Manual scrape triggered for all stores'); - await scrapeAllStores(); -} diff --git a/backend/dist/services/scraper-debug.js b/backend/dist/services/scraper-debug.js deleted file mode 100644 index 2050279f..00000000 --- a/backend/dist/services/scraper-debug.js +++ /dev/null @@ -1,83 +0,0 @@ -"use strict"; -var __importDefault = 
(this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.debugDutchiePage = debugDutchiePage; -const puppeteer_extra_1 = __importDefault(require("puppeteer-extra")); -const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); -const logger_1 = require("./logger"); -// Apply stealth plugin -puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)()); -async function debugDutchiePage(url) { - const browser = await puppeteer_extra_1.default.launch({ - headless: 'new', - args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'] - }); - const page = await browser.newPage(); - await page.setViewport({ width: 1920, height: 1080 }); - await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); - logger_1.logger.info('scraper', `Loading: ${url}`); - try { - await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 }); - logger_1.logger.info('scraper', 'Page loaded, waiting for content...'); - // Wait for content to render - await page.waitForTimeout(8000); - const debug = await page.evaluate(() => { - // Try to find product cards - const productSelectors = [ - '[data-testid*="product"]', - '[class*="Product"]', - '[class*="product"]', - 'article', - '[role="article"]', - 'li' - ]; - const results = { - selectors: {} - }; - for (const selector of productSelectors) { - const elements = document.querySelectorAll(selector); - results.selectors[selector] = elements.length; - } - // Get sample HTML from first few matches - const firstMatch = document.querySelector('[class*="product" i], article, [data-testid*="product"]'); - if (firstMatch) { - results.sampleHTML = firstMatch.outerHTML.substring(0, 1000); - results.sampleText = firstMatch.textContent?.substring(0, 500); - } - // 
Get all class names that might be products - const allElements = document.querySelectorAll('*'); - const classNames = new Set(); - allElements.forEach(el => { - const classes = el.className; - if (typeof classes === 'string' && classes.toLowerCase().includes('product')) { - classes.split(' ').forEach(c => classNames.add(c)); - } - }); - results.productClasses = Array.from(classNames).slice(0, 20); - results.bodyTextSample = document.body.innerText.substring(0, 500); - return results; - }); - logger_1.logger.info('scraper', `Debug results:\n${JSON.stringify(debug, null, 2)}`); - } - catch (error) { - logger_1.logger.error('scraper', `Debug navigation error: ${error}`); - // Try to get whatever we can - try { - const partialDebug = await page.evaluate(() => { - return { - url: window.location.href, - title: document.title, - bodyLength: document.body?.innerHTML?.length || 0, - bodyStart: document.body?.innerHTML?.substring(0, 500) || '' - }; - }); - logger_1.logger.info('scraper', `Partial debug:\n${JSON.stringify(partialDebug, null, 2)}`); - } - catch (e) { - logger_1.logger.error('scraper', `Could not get partial debug: ${e}`); - } - } - await browser.close(); -} diff --git a/backend/dist/services/scraper-playwright.js b/backend/dist/services/scraper-playwright.js deleted file mode 100644 index ad2ec2fa..00000000 --- a/backend/dist/services/scraper-playwright.js +++ /dev/null @@ -1,236 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.scrapeCategoryPlaywright = scrapeCategoryPlaywright; -exports.testScrapeCategoryPlaywright = testScrapeCategoryPlaywright; -const age_gate_playwright_1 = require("../utils/age-gate-playwright"); -const logger_1 = require("./logger"); -const stealthBrowser_1 = require("../utils/stealthBrowser"); -const dutchie_1 = require("../scrapers/templates/dutchie"); -/** - * Scrapes a category page using Playwright with stealth mode to extract product information - */ -async function 
scrapeCategoryPlaywright(categoryUrl, categoryName, state = 'Arizona', proxy) { - logger_1.logger.info('scraper', `Scraping category: ${categoryName}`); - logger_1.logger.info('scraper', `URL: ${categoryUrl}`); - // Create stealth browser with optional proxy - const browser = await (0, stealthBrowser_1.createStealthBrowser)({ proxy, headless: true }); - try { - // Create stealth context with age gate cookies - const context = await (0, stealthBrowser_1.createStealthContext)(browser, { state }); - // Try to load saved session cookies - const cookiesPath = `/tmp/dutchie-session-${state.toLowerCase()}.json`; - await (0, stealthBrowser_1.loadCookies)(context, cookiesPath); - const page = await context.newPage(); - // Navigate to category page - logger_1.logger.info('scraper', `Loading page: ${categoryUrl}`); - await page.goto(categoryUrl, { waitUntil: 'domcontentloaded', timeout: 60000 }); - // Random delay to appear more human - await (0, stealthBrowser_1.randomDelay)(1000, 2000); - // Check for Cloudflare challenge - if (await (0, stealthBrowser_1.isCloudflareChallenge)(page)) { - logger_1.logger.info('scraper', '🛡️ Cloudflare challenge detected, waiting...'); - const passed = await (0, stealthBrowser_1.waitForCloudflareChallenge)(page, 30000); - if (!passed) { - logger_1.logger.error('scraper', '❌ Failed to pass Cloudflare challenge'); - await browser.close(); - return []; - } - // Save successful session cookies - await (0, stealthBrowser_1.saveCookies)(context, cookiesPath); - } - // Wait for page to be fully loaded - await (0, stealthBrowser_1.waitForPageLoad)(page); - // Simulate human behavior - await (0, stealthBrowser_1.simulateHumanBehavior)(page); - // Check for and bypass age gate - const bypassed = await (0, age_gate_playwright_1.bypassAgeGatePlaywright)(page, state); - if (!bypassed) { - logger_1.logger.error('scraper', 'Failed to bypass age gate'); - await browser.close(); - return []; - } - // Wait for products to load with random delay - 
logger_1.logger.info('scraper', 'Waiting for products to load...'); - await (0, stealthBrowser_1.randomDelay)(2000, 4000); - // Scroll to load all products with human-like behavior - logger_1.logger.info('scraper', 'Scrolling to load all products...'); - await scrollToBottomHuman(page); - // Extract products - logger_1.logger.info('scraper', 'Extracting products from page...'); - const products = await extractProducts(page, categoryUrl, categoryName); - logger_1.logger.info('scraper', `Found ${products.length} products`); - await browser.close(); - return products; - } - catch (error) { - logger_1.logger.error('scraper', `Error scraping category: ${error}`); - await browser.close(); - return []; - } -} -/** - * Scrolls to the bottom of the page with human-like behavior - */ -async function scrollToBottomHuman(page) { - let previousHeight = 0; - let currentHeight = await page.evaluate(() => document.body.scrollHeight); - let attempts = 0; - const maxAttempts = 20; - while (previousHeight < currentHeight && attempts < maxAttempts) { - previousHeight = currentHeight; - // Scroll down in chunks with randomized delays - const scrollAmount = Math.floor(Math.random() * 200) + 300; // 300-500px - await (0, stealthBrowser_1.humanScroll)(page, scrollAmount); - // Random pause like a human reading - await (0, stealthBrowser_1.randomDelay)(500, 1500); - // Check new height - currentHeight = await page.evaluate(() => document.body.scrollHeight); - attempts++; - } - // Final wait for any lazy-loaded content - await (0, stealthBrowser_1.randomDelay)(1000, 2000); -} -/** - * Extracts product information from the page - */ -async function extractProducts(page, categoryUrl, categoryName) { - let products = []; - // Check if we have a template for this URL - const template = (0, dutchie_1.getTemplateForUrl)(categoryUrl); - if (template) { - logger_1.logger.info('scraper', `Using ${template.name} template for extraction`); - try { - const templateProducts = await 
template.extractProducts(page); - // Add category to products from template - products = templateProducts.map(p => ({ - ...p, - category: categoryName, - })); - logger_1.logger.info('scraper', `Template extracted ${products.length} products`); - return products; - } - catch (err) { - logger_1.logger.error('scraper', `Template extraction failed: ${err}`); - // Fall through to fallback methods - } - } - // Fallback Method 1: Dutchie products (for Sol Flower, etc.) - try { - const dutchieProducts = await page.locator('[data-testid^="product-"], .product-card, [class*="ProductCard"]').all(); - if (dutchieProducts.length > 0) { - logger_1.logger.info('scraper', `Found ${dutchieProducts.length} Dutchie-style products`); - for (const productEl of dutchieProducts) { - try { - const name = await productEl.locator('[data-testid="product-name"], .product-name, h3, h4').first().textContent() || ''; - const brand = await productEl.locator('[data-testid="product-brand"], .product-brand, .brand').first().textContent().catch(() => ''); - const priceText = await productEl.locator('[data-testid="product-price"], .product-price, .price').first().textContent().catch(() => ''); - const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => ''); - const productLink = await productEl.locator('a').first().getAttribute('href').catch(() => ''); - // Parse price - const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined; - if (name) { - products.push({ - name: name.trim(), - brand: brand ? brand.trim() : undefined, - category: categoryName, - price, - image_url: imageUrl || undefined, - product_url: productLink ? 
new URL(productLink, categoryUrl).toString() : categoryUrl, - in_stock: true - }); - } - } - catch (err) { - logger_1.logger.warn('scraper', `Error extracting Dutchie product: ${err}`); - } - } - } - } - catch (err) { - logger_1.logger.warn('scraper', `Dutchie product extraction failed: ${err}`); - } - // Method 2: Curaleaf products - if (products.length === 0) { - try { - const curaleafProducts = await page.locator('.product, [class*="Product"], [class*="item"]').all(); - if (curaleafProducts.length > 0) { - logger_1.logger.info('scraper', `Found ${curaleafProducts.length} Curaleaf-style products`); - for (const productEl of curaleafProducts) { - try { - const name = await productEl.locator('h1, h2, h3, h4, .title, .name').first().textContent() || ''; - const priceText = await productEl.locator('.price, [class*="price"]').first().textContent().catch(() => ''); - const imageUrl = await productEl.locator('img').first().getAttribute('src').catch(() => ''); - const price = priceText ? parseFloat(priceText.replace(/[^0-9.]/g, '')) : undefined; - if (name && name.length > 3) { - products.push({ - name: name.trim(), - category: categoryName, - price, - image_url: imageUrl || undefined, - product_url: categoryUrl, - in_stock: true - }); - } - } - catch (err) { - logger_1.logger.warn('scraper', `Error extracting Curaleaf product: ${err}`); - } - } - } - } - catch (err) { - logger_1.logger.warn('scraper', `Curaleaf product extraction failed: ${err}`); - } - } - // Method 3: Generic product cards - if (products.length === 0) { - try { - const genericProducts = await page.locator('article, [role="article"], .card, [class*="card"]').all(); - logger_1.logger.info('scraper', `Trying generic selectors, found ${genericProducts.length} elements`); - for (const productEl of genericProducts) { - try { - const text = await productEl.textContent() || ''; - // Only consider elements that look like products - if (text.includes('$') || text.toLowerCase().includes('price') || 
text.toLowerCase().includes('thc')) { - const name = await productEl.locator('h1, h2, h3, h4').first().textContent() || ''; - if (name && name.length > 3) { - products.push({ - name: name.trim(), - category: categoryName, - product_url: categoryUrl, - in_stock: true - }); - } - } - } - catch (err) { - // Skip this element - } - } - } - catch (err) { - logger_1.logger.warn('scraper', `Generic product extraction failed: ${err}`); - } - } - return products; -} -/** - * Test function to scrape a single category - */ -async function testScrapeCategoryPlaywright(url, categoryName, state = 'Arizona') { - console.log(`\n🎭 Testing Playwright Category Scraper\n`); - console.log(`Category: ${categoryName}`); - console.log(`URL: ${url}\n`); - const products = await scrapeCategoryPlaywright(url, categoryName, state); - console.log(`\n✅ Found ${products.length} products\n`); - products.slice(0, 5).forEach((p, i) => { - console.log(`${i + 1}. ${p.name}`); - if (p.brand) - console.log(` Brand: ${p.brand}`); - if (p.price) - console.log(` Price: $${p.price}`); - console.log(` URL: ${p.product_url}`); - console.log(''); - }); - return products; -} diff --git a/backend/dist/services/scraper.js b/backend/dist/services/scraper.js deleted file mode 100644 index aaaa917d..00000000 --- a/backend/dist/services/scraper.js +++ /dev/null @@ -1,717 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.USER_AGENT_GROUPS = exports.USER_AGENTS = void 0; -exports.getUserAgent = getUserAgent; -exports.scrapeCategory = scrapeCategory; -exports.saveProducts = saveProducts; -exports.scrapeStore = scrapeStore; -const puppeteer_extra_1 = __importDefault(require("puppeteer-extra")); -const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); -const migrate_1 = require("../db/migrate"); -const minio_1 = require("../utils/minio"); -const logger_1 = require("./logger"); -const scraper_monitor_1 = require("../routes/scraper-monitor"); -const proxy_1 = require("./proxy"); -const age_gate_1 = require("../utils/age-gate"); -const availability_1 = require("./availability"); -// Apply stealth plugin for antidetect/anti-fingerprinting -puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)()); -exports.USER_AGENTS = { - 'chrome-windows': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'chrome-mac': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'chrome-linux': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'mobile-ios': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1', - 'mobile-android': 'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36', - 'googlebot': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', - 'bingbot': 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)' -}; -exports.USER_AGENT_GROUPS = { - desktop: ['chrome-windows', 'chrome-mac', 'chrome-linux'], - mobile: ['mobile-ios', 'mobile-android'], - serp: 
['googlebot', 'bingbot'] -}; -function getRandomUserAgentFromGroup(group) { - const randomKey = group[Math.floor(Math.random() * group.length)]; - return exports.USER_AGENTS[randomKey]; -} -function getUserAgent(key) { - if (!key) - return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop); - // Check if it's a group - if (key === 'rotate-desktop') - return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop); - if (key === 'rotate-mobile') - return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.mobile); - if (key === 'rotate-serp') - return getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.serp); - // Otherwise treat as specific UA - return exports.USER_AGENTS[key] || getRandomUserAgentFromGroup(exports.USER_AGENT_GROUPS.desktop); -} -function extractImageIdFromUrl(url) { - try { - const match = url.match(/images\.dutchie\.com\/([a-f0-9]+)/i); - return match ? match[1] : null; - } - catch (e) { - return null; - } -} -function getFullSizeImageUrl(imageUrl) { - const imageId = extractImageIdFromUrl(imageUrl); - if (!imageId) - return imageUrl; - return `https://images.dutchie.com/${imageId}?auto=format&fit=max&q=95&w=2000&h=2000`; -} -function sanitizeProductData(product) { - return { - ...product, - name: product.name?.substring(0, 500) || 'Unnamed Product', - description: product.description || null, - brand: product.brand?.substring(0, 500) || null, - weight: product.weight?.substring(0, 100) || null, - thc: product.thc && product.thc < 100 ? product.thc : null, - cbd: product.cbd && product.cbd < 100 ? 
product.cbd : null - }; -} -async function makePageStealthy(page) { - await page.evaluateOnNewDocument(() => { - Object.defineProperty(navigator, 'webdriver', { - get: () => false, - }); - }); - await page.evaluateOnNewDocument(() => { - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5], - }); - }); - await page.evaluateOnNewDocument(() => { - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en'], - }); - }); - await page.evaluateOnNewDocument(() => { - window.chrome = { - runtime: {}, - }; - }); - await page.evaluateOnNewDocument(() => { - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => parameters.name === 'notifications' - ? Promise.resolve({ state: 'denied' }) - : originalQuery(parameters); - }); -} -async function scrapeProductDetails(page, productUrl, productName) { - const maxRetries = 3; - let lastError = null; - for (let attempt = 1; attempt <= maxRetries; attempt++) { - try { - await page.goto(productUrl, { waitUntil: 'domcontentloaded', timeout: 30000 }); - const details = await page.evaluate(() => { - const allText = document.body.textContent || ''; - let fullSizeImage = null; - const mainImageSelectors = [ - 'img[class*="ProductImage"]', - 'img[class*="product-image"]', - '[class*="ImageGallery"] img', - 'main img', - 'img[src*="images.dutchie.com"]' - ]; - for (const sel of mainImageSelectors) { - const img = document.querySelector(sel); - if (img?.src && img.src.includes('dutchie.com')) { - fullSizeImage = img.src; - break; - } - } - let description = ''; - const descSelectors = [ - '[class*="description"]', - '[class*="Description"]', - '[data-testid*="description"]', - 'p[class*="product"]' - ]; - for (const sel of descSelectors) { - const el = document.querySelector(sel); - if (el?.textContent?.trim() && el.textContent.length > 20) { - description = el.textContent.trim(); - break; - } - } - let thc = null; - const thcPatterns = [ - 
/THC[:\s]*(\d+\.?\d*)\s*%/i, - /Total\s+THC[:\s]*(\d+\.?\d*)\s*%/i, - /(\d+\.?\d*)\s*%\s+THC/i - ]; - for (const pattern of thcPatterns) { - const match = allText.match(pattern); - if (match) { - thc = parseFloat(match[1]); - break; - } - } - let cbd = null; - const cbdPatterns = [ - /CBD[:\s]*(\d+\.?\d*)\s*%/i, - /Total\s+CBD[:\s]*(\d+\.?\d*)\s*%/i, - /(\d+\.?\d*)\s*%\s+CBD/i - ]; - for (const pattern of cbdPatterns) { - const match = allText.match(pattern); - if (match) { - cbd = parseFloat(match[1]); - break; - } - } - let strainType = null; - if (allText.match(/\bindica\b/i)) - strainType = 'Indica'; - else if (allText.match(/\bsativa\b/i)) - strainType = 'Sativa'; - else if (allText.match(/\bhybrid\b/i)) - strainType = 'Hybrid'; - const terpenes = []; - const terpeneNames = [ - 'Myrcene', 'Limonene', 'Caryophyllene', 'Pinene', 'Linalool', - 'Humulene', 'Terpinolene', 'Ocimene', 'Bisabolol', 'Valencene' - ]; - terpeneNames.forEach(terp => { - if (allText.match(new RegExp(`\\b${terp}\\b`, 'i'))) { - terpenes.push(terp); - } - }); - const effects = []; - const effectNames = [ - 'Relaxed', 'Happy', 'Euphoric', 'Uplifted', 'Creative', - 'Energetic', 'Focused', 'Calm', 'Sleepy', 'Hungry', - 'Talkative', 'Giggly', 'Aroused' - ]; - effectNames.forEach(effect => { - if (allText.match(new RegExp(`\\b${effect}\\b`, 'i'))) { - effects.push(effect); - } - }); - let brand = null; - const brandSelectors = [ - '[class*="brand"]', - '[class*="Brand"]', - '[data-testid*="brand"]' - ]; - for (const sel of brandSelectors) { - const el = document.querySelector(sel); - if (el?.textContent?.trim()) { - brand = el.textContent.trim(); - break; - } - } - let lineage = null; - const lineageMatch = allText.match(/(?:Lineage|Genetics|Parents?)[:\s]*([^\n]+)/i); - if (lineageMatch) { - lineage = lineageMatch[1].trim(); - } - const flavors = []; - const flavorNames = [ - 'Sweet', 'Citrus', 'Earthy', 'Pine', 'Berry', 'Diesel', - 'Sour', 'Floral', 'Spicy', 'Woody', 'Tropical', 'Fruity', - 
'Vanilla', 'Mint', 'Cheese', 'Grape', 'Lemon', 'Orange' - ]; - flavorNames.forEach(flavor => { - if (allText.match(new RegExp(`\\b${flavor}\\b`, 'i'))) { - flavors.push(flavor); - } - }); - const weights = []; - const weightMatches = allText.matchAll(/(\d+\.?\d*\s*(?:g|oz|mg|gram))/gi); - for (const match of weightMatches) { - const weight = match[1].trim(); - if (!weights.includes(weight)) { - weights.push(weight); - } - } - return { - fullSizeImage, - description, - thc, - cbd, - strainType, - terpenes, - effects, - brand, - lineage, - flavors, - weights - }; - }); - return details; - } - catch (error) { - lastError = error; - logger_1.logger.warn('scraper', ` Attempt ${attempt}/${maxRetries} failed for ${productName}: ${error}`); - // No delays - just retry immediately - } - } - logger_1.logger.error('scraper', ` ✗ All attempts failed for ${productName}`); - return { - fullSizeImage: null, - description: null, - thc: null, - cbd: null, - strainType: null, - terpenes: [], - effects: [], - brand: null, - lineage: null, - flavors: [], - weights: [] - }; -} -async function scrapeCategory(storeId, categoryId, userAgent) { - let browser = null; - const scraperId = `cat-${categoryId}-${Date.now()}`; - let proxyId = null; - try { - const categoryResult = await migrate_1.pool.query(` - SELECT c.*, s.slug as store_slug, s.name as store_name - FROM categories c - JOIN stores s ON c.store_id = s.id - WHERE c.id = $1 - `, [categoryId]); - if (categoryResult.rows.length === 0) { - throw new Error('Category not found'); - } - const category = categoryResult.rows[0]; - logger_1.logger.info('scraper', `Scraping category: ${category.name} for ${category.store_name}`); - // Register scraper with monitoring system - (0, scraper_monitor_1.registerScraper)(scraperId, storeId, category.store_name, categoryId, category.name); - const proxy = await (0, proxy_1.getActiveProxy)(); - if (proxy) { - proxyId = proxy.id; - } - const launchOptions = { - headless: 'new', - args: [ - 
'--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-blink-features=AutomationControlled', - '--window-size=1920,1080' - ] - }; - if (proxy) { - if (proxy.protocol === 'socks5') { - launchOptions.args.push(`--proxy-server=socks5://${proxy.host}:${proxy.port}`); - } - else if (proxy.protocol === 'http' || proxy.protocol === 'https') { - launchOptions.args.push(`--proxy-server=${proxy.protocol}://${proxy.host}:${proxy.port}`); - } - logger_1.logger.info('scraper', `Using proxy: ${proxy.protocol}://${proxy.host}:${proxy.port}`); - } - browser = await puppeteer_extra_1.default.launch(launchOptions); - const page = await browser.newPage(); - await makePageStealthy(page); - await page.setViewport({ width: 1920, height: 1080 }); - // Use provided userAgent or random if not specified - const ua = getUserAgent(userAgent); - await page.setUserAgent(ua); - // Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites) - const state = (0, age_gate_1.detectStateFromUrl)(category.dutchie_url); - await (0, age_gate_1.setAgeGateCookies)(page, category.dutchie_url, state); - logger_1.logger.info('scraper', `Loading page: ${category.dutchie_url}`); - try { - await page.goto(category.dutchie_url, { - waitUntil: 'networkidle2', - timeout: 60000 - }); - // If age gate still appears, try to bypass it - await (0, age_gate_1.bypassAgeGate)(page, state); - // Wait for products to load - await page.waitForSelector('[data-testid="product-list-item"], a[href*="/product/"]', { - timeout: 30000, - }).catch(() => { - logger_1.logger.warn('scraper', 'No product selectors found, trying anyway...'); - }); - logger_1.logger.info('scraper', 'Scrolling to load all products...'); - await autoScroll(page); - } - catch (navError) { - logger_1.logger.error('scraper', `Navigation error: ${navError}`); - // Check if this is bot detection - put proxy in timeout instead of hard failure - if (proxyId) { - const errorMsg = String(navError); - if ((0, 
proxy_1.isBotDetectionError)(errorMsg)) { - // Bot detection! Put this proxy in timeout and get a new one - logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`); - (0, proxy_1.putProxyInTimeout)(proxyId, errorMsg); - throw new Error(`Bot detection: ${errorMsg}`); - } - else if (errorMsg.includes('timeout') || errorMsg.includes('net::') || - errorMsg.includes('ERR_') || errorMsg.includes('Navigation')) { - // Regular proxy failure - increment failure count - logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`); - await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg); - } - } - throw navError; - } - logger_1.logger.info('scraper', 'Extracting product list from page...'); - const products = await page.evaluate(() => { - const items = []; - const cards = document.querySelectorAll('[data-testid="product-list-item"]'); - console.log(`Found ${cards.length} product cards`); - cards.forEach((card) => { - try { - const allText = card.textContent || ''; - let name = ''; - const nameSelectors = ['a[href*="/product/"]', 'h1', 'h2', 'h3', 'h4']; - for (const sel of nameSelectors) { - const el = card.querySelector(sel); - if (el?.textContent?.trim()) { - name = el.textContent.trim(); - name = name.split('\n')[0].trim(); - break; - } - } - if (!name || name.length < 2) - return; - let price = null; - let originalPrice = null; - const priceMatches = allText.match(/\$(\d+\.?\d*)/g); - if (priceMatches && priceMatches.length > 0) { - price = parseFloat(priceMatches[0].replace('$', '')); - if (priceMatches.length > 1) { - originalPrice = parseFloat(priceMatches[1].replace('$', '')); - } - } - // Extract variant (weight/size) - look for common patterns - let variant = null; - const variantPatterns = [ - /(\d+\.?\d*\s*(?:g|oz|mg|ml|gram|ounce))/i, // Weight units - /(\d+\s*pack)/i, // Pack sizes - /(\d+\s*ct)/i, // Count - /(\d+\s*x\s*\d+\.?\d*\s*(?:g|mg|ml))/i // Multi-pack (e.g., 5x0.5g) - ]; - 
for (const pattern of variantPatterns) { - const match = allText.match(pattern); - if (match) { - variant = match[1].trim(); - break; - } - } - const linkEl = card.querySelector('a[href*="/product/"]'); - let href = linkEl?.href || linkEl?.getAttribute('href') || ''; - if (href && href.startsWith('/')) { - href = 'https://dutchie.com' + href; - } - items.push({ - name, - variant, - price, - originalPrice, - href: href || window.location.href - }); - } - catch (err) { - console.error('Error parsing product card:', err); - } - }); - return items; - }); - logger_1.logger.info('scraper', `Found ${products.length} products total`); - logger_1.logger.info('scraper', `Now visiting each product page for complete details...`); - let successCount = 0; - let failCount = 0; - // Update initial stats - (0, scraper_monitor_1.updateScraperStats)(scraperId, { - productsProcessed: 0, - productsTotal: products.length - }); - for (let i = 0; i < products.length; i++) { - const product = products[i]; - try { - logger_1.logger.info('scraper', ` [${i + 1}/${products.length}] ${product.name}`); - (0, scraper_monitor_1.updateScraperStats)(scraperId, { - productsProcessed: i + 1, - productsTotal: products.length - }, `Processing: ${product.name}`); - if (!product.href) { - logger_1.logger.warn('scraper', ` ⚠ No product URL, skipping details`); - product.metadata = {}; - failCount++; - continue; - } - const details = await scrapeProductDetails(page, product.href, product.name); - product.imageUrl = details.fullSizeImage ? getFullSizeImageUrl(details.fullSizeImage) : null; - product.description = details.description; - product.thc = details.thc; - product.cbd = details.cbd; - product.strainType = details.strainType; - product.brand = details.brand; - product.weight = details.weights.length > 0 ? 
details.weights[0] : null; - product.metadata = { - terpenes: details.terpenes, - effects: details.effects, - lineage: details.lineage, - flavors: details.flavors, - allWeights: details.weights - }; - if (details.thc || details.cbd || details.description) { - logger_1.logger.info('scraper', ` ✓ THC: ${details.thc}%, CBD: ${details.cbd}%`); - successCount++; - } - else { - logger_1.logger.warn('scraper', ` ⚠ Limited data extracted`); - failCount++; - } - // No delays - scrape fast! - } - catch (error) { - logger_1.logger.error('scraper', ` ✗ Unexpected error: ${error}`); - product.metadata = {}; - failCount++; - } - } - await browser.close(); - logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`); - logger_1.logger.info('scraper', `✅ Category complete: ${category.name}`); - logger_1.logger.info('scraper', ` Total products: ${products.length}`); - logger_1.logger.info('scraper', ` Success: ${successCount}`); - logger_1.logger.info('scraper', ` Failed: ${failCount}`); - logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`); - await migrate_1.pool.query(` - UPDATE categories - SET last_scraped_at = CURRENT_TIMESTAMP - WHERE id = $1 - `, [categoryId]); - // Mark scraper as complete - (0, scraper_monitor_1.completeScraper)(scraperId); - const formattedProducts = products.map((p, index) => { - const sanitized = sanitizeProductData(p); - // Normalize availability from Dutchie product data - const availability = (0, availability_1.normalizeAvailability)(p); - return { - dutchieProductId: `${category.store_slug}-${category.slug}-${Date.now()}-${index}`, - name: sanitized.name, - variant: p.variant || null, - description: sanitized.description, - price: p.price, - originalPrice: p.originalPrice, - thcPercentage: sanitized.thc, - cbdPercentage: sanitized.cbd, - strainType: p.strainType, - brand: sanitized.brand, - weight: sanitized.weight, - imageUrl: p.imageUrl, - dutchieUrl: p.href, - metadata: p.metadata || {}, - availabilityStatus: 
availability.status, - availabilityRaw: availability.raw, - stockQuantity: availability.quantity - }; - }); - return formattedProducts; - } - catch (error) { - logger_1.logger.error('scraper', `❌ Category scraping error: ${error}`); - // Smart proxy error handling - if (proxyId) { - const errorMsg = String(error); - if ((0, proxy_1.isBotDetectionError)(errorMsg)) { - // Bot detection! Put this proxy in timeout - logger_1.logger.warn('scraper', `🤖 Bot detection triggered for proxy ${proxyId}!`); - (0, proxy_1.putProxyInTimeout)(proxyId, errorMsg); - } - else if (errorMsg.includes('timeout') || errorMsg.includes('net::') || - errorMsg.includes('ERR_') || errorMsg.includes('Navigation') || - errorMsg.includes('Protocol error') || errorMsg.includes('Target closed')) { - // Regular proxy failure - increment failure count - logger_1.logger.warn('scraper', `Proxy failure detected, incrementing failure count for proxy ${proxyId}`); - await (0, proxy_1.incrementProxyFailure)(proxyId, errorMsg); - } - } - // Mark scraper as failed - (0, scraper_monitor_1.completeScraper)(scraperId, String(error)); - if (browser) { - try { - await browser.close(); - } - catch (e) { - logger_1.logger.error('scraper', `Error closing browser: ${e}`); - } - } - throw error; - } -} -async function autoScroll(page) { - await page.evaluate(async () => { - await new Promise((resolve) => { - let totalHeight = 0; - const distance = 500; - const timer = setInterval(() => { - const scrollHeight = document.body.scrollHeight; - window.scrollBy(0, distance); - totalHeight += distance; - if (totalHeight >= scrollHeight) { - clearInterval(timer); - resolve(); - } - }, 200); - }); - }); -} -async function saveProducts(storeId, categoryId, products) { - const client = await migrate_1.pool.connect(); - try { - await client.query('BEGIN'); - logger_1.logger.info('scraper', `Saving ${products.length} products to database...`); - // Mark all products as out-of-stock before processing (they'll be re-marked if found) 
- // Also update availability_status and last_seen_out_of_stock_at for state transition tracking - await client.query(` - UPDATE products - SET in_stock = false, - availability_status = 'out_of_stock', - last_seen_out_of_stock_at = CASE - WHEN availability_status != 'out_of_stock' THEN CURRENT_TIMESTAMP - ELSE last_seen_out_of_stock_at - END - WHERE store_id = $1 AND category_id = $2 AND in_stock = true - `, [storeId, categoryId]); - for (const product of products) { - try { - // Get availability from product (defaults to in_stock if product exists in scraped data) - const availStatus = product.availabilityStatus || 'in_stock'; - const availRaw = product.availabilityRaw ? JSON.stringify(product.availabilityRaw) : null; - const stockQty = product.stockQuantity ?? null; - const existingResult = await client.query(` - SELECT id, image_url, local_image_path, availability_status - FROM products - WHERE store_id = $1 AND name = $2 AND category_id = $3 - AND (variant = $4 OR (variant IS NULL AND $4 IS NULL)) - `, [storeId, product.name, categoryId, product.variant || null]); - let localImagePath = null; - let productId; - if (existingResult.rows.length > 0) { - productId = existingResult.rows[0].id; - localImagePath = existingResult.rows[0].local_image_path; - const prevStatus = existingResult.rows[0].availability_status; - // Determine if we need to update last_seen_in_stock_at - const isNowInStock = availStatus === 'in_stock' || availStatus === 'limited'; - const wasOutOfStock = prevStatus === 'out_of_stock' || prevStatus === 'unknown'; - await client.query(` - UPDATE products - SET name = $1, variant = $2, description = $3, price = $4, - strain_type = $5, thc_percentage = $6, cbd_percentage = $7, - brand = $8, weight = $9, image_url = $10, dutchie_url = $11, - in_stock = true, metadata = $12, last_seen_at = CURRENT_TIMESTAMP, - updated_at = CURRENT_TIMESTAMP, - availability_status = $14, - availability_raw = $15, - stock_quantity = $16, - last_seen_in_stock_at = CASE - 
WHEN $17 THEN CURRENT_TIMESTAMP - ELSE last_seen_in_stock_at - END - WHERE id = $13 - `, [ - product.name, product.variant, product.description, product.price, - product.strainType, product.thcPercentage, product.cbdPercentage, - product.brand, product.weight, product.imageUrl, product.dutchieUrl, - JSON.stringify(product.metadata), productId, availStatus, availRaw, stockQty, - isNowInStock && wasOutOfStock - ]); - } - else { - // Generate unique slug from product name + timestamp + random suffix - const baseSlug = product.name - .toLowerCase() - .replace(/[^a-z0-9]+/g, '-') - .replace(/^-|-$/g, '') - .substring(0, 150); - const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).substr(2, 6)}`; - const slug = `${baseSlug}-${uniqueSuffix}`; - const insertResult = await client.query(` - INSERT INTO products ( - store_id, category_id, dutchie_product_id, name, slug, variant, description, - price, strain_type, thc_percentage, cbd_percentage, - brand, weight, image_url, dutchie_url, in_stock, metadata, - availability_status, availability_raw, stock_quantity, last_seen_in_stock_at - ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16, $17, $18, $19, CURRENT_TIMESTAMP) - RETURNING id - `, [ - storeId, categoryId, product.dutchieProductId, product.name, slug, product.variant, product.description, - product.price, product.strainType, product.thcPercentage, product.cbdPercentage, - product.brand, product.weight, product.imageUrl, product.dutchieUrl, - JSON.stringify(product.metadata), availStatus, availRaw, stockQty - ]); - productId = insertResult.rows[0].id; - } - if (product.imageUrl && !localImagePath) { - try { - localImagePath = await (0, minio_1.uploadImageFromUrl)(product.imageUrl, productId); - await client.query(` - UPDATE products - SET local_image_path = $1 - WHERE id = $2 - `, [localImagePath, productId]); - } - catch (error) { - logger_1.logger.error('images', `Failed to download image for ${product.name}: ${error}`); - 
} - } - } - catch (productError) { - logger_1.logger.error('scraper', `Failed to save product ${product.name}: ${productError}`); - } - } - await client.query('COMMIT'); - logger_1.logger.info('scraper', `✅ Saved ${products.length} products successfully`); - } - catch (error) { - await client.query('ROLLBACK'); - logger_1.logger.error('scraper', `Error saving products: ${error}`); - throw error; - } - finally { - client.release(); - } -} -async function scrapeStore(storeId, parallel = 3, userAgent) { - try { - logger_1.logger.info('scraper', `🏪 Starting scrape for store ID: ${storeId} (${parallel} parallel, UA: ${userAgent || 'random'})`); - const categoriesResult = await migrate_1.pool.query(` - SELECT c.id, c.name, c.slug, c.dutchie_url - FROM categories c - WHERE c.store_id = $1 - AND c.scrape_enabled = true - ORDER BY c.name - `, [storeId]); - logger_1.logger.info('scraper', `Found ${categoriesResult.rows.length} categories to scrape`); - for (const category of categoriesResult.rows) { - try { - logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`); - logger_1.logger.info('scraper', `📂 Scraping: ${category.name}`); - logger_1.logger.info('scraper', `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`); - const products = await scrapeCategory(storeId, category.id, userAgent); - await saveProducts(storeId, category.id, products); - logger_1.logger.info('scraper', `✅ Completed ${category.name} - ${products.length} products saved`); - } - catch (error) { - logger_1.logger.error('scraper', `❌ Failed to scrape ${category.name}: ${error}`); - } - // No delays - scrape fast! 
- } - await migrate_1.pool.query(` - UPDATE stores - SET last_scraped_at = CURRENT_TIMESTAMP - WHERE id = $1 - `, [storeId]); - logger_1.logger.info('scraper', `🎉 Store scrape completed: ID ${storeId}`); - } - catch (error) { - logger_1.logger.error('scraper', `❌ Store scrape failed: ${error}`); - throw error; - } -} diff --git a/backend/dist/services/store-crawl-orchestrator.js b/backend/dist/services/store-crawl-orchestrator.js deleted file mode 100644 index 11831849..00000000 --- a/backend/dist/services/store-crawl-orchestrator.js +++ /dev/null @@ -1,351 +0,0 @@ -"use strict"; -/** - * Store Crawl Orchestrator - * - * Orchestrates the complete crawl workflow for a store: - * 1. Load store and its linked dispensary - * 2. Check if provider detection is needed - * 3. Run provider detection if needed - * 4. Queue appropriate crawl jobs based on provider/mode - * 5. Update store_crawl_schedule with meaningful status - * - * This replaces the simple "triggerManualCrawl" with intelligent orchestration. 
- */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.runStoreCrawlOrchestrator = runStoreCrawlOrchestrator; -exports.runBatchOrchestrator = runBatchOrchestrator; -exports.getStoresDueForOrchestration = getStoresDueForOrchestration; -const uuid_1 = require("uuid"); -const migrate_1 = require("../db/migrate"); -const crawler_logger_1 = require("./crawler-logger"); -const intelligence_detector_1 = require("./intelligence-detector"); -const category_crawler_jobs_1 = require("./category-crawler-jobs"); -// DEPRECATED: scrapeStore writes to legacy products table -// import { scrapeStore } from '../scraper-v2'; -// Import the new dutchie-az pipeline for Dutchie crawling -const product_crawler_1 = require("../dutchie-az/services/product-crawler"); -const connection_1 = require("../dutchie-az/db/connection"); -// ======================================== -// Main Orchestrator Function -// ======================================== -/** - * Run the complete crawl orchestration for a store - * - * Behavior: - * 1. Load the store and its linked dispensary - * 2. If no dispensary is linked, report error - * 3. If product_provider is missing or stale (>7 days), run detection - * 4. After detection: - * - If product_provider = 'dutchie' and product_crawler_mode = 'production': Run production crawl - * - Otherwise: Run sandbox crawl - * 5. Update store_crawl_schedule with status/summary - */ -async function runStoreCrawlOrchestrator(storeId) { - const startTime = Date.now(); - const runId = (0, uuid_1.v4)(); - let result = { - status: 'pending', - summary: '', - runId, - storeId, - dispensaryId: null, - detectionRan: false, - crawlRan: false, - durationMs: 0, - }; - try { - // Mark schedule as running - await updateScheduleStatus(storeId, 'running', 'Starting orchestrator...', runId); - // 1. 
Load store with dispensary info - const store = await getStoreWithDispensary(storeId); - if (!store) { - throw new Error(`Store ${storeId} not found`); - } - result.dispensaryId = store.dispensary_id; - // 2. Check if dispensary is linked - if (!store.dispensary_id) { - result.status = 'error'; - result.summary = 'No dispensary linked - cannot determine provider'; - result.error = 'Store is not linked to a dispensary. Link it in the Dispensaries page.'; - await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error); - result.durationMs = Date.now() - startTime; - return result; - } - // 3. Check if provider detection is needed - const needsDetection = await checkNeedsDetection(store); - if (needsDetection) { - // Run provider detection - const websiteUrl = store.dispensary_menu_url || store.dispensary_website; - if (!websiteUrl) { - result.status = 'error'; - result.summary = 'No website URL available for detection'; - result.error = 'Dispensary has no menu_url or website configured'; - await updateScheduleStatus(storeId, 'error', result.summary, runId, result.error); - result.durationMs = Date.now() - startTime; - return result; - } - await updateScheduleStatus(storeId, 'running', 'Running provider detection...', runId); - const detectionResult = await (0, intelligence_detector_1.detectMultiCategoryProviders)(websiteUrl); - result.detectionRan = true; - result.detectionResult = detectionResult; - // Save detection results to dispensary - await (0, intelligence_detector_1.updateAllCategoryProviders)(store.dispensary_id, detectionResult); - crawler_logger_1.crawlerLogger.providerDetected({ - dispensary_id: store.dispensary_id, - dispensary_name: store.dispensary_name || store.name, - detected_provider: detectionResult.product.provider, - confidence: detectionResult.product.confidence, - detection_method: 'orchestrator_run', - menu_url: websiteUrl, - category: 'product', - }); - // Refresh store info after detection - const updatedStore = await 
getStoreWithDispensary(storeId); - if (updatedStore) { - Object.assign(store, updatedStore); - } - } - // 4. Determine crawl type and run - const provider = store.product_provider; - const mode = store.product_crawler_mode; - if (provider === 'dutchie' && mode === 'production') { - // Production Dutchie crawl - now uses the new dutchie-az GraphQL pipeline - await updateScheduleStatus(storeId, 'running', 'Running Dutchie GraphQL crawl (dutchie-az)...', runId); - try { - // Look up the dispensary in the dutchie-az database - // The dutchie-az pipeline has its own dispensaries table - // We try multiple matching strategies: name, slug, or platform_dispensary_id - const dispensaryResult = await (0, connection_1.query)(`SELECT * FROM dispensaries - WHERE name ILIKE $1 - OR slug ILIKE $2 - LIMIT 1`, [store.dispensary_name, store.slug]); - if (dispensaryResult.rows.length === 0) { - throw new Error(`Dispensary not found in dutchie-az database. ` + - `You must add this dispensary to the dutchie-az pipeline first. ` + - `Store: ${store.name} (${store.dispensary_name})`); - } - const dutchieDispensary = dispensaryResult.rows[0]; - // Run the new dutchie-az GraphQL crawler - const crawlResult = await (0, product_crawler_1.crawlDispensaryProducts)(dutchieDispensary, 'rec', { useBothModes: true }); - result.crawlRan = true; - result.crawlType = 'production'; - result.productsFound = crawlResult.productsFound ?? undefined; - result.productsNew = crawlResult.productsUpserted ?? undefined; - result.productsUpdated = crawlResult.snapshotsCreated ?? undefined; - if (crawlResult.success) { - const detectionPart = result.detectionRan ? 
'Detection + ' : ''; - result.summary = `${detectionPart}Dutchie GraphQL crawl (${crawlResult.productsFound || 0} items, ${crawlResult.productsUpserted || 0} upserted, ${crawlResult.snapshotsCreated || 0} snapshots)`; - result.status = 'success'; - // Update store's last_scraped_at - await migrate_1.pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]); - crawler_logger_1.crawlerLogger.jobCompleted({ - job_id: 0, // Orchestrator doesn't create traditional jobs - store_id: storeId, - store_name: store.name, - duration_ms: crawlResult.durationMs, - products_found: crawlResult.productsFound || 0, - products_new: crawlResult.productsUpserted || 0, - products_updated: crawlResult.snapshotsCreated || 0, - provider: 'dutchie', - }); - } - else { - throw new Error(crawlResult.errorMessage || 'Crawl failed'); - } - } - catch (crawlError) { - result.status = 'error'; - result.error = crawlError.message; - result.summary = `Dutchie crawl failed: ${crawlError.message.slice(0, 100)}`; - result.crawlRan = true; - result.crawlType = 'production'; - crawler_logger_1.crawlerLogger.jobFailed({ - job_id: 0, - store_id: storeId, - store_name: store.name, - duration_ms: Date.now() - startTime, - error_message: crawlError.message, - provider: 'dutchie', - }); - } - } - else if (provider && provider !== 'unknown') { - // Sandbox crawl for non-Dutchie or sandbox mode - await updateScheduleStatus(storeId, 'running', `Running ${provider} sandbox crawl...`, runId); - try { - const sandboxResult = await (0, category_crawler_jobs_1.runSandboxProductsJob)(store.dispensary_id); - result.crawlRan = true; - result.crawlType = 'sandbox'; - result.productsFound = sandboxResult.data?.productsExtracted || 0; - const detectionPart = result.detectionRan ? 
'Detection + ' : ''; - if (sandboxResult.success) { - result.summary = `${detectionPart}${provider} sandbox crawl (${result.productsFound} items, quality ${sandboxResult.data?.qualityScore || 0}%)`; - result.status = 'sandbox_only'; - } - else { - result.summary = `${detectionPart}${provider} sandbox failed: ${sandboxResult.message}`; - result.status = 'error'; - result.error = sandboxResult.message; - } - } - catch (sandboxError) { - result.status = 'error'; - result.error = sandboxError.message; - result.summary = `Sandbox crawl failed: ${sandboxError.message.slice(0, 100)}`; - result.crawlRan = true; - result.crawlType = 'sandbox'; - } - } - else { - // No provider detected - detection only - if (result.detectionRan) { - result.summary = `Detection complete: provider=${store.product_provider || 'unknown'}, confidence=${store.product_confidence || 0}%`; - result.status = 'detection_only'; - } - else { - result.summary = 'No provider detected and no crawl possible'; - result.status = 'error'; - result.error = 'Could not determine menu provider'; - } - } - } - catch (error) { - result.status = 'error'; - result.error = error.message; - result.summary = `Orchestrator error: ${error.message.slice(0, 100)}`; - crawler_logger_1.crawlerLogger.queueFailure({ - queue_type: 'orchestrator', - error_message: error.message, - }); - } - result.durationMs = Date.now() - startTime; - // Update final schedule status - await updateScheduleStatus(storeId, result.status, result.summary, runId, result.error); - // Create a crawl_job record for tracking - await createOrchestratorJobRecord(storeId, result); - return result; -} -// ======================================== -// Helper Functions -// ======================================== -async function getStoreWithDispensary(storeId) { - const result = await migrate_1.pool.query(`SELECT - s.id, s.name, s.slug, s.timezone, s.dispensary_id, - d.name as dispensary_name, - d.menu_url as dispensary_menu_url, - d.website as 
dispensary_website, - d.product_provider, - d.product_confidence, - d.product_crawler_mode, - d.last_product_scan_at - FROM stores s - LEFT JOIN dispensaries d ON d.id = s.dispensary_id - WHERE s.id = $1`, [storeId]); - return result.rows[0] || null; -} -async function checkNeedsDetection(store) { - // No dispensary = can't detect - if (!store.dispensary_id) - return false; - // No provider = definitely needs detection - if (!store.product_provider) - return true; - // Unknown provider = needs detection - if (store.product_provider === 'unknown') - return true; - // Low confidence = needs re-detection - if (store.product_confidence !== null && store.product_confidence < 50) - return true; - // Stale detection (> 7 days) = needs refresh - if (store.last_product_scan_at) { - const daysSince = (Date.now() - new Date(store.last_product_scan_at).getTime()) / (1000 * 60 * 60 * 24); - if (daysSince > 7) - return true; - } - return false; -} -async function updateScheduleStatus(storeId, status, summary, runId, error) { - await migrate_1.pool.query(`INSERT INTO store_crawl_schedule (store_id, last_status, last_summary, last_run_at, last_error) - VALUES ($1, $2, $3, NOW(), $4) - ON CONFLICT (store_id) DO UPDATE SET - last_status = $2, - last_summary = $3, - last_run_at = NOW(), - last_error = $4, - updated_at = NOW()`, [storeId, status, summary, error || null]); -} -async function getLatestCrawlStats(storeId) { - // Get count of products for this store - const result = await migrate_1.pool.query(`SELECT - COUNT(*) as total, - COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '1 hour') as recent_new, - COUNT(*) FILTER (WHERE updated_at > NOW() - INTERVAL '1 hour' AND created_at < NOW() - INTERVAL '1 hour') as recent_updated - FROM products - WHERE store_id = $1`, [storeId]); - return { - products_found: parseInt(result.rows[0]?.total || '0'), - products_new: parseInt(result.rows[0]?.recent_new || '0'), - products_updated: parseInt(result.rows[0]?.recent_updated || '0'), - 
}; -} -async function createOrchestratorJobRecord(storeId, result) { - await migrate_1.pool.query(`INSERT INTO crawl_jobs ( - store_id, job_type, trigger_type, status, priority, - scheduled_at, started_at, completed_at, - products_found, products_new, products_updated, - error_message, orchestrator_run_id, detection_result - ) VALUES ( - $1, 'orchestrator', 'manual', $2, 100, - NOW(), NOW(), NOW(), - $3, $4, $5, - $6, $7, $8 - )`, [ - storeId, - result.status === 'success' ? 'completed' : result.status === 'error' ? 'failed' : 'completed', - result.productsFound || null, - result.productsNew || null, - result.productsUpdated || null, - result.error || null, - result.runId, - result.detectionResult ? JSON.stringify({ - product_provider: result.detectionResult.product.provider, - product_confidence: result.detectionResult.product.confidence, - product_mode: result.detectionResult.product.mode, - }) : null, - ]); -} -// ======================================== -// Batch Orchestration -// ======================================== -/** - * Run orchestrator for multiple stores - */ -async function runBatchOrchestrator(storeIds, concurrency = 3) { - const results = []; - // Process in batches - for (let i = 0; i < storeIds.length; i += concurrency) { - const batch = storeIds.slice(i, i + concurrency); - const batchResults = await Promise.all(batch.map(storeId => runStoreCrawlOrchestrator(storeId))); - results.push(...batchResults); - } - return results; -} -/** - * Get stores that are due for orchestration - */ -async function getStoresDueForOrchestration(limit = 10) { - const result = await migrate_1.pool.query(`SELECT s.id - FROM stores s - LEFT JOIN store_crawl_schedule scs ON scs.store_id = s.id - WHERE s.active = TRUE - AND s.scrape_enabled = TRUE - AND COALESCE(scs.enabled, TRUE) = TRUE - AND ( - scs.last_run_at IS NULL - OR scs.last_run_at < NOW() - (COALESCE(scs.interval_hours, 4) || ' hours')::INTERVAL - ) - AND (scs.last_status IS NULL OR scs.last_status NOT IN 
('running', 'pending')) - ORDER BY COALESCE(scs.priority, 0) DESC, scs.last_run_at ASC NULLS FIRST - LIMIT $1`, [limit]); - return result.rows.map(row => row.id); -} diff --git a/backend/dist/utils/age-gate-playwright.js b/backend/dist/utils/age-gate-playwright.js deleted file mode 100644 index ac32cce4..00000000 --- a/backend/dist/utils/age-gate-playwright.js +++ /dev/null @@ -1,175 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.hasAgeGatePlaywright = hasAgeGatePlaywright; -exports.bypassAgeGatePlaywright = bypassAgeGatePlaywright; -exports.detectStateFromUrlPlaywright = detectStateFromUrlPlaywright; -const logger_1 = require("../services/logger"); -/** - * Detects if a Playwright page has an age verification gate - */ -async function hasAgeGatePlaywright(page) { - try { - const url = page.url(); - const bodyText = await page.textContent('body') || ''; - const hasAgeVerification = url.includes('/age-gate') || - bodyText.includes('age verification') || - bodyText.includes('Please select your state') || - bodyText.includes('are you 21') || - bodyText.includes('are you 18') || - bodyText.includes('Enter your date of birth') || - bodyText.toLowerCase().includes('verify your age'); - return hasAgeVerification; - } - catch (err) { - logger_1.logger.warn('age-gate', `Error detecting age gate: ${err}`); - return false; - } -} -/** - * Attempts to bypass an age gate using Playwright - * Handles multiple age gate patterns including Curaleaf's complex React-based gate - * - * @param page - Playwright page object - * @param state - State to select (e.g., 'Arizona', 'California') - * @returns Promise - true if bypass succeeded, false otherwise - */ -async function bypassAgeGatePlaywright(page, state = 'Arizona') { - try { - const hasGate = await hasAgeGatePlaywright(page); - if (!hasGate) { - logger_1.logger.info('age-gate', 'No age gate detected'); - return true; - } - logger_1.logger.info('age-gate', `Age gate detected - 
attempting to bypass with state: ${state}...`); - // Wait for age gate to fully render - await page.waitForTimeout(2000); - // Method 1: Curaleaf-style (state dropdown + "I'm over 21" button) - try { - const stateButton = page.locator('button#state, button[id="state"]').first(); - const stateButtonExists = await stateButton.count() > 0; - if (stateButtonExists) { - logger_1.logger.info('age-gate', 'Found Curaleaf-style state dropdown...'); - await stateButton.click(); - await page.waitForTimeout(1000); - // Select state - const stateOption = page.locator('[role="option"]').filter({ hasText: new RegExp(`^${state}$`, 'i') }); - const stateExists = await stateOption.count() > 0; - if (stateExists) { - logger_1.logger.info('age-gate', `Clicking ${state} option...`); - await stateOption.first().click(); - await page.waitForTimeout(2000); - // Look for "I'm over 21" button - const ageButton = page.locator('button').filter({ hasText: /I'm over 21|I am 21|I'm 21|over 21/i }); - const ageButtonExists = await ageButton.count() > 0; - if (ageButtonExists) { - logger_1.logger.info('age-gate', 'Clicking age verification button...'); - await ageButton.first().click(); - await page.waitForLoadState('domcontentloaded', { timeout: 15000 }); - await page.waitForTimeout(3000); - // Check if we successfully bypassed - const finalUrl = page.url(); - if (!finalUrl.includes('/age-gate')) { - logger_1.logger.info('age-gate', `✅ Age gate bypass successful`); - return true; - } - } - } - } - } - catch (e) { - logger_1.logger.warn('age-gate', `Curaleaf method failed: ${e}`); - } - // Method 2: Simple "Yes" or "I'm 21" button (for simpler age gates) - try { - const simpleButton = page.locator('button, a, [role="button"]').filter({ - hasText: /yes|i am 21|i'm 21|enter the site|continue|confirm/i - }); - const simpleExists = await simpleButton.count() > 0; - if (simpleExists) { - logger_1.logger.info('age-gate', 'Found simple age gate button...'); - await simpleButton.first().click(); - await 
page.waitForLoadState('domcontentloaded', { timeout: 10000 }); - await page.waitForTimeout(2000); - const finalUrl = page.url(); - if (!finalUrl.includes('/age-gate')) { - logger_1.logger.info('age-gate', `✅ Age gate bypass successful`); - return true; - } - } - } - catch (e) { - logger_1.logger.warn('age-gate', `Simple button method failed: ${e}`); - } - // Method 3: Standard select dropdown - try { - const selectExists = await page.locator('select').count() > 0; - if (selectExists) { - logger_1.logger.info('age-gate', 'Found select dropdown...'); - const select = page.locator('select').first(); - await select.selectOption({ label: state }); - await page.waitForTimeout(1000); - // Look for submit button - const submitButton = page.locator('button[type="submit"], input[type="submit"]'); - const submitExists = await submitButton.count() > 0; - if (submitExists) { - await submitButton.first().click(); - await page.waitForLoadState('domcontentloaded', { timeout: 10000 }); - await page.waitForTimeout(2000); - const finalUrl = page.url(); - if (!finalUrl.includes('/age-gate')) { - logger_1.logger.info('age-gate', `✅ Age gate bypass successful`); - return true; - } - } - } - } - catch (e) { - logger_1.logger.warn('age-gate', `Select dropdown method failed: ${e}`); - } - // Verify final state - const finalUrl = page.url(); - if (finalUrl.includes('/age-gate')) { - logger_1.logger.error('age-gate', `❌ Age gate bypass failed - still at: ${finalUrl}`); - return false; - } - logger_1.logger.info('age-gate', `✅ Age gate bypass successful`); - return true; - } - catch (err) { - logger_1.logger.error('age-gate', `Error bypassing age gate: ${err}`); - return false; - } -} -/** - * Helper to detect the state from a store URL - */ -function detectStateFromUrlPlaywright(url) { - const stateMap = { - '-az-': 'Arizona', - 'arizona': 'Arizona', - '-ca-': 'California', - 'california': 'California', - '-co-': 'Colorado', - 'colorado': 'Colorado', - '-fl-': 'Florida', - 'florida': 
'Florida', - '-il-': 'Illinois', - 'illinois': 'Illinois', - '-ma-': 'Massachusetts', - '-mi-': 'Michigan', - '-nv-': 'Nevada', - '-nj-': 'New Jersey', - '-ny-': 'New York', - '-or-': 'Oregon', - '-pa-': 'Pennsylvania', - '-wa-': 'Washington', - }; - const lowerUrl = url.toLowerCase(); - for (const [pattern, stateName] of Object.entries(stateMap)) { - if (lowerUrl.includes(pattern)) { - return stateName; - } - } - // Default to Arizona - return 'Arizona'; -} diff --git a/backend/dist/utils/age-gate.js b/backend/dist/utils/age-gate.js deleted file mode 100644 index 392e7b6e..00000000 --- a/backend/dist/utils/age-gate.js +++ /dev/null @@ -1,263 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.setAgeGateCookies = setAgeGateCookies; -exports.hasAgeGate = hasAgeGate; -exports.bypassAgeGate = bypassAgeGate; -exports.detectStateFromUrl = detectStateFromUrl; -const logger_1 = require("../services/logger"); -/** - * Sets age gate bypass cookies before navigating to a page - * This should be called BEFORE page.goto() to prevent the age gate from showing - * - * @param page - Puppeteer page object - * @param url - URL to extract domain from - * @param state - State to set in cookie - */ -async function setAgeGateCookies(page, url, state = 'Arizona') { - try { - const urlObj = new URL(url); - const domain = urlObj.hostname.replace('www.', ''); - // Set cookies that bypass age gates - await page.setCookie({ - name: 'age_gate_passed', - value: 'true', - domain: `.${domain}`, - path: '/', - expires: Date.now() / 1000 + 365 * 24 * 60 * 60, // 1 year - httpOnly: false, - secure: false, - sameSite: 'Lax' - }, { - name: 'selected_state', - value: state, - domain: `.${domain}`, - path: '/', - expires: Date.now() / 1000 + 365 * 24 * 60 * 60, // 1 year - httpOnly: false, - secure: false, - sameSite: 'Lax' - }, { - name: 'age_verified', - value: 'true', - domain: `.${domain}`, - path: '/', - expires: Date.now() / 1000 + 365 * 24 * 60 * 60, - 
httpOnly: false, - secure: false, - sameSite: 'Lax' - }); - logger_1.logger.info('age-gate', `Set age gate bypass cookies for ${domain} (state: ${state})`); - } - catch (err) { - logger_1.logger.warn('age-gate', `Failed to set age gate cookies: ${err}`); - } -} -/** - * Detects if a page has an age verification gate - */ -async function hasAgeGate(page) { - return await page.evaluate(() => { - const bodyText = document.body.textContent || ''; - const hasAgeVerification = bodyText.includes('age verification') || - bodyText.includes('Please select your state') || - bodyText.includes('are you 21') || - bodyText.includes('are you 18') || - bodyText.includes('Enter your date of birth') || - bodyText.toLowerCase().includes('verify'); - return hasAgeVerification; - }); -} -/** - * Attempts to bypass an age gate by selecting the appropriate state - * Works with multiple age gate patterns used by cannabis dispensaries - * - * @param page - Puppeteer page object - * @param state - State to select (e.g., 'Arizona', 'California'). 
Defaults to 'Arizona' - * @returns Promise - true if bypass was attempted, false if no age gate found - */ -async function bypassAgeGate(page, state = 'Arizona', useSavedCookies = true) { - try { - const hasGate = await hasAgeGate(page); - if (!hasGate) { - logger_1.logger.info('age-gate', 'No age gate detected'); - return false; - } - logger_1.logger.info('age-gate', `Age gate detected - attempting to bypass with state: ${state}...`); - // Wait a bit for React components to fully render - await page.waitForTimeout(2000); - // Try Method 0: Custom dropdown button (shadcn/radix style - Curaleaf) - let customDropdownWorked = false; - try { - // Click button to open dropdown - const dropdownButton = await page.$('button#state, button[id="state"]'); - if (dropdownButton) { - logger_1.logger.info('age-gate', 'Found state dropdown button, clicking...'); - await dropdownButton.click(); - await page.waitForTimeout(800); - // Click the state option and trigger React events - const stateClicked = await page.evaluate((selectedState) => { - const options = Array.from(document.querySelectorAll('[role="option"]')); - const stateOption = options.find(el => el.textContent?.toLowerCase() === selectedState.toLowerCase()); - if (stateOption instanceof HTMLElement) { - // Trigger multiple events that React might be listening for - stateOption.dispatchEvent(new MouseEvent('mousedown', { bubbles: true })); - stateOption.dispatchEvent(new MouseEvent('mouseup', { bubbles: true })); - stateOption.click(); - stateOption.dispatchEvent(new MouseEvent('click', { bubbles: true })); - stateOption.dispatchEvent(new Event('change', { bubbles: true })); - stateOption.dispatchEvent(new Event('input', { bubbles: true })); - return true; - } - return false; - }, state); - if (stateClicked) { - logger_1.logger.info('age-gate', `Clicked ${state} option with React events`); - await page.waitForTimeout(1000); - // Look for and click any submit/continue button that appeared - const submitClicked = await 
page.evaluate(() => { - const buttons = Array.from(document.querySelectorAll('button, [role="button"], a')); - const submitBtn = buttons.find(el => { - const text = el.textContent?.toLowerCase() || ''; - const ariaLabel = el.getAttribute('aria-label')?.toLowerCase() || ''; - return text.includes('continue') || text.includes('submit') || - text.includes('enter') || text.includes('confirm') || - ariaLabel.includes('continue') || ariaLabel.includes('submit'); - }); - if (submitBtn instanceof HTMLElement && submitBtn.offsetParent !== null) { - submitBtn.click(); - return true; - } - return false; - }); - if (submitClicked) { - logger_1.logger.info('age-gate', `Found and clicked submit button`); - } - customDropdownWorked = true; - } - } - } - catch (e) { - logger_1.logger.warn('age-gate', `Dropdown method failed: ${e}`); - } - // Try Method 1: Dropdown select - const selectFound = await page.evaluate((selectedState) => { - const selects = Array.from(document.querySelectorAll('select')); - for (const select of selects) { - const options = Array.from(select.options); - const stateOption = options.find(opt => opt.text.toLowerCase().includes(selectedState.toLowerCase()) || - opt.value.toLowerCase().includes(selectedState.toLowerCase())); - if (stateOption) { - select.value = stateOption.value; - select.dispatchEvent(new Event('change', { bubbles: true })); - select.dispatchEvent(new Event('input', { bubbles: true })); - return true; - } - } - return false; - }, state); - // Try Method 2: State button/card (click state, then click confirm) - let stateClicked = false; - if (!selectFound) { - stateClicked = await page.evaluate((selectedState) => { - const allElements = Array.from(document.querySelectorAll('button, a, div, span, [role="button"], [class*="state"], [class*="State"], [class*="card"], [class*="option"]')); - const stateButton = allElements.find(el => el.textContent?.toLowerCase().includes(selectedState.toLowerCase())); - if (stateButton instanceof HTMLElement) { - 
stateButton.click(); - return true; - } - return false; - }, state); - if (stateClicked) { - // Wait for confirm button to appear and click it - await page.waitForTimeout(1000); - await page.evaluate(() => { - const confirmBtns = Array.from(document.querySelectorAll('button, a, [role="button"]')); - const confirmBtn = confirmBtns.find(el => { - const text = el.textContent?.toLowerCase() || ''; - return text.includes('enter') || text.includes('continue') || text.includes('yes') || text.includes('confirm'); - }); - if (confirmBtn instanceof HTMLElement) { - confirmBtn.click(); - } - }); - } - } - // Try Method 3: Direct "Yes" or age confirmation button - const yesClicked = await page.evaluate(() => { - const confirmButtons = Array.from(document.querySelectorAll('button, a, [role="button"]')); - const yesButton = confirmButtons.find(el => { - const text = el.textContent?.toLowerCase() || ''; - return text.includes('yes') || - text.includes('i am 21') || - text.includes('i am 18') || - text.includes('enter the site') || - text.includes('enter') || - text.includes('continue'); - }); - if (yesButton instanceof HTMLElement) { - yesButton.click(); - return true; - } - return false; - }); - const bypassed = customDropdownWorked || selectFound || stateClicked || yesClicked; - if (bypassed) { - // Wait for navigation to complete after clicking age gate button - logger_1.logger.info('age-gate', `Waiting for navigation after age gate bypass...`); - try { - await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 15000 }); - } - catch (e) { - // Navigation might not trigger, that's ok - wait a bit anyway - await page.waitForTimeout(3000); - } - // Give the page extra time to load content - await page.waitForTimeout(3000); - // Verify we actually bypassed by checking the URL - const finalUrl = page.url(); - if (finalUrl.includes('/age-gate')) { - logger_1.logger.error('age-gate', `❌ Age gate bypass failed - still at age gate URL: ${finalUrl}`); - return false; - } - 
logger_1.logger.info('age-gate', `✅ Age gate bypass completed - now at: ${finalUrl}`); - return true; - } - else { - logger_1.logger.warn('age-gate', `Could not find ${state} option or confirmation button in age gate`); - return false; - } - } - catch (err) { - logger_1.logger.error('age-gate', `Error bypassing age gate: ${err}`); - return false; - } -} -/** - * Helper to detect the state from a store URL - * @param url - Store URL - * @returns State name (e.g., 'Arizona', 'California') - */ -function detectStateFromUrl(url) { - const stateMap = { - '-az-': 'Arizona', - '-ca-': 'California', - '-co-': 'Colorado', - '-fl-': 'Florida', - '-il-': 'Illinois', - '-ma-': 'Massachusetts', - '-mi-': 'Michigan', - '-nv-': 'Nevada', - '-nj-': 'New Jersey', - '-ny-': 'New York', - '-or-': 'Oregon', - '-pa-': 'Pennsylvania', - '-wa-': 'Washington', - }; - for (const [pattern, stateName] of Object.entries(stateMap)) { - if (url.toLowerCase().includes(pattern)) { - return stateName; - } - } - // Default to Arizona if state not detected - return 'Arizona'; -} diff --git a/backend/dist/utils/image-storage.js b/backend/dist/utils/image-storage.js deleted file mode 100644 index 8f346232..00000000 --- a/backend/dist/utils/image-storage.js +++ /dev/null @@ -1,296 +0,0 @@ -"use strict"; -/** - * Local Image Storage Utility - * - * Downloads and stores product images to local filesystem. - * Replaces MinIO-based storage with simple local file storage. - * - * Directory structure: - * /images/products//.webp - * /images/products//-thumb.webp - * /images/products//-medium.webp - * /images/brands/.webp - */ -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? 
!m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.downloadProductImage = downloadProductImage; -exports.downloadBrandLogo = downloadBrandLogo; -exports.imageExists = imageExists; -exports.deleteProductImages = deleteProductImages; -exports.initializeImageStorage = initializeImageStorage; -exports.getStorageStats = getStorageStats; -const axios_1 = __importDefault(require("axios")); -const sharp_1 = __importDefault(require("sharp")); -const fs = __importStar(require("fs/promises")); -const path = __importStar(require("path")); -const crypto_1 = require("crypto"); -// Base path for image storage - configurable via env -const IMAGES_BASE_PATH = process.env.IMAGES_PATH || '/app/public/images'; -// Public URL base for serving images -const IMAGES_PUBLIC_URL = process.env.IMAGES_PUBLIC_URL || '/images'; -/** - * Ensure a directory exists - */ -async function ensureDir(dirPath) { - try { - await fs.mkdir(dirPath, { recursive: true }); - } - catch (error) { - if (error.code !== 'EEXIST') - throw error; - } -} -/** - * Generate a short hash from a URL for deduplication - */ -function hashUrl(url) { - return (0, crypto_1.createHash)('md5').update(url).digest('hex').substring(0, 8); -} -/** - * Download an image from a URL and return the buffer - */ -async function downloadImage(imageUrl) { - const response = await axios_1.default.get(imageUrl, { - responseType: 'arraybuffer', - timeout: 30000, - headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8', - }, - }); - return Buffer.from(response.data); -} -/** - * Process and save image in multiple sizes - * Returns the file paths relative to IMAGES_BASE_PATH - */ -async function processAndSaveImage(buffer, outputDir, baseFilename) { - await ensureDir(outputDir); - const fullPath = path.join(outputDir, `${baseFilename}.webp`); - const mediumPath = path.join(outputDir, 
`${baseFilename}-medium.webp`); - const thumbPath = path.join(outputDir, `${baseFilename}-thumb.webp`); - // Process images in parallel - const [fullBuffer, mediumBuffer, thumbBuffer] = await Promise.all([ - // Full: max 1200x1200, high quality - (0, sharp_1.default)(buffer) - .resize(1200, 1200, { fit: 'inside', withoutEnlargement: true }) - .webp({ quality: 85 }) - .toBuffer(), - // Medium: 600x600 - (0, sharp_1.default)(buffer) - .resize(600, 600, { fit: 'inside', withoutEnlargement: true }) - .webp({ quality: 80 }) - .toBuffer(), - // Thumb: 200x200 - (0, sharp_1.default)(buffer) - .resize(200, 200, { fit: 'inside', withoutEnlargement: true }) - .webp({ quality: 75 }) - .toBuffer(), - ]); - // Save all sizes - await Promise.all([ - fs.writeFile(fullPath, fullBuffer), - fs.writeFile(mediumPath, mediumBuffer), - fs.writeFile(thumbPath, thumbBuffer), - ]); - const totalBytes = fullBuffer.length + mediumBuffer.length + thumbBuffer.length; - return { - full: fullPath, - medium: mediumPath, - thumb: thumbPath, - totalBytes, - }; -} -/** - * Convert a file path to a public URL - */ -function pathToUrl(filePath) { - const relativePath = filePath.replace(IMAGES_BASE_PATH, ''); - return `${IMAGES_PUBLIC_URL}${relativePath}`; -} -/** - * Download and store a product image locally - * - * @param imageUrl - The third-party image URL to download - * @param dispensaryId - The dispensary ID (for directory organization) - * @param productId - The product ID or external ID (for filename) - * @returns Download result with local URLs - */ -async function downloadProductImage(imageUrl, dispensaryId, productId) { - try { - if (!imageUrl) { - return { success: false, error: 'No image URL provided' }; - } - // Download the image - const buffer = await downloadImage(imageUrl); - // Organize by dispensary ID - const outputDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId)); - // Use product ID + URL hash for uniqueness - const urlHash = hashUrl(imageUrl); - const 
baseFilename = `${productId}-${urlHash}`; - // Process and save - const result = await processAndSaveImage(buffer, outputDir, baseFilename); - return { - success: true, - urls: { - full: pathToUrl(result.full), - medium: pathToUrl(result.medium), - thumb: pathToUrl(result.thumb), - }, - bytesDownloaded: result.totalBytes, - }; - } - catch (error) { - return { - success: false, - error: error.message || 'Failed to download image', - }; - } -} -/** - * Download and store a brand logo locally - * - * @param logoUrl - The brand logo URL - * @param brandId - The brand ID or slug - * @returns Download result with local URL - */ -async function downloadBrandLogo(logoUrl, brandId) { - try { - if (!logoUrl) { - return { success: false, error: 'No logo URL provided' }; - } - // Download the image - const buffer = await downloadImage(logoUrl); - // Brand logos go in /images/brands/ - const outputDir = path.join(IMAGES_BASE_PATH, 'brands'); - // Sanitize brand ID for filename - const safeBrandId = brandId.replace(/[^a-zA-Z0-9-_]/g, '_'); - const urlHash = hashUrl(logoUrl); - const baseFilename = `${safeBrandId}-${urlHash}`; - // Process and save (single size for logos) - await ensureDir(outputDir); - const logoPath = path.join(outputDir, `${baseFilename}.webp`); - const logoBuffer = await (0, sharp_1.default)(buffer) - .resize(400, 400, { fit: 'inside', withoutEnlargement: true }) - .webp({ quality: 85 }) - .toBuffer(); - await fs.writeFile(logoPath, logoBuffer); - return { - success: true, - urls: { - full: pathToUrl(logoPath), - medium: pathToUrl(logoPath), - thumb: pathToUrl(logoPath), - }, - bytesDownloaded: logoBuffer.length, - }; - } - catch (error) { - return { - success: false, - error: error.message || 'Failed to download brand logo', - }; - } -} -/** - * Check if a local image already exists - */ -async function imageExists(dispensaryId, productId, imageUrl) { - const urlHash = hashUrl(imageUrl); - const imagePath = path.join(IMAGES_BASE_PATH, 'products', 
String(dispensaryId), `${productId}-${urlHash}.webp`); - try { - await fs.access(imagePath); - return true; - } - catch { - return false; - } -} -/** - * Delete a product's local images - */ -async function deleteProductImages(dispensaryId, productId, imageUrl) { - const productDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId)); - const prefix = imageUrl - ? `${productId}-${hashUrl(imageUrl)}` - : String(productId); - try { - const files = await fs.readdir(productDir); - const toDelete = files.filter(f => f.startsWith(prefix)); - await Promise.all(toDelete.map(f => fs.unlink(path.join(productDir, f)))); - } - catch { - // Directory might not exist, that's fine - } -} -/** - * Initialize the image storage directories - */ -async function initializeImageStorage() { - await ensureDir(path.join(IMAGES_BASE_PATH, 'products')); - await ensureDir(path.join(IMAGES_BASE_PATH, 'brands')); - console.log(`✅ Image storage initialized at ${IMAGES_BASE_PATH}`); -} -/** - * Get storage stats - */ -async function getStorageStats() { - const productsDir = path.join(IMAGES_BASE_PATH, 'products'); - const brandsDir = path.join(IMAGES_BASE_PATH, 'brands'); - let productCount = 0; - let brandCount = 0; - try { - const productDirs = await fs.readdir(productsDir); - for (const dir of productDirs) { - const files = await fs.readdir(path.join(productsDir, dir)); - productCount += files.filter(f => f.endsWith('.webp') && !f.includes('-')).length; - } - } - catch { /* ignore */ } - try { - const brandFiles = await fs.readdir(brandsDir); - brandCount = brandFiles.filter(f => f.endsWith('.webp')).length; - } - catch { /* ignore */ } - return { - productsDir, - brandsDir, - productCount, - brandCount, - }; -} diff --git a/backend/dist/utils/minio.js b/backend/dist/utils/minio.js deleted file mode 100644 index 552cdffb..00000000 --- a/backend/dist/utils/minio.js +++ /dev/null @@ -1,262 +0,0 @@ -"use strict"; -var __createBinding = (this && this.__createBinding) || (Object.create 
? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.isMinioEnabled = isMinioEnabled; -exports.initializeMinio = initializeMinio; -exports.uploadImageFromUrl = uploadImageFromUrl; -exports.getImageUrl = getImageUrl; -exports.deleteImage = deleteImage; -exports.minioClient = getMinioClient; -const Minio = __importStar(require("minio")); -const axios_1 = __importDefault(require("axios")); -const uuid_1 = require("uuid"); -const sharp_1 = __importDefault(require("sharp")); -const fs = __importStar(require("fs/promises")); -const path = __importStar(require("path")); -let minioClient = null; -// Check if MinIO is configured -function isMinioEnabled() { - return !!process.env.MINIO_ENDPOINT; -} -// Local storage path for images when MinIO is not configured -const LOCAL_IMAGES_PATH = process.env.LOCAL_IMAGES_PATH || '/app/public/images'; -function getMinioClient() { - if (!minioClient) { - minioClient = new Minio.Client({ - endPoint: process.env.MINIO_ENDPOINT || 'minio', - port: parseInt(process.env.MINIO_PORT || '9000'), - useSSL: process.env.MINIO_USE_SSL === 'true', - accessKey: process.env.MINIO_ACCESS_KEY || 'minioadmin', - secretKey: process.env.MINIO_SECRET_KEY || 'minioadmin', - }); - } - return minioClient; -} -const BUCKET_NAME = process.env.MINIO_BUCKET || 'dutchie'; -async function initializeMinio() { - // Skip MinIO initialization if not configured - if (!isMinioEnabled()) { - console.log('ℹ️ MinIO not configured (MINIO_ENDPOINT not set), using local filesystem storage'); - // Ensure local images directory exists - try { - await fs.mkdir(LOCAL_IMAGES_PATH, { recursive: true }); - await fs.mkdir(path.join(LOCAL_IMAGES_PATH, 'products'), { recursive: true }); - console.log(`✅ Local images directory ready: ${LOCAL_IMAGES_PATH}`); - } - catch (error) { - console.error('❌ Failed to create local images directory:', error); - throw error; - } - return; - } - try { - const client = getMinioClient(); - // Check if 
bucket exists - const exists = await client.bucketExists(BUCKET_NAME); - if (!exists) { - // Create bucket - await client.makeBucket(BUCKET_NAME, 'us-east-1'); - console.log(`✅ Minio bucket created: ${BUCKET_NAME}`); - // Set public read policy - const policy = { - Version: '2012-10-17', - Statement: [ - { - Effect: 'Allow', - Principal: { AWS: ['*'] }, - Action: ['s3:GetObject'], - Resource: [`arn:aws:s3:::${BUCKET_NAME}/*`], - }, - ], - }; - await client.setBucketPolicy(BUCKET_NAME, JSON.stringify(policy)); - console.log(`✅ Bucket policy set to public read`); - } - else { - console.log(`✅ Minio bucket already exists: ${BUCKET_NAME}`); - } - } - catch (error) { - console.error('❌ Minio initialization error:', error); - throw error; - } -} -async function removeBackground(buffer) { - try { - // Get image metadata to check if it has an alpha channel - const metadata = await (0, sharp_1.default)(buffer).metadata(); - // If image already has transparency, trim and optimize it - if (metadata.hasAlpha) { - return await (0, sharp_1.default)(buffer) - .trim() // Remove transparent borders - .toBuffer(); - } - // For images without alpha (like JPEGs with solid backgrounds), - // we'll use a threshold-based approach to detect and remove solid backgrounds - // This works well for product images on solid color backgrounds - // Convert to PNG with alpha channel, then flatten with transparency - const withAlpha = await (0, sharp_1.default)(buffer) - .ensureAlpha() // Add alpha channel - .toBuffer(); - // Use threshold to make similar colors transparent (targets solid backgrounds) - // This is a simple approach - for better results, use remove.bg API or ML models - return await (0, sharp_1.default)(withAlpha) - .flatten({ background: { r: 0, g: 0, b: 0, alpha: 0 } }) - .trim() - .toBuffer(); - } - catch (error) { - console.warn('Background removal failed, using original image:', error); - return buffer; - } -} -async function uploadToLocalFilesystem(thumbnailBuffer, 
mediumBuffer, fullBuffer, baseFilename) { - const thumbnailPath = `${baseFilename}-thumb.png`; - const mediumPath = `${baseFilename}-medium.png`; - const fullPath = `${baseFilename}-full.png`; - // Ensure the target directory exists (in case initializeMinio wasn't called) - // Extract directory from baseFilename (e.g., 'products/store-slug' or just 'products') - const targetDir = path.join(LOCAL_IMAGES_PATH, path.dirname(baseFilename)); - await fs.mkdir(targetDir, { recursive: true }); - await Promise.all([ - fs.writeFile(path.join(LOCAL_IMAGES_PATH, thumbnailPath), thumbnailBuffer), - fs.writeFile(path.join(LOCAL_IMAGES_PATH, mediumPath), mediumBuffer), - fs.writeFile(path.join(LOCAL_IMAGES_PATH, fullPath), fullBuffer), - ]); - return { - thumbnail: thumbnailPath, - medium: mediumPath, - full: fullPath, - }; -} -async function uploadToMinio(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename) { - const client = getMinioClient(); - const thumbnailPath = `${baseFilename}-thumb.png`; - const mediumPath = `${baseFilename}-medium.png`; - const fullPath = `${baseFilename}-full.png`; - await Promise.all([ - client.putObject(BUCKET_NAME, thumbnailPath, thumbnailBuffer, thumbnailBuffer.length, { - 'Content-Type': 'image/png', - }), - client.putObject(BUCKET_NAME, mediumPath, mediumBuffer, mediumBuffer.length, { - 'Content-Type': 'image/png', - }), - client.putObject(BUCKET_NAME, fullPath, fullBuffer, fullBuffer.length, { - 'Content-Type': 'image/png', - }), - ]); - return { - thumbnail: thumbnailPath, - medium: mediumPath, - full: fullPath, - }; -} -async function uploadImageFromUrl(imageUrl, productId, storeSlug, removeBackgrounds = true) { - try { - // Download image - const response = await axios_1.default.get(imageUrl, { responseType: 'arraybuffer' }); - let buffer = Buffer.from(response.data); - // Remove background if enabled - if (removeBackgrounds) { - buffer = await removeBackground(buffer); - } - // Generate unique base filename - organize by store if slug 
provided - const storeDir = storeSlug ? `products/${storeSlug}` : 'products'; - const baseFilename = `${storeDir}/${productId}-${(0, uuid_1.v4)()}`; - // Create multiple sizes with Sharp and convert to WebP/PNG for better compression - // Use PNG for images with transparency - const [thumbnailBuffer, mediumBuffer, fullBuffer] = await Promise.all([ - // Thumbnail: 300x300 - (0, sharp_1.default)(buffer) - .resize(300, 300, { fit: 'inside', background: { r: 0, g: 0, b: 0, alpha: 0 } }) - .png({ quality: 80, compressionLevel: 9 }) - .toBuffer(), - // Medium: 800x800 - (0, sharp_1.default)(buffer) - .resize(800, 800, { fit: 'inside', background: { r: 0, g: 0, b: 0, alpha: 0 } }) - .png({ quality: 85, compressionLevel: 9 }) - .toBuffer(), - // Full: 2000x2000 (optimized) - (0, sharp_1.default)(buffer) - .resize(2000, 2000, { fit: 'inside', withoutEnlargement: true, background: { r: 0, g: 0, b: 0, alpha: 0 } }) - .png({ quality: 90, compressionLevel: 9 }) - .toBuffer(), - ]); - // Upload to appropriate storage backend - let result; - if (isMinioEnabled()) { - result = await uploadToMinio(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename); - } - else { - result = await uploadToLocalFilesystem(thumbnailBuffer, mediumBuffer, fullBuffer, baseFilename); - } - console.log(`✅ Uploaded 3 sizes for product ${productId}: ${thumbnailBuffer.length + mediumBuffer.length + fullBuffer.length} bytes total`); - return result; - } - catch (error) { - console.error('Error uploading image:', error); - throw error; - } -} -function getImageUrl(imagePath) { - if (isMinioEnabled()) { - // Use MinIO endpoint for browser access - const endpoint = process.env.MINIO_PUBLIC_ENDPOINT || 'http://localhost:9020'; - return `${endpoint}/${BUCKET_NAME}/${imagePath}`; - } - else { - // Use local path - served via Express static middleware - const publicUrl = process.env.PUBLIC_URL || ''; - return `${publicUrl}/images/${imagePath}`; - } -} -async function deleteImage(imagePath) { - try { - if 
(isMinioEnabled()) { - const client = getMinioClient(); - await client.removeObject(BUCKET_NAME, imagePath); - } - else { - const fullPath = path.join(LOCAL_IMAGES_PATH, imagePath); - await fs.unlink(fullPath); - } - } - catch (error) { - console.error('Error deleting image:', error); - } -} diff --git a/backend/dist/utils/product-normalizer.js b/backend/dist/utils/product-normalizer.js deleted file mode 100644 index 6d98adcd..00000000 --- a/backend/dist/utils/product-normalizer.js +++ /dev/null @@ -1,181 +0,0 @@ -"use strict"; -/** - * Product Normalizer Utility - * - * Functions for normalizing product data to enable consistent matching - * and prevent duplicate product entries. - */ -Object.defineProperty(exports, "__esModule", { value: true }); -exports.normalizeProductName = normalizeProductName; -exports.normalizeBrandName = normalizeBrandName; -exports.normalizeWeight = normalizeWeight; -exports.generateProductFingerprint = generateProductFingerprint; -exports.stringSimilarity = stringSimilarity; -exports.areProductsSimilar = areProductsSimilar; -/** - * Normalize product name for matching - * - Lowercase - * - Remove punctuation - * - Remove THC/CBD percentages often appended to names - * - Remove weight suffixes - * - Remove emoji - * - Normalize whitespace - */ -function normalizeProductName(name) { - if (!name) - return ''; - return name - .toLowerCase() - .trim() - // Remove special characters except alphanumeric and spaces - .replace(/[^\w\s]/g, ' ') - // Remove common suffixes like THC/CBD percentages appended to names - .replace(/\s*(thc|cbd|cbg|cbn|tac)\s*[:=]?\s*[\d.]+\s*%?/gi, '') - // Remove weight/size suffixes often appended - .replace(/\s*\d+(\.\d+)?\s*(mg|g|oz|ml|gram|grams|ounce|ounces)\b/gi, '') - // Remove emoji - .replace(/[\u{1F300}-\u{1F9FF}]/gu, '') - // Remove "special offer" type suffixes - .replace(/\s*special\s*offer\s*/gi, '') - // Normalize multiple spaces to single space - .replace(/\s+/g, ' ') - .trim(); -} -/** - * Normalize 
brand name for matching - */ -function normalizeBrandName(brand) { - if (!brand) - return ''; - return brand - .toLowerCase() - .trim() - // Remove special characters - .replace(/[^\w\s]/g, ' ') - // Normalize whitespace - .replace(/\s+/g, ' ') - .trim(); -} -/** - * Normalize weight string to standard format - * e.g., "3.5 grams" -> "3.5g", "1/8 oz" -> "3.5g" - */ -function normalizeWeight(weight) { - if (!weight) - return ''; - const w = weight.toLowerCase().trim(); - // Handle fractional ounces - if (w.includes('1/8') || w.includes('eighth')) { - return '3.5g'; - } - if (w.includes('1/4') || w.includes('quarter')) { - return '7g'; - } - if (w.includes('1/2') || w.includes('half')) { - return '14g'; - } - if (w.includes('1 oz') || w === 'oz' || w === '1oz') { - return '28g'; - } - // Extract numeric value and unit - const match = w.match(/([\d.]+)\s*(mg|g|oz|ml|gram|grams?|ounce|ounces?)?/i); - if (!match) - return w; - const value = parseFloat(match[1]); - let unit = (match[2] || 'g').toLowerCase(); - // Normalize unit names - unit = unit.replace(/gram(s)?/, 'g').replace(/ounce(s)?/, 'oz'); - // Convert oz to grams for consistency - if (unit === 'oz') { - return `${(value * 28).toFixed(1)}g`; - } - return `${value}${unit}`; -} -/** - * Generate a matching fingerprint for a product - * Used for deduplication - */ -function generateProductFingerprint(name, brand, weight, categoryId) { - const parts = [ - normalizeProductName(name), - normalizeBrandName(brand), - normalizeWeight(weight), - categoryId?.toString() || '' - ]; - return parts.filter(Boolean).join('|'); -} -/** - * Calculate similarity between two strings (0-100) - * Uses Levenshtein distance - */ -function stringSimilarity(str1, str2) { - if (str1 === str2) - return 100; - if (!str1 || !str2) - return 0; - const s1 = str1.toLowerCase(); - const s2 = str2.toLowerCase(); - if (s1 === s2) - return 100; - const longer = s1.length > s2.length ? s1 : s2; - const shorter = s1.length > s2.length ? 
s2 : s1; - const longerLength = longer.length; - if (longerLength === 0) - return 100; - const distance = levenshteinDistance(longer, shorter); - return Math.round(((longerLength - distance) / longerLength) * 100); -} -/** - * Levenshtein distance between two strings - */ -function levenshteinDistance(str1, str2) { - const m = str1.length; - const n = str2.length; - // Create distance matrix - const dp = Array(m + 1).fill(null).map(() => Array(n + 1).fill(0)); - // Initialize first row and column - for (let i = 0; i <= m; i++) - dp[i][0] = i; - for (let j = 0; j <= n; j++) - dp[0][j] = j; - // Fill in the rest - for (let i = 1; i <= m; i++) { - for (let j = 1; j <= n; j++) { - const cost = str1[i - 1] === str2[j - 1] ? 0 : 1; - dp[i][j] = Math.min(dp[i - 1][j] + 1, // deletion - dp[i][j - 1] + 1, // insertion - dp[i - 1][j - 1] + cost // substitution - ); - } - } - return dp[m][n]; -} -/** - * Check if two products are likely the same - * Returns confidence score (0-100) - */ -function areProductsSimilar(product1, product2, threshold = 92) { - const name1 = normalizeProductName(product1.name); - const name2 = normalizeProductName(product2.name); - const nameSimilarity = stringSimilarity(name1, name2); - // If names are very similar, likely same product - if (nameSimilarity >= threshold) { - return { isSimilar: true, confidence: nameSimilarity }; - } - // Check brand match for additional confidence - const brand1 = normalizeBrandName(product1.brand); - const brand2 = normalizeBrandName(product2.brand); - if (brand1 && brand2 && brand1 === brand2) { - // Same brand, lower threshold for name match - if (nameSimilarity >= threshold - 10) { - return { isSimilar: true, confidence: nameSimilarity + 5 }; - } - } - // Check weight match - const weight1 = normalizeWeight(product1.weight); - const weight2 = normalizeWeight(product2.weight); - if (weight1 && weight2 && weight1 === weight2 && nameSimilarity >= threshold - 15) { - return { isSimilar: true, confidence: 
nameSimilarity + 3 }; - } - return { isSimilar: false, confidence: nameSimilarity }; -} diff --git a/backend/dist/utils/proxyManager.js b/backend/dist/utils/proxyManager.js deleted file mode 100644 index 688939b4..00000000 --- a/backend/dist/utils/proxyManager.js +++ /dev/null @@ -1,112 +0,0 @@ -"use strict"; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.getProxy = getProxy; -exports.getPhoenixProxy = getPhoenixProxy; -exports.getStateProxy = getStateProxy; -exports.getCityProxy = getCityProxy; -exports.getRandomProxy = getRandomProxy; -exports.getProxyLocationStats = getProxyLocationStats; -const migrate_1 = require("../db/migrate"); -const logger_1 = require("../services/logger"); -/** - * Get an active proxy from the database, optionally filtered by location - */ -async function getProxy(locationFilter) { - try { - let query = ` - SELECT protocol, host, port, username, password - FROM proxies - WHERE active = true - `; - const params = []; - let paramIndex = 1; - if (locationFilter) { - if (locationFilter.city) { - query += ` AND LOWER(city) = LOWER($${paramIndex})`; - params.push(locationFilter.city); - paramIndex++; - } - if (locationFilter.state) { - query += ` AND LOWER(state) = LOWER($${paramIndex})`; - params.push(locationFilter.state); - paramIndex++; - } - if (locationFilter.country) { - query += ` AND LOWER(country) = LOWER($${paramIndex})`; - params.push(locationFilter.country); - paramIndex++; - } - if (locationFilter.countryCode) { - query += ` AND LOWER(country_code) = LOWER($${paramIndex})`; - params.push(locationFilter.countryCode); - paramIndex++; - } - } - // Use RANDOM() for true randomization instead of least recently used - query += ` ORDER BY RANDOM() LIMIT 1`; - const result = await migrate_1.pool.query(query, params); - if (result.rows.length === 0) { - logger_1.logger.warn('proxy', `No active proxies found with filter: ${JSON.stringify(locationFilter)}`); - return null; - } - const proxy = result.rows[0]; - 
return { - server: `${proxy.protocol}://${proxy.host}:${proxy.port}`, - username: proxy.username || undefined, - password: proxy.password || undefined, - }; - } - catch (error) { - logger_1.logger.error('proxy', `Error fetching proxy: ${error}`); - return null; - } -} -/** - * Get a proxy from Phoenix, AZ, USA (ideal for Arizona dispensaries) - */ -async function getPhoenixProxy() { - return getProxy({ city: 'Phoenix', state: 'Arizona', country: 'United States' }); -} -/** - * Get a proxy from a specific US state - */ -async function getStateProxy(state) { - return getProxy({ state, country: 'United States' }); -} -/** - * Get a proxy from a specific city - */ -async function getCityProxy(city, state) { - return getProxy({ city, state }); -} -/** - * Get a random active proxy (no location filter) - */ -async function getRandomProxy() { - return getProxy(); -} -/** - * Get proxy location statistics - */ -async function getProxyLocationStats() { - try { - const result = await migrate_1.pool.query(` - SELECT - country, - state, - city, - COUNT(*) as count, - SUM(CASE WHEN active THEN 1 ELSE 0 END) as active_count - FROM proxies - WHERE country IS NOT NULL - GROUP BY country, state, city - ORDER BY count DESC - LIMIT 50 - `); - return result.rows; - } - catch (error) { - logger_1.logger.error('proxy', `Error fetching proxy stats: ${error}`); - return []; - } -} diff --git a/backend/dist/utils/stealthBrowser.js b/backend/dist/utils/stealthBrowser.js deleted file mode 100644 index c6161cac..00000000 --- a/backend/dist/utils/stealthBrowser.js +++ /dev/null @@ -1,264 +0,0 @@ -"use strict"; -var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - var desc = Object.getOwnPropertyDescriptor(m, k); - if (!desc || ("get" in desc ? 
!m.__esModule : desc.writable || desc.configurable)) { - desc = { enumerable: true, get: function() { return m[k]; } }; - } - Object.defineProperty(o, k2, desc); -}) : (function(o, m, k, k2) { - if (k2 === undefined) k2 = k; - o[k2] = m[k]; -})); -var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { - Object.defineProperty(o, "default", { enumerable: true, value: v }); -}) : function(o, v) { - o["default"] = v; -}); -var __importStar = (this && this.__importStar) || (function () { - var ownKeys = function(o) { - ownKeys = Object.getOwnPropertyNames || function (o) { - var ar = []; - for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; - return ar; - }; - return ownKeys(o); - }; - return function (mod) { - if (mod && mod.__esModule) return mod; - var result = {}; - if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); - __setModuleDefault(result, mod); - return result; - }; -})(); -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? 
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.createStealthBrowser = createStealthBrowser; -exports.createStealthContext = createStealthContext; -exports.randomDelay = randomDelay; -exports.humanMouseMove = humanMouseMove; -exports.humanScroll = humanScroll; -exports.humanType = humanType; -exports.simulateHumanBehavior = simulateHumanBehavior; -exports.waitForPageLoad = waitForPageLoad; -exports.isCloudflareChallenge = isCloudflareChallenge; -exports.waitForCloudflareChallenge = waitForCloudflareChallenge; -exports.saveCookies = saveCookies; -exports.loadCookies = loadCookies; -const playwright_extra_1 = require("playwright-extra"); -const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth")); -// Add stealth plugin -playwright_extra_1.chromium.use((0, puppeteer_extra_plugin_stealth_1.default)()); -/** - * Create a stealth browser instance with anti-detection measures - */ -async function createStealthBrowser(options = {}) { - const launchOptions = { - headless: options.headless !== false, - args: [ - '--disable-blink-features=AutomationControlled', - '--disable-features=IsolateOrigins,site-per-process', - '--disable-web-security', - '--disable-features=VizDisplayCompositor', - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-accelerated-2d-canvas', - '--no-first-run', - '--no-zygote', - '--disable-gpu', - ], - }; - if (options.proxy) { - launchOptions.proxy = options.proxy; - } - const browser = await playwright_extra_1.chromium.launch(launchOptions); - return browser; -} -/** - * Create a stealth context with realistic browser fingerprint - */ -async function createStealthContext(browser, options = {}) { - const userAgent = options.userAgent || - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'; - const context = await browser.newContext({ - userAgent, - viewport: 
{ width: 1920, height: 1080 }, - locale: 'en-US', - timezoneId: 'America/Phoenix', - permissions: ['geolocation'], - geolocation: { latitude: 33.4484, longitude: -112.074 }, // Phoenix, AZ - colorScheme: 'light', - deviceScaleFactor: 1, - hasTouch: false, - isMobile: false, - javaScriptEnabled: true, - extraHTTPHeaders: { - 'Accept-Language': 'en-US,en;q=0.9', - 'Accept-Encoding': 'gzip, deflate, br', - Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-User': '?1', - 'Sec-Fetch-Dest': 'document', - 'Upgrade-Insecure-Requests': '1', - }, - }); - // Set age verification cookies for Dutchie - await context.addCookies([ - { - name: 'age_verified', - value: 'true', - domain: '.dutchie.com', - path: '/', - expires: Math.floor(Date.now() / 1000) + 86400 * 30, // 30 days - }, - { - name: 'initial_location', - value: JSON.stringify({ state: options.state || 'Arizona' }), - domain: '.dutchie.com', - path: '/', - expires: Math.floor(Date.now() / 1000) + 86400 * 30, - }, - ]); - return context; -} -/** - * Random delay between min and max milliseconds - */ -function randomDelay(min, max) { - const delay = Math.floor(Math.random() * (max - min + 1)) + min; - return new Promise((resolve) => setTimeout(resolve, delay)); -} -/** - * Simulate human-like mouse movement - */ -async function humanMouseMove(page, x, y) { - const steps = 20; - const currentPos = await page.evaluate(() => ({ x: 0, y: 0 })); - for (let i = 0; i <= steps; i++) { - const progress = i / steps; - const easeProgress = easeInOutQuad(progress); - const nextX = currentPos.x + (x - currentPos.x) * easeProgress; - const nextY = currentPos.y + (y - currentPos.y) * easeProgress; - await page.mouse.move(nextX, nextY); - await randomDelay(5, 15); - } -} -/** - * Easing function for smooth mouse movement - */ -function easeInOutQuad(t) { - return t < 0.5 ? 
2 * t * t : -1 + (4 - 2 * t) * t; -} -/** - * Simulate human-like scrolling - */ -async function humanScroll(page, scrollAmount = 500) { - const scrollSteps = 10; - const stepSize = scrollAmount / scrollSteps; - for (let i = 0; i < scrollSteps; i++) { - await page.mouse.wheel(0, stepSize); - await randomDelay(50, 150); - } -} -/** - * Simulate human-like typing - */ -async function humanType(page, selector, text) { - await page.click(selector); - await randomDelay(100, 300); - for (const char of text) { - await page.keyboard.type(char); - await randomDelay(50, 150); - } -} -/** - * Random realistic behavior before interacting with page - */ -async function simulateHumanBehavior(page) { - // Random small mouse movements - for (let i = 0; i < 3; i++) { - const x = Math.random() * 500 + 100; - const y = Math.random() * 300 + 100; - await humanMouseMove(page, x, y); - await randomDelay(200, 500); - } - // Small scroll - await humanScroll(page, 100); - await randomDelay(300, 700); -} -/** - * Wait for page to be fully loaded with human-like delay - */ -async function waitForPageLoad(page, timeout = 60000) { - try { - await page.waitForLoadState('networkidle', { timeout }); - await randomDelay(500, 1500); // Random delay after load - } - catch (error) { - // If networkidle times out, try domcontentloaded as fallback - console.log('⚠️ networkidle timeout, waiting for domcontentloaded...'); - await page.waitForLoadState('domcontentloaded', { timeout: 30000 }); - await randomDelay(1000, 2000); - } -} -/** - * Check if we're on a Cloudflare challenge page - */ -async function isCloudflareChallenge(page) { - const title = await page.title(); - const content = await page.content(); - return (title.includes('Cloudflare') || - title.includes('Just a moment') || - title.includes('Attention Required') || - content.includes('challenge-platform') || - content.includes('cf-challenge') || - content.includes('Checking your browser')); -} -/** - * Wait for Cloudflare challenge to 
complete - */ -async function waitForCloudflareChallenge(page, maxWaitMs = 60000) { - const startTime = Date.now(); - let attempts = 0; - while (Date.now() - startTime < maxWaitMs) { - attempts++; - if (!(await isCloudflareChallenge(page))) { - console.log(`✅ Cloudflare challenge passed after ${attempts} attempts (${Math.floor((Date.now() - startTime) / 1000)}s)`); - return true; - } - const remaining = Math.floor((maxWaitMs - (Date.now() - startTime)) / 1000); - console.log(`⏳ Waiting for Cloudflare challenge... (attempt ${attempts}, ${remaining}s remaining)`); - // Random delay between checks - await randomDelay(2000, 3000); - } - console.log('❌ Cloudflare challenge timeout - may need residential proxy or manual intervention'); - return false; -} -/** - * Save session cookies to file - */ -async function saveCookies(context, filepath) { - const cookies = await context.cookies(); - const fs = await Promise.resolve().then(() => __importStar(require('fs/promises'))); - await fs.writeFile(filepath, JSON.stringify(cookies, null, 2)); -} -/** - * Load session cookies from file - */ -async function loadCookies(context, filepath) { - try { - const fs = await Promise.resolve().then(() => __importStar(require('fs/promises'))); - const cookiesString = await fs.readFile(filepath, 'utf-8'); - const cookies = JSON.parse(cookiesString); - await context.addCookies(cookies); - return true; - } - catch (error) { - return false; - } -}