Files
cannaiq/backend/dist/services/category-discovery.js
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00

247 lines
11 KiB
JavaScript

"use strict";
// CommonJS/ES-module interop helper: a module flagged with `__esModule` is
// returned untouched; anything else is wrapped so that it can be read through
// a `.default` property. An existing `this.__importDefault` is reused if present.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.discoverCategories = discoverCategories;
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const migrate_1 = require("../db/migrate");
const logger_1 = require("./logger");
const age_gate_1 = require("../utils/age-gate");
const dutchie_1 = require("../scrapers/templates/dutchie");
// Register the stealth plugin with puppeteer-extra once, at module load,
// i.e. before discoverCategories() launches any browser.
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
// Predefined category layout written for stores detected as Dutchie menus.
// Entries without `parentSlug` are top-level; the others reference their
// parent by slug. Parents are listed before their children.
const DUTCHIE_CATEGORIES = [
    { name: 'Shop', slug: 'shop' },
    // Product categories nested under "Shop".
    ...[
        ['Flower', 'flower'],
        ['Pre-Rolls', 'pre-rolls'],
        ['Vaporizers', 'vaporizers'],
        ['Concentrates', 'concentrates'],
        ['Edibles', 'edibles'],
        ['Topicals', 'topicals'],
        ['Accessories', 'accessories'],
    ].map(([name, slug]) => ({ name, slug, parentSlug: 'shop' })),
    { name: 'Brands', slug: 'brands' },
    { name: 'Specials', slug: 'specials' },
];
// Predefined category layout used as the fallback for menus that are not
// detected as Dutchie. A single top-level "Shop" entry is followed by its
// children, each of which references the parent by slug.
const CURALEAF_CATEGORIES = [
    { name: 'Shop', slug: 'shop' },
    ...[
        ['Flower', 'flower'],
        ['Pre-Rolls', 'pre-rolls'],
        ['Vaporizers', 'vaporizers'],
        ['Concentrates', 'concentrates'],
        ['Edibles', 'edibles'],
        ['Tinctures', 'tinctures'],
        ['Topicals', 'topicals'],
        ['Capsules', 'capsules'],
        ['Accessories', 'accessories'],
    ].map(([name, slug]) => ({ name, slug, parentSlug: 'shop' })),
];
/**
 * Installs an init script on the page that overrides a few browser properties
 * before any document script runs.
 *
 * The callback is handed to `page.evaluateOnNewDocument`, so it must stay
 * self-contained: it may not reference anything from this module's scope.
 *
 * @param {object} page - Puppeteer page to patch.
 * @returns {Promise<void>}
 */
async function makePageStealthy(page) {
    const installOverrides = () => {
        // Each getter builds its value on every read.
        const spoofedGetters = [
            ['webdriver', () => false],
            ['plugins', () => [1, 2, 3, 4, 5]],
            ['languages', () => ['en-US', 'en']],
        ];
        for (const [property, get] of spoofedGetters) {
            Object.defineProperty(navigator, property, { get });
        }
        window.chrome = { runtime: {} };
    };
    await page.evaluateOnNewDocument(installOverrides);
}
/**
 * Reports whether the loaded page looks like a Dutchie-powered menu.
 *
 * Two signals are checked inside the page: a `window.reactEnv` object whose
 * admin/api/consumer URL contains "dutchie.com", and well-known marker
 * strings anywhere in the document's HTML.
 *
 * @param {object} page - Puppeteer page that has already been navigated.
 * @returns {Promise<boolean>} Result of the in-page probe; `false` if the
 *   probe throws (the failure is logged as a warning).
 */
async function isDutchieMenu(page) {
    // Runs in the browser context, so it must not close over module state.
    const probeForDutchie = () => {
        const reactEnv = window.reactEnv;
        if (reactEnv) {
            const pointsAtDutchie = [reactEnv.adminUrl, reactEnv.apiUrl, reactEnv.consumerUrl]
                .some((url) => url?.includes('dutchie.com'));
            if (pointsAtDutchie) {
                return true;
            }
        }
        const markup = document.documentElement.innerHTML;
        const markers = ['admin.dutchie.com', 'api.dutchie.com', 'embedded-menu', 'window.reactEnv'];
        return markers.some((marker) => markup.includes(marker));
    };
    try {
        return await page.evaluate(probeForDutchie);
    }
    catch (error) {
        logger_1.logger.warn('categories', `Error detecting Dutchie menu: ${error}`);
        return false;
    }
}
/**
 * Detects which menu platform a store uses and writes the matching predefined
 * category tree for it.
 *
 * Steps: load the store row, open its URL in a headless browser (with age-gate
 * cookies set up front), probe the page with isDutchieMenu(), close the
 * browser, then create either the Dutchie or the fallback category set.
 *
 * @param {number|string} storeId - Primary key of the row in `stores`.
 * @returns {Promise<void>}
 * @throws {Error} If the store does not exist, has no URL, or any browser or
 *   database step fails. The original failure is always the one rethrown.
 */
async function discoverCategories(storeId) {
    let browser = null;
    try {
        logger_1.logger.info('categories', `Discovering categories for store ID: ${storeId}`);
        const storeResult = await migrate_1.pool.query(`
            SELECT id, name, slug, dutchie_url
            FROM stores
            WHERE id = $1
        `, [storeId]);
        if (storeResult.rows.length === 0) {
            throw new Error('Store not found');
        }
        const store = storeResult.rows[0];
        const baseUrl = store.dutchie_url;
        // Fail before paying for a browser launch when there is nothing to load.
        if (!baseUrl) {
            throw new Error(`Store ${storeId} has no menu URL configured`);
        }
        // Launch browser to check page source
        browser = await puppeteer_extra_1.default.launch({
            headless: 'new',
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled'
            ]
        });
        const page = await browser.newPage();
        await makePageStealthy(page);
        await page.setViewport({ width: 1920, height: 1080 });
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        // Set age gate bypass cookies BEFORE navigation (standard for all cannabis sites)
        const state = (0, age_gate_1.detectStateFromUrl)(baseUrl);
        await (0, age_gate_1.setAgeGateCookies)(page, baseUrl, state);
        logger_1.logger.info('categories', `Loading page to detect menu type: ${baseUrl}`);
        await page.goto(baseUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
        // Give client-side scripts a moment to render. page.waitForTimeout()
        // is not available in newer Puppeteer releases, so fall back to a
        // plain timer when it is missing.
        const settleMs = 3000;
        if (typeof page.waitForTimeout === 'function') {
            await page.waitForTimeout(settleMs);
        }
        else {
            await new Promise((resolve) => setTimeout(resolve, settleMs));
        }
        // If age gate still appears, try to bypass it
        await (0, age_gate_1.bypassAgeGate)(page, state);
        // Detect if it's a Dutchie menu by inspecting page source
        const isDutchie = await isDutchieMenu(page);
        // The browser is no longer needed; release it before the database work.
        await browser.close();
        browser = null;
        if (isDutchie) {
            logger_1.logger.info('categories', `✅ Detected Dutchie menu for ${store.name}`);
            await createDutchieCategories(storeId, store);
        }
        else {
            // Fallback: Use standard cannabis categories for non-Dutchie sites
            logger_1.logger.info('categories', `Non-Dutchie menu detected, using standard cannabis categories for ${store.name}`);
            await createCuraleafCategories(storeId, store);
        }
    }
    catch (error) {
        logger_1.logger.error('categories', `Category discovery error: ${error}`);
        if (browser) {
            // Cleanup is best-effort: a failing close() must not replace the
            // error that actually caused discovery to fail.
            try {
                await browser.close();
            }
            catch (closeError) {
                logger_1.logger.warn('categories', `Failed to close browser after error: ${closeError}`);
            }
        }
        throw error;
    }
}
/**
 * Upserts a predefined category list for one store inside a single
 * transaction. Shared by createDutchieCategories and createCuraleafCategories.
 *
 * A category with a `parentSlug` is only written when its parent exists for
 * the store: either it was upserted earlier in this same run, or a lookup in
 * `categories` finds it. Otherwise the category is skipped with a warning.
 *
 * @param {number|string} storeId - Store the categories belong to.
 * @param {Array<{name: string, slug: string, parentSlug?: string}>} categories
 *   Categories to write, parents listed before their children.
 * @param {(category: object) => string} buildUrl - Returns the URL to store
 *   for a category.
 * @param {string} label - Platform name used in log messages.
 * @returns {Promise<void>}
 * @throws Rethrows the first query/URL error after attempting a ROLLBACK.
 */
async function upsertPredefinedCategories(storeId, categories, buildUrl, label) {
    const client = await migrate_1.pool.connect();
    try {
        await client.query('BEGIN');
        logger_1.logger.info('categories', `Creating predefined ${label} category structure`);
        // Slugs known to exist for this store within the current transaction.
        const knownSlugs = new Set();
        let written = 0;
        for (const category of categories) {
            const { name, slug, parentSlug } = category;
            if (parentSlug && !knownSlugs.has(parentSlug)) {
                const parentResult = await client.query(`
                    SELECT id FROM categories
                    WHERE store_id = $1 AND slug = $2
                `, [storeId, parentSlug]);
                if (parentResult.rows.length === 0) {
                    logger_1.logger.warn('categories', `Skipping ${name}: parent category "${parentSlug}" not found`);
                    continue;
                }
                knownSlugs.add(parentSlug);
            }
            await client.query(`
                INSERT INTO categories (store_id, name, slug, dutchie_url, scrape_enabled)
                VALUES ($1, $2, $3, $4, true)
                ON CONFLICT (store_id, slug)
                DO UPDATE SET name = $2, dutchie_url = $4
            `, [storeId, name, slug, buildUrl(category)]);
            knownSlugs.add(slug);
            written += 1;
            logger_1.logger.info('categories', parentSlug ? ` └── ${name}` : `📁 ${name}`);
        }
        await client.query('COMMIT');
        // Report what was actually written, not the size of the template list.
        logger_1.logger.info('categories', `✅ Created ${written} ${label} categories successfully`);
    }
    catch (error) {
        // A failing ROLLBACK must not hide the error that caused it.
        try {
            await client.query('ROLLBACK');
        }
        catch (rollbackError) {
            logger_1.logger.warn('categories', `Rollback failed: ${rollbackError}`);
        }
        logger_1.logger.error('categories', `Failed to create ${label} categories: ${error}`);
        throw error;
    }
    finally {
        client.release();
    }
}
/**
 * Writes the predefined Dutchie category tree for a store.
 *
 * Top-level categories get `<store url>/<slug>`; subcategories get the URL
 * produced by the Dutchie template's buildCategoryUrl().
 *
 * @param {number|string} storeId - Store the categories belong to.
 * @param {{dutchie_url: string}} store - Store row; `dutchie_url` is the base URL.
 * @returns {Promise<void>}
 */
async function createDutchieCategories(storeId, store) {
    const baseUrl = store.dutchie_url;
    // Avoid "//slug" when the stored URL already ends with a slash.
    const trimmedBaseUrl = baseUrl.replace(/\/+$/, '');
    const buildUrl = (category) => (category.parentSlug
        ? dutchie_1.dutchieTemplate.buildCategoryUrl(baseUrl, category.name)
        : `${trimmedBaseUrl}/${category.slug}`);
    await upsertPredefinedCategories(storeId, DUTCHIE_CATEGORIES, buildUrl, 'Dutchie');
}
/**
 * Writes the predefined fallback ("Curaleaf") category tree for a store.
 *
 * Top-level categories point at the store URL itself; subcategories add a
 * `category=<slug>` query parameter to it.
 *
 * @param {number|string} storeId - Store the categories belong to.
 * @param {{dutchie_url: string}} store - Store row; `dutchie_url` is the base URL.
 * @returns {Promise<void>}
 */
async function createCuraleafCategories(storeId, store) {
    const baseUrl = store.dutchie_url;
    // Extend an existing query string instead of starting a second one.
    const separator = baseUrl.includes('?') ? '&' : '?';
    const buildUrl = (category) => (category.parentSlug
        ? `${baseUrl}${separator}category=${category.slug}`
        : baseUrl);
    await upsertPredefinedCategories(storeId, CURALEAF_CATEGORIES, buildUrl, 'Curaleaf');
}