Files
cannaiq/backend/dist/scraper-v2/navigation.js
2025-11-28 19:45:44 -07:00

279 lines
12 KiB
JavaScript

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.NavigationDiscovery = void 0;
const migrate_1 = require("../db/migrate");
const logger_1 = require("../services/logger");
/**
 * Navigation Discovery - finds a store's menu navigation and persists it as
 * rows in the `categories` table.
 *
 * Relies on two module-level dependencies: `migrate_1.pool` (queried with
 * `$n` placeholders and used via `connect()` / `release()`) and
 * `logger_1.logger` (`info` / `warn` / `error`, each taking a channel name
 * and a message).
 */
class NavigationDiscovery {
  downloader;

  /**
   * @param {object} downloader - Page fetcher. This class calls its
   *   `fetch(request)` and `getCurrentPage()` methods.
   */
  constructor(downloader) {
    this.downloader = downloader;
  }

  /**
   * Discover categories from a store's main page and store them.
   *
   * Loads the store row, fetches its `dutchie_url` through the downloader,
   * extracts navigation links from the resulting page and then writes either
   * the predefined Dutchie structure or the discovered custom links.
   *
   * @param {number|string} storeId - Primary key of the row in `stores`.
   * @returns {Promise<void>}
   * @throws {Error} If the store does not exist, has no `dutchie_url`, no page
   *   is available after fetching, or any step below fails. Failures are
   *   logged and then rethrown unchanged.
   */
  async discoverCategories(storeId) {
    logger_1.logger.info('categories', `Starting category discovery for store ${storeId}`);
    try {
      const storeResult = await migrate_1.pool.query(`
        SELECT id, name, slug, dutchie_url
        FROM stores
        WHERE id = $1
      `, [storeId]);
      if (storeResult.rows.length === 0) {
        throw new Error('Store not found');
      }
      const store = storeResult.rows[0];
      const baseUrl = store.dutchie_url;
      if (!baseUrl) {
        // Without a URL every later step (fetch, link filtering, category
        // URLs) would operate on "null"/"undefined"; fail early and clearly.
        throw new Error(`Store ${storeId} has no dutchie_url configured`);
      }

      // NOTE(review): the metadata flags and the no-op callback are consumed
      // by the downloader, which is not visible here - confirm their meaning
      // against its implementation.
      const request = {
        url: baseUrl,
        priority: 100,
        retryCount: 0,
        maxRetries: 3,
        metadata: {
          requiresBrowser: true,
          requiresStealth: true,
        },
        callback: async () => ({ items: [], requests: [] }),
      };

      // Only the side effect matters: the fetched page is read back through
      // getCurrentPage(), so the response object itself is not needed.
      await this.downloader.fetch(request);

      const page = await this.downloader.getCurrentPage();
      if (!page) {
        throw new Error('No active page for navigation extraction');
      }

      const links = await this.extractNavigationLinks(page, baseUrl);
      logger_1.logger.info('categories', `Found ${links.length} navigation links`);

      const isDutchie = await this.isDutchieMenu(page);
      if (isDutchie) {
        logger_1.logger.info('categories', 'Detected Dutchie menu - using predefined structure');
        await this.createDutchieCategories(storeId, store, links);
      } else {
        logger_1.logger.info('categories', 'Custom menu detected - extracting from navigation');
        await this.createCustomCategories(storeId, store, links);
      }
      logger_1.logger.info('categories', `✅ Category discovery completed for ${store.name}`);
    } catch (error) {
      logger_1.logger.error('categories', `Category discovery failed: ${error}`);
      throw error;
    }
  }

  /**
   * Extract navigation links from the page.
   *
   * The callback handed to `page.evaluate` runs in the browser context, so it
   * must be self-contained: it can only use `base` and browser globals.
   *
   * @param {object} page - Object exposing `evaluate(fn, arg)`.
   * @param {string} baseUrl - Store URL; only links containing it are kept.
   * @returns {Promise<Array<{text: string, href: string, isCategory: boolean}>>}
   *   Unique (by text + href) links in selector/document order.
   */
  async extractNavigationLinks(page, baseUrl) {
    return await page.evaluate((base) => {
      const navSelectors = [
        'nav a',
        '[role="navigation"] a',
        '[class*="nav"] a',
        '[class*="menu"] a',
        '[class*="category"] a',
        'header a',
      ];
      // Built once rather than once per element.
      const categoryKeywords = [
        'flower', 'pre-roll', 'vape', 'edible', 'concentrate',
        'topical', 'accessory', 'brand', 'special', 'shop',
        'indica', 'sativa', 'hybrid', 'cbd', 'thc',
      ];
      const links = [];
      const seen = new Set();

      for (const selector of navSelectors) {
        for (const el of document.querySelectorAll(selector)) {
          const text = el.textContent?.trim();
          // On SVG <a> elements `href` is an SVGAnimatedString object, not a
          // string; calling string methods on it would throw and abort the
          // whole extraction, so fall back to the raw attribute.
          let href = typeof el.href === 'string' && el.href !== ''
            ? el.href
            : el.getAttribute('href');
          if (!text || !href || text.length < 2) {
            continue;
          }
          if (href.startsWith('/')) {
            // Resolving against base also handles protocol-relative
            // "//host/path" hrefs, which plain origin concatenation would
            // turn into a bogus same-site URL.
            href = new URL(href, base).href;
          }
          // Skip external links and in-page anchors.
          if (!href.includes(base) || href.includes('#')) {
            continue;
          }
          const linkKey = `${text}:${href}`;
          if (seen.has(linkKey)) {
            continue;
          }
          seen.add(linkKey);

          const lowerText = text.toLowerCase();
          const lowerHref = href.toLowerCase();
          const isCategory = categoryKeywords.some(
            (kw) => lowerText.includes(kw) || lowerHref.includes(kw),
          );
          links.push({ text, href, isCategory });
        }
      }
      return links;
    }, baseUrl);
  }

  /**
   * Check whether the page is a Dutchie menu.
   *
   * Looks first at `window.reactEnv` URLs and then falls back to searching
   * the document HTML for known Dutchie markers.
   *
   * @param {object} page - Object exposing `evaluate(fn)`.
   * @returns {Promise<boolean>}
   */
  async isDutchieMenu(page) {
    return await page.evaluate(() => {
      const env = window.reactEnv;
      if (env) {
        const envUrls = [env.adminUrl, env.apiUrl, env.consumerUrl];
        if (envUrls.some((url) => url?.includes('dutchie.com'))) {
          return true;
        }
      }
      const htmlContent = document.documentElement.innerHTML;
      const markers = [
        'admin.dutchie.com',
        'api.dutchie.com',
        'embedded-menu',
        'window.reactEnv',
      ];
      return markers.some((marker) => htmlContent.includes(marker));
    });
  }

  /**
   * Roll back the current transaction without letting a rollback failure
   * replace the error that triggered it.
   *
   * @param {object} client - Connected client exposing `query(sql)`.
   * @returns {Promise<void>}
   */
  async rollbackQuietly(client) {
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError) {
      logger_1.logger.error('categories', `Rollback failed: ${rollbackError}`);
    }
  }

  /**
   * Create categories for Dutchie menus (predefined structure).
   *
   * Upserts a fixed list of top-level categories and `shop` subcategories in
   * one transaction, keyed on `(store_id, slug)`.
   *
   * @param {number|string} storeId - Store the categories belong to.
   * @param {{dutchie_url: string}} store - Store row; only `dutchie_url` is read.
   * @param {Array<object>} discoveredLinks - Currently unused; kept for
   *   signature compatibility.
   * @returns {Promise<void>}
   * @throws {Error} If `store.dutchie_url` is missing or a query fails; the
   *   transaction is rolled back and the original error rethrown.
   */
  async createDutchieCategories(storeId, store, discoveredLinks) {
    const client = await migrate_1.pool.connect();
    try {
      await client.query('BEGIN');
      logger_1.logger.info('categories', `Creating predefined Dutchie category structure`);
      if (typeof store.dutchie_url !== 'string' || store.dutchie_url === '') {
        throw new Error('Store has no dutchie_url configured');
      }
      // Trim trailing slashes so joined URLs never contain "//shop".
      const baseUrl = store.dutchie_url.replace(/\/+$/, '');

      // Parents must appear before their children in this list.
      const DUTCHIE_CATEGORIES = [
        { name: 'Shop', slug: 'shop', parentSlug: undefined },
        { name: 'Flower', slug: 'flower', parentSlug: 'shop' },
        { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
        { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
        { name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
        { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
        { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
        { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' },
        { name: 'Brands', slug: 'brands', parentSlug: undefined },
        { name: 'Specials', slug: 'specials', parentSlug: undefined },
      ];

      // slug -> id of each top-level row, taken from the upsert's RETURNING
      // clause so subcategories do not need a separate lookup query.
      const parentIds = new Map();
      let createdCount = 0;

      for (const category of DUTCHIE_CATEGORIES) {
        // Top-level: {base}/shop ; subcategory: {base}/shop/flower
        const path = category.parentSlug
          ? `${category.parentSlug}/${category.slug}`
          : category.slug;
        const categoryUrl = `${baseUrl}/${path}`;

        if (!category.parentSlug) {
          const parentResult = await client.query(`
            INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
            VALUES ($1, $2, $3, $4, $5, true, NULL)
            ON CONFLICT (store_id, slug)
            DO UPDATE SET name = $2, dutchie_url = $4, path = $5
            RETURNING id
          `, [storeId, category.name, category.slug, categoryUrl, path]);
          parentIds.set(category.slug, parentResult.rows[0]?.id);
          createdCount += 1;
          logger_1.logger.info('categories', `📁 ${category.name}`);
          continue;
        }

        const parentId = parentIds.get(category.parentSlug);
        if (parentId == null) {
          // Previously skipped silently; surface it so gaps are visible.
          logger_1.logger.warn('categories', `Skipping ${category.name}: parent category "${category.parentSlug}" not found`);
          continue;
        }
        await client.query(`
          INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
          VALUES ($1, $2, $3, $4, $5, true, $6)
          ON CONFLICT (store_id, slug)
          DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6
        `, [storeId, category.name, category.slug, categoryUrl, path, parentId]);
        createdCount += 1;
        logger_1.logger.info('categories', `  └── ${category.name}`);
      }

      await client.query('COMMIT');
      logger_1.logger.info('categories', `✅ Created ${createdCount} Dutchie categories successfully`);
    } catch (error) {
      await this.rollbackQuietly(client);
      logger_1.logger.error('categories', `Failed to create Dutchie categories: ${error}`);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Create categories from discovered links (custom menus).
   *
   * Upserts one row per link flagged `isCategory`, in one transaction, keyed
   * on `(store_id, slug)`. The slug is derived from the link text; links whose
   * text yields an empty slug are skipped. Writes the `display_order` column,
   * so that column must exist (see `ensureDisplayOrderColumn`).
   *
   * @param {number|string} storeId - Store the categories belong to.
   * @param {object} store - Currently unused; kept for signature compatibility.
   * @param {Array<{text: string, href: string, isCategory: boolean}>} links
   * @returns {Promise<void>}
   * @throws {Error} If a link href is not a parseable absolute URL or a query
   *   fails; the transaction is rolled back and the original error rethrown.
   */
  async createCustomCategories(storeId, store, links) {
    const client = await migrate_1.pool.connect();
    try {
      await client.query('BEGIN');
      const categoryLinks = links.filter((link) => link.isCategory);
      let displayOrder = 0;

      for (const link of categoryLinks) {
        const slug = link.text
          .toLowerCase()
          .replace(/[^a-z0-9]+/g, '-')
          .replace(/^-|-$/g, '');
        if (slug === '') {
          // Text without any [a-z0-9] characters (e.g. non-Latin labels)
          // would otherwise be stored under an empty slug.
          logger_1.logger.warn('categories', `Skipping link with no usable slug: ${link.text} -> ${link.href}`);
          continue;
        }
        const path = new URL(link.href).pathname.replace(/^\//, '');

        await client.query(`
          INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, display_order)
          VALUES ($1, $2, $3, $4, $5, true, $6)
          ON CONFLICT (store_id, slug)
          DO UPDATE SET name = $2, dutchie_url = $4, path = $5, display_order = $6
        `, [storeId, link.text, slug, link.href, path, displayOrder]);
        displayOrder += 1;
        logger_1.logger.info('categories', `📁 ${link.text} -> ${link.href}`);
      }

      await client.query('COMMIT');
      // displayOrder equals the number of rows actually written.
      logger_1.logger.info('categories', `✅ Created ${displayOrder} custom categories`);
    } catch (error) {
      await this.rollbackQuietly(client);
      logger_1.logger.error('categories', `Failed to create custom categories: ${error}`);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Ensure the `display_order` column exists on the `categories` table.
   *
   * Best-effort: a failure is logged as a warning and not rethrown.
   *
   * @returns {Promise<void>}
   */
  async ensureDisplayOrderColumn() {
    try {
      await migrate_1.pool.query(`
        ALTER TABLE categories
        ADD COLUMN IF NOT EXISTS display_order INTEGER DEFAULT 0
      `);
      logger_1.logger.info('categories', 'Ensured display_order column exists');
    } catch (error) {
      logger_1.logger.warn('categories', `Could not add display_order column: ${error}`);
    }
  }
}
exports.NavigationDiscovery = NavigationDiscovery;