Files
cannaiq/backend/src/_deprecated/scraper-v2/navigation.ts
Kelly a35976b9e9 chore: Clean up deprecated code and docs
- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser

- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md

- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 22:17:40 -07:00

341 lines
11 KiB
TypeScript

import { pool } from '../db/pool';
import { logger } from '../services/logger';
import { Downloader } from './downloader';
import { ScraperRequest } from './types';
/**
 * Shape of a category record belonging to a store.
 *
 * NOTE(review): nothing in the visible part of this file references this
 * interface. The field names look like camelCase counterparts of the
 * `categories` table columns written by NavigationDiscovery (store_id, name,
 * slug, dutchie_url, parent_id, path, display_order) - confirm before use.
 */
interface Category {
// presumably the database row id, absent until the row is persisted - TODO confirm
id?: number;
// presumably the id of the owning store (dispensary) - TODO confirm
storeId: number;
name: string;
slug: string;
// presumably the URL of the category page on the store's menu - TODO confirm
dutchieUrl: string;
// presumably the id of the parent category; absent for top-level categories - TODO confirm
parentId?: number;
path: string;
displayOrder?: number;
}
/**
 * A link collected from a page's navigation elements by
 * NavigationDiscovery.extractNavigationLinks.
 */
interface NavigationLink {
// Trimmed text content of the anchor element.
text: string;
// Link target; root-relative values are prefixed with the origin of the base URL.
href: string;
// True when the text or the href contains one of the category keywords (e.g. 'flower', 'vape').
isCategory: boolean;
// NOTE(review): never populated by the visible code; presumably the text of a parent navigation item.
parentText?: string;
}
/**
 * Navigation Discovery - finds and builds category structure.
 *
 * Loads a store's menu page through the injected downloader, collects the
 * navigation links found on it and upserts rows into the `categories` table:
 * a predefined tree when the page looks like a Dutchie menu, otherwise one
 * row per navigation link that looks like a product category.
 */
export class NavigationDiscovery {
  /**
   * Predefined Dutchie category tree. A parent must be listed before its
   * children because children look up the parent id recorded earlier in the
   * same pass.
   */
  private static readonly DUTCHIE_CATEGORIES: ReadonlyArray<{
    name: string;
    slug: string;
    parentSlug?: string;
  }> = [
    { name: 'Shop', slug: 'shop' },
    { name: 'Flower', slug: 'flower', parentSlug: 'shop' },
    { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
    { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
    { name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
    { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
    { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
    { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' },
    { name: 'Brands', slug: 'brands' },
    { name: 'Specials', slug: 'specials' }
  ];

  private readonly downloader: Downloader;

  /**
   * @param downloader Downloader used to fetch the menu page and to expose
   *   the page object the links are extracted from.
   */
  constructor(downloader: Downloader) {
    this.downloader = downloader;
  }

  /**
   * Discover categories from a store's main page and persist them.
   *
   * @param storeId Id of the row in `dispensaries` to discover categories for.
   * @throws Error when the store does not exist, has no menu URL, or no page
   *   is available after the fetch. Any downstream failure is logged and
   *   rethrown unchanged.
   */
  async discoverCategories(storeId: number): Promise<void> {
    logger.info('categories', `Starting category discovery for store ${storeId}`);
    try {
      // Get dispensary info (store = dispensary)
      const storeResult = await pool.query(`
        SELECT id, name, slug, menu_url as dutchie_url
        FROM dispensaries
        WHERE id = $1
      `, [storeId]);
      if (storeResult.rows.length === 0) {
        throw new Error('Store not found');
      }
      const store = storeResult.rows[0];
      const baseUrl = store.dutchie_url;
      // Without a menu URL there is nothing to fetch and every derived
      // category URL would be garbage, so fail early with a clear message.
      if (typeof baseUrl !== 'string' || baseUrl.trim() === '') {
        throw new Error(`Store ${storeId} has no menu URL configured`);
      }

      // Create request to fetch the main page
      const request: ScraperRequest = {
        url: baseUrl,
        priority: 100,
        retryCount: 0,
        maxRetries: 3,
        metadata: {
          requiresBrowser: true,
          requiresStealth: true
        },
        callback: async () => ({ items: [], requests: [] })
      };

      // The fetch is performed for its side effect of loading the page; the
      // links are read from the downloader's current page, not the response.
      await this.downloader.fetch(request);

      const page = await this.downloader.getCurrentPage();
      if (!page) {
        throw new Error('No active page for navigation extraction');
      }

      const links = await this.extractNavigationLinks(page, baseUrl);
      logger.info('categories', `Found ${links.length} navigation links`);

      // Check if it's a Dutchie menu
      const isDutchie = await this.isDutchieMenu(page);
      if (isDutchie) {
        logger.info('categories', 'Detected Dutchie menu - using predefined structure');
        await this.createDutchieCategories(storeId, store, links);
      } else {
        logger.info('categories', 'Custom menu detected - extracting from navigation');
        await this.createCustomCategories(storeId, store, links);
      }
      logger.info('categories', `✅ Category discovery completed for ${store.name}`);
    } catch (error) {
      logger.error('categories', `Category discovery failed: ${error}`);
      throw error;
    }
  }

  /**
   * Extract navigation links from the page.
   *
   * The callback runs inside the browser context via `page.evaluate`, so it
   * must be self-contained: everything it needs is declared inside it or
   * passed in as the `base` argument.
   *
   * @param page Page object exposing `evaluate(fn, arg)`.
   * @param baseUrl Menu URL; only links whose href contains it are kept.
   * @returns De-duplicated (by text + href) links, each flagged with whether
   *   it looks like a product category.
   */
  private async extractNavigationLinks(page: any, baseUrl: string): Promise<NavigationLink[]> {
    return await page.evaluate((base: string) => {
      const links: NavigationLink[] = [];
      // Look for navigation elements
      const navSelectors = [
        'nav a',
        '[role="navigation"] a',
        '[class*="nav"] a',
        '[class*="menu"] a',
        '[class*="category"] a',
        'header a'
      ];
      // Keywords that mark a link as a likely category (built once, not per element).
      const categoryKeywords = [
        'flower', 'pre-roll', 'vape', 'edible', 'concentrate',
        'topical', 'accessory', 'brand', 'special', 'shop',
        'indica', 'sativa', 'hybrid', 'cbd', 'thc'
      ];
      const foundLinks = new Set<string>();

      for (const selector of navSelectors) {
        // @ts-ignore - runs in browser context
        const elements = document.querySelectorAll(selector);
        elements.forEach((el: any) => {
          const text = el.textContent?.trim();
          let href = el.href || el.getAttribute('href');
          if (!text || !href || text.length < 2) return;

          // Normalize root-relative hrefs to absolute URLs
          if (href.startsWith('/')) {
            // @ts-ignore - runs in browser context
            const url = new URL(base);
            href = `${url.origin}${href}`;
          }

          // Skip external links and anchors
          if (!href.includes(base) || href.includes('#')) return;

          // Skip duplicates (the same anchor can match several selectors)
          const linkKey = `${text}:${href}`;
          if (foundLinks.has(linkKey)) return;
          foundLinks.add(linkKey);

          // Determine if it's likely a category
          const lowerText = text.toLowerCase();
          const lowerHref = href.toLowerCase();
          const isCategory = categoryKeywords.some(kw =>
            lowerText.includes(kw) || lowerHref.includes(kw)
          );

          links.push({ text, href, isCategory });
        });
      }
      return links;
    }, baseUrl);
  }

  /**
   * Check if the page is a Dutchie menu.
   *
   * @param page Page object exposing `evaluate(fn)`.
   * @returns True when `window.reactEnv` has an admin/api/consumer URL
   *   containing `dutchie.com`, or when the document HTML contains one of the
   *   Dutchie marker strings.
   */
  private async isDutchieMenu(page: any): Promise<boolean> {
    return await page.evaluate(() => {
      // Check for Dutchie markers
      // @ts-ignore - runs in browser context
      const env = (window as any).reactEnv;
      if (env) {
        if (env.adminUrl?.includes('dutchie.com') ||
            env.apiUrl?.includes('dutchie.com') ||
            env.consumerUrl?.includes('dutchie.com')) {
          return true;
        }
      }
      // Fall back to scanning the raw markup for known marker strings
      // @ts-ignore - runs in browser context
      const htmlContent = document.documentElement.innerHTML;
      return (
        htmlContent.includes('admin.dutchie.com') ||
        htmlContent.includes('api.dutchie.com') ||
        htmlContent.includes('embedded-menu') ||
        htmlContent.includes('window.reactEnv')
      );
    });
  }

  /**
   * Roll back the open transaction without letting a rollback failure replace
   * the error that triggered it; a failed rollback is only logged.
   */
  private async rollbackQuietly(client: { query(sql: string): Promise<unknown> }): Promise<void> {
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError) {
      logger.error('categories', `Rollback failed: ${rollbackError}`);
    }
  }

  /**
   * Create categories for Dutchie menus (predefined structure).
   *
   * Upserts every entry of DUTCHIE_CATEGORIES for the store inside a single
   * transaction, keyed on (store_id, slug).
   *
   * @param storeId Id of the store the categories belong to.
   * @param store Store row; only `dutchie_url` is read.
   * @param discoveredLinks Links found on the page. Currently unused; kept so
   *   the signature parallels createCustomCategories.
   * @throws Rethrows any database error after rolling the transaction back.
   */
  private async createDutchieCategories(storeId: number, store: any, discoveredLinks: NavigationLink[]): Promise<void> {
    const client = await pool.connect();
    try {
      await client.query('BEGIN');
      logger.info('categories', `Creating predefined Dutchie category structure`);

      // Strip trailing slashes so joined URLs never contain '//' in the path.
      const baseUrl = String(store.dutchie_url ?? '').replace(/\/+$/, '');
      // Ids of rows upserted so far, so children need no extra SELECT.
      const idsBySlug = new Map<string, number>();
      let created = 0;

      for (const category of NavigationDiscovery.DUTCHIE_CATEGORIES) {
        // Subcategory: {base}/shop/flower - Top-level: {base}/shop
        const path = category.parentSlug
          ? `${category.parentSlug}/${category.slug}`
          : category.slug;
        const categoryUrl = `${baseUrl}/${path}`;

        if (!category.parentSlug) {
          // Create parent category
          const result = await client.query(`
            INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
            VALUES ($1, $2, $3, $4, $5, true, NULL)
            ON CONFLICT (store_id, slug)
            DO UPDATE SET name = $2, dutchie_url = $4, path = $5
            RETURNING id
          `, [storeId, category.name, category.slug, categoryUrl, path]);
          if (result.rows.length > 0) {
            idsBySlug.set(category.slug, result.rows[0].id);
          }
          created++;
          logger.info('categories', `📁 ${category.name}`);
          continue;
        }

        // Create subcategory
        const parentId = idsBySlug.get(category.parentSlug);
        if (parentId === undefined) {
          logger.warn('categories', `Skipping ${category.name}: parent category '${category.parentSlug}' not found`);
          continue;
        }
        await client.query(`
          INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
          VALUES ($1, $2, $3, $4, $5, true, $6)
          ON CONFLICT (store_id, slug)
          DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6
        `, [storeId, category.name, category.slug, categoryUrl, path, parentId]);
        created++;
        logger.info('categories', ` └── ${category.name}`);
      }

      await client.query('COMMIT');
      logger.info('categories', `✅ Created ${created} Dutchie categories successfully`);
    } catch (error) {
      await this.rollbackQuietly(client);
      logger.error('categories', `Failed to create Dutchie categories: ${error}`);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Create categories from discovered links (custom menus).
   *
   * Upserts one row per link flagged `isCategory`, inside a single
   * transaction, keyed on (store_id, slug). The slug is derived from the link
   * text and the path from the link URL's pathname. Links whose text yields an
   * empty slug are skipped, since they would all collide on the same row.
   *
   * @param storeId Id of the store the categories belong to.
   * @param store Store row. Currently unused; kept so the signature parallels
   *   createDutchieCategories.
   * @param links Links found on the page.
   * @throws Rethrows any database or URL-parsing error after rolling the
   *   transaction back.
   */
  private async createCustomCategories(storeId: number, store: any, links: NavigationLink[]): Promise<void> {
    const client = await pool.connect();
    try {
      await client.query('BEGIN');

      // Filter to likely category links
      const categoryLinks = links.filter(link => link.isCategory);
      let displayOrder = 0;

      for (const link of categoryLinks) {
        // Generate slug from text
        const slug = link.text
          .toLowerCase()
          .replace(/[^a-z0-9]+/g, '-')
          .replace(/^-|-$/g, '');
        if (slug === '') {
          logger.warn('categories', `Skipping link with no usable slug: ${link.text} -> ${link.href}`);
          continue;
        }

        // Determine path from URL
        const url = new URL(link.href);
        const path = url.pathname.replace(/^\//, '');

        await client.query(`
          INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, display_order)
          VALUES ($1, $2, $3, $4, $5, true, $6)
          ON CONFLICT (store_id, slug)
          DO UPDATE SET name = $2, dutchie_url = $4, path = $5, display_order = $6
        `, [storeId, link.text, slug, link.href, path, displayOrder]);
        displayOrder++;
        logger.info('categories', `📁 ${link.text} -> ${link.href}`);
      }

      await client.query('COMMIT');
      logger.info('categories', `✅ Created ${displayOrder} custom categories`);
    } catch (error) {
      await this.rollbackQuietly(client);
      logger.error('categories', `Failed to create custom categories: ${error}`);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Ensure the `display_order` column exists on the `categories` table.
   *
   * Best-effort: a failure is logged as a warning and not rethrown.
   */
  async ensureDisplayOrderColumn(): Promise<void> {
    try {
      await pool.query(`
        ALTER TABLE categories
        ADD COLUMN IF NOT EXISTS display_order INTEGER DEFAULT 0
      `);
      logger.info('categories', 'Ensured display_order column exists');
    } catch (error) {
      logger.warn('categories', `Could not add display_order column: ${error}`);
    }
  }
}