- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc.
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser
- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md
- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
341 lines
11 KiB
TypeScript
341 lines
11 KiB
TypeScript
import { pool } from '../db/pool';
|
|
import { logger } from '../services/logger';
|
|
import { Downloader } from './downloader';
|
|
import { ScraperRequest } from './types';
|
|
|
|
/**
 * Shape of a category record as handled by the scraper.
 *
 * Field names mirror, in camelCase, the `categories` columns written elsewhere
 * in this file (store_id, name, slug, dutchie_url, parent_id, path,
 * display_order).
 */
interface Category {
  /** Row id; absent until the row has been persisted. */
  id?: number;
  /** Id of the store (dispensary) the category belongs to. */
  storeId: number;
  /** Human-readable category name. */
  name: string;
  /** URL-safe identifier of the category. */
  slug: string;
  /** Absolute URL of the category page on the store's menu. */
  dutchieUrl: string;
  /** Id of the parent category; absent for top-level categories. */
  parentId?: number;
  /** Slash-separated location of the category, e.g. `shop/flower`. */
  path: string;
  /** Position of the category in navigation order, when known. */
  displayOrder?: number;
}
|
|
|
|
/**
 * A link collected from a page's navigation areas.
 */
interface NavigationLink {
  /** Trimmed visible text of the anchor. */
  text: string;
  /** Absolute URL the anchor points to. */
  href: string;
  /** True when the text or URL contains one of the known category keywords. */
  isCategory: boolean;
  /** Text of the parent navigation entry, when one is known. */
  parentText?: string;
}
|
|
|
|
/**
 * Navigation Discovery - finds and builds category structure
 *
 * Loads a store's menu page through the injected Downloader, collects the
 * links found in its navigation areas and upserts rows into the `categories`
 * table: a predefined tree when the page looks like a Dutchie menu, otherwise
 * one row per keyword-matching navigation link.
 */
export class NavigationDiscovery {
  private readonly downloader: Downloader;

  /**
   * @param downloader Used to fetch the store page and to obtain the page
   *                   object the in-browser extraction runs against.
   */
  constructor(downloader: Downloader) {
    this.downloader = downloader;
  }

  /**
   * Discover categories from a store's main page and persist them.
   *
   * @param storeId Id of the row in `dispensaries` (store = dispensary).
   * @throws Error when the store does not exist, has no menu URL, or no page
   *         is available after the fetch; any downloader/database error is
   *         logged and rethrown.
   */
  async discoverCategories(storeId: number): Promise<void> {
    logger.info('categories', `Starting category discovery for store ${storeId}`);

    try {
      // Get dispensary info (store = dispensary)
      const storeResult = await pool.query(`
        SELECT id, name, slug, menu_url as dutchie_url
        FROM dispensaries
        WHERE id = $1
      `, [storeId]);

      if (storeResult.rows.length === 0) {
        throw new Error('Store not found');
      }

      const store = storeResult.rows[0];
      const baseUrl = store.dutchie_url;

      // Without a menu URL there is nothing to fetch; fail with a clear message
      // instead of handing an empty URL to the downloader.
      if (typeof baseUrl !== 'string' || baseUrl.trim() === '') {
        throw new Error(`Store ${storeId} has no menu URL configured`);
      }

      // Create request to fetch the main page
      const request: ScraperRequest = {
        url: baseUrl,
        priority: 100,
        retryCount: 0,
        maxRetries: 3,
        metadata: {
          requiresBrowser: true,
          requiresStealth: true
        },
        callback: async () => ({ items: [], requests: [] })
      };

      // The response itself is not used; the page is read via getCurrentPage()
      // below. NOTE(review): presumably fetch() leaves the loaded page active -
      // confirm against Downloader.
      await this.downloader.fetch(request);

      const page = await this.downloader.getCurrentPage();
      if (!page) {
        throw new Error('No active page for navigation extraction');
      }

      const links = await this.extractNavigationLinks(page, baseUrl);
      logger.info('categories', `Found ${links.length} navigation links`);

      const isDutchie = await this.isDutchieMenu(page);

      if (isDutchie) {
        logger.info('categories', 'Detected Dutchie menu - using predefined structure');
        await this.createDutchieCategories(storeId, store, links);
      } else {
        logger.info('categories', 'Custom menu detected - extracting from navigation');
        await this.createCustomCategories(storeId, store, links);
      }

      logger.info('categories', `✅ Category discovery completed for ${store.name}`);
    } catch (error) {
      logger.error('categories', `Category discovery failed: ${error}`);
      throw error;
    }
  }

  /**
   * Extract navigation links from the page.
   *
   * The callback handed to `page.evaluate` runs in the browser context, so it
   * must be self-contained: everything it needs is declared inside it or
   * passed in as an argument.
   *
   * @param page    Page object exposing `evaluate(fn, ...args)`.
   * @param baseUrl Store menu URL; links whose URL does not contain it are
   *                treated as external and dropped.
   * @returns De-duplicated (by text + URL) links, each flagged with whether it
   *          looks like a category.
   */
  private async extractNavigationLinks(page: any, baseUrl: string): Promise<NavigationLink[]> {
    return await page.evaluate((base: string) => {
      const links: NavigationLink[] = [];

      // Look for navigation elements
      const navSelectors = [
        'nav a',
        '[role="navigation"] a',
        '[class*="nav"] a',
        '[class*="menu"] a',
        '[class*="category"] a',
        'header a'
      ];

      // Built once per call rather than once per anchor.
      const categoryKeywords = [
        'flower', 'pre-roll', 'vape', 'edible', 'concentrate',
        'topical', 'accessory', 'brand', 'special', 'shop',
        'indica', 'sativa', 'hybrid', 'cbd', 'thc'
      ];

      const foundLinks = new Set<string>();

      for (const selector of navSelectors) {
        // @ts-ignore - runs in browser context
        const anchors: any[] = Array.from(document.querySelectorAll(selector));

        for (const el of anchors) {
          const text = el.textContent?.trim();
          let href = el.href || el.getAttribute('href');

          // A non-string href (e.g. on an SVG anchor) cannot be normalised below.
          if (!text || !href || typeof href !== 'string' || text.length < 2) continue;

          // Normalize href: resolving against the base handles both "/path"
          // and protocol-relative "//host/path" values.
          if (href.startsWith('/')) {
            // @ts-ignore - runs in browser context
            href = new URL(href, base).href;
          }

          // Skip external links and anchors
          if (!href.includes(base) || href.includes('#')) continue;

          // Skip duplicates
          const linkKey = `${text}:${href}`;
          if (foundLinks.has(linkKey)) continue;
          foundLinks.add(linkKey);

          // Determine if it's likely a category
          const lowerText = text.toLowerCase();
          const lowerHref = href.toLowerCase();
          const isCategory = categoryKeywords.some(
            kw => lowerText.includes(kw) || lowerHref.includes(kw)
          );

          links.push({ text, href, isCategory });
        }
      }

      return links;
    }, baseUrl);
  }

  /**
   * Check if the page is a Dutchie menu.
   *
   * Looks first at `window.reactEnv` for URLs containing `dutchie.com`, then
   * falls back to searching the document HTML for known marker strings.
   *
   * @param page Page object exposing `evaluate(fn)`.
   * @returns True when any Dutchie marker is present.
   */
  private async isDutchieMenu(page: any): Promise<boolean> {
    return await page.evaluate(() => {
      // Check for Dutchie markers
      // @ts-ignore - runs in browser context
      const env = (window as any).reactEnv;
      if (env) {
        if (env.adminUrl?.includes('dutchie.com') ||
            env.apiUrl?.includes('dutchie.com') ||
            env.consumerUrl?.includes('dutchie.com')) {
          return true;
        }
      }

      // @ts-ignore - runs in browser context
      const htmlContent: string = document.documentElement.innerHTML;
      const markers = [
        'admin.dutchie.com',
        'api.dutchie.com',
        'embedded-menu',
        'window.reactEnv'
      ];
      return markers.some(marker => htmlContent.includes(marker));
    });
  }

  /**
   * Create categories for Dutchie menus (predefined structure).
   *
   * Upserts a fixed list of categories inside one transaction. Top-level
   * entries are written first and their ids, taken from `RETURNING id`, are
   * reused as `parent_id` for their children.
   *
   * @param storeId         Store the categories belong to.
   * @param store           Store row; only `dutchie_url` is read.
   * @param discoveredLinks Links found on the page; currently not used by the
   *                        predefined structure.
   * @throws Rethrows any database error after rolling the transaction back.
   */
  private async createDutchieCategories(storeId: number, store: any, discoveredLinks: NavigationLink[]): Promise<void> {
    const client = await pool.connect();

    try {
      await client.query('BEGIN');

      logger.info('categories', `Creating predefined Dutchie category structure`);

      // Drop trailing slashes so joining with "/slug" never yields "//slug".
      const baseUrl = String(store.dutchie_url).replace(/\/+$/, '');

      // Predefined Dutchie category structure; parents precede their children.
      const DUTCHIE_CATEGORIES = [
        { name: 'Shop', slug: 'shop', parentSlug: undefined },
        { name: 'Flower', slug: 'flower', parentSlug: 'shop' },
        { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
        { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
        { name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
        { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
        { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
        { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' },
        { name: 'Brands', slug: 'brands', parentSlug: undefined },
        { name: 'Specials', slug: 'specials', parentSlug: undefined }
      ];

      // slug -> id of the top-level categories upserted in this transaction
      const parentIds = new Map<string, number>();
      let created = 0;

      for (const category of DUTCHIE_CATEGORIES) {
        // Subcategory: {baseUrl}/shop/flower, top-level: {baseUrl}/shop
        const path = category.parentSlug
          ? `${category.parentSlug}/${category.slug}`
          : category.slug;
        const categoryUrl = `${baseUrl}/${path}`;

        if (!category.parentSlug) {
          // Create parent category
          const inserted = await client.query(`
            INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
            VALUES ($1, $2, $3, $4, $5, true, NULL)
            ON CONFLICT (store_id, slug)
            DO UPDATE SET name = $2, dutchie_url = $4, path = $5
            RETURNING id
          `, [storeId, category.name, category.slug, categoryUrl, path]);

          const insertedId = inserted.rows[0]?.id;
          if (insertedId !== undefined) {
            parentIds.set(category.slug, insertedId);
          }

          created++;
          logger.info('categories', `📁 ${category.name}`);
          continue;
        }

        // Create subcategory
        const parentId = parentIds.get(category.parentSlug);
        if (parentId === undefined) {
          logger.warn(
            'categories',
            `Skipping ${category.name}: parent category "${category.parentSlug}" was not created`
          );
          continue;
        }

        await client.query(`
          INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
          VALUES ($1, $2, $3, $4, $5, true, $6)
          ON CONFLICT (store_id, slug)
          DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6
        `, [storeId, category.name, category.slug, categoryUrl, path, parentId]);

        created++;
        logger.info('categories', `  └── ${category.name}`);
      }

      await client.query('COMMIT');
      logger.info('categories', `✅ Created ${created} Dutchie categories successfully`);
    } catch (error) {
      await this.rollbackQuietly(() => client.query('ROLLBACK'));
      logger.error('categories', `Failed to create Dutchie categories: ${error}`);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Create categories from discovered links (custom menus).
   *
   * Upserts one row per link flagged `isCategory`, inside one transaction.
   * The slug is derived from the link text and the path from the URL's
   * pathname. Links that yield an empty slug or whose URL cannot be parsed
   * are skipped with a warning rather than aborting the whole batch.
   *
   * @param storeId Store the categories belong to.
   * @param store   Store row; not read by this method.
   * @param links   Links returned by the navigation extraction.
   * @throws Rethrows any database error after rolling the transaction back.
   */
  private async createCustomCategories(storeId: number, store: any, links: NavigationLink[]): Promise<void> {
    const client = await pool.connect();

    try {
      await client.query('BEGIN');

      // Filter to likely category links
      const categoryLinks = links.filter(link => link.isCategory);

      // Distinct slugs written; repeated slugs hit the same row via ON CONFLICT.
      const savedSlugs = new Set<string>();
      let displayOrder = 0;

      for (const link of categoryLinks) {
        // Generate slug from text
        const slug = link.text
          .toLowerCase()
          .replace(/[^a-z0-9]+/g, '-')
          .replace(/^-|-$/g, '');

        if (slug === '') {
          logger.warn('categories', `Skipping link with no usable slug: ${link.text} -> ${link.href}`);
          continue;
        }

        // Determine path from URL
        let path: string;
        try {
          path = new URL(link.href).pathname.replace(/^\//, '');
        } catch {
          logger.warn('categories', `Skipping link with invalid URL: ${link.text} -> ${link.href}`);
          continue;
        }

        await client.query(`
          INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, display_order)
          VALUES ($1, $2, $3, $4, $5, true, $6)
          ON CONFLICT (store_id, slug)
          DO UPDATE SET name = $2, dutchie_url = $4, path = $5, display_order = $6
        `, [storeId, link.text, slug, link.href, path, displayOrder++]);

        savedSlugs.add(slug);
        logger.info('categories', `📁 ${link.text} -> ${link.href}`);
      }

      await client.query('COMMIT');
      logger.info('categories', `✅ Created ${savedSlugs.size} custom categories`);
    } catch (error) {
      await this.rollbackQuietly(() => client.query('ROLLBACK'));
      logger.error('categories', `Failed to create custom categories: ${error}`);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Ensure the display_order column exists in the categories table.
   *
   * Best effort: a failure is logged as a warning and not rethrown.
   */
  async ensureDisplayOrderColumn(): Promise<void> {
    try {
      await pool.query(`
        ALTER TABLE categories
        ADD COLUMN IF NOT EXISTS display_order INTEGER DEFAULT 0
      `);
      logger.info('categories', 'Ensured display_order column exists');
    } catch (error) {
      logger.warn('categories', `Could not add display_order column: ${error}`);
    }
  }

  /**
   * Run a rollback without letting its own failure replace the error that
   * triggered it; a failing rollback is only logged.
   *
   * @param rollback Callback that issues the ROLLBACK statement.
   */
  private async rollbackQuietly(rollback: () => Promise<unknown>): Promise<void> {
    try {
      await rollback();
    } catch (rollbackError) {
      logger.error('categories', `Rollback failed: ${rollbackError}`);
    }
  }
}
|