chore: Clean up deprecated code and docs
- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc.
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser
- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md
- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
340
backend/src/_deprecated/scraper-v2/navigation.ts
Normal file
340
backend/src/_deprecated/scraper-v2/navigation.ts
Normal file
@@ -0,0 +1,340 @@
|
||||
import { pool } from '../db/pool';
|
||||
import { logger } from '../services/logger';
|
||||
import { Downloader } from './downloader';
|
||||
import { ScraperRequest } from './types';
|
||||
|
||||
/**
 * Shape of a menu category record.
 *
 * NOTE(review): the field names look like camelCase counterparts of the
 * `categories` table columns written elsewhere in this file (store_id, name,
 * slug, dutchie_url, parent_id, path, display_order) — confirm before relying
 * on that mapping. Nothing in this file currently references the interface.
 */
interface Category {
  /** Row identifier; optional, e.g. for a record that has not been stored yet. */
  id?: number;
  /** Identifier of the store the category belongs to. */
  storeId: number;
  /** Human-readable category name. */
  name: string;
  /** URL-safe identifier for the category. */
  slug: string;
  /** URL of the category page on the store's menu. */
  dutchieUrl: string;
  /** Identifier of the parent category; absent for top-level categories. */
  parentId?: number;
  /** Slash-separated location of the category, e.g. "shop/flower". */
  path: string;
  /** Position of the category relative to its siblings. */
  displayOrder?: number;
}
|
||||
|
||||
/**
 * A single anchor harvested from a store page's navigation markup.
 */
interface NavigationLink {
  /** Trimmed visible text of the anchor. */
  text: string;
  /** Absolute URL the anchor points to. */
  href: string;
  /** True when the text or URL contains one of the known category keywords. */
  isCategory: boolean;
  /**
   * Text of the parent navigation entry.
   * NOTE(review): nothing in this file populates this field — confirm whether
   * it is still needed.
   */
  parentText?: string;
}
|
||||
|
||||
/**
 * Navigation Discovery - finds a store's menu navigation and stores it as
 * rows in the `categories` table.
 *
 * Flow: load the store's menu URL through the injected downloader, collect
 * the navigation anchors from the rendered page, then either write the fixed
 * Dutchie category tree (when the page looks like a Dutchie menu) or derive
 * categories from the anchors that were found.
 */
export class NavigationDiscovery {
  private downloader: Downloader;

  /**
   * @param downloader Fetches pages and exposes the page that is currently
   *   loaded (`fetch` / `getCurrentPage`).
   */
  constructor(downloader: Downloader) {
    this.downloader = downloader;
  }

  /**
   * Discover categories from a store's main page and upsert them.
   *
   * @param storeId Primary key of the row in `dispensaries`.
   * @throws Error when the store does not exist, has no menu URL, or no page
   *   is available after the fetch. Failures are logged and re-thrown.
   */
  async discoverCategories(storeId: number): Promise<void> {
    logger.info('categories', `Starting category discovery for store ${storeId}`);

    try {
      // Get dispensary info (store = dispensary)
      const storeResult = await pool.query(`
        SELECT id, name, slug, menu_url as dutchie_url
        FROM dispensaries
        WHERE id = $1
      `, [storeId]);

      if (storeResult.rows.length === 0) {
        throw new Error('Store not found');
      }

      const store = storeResult.rows[0];
      const baseUrl = store.dutchie_url;

      // Fail fast: without a URL every later step would work on "null".
      if (typeof baseUrl !== 'string' || baseUrl.trim() === '') {
        throw new Error(`Store ${storeId} has no menu URL configured`);
      }

      // Request used to load the main page; the callback yields nothing
      // because only the rendered page is needed here.
      const request: ScraperRequest = {
        url: baseUrl,
        priority: 100,
        retryCount: 0,
        maxRetries: 3,
        metadata: {
          requiresBrowser: true,
          requiresStealth: true
        },
        callback: async () => ({ items: [], requests: [] })
      };

      // The response itself is not used; the page is read from the downloader.
      await this.downloader.fetch(request);

      const page = await this.downloader.getCurrentPage();
      if (!page) {
        throw new Error('No active page for navigation extraction');
      }

      const links = await this.extractNavigationLinks(page, baseUrl);
      logger.info('categories', `Found ${links.length} navigation links`);

      const isDutchie = await this.isDutchieMenu(page);

      if (isDutchie) {
        logger.info('categories', 'Detected Dutchie menu - using predefined structure');
        await this.createDutchieCategories(storeId, store, links);
      } else {
        logger.info('categories', 'Custom menu detected - extracting from navigation');
        await this.createCustomCategories(storeId, store, links);
      }

      logger.info('categories', `✅ Category discovery completed for ${store.name}`);
    } catch (error) {
      logger.error('categories', `Category discovery failed: ${error}`);
      throw error;
    }
  }

  /**
   * Extract navigation links from the page.
   *
   * The callback handed to `page.evaluate` is written to be self-contained
   * (it only uses its own argument and browser globals), because it runs in
   * the browser context rather than in this module's scope.
   *
   * @param page Browser page exposing `evaluate` (presumably a Puppeteer
   *   page — TODO confirm against the downloader).
   * @param baseUrl Menu URL of the store; only links whose resolved URL
   *   contains it are kept.
   * @returns Unique (text, href) pairs, flagged when they look like a category.
   */
  private async extractNavigationLinks(page: any, baseUrl: string): Promise<NavigationLink[]> {
    return await page.evaluate((base: string) => {
      const links: NavigationLink[] = [];

      // Look for navigation elements
      const navSelectors = [
        'nav a',
        '[role="navigation"] a',
        '[class*="nav"] a',
        '[class*="menu"] a',
        '[class*="category"] a',
        'header a'
      ];

      // Built once instead of once per anchor.
      const categoryKeywords = [
        'flower', 'pre-roll', 'vape', 'edible', 'concentrate',
        'topical', 'accessory', 'brand', 'special', 'shop',
        'indica', 'sativa', 'hybrid', 'cbd', 'thc'
      ];

      const foundLinks = new Set<string>();

      for (const selector of navSelectors) {
        // @ts-ignore - runs in browser context
        const elements = document.querySelectorAll(selector);

        elements.forEach((el: any) => {
          const text = el.textContent?.trim();

          // `href` is not always a string (SVG anchors expose an object), so
          // fall back to the raw attribute instead of calling string methods on it.
          const rawHref = typeof el.href === 'string' && el.href !== ''
            ? el.href
            : el.getAttribute('href');

          if (!text || text.length < 2 || typeof rawHref !== 'string' || rawHref === '') return;

          // Resolve against the base so relative and protocol-relative hrefs
          // become absolute; unparsable hrefs are skipped.
          let href: string;
          try {
            // @ts-ignore - runs in browser context
            href = new URL(rawHref, base).href;
          } catch {
            return;
          }

          // Skip external links and anchors
          if (!href.includes(base) || href.includes('#')) return;

          // Skip duplicates
          const linkKey = `${text}:${href}`;
          if (foundLinks.has(linkKey)) return;
          foundLinks.add(linkKey);

          // Determine if it's likely a category
          const lowerText = text.toLowerCase();
          const lowerHref = href.toLowerCase();
          const isCategory = categoryKeywords.some(kw =>
            lowerText.includes(kw) || lowerHref.includes(kw)
          );

          links.push({ text, href, isCategory });
        });
      }

      return links;
    }, baseUrl);
  }

  /**
   * Check if the page is a Dutchie menu.
   *
   * Looks at `window.reactEnv` URLs first, then falls back to searching the
   * document HTML for Dutchie markers.
   *
   * @param page Browser page exposing `evaluate`.
   * @returns True when any Dutchie marker is present.
   */
  private async isDutchieMenu(page: any): Promise<boolean> {
    return await page.evaluate(() => {
      // @ts-ignore - runs in browser context
      const env = (window as any).reactEnv;
      if (env) {
        if (env.adminUrl?.includes('dutchie.com') ||
            env.apiUrl?.includes('dutchie.com') ||
            env.consumerUrl?.includes('dutchie.com')) {
          return true;
        }
      }

      // @ts-ignore - runs in browser context
      const htmlContent = document.documentElement.innerHTML;
      return (
        htmlContent.includes('admin.dutchie.com') ||
        htmlContent.includes('api.dutchie.com') ||
        htmlContent.includes('embedded-menu') ||
        htmlContent.includes('window.reactEnv')
      );
    });
  }

  /**
   * Run `work` inside a database transaction on a dedicated client.
   *
   * Commits when `work` resolves; otherwise rolls back and re-throws the
   * original error. A failing ROLLBACK is only logged so that it cannot hide
   * the error that caused it. The client is always released.
   *
   * @param work Receives the transaction's client and returns the result.
   * @returns Whatever `work` resolved with.
   */
  private async withTransaction<T>(work: (client: any) => Promise<T>): Promise<T> {
    const client = await pool.connect();

    try {
      await client.query('BEGIN');
      const result = await work(client);
      await client.query('COMMIT');
      return result;
    } catch (error) {
      try {
        await client.query('ROLLBACK');
      } catch (rollbackError) {
        logger.warn('categories', `Rollback failed: ${rollbackError}`);
      }
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Create categories for Dutchie menus (predefined structure).
   *
   * Upserts a fixed tree on (store_id, slug): "Shop" with its subcategories,
   * plus top-level "Brands" and "Specials". Everything is written in one
   * transaction.
   *
   * @param storeId Store the categories belong to.
   * @param store Store row; only `dutchie_url` is read.
   * @param discoveredLinks Currently unused; kept so both create methods are
   *   called the same way.
   * @throws Re-throws any database error after logging it.
   */
  private async createDutchieCategories(storeId: number, store: any, discoveredLinks: NavigationLink[]): Promise<void> {
    // Parents are listed before their children so parent ids are known in time.
    const DUTCHIE_CATEGORIES: Array<{ name: string; slug: string; parentSlug?: string }> = [
      { name: 'Shop', slug: 'shop', parentSlug: undefined },
      { name: 'Flower', slug: 'flower', parentSlug: 'shop' },
      { name: 'Pre-Rolls', slug: 'pre-rolls', parentSlug: 'shop' },
      { name: 'Vaporizers', slug: 'vaporizers', parentSlug: 'shop' },
      { name: 'Concentrates', slug: 'concentrates', parentSlug: 'shop' },
      { name: 'Edibles', slug: 'edibles', parentSlug: 'shop' },
      { name: 'Topicals', slug: 'topicals', parentSlug: 'shop' },
      { name: 'Accessories', slug: 'accessories', parentSlug: 'shop' },
      { name: 'Brands', slug: 'brands', parentSlug: undefined },
      { name: 'Specials', slug: 'specials', parentSlug: undefined }
    ];

    // Drop trailing slashes so joined URLs never contain "//".
    const baseUrl = String(store.dutchie_url).replace(/\/+$/, '');

    try {
      const created = await this.withTransaction(async (client) => {
        logger.info('categories', `Creating predefined Dutchie category structure`);

        // slug -> id of the top-level rows written in this transaction
        const parentIds = new Map<string, number>();
        let count = 0;

        for (const category of DUTCHIE_CATEGORIES) {
          // Top-level: {base}/shop — subcategory: {base}/shop/flower
          const path = category.parentSlug ? `${category.parentSlug}/${category.slug}` : category.slug;
          const categoryUrl = `${baseUrl}/${path}`;

          if (!category.parentSlug) {
            // Create parent category; RETURNING gives the id for inserts and updates alike.
            const inserted = await client.query(`
              INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
              VALUES ($1, $2, $3, $4, $5, true, NULL)
              ON CONFLICT (store_id, slug)
              DO UPDATE SET name = $2, dutchie_url = $4, path = $5
              RETURNING id
            `, [storeId, category.name, category.slug, categoryUrl, path]);

            const parentRowId = inserted.rows[0]?.id;
            if (parentRowId != null) {
              parentIds.set(category.slug, parentRowId);
            }

            count++;
            logger.info('categories', `📁 ${category.name}`);
            continue;
          }

          // Create subcategory
          const parentId = parentIds.get(category.parentSlug);
          if (parentId === undefined) {
            logger.warn('categories', `Skipping ${category.name}: parent category "${category.parentSlug}" not found`);
            continue;
          }

          await client.query(`
            INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, parent_id)
            VALUES ($1, $2, $3, $4, $5, true, $6)
            ON CONFLICT (store_id, slug)
            DO UPDATE SET name = $2, dutchie_url = $4, path = $5, parent_id = $6
          `, [storeId, category.name, category.slug, categoryUrl, path, parentId]);

          count++;
          logger.info('categories', ` └── ${category.name}`);
        }

        return count;
      });

      logger.info('categories', `✅ Created ${created} Dutchie categories successfully`);
    } catch (error) {
      logger.error('categories', `Failed to create Dutchie categories: ${error}`);
      throw error;
    }
  }

  /**
   * Create categories from discovered links (custom menus).
   *
   * Only links flagged `isCategory` are written, in one transaction, upserting
   * on (store_id, slug). Links whose text yields an empty slug are skipped.
   *
   * NOTE(review): the statement writes `display_order`; this presumably
   * relies on `ensureDisplayOrderColumn` having been run — confirm with callers.
   *
   * @param storeId Store the categories belong to.
   * @param store Store row; not read here.
   * @param links Links produced by `extractNavigationLinks`.
   * @throws Re-throws any error (database, or an href that is not a valid
   *   absolute URL) after logging it.
   */
  private async createCustomCategories(storeId: number, store: any, links: NavigationLink[]): Promise<void> {
    // Filter to likely category links
    const categoryLinks = links.filter(link => link.isCategory);

    try {
      const created = await this.withTransaction(async (client) => {
        let displayOrder = 0;

        for (const link of categoryLinks) {
          // Generate slug from text
          const slug = link.text
            .toLowerCase()
            .replace(/[^a-z0-9]+/g, '-')
            .replace(/^-|-$/g, '');

          // Text without any [a-z0-9] characters would produce an empty slug.
          if (!slug) {
            logger.warn('categories', `Skipping link without usable slug: ${link.text} -> ${link.href}`);
            continue;
          }

          // Determine path from URL
          const path = new URL(link.href).pathname.replace(/^\//, '');

          await client.query(`
            INSERT INTO categories (store_id, name, slug, dutchie_url, path, scrape_enabled, display_order)
            VALUES ($1, $2, $3, $4, $5, true, $6)
            ON CONFLICT (store_id, slug)
            DO UPDATE SET name = $2, dutchie_url = $4, path = $5, display_order = $6
          `, [storeId, link.text, slug, link.href, path, displayOrder]);

          displayOrder++;
          logger.info('categories', `📁 ${link.text} -> ${link.href}`);
        }

        return displayOrder;
      });

      logger.info('categories', `✅ Created ${created} custom categories`);
    } catch (error) {
      logger.error('categories', `Failed to create custom categories: ${error}`);
      throw error;
    }
  }

  /**
   * Ensure the `display_order` column exists on the `categories` table.
   *
   * Best effort: a failure is logged as a warning and not re-thrown.
   */
  async ensureDisplayOrderColumn(): Promise<void> {
    try {
      await pool.query(`
        ALTER TABLE categories
        ADD COLUMN IF NOT EXISTS display_order INTEGER DEFAULT 0
      `);
      logger.info('categories', 'Ensured display_order column exists');
    } catch (error) {
      logger.warn('categories', `Could not add display_order column: ${error}`);
    }
  }
}
|
||||
Reference in New Issue
Block a user