Files
cannaiq/backend/src/routes/scraper-monitor.ts
Kelly 2f483b3084 feat: SEO template library, discovery pipeline, and orchestrator enhancements
## SEO Template Library
- Add complete template library with 7 page types (state, city, category, brand, product, search, regeneration)
- Add Template Library tab in SEO Orchestrator with accordion-based editors
- Add template preview, validation, and variable injection engine
- Add API endpoints: /api/seo/templates, preview, validate, generate, regenerate

## Discovery Pipeline
- Add promotion.ts for discovery location validation and promotion
- Add discover-all-states.ts script for multi-state discovery
- Add promotion log migration (067)
- Enhance discovery routes and types

## Orchestrator & Admin
- Add crawl_enabled filter to stores page
- Add API permissions page
- Add job queue management
- Add price analytics routes
- Add markets and intelligence routes
- Enhance dashboard and worker monitoring

## Infrastructure
- Add migrations for worker definitions, SEO settings, field alignment
- Add canonical pipeline for scraper v2
- Update hydration and sync orchestrator
- Enhance multi-state query service

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-09 00:05:34 -07:00

383 lines
10 KiB
TypeScript

import { Router } from 'express';
import { authMiddleware } from '../auth/middleware';
import { pool } from '../db/pool';
// Express router for scraper monitoring endpoints.
const router = Router();
// All monitor routes require an authenticated caller.
router.use(authMiddleware);
// In-memory storage for active scrapers
/**
 * Live progress snapshot for a single scraper run.
 * Entries are created by registerScraper(), mutated by updateScraperStats(),
 * and removed ~5 minutes after completeScraper() marks them finished.
 */
interface ActiveScraper {
  id: string;                 // unique run id; also the map key below
  storeId: number;
  storeName: string;
  categoryId?: number;        // present only for category-scoped runs
  categoryName?: string;
  startTime: Date;            // when the run was registered
  lastUpdate: Date;           // last heartbeat; used for staleness detection
  status: 'running' | 'error' | 'completed';
  stats: {
    requestsTotal: number;
    requestsSuccess: number;
    itemsSaved: number;
    itemsDropped: number;
    errorsCount: number;
    productsProcessed?: number;
    productsTotal?: number;
  };
  currentActivity?: string;   // free-form human-readable progress note
}
// Process-local registry of scraper runs, keyed by run id.
// NOTE(review): in-memory only — state is lost on restart and not shared
// across processes; confirm this is intentional for the deployment model.
export const activeScrapers = new Map<string, ActiveScraper>();
// List every registered scraper run, annotated with elapsed duration and a
// staleness flag (no heartbeat received within the last minute).
router.get('/active', async (req, res) => {
  try {
    const scrapers = Array.from(activeScrapers.values()).map((entry) => ({
      ...entry,
      duration: Date.now() - entry.startTime.getTime(),
      isStale: Date.now() - entry.lastUpdate.getTime() > 60000 // 1 minute
    }));
    res.json({ scrapers });
  } catch (error) {
    console.error('Error fetching active scrapers:', error);
    res.status(500).json({ error: 'Failed to fetch active scrapers' });
  }
});
// Fetch a single scraper run by id; 404 when the id is not registered.
router.get('/active/:id', async (req, res) => {
  try {
    const scraper = activeScrapers.get(req.params.id);
    if (!scraper) {
      return res.status(404).json({ error: 'Scraper not found' });
    }
    const payload = {
      ...scraper,
      duration: Date.now() - scraper.startTime.getTime(),
      isStale: Date.now() - scraper.lastUpdate.getTime() > 60000 // >1 min since heartbeat
    };
    res.json({ scraper: payload });
  } catch (error) {
    console.error('Error fetching scraper:', error);
    res.status(500).json({ error: 'Failed to fetch scraper' });
  }
});
// Completed-crawl history, newest first. Optional dispensary_id filter;
// limit defaults to 50. Placeholders are numbered via params.length so the
// SQL and its bind values can never drift apart.
router.get('/history', async (req, res) => {
  try {
    const { limit = 50, dispensary_id } = req.query;

    const params: any[] = [];
    let sql = `
      SELECT
        d.id as dispensary_id,
        d.name as dispensary_name,
        d.city,
        d.state,
        dcj.id as job_id,
        dcj.job_type,
        dcj.status,
        dcj.products_found,
        dcj.products_new,
        dcj.products_updated,
        dcj.in_stock_count,
        dcj.out_of_stock_count,
        dcj.duration_ms,
        dcj.completed_at as last_scraped_at,
        dcj.error_message,
        (
          SELECT COUNT(*)
          FROM store_products sp
          WHERE sp.dispensary_id = d.id
            AND sp.last_seen_at >= NOW() - INTERVAL '7 days'
        ) as product_count
      FROM dispensary_crawl_jobs dcj
      JOIN dispensaries d ON d.id = dcj.dispensary_id
      WHERE dcj.completed_at IS NOT NULL
    `;

    if (dispensary_id) {
      params.push(dispensary_id);
      sql += ` AND d.id = $${params.length}`;
    }

    params.push(limit);
    sql += ` ORDER BY dcj.completed_at DESC LIMIT $${params.length}`;

    const result = await pool.query(sql, params);
    res.json({ history: result.rows });
  } catch (error) {
    console.error('Error fetching scraper history:', error);
    res.status(500).json({ error: 'Failed to fetch scraper history' });
  }
});
// Helper function to register a scraper
export function registerScraper(
id: string,
storeId: number,
storeName: string,
categoryId?: number,
categoryName?: string
): void {
activeScrapers.set(id, {
id,
storeId,
storeName,
categoryId,
categoryName,
startTime: new Date(),
lastUpdate: new Date(),
status: 'running',
stats: {
requestsTotal: 0,
requestsSuccess: 0,
itemsSaved: 0,
itemsDropped: 0,
errorsCount: 0
}
});
}
/**
 * Merge a partial stats delta into a tracked scraper run and refresh its
 * heartbeat timestamp. Unknown ids are ignored silently.
 */
export function updateScraperStats(
  id: string,
  stats: Partial<ActiveScraper['stats']>,
  currentActivity?: string
): void {
  const entry = activeScrapers.get(id);
  if (!entry) {
    return; // run was never registered or already evicted
  }
  entry.stats = { ...entry.stats, ...stats };
  entry.lastUpdate = new Date();
  if (currentActivity) {
    entry.currentActivity = currentActivity;
  }
}
/**
 * Mark a scraper run as finished and schedule its eviction from the tracker.
 *
 * @param id    run id previously passed to registerScraper()
 * @param error when provided, the run is marked 'error' instead of 'completed'
 *
 * The entry is kept visible for 5 minutes so dashboards can show the final
 * state, then removed. The deferred delete checks object identity first:
 * if the same id has been re-registered in the meantime, the stale timer
 * must NOT wipe the new run's live entry (the original code deleted by id
 * unconditionally, which could drop a fresh run started within the window).
 */
export function completeScraper(id: string, error?: string): void {
  const scraper = activeScrapers.get(id);
  if (scraper) {
    scraper.status = error ? 'error' : 'completed';
    scraper.lastUpdate = new Date();
    // Remove after 5 minutes — only if this exact entry is still registered.
    setTimeout(() => {
      if (activeScrapers.get(id) === scraper) {
        activeScrapers.delete(id);
      }
    }, 5 * 60 * 1000);
  }
}
// Dispensary crawl jobs endpoints
// Aggregate job counts per status plus product totals for completed jobs,
// optionally scoped to one dispensary.
router.get('/jobs/stats', async (req, res) => {
  try {
    const { dispensary_id } = req.query;

    const params: any[] = [];
    let whereClause = '';
    if (dispensary_id) {
      params.push(dispensary_id);
      whereClause = 'WHERE dispensary_id = $1';
    }

    const result = await pool.query(`
      SELECT
        status,
        COUNT(*) as count,
        SUM(products_found) as total_products_found,
        SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved
      FROM dispensary_crawl_jobs
      ${whereClause}
      GROUP BY status
    `, params);

    const stats = {
      pending: 0,
      in_progress: 0,
      completed: 0,
      failed: 0,
      total_products_found: 0,
      total_products_saved: 0
    };

    type StatRow = { status: string; count: string; total_products_found?: string; total_products_saved?: string };
    for (const row of result.rows as StatRow[]) {
      stats[row.status as keyof typeof stats] = parseInt(row.count);
      // Product totals are only meaningful for jobs that actually finished.
      if (row.status === 'completed') {
        stats.total_products_found += parseInt(row.total_products_found || '0');
        stats.total_products_saved += parseInt(row.total_products_saved || '0');
      }
    }

    res.json(stats);
  } catch (error) {
    console.error('Error fetching job stats:', error);
    res.status(500).json({ error: 'Failed to fetch job stats' });
  }
});
// Jobs currently in progress, joined with dispensary names and a live
// duration computed server-side from started_at.
router.get('/jobs/active', async (req, res) => {
  try {
    const { dispensary_id } = req.query;

    const params: any[] = [];
    let whereClause = "WHERE dcj.status = 'in_progress'";
    if (dispensary_id) {
      params.push(dispensary_id);
      whereClause += ` AND dcj.dispensary_id = $${params.length}`;
    }

    const result = await pool.query(`
      SELECT
        dcj.id,
        dcj.dispensary_id,
        d.name as dispensary_name,
        dcj.job_type,
        dcj.status,
        dcj.worker_id,
        dcj.started_at,
        dcj.products_found,
        COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
        EXTRACT(EPOCH FROM (NOW() - dcj.started_at)) as duration_seconds
      FROM dispensary_crawl_jobs dcj
      JOIN dispensaries d ON d.id = dcj.dispensary_id
      ${whereClause}
      ORDER BY dcj.started_at DESC
    `, params);

    res.json({ jobs: result.rows });
  } catch (error) {
    console.error('Error fetching active jobs:', error);
    res.status(500).json({ error: 'Failed to fetch active jobs' });
  }
});
// Most recent jobs (any status), filterable by dispensary_id and/or status.
// limit defaults to 50; placeholders track params.length so filters and the
// LIMIT bind stay in sync however many conditions apply.
router.get('/jobs/recent', async (req, res) => {
  try {
    const { limit = 50, dispensary_id, status } = req.query;

    const params: any[] = [];
    const conditions: string[] = [];
    if (dispensary_id) {
      params.push(dispensary_id);
      conditions.push(`dcj.dispensary_id = $${params.length}`);
    }
    if (status) {
      params.push(status);
      conditions.push(`dcj.status = $${params.length}`);
    }
    const whereClause = conditions.length > 0 ? 'WHERE ' + conditions.join(' AND ') : '';

    params.push(limit);
    const result = await pool.query(`
      SELECT
        dcj.id,
        dcj.dispensary_id,
        d.name as dispensary_name,
        dcj.job_type,
        dcj.status,
        dcj.worker_id,
        dcj.started_at,
        dcj.completed_at,
        dcj.products_found,
        COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
        dcj.error_message,
        EXTRACT(EPOCH FROM (COALESCE(dcj.completed_at, NOW()) - dcj.started_at)) as duration_seconds
      FROM dispensary_crawl_jobs dcj
      JOIN dispensaries d ON d.id = dcj.dispensary_id
      ${whereClause}
      ORDER BY dcj.created_at DESC
      LIMIT $${params.length}
    `, params);

    res.json({ jobs: result.rows });
  } catch (error) {
    console.error('Error fetching recent jobs:', error);
    res.status(500).json({ error: 'Failed to fetch recent jobs' });
  }
});
// Per-worker rollup of in-progress jobs: active job count, product totals,
// and the time window the worker has been busy.
router.get('/jobs/workers', async (req, res) => {
  try {
    const { dispensary_id } = req.query;

    const params: any[] = [];
    let whereClause = "WHERE status = 'in_progress' AND worker_id IS NOT NULL";
    if (dispensary_id) {
      params.push(dispensary_id);
      whereClause += ` AND dispensary_id = $1`;
    }

    const result = await pool.query(`
      SELECT
        worker_id,
        COUNT(*) as active_jobs,
        SUM(products_found) as total_products_found,
        SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved,
        MIN(started_at) as earliest_start,
        MAX(started_at) as latest_start
      FROM dispensary_crawl_jobs
      ${whereClause}
      GROUP BY worker_id
      ORDER BY worker_id
    `, params);

    res.json({ workers: result.rows });
  } catch (error) {
    console.error('Error fetching worker stats:', error);
    res.status(500).json({ error: 'Failed to fetch worker stats' });
  }
});
// Tail the on-disk log file for a single worker (last 100 lines).
//
// SECURITY: workerId comes straight from the URL and is interpolated into a
// filesystem path. Without validation, a value like "../../etc/passwd" makes
// path.join('/tmp', `worker-${workerId}.log`) normalize OUTSIDE /tmp
// (path traversal), letting a caller read arbitrary *.log files. The id is
// therefore allow-listed to a simple token before any path is built.
router.get('/jobs/worker-logs/:workerId', async (req, res) => {
  try {
    const { workerId } = req.params;

    // Reject anything that is not alphanumeric/underscore/hyphen.
    if (!/^[A-Za-z0-9_-]+$/.test(workerId)) {
      return res.status(400).json({ error: 'Invalid worker id' });
    }

    const fs = await import('fs/promises');
    const path = await import('path');
    const logPath = path.join('/tmp', `worker-${workerId}.log`);

    try {
      const logs = await fs.readFile(logPath, 'utf-8');
      const lines = logs.split('\n');
      // Return last 100 lines
      const recentLogs = lines.slice(-100).join('\n');
      res.json({ logs: recentLogs });
    } catch (fileError) {
      // A missing file is normal for a worker that has not written logs yet.
      res.json({ logs: 'No logs available for this worker yet.' });
    }
  } catch (error) {
    console.error('Failed to get worker logs:', error);
    res.status(500).json({ error: 'Failed to get worker logs' });
  }
});
export default router;