## SEO Template Library

- Add complete template library with 7 page types (state, city, category, brand, product, search, regeneration)
- Add Template Library tab in SEO Orchestrator with accordion-based editors
- Add template preview, validation, and variable injection engine
- Add API endpoints: /api/seo/templates, preview, validate, generate, regenerate

## Discovery Pipeline

- Add promotion.ts for discovery location validation and promotion
- Add discover-all-states.ts script for multi-state discovery
- Add promotion log migration (067)
- Enhance discovery routes and types

## Orchestrator & Admin

- Add crawl_enabled filter to stores page
- Add API permissions page
- Add job queue management
- Add price analytics routes
- Add markets and intelligence routes
- Enhance dashboard and worker monitoring

## Infrastructure

- Add migrations for worker definitions, SEO settings, field alignment
- Add canonical pipeline for scraper v2
- Update hydration and sync orchestrator
- Enhance multi-state query service

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
383 lines · 10 KiB · TypeScript
import { Router } from 'express';
import { authMiddleware } from '../auth/middleware';
import { pool } from '../db/pool';

const router = Router();
router.use(authMiddleware);

// In-memory storage for active scrapers (not persisted; cleared on process restart)
interface ActiveScraper {
  id: string;
  storeId: number;
  storeName: string;
  categoryId?: number;
  categoryName?: string;
  startTime: Date;
  lastUpdate: Date;
  status: 'running' | 'error' | 'completed';
  stats: {
    requestsTotal: number;
    requestsSuccess: number;
    itemsSaved: number;
    itemsDropped: number;
    errorsCount: number;
    productsProcessed?: number;
    productsTotal?: number;
  };
  currentActivity?: string;
}

export const activeScrapers = new Map<string, ActiveScraper>();

// Get all active scrapers
router.get('/active', async (req, res) => {
  try {
    const scrapers = Array.from(activeScrapers.values()).map(scraper => ({
      ...scraper,
      duration: Date.now() - scraper.startTime.getTime(),
      isStale: Date.now() - scraper.lastUpdate.getTime() > 60000 // 1 minute
    }));

    res.json({ scrapers });
  } catch (error) {
    console.error('Error fetching active scrapers:', error);
    res.status(500).json({ error: 'Failed to fetch active scrapers' });
  }
});

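// Illustrative response shape for GET /active (abridged; the ids, names, and
// numbers below are hypothetical, and Date fields serialize as ISO strings):
//   {
//     "scrapers": [{
//       "id": "run-123",
//       "storeId": 42,
//       "storeName": "Example Store",
//       "status": "running",
//       "currentActivity": "Fetching page 2",
//       "stats": { "requestsTotal": 10, "requestsSuccess": 9, "itemsSaved": 180,
//                  "itemsDropped": 2, "errorsCount": 1 },
//       "duration": 45210,
//       "isStale": false
//     }]
//   }
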
// Get scraper by ID
router.get('/active/:id', async (req, res) => {
  try {
    const { id } = req.params;
    const scraper = activeScrapers.get(id);

    if (!scraper) {
      return res.status(404).json({ error: 'Scraper not found' });
    }

    res.json({
      scraper: {
        ...scraper,
        duration: Date.now() - scraper.startTime.getTime(),
        isStale: Date.now() - scraper.lastUpdate.getTime() > 60000
      }
    });
  } catch (error) {
    console.error('Error fetching scraper:', error);
    res.status(500).json({ error: 'Failed to fetch scraper' });
  }
});

// Get scraper history (most recent completed scrapes; default limit 50)
router.get('/history', async (req, res) => {
  try {
    const { limit = 50, dispensary_id } = req.query;

    let query = `
      SELECT
        d.id as dispensary_id,
        d.name as dispensary_name,
        d.city,
        d.state,
        dcj.id as job_id,
        dcj.job_type,
        dcj.status,
        dcj.products_found,
        dcj.products_new,
        dcj.products_updated,
        dcj.in_stock_count,
        dcj.out_of_stock_count,
        dcj.duration_ms,
        dcj.completed_at as last_scraped_at,
        dcj.error_message,
        (
          SELECT COUNT(*)
          FROM store_products sp
          WHERE sp.dispensary_id = d.id
            AND sp.last_seen_at >= NOW() - INTERVAL '7 days'
        ) as product_count
      FROM dispensary_crawl_jobs dcj
      JOIN dispensaries d ON d.id = dcj.dispensary_id
      WHERE dcj.completed_at IS NOT NULL
    `;

    const params: any[] = [];
    let paramCount = 1;

    if (dispensary_id) {
      query += ` AND d.id = $${paramCount}`;
      params.push(dispensary_id);
      paramCount++;
    }

    query += ` ORDER BY dcj.completed_at DESC LIMIT $${paramCount}`;
    params.push(limit);

    const result = await pool.query(query, params);

    res.json({ history: result.rows });
  } catch (error) {
    console.error('Error fetching scraper history:', error);
    res.status(500).json({ error: 'Failed to fetch scraper history' });
  }
});

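// Example requests (the dispensary id is hypothetical):
//   GET /history                            -> 50 most recently completed jobs, all dispensaries
//   GET /history?dispensary_id=12&limit=20  -> 20 most recent jobs for dispensary 12
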
// Helper function to register a scraper
export function registerScraper(
  id: string,
  storeId: number,
  storeName: string,
  categoryId?: number,
  categoryName?: string
): void {
  activeScrapers.set(id, {
    id,
    storeId,
    storeName,
    categoryId,
    categoryName,
    startTime: new Date(),
    lastUpdate: new Date(),
    status: 'running',
    stats: {
      requestsTotal: 0,
      requestsSuccess: 0,
      itemsSaved: 0,
      itemsDropped: 0,
      errorsCount: 0
    }
  });
}

// Helper function to merge partial stats into a scraper and refresh its heartbeat
export function updateScraperStats(
  id: string,
  stats: Partial<ActiveScraper['stats']>,
  currentActivity?: string
): void {
  const scraper = activeScrapers.get(id);
  if (scraper) {
    scraper.stats = { ...scraper.stats, ...stats };
    scraper.lastUpdate = new Date();
    if (currentActivity) {
      scraper.currentActivity = currentActivity;
    }
  }
}

// Helper function to mark a scraper as completed (or errored, if an error message is given)
export function completeScraper(id: string, error?: string): void {
  const scraper = activeScrapers.get(id);
  if (scraper) {
    scraper.status = error ? 'error' : 'completed';
    scraper.lastUpdate = new Date();

    // Remove from the registry after 5 minutes
    setTimeout(() => {
      activeScrapers.delete(id);
    }, 5 * 60 * 1000);
  }
}

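// Typical lifecycle as seen from a scraper run (a sketch; the run ids, store,
// and stats values are hypothetical):
//   registerScraper('run-123', 42, 'Example Store');
//   updateScraperStats('run-123', { requestsTotal: 10, itemsSaved: 180 }, 'Fetching page 2');
//   completeScraper('run-123');                     // success
//   completeScraper('run-456', 'Request timeout');  // marks the run as 'error'
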
// Dispensary crawl jobs endpoints
router.get('/jobs/stats', async (req, res) => {
  try {
    const { dispensary_id } = req.query;

    let whereClause = '';
    const params: any[] = [];

    if (dispensary_id) {
      whereClause = 'WHERE dispensary_id = $1';
      params.push(dispensary_id);
    }

    const result = await pool.query(`
      SELECT
        status,
        COUNT(*) as count,
        SUM(products_found) as total_products_found,
        SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved
      FROM dispensary_crawl_jobs
      ${whereClause}
      GROUP BY status
    `, params);

    const stats = {
      pending: 0,
      in_progress: 0,
      completed: 0,
      failed: 0,
      total_products_found: 0,
      total_products_saved: 0
    };

    // Fold per-status counts into the summary; product totals accumulate from completed jobs only
    result.rows.forEach((row: { status: string; count: string; total_products_found?: string; total_products_saved?: string }) => {
      stats[row.status as keyof typeof stats] = parseInt(row.count);
      if (row.status === 'completed') {
        stats.total_products_found += parseInt(row.total_products_found || '0');
        stats.total_products_saved += parseInt(row.total_products_saved || '0');
      }
    });

    res.json(stats);
  } catch (error) {
    console.error('Error fetching job stats:', error);
    res.status(500).json({ error: 'Failed to fetch job stats' });
  }
});

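// Illustrative response for GET /jobs/stats (values are hypothetical):
//   { "pending": 3, "in_progress": 1, "completed": 120, "failed": 4,
//     "total_products_found": 45210, "total_products_saved": 44980 }
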
router.get('/jobs/active', async (req, res) => {
  try {
    const { dispensary_id } = req.query;

    let whereClause = "WHERE dcj.status = 'in_progress'";
    const params: any[] = [];
    let paramCount = 1;

    if (dispensary_id) {
      whereClause += ` AND dcj.dispensary_id = $${paramCount}`;
      params.push(dispensary_id);
      paramCount++;
    }

    const result = await pool.query(`
      SELECT
        dcj.id,
        dcj.dispensary_id,
        d.name as dispensary_name,
        dcj.job_type,
        dcj.status,
        dcj.worker_id,
        dcj.started_at,
        dcj.products_found,
        COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
        EXTRACT(EPOCH FROM (NOW() - dcj.started_at)) as duration_seconds
      FROM dispensary_crawl_jobs dcj
      JOIN dispensaries d ON d.id = dcj.dispensary_id
      ${whereClause}
      ORDER BY dcj.started_at DESC
    `, params);

    res.json({ jobs: result.rows });
  } catch (error) {
    console.error('Error fetching active jobs:', error);
    res.status(500).json({ error: 'Failed to fetch active jobs' });
  }
});

router.get('/jobs/recent', async (req, res) => {
  try {
    const { limit = 50, dispensary_id, status } = req.query;

    let whereClause = '';
    const params: any[] = [];
    let paramCount = 1;

    const conditions: string[] = [];

    if (dispensary_id) {
      conditions.push(`dcj.dispensary_id = $${paramCount}`);
      params.push(dispensary_id);
      paramCount++;
    }

    if (status) {
      conditions.push(`dcj.status = $${paramCount}`);
      params.push(status);
      paramCount++;
    }

    if (conditions.length > 0) {
      whereClause = 'WHERE ' + conditions.join(' AND ');
    }

    params.push(limit);

    const result = await pool.query(`
      SELECT
        dcj.id,
        dcj.dispensary_id,
        d.name as dispensary_name,
        dcj.job_type,
        dcj.status,
        dcj.worker_id,
        dcj.started_at,
        dcj.completed_at,
        dcj.products_found,
        COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved,
        dcj.error_message,
        EXTRACT(EPOCH FROM (COALESCE(dcj.completed_at, NOW()) - dcj.started_at)) as duration_seconds
      FROM dispensary_crawl_jobs dcj
      JOIN dispensaries d ON d.id = dcj.dispensary_id
      ${whereClause}
      ORDER BY dcj.created_at DESC
      LIMIT $${paramCount}
    `, params);

    res.json({ jobs: result.rows });
  } catch (error) {
    console.error('Error fetching recent jobs:', error);
    res.status(500).json({ error: 'Failed to fetch recent jobs' });
  }
});

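// Example requests (the id shown is hypothetical):
//   GET /jobs/recent?status=failed&limit=10  -> 10 most recently created failed jobs
//   GET /jobs/recent?dispensary_id=12        -> recent jobs for one dispensary
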
router.get('/jobs/workers', async (req, res) => {
  try {
    const { dispensary_id } = req.query;

    let whereClause = "WHERE status = 'in_progress' AND worker_id IS NOT NULL";
    const params: any[] = [];

    if (dispensary_id) {
      whereClause += ` AND dispensary_id = $1`;
      params.push(dispensary_id);
    }

    const result = await pool.query(`
      SELECT
        worker_id,
        COUNT(*) as active_jobs,
        SUM(products_found) as total_products_found,
        SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved,
        MIN(started_at) as earliest_start,
        MAX(started_at) as latest_start
      FROM dispensary_crawl_jobs
      ${whereClause}
      GROUP BY worker_id
      ORDER BY worker_id
    `, params);

    res.json({ workers: result.rows });
  } catch (error) {
    console.error('Error fetching worker stats:', error);
    res.status(500).json({ error: 'Failed to fetch worker stats' });
  }
});

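// Illustrative response for GET /jobs/workers. Note that node-postgres returns
// COUNT/SUM aggregates as strings, so the numeric fields arrive quoted
// (values below are hypothetical):
//   { "workers": [{ "worker_id": "worker-1", "active_jobs": "3",
//     "total_products_found": "1200", "total_products_saved": "1180",
//     "earliest_start": "2024-01-01T00:00:00.000Z", "latest_start": "..." }] }
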
router.get('/jobs/worker-logs/:workerId', async (req, res) => {
  try {
    const { workerId } = req.params;
    const fs = await import('fs/promises');
    const path = await import('path');

    // NOTE: workerId is interpolated into a filesystem path; callers are trusted here
    const logPath = path.join('/tmp', `worker-${workerId}.log`);

    try {
      const logs = await fs.readFile(logPath, 'utf-8');
      const lines = logs.split('\n');
      // Return only the last 100 lines
      const recentLogs = lines.slice(-100).join('\n');

      res.json({ logs: recentLogs });
    } catch (fileError) {
      res.json({ logs: 'No logs available for this worker yet.' });
    }
  } catch (error) {
    console.error('Failed to get worker logs:', error);
    res.status(500).json({ error: 'Failed to get worker logs' });
  }
});

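// A minimal input-validation sketch for the handler above (an assumption, not
// part of the current behavior: worker ids are taken to be alphanumeric/dashes):
//   if (!/^[A-Za-z0-9_-]+$/.test(workerId)) {
//     return res.status(400).json({ error: 'Invalid worker id' });
//   }
// Without a check like this, a crafted :workerId could point the read outside /tmp.
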
export default router;