feat: SEO template library, discovery pipeline, and orchestrator enhancements

## SEO Template Library
- Add complete template library with 7 page types (state, city, category, brand, product, search, regeneration)
- Add Template Library tab in SEO Orchestrator with accordion-based editors
- Add template preview, validation, and variable injection engine
- Add API endpoints: /api/seo/templates, preview, validate, generate, regenerate

## Discovery Pipeline
- Add promotion.ts for discovery location validation and promotion
- Add discover-all-states.ts script for multi-state discovery
- Add promotion log migration (067)
- Enhance discovery routes and types

## Orchestrator & Admin
- Add crawl_enabled filter to stores page
- Add API permissions page
- Add job queue management
- Add price analytics routes
- Add markets and intelligence routes
- Enhance dashboard and worker monitoring

## Infrastructure
- Add migrations for worker definitions, SEO settings, field alignment
- Add canonical pipeline for scraper v2
- Update hydration and sync orchestrator
- Enhance multi-state query service

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-09 00:05:34 -07:00
parent 9711d594db
commit 2f483b3084
83 changed files with 16700 additions and 1277 deletions

View File

@@ -24,37 +24,22 @@ const router = Router();
*/
router.get('/metrics', async (_req: Request, res: Response) => {
try {
// Get aggregate metrics
// Get aggregate metrics using 7-stage pipeline
const { rows: metrics } = await pool.query(`
SELECT
(SELECT COUNT(*) FROM dutchie_products) as total_products,
(SELECT COUNT(DISTINCT brand_name) FROM dutchie_products WHERE brand_name IS NOT NULL) as total_brands,
(SELECT COUNT(*) FROM dispensaries WHERE state = 'AZ') as total_stores,
(
SELECT COUNT(*)
FROM dispensary_crawler_profiles dcp
WHERE dcp.enabled = true
AND (dcp.status = 'production' OR (dcp.config->>'status')::text = 'production')
) as healthy_count,
(
SELECT COUNT(*)
FROM dispensary_crawler_profiles dcp
WHERE dcp.enabled = true
AND (dcp.status = 'sandbox' OR (dcp.config->>'status')::text = 'sandbox')
) as sandbox_count,
(
SELECT COUNT(*)
FROM dispensary_crawler_profiles dcp
WHERE dcp.enabled = true
AND (dcp.status = 'needs_manual' OR (dcp.config->>'status')::text = 'needs_manual')
) as needs_manual_count,
(
SELECT COUNT(*)
FROM dispensary_crawler_profiles dcp
JOIN dispensaries d ON d.id = dcp.dispensary_id
WHERE d.state = 'AZ'
AND dcp.status = 'needs_manual'
) as failing_count
(SELECT COUNT(*) FROM store_products) as total_products,
(SELECT COUNT(DISTINCT brand_name_raw) FROM store_products WHERE brand_name_raw IS NOT NULL) as total_brands,
(SELECT COUNT(*) FROM dispensaries WHERE menu_type = 'dutchie' AND crawl_enabled = true) as total_stores,
-- Stage counts from dispensaries table (7-stage pipeline)
(SELECT COUNT(*) FROM dispensaries WHERE stage = 'discovered') as discovered_count,
(SELECT COUNT(*) FROM dispensaries WHERE stage = 'validated') as validated_count,
(SELECT COUNT(*) FROM dispensaries WHERE stage = 'promoted') as promoted_count,
(SELECT COUNT(*) FROM dispensaries WHERE stage = 'sandbox') as sandbox_count,
(SELECT COUNT(*) FROM dispensaries WHERE stage = 'hydrating') as hydrating_count,
(SELECT COUNT(*) FROM dispensaries WHERE stage = 'production') as production_count,
(SELECT COUNT(*) FROM dispensaries WHERE stage = 'failing') as failing_count,
-- Discovery pipeline counts
(SELECT COUNT(*) FROM dutchie_discovery_locations WHERE stage = 'discovered' AND active = true) as discovery_pending
`);
const row = metrics[0] || {};
@@ -63,13 +48,22 @@ router.get('/metrics', async (_req: Request, res: Response) => {
total_products: parseInt(row.total_products || '0', 10),
total_brands: parseInt(row.total_brands || '0', 10),
total_stores: parseInt(row.total_stores || '0', 10),
// Placeholder sentiment values - these would come from actual analytics
market_sentiment: 'neutral',
market_direction: 'stable',
// Health counts
healthy_count: parseInt(row.healthy_count || '0', 10),
// 7-Stage Pipeline Counts
stages: {
discovered: parseInt(row.discovered_count || '0', 10),
validated: parseInt(row.validated_count || '0', 10),
promoted: parseInt(row.promoted_count || '0', 10),
sandbox: parseInt(row.sandbox_count || '0', 10),
hydrating: parseInt(row.hydrating_count || '0', 10),
production: parseInt(row.production_count || '0', 10),
failing: parseInt(row.failing_count || '0', 10),
},
// Discovery pipeline
discovery_pending: parseInt(row.discovery_pending || '0', 10),
// Legacy compatibility
healthy_count: parseInt(row.production_count || '0', 10),
sandbox_count: parseInt(row.sandbox_count || '0', 10),
needs_manual_count: parseInt(row.needs_manual_count || '0', 10),
needs_manual_count: parseInt(row.failing_count || '0', 10),
failing_count: parseInt(row.failing_count || '0', 10),
});
} catch (error: any) {
@@ -157,9 +151,14 @@ router.get('/stores', async (req: Request, res: Response) => {
d.platform_dispensary_id,
d.last_crawl_at,
d.crawl_enabled,
d.stage,
d.stage_changed_at,
d.first_crawl_at,
d.last_successful_crawl_at,
dcp.id as profile_id,
dcp.profile_key,
COALESCE(dcp.status, dcp.config->>'status', 'legacy') as crawler_status,
dcp.consecutive_successes,
dcp.consecutive_failures,
(
SELECT MAX(cot.completed_at)
FROM crawl_orchestration_traces cot
@@ -172,8 +171,8 @@ router.get('/stores', async (req: Request, res: Response) => {
) as last_failure_at,
(
SELECT COUNT(*)
FROM dutchie_products dp
WHERE dp.dispensary_id = d.id
FROM store_products sp
WHERE sp.dispensary_id = d.id
) as product_count
FROM dispensaries d
LEFT JOIN dispensary_crawler_profiles dcp
@@ -197,10 +196,17 @@ router.get('/stores', async (req: Request, res: Response) => {
state: r.state,
provider: r.provider || 'unknown',
provider_raw: r.provider || null,
provider_display: getProviderDisplayName(r.provider),
// Admin routes show actual provider names (not anonymized)
provider_display: r.provider || 'Unknown',
platformDispensaryId: r.platform_dispensary_id,
crawlEnabled: r.crawl_enabled ?? false,
status: r.crawler_status || (r.platform_dispensary_id ? 'legacy' : 'pending'),
// Use stage from dispensaries table (6-stage pipeline)
stage: r.stage || 'discovered',
stageChangedAt: r.stage_changed_at,
firstCrawlAt: r.first_crawl_at,
lastSuccessfulCrawlAt: r.last_successful_crawl_at,
consecutiveSuccesses: r.consecutive_successes || 0,
consecutiveFailures: r.consecutive_failures || 0,
profileId: r.profile_id,
profileKey: r.profile_key,
lastCrawlAt: r.last_crawl_at,
@@ -438,4 +444,392 @@ router.get('/crawl-traces/:traceId', async (req: Request, res: Response) => {
}
});
// ============================================================
// STATUS MANAGEMENT
// ============================================================
// Stage names accepted by the manual stage-change endpoint (six values).
// NOTE(review): the /metrics query in this file also counts a 'hydrating'
// stage that is not listed here — confirm whether the omission is intentional.
const VALID_STAGES = [
  'discovered',
  'validated',
  'promoted',
  'sandbox',
  'production',
  'failing',
] as const;
/**
 * POST /api/admin/orchestrator/stores/:id/stage
 * Manually update the stage for a store (use /api/pipeline for proper transitions)
 * Body: { stage: 'discovered' | 'validated' | 'promoted' | 'sandbox' | 'production' | 'failing', reason?: string }
 *
 * The new value is written to the `status` column of the store's most recently
 * updated enabled crawler profile, and a `status_change` row is inserted into
 * `crawler_status_alerts`.
 *
 * Responses:
 * - 200 { success, dispensaryId, profileId, previousStatus, newStatus, message }
 * - 400 when `stage` is missing/not in VALID_STAGES or `:id` is not an integer
 * - 404 when the store has no enabled crawler profile
 * - 500 on any unexpected error
 */
router.post('/stores/:id/stage', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    // Tolerate a missing body so the request fails validation (400) instead of throwing.
    const { stage: status, reason } = req.body ?? {};

    if (!status || !VALID_STAGES.includes(status)) {
      return res.status(400).json({
        error: `Invalid stage. Must be one of: ${VALID_STAGES.join(', ')}`,
      });
    }

    const dispensaryId = parseInt(id, 10);
    // Reject non-numeric ids here; otherwise NaN reaches the database and
    // surfaces as a 500 carrying the driver's error message.
    if (Number.isNaN(dispensaryId)) {
      return res.status(400).json({ error: 'Invalid store id. Must be an integer' });
    }

    // Get current profile and status
    const { rows: profileRows } = await pool.query(`
      SELECT dcp.id, dcp.status as current_status, d.name as dispensary_name
      FROM dispensary_crawler_profiles dcp
      JOIN dispensaries d ON d.id = dcp.dispensary_id
      WHERE dcp.dispensary_id = $1 AND dcp.enabled = true
      ORDER BY dcp.updated_at DESC
      LIMIT 1
    `, [dispensaryId]);

    if (profileRows.length === 0) {
      return res.status(404).json({ error: 'No crawler profile found for this store' });
    }

    const profileId = profileRows[0].id;
    const currentStatus = profileRows[0].current_status;
    const dispensaryName = profileRows[0].dispensary_name;

    // Update the status
    await pool.query(`
      UPDATE dispensary_crawler_profiles
      SET
        status = $1,
        status_reason = $2,
        status_changed_at = CURRENT_TIMESTAMP,
        updated_at = CURRENT_TIMESTAMP
      WHERE id = $3
    `, [status, reason || `Manual status change to ${status}`, profileId]);

    // Create status alert. Only 'failing' is escalated; every other stage
    // accepted by VALID_STAGES is informational.
    const severity = status === 'failing' ? 'error' : 'info';

    await pool.query(`
      INSERT INTO crawler_status_alerts
        (dispensary_id, profile_id, alert_type, severity, message, previous_status, new_status, metadata)
      VALUES ($1, $2, 'status_change', $3, $4, $5, $6, $7)
    `, [
      dispensaryId,
      profileId,
      severity,
      `${dispensaryName}: Status changed from ${currentStatus || 'unknown'} to ${status}`,
      currentStatus,
      status,
      JSON.stringify({ reason, changedBy: 'admin_api' }),
    ]);

    res.json({
      success: true,
      dispensaryId,
      profileId,
      previousStatus: currentStatus,
      newStatus: status,
      message: `Status updated to ${status}`,
    });
  } catch (error: any) {
    console.error('[OrchestratorAdmin] Error updating status:', error.message);
    res.status(500).json({ error: error.message });
  }
});
/**
 * GET /api/admin/orchestrator/alerts
 * Get recent status alerts for the dashboard
 * Query params:
 * - severity: Filter by severity (info, warning, error, critical)
 * - acknowledged: Filter by acknowledged status (true/false)
 * - dispensary_id: Filter by dispensary id (must be an integer, else 400)
 * - limit: Max results (default 50; non-numeric or negative values fall back to 50)
 *
 * Responds with `{ alerts, unacknowledgedCounts }`, where `alerts` is ordered
 * newest first and `unacknowledgedCounts` maps each severity to the number of
 * unacknowledged alerts across all dispensaries (the filters above do not
 * apply to it).
 */
router.get('/alerts', async (req: Request, res: Response) => {
  try {
    const { severity, acknowledged, dispensary_id, limit = '50' } = req.query;

    // Each pushed value is referenced by its 1-based position in `params`.
    const conditions: string[] = [];
    const params: any[] = [];

    if (severity) {
      params.push(severity);
      conditions.push(`csa.severity = $${params.length}`);
    }

    if (acknowledged === 'true') {
      conditions.push('csa.acknowledged = true');
    } else if (acknowledged === 'false') {
      conditions.push('csa.acknowledged = false');
    }

    if (dispensary_id) {
      const dispensaryId = parseInt(dispensary_id as string, 10);
      // A NaN here would otherwise be sent to the database and come back as a 500.
      if (Number.isNaN(dispensaryId)) {
        return res.status(400).json({ error: 'dispensary_id must be an integer' });
      }
      params.push(dispensaryId);
      conditions.push(`csa.dispensary_id = $${params.length}`);
    }

    // LIMIT rejects NaN and negative values, so fall back to the default for those.
    const defaultLimit = 50;
    const requestedLimit = parseInt(limit as string, 10);
    const rowLimit = Number.isSafeInteger(requestedLimit) && requestedLimit >= 0
      ? requestedLimit
      : defaultLimit;
    params.push(rowLimit);
    const limitPlaceholder = `$${params.length}`;

    const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';

    const { rows } = await pool.query(`
      SELECT
        csa.*,
        d.name as dispensary_name,
        d.city,
        d.state
      FROM crawler_status_alerts csa
      LEFT JOIN dispensaries d ON csa.dispensary_id = d.id
      ${whereClause}
      ORDER BY csa.created_at DESC
      LIMIT ${limitPlaceholder}
    `, params);

    // Get unacknowledged count by severity
    const { rows: countRows } = await pool.query(`
      SELECT severity, COUNT(*) as count
      FROM crawler_status_alerts
      WHERE acknowledged = false
      GROUP BY severity
    `);

    const unacknowledgedCounts = countRows.reduce((acc: Record<string, number>, row: any) => {
      acc[row.severity] = parseInt(row.count, 10);
      return acc;
    }, {});

    res.json({
      alerts: rows.map((r: any) => ({
        id: r.id,
        dispensaryId: r.dispensary_id,
        dispensaryName: r.dispensary_name,
        city: r.city,
        state: r.state,
        profileId: r.profile_id,
        alertType: r.alert_type,
        severity: r.severity,
        message: r.message,
        previousStatus: r.previous_status,
        newStatus: r.new_status,
        errorDetails: r.error_details,
        metadata: r.metadata,
        acknowledged: r.acknowledged,
        acknowledgedAt: r.acknowledged_at,
        acknowledgedBy: r.acknowledged_by,
        createdAt: r.created_at,
      })),
      unacknowledgedCounts,
    });
  } catch (error: any) {
    console.error('[OrchestratorAdmin] Error fetching alerts:', error.message);
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/admin/orchestrator/alerts/:id/acknowledge
 * Acknowledge an alert
 * Body: { acknowledgedBy?: string } (defaults to 'admin')
 *
 * Responses:
 * - 200 { success: true, alertId }
 * - 400 when `:id` is not an integer
 * - 404 when no alert row was updated
 * - 500 on any unexpected error
 */
router.post('/alerts/:id/acknowledge', async (req: Request, res: Response) => {
  try {
    const alertId = parseInt(req.params.id, 10);
    if (Number.isNaN(alertId)) {
      return res.status(400).json({ error: 'Invalid alert id. Must be an integer' });
    }

    // Tolerate a missing body; the acknowledger then falls back to 'admin'.
    const { acknowledgedBy = 'admin' } = req.body ?? {};

    const result = await pool.query(`
      UPDATE crawler_status_alerts
      SET acknowledged = true, acknowledged_at = CURRENT_TIMESTAMP, acknowledged_by = $1
      WHERE id = $2
    `, [acknowledgedBy, alertId]);

    // No row matched the id: report it instead of claiming success.
    if (result.rowCount === 0) {
      return res.status(404).json({ error: 'Alert not found' });
    }

    res.json({ success: true, alertId });
  } catch (error: any) {
    console.error('[OrchestratorAdmin] Error acknowledging alert:', error.message);
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/admin/orchestrator/alerts/acknowledge-all
 * Marks every currently unacknowledged alert as acknowledged.
 * Body (all optional): { severity, dispensaryId, acknowledgedBy = 'admin' }
 * - severity / dispensaryId narrow which unacknowledged alerts are touched
 * Responds with { success: true, acknowledgedCount } where the count is the
 * number of rows the UPDATE affected.
 */
router.post('/alerts/acknowledge-all', async (req: Request, res: Response) => {
  try {
    const { severity, dispensaryId, acknowledgedBy = 'admin' } = req.body;

    // $1 is always the acknowledger; optional filters take $2, $3, ... in order.
    const values: any[] = [acknowledgedBy];
    let filterSql = 'WHERE acknowledged = false';

    const optionalFilters: Array<[string, any]> = [
      ['severity', severity],
      ['dispensary_id', dispensaryId],
    ];
    for (const [column, value] of optionalFilters) {
      if (!value) {
        continue;
      }
      values.push(value);
      filterSql += ` AND ${column} = $${values.length}`;
    }

    const { rowCount } = await pool.query(`
      UPDATE crawler_status_alerts
      SET acknowledged = true, acknowledged_at = CURRENT_TIMESTAMP, acknowledged_by = $1
      ${filterSql}
    `, values);

    res.json({ success: true, acknowledgedCount: rowCount });
  } catch (error: any) {
    console.error('[OrchestratorAdmin] Error acknowledging alerts:', error.message);
    res.status(500).json({ error: error.message });
  }
});
/**
 * Pure transition rule for a crawler profile after one crawl attempt.
 *
 * The counters passed in must already include the attempt being recorded.
 * Returns the status the profile should have afterwards; the input status is
 * returned unchanged when no rule applies.
 *
 * On success:
 * - sandbox -> production after 3 consecutive successes
 * - needs_manual / failing -> production after 2 consecutive successes
 * On failure:
 * - production -> needs_manual after 2 consecutive failures
 * - needs_manual -> failing after 5 consecutive failures
 * - sandbox -> needs_manual after 3 consecutive failures
 */
function resolveCrawlOutcomeStatus(
  currentStatus: string | null,
  succeeded: boolean,
  consecutiveSuccesses: number,
  consecutiveFailures: number,
): string | null {
  if (succeeded) {
    if (currentStatus === 'sandbox' && consecutiveSuccesses >= 3) {
      return 'production';
    }
    if ((currentStatus === 'needs_manual' || currentStatus === 'failing') && consecutiveSuccesses >= 2) {
      return 'production';
    }
    return currentStatus;
  }

  if (currentStatus === 'production' && consecutiveFailures >= 2) {
    return 'needs_manual';
  }
  if (currentStatus === 'needs_manual' && consecutiveFailures >= 5) {
    return 'failing';
  }
  // A sandbox profile that keeps failing is flagged for manual attention.
  if (currentStatus === 'sandbox' && consecutiveFailures >= 3) {
    return 'needs_manual';
  }
  return currentStatus;
}

/**
 * POST /api/admin/orchestrator/crawl-outcome
 * Record a crawl outcome and update status based on success/failure
 * This endpoint is called by the crawler after each crawl attempt
 *
 * Body: { dispensaryId, success, productsFound = 0, error?, metadata = {} }
 *
 * Effects:
 * - updates the consecutive success/failure counters (and, when a transition
 *   rule fires, the status) on the store's most recently updated enabled
 *   crawler profile
 * - inserts a `status_change` alert when the status changed, or a
 *   `crawl_error` alert when the crawl failed with an `error` and the status
 *   did not change
 *
 * Responses:
 * - 200 { success, dispensaryId, profileId, statusChanged, previousStatus,
 *         newStatus, consecutiveSuccesses, consecutiveFailures }
 * - 400 when `dispensaryId` is missing
 * - 404 when the store has no enabled crawler profile
 * - 500 on any unexpected error
 */
router.post('/crawl-outcome', async (req: Request, res: Response) => {
  try {
    const {
      dispensaryId,
      success,
      productsFound = 0,
      error,
      metadata: rawMetadata,
    } = req.body ?? {};

    // Destructuring defaults only cover `undefined`; an explicit `null` (or any
    // non-object) would otherwise crash on `metadata.stack` below.
    const metadata = rawMetadata !== null && typeof rawMetadata === 'object' ? rawMetadata : {};

    if (!dispensaryId) {
      return res.status(400).json({ error: 'dispensaryId is required' });
    }

    // Get current profile
    const { rows: profileRows } = await pool.query(`
      SELECT
        dcp.id,
        dcp.status,
        dcp.consecutive_successes,
        dcp.consecutive_failures,
        d.name as dispensary_name
      FROM dispensary_crawler_profiles dcp
      JOIN dispensaries d ON d.id = dcp.dispensary_id
      WHERE dcp.dispensary_id = $1 AND dcp.enabled = true
      ORDER BY dcp.updated_at DESC
      LIMIT 1
    `, [dispensaryId]);

    if (profileRows.length === 0) {
      return res.status(404).json({ error: 'No crawler profile found' });
    }

    const profile = profileRows[0];
    const currentStatus = profile.status;
    const succeeded = Boolean(success);

    // A success resets the failure streak and vice versa.
    const consecutiveSuccesses = succeeded ? (profile.consecutive_successes || 0) + 1 : 0;
    const consecutiveFailures = succeeded ? 0 : (profile.consecutive_failures || 0) + 1;

    const newStatus = resolveCrawlOutcomeStatus(
      currentStatus,
      succeeded,
      consecutiveSuccesses,
      consecutiveFailures,
    );
    const statusChanged = newStatus !== currentStatus;

    let statusReason: string | null = null;
    if (statusChanged) {
      statusReason = succeeded
        ? 'Auto-promoted after consecutive successes'
        : `Auto-demoted after ${consecutiveFailures} consecutive failures`;
    }

    // Update profile; reason and change timestamp are only touched on a transition.
    await pool.query(`
      UPDATE dispensary_crawler_profiles
      SET
        consecutive_successes = $1,
        consecutive_failures = $2,
        status = $3,
        status_reason = CASE WHEN $4 THEN $5 ELSE status_reason END,
        status_changed_at = CASE WHEN $4 THEN CURRENT_TIMESTAMP ELSE status_changed_at END,
        updated_at = CURRENT_TIMESTAMP
      WHERE id = $6
    `, [
      consecutiveSuccesses,
      consecutiveFailures,
      newStatus,
      statusChanged,
      statusReason,
      profile.id,
    ]);

    // Create alert if status changed or error occurred
    if (statusChanged) {
      const severity = newStatus === 'production' ? 'info'
        : newStatus === 'needs_manual' ? 'warning'
        : 'error';

      await pool.query(`
        INSERT INTO crawler_status_alerts
          (dispensary_id, profile_id, alert_type, severity, message, previous_status, new_status, metadata)
        VALUES ($1, $2, 'status_change', $3, $4, $5, $6, $7)
      `, [
        dispensaryId,
        profile.id,
        severity,
        `${profile.dispensary_name}: ${succeeded ? 'Promoted' : 'Demoted'} from ${currentStatus} to ${newStatus}`,
        currentStatus,
        newStatus,
        JSON.stringify({ productsFound, consecutiveSuccesses, consecutiveFailures, ...metadata }),
      ]);
    } else if (!succeeded && error) {
      // Log crawl error as alert
      await pool.query(`
        INSERT INTO crawler_status_alerts
          (dispensary_id, profile_id, alert_type, severity, message, error_details, metadata)
        VALUES ($1, $2, 'crawl_error', $3, $4, $5, $6)
      `, [
        dispensaryId,
        profile.id,
        consecutiveFailures >= 2 ? 'warning' : 'info',
        `${profile.dispensary_name}: Crawl failed - ${error}`,
        JSON.stringify({ error, stack: metadata.stack }),
        JSON.stringify({ consecutiveFailures, ...metadata }),
      ]);
    }

    res.json({
      success: true,
      dispensaryId,
      profileId: profile.id,
      statusChanged,
      previousStatus: currentStatus,
      newStatus,
      consecutiveSuccesses,
      consecutiveFailures,
    });
  } catch (error: any) {
    console.error('[OrchestratorAdmin] Error recording crawl outcome:', error.message);
    res.status(500).json({ error: error.message });
  }
});
export default router;