Files
cannaiq/backend/dist/routes/crawler-sandbox.js
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00

498 lines
18 KiB
JavaScript

"use strict";
/**
* Crawler Sandbox API Routes
*
* Endpoints for managing sandbox crawls, templates, and provider detection
*/
// Module interop helper. Reuses an existing `this.__importDefault` when one is
// present; otherwise ES modules (flagged with `__esModule`) pass through
// untouched and every other value is wrapped as `{ default: value }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const express_1 = __importDefault(require("express"));
const migrate_1 = require("../db/migrate");
const middleware_1 = require("../auth/middleware");
const logger_1 = require("../services/logger");
const crawler_jobs_1 = require("../services/crawler-jobs");
// Router instance that every endpoint in this file is registered on.
const router = express_1.default.Router();
// Apply auth middleware to all routes
router.use(middleware_1.authMiddleware);
// ========================================
// Sandbox Entries
// ========================================
/**
 * GET /api/crawler-sandbox
 * List sandbox entries with optional filters
 *
 * Query parameters:
 *   status        optional; exact match on crawler_sandboxes.status
 *   dispensaryId  optional; integer dispensary id
 *   limit         page size, non-negative integer (default 50)
 *   offset        rows to skip, non-negative integer (default 0)
 *
 * Responds with { sandboxes, total, limit, offset }, where `total` counts the
 * rows matching the filters regardless of pagination.
 * Responds 400 when limit, offset or dispensaryId is not a valid integer, and
 * 500 with the error message when a query fails.
 */
router.get('/', async (req, res) => {
    try {
        const { status, dispensaryId, limit = 50, offset = 0 } = req.query;
        const pageSize = Number(limit);
        const pageOffset = Number(offset);
        const isNonNegativeInt = (n) => Number.isInteger(n) && n >= 0;
        // Reject bad pagination up front instead of letting the database
        // fail on NaN / negative LIMIT and surface it as a 500.
        if (!isNonNegativeInt(pageSize) || !isNonNegativeInt(pageOffset)) {
            return res.status(400).json({ error: 'limit and offset must be non-negative integers' });
        }
        // Build the filter once; the page query and the count query both reuse
        // it so the reported total always matches the conditions of the page.
        let filterSql = '';
        const filterParams = [];
        if (status) {
            filterParams.push(status);
            filterSql += ` AND cs.status = $${filterParams.length}`;
        }
        if (dispensaryId) {
            const dispensaryIdNum = Number(dispensaryId);
            if (!Number.isInteger(dispensaryIdNum)) {
                return res.status(400).json({ error: 'dispensaryId must be an integer' });
            }
            filterParams.push(dispensaryIdNum);
            filterSql += ` AND cs.dispensary_id = $${filterParams.length}`;
        }
        const limitIndex = filterParams.length + 1;
        const pageSql = `
SELECT cs.*, d.name as dispensary_name, d.website, d.menu_provider, d.crawler_status
FROM crawler_sandboxes cs
JOIN dispensaries d ON d.id = cs.dispensary_id
WHERE 1=1
${filterSql} ORDER BY cs.created_at DESC LIMIT $${limitIndex} OFFSET $${limitIndex + 1}`;
        const countSql = `SELECT COUNT(*) FROM crawler_sandboxes cs WHERE 1=1${filterSql}`;
        // The two queries are independent, so issue them together.
        const [pageResult, countResult] = await Promise.all([
            migrate_1.pool.query(pageSql, [...filterParams, pageSize, pageOffset]),
            migrate_1.pool.query(countSql, filterParams),
        ]);
        res.json({
            sandboxes: pageResult.rows,
            total: Number.parseInt(countResult.rows[0].count, 10),
            limit: pageSize,
            offset: pageOffset,
        });
    }
    catch (error) {
        logger_1.logger.error('api', `Get sandboxes error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
/**
 * GET /api/crawler-sandbox/:id
 * Fetch one sandbox entry, joined with columns from its dispensary, plus up
 * to ten of the newest crawl jobs that reference either the sandbox or the
 * same dispensary.
 *
 * Responds with { sandbox, jobs }, 404 when the sandbox id is unknown, and
 * 500 with the error message when a query fails.
 */
router.get('/:id', async (req, res) => {
    try {
        const sandboxId = req.params.id;
        const sandboxLookup = await migrate_1.pool.query(`SELECT cs.*, d.name as dispensary_name, d.website, d.menu_url,
d.menu_provider, d.menu_provider_confidence, d.crawler_mode, d.crawler_status
FROM crawler_sandboxes cs
JOIN dispensaries d ON d.id = cs.dispensary_id
WHERE cs.id = $1`, [sandboxId]);
        const [sandbox] = sandboxLookup.rows;
        if (sandbox === undefined) {
            res.status(404).json({ error: 'Sandbox entry not found' });
            return;
        }
        // Jobs linked to this sandbox directly or through its dispensary.
        const relatedJobs = await migrate_1.pool.query(`SELECT * FROM sandbox_crawl_jobs
WHERE sandbox_id = $1 OR dispensary_id = $2
ORDER BY created_at DESC
LIMIT 10`, [sandboxId, sandbox.dispensary_id]);
        res.json({ sandbox, jobs: relatedJobs.rows });
    }
    catch (err) {
        logger_1.logger.error('api', `Get sandbox error: ${err.message}`);
        res.status(500).json({ error: err.message });
    }
});
/**
 * POST /api/crawler-sandbox/:id/analyze
 * Queue a new 'deep_crawl' job (priority 20) for an existing sandbox entry
 * and set the entry's status back to 'pending'.
 * Guarded by requireRole('admin').
 *
 * Responds with { message, jobId }, 404 when the sandbox id is unknown, and
 * 500 with the error message when a query fails.
 */
router.post('/:id/analyze', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const sandboxId = req.params.id;
        const { rows: matches } = await migrate_1.pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [sandboxId]);
        if (matches.length === 0) {
            res.status(404).json({ error: 'Sandbox entry not found' });
            return;
        }
        const [entry] = matches;
        // Enqueue the re-analysis job for the sandbox's dispensary.
        const inserted = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
VALUES ($1, $2, 'deep_crawl', 'pending', 20)
RETURNING id`, [entry.dispensary_id, sandboxId]);
        // Reflect the queued work on the sandbox row itself.
        await migrate_1.pool.query(`UPDATE crawler_sandboxes SET status = 'pending', updated_at = NOW() WHERE id = $1`, [sandboxId]);
        const jobId = inserted.rows[0].id;
        res.json({ message: 'Analysis job queued', jobId });
    }
    catch (err) {
        logger_1.logger.error('api', `Analyze sandbox error: ${err.message}`);
        res.status(500).json({ error: err.message });
    }
});
/**
 * POST /api/crawler-sandbox/:id/move-to-production
 * Move a sandbox entry to production (for Dutchie dispensaries)
 *
 * Sets the dispensary's crawler_mode to 'production' / crawler_status to
 * 'idle' and marks the sandbox row 'moved_to_production'. Both writes happen
 * in a single transaction so a failure cannot leave the dispensary in
 * production while the sandbox row still shows its old status.
 * Guarded by requireRole('admin').
 *
 * Responds with { message }, 404 when the sandbox id is unknown, 400 when the
 * dispensary's menu_provider is not 'dutchie', and 500 with the error message
 * on failure.
 */
router.post('/:id/move-to-production', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const sandbox = await migrate_1.pool.query(`SELECT cs.*, d.menu_provider
FROM crawler_sandboxes cs
JOIN dispensaries d ON d.id = cs.dispensary_id
WHERE cs.id = $1`, [id]);
        if (sandbox.rows.length === 0) {
            return res.status(404).json({ error: 'Sandbox entry not found' });
        }
        // Can only move to production if provider is dutchie
        if (sandbox.rows[0].menu_provider !== 'dutchie') {
            return res.status(400).json({
                error: 'Only Dutchie dispensaries can be moved to production currently',
            });
        }
        // A transaction needs one dedicated connection, so check a client out
        // of the pool rather than using pool.query (which may pick a different
        // connection per call).
        // NOTE(review): relies on `pool` being a node-postgres Pool exposing
        // connect()/release() — confirm against ../db/migrate.
        const client = await migrate_1.pool.connect();
        try {
            await client.query('BEGIN');
            // Update dispensary to production mode
            await client.query(`UPDATE dispensaries
SET crawler_mode = 'production', crawler_status = 'idle', updated_at = NOW()
WHERE id = $1`, [sandbox.rows[0].dispensary_id]);
            // Mark sandbox as moved
            await client.query(`UPDATE crawler_sandboxes
SET status = 'moved_to_production', updated_at = NOW()
WHERE id = $1`, [id]);
            await client.query('COMMIT');
        }
        catch (txError) {
            try {
                await client.query('ROLLBACK');
            }
            catch (rollbackError) {
                // Keep the original failure as the reported error.
                logger_1.logger.error('api', `Move to production rollback error: ${rollbackError.message}`);
            }
            throw txError;
        }
        finally {
            client.release();
        }
        res.json({ message: 'Dispensary moved to production' });
    }
    catch (error) {
        logger_1.logger.error('api', `Move to production error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
/**
 * PATCH /api/crawler-sandbox/:id
 * Update sandbox entry (e.g., add human review notes)
 *
 * Body fields (all optional, at least one required):
 *   human_review_notes       written when not undefined; also stamps reviewed_at
 *   status                   written only when truthy
 *   suspected_menu_provider  written when not undefined
 * updated_at is always refreshed.
 * Guarded by requireRole('admin').
 *
 * Responds with { message }, 400 when no updatable field was supplied, 404
 * when no sandbox row has the given id, and 500 with the error message when
 * the query fails.
 */
router.patch('/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const { human_review_notes, status, suspected_menu_provider } = req.body;
        const updates = [];
        const params = [];
        // Column names passed here are fixed literals, never request data.
        const setColumn = (column, value) => {
            params.push(value);
            updates.push(`${column} = $${params.length}`);
        };
        if (human_review_notes !== undefined) {
            setColumn('human_review_notes', human_review_notes);
        }
        if (status) {
            setColumn('status', status);
        }
        if (suspected_menu_provider !== undefined) {
            setColumn('suspected_menu_provider', suspected_menu_provider);
        }
        if (updates.length === 0) {
            return res.status(400).json({ error: 'No updates provided' });
        }
        updates.push('updated_at = NOW()');
        if (human_review_notes !== undefined) {
            updates.push('reviewed_at = NOW()');
        }
        params.push(id);
        // RETURNING id tells us whether any row matched, so an unknown id is
        // reported as 404 instead of a misleading success message.
        const result = await migrate_1.pool.query(`UPDATE crawler_sandboxes SET ${updates.join(', ')} WHERE id = $${params.length} RETURNING id`, params);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Sandbox entry not found' });
        }
        res.json({ message: 'Sandbox updated' });
    }
    catch (error) {
        logger_1.logger.error('api', `Update sandbox error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
// ========================================
// Templates
// ========================================
/**
 * GET /api/crawler-sandbox/templates/list
 * Return every crawler template, sorted by provider, then by
 * is_default_for_provider descending, then by name.
 *
 * Responds with { templates }, or 500 with the error message when the query
 * fails.
 */
router.get('/templates/list', async (req, res) => {
    try {
        const listSql = `SELECT * FROM crawler_templates ORDER BY provider, is_default_for_provider DESC, name`;
        const { rows: templates } = await migrate_1.pool.query(listSql);
        res.json({ templates });
    }
    catch (err) {
        logger_1.logger.error('api', `Get templates error: ${err.message}`);
        res.status(500).json({ error: err.message });
    }
});
/**
 * GET /api/crawler-sandbox/templates/:id
 * Look up one crawler template by id.
 *
 * Responds with { template }, 404 when no template has that id, and 500 with
 * the error message when the query fails.
 */
router.get('/templates/:id', async (req, res) => {
    try {
        const templateId = req.params.id;
        const lookup = await migrate_1.pool.query('SELECT * FROM crawler_templates WHERE id = $1', [templateId]);
        const [template] = lookup.rows;
        if (template === undefined) {
            res.status(404).json({ error: 'Template not found' });
            return;
        }
        res.json({ template });
    }
    catch (err) {
        logger_1.logger.error('api', `Get template error: ${err.message}`);
        res.status(500).json({ error: err.message });
    }
});
/**
 * POST /api/crawler-sandbox/templates
 * Insert a new crawler template.
 *
 * Body: provider and name are required; selector_config, navigation_config,
 * transform_config and validation_rules are serialized to JSON (a falsy value
 * is stored as "{}"); notes is passed through as given. created_by is the
 * requesting user's email, falling back to 'system'.
 * Guarded by requireRole('admin').
 *
 * Responds 201 with { template }, 400 when provider or name is missing, and
 * 500 with the error message when the insert fails.
 */
router.post('/templates', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, } = req.body;
        if (!(provider && name)) {
            res.status(400).json({ error: 'provider and name are required' });
            return;
        }
        // Falsy config sections are persisted as an empty JSON object.
        const asJson = (section) => JSON.stringify(section || {});
        const values = [
            provider,
            name,
            asJson(selector_config),
            asJson(navigation_config),
            asJson(transform_config),
            asJson(validation_rules),
            notes,
            req.user?.email || 'system',
        ];
        const inserted = await migrate_1.pool.query(`INSERT INTO crawler_templates
(provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, created_by)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
RETURNING *`, values);
        const [template] = inserted.rows;
        res.status(201).json({ template });
    }
    catch (err) {
        logger_1.logger.error('api', `Create template error: ${err.message}`);
        res.status(500).json({ error: err.message });
    }
});
/**
 * PUT /api/crawler-sandbox/templates/:id
 * Update a template
 *
 * Body fields (all optional, at least one required): is_active,
 * is_default_for_provider, selector_config, navigation_config,
 * transform_config, validation_rules, notes. A field is written only when it
 * is not undefined; the four *_config / validation_rules fields are
 * serialized with JSON.stringify. updated_at is always refreshed.
 * Guarded by requireRole('admin').
 *
 * Responds with { template } holding the updated row, 400 when no updatable
 * field was supplied, 404 when no template has the given id, and 500 with the
 * error message when the query fails.
 */
router.put('/templates/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const { is_active, is_default_for_provider, selector_config, navigation_config, transform_config, validation_rules, notes, } = req.body;
        // Columns whose values are stored as serialized JSON.
        const jsonColumns = new Set(['selector_config', 'navigation_config', 'transform_config', 'validation_rules']);
        // Keys are fixed column names; only the values come from the request.
        const candidates = {
            is_active,
            is_default_for_provider,
            selector_config,
            navigation_config,
            transform_config,
            validation_rules,
            notes,
        };
        const updates = [];
        const params = [];
        for (const [column, value] of Object.entries(candidates)) {
            if (value === undefined) {
                continue;
            }
            params.push(jsonColumns.has(column) ? JSON.stringify(value) : value);
            updates.push(`${column} = $${params.length}`);
        }
        if (updates.length === 0) {
            return res.status(400).json({ error: 'No updates provided' });
        }
        updates.push('updated_at = NOW()');
        params.push(id);
        // RETURNING * yields the updated row directly, which both removes the
        // follow-up SELECT and lets an unknown id be reported as 404 instead of
        // a 200 response with no template in it.
        const result = await migrate_1.pool.query(`UPDATE crawler_templates SET ${updates.join(', ')} WHERE id = $${params.length} RETURNING *`, params);
        if (result.rows.length === 0) {
            return res.status(404).json({ error: 'Template not found' });
        }
        res.json({ template: result.rows[0] });
    }
    catch (error) {
        logger_1.logger.error('api', `Update template error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
// ========================================
// Jobs
// ========================================
/**
 * GET /api/crawler-sandbox/jobs/list
 * Return sandbox crawl jobs, newest first, each with its dispensary's name.
 *
 * Query parameters: status (optional exact match), dispensaryId (optional,
 * converted with Number), limit (converted with Number, default 50).
 *
 * Responds with { jobs }, or 500 with the error message when the query fails.
 */
router.get('/jobs/list', async (req, res) => {
    try {
        const { status, dispensaryId, limit = 50 } = req.query;
        const sqlParts = [`
SELECT sj.*, d.name as dispensary_name
FROM sandbox_crawl_jobs sj
JOIN dispensaries d ON d.id = sj.dispensary_id
WHERE 1=1
`];
        const values = [];
        // Each placeholder number is the position of the value just pushed.
        if (status) {
            values.push(status);
            sqlParts.push(` AND sj.status = $${values.length}`);
        }
        if (dispensaryId) {
            values.push(Number(dispensaryId));
            sqlParts.push(` AND sj.dispensary_id = $${values.length}`);
        }
        values.push(Number(limit));
        sqlParts.push(` ORDER BY sj.created_at DESC LIMIT $${values.length}`);
        const { rows } = await migrate_1.pool.query(sqlParts.join(''), values);
        res.json({ jobs: rows });
    }
    catch (err) {
        logger_1.logger.error('api', `Get jobs error: ${err.message}`);
        res.status(500).json({ error: err.message });
    }
});
/**
 * POST /api/crawler-sandbox/jobs/detect/:dispensaryId
 * Trigger provider detection for a dispensary
 *
 * Inserts a pending 'detection' job (priority 30) and sets the dispensary's
 * crawler_status to 'queued_detection'.
 * Guarded by requireRole('admin').
 *
 * Responds with { message, jobId }, 404 when no dispensary has the given id,
 * and 500 with the error message when a query fails.
 */
router.post('/jobs/detect/:dispensaryId', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { dispensaryId } = req.params;
        // Verify the target exists first so an unknown id yields a 404, like
        // the other routes in this file, rather than a job for a dispensary
        // that is not there (or a database error reported as a 500).
        const dispensary = await migrate_1.pool.query('SELECT id FROM dispensaries WHERE id = $1', [dispensaryId]);
        if (dispensary.rows.length === 0) {
            return res.status(404).json({ error: 'Dispensary not found' });
        }
        // Create detection job
        const job = await migrate_1.pool.query(`INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
VALUES ($1, 'detection', 'pending', 30)
RETURNING id`, [dispensaryId]);
        // Update dispensary status
        await migrate_1.pool.query(`UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1`, [dispensaryId]);
        res.json({
            message: 'Detection job queued',
            jobId: job.rows[0].id,
        });
    }
    catch (error) {
        logger_1.logger.error('api', `Queue detection error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
/**
 * POST /api/crawler-sandbox/jobs/run/:id
 * Immediately run a sandbox job
 *
 * Jobs with job_type 'detection' are run through runDetectMenuProviderJob;
 * every other type goes through runSandboxCrawlJob. The outcome is written
 * back to the job row (status, completed_at, result_summary, error_message).
 * If the runner throws, the job row is still marked 'failed' with the thrown
 * message before the request is answered with a 500.
 * Guarded by requireRole('admin').
 *
 * Responds with the runner's result object, 404 when the job id is unknown,
 * and 500 with the error message on failure.
 */
router.post('/jobs/run/:id', (0, middleware_1.requireRole)('admin'), async (req, res) => {
    try {
        const { id } = req.params;
        const job = await migrate_1.pool.query('SELECT * FROM sandbox_crawl_jobs WHERE id = $1', [id]);
        if (job.rows.length === 0) {
            return res.status(404).json({ error: 'Job not found' });
        }
        const jobData = job.rows[0];
        // Persists the final state of this job run.
        const recordOutcome = (status, summary, errorMessage) => migrate_1.pool.query(`UPDATE sandbox_crawl_jobs
SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
WHERE id = $4`, [status, JSON.stringify(summary), errorMessage, id]);
        // Run the job immediately
        let result;
        try {
            if (jobData.job_type === 'detection') {
                result = await (0, crawler_jobs_1.runDetectMenuProviderJob)(jobData.dispensary_id);
            }
            else {
                result = await (0, crawler_jobs_1.runSandboxCrawlJob)(jobData.dispensary_id, jobData.sandbox_id);
            }
        }
        catch (runError) {
            // Without this the job row would keep its previous status even
            // though the run crashed.
            try {
                await recordOutcome('failed', {}, runError.message);
            }
            catch (recordError) {
                // Keep the runner's failure as the reported error.
                logger_1.logger.error('api', `Run job status update error: ${recordError.message}`);
            }
            throw runError;
        }
        // Update job status
        await recordOutcome(result.success ? 'completed' : 'failed', result.data || {}, result.success ? null : result.message);
        res.json(result);
    }
    catch (error) {
        logger_1.logger.error('api', `Run job error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
// ========================================
// Stats
// ========================================
/**
 * GET /api/crawler-sandbox/stats/overview
 * Get sandbox/crawler statistics
 *
 * Responds with:
 *   providers       dispensary count and average confidence per menu_provider
 *   modes           dispensary count per crawler_mode
 *   statuses        dispensary count per crawler_status
 *   sandbox         sandbox count per status
 *   jobs            sandbox job count per (status, job_type)
 *   recentActivity  the five most recently created sandbox rows
 * or 500 with the error message when any query fails.
 */
router.get('/stats/overview', async (req, res) => {
    try {
        // None of these queries depends on another's result, so they are
        // issued together instead of one after the other.
        const [providerStats, modeStats, statusStats, sandboxStats, jobStats, recentActivity,] = await Promise.all([
            // Dispensary provider stats
            migrate_1.pool.query(`
SELECT
menu_provider,
COUNT(*) as count,
AVG(menu_provider_confidence)::integer as avg_confidence
FROM dispensaries
WHERE menu_provider IS NOT NULL
GROUP BY menu_provider
ORDER BY count DESC
`),
            // Mode stats
            migrate_1.pool.query(`
SELECT
crawler_mode,
COUNT(*) as count
FROM dispensaries
GROUP BY crawler_mode
`),
            // Status stats
            migrate_1.pool.query(`
SELECT
crawler_status,
COUNT(*) as count
FROM dispensaries
GROUP BY crawler_status
ORDER BY count DESC
`),
            // Sandbox stats
            migrate_1.pool.query(`
SELECT
status,
COUNT(*) as count
FROM crawler_sandboxes
GROUP BY status
`),
            // Job stats
            migrate_1.pool.query(`
SELECT
status,
job_type,
COUNT(*) as count
FROM sandbox_crawl_jobs
GROUP BY status, job_type
`),
            // Recent activity
            migrate_1.pool.query(`
SELECT 'sandbox' as type, id, dispensary_id, status, created_at
FROM crawler_sandboxes
ORDER BY created_at DESC
LIMIT 5
`),
        ]);
        res.json({
            providers: providerStats.rows,
            modes: modeStats.rows,
            statuses: statusStats.rows,
            sandbox: sandboxStats.rows,
            jobs: jobStats.rows,
            recentActivity: recentActivity.rows,
        });
    }
    catch (error) {
        logger_1.logger.error('api', `Get stats error: ${error.message}`);
        res.status(500).json({ error: error.message });
    }
});
// Expose the configured router as this module's default export.
exports.default = router;