- Moved hydration module back from _deprecated (needed for product_refresh) - Restored product_refresh handler for processing stored payloads - Restored geolocation service for findadispo/findagram - Stubbed system routes that depend on deprecated SyncOrchestrator - Removed crawler-sandbox route (deprecated) - Fixed all TypeScript compilation errors 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
629 lines
17 KiB
TypeScript
629 lines
17 KiB
TypeScript
/**
|
|
* Crawler Sandbox API Routes
|
|
*
|
|
* Endpoints for managing sandbox crawls, templates, and provider detection
|
|
*/
|
|
|
|
import express from 'express';
|
|
import { pool } from '../db/pool';
|
|
import { authMiddleware, requireRole } from '../auth/middleware';
|
|
import { logger } from '../services/logger';
|
|
import {
|
|
runDetectMenuProviderJob,
|
|
runDutchieMenuCrawlJob,
|
|
runSandboxCrawlJob,
|
|
} from '../services/crawler-jobs';
|
|
|
|
// Express router that carries every crawler-sandbox endpoint declared in this module.
const router = express.Router();

// Every route registered below runs behind the authentication middleware.
router.use(authMiddleware);
|
|
|
|
// ========================================
|
|
// Sandbox Entries
|
|
// ========================================
|
|
|
|
/**
 * GET /api/crawler-sandbox
 *
 * Lists sandbox entries, newest first, each joined with a few columns of its
 * dispensary, together with the total number of entries matching the filters.
 *
 * Query parameters (all optional):
 * - `status`       keep only entries whose status equals this value
 * - `dispensaryId` keep only entries belonging to this dispensary
 * - `limit`        page size (default 50)
 * - `offset`       number of rows to skip (default 0)
 *
 * A `limit` / `offset` that is not a non-negative integer falls back to its
 * default instead of being forwarded to the database as NaN.
 *
 * Responds with `{ sandboxes, total, limit, offset }`, or with status 500 and
 * `{ error }` when a query fails.
 */
router.get('/', async (req, res) => {
  try {
    const { status, dispensaryId, limit = 50, offset = 0 } = req.query;

    // Accept only non-negative integers for pagination; anything else uses the default.
    const toPageNumber = (raw: unknown, fallback: number): number => {
      const parsed = Number(raw);
      return Number.isInteger(parsed) && parsed >= 0 ? parsed : fallback;
    };
    const pageSize = toPageNumber(limit, 50);
    const pageOffset = toPageNumber(offset, 0);

    // Build the filter once so the page query and the count query cannot disagree.
    const conditions: string[] = [];
    const filterParams: any[] = [];

    if (status) {
      filterParams.push(status);
      conditions.push(`cs.status = $${filterParams.length}`);
    }

    if (dispensaryId) {
      filterParams.push(Number(dispensaryId));
      conditions.push(`cs.dispensary_id = $${filterParams.length}`);
    }

    const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';

    const result = await pool.query(
      `SELECT cs.*, d.name as dispensary_name, d.website, d.menu_provider, d.crawler_status
       FROM crawler_sandboxes cs
       JOIN dispensaries d ON d.id = cs.dispensary_id
       ${whereClause}
       ORDER BY cs.created_at DESC
       LIMIT $${filterParams.length + 1} OFFSET $${filterParams.length + 2}`,
      [...filterParams, pageSize, pageOffset]
    );

    // Total for the same filter, ignoring pagination.
    const countResult = await pool.query(
      `SELECT COUNT(*) FROM crawler_sandboxes cs ${whereClause}`,
      filterParams
    );

    res.json({
      sandboxes: result.rows,
      // The row value is parsed with an explicit radix rather than relying on parseInt's default.
      total: parseInt(countResult.rows[0].count, 10),
      limit: pageSize,
      offset: pageOffset,
    });
  } catch (error: any) {
    logger.error('api', `Get sandboxes error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
/**
 * GET /api/crawler-sandbox/:id
 *
 * Returns one sandbox entry (with selected columns of its dispensary) plus the
 * ten most recent sandbox crawl jobs linked to either that sandbox or that
 * dispensary. Responds 404 when no sandbox row matches and 500 when a query
 * fails.
 */
router.get('/:id', async (req, res) => {
  try {
    const sandboxId = req.params.id;

    const sandboxLookup = await pool.query(
      `SELECT cs.*, d.name as dispensary_name, d.website, d.menu_url,
              d.menu_provider, d.menu_provider_confidence, d.crawler_mode, d.crawler_status
       FROM crawler_sandboxes cs
       JOIN dispensaries d ON d.id = cs.dispensary_id
       WHERE cs.id = $1`,
      [sandboxId]
    );

    const sandbox = sandboxLookup.rows[0];
    if (!sandbox) {
      return res.status(404).json({ error: 'Sandbox entry not found' });
    }

    // Jobs are matched on the sandbox itself or on its dispensary.
    const relatedJobs = await pool.query(
      `SELECT * FROM sandbox_crawl_jobs
       WHERE sandbox_id = $1 OR dispensary_id = $2
       ORDER BY created_at DESC
       LIMIT 10`,
      [sandboxId, sandbox.dispensary_id]
    );

    res.json({ sandbox, jobs: relatedJobs.rows });
  } catch (error: any) {
    logger.error('api', `Get sandbox error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
/**
 * POST /api/crawler-sandbox/:id/analyze
 *
 * Queues a re-analysis of a sandbox entry: inserts a pending `deep_crawl` row
 * into `sandbox_crawl_jobs` (priority 20) and resets the sandbox status to
 * `pending`. Registered behind `requireRole('admin')`.
 *
 * Both writes run inside one transaction so a failure cannot leave a queued
 * job without the matching status change (or the reverse).
 *
 * Responds with `{ message, jobId }`, 404 when the sandbox does not exist, or
 * 500 with `{ error }` on failure.
 */
router.post('/:id/analyze', requireRole('admin'), async (req, res) => {
  try {
    const { id } = req.params;

    const sandbox = await pool.query('SELECT * FROM crawler_sandboxes WHERE id = $1', [id]);
    if (sandbox.rows.length === 0) {
      return res.status(404).json({ error: 'Sandbox entry not found' });
    }

    // A dedicated client is required: BEGIN/COMMIT must share one connection.
    const client = await pool.connect();
    let jobId: any;
    try {
      await client.query('BEGIN');

      // Queue a new sandbox job
      const job = await client.query(
        `INSERT INTO sandbox_crawl_jobs (dispensary_id, sandbox_id, job_type, status, priority)
         VALUES ($1, $2, 'deep_crawl', 'pending', 20)
         RETURNING id`,
        [sandbox.rows[0].dispensary_id, id]
      );

      // Update sandbox status
      await client.query(
        `UPDATE crawler_sandboxes SET status = 'pending', updated_at = NOW() WHERE id = $1`,
        [id]
      );

      await client.query('COMMIT');
      jobId = job.rows[0].id;
    } catch (txError) {
      await client.query('ROLLBACK');
      throw txError;
    } finally {
      client.release();
    }

    res.json({
      message: 'Analysis job queued',
      jobId,
    });
  } catch (error: any) {
    logger.error('api', `Analyze sandbox error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
/**
 * POST /api/crawler-sandbox/:id/move-to-production
 *
 * Promotes the dispensary behind a sandbox entry to production crawling: sets
 * the dispensary's `crawler_mode` to `production` and `crawler_status` to
 * `idle`, then marks the sandbox as `moved_to_production`. Registered behind
 * `requireRole('admin')`.
 *
 * Only dispensaries whose `menu_provider` is `dutchie` are accepted; others
 * get a 400. Both updates run inside one transaction so a failure between
 * them cannot leave the dispensary promoted while the sandbox still looks
 * unprocessed.
 *
 * Responds with `{ message }`, 404 when the sandbox does not exist, or 500
 * with `{ error }` on failure.
 */
router.post('/:id/move-to-production', requireRole('admin'), async (req, res) => {
  try {
    const { id } = req.params;

    const sandbox = await pool.query(
      `SELECT cs.*, d.menu_provider
       FROM crawler_sandboxes cs
       JOIN dispensaries d ON d.id = cs.dispensary_id
       WHERE cs.id = $1`,
      [id]
    );

    if (sandbox.rows.length === 0) {
      return res.status(404).json({ error: 'Sandbox entry not found' });
    }

    // Can only move to production if provider is dutchie
    if (sandbox.rows[0].menu_provider !== 'dutchie') {
      return res.status(400).json({
        error: 'Only Dutchie dispensaries can be moved to production currently',
      });
    }

    // A dedicated client is required: BEGIN/COMMIT must share one connection.
    const client = await pool.connect();
    try {
      await client.query('BEGIN');

      // Update dispensary to production mode
      await client.query(
        `UPDATE dispensaries
         SET crawler_mode = 'production', crawler_status = 'idle', updated_at = NOW()
         WHERE id = $1`,
        [sandbox.rows[0].dispensary_id]
      );

      // Mark sandbox as moved
      await client.query(
        `UPDATE crawler_sandboxes
         SET status = 'moved_to_production', updated_at = NOW()
         WHERE id = $1`,
        [id]
      );

      await client.query('COMMIT');
    } catch (txError) {
      await client.query('ROLLBACK');
      throw txError;
    } finally {
      client.release();
    }

    res.json({ message: 'Dispensary moved to production' });
  } catch (error: any) {
    logger.error('api', `Move to production error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
/**
 * PATCH /api/crawler-sandbox/:id
 *
 * Partially updates a sandbox entry. Registered behind `requireRole('admin')`.
 *
 * Body fields (all optional, at least one required):
 * - `human_review_notes`      written when present; also stamps `reviewed_at`
 * - `status`                  written only when truthy
 * - `suspected_menu_provider` written when present
 *
 * `updated_at` is always refreshed. Responds with `{ message }`, 400 when the
 * body carries no updatable field, 404 when no sandbox row has the given id,
 * or 500 with `{ error }` on failure.
 */
router.patch('/:id', requireRole('admin'), async (req, res) => {
  try {
    const { id } = req.params;
    const { human_review_notes, status, suspected_menu_provider } = req.body;

    const updates: string[] = [];
    const params: any[] = [];

    // Registers `column = $n`, deriving the placeholder from the parameter count.
    const assign = (column: string, value: unknown): void => {
      params.push(value);
      updates.push(`${column} = $${params.length}`);
    };

    if (human_review_notes !== undefined) {
      assign('human_review_notes', human_review_notes);
    }

    // Truthiness check kept on purpose: an empty or null status is ignored.
    if (status) {
      assign('status', status);
    }

    if (suspected_menu_provider !== undefined) {
      assign('suspected_menu_provider', suspected_menu_provider);
    }

    if (updates.length === 0) {
      return res.status(400).json({ error: 'No updates provided' });
    }

    updates.push('updated_at = NOW()');
    if (human_review_notes !== undefined) {
      updates.push('reviewed_at = NOW()');
    }

    params.push(id);
    const result = await pool.query(
      `UPDATE crawler_sandboxes SET ${updates.join(', ')} WHERE id = $${params.length}`,
      params
    );

    // Without this check a request for an unknown id would be reported as a success.
    if (result.rowCount === 0) {
      return res.status(404).json({ error: 'Sandbox entry not found' });
    }

    res.json({ message: 'Sandbox updated' });
  } catch (error: any) {
    logger.error('api', `Update sandbox error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
// ========================================
|
|
// Templates
|
|
// ========================================
|
|
|
|
/**
 * GET /api/crawler-sandbox/templates/list
 *
 * Returns every crawler template, ordered by provider, then default-for-provider
 * templates first, then name. Responds with `{ templates }`, or 500 with
 * `{ error }` when the query fails.
 */
router.get('/templates/list', async (_req, res) => {
  try {
    const { rows: templates } = await pool.query(
      `SELECT * FROM crawler_templates ORDER BY provider, is_default_for_provider DESC, name`
    );
    res.json({ templates });
  } catch (error: any) {
    logger.error('api', `Get templates error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
/**
 * GET /api/crawler-sandbox/templates/:id
 *
 * Fetches one crawler template by id. Responds with `{ template }`, 404 when
 * no row matches, or 500 with `{ error }` when the query fails.
 */
router.get('/templates/:id', async (req, res) => {
  try {
    const templateId = req.params.id;
    const lookup = await pool.query('SELECT * FROM crawler_templates WHERE id = $1', [templateId]);

    const template = lookup.rows[0];
    if (!template) {
      return res.status(404).json({ error: 'Template not found' });
    }

    res.json({ template });
  } catch (error: any) {
    logger.error('api', `Get template error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
/**
 * POST /api/crawler-sandbox/templates
 *
 * Creates a crawler template. Registered behind `requireRole('admin')`.
 *
 * Body: `provider` and `name` are required; `selector_config`,
 * `navigation_config`, `transform_config` and `validation_rules` are stored as
 * JSON text and default to `{}` when falsy; `notes` is passed through as-is.
 * `created_by` is the requesting user's email, or `'system'` when none is
 * attached to the request.
 *
 * Responds 201 with `{ template }`, 400 when a required field is missing, or
 * 500 with `{ error }` on failure.
 */
router.post('/templates', requireRole('admin'), async (req, res) => {
  try {
    const body = req.body;
    const { provider, name, notes } = body;

    if (!provider || !name) {
      return res.status(400).json({ error: 'provider and name are required' });
    }

    // Config blobs are serialised before being handed to the driver.
    const serialise = (config: unknown): string => JSON.stringify(config || {});
    const createdBy = (req as any).user?.email || 'system';

    const inserted = await pool.query(
      `INSERT INTO crawler_templates
         (provider, name, selector_config, navigation_config, transform_config, validation_rules, notes, created_by)
       VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
       RETURNING *`,
      [
        provider,
        name,
        serialise(body.selector_config),
        serialise(body.navigation_config),
        serialise(body.transform_config),
        serialise(body.validation_rules),
        notes,
        createdBy,
      ]
    );

    res.status(201).json({ template: inserted.rows[0] });
  } catch (error: any) {
    logger.error('api', `Create template error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
/**
 * PUT /api/crawler-sandbox/templates/:id
 *
 * Partially updates a crawler template. Registered behind
 * `requireRole('admin')`.
 *
 * Any of `is_active`, `is_default_for_provider`, `selector_config`,
 * `navigation_config`, `transform_config`, `validation_rules` and `notes` may
 * be supplied; a field is written whenever it is not `undefined`. The four
 * `*_config` / `validation_rules` fields are stored as JSON text.
 * `updated_at` is always refreshed.
 *
 * Responds with `{ template }` holding the updated row, 400 when no updatable
 * field is supplied, 404 when no template has the given id, or 500 with
 * `{ error }` on failure.
 */
router.put('/templates/:id', requireRole('admin'), async (req, res) => {
  try {
    const { id } = req.params;
    const {
      is_active,
      is_default_for_provider,
      selector_config,
      navigation_config,
      transform_config,
      validation_rules,
      notes,
    } = req.body;

    // Serialises a supplied config; `undefined` stays `undefined` so the field is skipped.
    const asJson = (value: unknown): string | undefined =>
      value === undefined ? undefined : JSON.stringify(value);

    // Column / value pairs in the order they appear in the SET clause.
    const candidates: Array<[string, unknown]> = [
      ['is_active', is_active],
      ['is_default_for_provider', is_default_for_provider],
      ['selector_config', asJson(selector_config)],
      ['navigation_config', asJson(navigation_config)],
      ['transform_config', asJson(transform_config)],
      ['validation_rules', asJson(validation_rules)],
      ['notes', notes],
    ];

    const updates: string[] = [];
    const params: any[] = [];
    for (const [column, value] of candidates) {
      if (value === undefined) {
        continue;
      }
      params.push(value);
      updates.push(`${column} = $${params.length}`);
    }

    if (updates.length === 0) {
      return res.status(400).json({ error: 'No updates provided' });
    }

    updates.push('updated_at = NOW()');
    params.push(id);

    // RETURNING * yields the fresh row in the same statement, which also shows
    // whether any row matched at all.
    const result = await pool.query(
      `UPDATE crawler_templates SET ${updates.join(', ')} WHERE id = $${params.length} RETURNING *`,
      params
    );

    if (result.rows.length === 0) {
      return res.status(404).json({ error: 'Template not found' });
    }

    res.json({ template: result.rows[0] });
  } catch (error: any) {
    logger.error('api', `Update template error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
// ========================================
|
|
// Jobs
|
|
// ========================================
|
|
|
|
/**
 * GET /api/crawler-sandbox/jobs/list
 *
 * Lists sandbox crawl jobs, newest first, each with its dispensary's name.
 *
 * Query parameters (all optional):
 * - `status`       keep only jobs whose status equals this value
 * - `dispensaryId` keep only jobs of this dispensary
 * - `limit`        maximum rows returned (default 50)
 *
 * A `limit` that is not a non-negative integer falls back to the default
 * instead of being forwarded to the database as NaN.
 *
 * Responds with `{ jobs }`, or 500 with `{ error }` when the query fails.
 */
router.get('/jobs/list', async (req, res) => {
  try {
    const { status, dispensaryId, limit = 50 } = req.query;

    const requestedLimit = Number(limit);
    const pageSize =
      Number.isInteger(requestedLimit) && requestedLimit >= 0 ? requestedLimit : 50;

    const conditions: string[] = [];
    const params: any[] = [];

    if (status) {
      params.push(status);
      conditions.push(`sj.status = $${params.length}`);
    }

    if (dispensaryId) {
      params.push(Number(dispensaryId));
      conditions.push(`sj.dispensary_id = $${params.length}`);
    }

    const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';

    // LIMIT is always the last placeholder.
    params.push(pageSize);

    const result = await pool.query(
      `SELECT sj.*, d.name as dispensary_name
       FROM sandbox_crawl_jobs sj
       JOIN dispensaries d ON d.id = sj.dispensary_id
       ${whereClause}
       ORDER BY sj.created_at DESC
       LIMIT $${params.length}`,
      params
    );

    res.json({ jobs: result.rows });
  } catch (error: any) {
    logger.error('api', `Get jobs error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
/**
 * POST /api/crawler-sandbox/jobs/detect/:dispensaryId
 *
 * Queues provider detection for a dispensary: sets the dispensary's
 * `crawler_status` to `queued_detection` and inserts a pending `detection`
 * row into `sandbox_crawl_jobs` (priority 30). Registered behind
 * `requireRole('admin')`.
 *
 * Both writes run inside one transaction. The dispensary is updated first so
 * an unknown id is detected before any job row is created.
 *
 * Responds with `{ message, jobId }`, 404 when no dispensary has the given
 * id, or 500 with `{ error }` on failure.
 */
router.post('/jobs/detect/:dispensaryId', requireRole('admin'), async (req, res) => {
  try {
    const { dispensaryId } = req.params;

    let dispensaryFound = false;
    let jobId: any = null;

    // A dedicated client is required: BEGIN/COMMIT must share one connection.
    const client = await pool.connect();
    try {
      await client.query('BEGIN');

      // Update dispensary status; RETURNING reveals whether the dispensary exists.
      const dispensary = await client.query(
        `UPDATE dispensaries SET crawler_status = 'queued_detection', updated_at = NOW() WHERE id = $1 RETURNING id`,
        [dispensaryId]
      );

      if (dispensary.rows.length > 0) {
        dispensaryFound = true;

        // Create detection job
        const job = await client.query(
          `INSERT INTO sandbox_crawl_jobs (dispensary_id, job_type, status, priority)
           VALUES ($1, 'detection', 'pending', 30)
           RETURNING id`,
          [dispensaryId]
        );
        jobId = job.rows[0].id;
      }

      await client.query('COMMIT');
    } catch (txError) {
      await client.query('ROLLBACK');
      throw txError;
    } finally {
      client.release();
    }

    if (!dispensaryFound) {
      return res.status(404).json({ error: 'Dispensary not found' });
    }

    res.json({
      message: 'Detection job queued',
      jobId,
    });
  } catch (error: any) {
    logger.error('api', `Queue detection error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
/**
 * POST /api/crawler-sandbox/jobs/run/:id
 *
 * Runs a stored sandbox job within the request and waits for it to finish.
 * Registered behind `requireRole('admin')`.
 *
 * Jobs of type `detection` go to `runDetectMenuProviderJob`; every other type
 * goes to `runSandboxCrawlJob`. The outcome is written back to the job row:
 * `completed` / `failed` from `result.success`, `result.data` as
 * `result_summary`, and `result.message` as `error_message` on failure.
 *
 * If the runner throws, the job row is marked `failed` with the thrown
 * message before the request is answered with a 500, so the row does not keep
 * its previous status.
 *
 * Responds with the runner's result object, 404 when the job does not exist,
 * or 500 with `{ error }` on failure.
 */
router.post('/jobs/run/:id', requireRole('admin'), async (req, res) => {
  try {
    const { id } = req.params;

    const job = await pool.query('SELECT * FROM sandbox_crawl_jobs WHERE id = $1', [id]);
    if (job.rows.length === 0) {
      return res.status(404).json({ error: 'Job not found' });
    }

    const jobData = job.rows[0];

    // Run the job immediately
    let result;
    try {
      if (jobData.job_type === 'detection') {
        result = await runDetectMenuProviderJob(jobData.dispensary_id);
      } else {
        result = await runSandboxCrawlJob(jobData.dispensary_id, jobData.sandbox_id);
      }
    } catch (runError: any) {
      // Record the crash on the job row; a bookkeeping failure must not hide
      // the original error, so it is only logged.
      try {
        await pool.query(
          `UPDATE sandbox_crawl_jobs
           SET status = 'failed', completed_at = NOW(), error_message = $1
           WHERE id = $2`,
          [runError.message, id]
        );
      } catch (markError: any) {
        logger.error('api', `Run job error: could not mark job ${id} as failed: ${markError.message}`);
      }
      throw runError;
    }

    // Update job status
    await pool.query(
      `UPDATE sandbox_crawl_jobs
       SET status = $1, completed_at = NOW(), result_summary = $2, error_message = $3
       WHERE id = $4`,
      [
        result.success ? 'completed' : 'failed',
        JSON.stringify(result.data || {}),
        result.success ? null : result.message,
        id,
      ]
    );

    res.json(result);
  } catch (error: any) {
    logger.error('api', `Run job error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
// ========================================
|
|
// Stats
|
|
// ========================================
|
|
|
|
/**
 * GET /api/crawler-sandbox/stats/overview
 *
 * Aggregated crawler statistics: dispensary counts per menu provider (with
 * average confidence), per crawler mode and per crawler status; sandbox counts
 * per status; job counts per status and type; and the five most recently
 * created sandbox entries.
 *
 * Responds with `{ providers, modes, statuses, sandbox, jobs, recentActivity }`,
 * or 500 with `{ error }` when a query fails.
 */
router.get('/stats/overview', async (_req, res) => {
  try {
    // Runs one parameterless query and hands back just its rows.
    const fetchRows = async (sql: string) => (await pool.query(sql)).rows;

    // The queries are issued one after another, in this order.
    const providers = await fetchRows(`
      SELECT
        menu_provider,
        COUNT(*) as count,
        AVG(menu_provider_confidence)::integer as avg_confidence
      FROM dispensaries
      WHERE menu_provider IS NOT NULL
      GROUP BY menu_provider
      ORDER BY count DESC
    `);

    const modes = await fetchRows(`
      SELECT
        crawler_mode,
        COUNT(*) as count
      FROM dispensaries
      GROUP BY crawler_mode
    `);

    const statuses = await fetchRows(`
      SELECT
        crawler_status,
        COUNT(*) as count
      FROM dispensaries
      GROUP BY crawler_status
      ORDER BY count DESC
    `);

    const sandbox = await fetchRows(`
      SELECT
        status,
        COUNT(*) as count
      FROM crawler_sandboxes
      GROUP BY status
    `);

    const jobs = await fetchRows(`
      SELECT
        status,
        job_type,
        COUNT(*) as count
      FROM sandbox_crawl_jobs
      GROUP BY status, job_type
    `);

    const recentActivity = await fetchRows(`
      SELECT 'sandbox' as type, id, dispensary_id, status, created_at
      FROM crawler_sandboxes
      ORDER BY created_at DESC
      LIMIT 5
    `);

    res.json({ providers, modes, statuses, sandbox, jobs, recentActivity });
  } catch (error: any) {
    logger.error('api', `Get stats error: ${error.message}`);
    res.status(500).json({ error: error.message });
  }
});
|
|
|
|
export default router;
|