feat: Stealth worker system with mandatory proxy rotation
## Worker System - Role-agnostic workers that can handle any task type - Pod-based architecture with StatefulSet (5-15 pods, 5 workers each) - Custom pod names (Aethelgard, Xylos, Kryll, etc.) - Worker registry with friendly names and resource monitoring - Hub-and-spoke visualization on JobQueue page ## Stealth & Anti-Detection (REQUIRED) - Proxies are MANDATORY - workers fail to start without active proxies - CrawlRotator initializes on worker startup - Loads proxies from `proxies` table - Auto-rotates proxy + fingerprint on 403 errors - 12 browser fingerprints (Chrome, Firefox, Safari, Edge) - Locale/timezone matching for geographic consistency ## Task System - Renamed product_resync → product_refresh - Task chaining: store_discovery → entry_point → product_discovery - Priority-based claiming with FOR UPDATE SKIP LOCKED - Heartbeat and stale task recovery ## UI Updates - JobQueue: Pod visualization, resource monitoring on hover - WorkersDashboard: Simplified worker list - Removed unused filters from task list ## Other - IP2Location service for visitor analytics - Findagram consumer features scaffolding - Documentation updates 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -191,6 +191,23 @@ export async function runFullDiscovery(
|
||||
}
|
||||
}
|
||||
|
||||
// Step 5: Detect dropped stores (in DB but not in discovery results)
|
||||
if (!dryRun) {
|
||||
console.log('\n[Discovery] Step 5: Detecting dropped stores...');
|
||||
const droppedResult = await detectDroppedStores(pool, stateCode);
|
||||
if (droppedResult.droppedCount > 0) {
|
||||
console.log(`[Discovery] Found ${droppedResult.droppedCount} dropped stores:`);
|
||||
droppedResult.droppedStores.slice(0, 10).forEach(s => {
|
||||
console.log(` - ${s.name} (${s.city}, ${s.state}) - last seen: ${s.lastSeenAt}`);
|
||||
});
|
||||
if (droppedResult.droppedCount > 10) {
|
||||
console.log(` ... and ${droppedResult.droppedCount - 10} more`);
|
||||
}
|
||||
} else {
|
||||
console.log(`[Discovery] No dropped stores detected`);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
cities: cityResult,
|
||||
locations: locationResults,
|
||||
@@ -200,6 +217,107 @@ export async function runFullDiscovery(
|
||||
};
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// DROPPED STORE DETECTION
|
||||
// ============================================================
|
||||
|
||||
export interface DroppedStoreResult {
|
||||
droppedCount: number;
|
||||
droppedStores: Array<{
|
||||
id: number;
|
||||
name: string;
|
||||
city: string;
|
||||
state: string;
|
||||
platformDispensaryId: string;
|
||||
lastSeenAt: string;
|
||||
}>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect stores that exist in dispensaries but were not found in discovery.
|
||||
* Marks them as status='dropped' for manual review.
|
||||
*
|
||||
* A store is considered "dropped" if:
|
||||
* 1. It has a platform_dispensary_id (was verified via Dutchie)
|
||||
* 2. It was NOT seen in the latest discovery crawl (last_seen_at in discovery < 24h ago)
|
||||
* 3. It's currently marked as 'open' status
|
||||
*/
|
||||
export async function detectDroppedStores(
|
||||
pool: Pool,
|
||||
stateCode?: string
|
||||
): Promise<DroppedStoreResult> {
|
||||
// Find dispensaries that:
|
||||
// 1. Have platform_dispensary_id (verified Dutchie stores)
|
||||
// 2. Are currently 'open' status
|
||||
// 3. Have a linked discovery record that wasn't seen in the last discovery run
|
||||
// (last_seen_at in dutchie_discovery_locations is older than 24 hours)
|
||||
const params: any[] = [];
|
||||
let stateFilter = '';
|
||||
|
||||
if (stateCode) {
|
||||
stateFilter = ` AND d.state = $1`;
|
||||
params.push(stateCode);
|
||||
}
|
||||
|
||||
const query = `
|
||||
WITH recently_seen AS (
|
||||
SELECT DISTINCT platform_location_id
|
||||
FROM dutchie_discovery_locations
|
||||
WHERE last_seen_at > NOW() - INTERVAL '24 hours'
|
||||
AND active = true
|
||||
)
|
||||
SELECT
|
||||
d.id,
|
||||
d.name,
|
||||
d.city,
|
||||
d.state,
|
||||
d.platform_dispensary_id,
|
||||
d.updated_at as last_seen_at
|
||||
FROM dispensaries d
|
||||
WHERE d.platform_dispensary_id IS NOT NULL
|
||||
AND d.platform = 'dutchie'
|
||||
AND (d.status = 'open' OR d.status IS NULL)
|
||||
AND d.crawl_enabled = true
|
||||
AND d.platform_dispensary_id NOT IN (SELECT platform_location_id FROM recently_seen)
|
||||
${stateFilter}
|
||||
ORDER BY d.name
|
||||
`;
|
||||
|
||||
const result = await pool.query(query, params);
|
||||
const droppedStores = result.rows;
|
||||
|
||||
// Mark these stores as 'dropped' status
|
||||
if (droppedStores.length > 0) {
|
||||
const ids = droppedStores.map(s => s.id);
|
||||
await pool.query(`
|
||||
UPDATE dispensaries
|
||||
SET status = 'dropped', updated_at = NOW()
|
||||
WHERE id = ANY($1::int[])
|
||||
`, [ids]);
|
||||
|
||||
// Log to promotion log for audit
|
||||
for (const store of droppedStores) {
|
||||
await pool.query(`
|
||||
INSERT INTO dutchie_promotion_log
|
||||
(dispensary_id, action, state_code, store_name, triggered_by)
|
||||
VALUES ($1, 'dropped', $2, $3, 'discovery_detection')
|
||||
`, [store.id, store.state, store.name]);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
droppedCount: droppedStores.length,
|
||||
droppedStores: droppedStores.map(s => ({
|
||||
id: s.id,
|
||||
name: s.name,
|
||||
city: s.city,
|
||||
state: s.state,
|
||||
platformDispensaryId: s.platform_dispensary_id,
|
||||
lastSeenAt: s.last_seen_at,
|
||||
})),
|
||||
};
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// SINGLE CITY DISCOVERY
|
||||
// ============================================================
|
||||
|
||||
@@ -140,6 +140,7 @@ import clickAnalyticsRoutes from './routes/click-analytics';
|
||||
import seoRoutes from './routes/seo';
|
||||
import priceAnalyticsRoutes from './routes/price-analytics';
|
||||
import tasksRoutes from './routes/tasks';
|
||||
import workerRegistryRoutes from './routes/worker-registry';
|
||||
|
||||
// Mark requests from trusted domains (cannaiq.co, findagram.co, findadispo.com)
|
||||
// These domains can access the API without authentication
|
||||
@@ -216,6 +217,10 @@ console.log('[Workers] Routes registered at /api/workers, /api/monitor, and /api
|
||||
app.use('/api/tasks', tasksRoutes);
|
||||
console.log('[Tasks] Routes registered at /api/tasks');
|
||||
|
||||
// Worker registry - dynamic worker registration, heartbeats, and name management
|
||||
app.use('/api/worker-registry', workerRegistryRoutes);
|
||||
console.log('[WorkerRegistry] Routes registered at /api/worker-registry');
|
||||
|
||||
// Phase 3: Analytics V2 - Enhanced analytics with rec/med state segmentation
|
||||
try {
|
||||
const analyticsV2Router = createAnalyticsV2Router(getPool());
|
||||
|
||||
@@ -5,33 +5,37 @@ import { pool } from '../db/pool';
|
||||
const router = Router();
|
||||
router.use(authMiddleware);
|
||||
|
||||
// Get categories (flat list)
|
||||
// Get categories (flat list) - derived from actual product data
|
||||
router.get('/', async (req, res) => {
|
||||
try {
|
||||
const { store_id } = req.query;
|
||||
|
||||
const { store_id, in_stock_only } = req.query;
|
||||
|
||||
let query = `
|
||||
SELECT
|
||||
c.*,
|
||||
COUNT(DISTINCT p.id) as product_count,
|
||||
pc.name as parent_name
|
||||
FROM categories c
|
||||
LEFT JOIN store_products p ON c.name = p.category_raw
|
||||
LEFT JOIN categories pc ON c.parent_id = pc.id
|
||||
category_raw as name,
|
||||
category_raw as slug,
|
||||
COUNT(*) as product_count,
|
||||
COUNT(*) FILTER (WHERE is_in_stock = true) as in_stock_count
|
||||
FROM store_products
|
||||
WHERE category_raw IS NOT NULL
|
||||
`;
|
||||
|
||||
|
||||
const params: any[] = [];
|
||||
|
||||
|
||||
if (store_id) {
|
||||
query += ' WHERE c.store_id = $1';
|
||||
params.push(store_id);
|
||||
query += ` AND dispensary_id = $${params.length}`;
|
||||
}
|
||||
|
||||
|
||||
if (in_stock_only === 'true') {
|
||||
query += ` AND is_in_stock = true`;
|
||||
}
|
||||
|
||||
query += `
|
||||
GROUP BY c.id, pc.name
|
||||
ORDER BY c.display_order, c.name
|
||||
GROUP BY category_raw
|
||||
ORDER BY category_raw
|
||||
`;
|
||||
|
||||
|
||||
const result = await pool.query(query, params);
|
||||
res.json({ categories: result.rows });
|
||||
} catch (error) {
|
||||
@@ -40,50 +44,86 @@ router.get('/', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
// Get category tree (hierarchical)
|
||||
// Get category tree (hierarchical) - category -> subcategory structure from product data
|
||||
router.get('/tree', async (req, res) => {
|
||||
try {
|
||||
const { store_id } = req.query;
|
||||
|
||||
if (!store_id) {
|
||||
return res.status(400).json({ error: 'store_id is required' });
|
||||
}
|
||||
|
||||
// Get all categories for the store
|
||||
const result = await pool.query(`
|
||||
SELECT
|
||||
c.*,
|
||||
COUNT(DISTINCT p.id) as product_count
|
||||
FROM categories c
|
||||
LEFT JOIN store_products p ON c.name = p.category_raw AND p.is_in_stock = true AND p.dispensary_id = $1
|
||||
WHERE c.store_id = $1
|
||||
GROUP BY c.id
|
||||
ORDER BY c.display_order, c.name
|
||||
`, [store_id]);
|
||||
|
||||
// Build tree structure
|
||||
const categories = result.rows;
|
||||
const categoryMap = new Map();
|
||||
const tree: any[] = [];
|
||||
|
||||
// First pass: create map
|
||||
categories.forEach((cat: { id: number; parent_id?: number }) => {
|
||||
categoryMap.set(cat.id, { ...cat, children: [] });
|
||||
});
|
||||
const { store_id, in_stock_only } = req.query;
|
||||
|
||||
// Second pass: build tree
|
||||
categories.forEach((cat: { id: number; parent_id?: number }) => {
|
||||
const node = categoryMap.get(cat.id);
|
||||
if (cat.parent_id) {
|
||||
const parent = categoryMap.get(cat.parent_id);
|
||||
if (parent) {
|
||||
parent.children.push(node);
|
||||
}
|
||||
} else {
|
||||
tree.push(node);
|
||||
// Get category + subcategory combinations with counts
|
||||
let query = `
|
||||
SELECT
|
||||
category_raw as category,
|
||||
subcategory_raw as subcategory,
|
||||
COUNT(*) as product_count,
|
||||
COUNT(*) FILTER (WHERE is_in_stock = true) as in_stock_count
|
||||
FROM store_products
|
||||
WHERE category_raw IS NOT NULL
|
||||
`;
|
||||
|
||||
const params: any[] = [];
|
||||
|
||||
if (store_id) {
|
||||
params.push(store_id);
|
||||
query += ` AND dispensary_id = $${params.length}`;
|
||||
}
|
||||
|
||||
if (in_stock_only === 'true') {
|
||||
query += ` AND is_in_stock = true`;
|
||||
}
|
||||
|
||||
query += `
|
||||
GROUP BY category_raw, subcategory_raw
|
||||
ORDER BY category_raw, subcategory_raw
|
||||
`;
|
||||
|
||||
const result = await pool.query(query, params);
|
||||
|
||||
// Build tree structure: category -> subcategories
|
||||
const categoryMap = new Map<string, {
|
||||
name: string;
|
||||
slug: string;
|
||||
product_count: number;
|
||||
in_stock_count: number;
|
||||
subcategories: Array<{
|
||||
name: string;
|
||||
slug: string;
|
||||
product_count: number;
|
||||
in_stock_count: number;
|
||||
}>;
|
||||
}>();
|
||||
|
||||
for (const row of result.rows) {
|
||||
const category = row.category;
|
||||
const subcategory = row.subcategory;
|
||||
const count = parseInt(row.product_count);
|
||||
const inStockCount = parseInt(row.in_stock_count);
|
||||
|
||||
if (!categoryMap.has(category)) {
|
||||
categoryMap.set(category, {
|
||||
name: category,
|
||||
slug: category.toLowerCase().replace(/\s+/g, '-'),
|
||||
product_count: 0,
|
||||
in_stock_count: 0,
|
||||
subcategories: []
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
const cat = categoryMap.get(category)!;
|
||||
cat.product_count += count;
|
||||
cat.in_stock_count += inStockCount;
|
||||
|
||||
if (subcategory) {
|
||||
cat.subcategories.push({
|
||||
name: subcategory,
|
||||
slug: subcategory.toLowerCase().replace(/\s+/g, '-'),
|
||||
product_count: count,
|
||||
in_stock_count: inStockCount
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
const tree = Array.from(categoryMap.values());
|
||||
|
||||
res.json({ tree });
|
||||
} catch (error) {
|
||||
console.error('Error fetching category tree:', error);
|
||||
@@ -91,4 +131,91 @@ router.get('/tree', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
// Get all unique subcategories for a category
|
||||
router.get('/:category/subcategories', async (req, res) => {
|
||||
try {
|
||||
const { category } = req.params;
|
||||
const { store_id, in_stock_only } = req.query;
|
||||
|
||||
let query = `
|
||||
SELECT
|
||||
subcategory_raw as name,
|
||||
subcategory_raw as slug,
|
||||
COUNT(*) as product_count,
|
||||
COUNT(*) FILTER (WHERE is_in_stock = true) as in_stock_count
|
||||
FROM store_products
|
||||
WHERE category_raw = $1
|
||||
AND subcategory_raw IS NOT NULL
|
||||
`;
|
||||
|
||||
const params: any[] = [category];
|
||||
|
||||
if (store_id) {
|
||||
params.push(store_id);
|
||||
query += ` AND dispensary_id = $${params.length}`;
|
||||
}
|
||||
|
||||
if (in_stock_only === 'true') {
|
||||
query += ` AND is_in_stock = true`;
|
||||
}
|
||||
|
||||
query += `
|
||||
GROUP BY subcategory_raw
|
||||
ORDER BY subcategory_raw
|
||||
`;
|
||||
|
||||
const result = await pool.query(query, params);
|
||||
res.json({
|
||||
category,
|
||||
subcategories: result.rows
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Error fetching subcategories:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch subcategories' });
|
||||
}
|
||||
});
|
||||
|
||||
// Get global category summary (across all stores)
|
||||
router.get('/summary', async (req, res) => {
|
||||
try {
|
||||
const { state } = req.query;
|
||||
|
||||
let query = `
|
||||
SELECT
|
||||
sp.category_raw as category,
|
||||
COUNT(DISTINCT sp.id) as product_count,
|
||||
COUNT(DISTINCT sp.dispensary_id) as store_count,
|
||||
COUNT(*) FILTER (WHERE sp.is_in_stock = true) as in_stock_count
|
||||
FROM store_products sp
|
||||
`;
|
||||
|
||||
const params: any[] = [];
|
||||
|
||||
if (state) {
|
||||
query += `
|
||||
JOIN dispensaries d ON sp.dispensary_id = d.id
|
||||
WHERE sp.category_raw IS NOT NULL
|
||||
AND d.state = $1
|
||||
`;
|
||||
params.push(state);
|
||||
} else {
|
||||
query += ` WHERE sp.category_raw IS NOT NULL`;
|
||||
}
|
||||
|
||||
query += `
|
||||
GROUP BY sp.category_raw
|
||||
ORDER BY product_count DESC
|
||||
`;
|
||||
|
||||
const result = await pool.query(query, params);
|
||||
res.json({
|
||||
categories: result.rows,
|
||||
total_categories: result.rows.length
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Error fetching category summary:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch category summary' });
|
||||
}
|
||||
});
|
||||
|
||||
export default router;
|
||||
|
||||
@@ -11,7 +11,7 @@ const VALID_MENU_TYPES = ['dutchie', 'treez', 'jane', 'weedmaps', 'leafly', 'mea
|
||||
// Get all dispensaries (with pagination)
|
||||
router.get('/', async (req, res) => {
|
||||
try {
|
||||
const { menu_type, city, state, crawl_enabled, dutchie_verified, limit, offset, search } = req.query;
|
||||
const { menu_type, city, state, crawl_enabled, dutchie_verified, status, limit, offset, search } = req.query;
|
||||
const pageLimit = Math.min(parseInt(limit as string) || 50, 500);
|
||||
const pageOffset = parseInt(offset as string) || 0;
|
||||
|
||||
@@ -100,6 +100,12 @@ router.get('/', async (req, res) => {
|
||||
}
|
||||
}
|
||||
|
||||
// Filter by status (e.g., 'dropped', 'open', 'closed')
|
||||
if (status) {
|
||||
conditions.push(`status = $${params.length + 1}`);
|
||||
params.push(status);
|
||||
}
|
||||
|
||||
// Search filter (name, dba_name, city, company_name)
|
||||
if (search) {
|
||||
conditions.push(`(name ILIKE $${params.length + 1} OR dba_name ILIKE $${params.length + 1} OR city ILIKE $${params.length + 1})`);
|
||||
@@ -161,6 +167,7 @@ router.get('/stats/crawl-status', async (req, res) => {
|
||||
COUNT(*) FILTER (WHERE crawl_enabled = false OR crawl_enabled IS NULL) as disabled_count,
|
||||
COUNT(*) FILTER (WHERE dutchie_verified = true) as verified_count,
|
||||
COUNT(*) FILTER (WHERE dutchie_verified = false OR dutchie_verified IS NULL) as unverified_count,
|
||||
COUNT(*) FILTER (WHERE status = 'dropped') as dropped_count,
|
||||
COUNT(*) as total_count
|
||||
FROM dispensaries
|
||||
`;
|
||||
@@ -190,6 +197,34 @@ router.get('/stats/crawl-status', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
// Get dropped stores count (for dashboard alert)
|
||||
router.get('/stats/dropped', async (req, res) => {
|
||||
try {
|
||||
const result = await pool.query(`
|
||||
SELECT
|
||||
COUNT(*) as dropped_count,
|
||||
json_agg(json_build_object(
|
||||
'id', id,
|
||||
'name', name,
|
||||
'city', city,
|
||||
'state', state,
|
||||
'dropped_at', updated_at
|
||||
) ORDER BY updated_at DESC) FILTER (WHERE status = 'dropped') as dropped_stores
|
||||
FROM dispensaries
|
||||
WHERE status = 'dropped'
|
||||
`);
|
||||
|
||||
const row = result.rows[0];
|
||||
res.json({
|
||||
dropped_count: parseInt(row.dropped_count) || 0,
|
||||
dropped_stores: row.dropped_stores || []
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Error fetching dropped stores:', error);
|
||||
res.status(500).json({ error: 'Failed to fetch dropped stores' });
|
||||
}
|
||||
});
|
||||
|
||||
// Get single dispensary by slug or ID
|
||||
router.get('/:slugOrId', async (req, res) => {
|
||||
try {
|
||||
|
||||
@@ -22,11 +22,17 @@ interface ProductClickEventPayload {
|
||||
store_id?: string;
|
||||
brand_id?: string;
|
||||
campaign_id?: string;
|
||||
dispensary_name?: string;
|
||||
action: 'view' | 'open_store' | 'open_product' | 'compare' | 'other';
|
||||
source: string;
|
||||
page_type?: string; // Page where event occurred (e.g., StoreDetailPage, BrandsIntelligence)
|
||||
url_path?: string; // URL path for debugging
|
||||
occurred_at?: string;
|
||||
// Visitor location (from frontend IP geolocation)
|
||||
visitor_city?: string;
|
||||
visitor_state?: string;
|
||||
visitor_lat?: number;
|
||||
visitor_lng?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -77,13 +83,14 @@ router.post('/product-click', optionalAuthMiddleware, async (req: Request, res:
|
||||
// Insert the event with enhanced fields
|
||||
await pool.query(
|
||||
`INSERT INTO product_click_events
|
||||
(product_id, store_id, brand_id, campaign_id, action, source, user_id, ip_address, user_agent, occurred_at, event_type, page_type, url_path, device_type)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)`,
|
||||
(product_id, store_id, brand_id, campaign_id, dispensary_name, action, source, user_id, ip_address, user_agent, occurred_at, event_type, page_type, url_path, device_type, visitor_city, visitor_state, visitor_lat, visitor_lng)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)`,
|
||||
[
|
||||
payload.product_id,
|
||||
payload.store_id || null,
|
||||
payload.brand_id || null,
|
||||
payload.campaign_id || null,
|
||||
payload.dispensary_name || null,
|
||||
payload.action,
|
||||
payload.source,
|
||||
userId,
|
||||
@@ -93,7 +100,11 @@ router.post('/product-click', optionalAuthMiddleware, async (req: Request, res:
|
||||
'product_click', // event_type
|
||||
payload.page_type || null,
|
||||
payload.url_path || null,
|
||||
deviceType
|
||||
deviceType,
|
||||
payload.visitor_city || null,
|
||||
payload.visitor_state || null,
|
||||
payload.visitor_lat || null,
|
||||
payload.visitor_lng || null
|
||||
]
|
||||
);
|
||||
|
||||
|
||||
@@ -1,11 +1,29 @@
|
||||
import { Router } from 'express';
|
||||
import { authMiddleware } from '../auth/middleware';
|
||||
import { pool } from '../db/pool';
|
||||
import { getImageUrl } from '../utils/minio';
|
||||
|
||||
const router = Router();
|
||||
router.use(authMiddleware);
|
||||
|
||||
/**
|
||||
* Convert local image path to proxy URL
|
||||
* /images/products/... -> /img/products/...
|
||||
*/
|
||||
function getImageUrl(localPath: string): string {
|
||||
if (!localPath) return '';
|
||||
// If already a full URL, return as-is
|
||||
if (localPath.startsWith('http')) return localPath;
|
||||
// Convert /images/ path to /img/ proxy path
|
||||
if (localPath.startsWith('/images/')) {
|
||||
return '/img' + localPath.substring(7);
|
||||
}
|
||||
// Handle paths without leading slash
|
||||
if (localPath.startsWith('images/')) {
|
||||
return '/img/' + localPath.substring(7);
|
||||
}
|
||||
return '/img/' + localPath;
|
||||
}
|
||||
|
||||
// Freshness threshold: data older than this is considered stale
|
||||
const STALE_THRESHOLD_HOURS = 4;
|
||||
|
||||
|
||||
@@ -463,7 +463,7 @@ router.get('/products', async (req: PublicApiRequest, res: Response) => {
|
||||
|
||||
// Filter by on special
|
||||
if (on_special === 'true' || on_special === '1') {
|
||||
whereClause += ` AND s.is_on_special = TRUE`;
|
||||
whereClause += ` AND s.special = TRUE`;
|
||||
}
|
||||
|
||||
// Search by name or brand
|
||||
@@ -547,7 +547,7 @@ router.get('/products', async (req: PublicApiRequest, res: Response) => {
|
||||
const { rows: countRows } = await pool.query(`
|
||||
SELECT COUNT(*) as total FROM store_products p
|
||||
LEFT JOIN LATERAL (
|
||||
SELECT rec_min_price_cents / 100.0 as price_rec, med_min_price_cents / 100.0 as price_med, special as is_on_special FROM v_product_snapshots
|
||||
SELECT rec_min_price_cents / 100.0 as price_rec, med_min_price_cents / 100.0 as price_med, special FROM v_product_snapshots
|
||||
WHERE store_product_id = p.id
|
||||
ORDER BY crawled_at DESC
|
||||
LIMIT 1
|
||||
@@ -1125,6 +1125,7 @@ router.get('/dispensaries', async (req: PublicApiRequest, res: Response) => {
|
||||
SELECT
|
||||
d.id,
|
||||
d.name,
|
||||
d.slug,
|
||||
d.address1,
|
||||
d.address2,
|
||||
d.city,
|
||||
@@ -1179,6 +1180,7 @@ router.get('/dispensaries', async (req: PublicApiRequest, res: Response) => {
|
||||
const transformedDispensaries = dispensaries.map((d) => ({
|
||||
id: d.id,
|
||||
name: d.name,
|
||||
slug: d.slug || null,
|
||||
address1: d.address1,
|
||||
address2: d.address2,
|
||||
city: d.city,
|
||||
@@ -1876,7 +1878,7 @@ router.get('/stats', async (req: PublicApiRequest, res: Response) => {
|
||||
SELECT
|
||||
(SELECT COUNT(*) FROM store_products) as product_count,
|
||||
(SELECT COUNT(DISTINCT brand_name_raw) FROM store_products WHERE brand_name_raw IS NOT NULL) as brand_count,
|
||||
(SELECT COUNT(*) FROM dispensaries WHERE crawl_enabled = true AND product_count > 0) as dispensary_count
|
||||
(SELECT COUNT(DISTINCT dispensary_id) FROM store_products) as dispensary_count
|
||||
`);
|
||||
|
||||
const s = stats[0] || {};
|
||||
@@ -1996,4 +1998,235 @@ router.get('/menu', async (req: PublicApiRequest, res: Response) => {
|
||||
}
|
||||
});
|
||||
|
||||
// ============================================================
|
||||
// VISITOR TRACKING & GEOLOCATION
|
||||
// ============================================================
|
||||
|
||||
import crypto from 'crypto';
|
||||
import { GeoLocation, lookupIP } from '../services/ip2location';
|
||||
|
||||
/**
|
||||
* Get location from IP using local IP2Location database
|
||||
*/
|
||||
function getLocationFromIP(ip: string): GeoLocation | null {
|
||||
return lookupIP(ip);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hash IP for privacy (we don't store raw IPs)
|
||||
*/
|
||||
function hashIP(ip: string): string {
|
||||
return crypto.createHash('sha256').update(ip).digest('hex').substring(0, 16);
|
||||
}
|
||||
|
||||
/**
|
||||
* POST /api/v1/visitor/track
|
||||
* Track visitor location for analytics
|
||||
*
|
||||
* Body:
|
||||
* - domain: string (required) - 'findagram.co', 'findadispo.com', etc.
|
||||
* - page_path: string (optional) - current page path
|
||||
* - session_id: string (optional) - client-generated session ID
|
||||
* - referrer: string (optional) - document.referrer
|
||||
*
|
||||
* Returns:
|
||||
* - location: { city, state, lat, lng } for client use
|
||||
*/
|
||||
router.post('/visitor/track', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { domain, page_path, session_id, referrer } = req.body;
|
||||
|
||||
if (!domain) {
|
||||
return res.status(400).json({ error: 'domain is required' });
|
||||
}
|
||||
|
||||
// Get client IP
|
||||
const clientIp = (req.headers['x-forwarded-for'] as string)?.split(',')[0].trim() ||
|
||||
req.headers['x-real-ip'] as string ||
|
||||
req.ip ||
|
||||
req.socket.remoteAddress ||
|
||||
'';
|
||||
|
||||
// Get location from IP (local database lookup)
|
||||
const location = getLocationFromIP(clientIp);
|
||||
|
||||
// Store visit (with hashed IP for privacy)
|
||||
await pool.query(`
|
||||
INSERT INTO visitor_locations (
|
||||
ip_hash, city, state, state_code, country, country_code,
|
||||
latitude, longitude, domain, page_path, referrer, user_agent, session_id
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
|
||||
`, [
|
||||
hashIP(clientIp),
|
||||
location?.city || null,
|
||||
location?.state || null,
|
||||
location?.stateCode || null,
|
||||
location?.country || null,
|
||||
location?.countryCode || null,
|
||||
location?.lat || null,
|
||||
location?.lng || null,
|
||||
domain,
|
||||
page_path || null,
|
||||
referrer || null,
|
||||
req.headers['user-agent'] || null,
|
||||
session_id || null
|
||||
]);
|
||||
|
||||
// Return location to client (for nearby dispensary feature)
|
||||
res.json({
|
||||
success: true,
|
||||
location: location ? {
|
||||
city: location.city,
|
||||
state: location.state,
|
||||
stateCode: location.stateCode,
|
||||
lat: location.lat,
|
||||
lng: location.lng
|
||||
} : null
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Visitor tracking error:', error);
|
||||
// Don't fail the request - tracking is non-critical
|
||||
res.json({
|
||||
success: false,
|
||||
location: null
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/v1/visitor/location
|
||||
* Get visitor location without tracking (just IP lookup)
|
||||
*/
|
||||
router.get('/visitor/location', (req: Request, res: Response) => {
|
||||
try {
|
||||
const clientIp = (req.headers['x-forwarded-for'] as string)?.split(',')[0].trim() ||
|
||||
req.headers['x-real-ip'] as string ||
|
||||
req.ip ||
|
||||
req.socket.remoteAddress ||
|
||||
'';
|
||||
|
||||
const location = getLocationFromIP(clientIp);
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
location: location ? {
|
||||
city: location.city,
|
||||
state: location.state,
|
||||
stateCode: location.stateCode,
|
||||
lat: location.lat,
|
||||
lng: location.lng
|
||||
} : null
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Location lookup error:', error);
|
||||
res.json({
|
||||
success: false,
|
||||
location: null
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/v1/analytics/visitors
|
||||
* Get visitor analytics (admin only - requires auth)
|
||||
*
|
||||
* Query params:
|
||||
* - domain: filter by domain
|
||||
* - days: number of days to look back (default: 30)
|
||||
* - limit: max results (default: 50)
|
||||
*/
|
||||
router.get('/analytics/visitors', async (req: PublicApiRequest, res: Response) => {
|
||||
try {
|
||||
const scope = req.scope;
|
||||
|
||||
// Only allow internal keys
|
||||
if (!scope || scope.type !== 'internal') {
|
||||
return res.status(403).json({ error: 'Access denied - internal key required' });
|
||||
}
|
||||
|
||||
const { domain, days = '30', limit = '50' } = req.query;
|
||||
const daysNum = Math.min(parseInt(days as string, 10) || 30, 90);
|
||||
const limitNum = Math.min(parseInt(limit as string, 10) || 50, 200);
|
||||
|
||||
let whereClause = 'WHERE created_at > NOW() - $1::interval';
|
||||
const params: any[] = [`${daysNum} days`];
|
||||
let paramIndex = 2;
|
||||
|
||||
if (domain) {
|
||||
whereClause += ` AND domain = $${paramIndex}`;
|
||||
params.push(domain);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
// Get top locations
|
||||
const { rows: topLocations } = await pool.query(`
|
||||
SELECT
|
||||
city,
|
||||
state,
|
||||
state_code,
|
||||
country_code,
|
||||
COUNT(*) as visit_count,
|
||||
COUNT(DISTINCT session_id) as unique_sessions,
|
||||
MAX(created_at) as last_visit
|
||||
FROM visitor_locations
|
||||
${whereClause}
|
||||
GROUP BY city, state, state_code, country_code
|
||||
ORDER BY visit_count DESC
|
||||
LIMIT $${paramIndex}
|
||||
`, [...params, limitNum]);
|
||||
|
||||
// Get daily totals
|
||||
const { rows: dailyStats } = await pool.query(`
|
||||
SELECT
|
||||
DATE(created_at) as date,
|
||||
COUNT(*) as visits,
|
||||
COUNT(DISTINCT session_id) as unique_sessions
|
||||
FROM visitor_locations
|
||||
${whereClause}
|
||||
GROUP BY DATE(created_at)
|
||||
ORDER BY date DESC
|
||||
LIMIT 30
|
||||
`, params);
|
||||
|
||||
// Get totals
|
||||
const { rows: totals } = await pool.query(`
|
||||
SELECT
|
||||
COUNT(*) as total_visits,
|
||||
COUNT(DISTINCT session_id) as total_sessions,
|
||||
COUNT(DISTINCT city || state_code) as unique_locations
|
||||
FROM visitor_locations
|
||||
${whereClause}
|
||||
`, params);
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
period: {
|
||||
days: daysNum,
|
||||
domain: domain || 'all'
|
||||
},
|
||||
totals: totals[0],
|
||||
top_locations: topLocations.map(l => ({
|
||||
city: l.city,
|
||||
state: l.state,
|
||||
state_code: l.state_code,
|
||||
country_code: l.country_code,
|
||||
visits: parseInt(l.visit_count, 10),
|
||||
unique_sessions: parseInt(l.unique_sessions, 10),
|
||||
last_visit: l.last_visit
|
||||
})),
|
||||
daily_stats: dailyStats.map(d => ({
|
||||
date: d.date,
|
||||
visits: parseInt(d.visits, 10),
|
||||
unique_sessions: parseInt(d.unique_sessions, 10)
|
||||
}))
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Visitor analytics error:', error);
|
||||
res.status(500).json({
|
||||
error: 'Failed to fetch visitor analytics',
|
||||
message: error.message
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
export default router;
|
||||
|
||||
@@ -444,7 +444,7 @@ router.post('/migration/cancel-pending-crawl-jobs', async (_req: Request, res: R
|
||||
|
||||
/**
|
||||
* POST /api/tasks/migration/create-resync-tasks
|
||||
* Create product_resync tasks for all crawl-enabled dispensaries
|
||||
* Create product_refresh tasks for all crawl-enabled dispensaries
|
||||
*/
|
||||
router.post('/migration/create-resync-tasks', async (req: Request, res: Response) => {
|
||||
try {
|
||||
@@ -474,7 +474,7 @@ router.post('/migration/create-resync-tasks', async (req: Request, res: Response
|
||||
const hasActive = await taskService.hasActiveTask(disp.id);
|
||||
if (!hasActive) {
|
||||
await taskService.createTask({
|
||||
role: 'product_resync',
|
||||
role: 'product_refresh',
|
||||
dispensary_id: disp.id,
|
||||
platform: 'dutchie',
|
||||
priority,
|
||||
|
||||
652
backend/src/routes/worker-registry.ts
Normal file
652
backend/src/routes/worker-registry.ts
Normal file
@@ -0,0 +1,652 @@
|
||||
/**
|
||||
* Worker Registry API Routes
|
||||
*
|
||||
* Dynamic worker management - workers register on startup, get assigned names,
|
||||
* and report heartbeats. Everything is API-driven, no hardcoding.
|
||||
*
|
||||
* Endpoints:
|
||||
* POST /api/worker-registry/register - Worker reports for duty
|
||||
* POST /api/worker-registry/heartbeat - Worker heartbeat
|
||||
* POST /api/worker-registry/deregister - Worker signing off
|
||||
* GET /api/worker-registry/workers - List all workers (for dashboard)
|
||||
* GET /api/worker-registry/workers/:id - Get specific worker
|
||||
* POST /api/worker-registry/cleanup - Mark stale workers offline
|
||||
*
|
||||
* GET /api/worker-registry/names - List all names in pool
|
||||
* POST /api/worker-registry/names - Add names to pool
|
||||
* DELETE /api/worker-registry/names/:name - Remove name from pool
|
||||
*
|
||||
* GET /api/worker-registry/roles - List available task roles
|
||||
* POST /api/worker-registry/roles - Add a new role (future)
|
||||
*/
|
||||
|
||||
import { Router, Request, Response } from 'express';
|
||||
import { pool } from '../db/pool';
|
||||
import os from 'os';
|
||||
|
||||
const router = Router();
|
||||
|
||||
// ============================================================
|
||||
// WORKER REGISTRATION
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* POST /api/worker-registry/register
|
||||
* Worker reports for duty - gets assigned a friendly name
|
||||
*
|
||||
* Body:
|
||||
* - role: string (optional) - task role, or null for role-agnostic workers
|
||||
* - worker_id: string (optional) - custom ID, auto-generated if not provided
|
||||
* - pod_name: string (optional) - k8s pod name
|
||||
* - hostname: string (optional) - machine hostname
|
||||
* - metadata: object (optional) - additional worker info
|
||||
*
|
||||
* Returns:
|
||||
* - worker_id: assigned worker ID
|
||||
* - friendly_name: assigned name from pool
|
||||
* - role: confirmed role (or null if agnostic)
|
||||
* - message: welcome message
|
||||
*/
|
||||
router.post('/register', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const {
|
||||
role = null, // Role is now optional - null means agnostic
|
||||
worker_id,
|
||||
pod_name,
|
||||
hostname,
|
||||
ip_address,
|
||||
metadata = {}
|
||||
} = req.body;
|
||||
|
||||
// Generate worker_id if not provided
|
||||
const finalWorkerId = worker_id || `worker-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
|
||||
const finalHostname = hostname || os.hostname();
|
||||
const clientIp = ip_address || req.ip || req.socket.remoteAddress;
|
||||
|
||||
// Check if worker already registered
|
||||
const existing = await pool.query(
|
||||
'SELECT id, friendly_name, status FROM worker_registry WHERE worker_id = $1',
|
||||
[finalWorkerId]
|
||||
);
|
||||
|
||||
if (existing.rows.length > 0) {
|
||||
// Re-activate existing worker
|
||||
const { rows } = await pool.query(`
|
||||
UPDATE worker_registry
|
||||
SET status = 'active',
|
||||
role = $1,
|
||||
pod_name = $2,
|
||||
hostname = $3,
|
||||
ip_address = $4,
|
||||
last_heartbeat_at = NOW(),
|
||||
started_at = NOW(),
|
||||
metadata = $5,
|
||||
updated_at = NOW()
|
||||
WHERE worker_id = $6
|
||||
RETURNING id, worker_id, friendly_name, role
|
||||
`, [role, pod_name, finalHostname, clientIp, metadata, finalWorkerId]);
|
||||
|
||||
const worker = rows[0];
|
||||
const roleMsg = role ? `for ${role}` : 'as role-agnostic';
|
||||
console.log(`[WorkerRegistry] Worker "${worker.friendly_name}" (${finalWorkerId}) re-registered ${roleMsg}`);
|
||||
|
||||
return res.json({
|
||||
success: true,
|
||||
worker_id: worker.worker_id,
|
||||
friendly_name: worker.friendly_name,
|
||||
role: worker.role,
|
||||
message: role
|
||||
? `Welcome back, ${worker.friendly_name}! You are assigned to ${role}.`
|
||||
: `Welcome back, ${worker.friendly_name}! You are ready to take any task.`
|
||||
});
|
||||
}
|
||||
|
||||
// Assign a friendly name
|
||||
const nameResult = await pool.query('SELECT assign_worker_name($1) as name', [finalWorkerId]);
|
||||
const friendlyName = nameResult.rows[0].name;
|
||||
|
||||
// Register the worker
|
||||
const { rows } = await pool.query(`
|
||||
INSERT INTO worker_registry (
|
||||
worker_id, friendly_name, role, pod_name, hostname, ip_address, status, metadata
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, 'active', $7)
|
||||
RETURNING id, worker_id, friendly_name, role
|
||||
`, [finalWorkerId, friendlyName, role, pod_name, finalHostname, clientIp, metadata]);
|
||||
|
||||
const worker = rows[0];
|
||||
const roleMsg = role ? `for ${role}` : 'as role-agnostic';
|
||||
console.log(`[WorkerRegistry] New worker "${friendlyName}" (${finalWorkerId}) reporting for duty ${roleMsg}`);
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
worker_id: worker.worker_id,
|
||||
friendly_name: worker.friendly_name,
|
||||
role: worker.role,
|
||||
message: role
|
||||
? `Hello ${friendlyName}! You are now registered for ${role}. Ready for work!`
|
||||
: `Hello ${friendlyName}! You are ready to take any task from the pool.`
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('[WorkerRegistry] Registration error:', error);
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST /api/worker-registry/heartbeat
|
||||
* Worker sends heartbeat to stay alive
|
||||
*
|
||||
* Body:
|
||||
* - worker_id: string (required)
|
||||
* - current_task_id: number (optional) - task currently being processed
|
||||
* - status: string (optional) - 'active', 'idle'
|
||||
*/
|
||||
router.post('/heartbeat', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { worker_id, current_task_id, status = 'active', resources } = req.body;
|
||||
|
||||
if (!worker_id) {
|
||||
return res.status(400).json({ success: false, error: 'worker_id is required' });
|
||||
}
|
||||
|
||||
// Store resources in metadata jsonb column
|
||||
const { rows } = await pool.query(`
|
||||
UPDATE worker_registry
|
||||
SET last_heartbeat_at = NOW(),
|
||||
current_task_id = $1,
|
||||
status = $2,
|
||||
metadata = COALESCE(metadata, '{}'::jsonb) || COALESCE($4::jsonb, '{}'::jsonb),
|
||||
updated_at = NOW()
|
||||
WHERE worker_id = $3
|
||||
RETURNING id, friendly_name, status
|
||||
`, [current_task_id || null, status, worker_id, resources ? JSON.stringify(resources) : null]);
|
||||
|
||||
if (rows.length === 0) {
|
||||
return res.status(404).json({ success: false, error: 'Worker not found - please register first' });
|
||||
}
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
worker: rows[0]
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('[WorkerRegistry] Heartbeat error:', error);
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST /api/worker-registry/task-completed
|
||||
* Worker reports task completion
|
||||
*
|
||||
* Body:
|
||||
* - worker_id: string (required)
|
||||
* - success: boolean (required)
|
||||
*/
|
||||
router.post('/task-completed', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { worker_id, success } = req.body;
|
||||
|
||||
if (!worker_id) {
|
||||
return res.status(400).json({ success: false, error: 'worker_id is required' });
|
||||
}
|
||||
|
||||
const incrementField = success ? 'tasks_completed' : 'tasks_failed';
|
||||
|
||||
const { rows } = await pool.query(`
|
||||
UPDATE worker_registry
|
||||
SET ${incrementField} = ${incrementField} + 1,
|
||||
last_task_at = NOW(),
|
||||
current_task_id = NULL,
|
||||
status = 'idle',
|
||||
updated_at = NOW()
|
||||
WHERE worker_id = $1
|
||||
RETURNING id, friendly_name, tasks_completed, tasks_failed
|
||||
`, [worker_id]);
|
||||
|
||||
if (rows.length === 0) {
|
||||
return res.status(404).json({ success: false, error: 'Worker not found' });
|
||||
}
|
||||
|
||||
res.json({ success: true, worker: rows[0] });
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST /api/worker-registry/deregister
|
||||
* Worker signing off (graceful shutdown)
|
||||
*
|
||||
* Body:
|
||||
* - worker_id: string (required)
|
||||
*/
|
||||
router.post('/deregister', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { worker_id } = req.body;
|
||||
|
||||
if (!worker_id) {
|
||||
return res.status(400).json({ success: false, error: 'worker_id is required' });
|
||||
}
|
||||
|
||||
// Release the name back to the pool
|
||||
await pool.query('SELECT release_worker_name($1)', [worker_id]);
|
||||
|
||||
// Mark as terminated
|
||||
const { rows } = await pool.query(`
|
||||
UPDATE worker_registry
|
||||
SET status = 'terminated',
|
||||
current_task_id = NULL,
|
||||
updated_at = NOW()
|
||||
WHERE worker_id = $1
|
||||
RETURNING id, friendly_name
|
||||
`, [worker_id]);
|
||||
|
||||
if (rows.length === 0) {
|
||||
return res.status(404).json({ success: false, error: 'Worker not found' });
|
||||
}
|
||||
|
||||
console.log(`[WorkerRegistry] Worker "${rows[0].friendly_name}" (${worker_id}) signed off`);
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
message: `Goodbye ${rows[0].friendly_name}! Thanks for your work.`
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('[WorkerRegistry] Deregister error:', error);
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
// ============================================================
|
||||
// WORKER LISTING (for Dashboard)
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* GET /api/worker-registry/workers
|
||||
* List all workers (for dashboard)
|
||||
*
|
||||
* Query params:
|
||||
* - status: filter by status (active, idle, offline, all)
|
||||
* - role: filter by role
|
||||
* - include_terminated: include terminated workers (default: false)
|
||||
*/
|
||||
router.get('/workers', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { status, role, include_terminated = 'false' } = req.query;
|
||||
|
||||
let whereClause = include_terminated === 'true' ? 'WHERE 1=1' : "WHERE status != 'terminated'";
|
||||
const params: any[] = [];
|
||||
let paramIndex = 1;
|
||||
|
||||
if (status && status !== 'all') {
|
||||
whereClause += ` AND status = $${paramIndex}`;
|
||||
params.push(status);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
if (role) {
|
||||
whereClause += ` AND role = $${paramIndex}`;
|
||||
params.push(role);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
const { rows } = await pool.query(`
|
||||
SELECT
|
||||
id,
|
||||
worker_id,
|
||||
friendly_name,
|
||||
role,
|
||||
pod_name,
|
||||
hostname,
|
||||
ip_address,
|
||||
status,
|
||||
started_at,
|
||||
last_heartbeat_at,
|
||||
last_task_at,
|
||||
tasks_completed,
|
||||
tasks_failed,
|
||||
current_task_id,
|
||||
metadata,
|
||||
EXTRACT(EPOCH FROM (NOW() - last_heartbeat_at)) as seconds_since_heartbeat,
|
||||
CASE
|
||||
WHEN status = 'offline' OR status = 'terminated' THEN status
|
||||
WHEN last_heartbeat_at < NOW() - INTERVAL '2 minutes' THEN 'stale'
|
||||
WHEN current_task_id IS NOT NULL THEN 'busy'
|
||||
ELSE 'ready'
|
||||
END as health_status,
|
||||
created_at
|
||||
FROM worker_registry
|
||||
${whereClause}
|
||||
ORDER BY
|
||||
CASE status
|
||||
WHEN 'active' THEN 1
|
||||
WHEN 'idle' THEN 2
|
||||
WHEN 'offline' THEN 3
|
||||
ELSE 4
|
||||
END,
|
||||
last_heartbeat_at DESC
|
||||
`, params);
|
||||
|
||||
// Get summary counts
|
||||
const { rows: summary } = await pool.query(`
|
||||
SELECT
|
||||
COUNT(*) FILTER (WHERE status = 'active') as active_count,
|
||||
COUNT(*) FILTER (WHERE status = 'idle') as idle_count,
|
||||
COUNT(*) FILTER (WHERE status = 'offline') as offline_count,
|
||||
COUNT(*) FILTER (WHERE status != 'terminated') as total_count,
|
||||
COUNT(DISTINCT role) FILTER (WHERE status IN ('active', 'idle')) as active_roles
|
||||
FROM worker_registry
|
||||
`);
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
workers: rows,
|
||||
summary: summary[0]
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('[WorkerRegistry] List workers error:', error);
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/worker-registry/workers/:workerId
|
||||
* Get specific worker details
|
||||
*/
|
||||
router.get('/workers/:workerId', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { workerId } = req.params;
|
||||
|
||||
const { rows } = await pool.query(`
|
||||
SELECT * FROM worker_registry WHERE worker_id = $1
|
||||
`, [workerId]);
|
||||
|
||||
if (rows.length === 0) {
|
||||
return res.status(404).json({ success: false, error: 'Worker not found' });
|
||||
}
|
||||
|
||||
res.json({ success: true, worker: rows[0] });
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* DELETE /api/worker-registry/workers/:workerId
|
||||
* Remove a worker (admin action)
|
||||
*/
|
||||
router.delete('/workers/:workerId', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { workerId } = req.params;
|
||||
|
||||
// Release name
|
||||
await pool.query('SELECT release_worker_name($1)', [workerId]);
|
||||
|
||||
// Delete worker
|
||||
const { rows } = await pool.query(`
|
||||
DELETE FROM worker_registry WHERE worker_id = $1 RETURNING friendly_name
|
||||
`, [workerId]);
|
||||
|
||||
if (rows.length === 0) {
|
||||
return res.status(404).json({ success: false, error: 'Worker not found' });
|
||||
}
|
||||
|
||||
res.json({ success: true, message: `Worker ${rows[0].friendly_name} removed` });
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST /api/worker-registry/cleanup
|
||||
* Mark stale workers as offline
|
||||
*
|
||||
* Body:
|
||||
* - stale_threshold_minutes: number (default: 5)
|
||||
*/
|
||||
router.post('/cleanup', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { stale_threshold_minutes = 5 } = req.body;
|
||||
|
||||
const { rows } = await pool.query(
|
||||
'SELECT mark_stale_workers($1) as count',
|
||||
[stale_threshold_minutes]
|
||||
);
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
stale_workers_marked: rows[0].count,
|
||||
message: `Marked ${rows[0].count} stale workers as offline`
|
||||
});
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
// ============================================================
|
||||
// NAME POOL MANAGEMENT
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* GET /api/worker-registry/names
|
||||
* List all names in the pool
|
||||
*/
|
||||
router.get('/names', async (_req: Request, res: Response) => {
|
||||
try {
|
||||
const { rows } = await pool.query(`
|
||||
SELECT
|
||||
id,
|
||||
name,
|
||||
in_use,
|
||||
assigned_to,
|
||||
assigned_at
|
||||
FROM worker_name_pool
|
||||
ORDER BY in_use DESC, name ASC
|
||||
`);
|
||||
|
||||
const { rows: summary } = await pool.query(`
|
||||
SELECT
|
||||
COUNT(*) as total,
|
||||
COUNT(*) FILTER (WHERE in_use = true) as in_use,
|
||||
COUNT(*) FILTER (WHERE in_use = false) as available
|
||||
FROM worker_name_pool
|
||||
`);
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
names: rows,
|
||||
summary: summary[0]
|
||||
});
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* POST /api/worker-registry/names
|
||||
* Add names to the pool
|
||||
*
|
||||
* Body:
|
||||
* - names: string[] (required) - array of names to add
|
||||
*/
|
||||
router.post('/names', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { names } = req.body;
|
||||
|
||||
if (!names || !Array.isArray(names) || names.length === 0) {
|
||||
return res.status(400).json({ success: false, error: 'names array is required' });
|
||||
}
|
||||
|
||||
const values = names.map(n => `('${n.replace(/'/g, "''")}')`).join(', ');
|
||||
|
||||
const { rowCount } = await pool.query(`
|
||||
INSERT INTO worker_name_pool (name)
|
||||
VALUES ${values}
|
||||
ON CONFLICT (name) DO NOTHING
|
||||
`);
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
added: rowCount,
|
||||
message: `Added ${rowCount} new names to the pool`
|
||||
});
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* DELETE /api/worker-registry/names/:name
|
||||
* Remove a name from the pool (only if not in use)
|
||||
*/
|
||||
router.delete('/names/:name', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const { name } = req.params;
|
||||
|
||||
const { rows } = await pool.query(`
|
||||
DELETE FROM worker_name_pool
|
||||
WHERE name = $1 AND in_use = false
|
||||
RETURNING name
|
||||
`, [name]);
|
||||
|
||||
if (rows.length === 0) {
|
||||
return res.status(400).json({
|
||||
success: false,
|
||||
error: 'Name not found or currently in use'
|
||||
});
|
||||
}
|
||||
|
||||
res.json({ success: true, message: `Name "${name}" removed from pool` });
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
// ============================================================
|
||||
// ROLE MANAGEMENT
|
||||
// ============================================================
|
||||
|
||||
/**
|
||||
* GET /api/worker-registry/roles
|
||||
* List available task roles
|
||||
*/
|
||||
router.get('/roles', async (_req: Request, res: Response) => {
|
||||
// These are the roles the task handlers support
|
||||
const roles = [
|
||||
{
|
||||
id: 'product_refresh',
|
||||
name: 'Product Refresh',
|
||||
description: 'Re-crawl dispensary products for price/stock changes',
|
||||
handler: 'handleProductRefresh'
|
||||
},
|
||||
{
|
||||
id: 'product_discovery',
|
||||
name: 'Product Discovery',
|
||||
description: 'Initial product discovery for new dispensaries',
|
||||
handler: 'handleProductDiscovery'
|
||||
},
|
||||
{
|
||||
id: 'store_discovery',
|
||||
name: 'Store Discovery',
|
||||
description: 'Discover new dispensary locations',
|
||||
handler: 'handleStoreDiscovery'
|
||||
},
|
||||
{
|
||||
id: 'entry_point_discovery',
|
||||
name: 'Entry Point Discovery',
|
||||
description: 'Resolve platform IDs from menu URLs',
|
||||
handler: 'handleEntryPointDiscovery'
|
||||
},
|
||||
{
|
||||
id: 'analytics_refresh',
|
||||
name: 'Analytics Refresh',
|
||||
description: 'Refresh materialized views and analytics',
|
||||
handler: 'handleAnalyticsRefresh'
|
||||
}
|
||||
];
|
||||
|
||||
// Get active worker counts per role
|
||||
try {
|
||||
const { rows } = await pool.query(`
|
||||
SELECT role, COUNT(*) as worker_count
|
||||
FROM worker_registry
|
||||
WHERE status IN ('active', 'idle')
|
||||
GROUP BY role
|
||||
`);
|
||||
|
||||
const countMap = new Map(rows.map(r => [r.role, parseInt(r.worker_count)]));
|
||||
|
||||
const rolesWithCounts = roles.map(r => ({
|
||||
...r,
|
||||
active_workers: countMap.get(r.id) || 0
|
||||
}));
|
||||
|
||||
res.json({ success: true, roles: rolesWithCounts });
|
||||
} catch {
|
||||
// If table doesn't exist yet, just return roles without counts
|
||||
res.json({ success: true, roles: roles.map(r => ({ ...r, active_workers: 0 })) });
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* GET /api/worker-registry/capacity
|
||||
* Get capacity planning info
|
||||
*/
|
||||
router.get('/capacity', async (_req: Request, res: Response) => {
|
||||
try {
|
||||
// Get worker counts by role
|
||||
const { rows: workerCounts } = await pool.query(`
|
||||
SELECT role, COUNT(*) as count
|
||||
FROM worker_registry
|
||||
WHERE status IN ('active', 'idle')
|
||||
GROUP BY role
|
||||
`);
|
||||
|
||||
// Get pending task counts by role (if worker_tasks exists)
|
||||
let taskCounts: any[] = [];
|
||||
try {
|
||||
const result = await pool.query(`
|
||||
SELECT role, COUNT(*) as pending_count
|
||||
FROM worker_tasks
|
||||
WHERE status = 'pending'
|
||||
GROUP BY role
|
||||
`);
|
||||
taskCounts = result.rows;
|
||||
} catch {
|
||||
// worker_tasks might not exist yet
|
||||
}
|
||||
|
||||
// Get crawl-enabled store count
|
||||
const storeCountResult = await pool.query(`
|
||||
SELECT COUNT(*) as count
|
||||
FROM dispensaries
|
||||
WHERE crawl_enabled = true AND platform_dispensary_id IS NOT NULL
|
||||
`);
|
||||
const totalStores = parseInt(storeCountResult.rows[0].count);
|
||||
|
||||
const workerMap = new Map(workerCounts.map(r => [r.role, parseInt(r.count)]));
|
||||
const taskMap = new Map(taskCounts.map(r => [r.role, parseInt(r.pending_count)]));
|
||||
|
||||
const roles = ['product_refresh', 'product_discovery', 'store_discovery', 'entry_point_discovery', 'analytics_refresh'];
|
||||
|
||||
const capacity = roles.map(role => ({
|
||||
role,
|
||||
active_workers: workerMap.get(role) || 0,
|
||||
pending_tasks: taskMap.get(role) || 0,
|
||||
// Rough estimate: 20 seconds per task, 4-hour cycle
|
||||
tasks_per_worker_per_cycle: 720,
|
||||
workers_needed_for_all_stores: Math.ceil(totalStores / 720)
|
||||
}));
|
||||
|
||||
res.json({
|
||||
success: true,
|
||||
total_stores: totalStores,
|
||||
capacity
|
||||
});
|
||||
} catch (error: any) {
|
||||
res.status(500).json({ success: false, error: error.message });
|
||||
}
|
||||
});
|
||||
|
||||
export default router;
|
||||
@@ -109,14 +109,14 @@ export class ProxyRotator {
|
||||
username,
|
||||
password,
|
||||
protocol,
|
||||
is_active as "isActive",
|
||||
last_used_at as "lastUsedAt",
|
||||
active as "isActive",
|
||||
last_tested_at as "lastUsedAt",
|
||||
failure_count as "failureCount",
|
||||
success_count as "successCount",
|
||||
avg_response_time_ms as "avgResponseTimeMs"
|
||||
0 as "successCount",
|
||||
response_time_ms as "avgResponseTimeMs"
|
||||
FROM proxies
|
||||
WHERE is_active = true
|
||||
ORDER BY failure_count ASC, last_used_at ASC NULLS FIRST
|
||||
WHERE active = true
|
||||
ORDER BY failure_count ASC, last_tested_at ASC NULLS FIRST
|
||||
`);
|
||||
|
||||
this.proxies = result.rows;
|
||||
@@ -192,11 +192,11 @@ export class ProxyRotator {
|
||||
UPDATE proxies
|
||||
SET
|
||||
failure_count = failure_count + 1,
|
||||
last_failure_at = NOW(),
|
||||
last_error = $2,
|
||||
is_active = CASE WHEN failure_count >= 4 THEN false ELSE is_active END
|
||||
updated_at = NOW(),
|
||||
test_result = $2,
|
||||
active = CASE WHEN failure_count >= 4 THEN false ELSE active END
|
||||
WHERE id = $1
|
||||
`, [proxyId, error || null]);
|
||||
`, [proxyId, error || 'failed']);
|
||||
} catch (err) {
|
||||
console.error(`[ProxyRotator] Failed to update proxy ${proxyId}:`, err);
|
||||
}
|
||||
@@ -226,12 +226,13 @@ export class ProxyRotator {
|
||||
await this.pool.query(`
|
||||
UPDATE proxies
|
||||
SET
|
||||
success_count = success_count + 1,
|
||||
last_used_at = NOW(),
|
||||
avg_response_time_ms = CASE
|
||||
WHEN avg_response_time_ms IS NULL THEN $2
|
||||
ELSE (avg_response_time_ms * 0.8) + ($2 * 0.2)
|
||||
END
|
||||
last_tested_at = NOW(),
|
||||
test_result = 'success',
|
||||
response_time_ms = CASE
|
||||
WHEN response_time_ms IS NULL THEN $2
|
||||
ELSE (response_time_ms * 0.8 + $2 * 0.2)::integer
|
||||
END,
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
`, [proxyId, responseTimeMs || null]);
|
||||
} catch (err) {
|
||||
|
||||
134
backend/src/services/ip2location.ts
Normal file
134
backend/src/services/ip2location.ts
Normal file
@@ -0,0 +1,134 @@
/**
 * IP2Location Service
 *
 * Uses local IP2Location LITE DB3 database for IP geolocation.
 * No external API calls, no rate limits.
 *
 * Database: IP2Location LITE DB3 (free, monthly updates)
 * Fields: country, region, city, latitude, longitude
 */

import path from 'path';
import fs from 'fs';

// @ts-ignore - no types for ip2location-nodejs
const { IP2Location } = require('ip2location-nodejs');

const DB_PATH = process.env.IP2LOCATION_DB_PATH ||
  path.join(__dirname, '../../data/ip2location/IP2LOCATION-LITE-DB5.BIN');

let ip2location: any = null;
let dbLoaded = false;

/**
 * Initialize IP2Location database
 */
export function initIP2Location(): boolean {
  if (dbLoaded) return true;

  try {
    if (!fs.existsSync(DB_PATH)) {
      console.warn(`IP2Location database not found at: ${DB_PATH}`);
      console.warn('Run: ./scripts/download-ip2location.sh to download');
      return false;
    }

    ip2location = new IP2Location();
    ip2location.open(DB_PATH);
    dbLoaded = true;
    console.log('IP2Location database loaded successfully');
    return true;
  } catch (err) {
    console.error('Failed to load IP2Location database:', err);
    return false;
  }
}

/**
 * Close IP2Location database
 */
export function closeIP2Location(): void {
  if (ip2location) {
    ip2location.close();
    ip2location = null;
    dbLoaded = false;
  }
}

export interface GeoLocation {
  city: string | null;
  state: string | null;
  stateCode: string | null;
  country: string | null;
  countryCode: string | null;
  lat: number | null;
  lng: number | null;
}

/**
 * Lookup IP address location
 *
 * @param ip - IPv4 or IPv6 address
 * @returns Location data or null if not found
 */
export function lookupIP(ip: string): GeoLocation | null {
  // Skip private/localhost IPs
  if (!ip || ip === '127.0.0.1' || ip === '::1' ||
      ip.startsWith('192.168.') || ip.startsWith('10.') ||
      ip.startsWith('172.16.') || ip.startsWith('172.17.') ||
      ip.startsWith('::ffff:127.') || ip.startsWith('::ffff:192.168.') ||
      ip.startsWith('::ffff:10.')) {
    return null;
  }

  // Strip IPv6 prefix if present
  const cleanIP = ip.replace(/^::ffff:/, '');

  // Initialize on first use if not already loaded
  if (!dbLoaded) {
    if (!initIP2Location()) {
      return null;
    }
  }

  try {
    const result = ip2location.getAll(cleanIP);

    if (!result || result.ip === '?' || result.countryShort === '-') {
      return null;
    }

    // DB3 LITE doesn't include lat/lng - would need DB5+ for that
    const lat = typeof result.latitude === 'number' && result.latitude !== 0 ? result.latitude : null;
    const lng = typeof result.longitude === 'number' && result.longitude !== 0 ? result.longitude : null;

    return {
      city: result.city !== '-' ? result.city : null,
      state: result.region !== '-' ? result.region : null,
      stateCode: null, // DB3 doesn't include state codes
      country: result.countryLong !== '-' ? result.countryLong : null,
      countryCode: result.countryShort !== '-' ? result.countryShort : null,
      lat,
      lng,
    };
  } catch (err) {
    console.error('IP2Location lookup error:', err);
    return null;
  }
}

/**
 * Check if IP2Location database is available
 */
export function isIP2LocationAvailable(): boolean {
  if (dbLoaded) return true;
  return fs.existsSync(DB_PATH);
}

// Export singleton-style interface
export default {
  init: initIP2Location,
  close: closeIP2Location,
  lookup: lookupIP,
  isAvailable: isIP2LocationAvailable,
};
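For context, a hedged sketch of how this service might be consumed for visitor analytics. Only `lookupIP` / `isIP2LocationAvailable` (via the default export) come from the file above; the Express middleware shape, the `X-Forwarded-For` handling, and the `req.geo` field are assumptions for illustration, not part of this commit.

```typescript
// Hypothetical consumer: an Express middleware that tags each request with a GeoLocation.
import { Request, Response, NextFunction } from 'express';
import ip2location from './ip2location';

export function geoMiddleware(
  req: Request & { geo?: ReturnType<typeof ip2location.lookup> },
  _res: Response,
  next: NextFunction
): void {
  // Skip entirely if the local database has not been downloaded.
  if (!ip2location.isAvailable()) return next();

  // Prefer the first X-Forwarded-For hop when behind a proxy, else the socket address.
  const forwarded = (req.headers['x-forwarded-for'] as string | undefined)?.split(',')[0]?.trim();
  const ip = forwarded || req.socket.remoteAddress || '';

  req.geo = ip2location.lookup(ip); // null for private/unknown addresses
  next();
}
```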
@@ -3,7 +3,7 @@ import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { Browser, Page } from 'puppeteer';
import { SocksProxyAgent } from 'socks-proxy-agent';
import { pool } from '../db/pool';
import { uploadImageFromUrl, getImageUrl } from '../utils/minio';
import { downloadProductImageLegacy } from '../utils/image-storage';
import { logger } from './logger';
import { registerScraper, updateScraperStats, completeScraper } from '../routes/scraper-monitor';
import { incrementProxyFailure, getActiveProxy, isBotDetectionError, putProxyInTimeout } from './proxy';
@@ -767,7 +767,8 @@ export async function saveProducts(storeId: number, categoryId: number, products

      if (product.imageUrl && !localImagePath) {
        try {
          localImagePath = await uploadImageFromUrl(product.imageUrl, productId);
          const result = await downloadProductImageLegacy(product.imageUrl, 0, productId);
          localImagePath = result.urls?.original || null;
          await client.query(`
            UPDATE products
            SET local_image_path = $1
@@ -1,13 +1,21 @@
/**
 * Entry Point Discovery Handler
 *
 * Detects menu type and resolves platform IDs for a discovered store.
 * Resolves platform IDs for a discovered store using Dutchie GraphQL.
 * This is the step between store_discovery and product_discovery.
 *
 * TODO: Integrate with platform ID resolution when available
 * Flow:
 * 1. Load dispensary info from database
 * 2. Extract slug from menu_url
 * 3. Start stealth session (fingerprint + optional proxy)
 * 4. Query Dutchie GraphQL to resolve slug → platform_dispensary_id
 * 5. Update dispensary record with resolved ID
 * 6. Queue product_discovery task if successful
 */

import { TaskContext, TaskResult } from '../task-worker';
import { startSession, endSession } from '../../platforms/dutchie';
import { resolveDispensaryIdWithDetails } from '../../platforms/dutchie/queries';

export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskResult> {
  const { pool, task } = ctx;
@@ -18,9 +26,11 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
  }

  try {
    // Get dispensary info
    // ============================================================
    // STEP 1: Load dispensary info
    // ============================================================
    const dispResult = await pool.query(`
      SELECT id, name, menu_url, platform_dispensary_id, menu_type
      SELECT id, name, menu_url, platform_dispensary_id, menu_type, state
      FROM dispensaries
      WHERE id = $1
    `, [dispensaryId]);
@@ -33,7 +43,7 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR

    // If already has platform_dispensary_id, we're done
    if (dispensary.platform_dispensary_id) {
      console.log(`[EntryPointDiscovery] Dispensary ${dispensaryId} already has platform ID`);
      console.log(`[EntryPointDiscovery] Dispensary ${dispensaryId} already has platform ID: ${dispensary.platform_dispensary_id}`);
      return {
        success: true,
        alreadyResolved: true,
@@ -46,9 +56,12 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
      return { success: false, error: `Dispensary ${dispensaryId} has no menu_url` };
    }

    console.log(`[EntryPointDiscovery] Would resolve platform ID for ${dispensary.name} from ${menuUrl}`);
    console.log(`[EntryPointDiscovery] Resolving platform ID for ${dispensary.name}`);
    console.log(`[EntryPointDiscovery] Menu URL: ${menuUrl}`);

    // Extract slug from menu URL
    // ============================================================
    // STEP 2: Extract slug from menu URL
    // ============================================================
    let slug: string | null = null;

    const embeddedMatch = menuUrl.match(/\/embedded-menu\/([^/?]+)/);
@@ -61,21 +74,109 @@ export async function handleEntryPointDiscovery(ctx: TaskContext): Promise<TaskR
    }

    if (!slug) {
      // Mark as non-dutchie menu type
      await pool.query(`
        UPDATE dispensaries
        SET menu_type = 'unknown', updated_at = NOW()
        WHERE id = $1
      `, [dispensaryId]);

      return {
        success: false,
        error: `Could not extract slug from menu_url: ${menuUrl}`,
      };
    }

    // TODO: Integrate with actual platform ID resolution
    // For now, mark the task as needing manual resolution
    console.log(`[EntryPointDiscovery] Found slug: ${slug} - manual resolution needed`);
    console.log(`[EntryPointDiscovery] Extracted slug: ${slug}`);

    await ctx.heartbeat();

    // ============================================================
    // STEP 3: Start stealth session
    // ============================================================
    const session = startSession(dispensary.state || 'AZ', 'America/Phoenix');
    console.log(`[EntryPointDiscovery] Session started: ${session.sessionId}`);

    try {
      // ============================================================
      // STEP 4: Resolve platform ID via GraphQL
      // ============================================================
      console.log(`[EntryPointDiscovery] Querying Dutchie GraphQL for slug: ${slug}`);

      const result = await resolveDispensaryIdWithDetails(slug);

      if (!result.dispensaryId) {
        // Resolution failed - could be 403, 404, or invalid response
        const reason = result.httpStatus
          ? `HTTP ${result.httpStatus}`
          : result.error || 'Unknown error';

        console.log(`[EntryPointDiscovery] Failed to resolve ${slug}: ${reason}`);

        // Mark as failed resolution but keep menu_type as dutchie
        await pool.query(`
          UPDATE dispensaries
          SET
            menu_type = CASE
              WHEN $2 = 404 THEN 'removed'
              WHEN $2 = 403 THEN 'blocked'
              ELSE 'dutchie'
            END,
            updated_at = NOW()
          WHERE id = $1
        `, [dispensaryId, result.httpStatus || 0]);

        return {
          success: false,
          error: `Could not resolve platform ID: ${reason}`,
          slug,
          httpStatus: result.httpStatus,
        };
      }

      const platformId = result.dispensaryId;
      console.log(`[EntryPointDiscovery] Resolved ${slug} -> ${platformId}`);

      await ctx.heartbeat();

      // ============================================================
      // STEP 5: Update dispensary with resolved ID
      // ============================================================
      await pool.query(`
        UPDATE dispensaries
        SET
          platform_dispensary_id = $2,
          menu_type = 'dutchie',
          crawl_enabled = true,
          updated_at = NOW()
        WHERE id = $1
      `, [dispensaryId, platformId]);

      console.log(`[EntryPointDiscovery] Updated dispensary ${dispensaryId} with platform ID`);

      // ============================================================
      // STEP 6: Queue product_discovery task
      // ============================================================
      await pool.query(`
        INSERT INTO worker_tasks (role, dispensary_id, priority, scheduled_for)
        VALUES ('product_discovery', $1, 5, NOW())
        ON CONFLICT DO NOTHING
      `, [dispensaryId]);

      console.log(`[EntryPointDiscovery] Queued product_discovery task for dispensary ${dispensaryId}`);

      return {
        success: true,
        platformId,
        slug,
        queuedProductDiscovery: true,
      };

    } finally {
      // Always end session
      endSession();
    }

    return {
      success: true,
      message: 'Slug extracted, awaiting platform ID resolution',
      slug,
    };
  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    console.error(`[EntryPointDiscovery] Error for dispensary ${dispensaryId}:`, errorMessage);
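A quick illustration of the STEP 2 slug extraction. Only the `/embedded-menu/` pattern appears in the hunk above (the rest of the matching logic is outside this diff); the plain-URL fallback below is an assumption added for the example, and the sample URL is made up.

```typescript
// Illustration of the slug extraction used by the handler. The embedded-menu regex
// is taken from the hunk above; the second pattern is an assumed fallback URL shape.
function extractSlug(menuUrl: string): string | null {
  const embedded = menuUrl.match(/\/embedded-menu\/([^/?]+)/);
  if (embedded) return embedded[1];

  const direct = menuUrl.match(/\/dispensary\/([^/?]+)/); // hypothetical second URL form
  return direct ? direct[1] : null;
}

// extractSlug('https://dutchie.com/embedded-menu/green-leaf-phx?menuType=rec') === 'green-leaf-phx'
```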
@@ -4,7 +4,7 @@
 * Exports all task handlers for the task worker.
 */

export { handleProductResync } from './product-resync';
export { handleProductRefresh } from './product-refresh';
export { handleProductDiscovery } from './product-discovery';
export { handleStoreDiscovery } from './store-discovery';
export { handleEntryPointDiscovery } from './entry-point-discovery';

@@ -6,11 +6,11 @@
 */

import { TaskContext, TaskResult } from '../task-worker';
import { handleProductResync } from './product-resync';
import { handleProductRefresh } from './product-refresh';

export async function handleProductDiscovery(ctx: TaskContext): Promise<TaskResult> {
  // Product discovery is essentially the same as resync for the first time
  // Product discovery is essentially the same as refresh for the first time
  // The main difference is in when this task is triggered (new store vs scheduled)
  console.log(`[ProductDiscovery] Starting initial product fetch for dispensary ${ctx.task.dispensary_id}`);
  return handleProductResync(ctx);
  return handleProductRefresh(ctx);
}

@@ -1,5 +1,5 @@
/**
 * Product Resync Handler
 * Product Refresh Handler
 *
 * Re-crawls a store to capture price/stock changes using the GraphQL pipeline.
 *
@@ -31,12 +31,12 @@ import {

const normalizer = new DutchieNormalizer();

export async function handleProductResync(ctx: TaskContext): Promise<TaskResult> {
export async function handleProductRefresh(ctx: TaskContext): Promise<TaskResult> {
  const { pool, task } = ctx;
  const dispensaryId = task.dispensary_id;

  if (!dispensaryId) {
    return { success: false, error: 'No dispensary_id specified for product_resync task' };
    return { success: false, error: 'No dispensary_id specified for product_refresh task' };
  }

  try {
@@ -17,7 +17,7 @@ export {
export { TaskWorker, TaskContext, TaskResult } from './task-worker';

export {
  handleProductResync,
  handleProductRefresh,
  handleProductDiscovery,
  handleStoreDiscovery,
  handleEntryPointDiscovery,
backend/src/tasks/start-pod.ts (new file, 93 lines)
@@ -0,0 +1,93 @@
#!/usr/bin/env npx tsx
/**
 * Start Pod - Simulates a Kubernetes pod locally
 *
 * Starts 5 workers with a pod name from the predefined list.
 *
 * Usage:
 *   npx tsx src/tasks/start-pod.ts <pod-index>
 *   npx tsx src/tasks/start-pod.ts 0   # Starts pod "Aethelgard" with 5 workers
 *   npx tsx src/tasks/start-pod.ts 1   # Starts pod "Xylos" with 5 workers
 */

import { spawn } from 'child_process';
import path from 'path';

const POD_NAMES = [
  'Aethelgard',
  'Xylos',
  'Kryll',
  'Coriolis',
  'Dimidium',
  'Veridia',
  'Zetani',
  'Talos IV',
  'Onyx',
  'Celestia',
  'Gormand',
  'Betha',
  'Ragnar',
  'Syphon',
  'Axiom',
  'Nadir',
  'Terra Nova',
  'Acheron',
  'Nexus',
  'Vespera',
  'Helios Prime',
  'Oasis',
  'Mordina',
  'Cygnus',
  'Umbra',
];

const WORKERS_PER_POD = 5;

async function main() {
  const podIndex = parseInt(process.argv[2] ?? '0', 10);

  if (podIndex < 0 || podIndex >= POD_NAMES.length) {
    console.error(`Invalid pod index: ${podIndex}. Must be 0-${POD_NAMES.length - 1}`);
    process.exit(1);
  }

  const podName = POD_NAMES[podIndex];
  console.log(`[Pod] Starting pod "${podName}" with ${WORKERS_PER_POD} workers...`);

  const workerScript = path.join(__dirname, 'task-worker.ts');
  const workers: ReturnType<typeof spawn>[] = [];

  for (let i = 1; i <= WORKERS_PER_POD; i++) {
    const workerId = `${podName}-worker-${i}`;

    const worker = spawn('npx', ['tsx', workerScript], {
      env: {
        ...process.env,
        WORKER_ID: workerId,
        POD_NAME: podName,
      },
      stdio: 'inherit',
    });

    workers.push(worker);
    console.log(`[Pod] Started worker ${i}/${WORKERS_PER_POD}: ${workerId}`);
  }

  // Handle shutdown
  const shutdown = () => {
    console.log(`\n[Pod] Shutting down pod "${podName}"...`);
    workers.forEach(w => w.kill('SIGTERM'));
    setTimeout(() => process.exit(0), 2000);
  };

  process.on('SIGTERM', shutdown);
  process.on('SIGINT', shutdown);

  // Keep the process alive
  await new Promise(() => {});
}

main().catch(err => {
  console.error('[Pod] Fatal error:', err);
  process.exit(1);
});
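start-pod.ts takes the pod index as an explicit argument. When the same worker image runs in a cluster, the ordinal could plausibly be derived from the pod hostname instead; the helper below is a hypothetical sketch of that mapping and is not part of this commit (the hostname format is an assumption).

```typescript
// Hypothetical helper: derive the pod index from an ordinal-suffixed hostname
// (e.g. "scraper-worker-3" -> 3) so the same POD_NAMES table could be reused in-cluster.
function podIndexFromHostname(hostname: string, fallback = 0): number {
  const match = hostname.match(/-(\d+)$/);
  return match ? parseInt(match[1], 10) : fallback;
}

// podIndexFromHostname('scraper-worker-3') === 3
// podIndexFromHostname('localhost') === 0   (falls back when there is no ordinal)
```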
@@ -14,7 +14,7 @@ export type TaskRole =
  | 'store_discovery'
  | 'entry_point_discovery'
  | 'product_discovery'
  | 'product_resync'
  | 'product_refresh'
  | 'analytics_refresh';

export type TaskStatus =
@@ -29,6 +29,8 @@ export interface WorkerTask {
  id: number;
  role: TaskRole;
  dispensary_id: number | null;
  dispensary_name?: string; // JOINed from dispensaries
  dispensary_slug?: string; // JOINed from dispensaries
  platform: string | null;
  status: TaskStatus;
  priority: number;
@@ -128,13 +130,42 @@ class TaskService {

  /**
   * Claim a task atomically for a worker
   * Uses the SQL function for proper locking
   * If role is null, claims ANY available task (role-agnostic worker)
   */
  async claimTask(role: TaskRole, workerId: string): Promise<WorkerTask | null> {
    const result = await pool.query(
      `SELECT * FROM claim_task($1, $2)`,
      [role, workerId]
    );
  async claimTask(role: TaskRole | null, workerId: string): Promise<WorkerTask | null> {
    if (role) {
      // Role-specific claiming - use the SQL function
      const result = await pool.query(
        `SELECT * FROM claim_task($1, $2)`,
        [role, workerId]
      );
      return (result.rows[0] as WorkerTask) || null;
    }

    // Role-agnostic claiming - claim ANY pending task
    const result = await pool.query(`
      UPDATE worker_tasks
      SET
        status = 'claimed',
        worker_id = $1,
        claimed_at = NOW()
      WHERE id = (
        SELECT id FROM worker_tasks
        WHERE status = 'pending'
          AND (scheduled_for IS NULL OR scheduled_for <= NOW())
          -- Exclude stores that already have an active task
          AND (dispensary_id IS NULL OR dispensary_id NOT IN (
            SELECT dispensary_id FROM worker_tasks
            WHERE status IN ('claimed', 'running')
              AND dispensary_id IS NOT NULL
          ))
        ORDER BY priority DESC, created_at ASC
        LIMIT 1
        FOR UPDATE SKIP LOCKED
      )
      RETURNING *
    `, [workerId]);

    return (result.rows[0] as WorkerTask) || null;
  }

@@ -206,27 +237,27 @@ class TaskService {
    let paramIndex = 1;

    if (filter.role) {
      conditions.push(`role = $${paramIndex++}`);
      conditions.push(`t.role = $${paramIndex++}`);
      params.push(filter.role);
    }

    if (filter.status) {
      if (Array.isArray(filter.status)) {
        conditions.push(`status = ANY($${paramIndex++})`);
        conditions.push(`t.status = ANY($${paramIndex++})`);
        params.push(filter.status);
      } else {
        conditions.push(`status = $${paramIndex++}`);
        conditions.push(`t.status = $${paramIndex++}`);
        params.push(filter.status);
      }
    }

    if (filter.dispensary_id) {
      conditions.push(`dispensary_id = $${paramIndex++}`);
      conditions.push(`t.dispensary_id = $${paramIndex++}`);
      params.push(filter.dispensary_id);
    }

    if (filter.worker_id) {
      conditions.push(`worker_id = $${paramIndex++}`);
      conditions.push(`t.worker_id = $${paramIndex++}`);
      params.push(filter.worker_id);
    }

@@ -235,9 +266,14 @@ class TaskService {
    const offset = filter.offset ?? 0;

    const result = await pool.query(
      `SELECT * FROM worker_tasks
      `SELECT
        t.*,
        d.name as dispensary_name,
        d.slug as dispensary_slug
      FROM worker_tasks t
      LEFT JOIN dispensaries d ON d.id = t.dispensary_id
      ${whereClause}
      ORDER BY created_at DESC
      ORDER BY t.created_at DESC
      LIMIT ${limit} OFFSET ${offset}`,
      params
    );
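The role-agnostic claim above relies on `FOR UPDATE SKIP LOCKED` in the inner SELECT: a worker that finds the top-priority row already locked by another transaction simply skips to the next one, so two workers polling at the same moment never claim the same task. A small sketch of that guarantee, using `taskService` from the file above (the worker IDs are made up for the example):

```typescript
// Two concurrent role-agnostic claims: each locks a different pending row,
// so a double claim is impossible. Either result may be null if the queue is short.
import { taskService } from './task-service';

async function demoConcurrentClaims(): Promise<void> {
  const [a, b] = await Promise.all([
    taskService.claimTask(null, 'worker-demo-a'), // null role = any pending task
    taskService.claimTask(null, 'worker-demo-b'),
  ]);

  if (a && b && a.id === b.id) {
    throw new Error('double claim - should be impossible with FOR UPDATE SKIP LOCKED');
  }
  console.log('claimed:', a?.id ?? 'none', b?.id ?? 'none');
}
```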
@@ -1,26 +1,58 @@
/**
 * Task Worker
 *
 * A unified worker that processes tasks from the worker_tasks queue.
 * Replaces the fragmented job systems (job_schedules, dispensary_crawl_jobs, etc.)
 * A unified worker that pulls tasks from the worker_tasks queue.
 * Workers register on startup, get a friendly name, and pull tasks.
 *
 * Architecture:
 * - Tasks are generated on schedule (by scheduler or API)
 * - Workers PULL tasks from the pool (not assigned to them)
 * - Tasks are claimed in order of priority (DESC) then creation time (ASC)
 * - Workers report heartbeats to worker_registry
 * - Workers are ROLE-AGNOSTIC by default (can handle any task type)
 *
 * Stealth & Anti-Detection:
 * PROXIES ARE REQUIRED - workers will fail to start if no proxies available.
 *
 * On startup, workers initialize the CrawlRotator which provides:
 * - Proxy rotation: Loads proxies from `proxies` table, ALL requests use proxy
 * - User-Agent rotation: Cycles through realistic browser fingerprints
 * - Fingerprint rotation: Changes browser profile on blocks
 * - Locale/timezone: Matches Accept-Language to target state
 *
 * The CrawlRotator is wired to the Dutchie client via setCrawlRotator().
 * Task handlers call startSession() which picks a random fingerprint.
 * On 403 errors, the client automatically:
 * 1. Records failure on current proxy
 * 2. Rotates to next proxy
 * 3. Rotates fingerprint
 * 4. Retries the request
 *
 * Usage:
 *   WORKER_ROLE=product_resync npx tsx src/tasks/task-worker.ts
 *   npx tsx src/tasks/task-worker.ts                              # Role-agnostic (any task)
 *   WORKER_ROLE=product_refresh npx tsx src/tasks/task-worker.ts  # Role-specific
 *
 * Environment:
 *   WORKER_ROLE - Which task role to process (required)
 *   WORKER_ID - Optional custom worker ID
 *   WORKER_ROLE - Which task role to process (optional, null = any task)
 *   WORKER_ID - Optional custom worker ID (auto-generated if not provided)
 *   POD_NAME - Kubernetes pod name (optional)
 *   POLL_INTERVAL_MS - How often to check for tasks (default: 5000)
 *   HEARTBEAT_INTERVAL_MS - How often to update heartbeat (default: 30000)
 *   API_BASE_URL - Backend API URL for registration (default: http://localhost:3010)
 */
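The 403 flow described in the header comment lives inside the Dutchie client, not in this file. The sketch below restates it as plain retry logic; the `RotatorLike` interface and its method names are placeholders, not the actual CrawlRotator API, and only the ordering (fail current proxy, new proxy, new fingerprint, retry) is taken from the comment above.

```typescript
// Restates the 403 handling described in the header comment. The rotator interface
// here is hypothetical; only the order of operations comes from the comment.
interface RotatorLike {
  recordProxyFailure(): Promise<void>; // hypothetical method name
  nextProxy(): Promise<void>;          // hypothetical method name
  nextFingerprint(): void;             // hypothetical method name
}

async function fetchWithRotation<T>(
  rotator: RotatorLike,
  doRequest: () => Promise<T>,
  maxAttempts = 3
): Promise<T> {
  for (let attempt = 1; ; attempt++) {
    try {
      return await doRequest();
    } catch (err: any) {
      const blocked = err?.status === 403 || /403/.test(String(err?.message));
      if (!blocked || attempt >= maxAttempts) throw err;
      await rotator.recordProxyFailure(); // 1. mark the current proxy as failed
      await rotator.nextProxy();          // 2. rotate to the next proxy
      rotator.nextFingerprint();          // 3. rotate the browser fingerprint
      // 4. loop around and retry the request
    }
  }
}
```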
import { Pool } from 'pg';
import { v4 as uuidv4 } from 'uuid';
import { taskService, TaskRole, WorkerTask } from './task-service';
import { getPool } from '../db/pool';
import os from 'os';

// Stealth/rotation support
import { CrawlRotator } from '../services/crawl-rotator';
import { setCrawlRotator } from '../platforms/dutchie';

// Task handlers by role
import { handleProductResync } from './handlers/product-resync';
import { handleProductRefresh } from './handlers/product-refresh';
import { handleProductDiscovery } from './handlers/product-discovery';
import { handleStoreDiscovery } from './handlers/store-discovery';
import { handleEntryPointDiscovery } from './handlers/entry-point-discovery';
@@ -28,6 +60,7 @@ import { handleAnalyticsRefresh } from './handlers/analytics-refresh';

const POLL_INTERVAL_MS = parseInt(process.env.POLL_INTERVAL_MS || '5000');
const HEARTBEAT_INTERVAL_MS = parseInt(process.env.HEARTBEAT_INTERVAL_MS || '30000');
const API_BASE_URL = process.env.API_BASE_URL || 'http://localhost:3010';

export interface TaskContext {
  pool: Pool;
@@ -48,7 +81,7 @@ export interface TaskResult {
type TaskHandler = (ctx: TaskContext) => Promise<TaskResult>;

const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
  product_resync: handleProductResync,
  product_refresh: handleProductRefresh,
  product_discovery: handleProductDiscovery,
  store_discovery: handleStoreDiscovery,
  entry_point_discovery: handleEntryPointDiscovery,
@@ -58,15 +91,160 @@ const TASK_HANDLERS: Record<TaskRole, TaskHandler> = {
export class TaskWorker {
  private pool: Pool;
  private workerId: string;
  private role: TaskRole;
  private role: TaskRole | null; // null = role-agnostic (any task)
  private friendlyName: string = '';
  private isRunning: boolean = false;
  private heartbeatInterval: NodeJS.Timeout | null = null;
  private registryHeartbeatInterval: NodeJS.Timeout | null = null;
  private currentTask: WorkerTask | null = null;
  private crawlRotator: CrawlRotator;

  constructor(role: TaskRole, workerId?: string) {
  constructor(role: TaskRole | null = null, workerId?: string) {
    this.pool = getPool();
    this.role = role;
    this.workerId = workerId || `worker-${role}-${uuidv4().slice(0, 8)}`;
    this.workerId = workerId || `worker-${uuidv4().slice(0, 8)}`;
    this.crawlRotator = new CrawlRotator(this.pool);
  }

  /**
   * Initialize stealth systems (proxy rotation, fingerprints)
   * Called once on worker startup before processing any tasks.
   *
   * IMPORTANT: Proxies are REQUIRED. Workers will fail to start if no proxies available.
   */
  private async initializeStealth(): Promise<void> {
    // Load proxies from database
    await this.crawlRotator.initialize();

    const stats = this.crawlRotator.proxy.getStats();
    if (stats.activeProxies === 0) {
      throw new Error('No active proxies available. Workers MUST use proxies for all requests. Add proxies to the database before starting workers.');
    }

    console.log(`[TaskWorker] Loaded ${stats.activeProxies} proxies (${stats.avgSuccessRate.toFixed(1)}% avg success rate)`);

    // Wire rotator to Dutchie client - proxies will be used for ALL requests
    setCrawlRotator(this.crawlRotator);

    console.log(`[TaskWorker] Stealth initialized: ${this.crawlRotator.userAgent.getCount()} fingerprints, proxy REQUIRED for all requests`);
  }

  /**
   * Register worker with the registry (get friendly name)
   */
  private async register(): Promise<void> {
    try {
      const response = await fetch(`${API_BASE_URL}/api/worker-registry/register`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          role: this.role,
          worker_id: this.workerId,
          pod_name: process.env.POD_NAME || process.env.HOSTNAME,
          hostname: os.hostname(),
          metadata: {
            pid: process.pid,
            node_version: process.version,
            started_at: new Date().toISOString()
          }
        })
      });

      const data = await response.json();
      if (data.success) {
        this.friendlyName = data.friendly_name;
        console.log(`[TaskWorker] ${data.message}`);
      } else {
        console.warn(`[TaskWorker] Registration warning: ${data.error}`);
        this.friendlyName = this.workerId.slice(0, 12);
      }
    } catch (error: any) {
      // Registration is optional - worker can still function without it
      console.warn(`[TaskWorker] Could not register with API (will continue): ${error.message}`);
      this.friendlyName = this.workerId.slice(0, 12);
    }
  }

  /**
   * Deregister worker from the registry
   */
  private async deregister(): Promise<void> {
    try {
      await fetch(`${API_BASE_URL}/api/worker-registry/deregister`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ worker_id: this.workerId })
      });
      console.log(`[TaskWorker] ${this.friendlyName} signed off`);
    } catch {
      // Ignore deregistration errors
    }
  }

  /**
   * Send heartbeat to registry with resource usage
   */
  private async sendRegistryHeartbeat(): Promise<void> {
    try {
      const memUsage = process.memoryUsage();
      const cpuUsage = process.cpuUsage();

      await fetch(`${API_BASE_URL}/api/worker-registry/heartbeat`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          worker_id: this.workerId,
          current_task_id: this.currentTask?.id || null,
          status: this.currentTask ? 'active' : 'idle',
          resources: {
            memory_mb: Math.round(memUsage.heapUsed / 1024 / 1024),
            memory_total_mb: Math.round(memUsage.heapTotal / 1024 / 1024),
            memory_rss_mb: Math.round(memUsage.rss / 1024 / 1024),
            cpu_user_ms: Math.round(cpuUsage.user / 1000),
            cpu_system_ms: Math.round(cpuUsage.system / 1000),
          }
        })
      });
    } catch {
      // Ignore heartbeat errors
    }
  }

  /**
   * Report task completion to registry
   */
  private async reportTaskCompletion(success: boolean): Promise<void> {
    try {
      await fetch(`${API_BASE_URL}/api/worker-registry/task-completed`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          worker_id: this.workerId,
          success
        })
      });
    } catch {
      // Ignore errors
    }
  }

  /**
   * Start registry heartbeat interval
   */
  private startRegistryHeartbeat(): void {
    this.registryHeartbeatInterval = setInterval(async () => {
      await this.sendRegistryHeartbeat();
    }, HEARTBEAT_INTERVAL_MS);
  }

  /**
   * Stop registry heartbeat interval
   */
  private stopRegistryHeartbeat(): void {
    if (this.registryHeartbeatInterval) {
      clearInterval(this.registryHeartbeatInterval);
      this.registryHeartbeatInterval = null;
    }
  }

  /**
@@ -74,7 +252,18 @@ export class TaskWorker {
   */
  async start(): Promise<void> {
    this.isRunning = true;
    console.log(`[TaskWorker] Starting worker ${this.workerId} for role: ${this.role}`);

    // Initialize stealth systems (proxy rotation, fingerprints)
    await this.initializeStealth();

    // Register with the API to get a friendly name
    await this.register();

    // Start registry heartbeat
    this.startRegistryHeartbeat();

    const roleMsg = this.role ? `for role: ${this.role}` : '(role-agnostic - any task)';
    console.log(`[TaskWorker] ${this.friendlyName} starting ${roleMsg}`);

    while (this.isRunning) {
      try {
@@ -91,10 +280,12 @@ export class TaskWorker {
  /**
   * Stop the worker
   */
  stop(): void {
  async stop(): Promise<void> {
    this.isRunning = false;
    this.stopHeartbeat();
    console.log(`[TaskWorker] Stopping worker ${this.workerId}...`);
    this.stopRegistryHeartbeat();
    await this.deregister();
    console.log(`[TaskWorker] ${this.friendlyName} stopped`);
  }

  /**
@@ -142,7 +333,8 @@ export class TaskWorker {
      if (result.success) {
        // Mark as completed
        await taskService.completeTask(task.id, result);
        console.log(`[TaskWorker] Task ${task.id} completed successfully`);
        await this.reportTaskCompletion(true);
        console.log(`[TaskWorker] ${this.friendlyName} completed task ${task.id}`);

        // Chain next task if applicable
        const chainedTask = await taskService.chainNextTask({
@@ -156,12 +348,14 @@ export class TaskWorker {
      } else {
        // Mark as failed
        await taskService.failTask(task.id, result.error || 'Unknown error');
        console.log(`[TaskWorker] Task ${task.id} failed: ${result.error}`);
        await this.reportTaskCompletion(false);
        console.log(`[TaskWorker] ${this.friendlyName} failed task ${task.id}: ${result.error}`);
      }
    } catch (error: any) {
      // Mark as failed
      await taskService.failTask(task.id, error.message);
      console.error(`[TaskWorker] Task ${task.id} threw error:`, error.message);
      await this.reportTaskCompletion(false);
      console.error(`[TaskWorker] ${this.friendlyName} task ${task.id} error:`, error.message);
    } finally {
      this.stopHeartbeat();
      this.currentTask = null;
@@ -201,7 +395,7 @@ export class TaskWorker {
  /**
   * Get worker info
   */
  getInfo(): { workerId: string; role: TaskRole; isRunning: boolean; currentTaskId: number | null } {
  getInfo(): { workerId: string; role: TaskRole | null; isRunning: boolean; currentTaskId: number | null } {
    return {
      workerId: this.workerId,
      role: this.role,
@@ -216,30 +410,27 @@ export class TaskWorker {

// ============================================================

async function main(): Promise<void> {
  const role = process.env.WORKER_ROLE as TaskRole;

  if (!role) {
    console.error('Error: WORKER_ROLE environment variable is required');
    console.error('Valid roles: store_discovery, entry_point_discovery, product_discovery, product_resync, analytics_refresh');
    process.exit(1);
  }
  const role = process.env.WORKER_ROLE as TaskRole | undefined;

  const validRoles: TaskRole[] = [
    'store_discovery',
    'entry_point_discovery',
    'product_discovery',
    'product_resync',
    'product_refresh',
    'analytics_refresh',
  ];

  if (!validRoles.includes(role)) {
  // If role specified, validate it
  if (role && !validRoles.includes(role)) {
    console.error(`Error: Invalid WORKER_ROLE: ${role}`);
    console.error(`Valid roles: ${validRoles.join(', ')}`);
    console.error('Or omit WORKER_ROLE for role-agnostic worker (any task)');
    process.exit(1);
  }

  const workerId = process.env.WORKER_ID;
  const worker = new TaskWorker(role, workerId);
  // Pass null for role-agnostic, or the specific role
  const worker = new TaskWorker(role || null, workerId);

  // Handle graceful shutdown
  process.on('SIGTERM', () => {
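The hunk is truncated at the SIGTERM handler. Since stop() is now async (it deregisters and awaits cleanup), the handlers presumably await it before exiting; a sketch of that wiring, not the committed code:

```typescript
// Sketch only: graceful shutdown that awaits the now-async stop() before exiting.
const shutdown = async (signal: string) => {
  console.log(`[TaskWorker] Received ${signal}, shutting down...`);
  await worker.stop(); // stops heartbeats and deregisters from the registry
  process.exit(0);
};

process.on('SIGTERM', () => void shutdown('SIGTERM'));
process.on('SIGINT', () => void shutdown('SIGINT'));
```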