Major changes: - Split crawl into payload_fetch (API → disk) and product_refresh (disk → DB) - Add task chaining: store_discovery → product_discovery → payload_fetch → product_refresh - Add payload storage utilities for gzipped JSON on filesystem - Add /api/payloads endpoints for payload access and diffing - Add DB-driven TaskScheduler with schedule persistence - Track newDispensaryIds through discovery promotion for chaining - Add stealth improvements: HTTP fingerprinting, proxy rotation enhancements - Add Workers dashboard K8s scaling controls New files: - src/tasks/handlers/payload-fetch.ts - Fetches from API, saves to disk - src/services/task-scheduler.ts - DB-driven schedule management - src/utils/payload-storage.ts - Payload save/load utilities - src/routes/payloads.ts - Payload API endpoints - src/services/http-fingerprint.ts - Browser fingerprint generation - docs/TASK_WORKFLOW_2024-12-10.md - Complete workflow documentation Migrations: - 078: Proxy consecutive 403 tracking - 079: task_schedules table - 080: raw_crawl_payloads table - 081: payload column and last_fetch_at 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
335 lines
9.0 KiB
TypeScript
335 lines
9.0 KiB
TypeScript
/**
 * Payload Routes
 *
 * Per TASK_WORKFLOW_2024-12-10.md: API access to raw crawl payloads.
 *
 * Endpoints:
 * - GET /api/payloads                               - List payload metadata (paginated)
 * - GET /api/payloads/:id                           - Get payload metadata by ID
 * - GET /api/payloads/:id/data                      - Get full payload JSON
 * - GET /api/payloads/store/:dispensaryId           - List payloads for a store
 * - GET /api/payloads/store/:dispensaryId/latest    - Get latest payload for a store
 * - GET /api/payloads/store/:dispensaryId/diff      - Diff two payloads
 */
|
|
|
|
import { Router, Request, Response } from 'express';
|
|
import { getPool } from '../db/pool';
|
|
import {
|
|
loadRawPayloadById,
|
|
getLatestPayload,
|
|
getRecentPayloads,
|
|
listPayloadMetadata,
|
|
} from '../utils/payload-storage';
|
|
import { Pool } from 'pg';
|
|
|
|
const router = Router();
|
|
|
|
/**
 * Returns the shared pool from `getPool()`, re-typed as a `pg` `Pool` so it can
 * be handed to the payload-storage helpers and used for direct queries.
 */
function getDbPool(): Pool {
  const sharedPool = getPool();
  return sharedPool as unknown as Pool;
}
|
|
|
|
/**
 * GET /api/payloads
 * List payload metadata (paginated).
 *
 * Query params:
 * - limit: page size; falls back to 50 when missing, non-numeric or < 1, capped at 100
 * - offset: rows to skip; falls back to 0 when missing, non-numeric or negative
 * - dispensary_id: optional store filter; responds 400 when supplied but non-numeric
 *
 * Responds with `{ success, payloads, pagination: { limit, offset } }`,
 * or `{ success: false, error }` with status 400/500.
 */
router.get('/', async (req: Request, res: Response) => {
  try {
    // NaN > 0 is false, so non-numeric and non-positive values both take the default.
    const requestedLimit = parseInt(req.query.limit as string, 10);
    const limit = requestedLimit > 0 ? Math.min(requestedLimit, 100) : 50;

    // Negative offsets are clamped to 0 instead of being forwarded to the query.
    const requestedOffset = parseInt(req.query.offset as string, 10);
    const offset = requestedOffset > 0 ? requestedOffset : 0;

    let dispensaryId: number | undefined;
    if (req.query.dispensary_id) {
      dispensaryId = parseInt(req.query.dispensary_id as string, 10);
      if (Number.isNaN(dispensaryId)) {
        return res.status(400).json({ success: false, error: 'Invalid dispensary_id' });
      }
    }

    const pool = getDbPool();
    const payloads = await listPayloadMetadata(pool, {
      dispensaryId,
      limit,
      offset,
    });

    res.json({
      success: true,
      payloads,
      pagination: { limit, offset },
    });
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    console.error('[Payloads] List error:', message);
    res.status(500).json({ success: false, error: message });
  }
});
|
|
|
|
/**
 * GET /api/payloads/:id
 * Get payload metadata by ID.
 *
 * Reads one row from `raw_crawl_payloads`, left-joined to `dispensaries` for
 * the store name. Responds with:
 * - 200 `{ success: true, payload }` when the row exists
 * - 400 when `:id` is not numeric
 * - 404 when no row matches
 * - 500 on any thrown error
 */
router.get('/:id', async (req: Request, res: Response) => {
  try {
    const id = parseInt(req.params.id, 10);
    // Reject non-numeric ids up front rather than binding NaN into the query.
    if (Number.isNaN(id)) {
      return res.status(400).json({ success: false, error: 'Invalid payload ID' });
    }

    const pool = getDbPool();
    const result = await pool.query(
      `
      SELECT
        p.id,
        p.dispensary_id,
        p.crawl_run_id,
        p.storage_path,
        p.product_count,
        p.size_bytes,
        p.size_bytes_raw,
        p.fetched_at,
        p.processed_at,
        p.checksum_sha256,
        d.name as dispensary_name
      FROM raw_crawl_payloads p
      LEFT JOIN dispensaries d ON d.id = p.dispensary_id
      WHERE p.id = $1
      `,
      [id]
    );

    if (result.rows.length === 0) {
      return res.status(404).json({ success: false, error: 'Payload not found' });
    }

    res.json({
      success: true,
      payload: result.rows[0],
    });
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    console.error('[Payloads] Get error:', message);
    res.status(500).json({ success: false, error: message });
  }
});
|
|
|
|
/**
 * GET /api/payloads/:id/data
 * Get full payload JSON for a payload ID.
 *
 * Delegates to `loadRawPayloadById` and returns its `metadata` and `payload`
 * fields. Responds with:
 * - 200 `{ success: true, metadata, data }` when the payload is found
 * - 400 when `:id` is not numeric
 * - 404 when the loader returns a falsy result
 * - 500 on any thrown error
 */
router.get('/:id/data', async (req: Request, res: Response) => {
  try {
    const id = parseInt(req.params.id, 10);
    // Reject non-numeric ids up front rather than passing NaN to the loader.
    if (Number.isNaN(id)) {
      return res.status(400).json({ success: false, error: 'Invalid payload ID' });
    }

    const pool = getDbPool();
    const result = await loadRawPayloadById(pool, id);

    if (!result) {
      return res.status(404).json({ success: false, error: 'Payload not found' });
    }

    res.json({
      success: true,
      metadata: result.metadata,
      data: result.payload,
    });
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    console.error('[Payloads] Get data error:', message);
    res.status(500).json({ success: false, error: message });
  }
});
|
|
|
|
/**
 * GET /api/payloads/store/:dispensaryId
 * List payloads for a specific store (paginated).
 *
 * Query params:
 * - limit: page size; falls back to 20 when missing, non-numeric or < 1, capped at 100
 * - offset: rows to skip; falls back to 0 when missing, non-numeric or negative
 *
 * Responds with `{ success, dispensaryId, payloads, pagination: { limit, offset } }`,
 * 400 when `:dispensaryId` is not numeric, or 500 on any thrown error.
 */
router.get('/store/:dispensaryId', async (req: Request, res: Response) => {
  try {
    const dispensaryId = parseInt(req.params.dispensaryId, 10);
    // Reject non-numeric ids up front rather than filtering on NaN.
    if (Number.isNaN(dispensaryId)) {
      return res.status(400).json({ success: false, error: 'Invalid dispensary ID' });
    }

    // NaN > 0 is false, so non-numeric and non-positive values both take the default.
    const requestedLimit = parseInt(req.query.limit as string, 10);
    const limit = requestedLimit > 0 ? Math.min(requestedLimit, 100) : 20;

    // Negative offsets are clamped to 0 instead of being forwarded to the query.
    const requestedOffset = parseInt(req.query.offset as string, 10);
    const offset = requestedOffset > 0 ? requestedOffset : 0;

    const pool = getDbPool();
    const payloads = await listPayloadMetadata(pool, {
      dispensaryId,
      limit,
      offset,
    });

    res.json({
      success: true,
      dispensaryId,
      payloads,
      pagination: { limit, offset },
    });
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    console.error('[Payloads] Store list error:', message);
    res.status(500).json({ success: false, error: message });
  }
});
|
|
|
|
/**
 * GET /api/payloads/store/:dispensaryId/latest
 * Get the latest payload for a store (with full data).
 *
 * Delegates to `getLatestPayload` and returns its `metadata` and `payload`
 * fields. Responds with:
 * - 200 `{ success: true, metadata, data }` when a payload exists
 * - 400 when `:dispensaryId` is not numeric
 * - 404 when the store has no payloads
 * - 500 on any thrown error
 */
router.get('/store/:dispensaryId/latest', async (req: Request, res: Response) => {
  try {
    const dispensaryId = parseInt(req.params.dispensaryId, 10);
    // Reject non-numeric ids up front; otherwise the 404 below would read "dispensary NaN".
    if (Number.isNaN(dispensaryId)) {
      return res.status(400).json({ success: false, error: 'Invalid dispensary ID' });
    }

    const pool = getDbPool();
    const result = await getLatestPayload(pool, dispensaryId);

    if (!result) {
      return res.status(404).json({
        success: false,
        error: `No payloads found for dispensary ${dispensaryId}`,
      });
    }

    res.json({
      success: true,
      metadata: result.metadata,
      data: result.payload,
    });
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    console.error('[Payloads] Latest error:', message);
    res.status(500).json({ success: false, error: message });
  }
});
|
|
|
|
// Raw products come straight from stored crawl JSON, so their shape is not statically known.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type PayloadProduct = Record<string, any>;

/** Indexes products by `_id` (falling back to `id`); products with neither are skipped. */
function indexProductsById(
  products: PayloadProduct[] | null | undefined
): Map<string, PayloadProduct> {
  const byId = new Map<string, PayloadProduct>();
  for (const product of products || []) {
    const id = product._id || product.id;
    if (id) byId.set(id, product);
  }
  return byId;
}

/** Builds the compact summary used for entries in the `added` and `removed` lists. */
function summarizeProduct(id: string, product: PayloadProduct) {
  return {
    id,
    name: product.name,
    brand: product.brand?.name,
    price: product.Prices?.[0]?.price,
  };
}

/** Compares two product indexes and returns added, removed, price-changed and status-changed entries. */
function diffProductMaps(
  fromProducts: Map<string, PayloadProduct>,
  toProducts: Map<string, PayloadProduct>
) {
  const added: PayloadProduct[] = [];
  const removed: PayloadProduct[] = [];
  const priceChanges: PayloadProduct[] = [];
  const stockChanges: PayloadProduct[] = [];

  for (const [id, toProduct] of toProducts) {
    const fromProduct = fromProducts.get(id);

    // Present in "to" but not in "from" = added.
    if (!fromProduct) {
      added.push(summarizeProduct(id, toProduct));
      continue;
    }

    // Present in both: compare the first price entry and the status field.
    const fromPrice = fromProduct.Prices?.[0]?.price;
    const toPrice = toProduct.Prices?.[0]?.price;
    if (fromPrice !== toPrice) {
      priceChanges.push({
        id,
        name: toProduct.name,
        brand: toProduct.brand?.name,
        oldPrice: fromPrice,
        newPrice: toPrice,
        // Null-check rather than truthiness so a price of 0 still yields a numeric delta.
        change: toPrice != null && fromPrice != null ? toPrice - fromPrice : null,
      });
    }

    const fromStock = fromProduct.Status || fromProduct.status;
    const toStock = toProduct.Status || toProduct.status;
    if (fromStock !== toStock) {
      stockChanges.push({
        id,
        name: toProduct.name,
        brand: toProduct.brand?.name,
        oldStatus: fromStock,
        newStatus: toStock,
      });
    }
  }

  // Present in "from" but not in "to" = removed.
  for (const [id, fromProduct] of fromProducts) {
    if (!toProducts.has(id)) {
      removed.push(summarizeProduct(id, fromProduct));
    }
  }

  return { added, removed, priceChanges, stockChanges };
}

/**
 * GET /api/payloads/store/:dispensaryId/diff
 * Compare two payloads for a store.
 *
 * Query params:
 * - from: payload ID (older)
 * - to: payload ID (newer) - optional, defaults to the store's latest payload
 *
 * Payload selection:
 * - `from` and `to` given: both are loaded by ID
 * - only `from` given: `from` is loaded by ID and compared against the latest payload
 * - `from` omitted: the store's two most recent payloads are compared (`to` is ignored)
 *
 * Responds with `{ success, from, to, diff: <counts>, details: <entries> }`, or:
 * - 400 when an ID is not a positive integer, or fewer than 2 payloads exist for the fallback
 * - 404 when a requested payload cannot be loaded
 * - 500 on any thrown error
 *
 * NOTE(review): payloads loaded by explicit ID are not checked against
 * `:dispensaryId` - confirm whether cross-store diffs should be rejected.
 */
router.get('/store/:dispensaryId/diff', async (req: Request, res: Response) => {
  try {
    const dispensaryId = parseInt(req.params.dispensaryId, 10);
    if (Number.isNaN(dispensaryId)) {
      return res.status(400).json({ success: false, error: 'Invalid dispensary ID' });
    }

    const fromId = req.query.from ? parseInt(req.query.from as string, 10) : undefined;
    const toId = req.query.to ? parseInt(req.query.to as string, 10) : undefined;

    // `!(x > 0)` also catches NaN, so malformed IDs get a 400 rather than a silent fallback.
    if ((fromId !== undefined && !(fromId > 0)) || (toId !== undefined && !(toId > 0))) {
      return res.status(400).json({
        success: false,
        error: "Query params 'from' and 'to' must be positive integers",
      });
    }

    const pool = getDbPool();

    // Loader return types live in payload-storage and are not visible here.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    let fromPayload: any;
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    let toPayload: any;

    if (fromId !== undefined) {
      // Explicit "from"; "to" is either explicit or defaults to the store's latest payload.
      [fromPayload, toPayload] = await Promise.all([
        loadRawPayloadById(pool, fromId),
        toId !== undefined
          ? loadRawPayloadById(pool, toId)
          : getLatestPayload(pool, dispensaryId),
      ]);
    } else {
      // No "from": compare the two most recent payloads.
      const recent = await getRecentPayloads(pool, dispensaryId, 2);
      if (recent.length < 2) {
        return res.status(400).json({
          success: false,
          error: 'Need at least 2 payloads to diff. Only found ' + recent.length,
        });
      }
      toPayload = recent[0]; // Most recent
      fromPayload = recent[1]; // Previous
    }

    if (!fromPayload || !toPayload) {
      return res.status(404).json({ success: false, error: 'One or both payloads not found' });
    }

    const { added, removed, priceChanges, stockChanges } = diffProductMaps(
      indexProductsById(fromPayload.payload.products),
      indexProductsById(toPayload.payload.products)
    );

    res.json({
      success: true,
      from: {
        id: fromPayload.metadata.id,
        fetchedAt: fromPayload.metadata.fetchedAt,
        productCount: fromPayload.metadata.productCount,
      },
      to: {
        id: toPayload.metadata.id,
        fetchedAt: toPayload.metadata.fetchedAt,
        productCount: toPayload.metadata.productCount,
      },
      diff: {
        added: added.length,
        removed: removed.length,
        priceChanges: priceChanges.length,
        stockChanges: stockChanges.length,
      },
      details: {
        added,
        removed,
        priceChanges,
        stockChanges,
      },
    });
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    console.error('[Payloads] Diff error:', message);
    res.status(500).json({ success: false, error: message });
  }
});
|
|
|
|
export default router;
|