- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser
- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md
- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
415 lines
10 KiB
TypeScript
415 lines
10 KiB
TypeScript
/**
 * CrawlerLogger - Structured logging for crawler operations
 *
 * High-signal, low-noise logging with JSON output for:
 * - Job lifecycle (one summary per job)
 * - Provider/mode changes
 * - Sandbox events
 * - Queue failures
 *
 * NO per-product logging - that's too noisy.
 */

/**
 * Severity attached to every log payload.
 *
 * The logger in this file switches on this value to pick the console method
 * used to print the entry.
 */
export type LogLevel = 'info' | 'warn' | 'error' | 'debug';
/**
 * Names accepted in the `event` field of a log payload.
 *
 * NOTE(review): 'job_cancelled' and 'crawl_batch' are listed here, but no
 * payload interface in this file uses them — confirm whether they are
 * still needed.
 */
export type LogEvent =
  | 'job_started'
  | 'job_completed'
  | 'job_failed'
  | 'job_cancelled'
  | 'provider_detected'
  | 'provider_changed'
  | 'mode_changed'
  | 'sandbox_started'
  | 'sandbox_completed'
  | 'sandbox_failed'
  | 'queue_failure'
  | 'detection_scan'
  | 'crawl_batch'
  | 'intelligence_run';
/**
 * Fields shared by every log payload in this file.
 *
 * `timestamp`, `level` and `event` are required; the identifier and context
 * fields are optional and are set only by the events that have them.
 */
interface BaseLogPayload {
  /** Time of the entry; the logger fills this with `new Date().toISOString()`. */
  timestamp: string;
  level: LogLevel;
  event: LogEvent;
  dispensary_id?: number;
  store_id?: number;
  job_id?: number;
  provider?: string;
  category?: 'product' | 'specials' | 'brand' | 'metadata';
}
/** Payload for the 'job_started' event: the base fields plus job and store descriptors. */
interface JobStartedPayload extends BaseLogPayload {
  event: 'job_started';
  job_type: string;
  trigger_type: string;
  store_name: string;
}
/** Payload for the 'job_completed' event: the base fields plus duration and product counters. */
interface JobCompletedPayload extends BaseLogPayload {
  event: 'job_completed';
  store_name: string;
  duration_ms: number;
  products_found: number;
  products_new: number;
  products_updated: number;
  /** Optional; omitted from the JSON output when not provided. */
  products_marked_oos?: number;
}
/** Payload for the 'job_failed' event: the base fields plus duration and error details. */
interface JobFailedPayload extends BaseLogPayload {
  event: 'job_failed';
  store_name: string;
  duration_ms: number;
  error_message: string;
  error_code?: string;
}
/** Payload for the 'provider_detected' event: the base fields plus the detection result. */
interface ProviderDetectedPayload extends BaseLogPayload {
  event: 'provider_detected';
  dispensary_name: string;
  detected_provider: string;
  confidence: number;
  detection_method: string;
  menu_url?: string;
}
/** Payload for the 'provider_changed' event: the base fields plus old and new provider values. */
interface ProviderChangedPayload extends BaseLogPayload {
  event: 'provider_changed';
  dispensary_name: string;
  /** May be `null`; a `null` value is kept in the JSON output as `null`. */
  old_provider: string | null;
  new_provider: string;
  old_confidence: number;
  new_confidence: number;
}
/** Payload for the 'mode_changed' event: the base fields plus old mode, new mode and a reason. */
interface ModeChangedPayload extends BaseLogPayload {
  event: 'mode_changed';
  dispensary_name: string;
  old_mode: string;
  new_mode: string;
  reason: string;
}
/**
 * Payload shared by the three sandbox events
 * ('sandbox_started', 'sandbox_completed', 'sandbox_failed').
 *
 * Only the dispensary and template names are required; the result and error
 * fields are optional.
 */
interface SandboxEventPayload extends BaseLogPayload {
  event: 'sandbox_started' | 'sandbox_completed' | 'sandbox_failed';
  dispensary_name: string;
  template_name: string;
  quality_score?: number;
  products_extracted?: number;
  fields_missing?: number;
  error_message?: string;
}
/** Payload for the 'queue_failure' event: the base fields plus queue type and error details. */
interface QueueFailurePayload extends BaseLogPayload {
  event: 'queue_failure';
  queue_type: string;
  error_message: string;
  affected_items?: number;
}
/** Payload for the 'detection_scan' event: the base fields plus scan counters and duration. */
interface DetectionScanPayload extends BaseLogPayload {
  event: 'detection_scan';
  total_scanned: number;
  detected: number;
  failed: number;
  skipped: number;
  duration_ms: number;
}
/** Payload for the 'intelligence_run' event: the base fields plus run type, counters and duration. */
interface IntelligenceRunPayload extends BaseLogPayload {
  event: 'intelligence_run';
  run_type: 'detection' | 'production' | 'sandbox' | 'full';
  dispensaries_processed: number;
  jobs_queued: number;
  duration_ms: number;
}
/**
 * Union of every payload shape the logger accepts.
 *
 * Each member fixes its `event` field to one or more literal values, so
 * `event` can be used to tell the members apart.
 */
type LogPayload =
  | JobStartedPayload
  | JobCompletedPayload
  | JobFailedPayload
  | ProviderDetectedPayload
  | ProviderChangedPayload
  | ModeChangedPayload
  | SandboxEventPayload
  | QueueFailurePayload
  | DetectionScanPayload
  | IntelligenceRunPayload;
/**
 * Writes structured crawler log entries to the console.
 *
 * Every public method builds one payload, stamps it with the current time
 * (`new Date().toISOString()`) and a level, serializes it with
 * `JSON.stringify`, and prints it prefixed with `[CRAWLER] `.
 *
 * Optional fields that are `undefined` do not appear in the output, because
 * `JSON.stringify` omits undefined-valued properties.
 */
class CrawlerLoggerService {
  /** Serializes a payload to a JSON string. */
  private formatLog(payload: LogPayload): string {
    return JSON.stringify(payload);
  }

  /**
   * Prints the serialized payload with the console method matching its level:
   * `console.error`, `console.warn` or `console.debug`; any other level
   * (i.e. 'info') goes to `console.log`.
   */
  private log(payload: LogPayload): void {
    const formatted = this.formatLog(payload);

    switch (payload.level) {
      case 'error':
        console.error(`[CRAWLER] ${formatted}`);
        break;
      case 'warn':
        console.warn(`[CRAWLER] ${formatted}`);
        break;
      case 'debug':
        console.debug(`[CRAWLER] ${formatted}`);
        break;
      default:
        console.log(`[CRAWLER] ${formatted}`);
    }
  }

  /**
   * Log when a crawl job starts (level 'info', event 'job_started').
   */
  jobStarted(params: {
    job_id: number;
    store_id: number;
    store_name: string;
    job_type: string;
    trigger_type: string;
    provider?: string;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'job_started',
      job_id: params.job_id,
      store_id: params.store_id,
      store_name: params.store_name,
      job_type: params.job_type,
      trigger_type: params.trigger_type,
      provider: params.provider,
    });
  }

  /**
   * Log when a crawl job completes successfully
   * (level 'info', event 'job_completed').
   */
  jobCompleted(params: {
    job_id: number;
    store_id: number;
    store_name: string;
    duration_ms: number;
    products_found: number;
    products_new: number;
    products_updated: number;
    products_marked_oos?: number;
    provider?: string;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'job_completed',
      job_id: params.job_id,
      store_id: params.store_id,
      store_name: params.store_name,
      duration_ms: params.duration_ms,
      products_found: params.products_found,
      products_new: params.products_new,
      products_updated: params.products_updated,
      products_marked_oos: params.products_marked_oos,
      provider: params.provider,
    });
  }

  /**
   * Log when a crawl job fails (level 'error', event 'job_failed').
   */
  jobFailed(params: {
    job_id: number;
    store_id: number;
    store_name: string;
    duration_ms: number;
    error_message: string;
    error_code?: string;
    provider?: string;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'error',
      event: 'job_failed',
      job_id: params.job_id,
      store_id: params.store_id,
      store_name: params.store_name,
      duration_ms: params.duration_ms,
      error_message: params.error_message,
      error_code: params.error_code,
      provider: params.provider,
    });
  }

  /**
   * Log when a provider is detected for a dispensary
   * (level 'info', event 'provider_detected').
   */
  providerDetected(params: {
    dispensary_id: number;
    dispensary_name: string;
    detected_provider: string;
    confidence: number;
    detection_method: string;
    menu_url?: string;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'provider_detected',
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      detected_provider: params.detected_provider,
      confidence: params.confidence,
      detection_method: params.detection_method,
      menu_url: params.menu_url,
      category: params.category,
    });
  }

  /**
   * Log when a dispensary's provider changes
   * (level 'info', event 'provider_changed').
   */
  providerChanged(params: {
    dispensary_id: number;
    dispensary_name: string;
    old_provider: string | null;
    new_provider: string;
    old_confidence: number;
    new_confidence: number;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'provider_changed',
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      old_provider: params.old_provider,
      new_provider: params.new_provider,
      old_confidence: params.old_confidence,
      new_confidence: params.new_confidence,
      category: params.category,
    });
  }

  /**
   * Log when a dispensary's crawler mode changes (sandbox -> production, etc.)
   * (level 'info', event 'mode_changed').
   */
  modeChanged(params: {
    dispensary_id: number;
    dispensary_name: string;
    old_mode: string;
    new_mode: string;
    reason: string;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
    provider?: string;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'mode_changed',
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      old_mode: params.old_mode,
      new_mode: params.new_mode,
      reason: params.reason,
      category: params.category,
      provider: params.provider,
    });
  }

  /**
   * Log sandbox crawl events.
   *
   * 'sandbox_failed' is logged at level 'error'; 'sandbox_started' and
   * 'sandbox_completed' are logged at level 'info'.
   */
  sandboxEvent(params: {
    event: 'sandbox_started' | 'sandbox_completed' | 'sandbox_failed';
    dispensary_id: number;
    dispensary_name: string;
    template_name: string;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
    quality_score?: number;
    products_extracted?: number;
    fields_missing?: number;
    error_message?: string;
    provider?: string;
  }): void {
    const level: LogLevel = params.event === 'sandbox_failed' ? 'error' : 'info';
    this.log({
      timestamp: new Date().toISOString(),
      level,
      event: params.event,
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      template_name: params.template_name,
      category: params.category,
      quality_score: params.quality_score,
      products_extracted: params.products_extracted,
      fields_missing: params.fields_missing,
      error_message: params.error_message,
      provider: params.provider,
    });
  }

  /**
   * Log queue processing failures (level 'error', event 'queue_failure').
   */
  queueFailure(params: {
    queue_type: string;
    error_message: string;
    affected_items?: number;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'error',
      event: 'queue_failure',
      queue_type: params.queue_type,
      error_message: params.error_message,
      affected_items: params.affected_items,
    });
  }

  /**
   * Log detection scan summary (level 'info', event 'detection_scan').
   */
  detectionScan(params: {
    total_scanned: number;
    detected: number;
    failed: number;
    skipped: number;
    duration_ms: number;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'detection_scan',
      total_scanned: params.total_scanned,
      detected: params.detected,
      failed: params.failed,
      skipped: params.skipped,
      duration_ms: params.duration_ms,
    });
  }

  /**
   * Log intelligence run summary (level 'info', event 'intelligence_run').
   */
  intelligenceRun(params: {
    run_type: 'detection' | 'production' | 'sandbox' | 'full';
    dispensaries_processed: number;
    jobs_queued: number;
    duration_ms: number;
  }): void {
    this.log({
      timestamp: new Date().toISOString(),
      level: 'info',
      event: 'intelligence_run',
      run_type: params.run_type,
      dispensaries_processed: params.dispensaries_processed,
      jobs_queued: params.jobs_queued,
      duration_ms: params.duration_ms,
    });
  }
}
/**
 * Shared logger instance exported by this module.
 * The class declaration visible in this file is not itself exported.
 */
export const crawlerLogger = new CrawlerLoggerService();