Files
cannaiq/backend/src/_deprecated/services/crawler-logger.ts
Kelly a35976b9e9 chore: Clean up deprecated code and docs
- Move deprecated directories to src/_deprecated/:
  - hydration/ (old pipeline approach)
  - scraper-v2/ (old Puppeteer scraper)
  - canonical-hydration/ (merged into tasks)
  - Unused services: availability, crawler-logger, geolocation, etc
  - Unused utils: age-gate-playwright, HomepageValidator, stealthBrowser

- Archive outdated docs to docs/_archive/:
  - ANALYTICS_RUNBOOK.md
  - ANALYTICS_V2_EXAMPLES.md
  - BRAND_INTELLIGENCE_API.md
  - CRAWL_PIPELINE.md
  - TASK_WORKFLOW_2024-12-10.md
  - WORKER_TASK_ARCHITECTURE.md
  - ORGANIC_SCRAPING_GUIDE.md

- Add docs/CODEBASE_MAP.md as single source of truth
- Add warning files to deprecated/archived directories
- Slim down CLAUDE.md to essential rules only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 22:17:40 -07:00

415 lines
10 KiB
TypeScript

/**
* CrawlerLogger - Structured logging for crawler operations
*
* High-signal, low-noise logging with JSON output for:
* - Job lifecycle (one summary per job)
* - Provider/mode changes
* - Sandbox events
* - Queue failures
*
* NO per-product logging - that's too noisy.
*/
/** Severity attached to every entry; routed to the matching console method. */
export type LogLevel = 'info' | 'warn' | 'error' | 'debug';
/**
 * Event tags the crawler can emit: job lifecycle, provider/mode
 * transitions, sandbox runs, queue failures, and scan/run summaries.
 *
 * NOTE(review): 'job_cancelled' and 'crawl_batch' have no payload
 * interface or logger method in this file — confirm they are emitted
 * elsewhere or are dead members of this union.
 */
export type LogEvent =
| 'job_started'
| 'job_completed'
| 'job_failed'
| 'job_cancelled'
| 'provider_detected'
| 'provider_changed'
| 'mode_changed'
| 'sandbox_started'
| 'sandbox_completed'
| 'sandbox_failed'
| 'queue_failure'
| 'detection_scan'
| 'crawl_batch'
| 'intelligence_run';
/**
 * Fields common to every structured log entry. The optional IDs give
 * downstream log queries consistent join keys across event types.
 */
interface BaseLogPayload {
timestamp: string; // ISO-8601, stamped at emit time
level: LogLevel;
event: LogEvent;
dispensary_id?: number;
store_id?: number;
job_id?: number;
provider?: string;
category?: 'product' | 'specials' | 'brand' | 'metadata';
}
/** Emitted once when a crawl job begins. */
interface JobStartedPayload extends BaseLogPayload {
event: 'job_started';
job_type: string;
trigger_type: string;
store_name: string;
}
/** One summary entry per successfully finished job. */
interface JobCompletedPayload extends BaseLogPayload {
event: 'job_completed';
store_name: string;
duration_ms: number;
products_found: number;
products_new: number;
products_updated: number;
products_marked_oos?: number;
}
/** Terminal failure entry for a job; logged at 'error' level. */
interface JobFailedPayload extends BaseLogPayload {
event: 'job_failed';
store_name: string;
duration_ms: number;
error_message: string;
error_code?: string;
}
/** A menu provider was identified for a dispensary. */
interface ProviderDetectedPayload extends BaseLogPayload {
event: 'provider_detected';
dispensary_name: string;
detected_provider: string;
confidence: number;
detection_method: string;
menu_url?: string;
}
/** A dispensary's recorded provider was replaced by a new detection. */
interface ProviderChangedPayload extends BaseLogPayload {
event: 'provider_changed';
dispensary_name: string;
old_provider: string | null;
new_provider: string;
old_confidence: number;
new_confidence: number;
}
/** Crawler mode transition (e.g. sandbox -> production) plus the reason. */
interface ModeChangedPayload extends BaseLogPayload {
event: 'mode_changed';
dispensary_name: string;
old_mode: string;
new_mode: string;
reason: string;
}
/** Lifecycle of a sandbox crawl against a named extraction template. */
interface SandboxEventPayload extends BaseLogPayload {
event: 'sandbox_started' | 'sandbox_completed' | 'sandbox_failed';
dispensary_name: string;
template_name: string;
quality_score?: number;
products_extracted?: number;
fields_missing?: number;
error_message?: string;
}
/** Queue-level processing failure; logged at 'error' level. */
interface QueueFailurePayload extends BaseLogPayload {
event: 'queue_failure';
queue_type: string;
error_message: string;
affected_items?: number;
}
/** Summary of one provider-detection scan across dispensaries. */
interface DetectionScanPayload extends BaseLogPayload {
event: 'detection_scan';
total_scanned: number;
detected: number;
failed: number;
skipped: number;
duration_ms: number;
}
/** Summary of one intelligence run (detection/production/sandbox/full). */
interface IntelligenceRunPayload extends BaseLogPayload {
event: 'intelligence_run';
run_type: 'detection' | 'production' | 'sandbox' | 'full';
dispensaries_processed: number;
jobs_queued: number;
duration_ms: number;
}
/** Discriminated union (on `event`) of every payload the logger emits. */
type LogPayload =
| JobStartedPayload
| JobCompletedPayload
| JobFailedPayload
| ProviderDetectedPayload
| ProviderChangedPayload
| ModeChangedPayload
| SandboxEventPayload
| QueueFailurePayload
| DetectionScanPayload
| IntelligenceRunPayload;
/**
 * Structured crawler logger: serializes exactly one JSON payload per
 * event and routes it to the console method matching the entry's level.
 *
 * High-signal, low-noise by design — one summary entry per job/scan/run,
 * never per-product logging. Fields are copied explicitly (not spread)
 * so only whitelisted keys ever reach the log output.
 */
class CrawlerLoggerService {
  /** ISO-8601 timestamp stamped onto every entry at emit time. */
  private now(): string {
    return new Date().toISOString();
  }

  /**
   * Serialize the payload and write it to the console channel matching
   * its level ('error' | 'warn' | 'debug', default 'info'). The
   * "[CRAWLER] " prefix makes entries greppable in mixed output.
   */
  private log(payload: LogPayload): void {
    const line = `[CRAWLER] ${JSON.stringify(payload)}`;
    switch (payload.level) {
      case 'error':
        console.error(line);
        break;
      case 'warn':
        console.warn(line);
        break;
      case 'debug':
        console.debug(line);
        break;
      default:
        console.log(line);
    }
  }

  /**
   * Log when a crawl job starts.
   */
  jobStarted(params: {
    job_id: number;
    store_id: number;
    store_name: string;
    job_type: string;
    trigger_type: string;
    provider?: string;
  }): void {
    this.log({
      timestamp: this.now(),
      level: 'info',
      event: 'job_started',
      job_id: params.job_id,
      store_id: params.store_id,
      store_name: params.store_name,
      job_type: params.job_type,
      trigger_type: params.trigger_type,
      provider: params.provider,
    });
  }

  /**
   * Log when a crawl job completes successfully (one summary per job).
   */
  jobCompleted(params: {
    job_id: number;
    store_id: number;
    store_name: string;
    duration_ms: number;
    products_found: number;
    products_new: number;
    products_updated: number;
    products_marked_oos?: number;
    provider?: string;
  }): void {
    this.log({
      timestamp: this.now(),
      level: 'info',
      event: 'job_completed',
      job_id: params.job_id,
      store_id: params.store_id,
      store_name: params.store_name,
      duration_ms: params.duration_ms,
      products_found: params.products_found,
      products_new: params.products_new,
      products_updated: params.products_updated,
      products_marked_oos: params.products_marked_oos,
      provider: params.provider,
    });
  }

  /**
   * Log when a crawl job fails. Emitted at 'error' level.
   */
  jobFailed(params: {
    job_id: number;
    store_id: number;
    store_name: string;
    duration_ms: number;
    error_message: string;
    error_code?: string;
    provider?: string;
  }): void {
    this.log({
      timestamp: this.now(),
      level: 'error',
      event: 'job_failed',
      job_id: params.job_id,
      store_id: params.store_id,
      store_name: params.store_name,
      duration_ms: params.duration_ms,
      error_message: params.error_message,
      error_code: params.error_code,
      provider: params.provider,
    });
  }

  /**
   * Log when a menu provider is detected for a dispensary.
   */
  providerDetected(params: {
    dispensary_id: number;
    dispensary_name: string;
    detected_provider: string;
    confidence: number;
    detection_method: string;
    menu_url?: string;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
  }): void {
    this.log({
      timestamp: this.now(),
      level: 'info',
      event: 'provider_detected',
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      detected_provider: params.detected_provider,
      confidence: params.confidence,
      detection_method: params.detection_method,
      menu_url: params.menu_url,
      category: params.category,
    });
  }

  /**
   * Log when a dispensary's recorded provider changes.
   */
  providerChanged(params: {
    dispensary_id: number;
    dispensary_name: string;
    old_provider: string | null;
    new_provider: string;
    old_confidence: number;
    new_confidence: number;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
  }): void {
    this.log({
      timestamp: this.now(),
      level: 'info',
      event: 'provider_changed',
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      old_provider: params.old_provider,
      new_provider: params.new_provider,
      old_confidence: params.old_confidence,
      new_confidence: params.new_confidence,
      category: params.category,
    });
  }

  /**
   * Log when a dispensary's crawler mode changes (sandbox -> production, etc.).
   */
  modeChanged(params: {
    dispensary_id: number;
    dispensary_name: string;
    old_mode: string;
    new_mode: string;
    reason: string;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
    provider?: string;
  }): void {
    this.log({
      timestamp: this.now(),
      level: 'info',
      event: 'mode_changed',
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      old_mode: params.old_mode,
      new_mode: params.new_mode,
      reason: params.reason,
      category: params.category,
      provider: params.provider,
    });
  }

  /**
   * Log sandbox crawl lifecycle events. 'sandbox_failed' is emitted at
   * 'error' level; the other two at 'info'.
   */
  sandboxEvent(params: {
    event: 'sandbox_started' | 'sandbox_completed' | 'sandbox_failed';
    dispensary_id: number;
    dispensary_name: string;
    template_name: string;
    category?: 'product' | 'specials' | 'brand' | 'metadata';
    quality_score?: number;
    products_extracted?: number;
    fields_missing?: number;
    error_message?: string;
    provider?: string;
  }): void {
    const level: LogLevel = params.event === 'sandbox_failed' ? 'error' : 'info';
    this.log({
      timestamp: this.now(),
      level,
      event: params.event,
      dispensary_id: params.dispensary_id,
      dispensary_name: params.dispensary_name,
      template_name: params.template_name,
      category: params.category,
      quality_score: params.quality_score,
      products_extracted: params.products_extracted,
      fields_missing: params.fields_missing,
      error_message: params.error_message,
      provider: params.provider,
    });
  }

  /**
   * Log queue processing failures. Emitted at 'error' level.
   */
  queueFailure(params: {
    queue_type: string;
    error_message: string;
    affected_items?: number;
  }): void {
    this.log({
      timestamp: this.now(),
      level: 'error',
      event: 'queue_failure',
      queue_type: params.queue_type,
      error_message: params.error_message,
      affected_items: params.affected_items,
    });
  }

  /**
   * Log a one-line summary of a provider-detection scan.
   */
  detectionScan(params: {
    total_scanned: number;
    detected: number;
    failed: number;
    skipped: number;
    duration_ms: number;
  }): void {
    this.log({
      timestamp: this.now(),
      level: 'info',
      event: 'detection_scan',
      total_scanned: params.total_scanned,
      detected: params.detected,
      failed: params.failed,
      skipped: params.skipped,
      duration_ms: params.duration_ms,
    });
  }

  /**
   * Log a one-line summary of an intelligence run.
   */
  intelligenceRun(params: {
    run_type: 'detection' | 'production' | 'sandbox' | 'full';
    dispensaries_processed: number;
    jobs_queued: number;
    duration_ms: number;
  }): void {
    this.log({
      timestamp: this.now(),
      level: 'info',
      event: 'intelligence_run',
      run_type: params.run_type,
      dispensaries_processed: params.dispensaries_processed,
      jobs_queued: params.jobs_queued,
      duration_ms: params.duration_ms,
    });
  }
}
// Export singleton instance — import this shared logger rather than
// constructing CrawlerLoggerService directly.
export const crawlerLogger = new CrawlerLoggerService();