/**
 * Error Taxonomy Module
 *
 * Standardized error codes and classification for crawler reliability.
 * All crawl results must use these codes for consistent error handling.
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

// ============================================================
// ERROR CODES
// ============================================================

/**
 * Standardized error codes for all crawl operations.
 * These codes are stored in the database for analytics and debugging.
 */
export const CrawlErrorCode = {
  // Success states
  SUCCESS: 'SUCCESS',

  // Rate limiting
  RATE_LIMITED: 'RATE_LIMITED', // 429 responses

  // Proxy issues
  BLOCKED_PROXY: 'BLOCKED_PROXY', // 407 or proxy-related blocks
  PROXY_TIMEOUT: 'PROXY_TIMEOUT', // Proxy connection timeout

  // Content issues
  HTML_CHANGED: 'HTML_CHANGED', // Page structure changed
  NO_PRODUCTS: 'NO_PRODUCTS', // Empty response (valid but no data)
  PARSE_ERROR: 'PARSE_ERROR', // Failed to parse response

  // Network issues
  TIMEOUT: 'TIMEOUT', // Request timeout
  NETWORK_ERROR: 'NETWORK_ERROR', // Connection failed
  DNS_ERROR: 'DNS_ERROR', // DNS resolution failed

  // Authentication
  AUTH_FAILED: 'AUTH_FAILED', // Authentication/session issues

  // Server errors
  SERVER_ERROR: 'SERVER_ERROR', // 5xx responses
  SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE', // 503

  // Configuration issues
  INVALID_CONFIG: 'INVALID_CONFIG', // Bad store configuration
  MISSING_PLATFORM_ID: 'MISSING_PLATFORM_ID', // No platform_dispensary_id

  // Unknown
  UNKNOWN_ERROR: 'UNKNOWN_ERROR', // Catch-all for unclassified errors
} as const;

/** Union of every string value in {@link CrawlErrorCode}. */
export type CrawlErrorCodeType = (typeof CrawlErrorCode)[keyof typeof CrawlErrorCode];

// ============================================================
// ERROR CLASSIFICATION
// ============================================================

/**
 * Error metadata for each error code
 */
interface ErrorMetadata {
  /** The code this entry describes (mirrors the key in ERROR_METADATA). */
  code: CrawlErrorCodeType;
  /** Whether a failed crawl with this code may be attempted again. */
  retryable: boolean;
  /** Whether the next attempt should use a different proxy. */
  rotateProxy: boolean;
  /** Whether the next attempt should use a different user agent. */
  rotateUserAgent: boolean;
  /** Multiplier exposed through getBackoffMultiplier(); 0 on non-retryable terminal codes. */
  backoffMultiplier: number;
  severity: 'low' | 'medium' | 'high' | 'critical';
  /** Human-readable description, exposed through getErrorDescription(). */
  description: string;
}

/**
 * Metadata for each error code - defines retry behavior.
 *
 * Typed as a Record keyed by every error code so that adding a code to
 * CrawlErrorCode without adding its metadata is a compile error.
 */
export const ERROR_METADATA: Record<CrawlErrorCodeType, ErrorMetadata> = {
  [CrawlErrorCode.SUCCESS]: {
    code: CrawlErrorCode.SUCCESS,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'low',
    description: 'Crawl completed successfully',
  },
  [CrawlErrorCode.RATE_LIMITED]: {
    code: CrawlErrorCode.RATE_LIMITED,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'medium',
    description: 'Rate limited by target (429)',
  },
  [CrawlErrorCode.BLOCKED_PROXY]: {
    code: CrawlErrorCode.BLOCKED_PROXY,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Proxy blocked or rejected (407)',
  },
  [CrawlErrorCode.PROXY_TIMEOUT]: {
    code: CrawlErrorCode.PROXY_TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'Proxy connection timed out',
  },
  [CrawlErrorCode.HTML_CHANGED]: {
    code: CrawlErrorCode.HTML_CHANGED,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Page structure changed - needs selector update',
  },
  [CrawlErrorCode.NO_PRODUCTS]: {
    code: CrawlErrorCode.NO_PRODUCTS,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'No products returned (may be temporary)',
  },
  [CrawlErrorCode.PARSE_ERROR]: {
    code: CrawlErrorCode.PARSE_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Failed to parse response data',
  },
  [CrawlErrorCode.TIMEOUT]: {
    code: CrawlErrorCode.TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Request timed out',
  },
  [CrawlErrorCode.NETWORK_ERROR]: {
    code: CrawlErrorCode.NETWORK_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Network connection failed',
  },
  [CrawlErrorCode.DNS_ERROR]: {
    code: CrawlErrorCode.DNS_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'DNS resolution failed',
  },
  [CrawlErrorCode.AUTH_FAILED]: {
    code: CrawlErrorCode.AUTH_FAILED,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Authentication or session failed',
  },
  [CrawlErrorCode.SERVER_ERROR]: {
    code: CrawlErrorCode.SERVER_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Server error (5xx)',
  },
  [CrawlErrorCode.SERVICE_UNAVAILABLE]: {
    code: CrawlErrorCode.SERVICE_UNAVAILABLE,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Service temporarily unavailable (503)',
  },
  [CrawlErrorCode.INVALID_CONFIG]: {
    code: CrawlErrorCode.INVALID_CONFIG,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Invalid store configuration',
  },
  [CrawlErrorCode.MISSING_PLATFORM_ID]: {
    code: CrawlErrorCode.MISSING_PLATFORM_ID,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Missing platform_dispensary_id',
  },
  [CrawlErrorCode.UNKNOWN_ERROR]: {
    code: CrawlErrorCode.UNKNOWN_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Unknown/unclassified error',
  },
};

// ============================================================
// ERROR CLASSIFICATION FUNCTIONS
// ============================================================

/** True when `text` contains at least one of the given substrings. */
function containsAny(text: string, needles: readonly string[]): boolean {
  return needles.some((needle) => text.includes(needle));
}

/**
 * True when `text` mentions `value` as a standalone number, i.e. not as part
 * of a longer digit run. Keeps "1429 items" or "took 5030ms" from being read
 * as HTTP status 429 / 503.
 */
function mentionsNumber(text: string, value: number): boolean {
  return new RegExp(`(?<!\\d)${value}(?!\\d)`).test(text);
}

/**
 * Classify an error into a standardized error code.
 *
 * The HTTP status, when provided and recognised, takes precedence. Otherwise
 * the lower-cased error message is checked against keyword groups in a fixed
 * order; the first matching group wins, so the order of the checks below is
 * part of the contract.
 *
 * @param error - The error to classify (Error object, string, or null)
 * @param httpStatus - Optional HTTP status code
 * @returns Standardized error code; UNKNOWN_ERROR when nothing matches
 */
export function classifyError(
  error: Error | string | null,
  httpStatus?: number
): CrawlErrorCodeType {
  // Check HTTP status first
  if (httpStatus) {
    if (httpStatus === 429) return CrawlErrorCode.RATE_LIMITED;
    if (httpStatus === 407) return CrawlErrorCode.BLOCKED_PROXY;
    if (httpStatus === 401 || httpStatus === 403) return CrawlErrorCode.AUTH_FAILED;
    // 503 must be tested before the generic 5xx check below.
    if (httpStatus === 503) return CrawlErrorCode.SERVICE_UNAVAILABLE;
    if (httpStatus >= 500) return CrawlErrorCode.SERVER_ERROR;
  }

  if (!error) return CrawlErrorCode.UNKNOWN_ERROR;

  // Callers may pass a caught non-Error value cast to Error; tolerate a
  // missing or non-string `message` instead of throwing while classifying.
  const rawMessage = typeof error === 'string' ? error : error.message;
  const message = String(rawMessage ?? '').toLowerCase();

  // Rate limiting patterns
  if (containsAny(message, ['rate limit', 'too many requests']) || mentionsNumber(message, 429)) {
    return CrawlErrorCode.RATE_LIMITED;
  }

  // Proxy patterns
  if (
    message.includes('proxy') &&
    (containsAny(message, ['block', 'reject']) || mentionsNumber(message, 407))
  ) {
    return CrawlErrorCode.BLOCKED_PROXY;
  }

  // Timeout patterns
  if (containsAny(message, ['timeout', 'timed out', 'etimedout'])) {
    return message.includes('proxy') ? CrawlErrorCode.PROXY_TIMEOUT : CrawlErrorCode.TIMEOUT;
  }

  // Network patterns
  if (containsAny(message, ['econnrefused', 'econnreset', 'network'])) {
    return CrawlErrorCode.NETWORK_ERROR;
  }

  // DNS patterns
  if (containsAny(message, ['enotfound', 'dns', 'getaddrinfo'])) {
    return CrawlErrorCode.DNS_ERROR;
  }

  // Auth patterns
  if (
    containsAny(message, ['auth', 'unauthorized', 'forbidden']) ||
    mentionsNumber(message, 401) ||
    mentionsNumber(message, 403)
  ) {
    return CrawlErrorCode.AUTH_FAILED;
  }

  // HTML change patterns
  if (containsAny(message, ['selector', 'element not found', 'structure changed'])) {
    return CrawlErrorCode.HTML_CHANGED;
  }

  // Parse patterns
  if (containsAny(message, ['parse', 'json', 'syntax'])) {
    return CrawlErrorCode.PARSE_ERROR;
  }

  // No products patterns ("0 products" must not match "10 products")
  if (containsAny(message, ['no products', 'empty']) || /(?<!\d)0 products/.test(message)) {
    return CrawlErrorCode.NO_PRODUCTS;
  }

  // Service-unavailable patterns: kept consistent with the HTTP status path,
  // where 503 maps to SERVICE_UNAVAILABLE rather than the generic SERVER_ERROR.
  if (mentionsNumber(message, 503) || message.includes('service unavailable')) {
    return CrawlErrorCode.SERVICE_UNAVAILABLE;
  }

  // Server error patterns
  if ([500, 502, 504].some((status) => mentionsNumber(message, status))) {
    return CrawlErrorCode.SERVER_ERROR;
  }

  // Config patterns
  if (containsAny(message, ['config', 'invalid', 'missing'])) {
    if (containsAny(message, ['platform', 'dispensary_id'])) {
      return CrawlErrorCode.MISSING_PLATFORM_ID;
    }
    return CrawlErrorCode.INVALID_CONFIG;
  }

  return CrawlErrorCode.UNKNOWN_ERROR;
}

/**
 * Get metadata for an error code.
 *
 * Falls back to the UNKNOWN_ERROR entry when the lookup yields nothing, which
 * can only happen for a value outside CrawlErrorCodeType supplied at runtime.
 */
export function getErrorMetadata(code: CrawlErrorCodeType): ErrorMetadata {
  return ERROR_METADATA[code] || ERROR_METADATA[CrawlErrorCode.UNKNOWN_ERROR];
}

/**
 * Check if an error is retryable
 */
export function isRetryable(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).retryable;
}

/**
 * Check if proxy should be rotated for this error
 */
export function shouldRotateProxy(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateProxy;
}

/**
 * Check if user agent should be rotated for this error
 */
export function shouldRotateUserAgent(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateUserAgent;
}

/**
 * Get backoff multiplier for this error
 */
export function getBackoffMultiplier(code: CrawlErrorCodeType): number {
  return getErrorMetadata(code).backoffMultiplier;
}

// ============================================================
// CRAWL RESULT TYPE
// ============================================================

/**
 * Standardized crawl result with error taxonomy
 */
export interface CrawlResult {
  success: boolean;
  dispensaryId: number;

  // Error info
  errorCode: CrawlErrorCodeType;
  errorMessage?: string;
  httpStatus?: number;

  // Timing
  startedAt: Date;
  finishedAt: Date;
  durationMs: number;

  // Context
  attemptNumber: number;
  proxyUsed?: string;
  userAgentUsed?: string;

  // Metrics (on success)
  productsFound?: number;
  productsUpserted?: number;
  snapshotsCreated?: number;
  imagesDownloaded?: number;

  // Metadata
  metadata?: Record<string, unknown>;
}

/**
 * Create a success result.
 *
 * `finishedAt` is taken from the clock at call time and `durationMs` is the
 * difference between it and `startedAt`.
 *
 * @param dispensaryId - Identifier of the crawled dispensary
 * @param startedAt - When the crawl began
 * @param metrics - Counters copied onto the result
 * @param context - Optional attempt context; attemptNumber falls back to 1
 *   when it is missing or 0
 * @returns A CrawlResult with success=true and errorCode SUCCESS
 */
export function createSuccessResult(
  dispensaryId: number,
  startedAt: Date,
  metrics: {
    productsFound: number;
    productsUpserted: number;
    snapshotsCreated: number;
    imagesDownloaded?: number;
  },
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();

  return {
    success: true,
    dispensaryId,
    errorCode: CrawlErrorCode.SUCCESS,
    startedAt,
    finishedAt,
    durationMs: finishedAt.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
    ...metrics,
  };
}

/**
 * Create a failure result.
 *
 * The error code is derived via classifyError(error, httpStatus), and the
 * message is the string itself or the Error's `message`.
 *
 * @param dispensaryId - Identifier of the crawled dispensary
 * @param startedAt - When the crawl began
 * @param error - The failure, as an Error or a plain message
 * @param httpStatus - Optional HTTP status code of the failed response
 * @param context - Optional attempt context; attemptNumber falls back to 1
 *   when it is missing or 0
 * @returns A CrawlResult with success=false and the classified errorCode
 */
export function createFailureResult(
  dispensaryId: number,
  startedAt: Date,
  error: Error | string,
  httpStatus?: number,
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  const errorCode = classifyError(error, httpStatus);
  const errorMessage = typeof error === 'string' ? error : error.message;

  return {
    success: false,
    dispensaryId,
    errorCode,
    errorMessage,
    httpStatus,
    startedAt,
    finishedAt,
    durationMs: finishedAt.getTime() - startedAt.getTime(),
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
  };
}

// ============================================================
// LOGGING HELPERS
// ============================================================

/**
 * Format a crawl result as a single log line.
 *
 * Success: `[CODE] Crawl successful: N products[ via PROXY]`
 * Failure: `[CODE] MESSAGE[ via PROXY] (retryable|non-retryable)`
 *
 * Missing optional fields never render as the text "undefined": a missing
 * productsFound is shown as 0 and a missing errorMessage falls back to the
 * taxonomy description of the error code.
 */
export function formatErrorForLog(result: CrawlResult): string {
  const metadata = getErrorMetadata(result.errorCode);
  const proxyInfo = result.proxyUsed ? ` via ${result.proxyUsed}` : '';

  if (result.success) {
    const productCount = result.productsFound ?? 0;
    return `[${result.errorCode}] Crawl successful: ${productCount} products${proxyInfo}`;
  }

  const retryInfo = metadata.retryable ? '(retryable)' : '(non-retryable)';
  const errorText = result.errorMessage ?? metadata.description;
  return `[${result.errorCode}] ${errorText}${proxyInfo} ${retryInfo}`;
}

/**
 * Get user-friendly error description
 */
export function getErrorDescription(code: CrawlErrorCodeType): string {
  return getErrorMetadata(code).description;
}