Files
cannaiq/backend/src/dutchie-az/services/error-taxonomy.ts
Kelly b4a2fb7d03 feat: Add v2 architecture with multi-state support and orchestrator services
Major additions:
- Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare
- Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator
- Discovery system: dutchie discovery service, geo validation, city seeding scripts
- Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages
- Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram)
- Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata

Frontend pages added:
- Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores
- StateHeatmap, CrossStateCompare, SyncInfoPanel

Components added:
- StateSelector, OrchestratorTraceModal, WorkflowStepper

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-07 11:30:57 -07:00

492 lines
13 KiB
TypeScript

/**
* Error Taxonomy Module
*
* Standardized error codes and classification for crawler reliability.
* All crawl results must use these codes for consistent error handling.
*
* Phase 1: Crawler Reliability & Stabilization
*/
// ============================================================
// ERROR CODES
// ============================================================
/**
 * Canonical outcome codes shared by every crawl operation.
 *
 * Each value is the same string as its key, so a code can be persisted and
 * read back verbatim (the codes are stored in the database for analytics and
 * debugging).
 */
export const CrawlErrorCode = {
  // --- Success -------------------------------------------------------------
  /** Crawl finished without error. */
  SUCCESS: 'SUCCESS',

  // --- Rate limiting -------------------------------------------------------
  /** 429 responses. */
  RATE_LIMITED: 'RATE_LIMITED',

  // --- Proxy issues --------------------------------------------------------
  /** 407 or proxy-related blocks. */
  BLOCKED_PROXY: 'BLOCKED_PROXY',
  /** Proxy connection timeout. */
  PROXY_TIMEOUT: 'PROXY_TIMEOUT',

  // --- Content issues ------------------------------------------------------
  /** Page structure changed. */
  HTML_CHANGED: 'HTML_CHANGED',
  /** Empty response (valid but no data). */
  NO_PRODUCTS: 'NO_PRODUCTS',
  /** Failed to parse response. */
  PARSE_ERROR: 'PARSE_ERROR',

  // --- Network issues ------------------------------------------------------
  /** Request timeout. */
  TIMEOUT: 'TIMEOUT',
  /** Connection failed. */
  NETWORK_ERROR: 'NETWORK_ERROR',
  /** DNS resolution failed. */
  DNS_ERROR: 'DNS_ERROR',

  // --- Authentication ------------------------------------------------------
  /** Authentication/session issues. */
  AUTH_FAILED: 'AUTH_FAILED',

  // --- Server errors -------------------------------------------------------
  /** 5xx responses. */
  SERVER_ERROR: 'SERVER_ERROR',
  /** 503 responses. */
  SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE',

  // --- Configuration issues ------------------------------------------------
  /** Bad store configuration. */
  INVALID_CONFIG: 'INVALID_CONFIG',
  /** No platform_dispensary_id. */
  MISSING_PLATFORM_ID: 'MISSING_PLATFORM_ID',

  // --- Unknown -------------------------------------------------------------
  /** Catch-all for unclassified errors. */
  UNKNOWN_ERROR: 'UNKNOWN_ERROR',
} as const;

/** Union of every string literal value defined in `CrawlErrorCode`. */
export type CrawlErrorCodeType = (typeof CrawlErrorCode)[keyof typeof CrawlErrorCode];
// ============================================================
// ERROR CLASSIFICATION
// ============================================================
/**
 * Handling policy attached to a single error code.
 *
 * One record exists per `CrawlErrorCodeType` value; the accessor helpers in
 * this module read individual fields from it.
 */
interface ErrorMetadata {
  /** The error code this record describes. */
  code: CrawlErrorCodeType;
  /** Whether an error with this code is considered retryable. */
  retryable: boolean;
  /** Whether the proxy should be rotated for this error. */
  rotateProxy: boolean;
  /** Whether the user agent should be rotated for this error. */
  rotateUserAgent: boolean;
  /**
   * Backoff multiplier for this error.
   * NOTE(review): how the multiplier is applied to a delay is decided by the
   * consumer, which is not part of this module - confirm there.
   */
  backoffMultiplier: number;
  /** Severity bucket for the error. */
  severity: 'low' | 'medium' | 'high' | 'critical';
  /** User-friendly description of the error. */
  description: string;
}
/**
 * Handling policy for every error code.
 *
 * The table is keyed by the code itself and each entry repeats that code in
 * its `code` field. Typing it as `Record<CrawlErrorCodeType, ErrorMetadata>`
 * makes the compiler reject the table if any code is left without an entry.
 */
export const ERROR_METADATA: Record<CrawlErrorCodeType, ErrorMetadata> = {
  // ---- Success --------------------------------------------------------------
  [CrawlErrorCode.SUCCESS]: {
    code: CrawlErrorCode.SUCCESS,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'low',
    description: 'Crawl completed successfully',
  },

  // ---- Rate limiting --------------------------------------------------------
  // Rotates both proxy and user agent, with the largest multiplier in the table.
  [CrawlErrorCode.RATE_LIMITED]: {
    code: CrawlErrorCode.RATE_LIMITED,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 2,
    severity: 'medium',
    description: 'Rate limited by target (429)',
  },

  // ---- Proxy issues ---------------------------------------------------------
  [CrawlErrorCode.BLOCKED_PROXY]: {
    code: CrawlErrorCode.BLOCKED_PROXY,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Proxy blocked or rejected (407)',
  },
  [CrawlErrorCode.PROXY_TIMEOUT]: {
    code: CrawlErrorCode.PROXY_TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1,
    severity: 'low',
    description: 'Proxy connection timed out',
  },

  // ---- Content issues -------------------------------------------------------
  // HTML_CHANGED is not retryable: per its description it needs a selector update.
  [CrawlErrorCode.HTML_CHANGED]: {
    code: CrawlErrorCode.HTML_CHANGED,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1,
    severity: 'high',
    description: 'Page structure changed - needs selector update',
  },
  [CrawlErrorCode.NO_PRODUCTS]: {
    code: CrawlErrorCode.NO_PRODUCTS,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1,
    severity: 'low',
    description: 'No products returned (may be temporary)',
  },
  [CrawlErrorCode.PARSE_ERROR]: {
    code: CrawlErrorCode.PARSE_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1,
    severity: 'medium',
    description: 'Failed to parse response data',
  },

  // ---- Network issues -------------------------------------------------------
  // All three rotate the proxy but keep the user agent.
  [CrawlErrorCode.TIMEOUT]: {
    code: CrawlErrorCode.TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Request timed out',
  },
  [CrawlErrorCode.NETWORK_ERROR]: {
    code: CrawlErrorCode.NETWORK_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1,
    severity: 'medium',
    description: 'Network connection failed',
  },
  [CrawlErrorCode.DNS_ERROR]: {
    code: CrawlErrorCode.DNS_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1,
    severity: 'medium',
    description: 'DNS resolution failed',
  },

  // ---- Authentication -------------------------------------------------------
  [CrawlErrorCode.AUTH_FAILED]: {
    code: CrawlErrorCode.AUTH_FAILED,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: true,
    backoffMultiplier: 2,
    severity: 'high',
    description: 'Authentication or session failed',
  },

  // ---- Server errors --------------------------------------------------------
  // Retried without rotating either the proxy or the user agent.
  [CrawlErrorCode.SERVER_ERROR]: {
    code: CrawlErrorCode.SERVER_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Server error (5xx)',
  },
  [CrawlErrorCode.SERVICE_UNAVAILABLE]: {
    code: CrawlErrorCode.SERVICE_UNAVAILABLE,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 2,
    severity: 'high',
    description: 'Service temporarily unavailable (503)',
  },

  // ---- Configuration issues -------------------------------------------------
  // Both are non-retryable and carry the only 'critical' severities in the table.
  [CrawlErrorCode.INVALID_CONFIG]: {
    code: CrawlErrorCode.INVALID_CONFIG,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Invalid store configuration',
  },
  [CrawlErrorCode.MISSING_PLATFORM_ID]: {
    code: CrawlErrorCode.MISSING_PLATFORM_ID,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Missing platform_dispensary_id',
  },

  // ---- Unknown --------------------------------------------------------------
  [CrawlErrorCode.UNKNOWN_ERROR]: {
    code: CrawlErrorCode.UNKNOWN_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1,
    severity: 'high',
    description: 'Unknown/unclassified error',
  },
};
// ============================================================
// ERROR CLASSIFICATION FUNCTIONS
// ============================================================
/**
 * Report whether `text` mentions one of `codes` as a standalone number.
 *
 * A plain substring test would treat "15000 bytes" as mentioning 500 and
 * "port 4290" as mentioning 429, so the code must not be directly preceded or
 * followed by another digit.
 */
function mentionsStatusCode(text: string, ...codes: number[]): boolean {
  return new RegExp(`(?:^|\\D)(?:${codes.join('|')})(?:\\D|$)`).test(text);
}

/**
 * Classify an error into a standardized error code.
 *
 * The HTTP status, when provided and recognised, takes precedence over the
 * error text. Otherwise the lower-cased message is matched against keyword
 * groups in a fixed order; the first matching group wins, so the order of the
 * checks below is significant.
 *
 * @param error - The error to classify: an Error object, a message string, or
 *   null when only the HTTP status is known.
 * @param httpStatus - Optional HTTP status code of the response.
 * @returns Standardized error code; `UNKNOWN_ERROR` when nothing matches.
 */
export function classifyError(
  error: Error | string | null,
  httpStatus?: number
): CrawlErrorCodeType {
  // Check HTTP status first
  if (httpStatus) {
    if (httpStatus === 429) return CrawlErrorCode.RATE_LIMITED;
    if (httpStatus === 407) return CrawlErrorCode.BLOCKED_PROXY;
    if (httpStatus === 401 || httpStatus === 403) return CrawlErrorCode.AUTH_FAILED;
    // 503 must be tested before the generic 5xx range below.
    if (httpStatus === 503) return CrawlErrorCode.SERVICE_UNAVAILABLE;
    if (httpStatus >= 500) return CrawlErrorCode.SERVER_ERROR;
  }

  if (!error) return CrawlErrorCode.UNKNOWN_ERROR;

  // Values reaching here from a catch clause are not guaranteed to carry a
  // string `message`; coerce defensively so classification never throws.
  const rawMessage = typeof error === 'string' ? error : error.message;
  const message = String(rawMessage ?? '').toLowerCase();

  const has = (...needles: string[]): boolean =>
    needles.some((needle) => message.includes(needle));
  const hasCode = (...codes: number[]): boolean => mentionsStatusCode(message, ...codes);

  // Rate limiting patterns
  if (has('rate limit', 'too many requests') || hasCode(429)) {
    return CrawlErrorCode.RATE_LIMITED;
  }

  // Proxy patterns
  if (has('proxy') && (has('block', 'reject') || hasCode(407))) {
    return CrawlErrorCode.BLOCKED_PROXY;
  }

  // Timeout patterns (a timeout that mentions the proxy is attributed to it)
  if (has('timeout', 'timed out', 'etimedout')) {
    return has('proxy') ? CrawlErrorCode.PROXY_TIMEOUT : CrawlErrorCode.TIMEOUT;
  }

  // Network patterns
  if (has('econnrefused', 'econnreset', 'network')) {
    return CrawlErrorCode.NETWORK_ERROR;
  }

  // DNS patterns
  if (has('enotfound', 'dns', 'getaddrinfo')) {
    return CrawlErrorCode.DNS_ERROR;
  }

  // Auth patterns
  if (has('auth', 'unauthorized', 'forbidden') || hasCode(401, 403)) {
    return CrawlErrorCode.AUTH_FAILED;
  }

  // HTML change patterns
  if (has('selector', 'element not found', 'structure changed')) {
    return CrawlErrorCode.HTML_CHANGED;
  }

  // Parse patterns
  if (has('parse', 'json', 'syntax')) {
    return CrawlErrorCode.PARSE_ERROR;
  }

  // No products patterns
  if (has('no products', 'empty', '0 products')) {
    return CrawlErrorCode.NO_PRODUCTS;
  }

  // Server error patterns. 503 maps to SERVICE_UNAVAILABLE so a message-only
  // classification agrees with the httpStatus branch above.
  if (hasCode(503) || has('service unavailable')) {
    return CrawlErrorCode.SERVICE_UNAVAILABLE;
  }
  if (hasCode(500, 502, 504)) {
    return CrawlErrorCode.SERVER_ERROR;
  }

  // Config patterns
  if (has('config', 'invalid', 'missing')) {
    return has('platform', 'dispensary_id')
      ? CrawlErrorCode.MISSING_PLATFORM_ID
      : CrawlErrorCode.INVALID_CONFIG;
  }

  return CrawlErrorCode.UNKNOWN_ERROR;
}
/**
 * Get metadata for an error code.
 *
 * Falls back to the `UNKNOWN_ERROR` metadata when `code` has no entry of its
 * own in `ERROR_METADATA`. The lookup is restricted to own properties so that
 * a stray string such as 'constructor' or 'toString' (possible when a code
 * does not come from `CrawlErrorCode`, e.g. a value read back from storage)
 * cannot resolve to an inherited `Object.prototype` member.
 *
 * @param code - Error code to look up.
 * @returns The metadata record for `code`, or the `UNKNOWN_ERROR` record.
 */
export function getErrorMetadata(code: CrawlErrorCodeType): ErrorMetadata {
  const isKnownCode = Object.prototype.hasOwnProperty.call(ERROR_METADATA, code);
  return isKnownCode ? ERROR_METADATA[code] : ERROR_METADATA[CrawlErrorCode.UNKNOWN_ERROR];
}
/**
 * Tell whether an error with the given code is retryable.
 *
 * @param code - Error code to look up.
 * @returns The `retryable` flag from the code's metadata.
 */
export function isRetryable(code: CrawlErrorCodeType): boolean {
  const { retryable } = getErrorMetadata(code);
  return retryable;
}
/**
 * Tell whether the proxy should be rotated for this error.
 *
 * @param code - Error code to look up.
 * @returns The `rotateProxy` flag from the code's metadata.
 */
export function shouldRotateProxy(code: CrawlErrorCodeType): boolean {
  const { rotateProxy } = getErrorMetadata(code);
  return rotateProxy;
}
/**
 * Tell whether the user agent should be rotated for this error.
 *
 * @param code - Error code to look up.
 * @returns The `rotateUserAgent` flag from the code's metadata.
 */
export function shouldRotateUserAgent(code: CrawlErrorCodeType): boolean {
  const { rotateUserAgent } = getErrorMetadata(code);
  return rotateUserAgent;
}
/**
 * Look up the backoff multiplier for this error.
 *
 * @param code - Error code to look up.
 * @returns The `backoffMultiplier` value from the code's metadata.
 */
export function getBackoffMultiplier(code: CrawlErrorCodeType): number {
  const { backoffMultiplier } = getErrorMetadata(code);
  return backoffMultiplier;
}
// ============================================================
// CRAWL RESULT TYPE
// ============================================================
/**
 * Outcome record of a single crawl attempt, tagged with a taxonomy code.
 *
 * Produced by `createSuccessResult` and `createFailureResult`.
 */
export interface CrawlResult {
  /** True for a successful crawl, false for a failed one. */
  success: boolean;
  /** Identifier of the dispensary that was crawled. */
  dispensaryId: number;

  // ---- Error info ----
  /** Taxonomy code; `SUCCESS` on results built by `createSuccessResult`. */
  errorCode: CrawlErrorCodeType;
  /** Message text of the error, when there was one. */
  errorMessage?: string;
  /** HTTP status associated with the failure, when known. */
  httpStatus?: number;

  // ---- Timing ----
  /** When the crawl started. */
  startedAt: Date;
  /** When the result was created. */
  finishedAt: Date;
  /** `finishedAt - startedAt`, in milliseconds. */
  durationMs: number;

  // ---- Context ----
  /** Attempt counter; the factory helpers default it to 1. */
  attemptNumber: number;
  /** Proxy used for the attempt, if any. */
  proxyUsed?: string;
  /** User agent used for the attempt, if any. */
  userAgentUsed?: string;

  // ---- Metrics (on success) ----
  productsFound?: number;
  productsUpserted?: number;
  snapshotsCreated?: number;
  imagesDownloaded?: number;

  // ---- Metadata ----
  /** Free-form extra data attached by the caller. */
  metadata?: Record<string, any>;
}
/**
 * Build the `CrawlResult` for a crawl that completed successfully.
 *
 * The finish time is taken at the moment of the call and the duration is
 * derived from it.
 *
 * @param dispensaryId - Identifier of the crawled dispensary.
 * @param startedAt - When the crawl started.
 * @param metrics - Counters copied onto the result as-is.
 * @param context - Optional attempt context; a missing or falsy
 *   `attemptNumber` becomes 1.
 * @returns A result with `success: true` and `errorCode` set to `SUCCESS`.
 */
export function createSuccessResult(
  dispensaryId: number,
  startedAt: Date,
  metrics: {
    productsFound: number;
    productsUpserted: number;
    snapshotsCreated: number;
    imagesDownloaded?: number;
  },
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  const elapsedMs = finishedAt.getTime() - startedAt.getTime();
  const attempt = context?.attemptNumber || 1;

  const result: CrawlResult = {
    success: true,
    dispensaryId,
    errorCode: CrawlErrorCode.SUCCESS,
    startedAt,
    finishedAt,
    durationMs: elapsedMs,
    attemptNumber: attempt,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
    // Metrics are spread last, exactly as provided by the caller.
    ...metrics,
  };
  return result;
}
/**
 * Build the `CrawlResult` for a crawl that failed.
 *
 * The error code is derived by `classifyError` from the error and the HTTP
 * status. The finish time is taken at the moment of the call.
 *
 * @param dispensaryId - Identifier of the crawled dispensary.
 * @param startedAt - When the crawl started.
 * @param error - The failure, as an Error object or a message string.
 * @param httpStatus - Optional HTTP status of the failed response.
 * @param context - Optional attempt context; a missing or falsy
 *   `attemptNumber` becomes 1.
 * @returns A result with `success: false` and the classified error code.
 */
export function createFailureResult(
  dispensaryId: number,
  startedAt: Date,
  error: Error | string,
  httpStatus?: number,
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  const elapsedMs = finishedAt.getTime() - startedAt.getTime();

  const failure: CrawlResult = {
    success: false,
    dispensaryId,
    errorCode: classifyError(error, httpStatus),
    // Keep the original text: strings pass through, Errors contribute their message.
    errorMessage: typeof error !== 'string' ? error.message : error,
    httpStatus,
    startedAt,
    finishedAt,
    durationMs: elapsedMs,
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
  };
  return failure;
}
// ============================================================
// LOGGING HELPERS
// ============================================================
/**
 * Format a crawl result as a single log line.
 *
 * Success: `[CODE] Crawl successful: N products[ via PROXY]`.
 * Failure: `[CODE] MESSAGE[ via PROXY] (retryable|non-retryable)`.
 *
 * `productsFound` and `errorMessage` are optional on `CrawlResult`, so both
 * get a fallback instead of being printed as the literal text "undefined":
 * a missing product count is shown as 0, and a missing or empty error message
 * is replaced by the description from the code's metadata.
 *
 * @param result - The crawl result to describe.
 * @returns The formatted log line.
 */
export function formatErrorForLog(result: CrawlResult): string {
  const proxyInfo = result.proxyUsed ? ` via ${result.proxyUsed}` : '';

  if (result.success) {
    const productCount = result.productsFound ?? 0;
    return `[${result.errorCode}] Crawl successful: ${productCount} products${proxyInfo}`;
  }

  // Metadata is only needed on the failure path.
  const metadata = getErrorMetadata(result.errorCode);
  const retryInfo = metadata.retryable ? '(retryable)' : '(non-retryable)';
  const message = result.errorMessage || metadata.description;
  return `[${result.errorCode}] ${message}${proxyInfo} ${retryInfo}`;
}
/**
 * Look up the user-friendly description for an error code.
 *
 * @param code - Error code to look up.
 * @returns The `description` text from the code's metadata.
 */
export function getErrorDescription(code: CrawlErrorCodeType): string {
  const { description } = getErrorMetadata(code);
  return description;
}