Major additions: - Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare - Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator - Discovery system: dutchie discovery service, geo validation, city seeding scripts - Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages - Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram) - Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata Frontend pages added: - Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores - StateHeatmap, CrossStateCompare, SyncInfoPanel Components added: - StateSelector, OrchestratorTraceModal, WorkflowStepper 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
492 lines
13 KiB
TypeScript
492 lines
13 KiB
TypeScript
/**
 * Error Taxonomy Module
 *
 * Standardized error codes and classification for crawler reliability.
 * All crawl results must use these codes for consistent error handling.
 *
 * Phase 1: Crawler Reliability & Stabilization
 */

// ============================================================
// ERROR CODES
// ============================================================

/**
 * Standardized error codes for all crawl operations.
 * These codes are stored in the database for analytics and debugging.
 *
 * Every key is identical to its string value. `as const` keeps each value as a
 * string literal type so a union of all codes can be derived from this object.
 */
export const CrawlErrorCode = {
  // Success states
  SUCCESS: 'SUCCESS',

  // Rate limiting
  RATE_LIMITED: 'RATE_LIMITED', // 429 responses

  // Proxy issues
  BLOCKED_PROXY: 'BLOCKED_PROXY', // 407 or proxy-related blocks
  PROXY_TIMEOUT: 'PROXY_TIMEOUT', // Proxy connection timeout

  // Content issues
  HTML_CHANGED: 'HTML_CHANGED', // Page structure changed
  NO_PRODUCTS: 'NO_PRODUCTS', // Empty response (valid but no data)
  PARSE_ERROR: 'PARSE_ERROR', // Failed to parse response

  // Network issues
  TIMEOUT: 'TIMEOUT', // Request timeout
  NETWORK_ERROR: 'NETWORK_ERROR', // Connection failed
  DNS_ERROR: 'DNS_ERROR', // DNS resolution failed

  // Authentication
  AUTH_FAILED: 'AUTH_FAILED', // Authentication/session issues

  // Server errors
  SERVER_ERROR: 'SERVER_ERROR', // 5xx responses
  SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE', // 503

  // Configuration issues
  INVALID_CONFIG: 'INVALID_CONFIG', // Bad store configuration
  MISSING_PLATFORM_ID: 'MISSING_PLATFORM_ID', // No platform_dispensary_id

  // Unknown
  UNKNOWN_ERROR: 'UNKNOWN_ERROR', // Catch-all for unclassified errors
} as const;

/** Union of every string literal value declared in `CrawlErrorCode`. */
export type CrawlErrorCodeType = (typeof CrawlErrorCode)[keyof typeof CrawlErrorCode];

// ============================================================
// ERROR CLASSIFICATION
// ============================================================

/**
 * Error metadata for each error code.
 *
 * Exported because it appears in the signatures of the exported
 * `ERROR_METADATA` table and `getErrorMetadata`, so consumers need to be able
 * to name it (and declaration emit requires it).
 */
export interface ErrorMetadata {
  /** The error code this entry describes. */
  code: CrawlErrorCodeType;
  /** Whether an operation that failed with this code may be retried. */
  retryable: boolean;
  /** Whether the proxy should be rotated for this error. */
  rotateProxy: boolean;
  /** Whether the user agent should be rotated for this error. */
  rotateUserAgent: boolean;
  /**
   * Backoff multiplier associated with this error.
   * NOTE(review): presumably a factor applied to the retry delay by the retry
   * manager; how it is consumed is not visible in this module.
   */
  backoffMultiplier: number;
  /** Severity bucket for this error. */
  severity: 'low' | 'medium' | 'high' | 'critical';
  /** Human-readable description of the error. */
  description: string;
}

/**
 * Metadata for each error code - defines retry behavior.
 *
 * The `Record<CrawlErrorCodeType, ...>` annotation makes the compiler reject
 * this table if any error code is missing an entry.
 */
export const ERROR_METADATA: Record<CrawlErrorCodeType, ErrorMetadata> = {
  [CrawlErrorCode.SUCCESS]: {
    code: CrawlErrorCode.SUCCESS,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'low',
    description: 'Crawl completed successfully',
  },

  [CrawlErrorCode.RATE_LIMITED]: {
    code: CrawlErrorCode.RATE_LIMITED,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'medium',
    description: 'Rate limited by target (429)',
  },

  [CrawlErrorCode.BLOCKED_PROXY]: {
    code: CrawlErrorCode.BLOCKED_PROXY,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: true,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Proxy blocked or rejected (407)',
  },

  [CrawlErrorCode.PROXY_TIMEOUT]: {
    code: CrawlErrorCode.PROXY_TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'Proxy connection timed out',
  },

  [CrawlErrorCode.HTML_CHANGED]: {
    code: CrawlErrorCode.HTML_CHANGED,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Page structure changed - needs selector update',
  },

  [CrawlErrorCode.NO_PRODUCTS]: {
    code: CrawlErrorCode.NO_PRODUCTS,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'low',
    description: 'No products returned (may be temporary)',
  },

  [CrawlErrorCode.PARSE_ERROR]: {
    code: CrawlErrorCode.PARSE_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Failed to parse response data',
  },

  [CrawlErrorCode.TIMEOUT]: {
    code: CrawlErrorCode.TIMEOUT,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Request timed out',
  },

  [CrawlErrorCode.NETWORK_ERROR]: {
    code: CrawlErrorCode.NETWORK_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'Network connection failed',
  },

  [CrawlErrorCode.DNS_ERROR]: {
    code: CrawlErrorCode.DNS_ERROR,
    retryable: true,
    rotateProxy: true,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'medium',
    description: 'DNS resolution failed',
  },

  [CrawlErrorCode.AUTH_FAILED]: {
    code: CrawlErrorCode.AUTH_FAILED,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: true,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Authentication or session failed',
  },

  [CrawlErrorCode.SERVER_ERROR]: {
    code: CrawlErrorCode.SERVER_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.5,
    severity: 'medium',
    description: 'Server error (5xx)',
  },

  [CrawlErrorCode.SERVICE_UNAVAILABLE]: {
    code: CrawlErrorCode.SERVICE_UNAVAILABLE,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 2.0,
    severity: 'high',
    description: 'Service temporarily unavailable (503)',
  },

  [CrawlErrorCode.INVALID_CONFIG]: {
    code: CrawlErrorCode.INVALID_CONFIG,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Invalid store configuration',
  },

  [CrawlErrorCode.MISSING_PLATFORM_ID]: {
    code: CrawlErrorCode.MISSING_PLATFORM_ID,
    retryable: false,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 0,
    severity: 'critical',
    description: 'Missing platform_dispensary_id',
  },

  [CrawlErrorCode.UNKNOWN_ERROR]: {
    code: CrawlErrorCode.UNKNOWN_ERROR,
    retryable: true,
    rotateProxy: false,
    rotateUserAgent: false,
    backoffMultiplier: 1.0,
    severity: 'high',
    description: 'Unknown/unclassified error',
  },
};

// ============================================================
// ERROR CLASSIFICATION FUNCTIONS
// ============================================================

/**
 * Classify an error into a standardized error code.
 *
 * Resolution order:
 *  1. `httpStatus`, when it is one of the recognised values
 *     (429, 407, 401/403, 503, or any other status >= 500). Other statuses
 *     (e.g. 404) fall through to the message heuristics.
 *  2. Case-insensitive substring heuristics on the error message, evaluated in
 *     a fixed order - the first matching group wins.
 *  3. `UNKNOWN_ERROR` when no error is supplied or nothing matches.
 *
 * @param error - The error to classify (Error object, message string, or null)
 * @param httpStatus - Optional HTTP status code
 * @returns Standardized error code
 */
export function classifyError(
  error: Error | string | null,
  httpStatus?: number
): CrawlErrorCodeType {
  // Check HTTP status first
  if (httpStatus) {
    if (httpStatus === 429) return CrawlErrorCode.RATE_LIMITED;
    if (httpStatus === 407) return CrawlErrorCode.BLOCKED_PROXY;
    if (httpStatus === 401 || httpStatus === 403) return CrawlErrorCode.AUTH_FAILED;
    // 503 must be tested before the generic 5xx bucket below.
    if (httpStatus === 503) return CrawlErrorCode.SERVICE_UNAVAILABLE;
    if (httpStatus >= 500) return CrawlErrorCode.SERVER_ERROR;
  }

  if (!error) return CrawlErrorCode.UNKNOWN_ERROR;

  // Values caught at runtime are not guaranteed to be real Error instances, so
  // tolerate a missing or non-string `message` instead of throwing from the classifier.
  const rawMessage = typeof error === 'string' ? error : error.message;
  const message = typeof rawMessage === 'string' ? rawMessage.toLowerCase() : '';
  const mentions = (...needles: string[]): boolean =>
    needles.some((needle) => message.includes(needle));

  // Rate limiting patterns
  if (mentions('rate limit', 'too many requests', '429')) {
    return CrawlErrorCode.RATE_LIMITED;
  }

  // Proxy patterns
  if (mentions('proxy') && mentions('block', 'reject', '407')) {
    return CrawlErrorCode.BLOCKED_PROXY;
  }

  // Timeout patterns
  if (mentions('timeout', 'timed out', 'etimedout')) {
    return mentions('proxy') ? CrawlErrorCode.PROXY_TIMEOUT : CrawlErrorCode.TIMEOUT;
  }

  // Network patterns
  if (mentions('econnrefused', 'econnreset', 'network')) {
    return CrawlErrorCode.NETWORK_ERROR;
  }

  // DNS patterns
  if (mentions('enotfound', 'dns', 'getaddrinfo')) {
    return CrawlErrorCode.DNS_ERROR;
  }

  // Auth patterns
  if (mentions('auth', 'unauthorized', 'forbidden', '401', '403')) {
    return CrawlErrorCode.AUTH_FAILED;
  }

  // HTML change patterns
  if (mentions('selector', 'element not found', 'structure changed')) {
    return CrawlErrorCode.HTML_CHANGED;
  }

  // Parse patterns
  if (mentions('parse', 'json', 'syntax')) {
    return CrawlErrorCode.PARSE_ERROR;
  }

  // No products patterns
  if (mentions('no products', 'empty', '0 products')) {
    return CrawlErrorCode.NO_PRODUCTS;
  }

  // Server error patterns. A 503 in the message maps to SERVICE_UNAVAILABLE so
  // that message-based and status-based classification agree.
  if (mentions('503', 'service unavailable')) {
    return CrawlErrorCode.SERVICE_UNAVAILABLE;
  }
  if (mentions('500', '502', '504')) {
    return CrawlErrorCode.SERVER_ERROR;
  }

  // Config patterns
  if (mentions('config', 'invalid', 'missing')) {
    return mentions('platform', 'dispensary_id')
      ? CrawlErrorCode.MISSING_PLATFORM_ID
      : CrawlErrorCode.INVALID_CONFIG;
  }

  return CrawlErrorCode.UNKNOWN_ERROR;
}

/**
 * Get metadata for an error code.
 *
 * Falls back to the `UNKNOWN_ERROR` entry when `code` has no entry of its own
 * in `ERROR_METADATA`. The lookup uses an own-property check so that inherited
 * object keys such as `'constructor'` or `'toString'` (which can arrive at
 * runtime despite the parameter type) are treated as unknown rather than
 * returning members of `Object.prototype`.
 *
 * @param code - Error code to look up
 * @returns The metadata entry for `code`, or the `UNKNOWN_ERROR` entry
 */
export function getErrorMetadata(code: CrawlErrorCodeType): ErrorMetadata {
  const isKnownCode = Object.prototype.hasOwnProperty.call(ERROR_METADATA, code);
  return isKnownCode ? ERROR_METADATA[code] : ERROR_METADATA[CrawlErrorCode.UNKNOWN_ERROR];
}

/**
 * Check if an error is retryable.
 *
 * @param code - Error code to inspect
 * @returns The `retryable` flag of the metadata resolved by `getErrorMetadata`
 */
export function isRetryable(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).retryable;
}

/**
 * Check if proxy should be rotated for this error.
 *
 * @param code - Error code to inspect
 * @returns The `rotateProxy` flag of the metadata resolved by `getErrorMetadata`
 */
export function shouldRotateProxy(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateProxy;
}

/**
 * Check if user agent should be rotated for this error.
 *
 * @param code - Error code to inspect
 * @returns The `rotateUserAgent` flag of the metadata resolved by `getErrorMetadata`
 */
export function shouldRotateUserAgent(code: CrawlErrorCodeType): boolean {
  return getErrorMetadata(code).rotateUserAgent;
}

/**
 * Get backoff multiplier for this error.
 *
 * @param code - Error code to inspect
 * @returns The `backoffMultiplier` of the metadata resolved by `getErrorMetadata`
 */
export function getBackoffMultiplier(code: CrawlErrorCodeType): number {
  return getErrorMetadata(code).backoffMultiplier;
}

// ============================================================
// CRAWL RESULT TYPE
// ============================================================

/**
 * Standardized crawl result with error taxonomy.
 *
 * Produced by `createSuccessResult` and `createFailureResult`; `errorCode` is
 * always populated (it is `SUCCESS` for successful crawls).
 */
export interface CrawlResult {
  /** True for a successful crawl, false for a failed one. */
  success: boolean;
  /** Identifier of the dispensary that was crawled. */
  dispensaryId: number;

  // Error info
  /** Taxonomy code for the outcome. */
  errorCode: CrawlErrorCodeType;
  /** Raw error message, when one is available. */
  errorMessage?: string;
  /** HTTP status associated with the failure, when known. */
  httpStatus?: number;

  // Timing
  /** When the crawl attempt started. */
  startedAt: Date;
  /** When the crawl attempt finished. */
  finishedAt: Date;
  /** `finishedAt - startedAt`, in milliseconds. */
  durationMs: number;

  // Context
  /** Attempt number for this crawl (the factory helpers default it to 1). */
  attemptNumber: number;
  /** Proxy used for the attempt, if any. */
  proxyUsed?: string;
  /** User agent used for the attempt, if any. */
  userAgentUsed?: string;

  // Metrics (on success)
  productsFound?: number;
  productsUpserted?: number;
  snapshotsCreated?: number;
  imagesDownloaded?: number;

  // Metadata
  /** Free-form extra data attached to the result. */
  metadata?: Record<string, any>;
}

/**
 * Create a success result.
 *
 * `finishedAt` is captured at call time and `durationMs` is derived from it.
 *
 * @param dispensaryId - Dispensary that was crawled
 * @param startedAt - When the crawl attempt started
 * @param metrics - Crawl counters copied onto the result
 * @param context - Optional attempt context (attempt number, proxy, user agent)
 * @returns A `CrawlResult` with `success: true` and `errorCode: SUCCESS`
 */
export function createSuccessResult(
  dispensaryId: number,
  startedAt: Date,
  metrics: {
    productsFound: number;
    productsUpserted: number;
    snapshotsCreated: number;
    imagesDownloaded?: number;
  },
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  const durationMs = finishedAt.getTime() - startedAt.getTime();

  return {
    success: true,
    dispensaryId,
    errorCode: CrawlErrorCode.SUCCESS,
    startedAt,
    finishedAt,
    durationMs,
    // `||` (not `??`) on purpose: a missing or zero attempt number becomes 1.
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
    ...metrics,
  };
}

/**
 * Create a failure result.
 *
 * The error is classified with `classifyError(error, httpStatus)`;
 * `finishedAt` is captured at call time and `durationMs` is derived from it.
 *
 * `error` also accepts `null`, matching `classifyError`, so a nullish rejection
 * value caught at runtime produces a result instead of throwing here. In that
 * case `errorMessage` is left undefined.
 *
 * @param dispensaryId - Dispensary that was crawled
 * @param startedAt - When the crawl attempt started
 * @param error - The failure (Error object, message string, or null)
 * @param httpStatus - Optional HTTP status associated with the failure
 * @param context - Optional attempt context (attempt number, proxy, user agent)
 * @returns A `CrawlResult` with `success: false` and the classified error code
 */
export function createFailureResult(
  dispensaryId: number,
  startedAt: Date,
  error: Error | string | null,
  httpStatus?: number,
  context?: {
    attemptNumber?: number;
    proxyUsed?: string;
    userAgentUsed?: string;
  }
): CrawlResult {
  const finishedAt = new Date();
  const errorCode = classifyError(error, httpStatus);
  const errorMessage = typeof error === 'string' ? error : error?.message;

  return {
    success: false,
    dispensaryId,
    errorCode,
    errorMessage,
    httpStatus,
    startedAt,
    finishedAt,
    durationMs: finishedAt.getTime() - startedAt.getTime(),
    // `||` (not `??`) on purpose: a missing or zero attempt number becomes 1.
    attemptNumber: context?.attemptNumber || 1,
    proxyUsed: context?.proxyUsed,
    userAgentUsed: context?.userAgentUsed,
  };
}

// ============================================================
// LOGGING HELPERS
// ============================================================

/**
 * Format a crawl result as a single log line.
 *
 * - Success: `[CODE] Crawl successful: N products[ via PROXY]`
 * - Failure: `[CODE] MESSAGE[ via PROXY] (retryable|non-retryable)`
 *
 * Missing optional fields never leak the literal text "undefined" into the
 * log: an absent `productsFound` is printed as 0, and an absent or empty
 * `errorMessage` falls back to the description from the error metadata.
 *
 * @param result - The crawl result to describe
 * @returns The formatted log line
 */
export function formatErrorForLog(result: CrawlResult): string {
  const proxyInfo = result.proxyUsed ? ` via ${result.proxyUsed}` : '';

  if (result.success) {
    const productCount = result.productsFound ?? 0;
    return `[${result.errorCode}] Crawl successful: ${productCount} products${proxyInfo}`;
  }

  const metadata = getErrorMetadata(result.errorCode);
  const retryInfo = metadata.retryable ? '(retryable)' : '(non-retryable)';
  const reason = result.errorMessage || metadata.description;

  return `[${result.errorCode}] ${reason}${proxyInfo} ${retryInfo}`;
}

/**
 * Get user-friendly error description.
 *
 * @param code - Error code to describe
 * @returns The `description` of the metadata resolved by `getErrorMetadata`
 */
export function getErrorDescription(code: CrawlErrorCodeType): string {
  return getErrorMetadata(code).description;
}