Major additions: - Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare - Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator - Discovery system: dutchie discovery service, geo validation, city seeding scripts - Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages - Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram) - Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata Frontend pages added: - Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores - StateHeatmap, CrossStateCompare, SyncInfoPanel Components added: - StateSelector, OrchestratorTraceModal, WorkflowStepper 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
487 lines
18 KiB
TypeScript
487 lines
18 KiB
TypeScript
#!/usr/bin/env npx tsx
|
|
/**
|
|
* Crawler Reliability Stress Test
|
|
*
|
|
* Simulates various failure scenarios to test:
|
|
* - Retry logic with exponential backoff
|
|
* - Error taxonomy classification
|
|
* - Self-healing (proxy/UA rotation)
|
|
* - Status transitions (active -> degraded -> failed)
|
|
* - Minimum crawl gap enforcement
|
|
*
|
|
* Phase 1: Crawler Reliability & Stabilization
|
|
*
|
|
* Usage:
|
|
* DATABASE_URL="postgresql://..." npx tsx src/dutchie-az/scripts/stress-test.ts [test-name]
|
|
*
|
|
 * Available tests:
 *   retry      - Test retry manager with various error types
 *   backoff    - Test exponential backoff calculation
 *   status     - Test status transitions
 *   gap        - Test minimum crawl gap enforcement
 *   rotation   - Test proxy/UA rotation (alias: ua)
 *   error      - Test error taxonomy classification (alias: classification)
 *   validation - Test store config validation (alias: store)
 *   helper     - Test the withRetry helper (alias: withRetry)
 *   metadata   - Test error metadata lookup
 *   all        - Run all tests (default when omitted)
 */
|
|
|
|
import {
|
|
CrawlErrorCode,
|
|
classifyError,
|
|
isRetryable,
|
|
shouldRotateProxy,
|
|
shouldRotateUserAgent,
|
|
getBackoffMultiplier,
|
|
getErrorMetadata,
|
|
} from '../services/error-taxonomy';
|
|
|
|
import {
|
|
RetryManager,
|
|
withRetry,
|
|
calculateNextCrawlDelay,
|
|
calculateNextCrawlAt,
|
|
determineCrawlStatus,
|
|
shouldAttemptRecovery,
|
|
sleep,
|
|
} from '../services/retry-manager';
|
|
|
|
import {
|
|
UserAgentRotator,
|
|
USER_AGENTS,
|
|
} from '../services/proxy-rotator';
|
|
|
|
import {
|
|
validateStoreConfig,
|
|
isCrawlable,
|
|
DEFAULT_CONFIG,
|
|
RawStoreConfig,
|
|
} from '../services/store-validator';
|
|
|
|
// ============================================================
|
|
// TEST UTILITIES
|
|
// ============================================================
|
|
|
|
let testsPassed = 0;
|
|
let testsFailed = 0;
|
|
|
|
function assert(condition: boolean, message: string): void {
|
|
if (condition) {
|
|
console.log(` ✓ ${message}`);
|
|
testsPassed++;
|
|
} else {
|
|
console.log(` ✗ ${message}`);
|
|
testsFailed++;
|
|
}
|
|
}
|
|
|
|
function section(name: string): void {
|
|
console.log(`\n${'='.repeat(60)}`);
|
|
console.log(`TEST: ${name}`);
|
|
console.log('='.repeat(60));
|
|
}
|
|
|
|
// ============================================================
|
|
// TEST: Error Classification
|
|
// ============================================================
|
|
|
|
function testErrorClassification(): void {
|
|
section('Error Classification');
|
|
|
|
// HTTP status codes
|
|
assert(classifyError(null, 429) === CrawlErrorCode.RATE_LIMITED, '429 -> RATE_LIMITED');
|
|
assert(classifyError(null, 407) === CrawlErrorCode.BLOCKED_PROXY, '407 -> BLOCKED_PROXY');
|
|
assert(classifyError(null, 401) === CrawlErrorCode.AUTH_FAILED, '401 -> AUTH_FAILED');
|
|
assert(classifyError(null, 403) === CrawlErrorCode.AUTH_FAILED, '403 -> AUTH_FAILED');
|
|
assert(classifyError(null, 503) === CrawlErrorCode.SERVICE_UNAVAILABLE, '503 -> SERVICE_UNAVAILABLE');
|
|
assert(classifyError(null, 500) === CrawlErrorCode.SERVER_ERROR, '500 -> SERVER_ERROR');
|
|
|
|
// Error messages
|
|
assert(classifyError('rate limit exceeded') === CrawlErrorCode.RATE_LIMITED, 'rate limit message -> RATE_LIMITED');
|
|
assert(classifyError('request timed out') === CrawlErrorCode.TIMEOUT, 'timeout message -> TIMEOUT');
|
|
assert(classifyError('proxy blocked') === CrawlErrorCode.BLOCKED_PROXY, 'proxy blocked -> BLOCKED_PROXY');
|
|
assert(classifyError('ECONNREFUSED') === CrawlErrorCode.NETWORK_ERROR, 'ECONNREFUSED -> NETWORK_ERROR');
|
|
assert(classifyError('ENOTFOUND') === CrawlErrorCode.DNS_ERROR, 'ENOTFOUND -> DNS_ERROR');
|
|
assert(classifyError('selector not found') === CrawlErrorCode.HTML_CHANGED, 'selector error -> HTML_CHANGED');
|
|
assert(classifyError('JSON parse error') === CrawlErrorCode.PARSE_ERROR, 'parse error -> PARSE_ERROR');
|
|
assert(classifyError('0 products found') === CrawlErrorCode.NO_PRODUCTS, 'no products -> NO_PRODUCTS');
|
|
|
|
// Retryability
|
|
assert(isRetryable(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED is retryable');
|
|
assert(isRetryable(CrawlErrorCode.TIMEOUT) === true, 'TIMEOUT is retryable');
|
|
assert(isRetryable(CrawlErrorCode.HTML_CHANGED) === false, 'HTML_CHANGED is NOT retryable');
|
|
assert(isRetryable(CrawlErrorCode.INVALID_CONFIG) === false, 'INVALID_CONFIG is NOT retryable');
|
|
|
|
// Rotation decisions
|
|
assert(shouldRotateProxy(CrawlErrorCode.BLOCKED_PROXY) === true, 'BLOCKED_PROXY -> rotate proxy');
|
|
assert(shouldRotateProxy(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED -> rotate proxy');
|
|
assert(shouldRotateUserAgent(CrawlErrorCode.AUTH_FAILED) === true, 'AUTH_FAILED -> rotate UA');
|
|
}
|
|
|
|
// ============================================================
|
|
// TEST: Retry Manager
|
|
// ============================================================
|
|
|
|
/**
 * Exercises RetryManager's attempt accounting, retry decisions,
 * reset behavior, and the immediate stop on a non-retryable error.
 */
function testRetryManager(): void {
  section('Retry Manager');

  const manager = new RetryManager({ maxRetries: 3, baseBackoffMs: 100 });

  // Initial state: nothing recorded yet, so the first attempt is allowed.
  assert(manager.shouldAttempt() === true, 'Should attempt initially');
  assert(manager.getAttemptNumber() === 1, 'Attempt number starts at 1');

  // First attempt recorded -> counter advances.
  manager.recordAttempt();
  assert(manager.getAttemptNumber() === 2, 'Attempt number increments');

  // Evaluate a retryable error: 429 classifies as RATE_LIMITED, which
  // is retryable and also asks for a proxy rotation with positive backoff.
  const decision1 = manager.evaluateError(new Error('rate limit exceeded'), 429);
  assert(decision1.shouldRetry === true, 'Should retry on rate limit');
  assert(decision1.errorCode === CrawlErrorCode.RATE_LIMITED, 'Error code is RATE_LIMITED');
  assert(decision1.rotateProxy === true, 'Should rotate proxy');
  assert(decision1.backoffMs > 0, 'Backoff is positive');

  // Record the 2nd and 3rd attempts.
  manager.recordAttempt();
  manager.recordAttempt();

  // Now at max retries: at the limit but not past it, so one more retry
  // is still permitted.
  const decision2 = manager.evaluateError(new Error('timeout'), 504);
  assert(decision2.shouldRetry === true, 'Should still retry (at limit but not exceeded)');

  // 4th attempt exceeds maxRetries=3 -> the budget is exhausted.
  manager.recordAttempt();
  const decision3 = manager.evaluateError(new Error('timeout'));
  assert(decision3.shouldRetry === false, 'Should NOT retry after max');
  assert(decision3.reason.includes('exhausted'), 'Reason mentions exhausted');

  // Reset restores the manager to its initial state.
  manager.reset();
  assert(manager.shouldAttempt() === true, 'Should attempt after reset');
  assert(manager.getAttemptNumber() === 1, 'Attempt number resets');

  // Non-retryable error (classified HTML_CHANGED) stops immediately,
  // regardless of how much retry budget remains.
  const manager2 = new RetryManager({ maxRetries: 3 });
  manager2.recordAttempt();
  const nonRetryable = manager2.evaluateError(new Error('HTML structure changed'));
  assert(nonRetryable.shouldRetry === false, 'Non-retryable error stops immediately');
  assert(nonRetryable.errorCode === CrawlErrorCode.HTML_CHANGED, 'Error code is HTML_CHANGED');
}
|
|
|
|
// ============================================================
|
|
// TEST: Exponential Backoff
|
|
// ============================================================
|
|
|
|
function testExponentialBackoff(): void {
|
|
section('Exponential Backoff');
|
|
|
|
// Calculate next crawl delay
|
|
const delay0 = calculateNextCrawlDelay(0, 240); // No failures
|
|
const delay1 = calculateNextCrawlDelay(1, 240); // 1 failure
|
|
const delay2 = calculateNextCrawlDelay(2, 240); // 2 failures
|
|
const delay3 = calculateNextCrawlDelay(3, 240); // 3 failures
|
|
const delay5 = calculateNextCrawlDelay(5, 240); // 5 failures (should cap)
|
|
|
|
console.log(` Delay with 0 failures: ${delay0} minutes`);
|
|
console.log(` Delay with 1 failure: ${delay1} minutes`);
|
|
console.log(` Delay with 2 failures: ${delay2} minutes`);
|
|
console.log(` Delay with 3 failures: ${delay3} minutes`);
|
|
console.log(` Delay with 5 failures: ${delay5} minutes`);
|
|
|
|
assert(delay1 > delay0, 'Delay increases with failures');
|
|
assert(delay2 > delay1, 'Delay keeps increasing');
|
|
assert(delay3 > delay2, 'More delay with more failures');
|
|
// With jitter, exact values vary but ratio should be close to 2x
|
|
assert(delay5 <= 240 * 4 * 1.2, 'Delay is capped at max multiplier');
|
|
|
|
// Next crawl time calculation
|
|
const now = new Date();
|
|
const nextAt = calculateNextCrawlAt(2, 240);
|
|
assert(nextAt > now, 'Next crawl is in future');
|
|
assert(nextAt.getTime() - now.getTime() > 240 * 60 * 1000, 'Includes backoff');
|
|
}
|
|
|
|
// ============================================================
|
|
// TEST: Status Transitions
|
|
// ============================================================
|
|
|
|
function testStatusTransitions(): void {
|
|
section('Status Transitions');
|
|
|
|
// Active status
|
|
assert(determineCrawlStatus(0) === 'active', '0 failures -> active');
|
|
assert(determineCrawlStatus(1) === 'active', '1 failure -> active');
|
|
assert(determineCrawlStatus(2) === 'active', '2 failures -> active');
|
|
|
|
// Degraded status
|
|
assert(determineCrawlStatus(3) === 'degraded', '3 failures -> degraded');
|
|
assert(determineCrawlStatus(5) === 'degraded', '5 failures -> degraded');
|
|
assert(determineCrawlStatus(9) === 'degraded', '9 failures -> degraded');
|
|
|
|
// Failed status
|
|
assert(determineCrawlStatus(10) === 'failed', '10 failures -> failed');
|
|
assert(determineCrawlStatus(15) === 'failed', '15 failures -> failed');
|
|
|
|
// Custom thresholds
|
|
const customStatus = determineCrawlStatus(5, { degraded: 5, failed: 8 });
|
|
assert(customStatus === 'degraded', 'Custom threshold: 5 -> degraded');
|
|
|
|
// Recovery check
|
|
const recentFailure = new Date(Date.now() - 1 * 60 * 60 * 1000); // 1 hour ago
|
|
const oldFailure = new Date(Date.now() - 48 * 60 * 60 * 1000); // 48 hours ago
|
|
|
|
assert(shouldAttemptRecovery(recentFailure, 1) === false, 'No recovery for recent failure');
|
|
assert(shouldAttemptRecovery(oldFailure, 1) === true, 'Recovery allowed for old failure');
|
|
assert(shouldAttemptRecovery(null, 0) === true, 'Recovery allowed if no previous failure');
|
|
}
|
|
|
|
// ============================================================
|
|
// TEST: Store Validation
|
|
// ============================================================
|
|
|
|
function testStoreValidation(): void {
|
|
section('Store Validation');
|
|
|
|
// Valid config
|
|
const validConfig: RawStoreConfig = {
|
|
id: 1,
|
|
name: 'Test Store',
|
|
platformDispensaryId: '123abc',
|
|
menuType: 'dutchie',
|
|
};
|
|
const validResult = validateStoreConfig(validConfig);
|
|
assert(validResult.isValid === true, 'Valid config passes');
|
|
assert(validResult.config !== null, 'Valid config returns config');
|
|
assert(validResult.config?.slug === 'test-store', 'Slug is generated');
|
|
|
|
// Missing required fields
|
|
const missingId: RawStoreConfig = {
|
|
id: 0,
|
|
name: 'Test',
|
|
platformDispensaryId: '123',
|
|
menuType: 'dutchie',
|
|
};
|
|
const missingIdResult = validateStoreConfig(missingId);
|
|
assert(missingIdResult.isValid === false, 'Missing ID fails');
|
|
|
|
// Missing platform ID
|
|
const missingPlatform: RawStoreConfig = {
|
|
id: 1,
|
|
name: 'Test',
|
|
menuType: 'dutchie',
|
|
};
|
|
const missingPlatformResult = validateStoreConfig(missingPlatform);
|
|
assert(missingPlatformResult.isValid === false, 'Missing platform ID fails');
|
|
|
|
// Unknown menu type
|
|
const unknownMenu: RawStoreConfig = {
|
|
id: 1,
|
|
name: 'Test',
|
|
platformDispensaryId: '123',
|
|
menuType: 'unknown',
|
|
};
|
|
const unknownMenuResult = validateStoreConfig(unknownMenu);
|
|
assert(unknownMenuResult.isValid === false, 'Unknown menu type fails');
|
|
|
|
// Crawlable check
|
|
assert(isCrawlable(validConfig) === true, 'Valid config is crawlable');
|
|
assert(isCrawlable(missingPlatform) === false, 'Missing platform not crawlable');
|
|
assert(isCrawlable({ ...validConfig, crawlStatus: 'failed' }) === false, 'Failed status not crawlable');
|
|
assert(isCrawlable({ ...validConfig, crawlStatus: 'paused' }) === false, 'Paused status not crawlable');
|
|
}
|
|
|
|
// ============================================================
|
|
// TEST: User Agent Rotation
|
|
// ============================================================
|
|
|
|
function testUserAgentRotation(): void {
|
|
section('User Agent Rotation');
|
|
|
|
const rotator = new UserAgentRotator();
|
|
|
|
const first = rotator.getCurrent();
|
|
const second = rotator.getNext();
|
|
const third = rotator.getNext();
|
|
|
|
assert(first !== second, 'User agents rotate');
|
|
assert(second !== third, 'User agents keep rotating');
|
|
assert(USER_AGENTS.includes(first), 'Returns valid UA');
|
|
assert(USER_AGENTS.includes(second), 'Returns valid UA');
|
|
|
|
// Random UA
|
|
const random = rotator.getRandom();
|
|
assert(USER_AGENTS.includes(random), 'Random returns valid UA');
|
|
|
|
// Count
|
|
assert(rotator.getCount() === USER_AGENTS.length, 'Reports correct count');
|
|
}
|
|
|
|
// ============================================================
|
|
// TEST: WithRetry Helper
|
|
// ============================================================
|
|
|
|
/**
 * Exercises the withRetry wrapper end-to-end: immediate success,
 * success after transient failures, retry exhaustion (throws), and
 * immediate stop on a non-retryable error.
 */
async function testWithRetryHelper(): Promise<void> {
  section('WithRetry Helper');

  // Successful on first try: the wrapped operation runs exactly once.
  let attempts = 0;
  const successResult = await withRetry(async () => {
    attempts++;
    return 'success';
  }, { maxRetries: 3 });
  assert(attempts === 1, 'Succeeds on first try');
  assert(successResult.result === 'success', 'Returns result');

  // Fails twice with a retryable error, then succeeds on the 3rd attempt.
  let failThenSucceedAttempts = 0;
  const failThenSuccessResult = await withRetry(async () => {
    failThenSucceedAttempts++;
    if (failThenSucceedAttempts < 3) {
      throw new Error('temporary error');
    }
    return 'finally succeeded';
  }, { maxRetries: 5, baseBackoffMs: 10 });
  assert(failThenSucceedAttempts === 3, 'Retries until success');
  assert(failThenSuccessResult.result === 'finally succeeded', 'Returns final result');
  assert(failThenSuccessResult.summary.attemptsMade === 3, 'Summary tracks attempts');

  // Exhausts retries: 1 initial attempt + 2 retries, then rethrows as
  // a RetryExhaustedError.
  let alwaysFailAttempts = 0;
  try {
    await withRetry(async () => {
      alwaysFailAttempts++;
      throw new Error('always fails');
    }, { maxRetries: 2, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch (error: any) {
    assert(alwaysFailAttempts === 3, 'Attempts all retries'); // 1 initial + 2 retries
    assert(error.name === 'RetryExhaustedError', 'Throws RetryExhaustedError');
  }

  // Non-retryable error (message classifies as HTML_CHANGED) stops
  // immediately — no retries are attempted.
  let nonRetryableAttempts = 0;
  try {
    await withRetry(async () => {
      nonRetryableAttempts++;
      const err = new Error('HTML structure changed - selector not found');
      throw err;
    }, { maxRetries: 3, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch {
    assert(nonRetryableAttempts === 1, 'Non-retryable stops immediately');
  }
}
|
|
|
|
// ============================================================
|
|
// TEST: Minimum Crawl Gap
|
|
// ============================================================
|
|
|
|
function testMinimumCrawlGap(): void {
|
|
section('Minimum Crawl Gap');
|
|
|
|
// Default config
|
|
assert(DEFAULT_CONFIG.minCrawlGapMinutes === 2, 'Default gap is 2 minutes');
|
|
assert(DEFAULT_CONFIG.crawlFrequencyMinutes === 240, 'Default frequency is 4 hours');
|
|
|
|
// Gap calculation
|
|
const gapMs = DEFAULT_CONFIG.minCrawlGapMinutes * 60 * 1000;
|
|
assert(gapMs === 120000, 'Gap is 2 minutes in ms');
|
|
|
|
console.log(' Note: Gap enforcement is tested at DB level (trigger) and application level');
|
|
}
|
|
|
|
// ============================================================
|
|
// TEST: Error Metadata
|
|
// ============================================================
|
|
|
|
function testErrorMetadata(): void {
|
|
section('Error Metadata');
|
|
|
|
// RATE_LIMITED
|
|
const rateLimited = getErrorMetadata(CrawlErrorCode.RATE_LIMITED);
|
|
assert(rateLimited.retryable === true, 'RATE_LIMITED is retryable');
|
|
assert(rateLimited.rotateProxy === true, 'RATE_LIMITED rotates proxy');
|
|
assert(rateLimited.backoffMultiplier === 2.0, 'RATE_LIMITED has 2x backoff');
|
|
assert(rateLimited.severity === 'medium', 'RATE_LIMITED is medium severity');
|
|
|
|
// HTML_CHANGED
|
|
const htmlChanged = getErrorMetadata(CrawlErrorCode.HTML_CHANGED);
|
|
assert(htmlChanged.retryable === false, 'HTML_CHANGED is NOT retryable');
|
|
assert(htmlChanged.severity === 'high', 'HTML_CHANGED is high severity');
|
|
|
|
// INVALID_CONFIG
|
|
const invalidConfig = getErrorMetadata(CrawlErrorCode.INVALID_CONFIG);
|
|
assert(invalidConfig.retryable === false, 'INVALID_CONFIG is NOT retryable');
|
|
assert(invalidConfig.severity === 'critical', 'INVALID_CONFIG is critical');
|
|
}
|
|
|
|
// ============================================================
|
|
// MAIN
|
|
// ============================================================
|
|
|
|
async function runTests(testName?: string): Promise<void> {
|
|
console.log('\n');
|
|
console.log('╔══════════════════════════════════════════════════════════╗');
|
|
console.log('║ CRAWLER RELIABILITY STRESS TEST - PHASE 1 ║');
|
|
console.log('╚══════════════════════════════════════════════════════════╝');
|
|
|
|
const allTests = !testName || testName === 'all';
|
|
|
|
if (allTests || testName === 'error' || testName === 'classification') {
|
|
testErrorClassification();
|
|
}
|
|
|
|
if (allTests || testName === 'retry') {
|
|
testRetryManager();
|
|
}
|
|
|
|
if (allTests || testName === 'backoff') {
|
|
testExponentialBackoff();
|
|
}
|
|
|
|
if (allTests || testName === 'status') {
|
|
testStatusTransitions();
|
|
}
|
|
|
|
if (allTests || testName === 'validation' || testName === 'store') {
|
|
testStoreValidation();
|
|
}
|
|
|
|
if (allTests || testName === 'rotation' || testName === 'ua') {
|
|
testUserAgentRotation();
|
|
}
|
|
|
|
if (allTests || testName === 'withRetry' || testName === 'helper') {
|
|
await testWithRetryHelper();
|
|
}
|
|
|
|
if (allTests || testName === 'gap') {
|
|
testMinimumCrawlGap();
|
|
}
|
|
|
|
if (allTests || testName === 'metadata') {
|
|
testErrorMetadata();
|
|
}
|
|
|
|
// Summary
|
|
console.log('\n');
|
|
console.log('═'.repeat(60));
|
|
console.log('SUMMARY');
|
|
console.log('═'.repeat(60));
|
|
console.log(` Passed: ${testsPassed}`);
|
|
console.log(` Failed: ${testsFailed}`);
|
|
console.log(` Total: ${testsPassed + testsFailed}`);
|
|
|
|
if (testsFailed > 0) {
|
|
console.log('\n❌ SOME TESTS FAILED\n');
|
|
process.exit(1);
|
|
} else {
|
|
console.log('\n✅ ALL TESTS PASSED\n');
|
|
process.exit(0);
|
|
}
|
|
}
|
|
|
|
// Run tests
|
|
const testName = process.argv[2];
|
|
runTests(testName).catch((error) => {
|
|
console.error('Fatal error:', error);
|
|
process.exit(1);
|
|
});
|