Files
cannaiq/backend/src/dutchie-az/scripts/stress-test.ts
Kelly b4a2fb7d03 feat: Add v2 architecture with multi-state support and orchestrator services
Major additions:
- Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare
- Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator
- Discovery system: dutchie discovery service, geo validation, city seeding scripts
- Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages
- Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram)
- Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata

Frontend pages added:
- Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores
- StateHeatmap, CrossStateCompare, SyncInfoPanel

Components added:
- StateSelector, OrchestratorTraceModal, WorkflowStepper

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-07 11:30:57 -07:00

487 lines
18 KiB
TypeScript

#!/usr/bin/env npx tsx
/**
* Crawler Reliability Stress Test
*
* Simulates various failure scenarios to test:
* - Retry logic with exponential backoff
* - Error taxonomy classification
* - Self-healing (proxy/UA rotation)
* - Status transitions (active -> degraded -> failed)
* - Minimum crawl gap enforcement
*
* Phase 1: Crawler Reliability & Stabilization
*
* Usage:
* DATABASE_URL="postgresql://..." npx tsx src/dutchie-az/scripts/stress-test.ts [test-name]
*
* Available tests:
* error | classification - Test error taxonomy classification
* retry - Test retry manager with various error types
* backoff - Test exponential backoff calculation
* status - Test status transitions
* validation | store - Test store config validation
* rotation | ua - Test proxy/UA rotation
* withRetry | helper - Test the withRetry helper
* gap - Test minimum crawl gap enforcement
* metadata - Test error metadata lookup
* all - Run all tests
*/
import {
CrawlErrorCode,
classifyError,
isRetryable,
shouldRotateProxy,
shouldRotateUserAgent,
getBackoffMultiplier,
getErrorMetadata,
} from '../services/error-taxonomy';
import {
RetryManager,
withRetry,
calculateNextCrawlDelay,
calculateNextCrawlAt,
determineCrawlStatus,
shouldAttemptRecovery,
sleep,
} from '../services/retry-manager';
import {
UserAgentRotator,
USER_AGENTS,
} from '../services/proxy-rotator';
import {
validateStoreConfig,
isCrawlable,
DEFAULT_CONFIG,
RawStoreConfig,
} from '../services/store-validator';
// ============================================================
// TEST UTILITIES
// ============================================================
let testsPassed = 0;
let testsFailed = 0;

/**
 * Record one test expectation and print the result.
 *
 * Fix: both branches previously printed the bare `message`, so a failing
 * assertion was visually indistinguishable from a passing one in the log.
 * Prefix with ✅/❌ to match the emoji convention already used in the
 * run summary.
 *
 * @param condition - Outcome of the check; true counts as a pass.
 * @param message - Human-readable description of the expectation.
 */
function assert(condition: boolean, message: string): void {
  if (condition) {
    console.log(`✅ ${message}`);
    testsPassed++;
  } else {
    console.log(`❌ ${message}`);
    testsFailed++;
  }
}
/** Print a 60-column banner separating one named test group from the next. */
function section(name: string): void {
  const rule = '='.repeat(60);
  console.log(`\n${rule}`);
  console.log(`TEST: ${name}`);
  console.log(rule);
}
// ============================================================
// TEST: Error Classification
// ============================================================
/**
 * Checks the error-taxonomy service: HTTP status and message-text
 * classification, retryability flags, and proxy/UA rotation decisions.
 */
function testErrorClassification(): void {
  section('Error Classification');

  // HTTP status codes map onto specific crawl error codes.
  const statusCases: Array<[number, CrawlErrorCode, string]> = [
    [429, CrawlErrorCode.RATE_LIMITED, '429 -> RATE_LIMITED'],
    [407, CrawlErrorCode.BLOCKED_PROXY, '407 -> BLOCKED_PROXY'],
    [401, CrawlErrorCode.AUTH_FAILED, '401 -> AUTH_FAILED'],
    [403, CrawlErrorCode.AUTH_FAILED, '403 -> AUTH_FAILED'],
    [503, CrawlErrorCode.SERVICE_UNAVAILABLE, '503 -> SERVICE_UNAVAILABLE'],
    [500, CrawlErrorCode.SERVER_ERROR, '500 -> SERVER_ERROR'],
  ];
  for (const [status, expected, label] of statusCases) {
    assert(classifyError(null, status) === expected, label);
  }

  // Free-text error messages are classified by their content.
  const messageCases: Array<[string, CrawlErrorCode, string]> = [
    ['rate limit exceeded', CrawlErrorCode.RATE_LIMITED, 'rate limit message -> RATE_LIMITED'],
    ['request timed out', CrawlErrorCode.TIMEOUT, 'timeout message -> TIMEOUT'],
    ['proxy blocked', CrawlErrorCode.BLOCKED_PROXY, 'proxy blocked -> BLOCKED_PROXY'],
    ['ECONNREFUSED', CrawlErrorCode.NETWORK_ERROR, 'ECONNREFUSED -> NETWORK_ERROR'],
    ['ENOTFOUND', CrawlErrorCode.DNS_ERROR, 'ENOTFOUND -> DNS_ERROR'],
    ['selector not found', CrawlErrorCode.HTML_CHANGED, 'selector error -> HTML_CHANGED'],
    ['JSON parse error', CrawlErrorCode.PARSE_ERROR, 'parse error -> PARSE_ERROR'],
    ['0 products found', CrawlErrorCode.NO_PRODUCTS, 'no products -> NO_PRODUCTS'],
  ];
  for (const [message, expected, label] of messageCases) {
    assert(classifyError(message) === expected, label);
  }

  // Retryability flags per error code.
  assert(isRetryable(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED is retryable');
  assert(isRetryable(CrawlErrorCode.TIMEOUT) === true, 'TIMEOUT is retryable');
  assert(isRetryable(CrawlErrorCode.HTML_CHANGED) === false, 'HTML_CHANGED is NOT retryable');
  assert(isRetryable(CrawlErrorCode.INVALID_CONFIG) === false, 'INVALID_CONFIG is NOT retryable');

  // Proxy / user-agent rotation decisions.
  assert(shouldRotateProxy(CrawlErrorCode.BLOCKED_PROXY) === true, 'BLOCKED_PROXY -> rotate proxy');
  assert(shouldRotateProxy(CrawlErrorCode.RATE_LIMITED) === true, 'RATE_LIMITED -> rotate proxy');
  assert(shouldRotateUserAgent(CrawlErrorCode.AUTH_FAILED) === true, 'AUTH_FAILED -> rotate UA');
}
// ============================================================
// TEST: Retry Manager
// ============================================================
/**
 * Walks a RetryManager through its full lifecycle: initial state, attempt
 * counting, per-error retry decisions, exhaustion at maxRetries, reset, and
 * the non-retryable fast path.
 *
 * NOTE(review): statements are order-dependent — each recordAttempt()
 * advances internal state that the following evaluateError() reads.
 */
function testRetryManager(): void {
section('Retry Manager');
// Small baseBackoffMs keeps the test fast; maxRetries: 3 means retries are
// exhausted once more than 3 attempts have been recorded (see decision3 below).
const manager = new RetryManager({ maxRetries: 3, baseBackoffMs: 100 });
// Initial state
assert(manager.shouldAttempt() === true, 'Should attempt initially');
assert(manager.getAttemptNumber() === 1, 'Attempt number starts at 1');
// First attempt
manager.recordAttempt();
assert(manager.getAttemptNumber() === 2, 'Attempt number increments');
// Evaluate retryable error: a 429 should retry, classify as RATE_LIMITED,
// request a proxy rotation, and carry a positive backoff.
const decision1 = manager.evaluateError(new Error('rate limit exceeded'), 429);
assert(decision1.shouldRetry === true, 'Should retry on rate limit');
assert(decision1.errorCode === CrawlErrorCode.RATE_LIMITED, 'Error code is RATE_LIMITED');
assert(decision1.rotateProxy === true, 'Should rotate proxy');
assert(decision1.backoffMs > 0, 'Backoff is positive');
// More attempts (now 3 recorded in total)
manager.recordAttempt();
manager.recordAttempt();
// Now at max retries: at the limit the manager still allows one more retry.
const decision2 = manager.evaluateError(new Error('timeout'), 504);
assert(decision2.shouldRetry === true, 'Should still retry (at limit but not exceeded)');
// 4th recorded attempt pushes past maxRetries — retries are now exhausted.
manager.recordAttempt();
const decision3 = manager.evaluateError(new Error('timeout'));
assert(decision3.shouldRetry === false, 'Should NOT retry after max');
assert(decision3.reason.includes('exhausted'), 'Reason mentions exhausted');
// Reset restores the initial state.
manager.reset();
assert(manager.shouldAttempt() === true, 'Should attempt after reset');
assert(manager.getAttemptNumber() === 1, 'Attempt number resets');
// Non-retryable error stops immediately regardless of remaining budget.
const manager2 = new RetryManager({ maxRetries: 3 });
manager2.recordAttempt();
const nonRetryable = manager2.evaluateError(new Error('HTML structure changed'));
assert(nonRetryable.shouldRetry === false, 'Non-retryable error stops immediately');
assert(nonRetryable.errorCode === CrawlErrorCode.HTML_CHANGED, 'Error code is HTML_CHANGED');
}
// ============================================================
// TEST: Exponential Backoff
// ============================================================
/**
 * Verifies that calculateNextCrawlDelay() grows the delay monotonically with
 * the consecutive-failure count and caps it, and that calculateNextCrawlAt()
 * schedules the next crawl in the future including that backoff.
 *
 * NOTE(review): delays appear to include jitter (see the cap assertion's
 * 1.2 tolerance), so assertions check ordering and bounds, not exact values.
 */
function testExponentialBackoff(): void {
section('Exponential Backoff');
// Calculate next crawl delay at a 240-minute base frequency for increasing
// failure counts.
const delay0 = calculateNextCrawlDelay(0, 240); // No failures
const delay1 = calculateNextCrawlDelay(1, 240); // 1 failure
const delay2 = calculateNextCrawlDelay(2, 240); // 2 failures
const delay3 = calculateNextCrawlDelay(3, 240); // 3 failures
const delay5 = calculateNextCrawlDelay(5, 240); // 5 failures (should cap)
console.log(` Delay with 0 failures: ${delay0} minutes`);
console.log(` Delay with 1 failure: ${delay1} minutes`);
console.log(` Delay with 2 failures: ${delay2} minutes`);
console.log(` Delay with 3 failures: ${delay3} minutes`);
console.log(` Delay with 5 failures: ${delay5} minutes`);
// Monotonic growth with failure count.
assert(delay1 > delay0, 'Delay increases with failures');
assert(delay2 > delay1, 'Delay keeps increasing');
assert(delay3 > delay2, 'More delay with more failures');
// With jitter, exact values vary but ratio should be close to 2x.
// Cap check: 4x max multiplier plus 20% jitter headroom.
assert(delay5 <= 240 * 4 * 1.2, 'Delay is capped at max multiplier');
// Next crawl time calculation: must land in the future and include backoff
// beyond the plain 240-minute base interval.
const now = new Date();
const nextAt = calculateNextCrawlAt(2, 240);
assert(nextAt > now, 'Next crawl is in future');
assert(nextAt.getTime() - now.getTime() > 240 * 60 * 1000, 'Includes backoff');
}
// ============================================================
// TEST: Status Transitions
// ============================================================
/**
 * Checks determineCrawlStatus() thresholds (active / degraded / failed),
 * custom threshold overrides, and shouldAttemptRecovery() gating.
 */
function testStatusTransitions(): void {
  section('Status Transitions');

  // Default thresholds: fewer than 3 failures is active, 3-9 degraded, 10+ failed.
  const defaultCases: Array<[number, string, string]> = [
    [0, 'active', '0 failures -> active'],
    [1, 'active', '1 failure -> active'],
    [2, 'active', '2 failures -> active'],
    [3, 'degraded', '3 failures -> degraded'],
    [5, 'degraded', '5 failures -> degraded'],
    [9, 'degraded', '9 failures -> degraded'],
    [10, 'failed', '10 failures -> failed'],
    [15, 'failed', '15 failures -> failed'],
  ];
  for (const [failures, expected, label] of defaultCases) {
    assert(determineCrawlStatus(failures) === expected, label);
  }

  // Caller-supplied thresholds override the defaults.
  const customStatus = determineCrawlStatus(5, { degraded: 5, failed: 8 });
  assert(customStatus === 'degraded', 'Custom threshold: 5 -> degraded');

  // Recovery gating: only sufficiently old failures become eligible again.
  const oneHourAgo = new Date(Date.now() - 1 * 60 * 60 * 1000);
  const twoDaysAgo = new Date(Date.now() - 48 * 60 * 60 * 1000);
  assert(shouldAttemptRecovery(oneHourAgo, 1) === false, 'No recovery for recent failure');
  assert(shouldAttemptRecovery(twoDaysAgo, 1) === true, 'Recovery allowed for old failure');
  assert(shouldAttemptRecovery(null, 0) === true, 'Recovery allowed if no previous failure');
}
// ============================================================
// TEST: Store Validation
// ============================================================
/**
 * Checks validateStoreConfig() on valid and defective configs (missing id,
 * missing platform id, unknown menu type) and isCrawlable() gating on
 * config validity plus crawl status.
 */
function testStoreValidation(): void {
  section('Store Validation');

  // A fully-populated config should validate and get a generated slug.
  const baseline: RawStoreConfig = {
    id: 1,
    name: 'Test Store',
    platformDispensaryId: '123abc',
    menuType: 'dutchie',
  };
  const okResult = validateStoreConfig(baseline);
  assert(okResult.isValid === true, 'Valid config passes');
  assert(okResult.config !== null, 'Valid config returns config');
  assert(okResult.config?.slug === 'test-store', 'Slug is generated');

  // An id of 0 is treated as missing.
  const noId: RawStoreConfig = {
    id: 0,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'dutchie',
  };
  assert(validateStoreConfig(noId).isValid === false, 'Missing ID fails');

  // Platform dispensary id absent entirely.
  const noPlatform: RawStoreConfig = {
    id: 1,
    name: 'Test',
    menuType: 'dutchie',
  };
  assert(validateStoreConfig(noPlatform).isValid === false, 'Missing platform ID fails');

  // A menu type the crawler does not understand is rejected.
  const badMenu: RawStoreConfig = {
    id: 1,
    name: 'Test',
    platformDispensaryId: '123',
    menuType: 'unknown',
  };
  assert(validateStoreConfig(badMenu).isValid === false, 'Unknown menu type fails');

  // isCrawlable combines config validity with the current crawl status.
  assert(isCrawlable(baseline) === true, 'Valid config is crawlable');
  assert(isCrawlable(noPlatform) === false, 'Missing platform not crawlable');
  assert(isCrawlable({ ...baseline, crawlStatus: 'failed' }) === false, 'Failed status not crawlable');
  assert(isCrawlable({ ...baseline, crawlStatus: 'paused' }) === false, 'Paused status not crawlable');
}
// ============================================================
// TEST: User Agent Rotation
// ============================================================
/**
 * Checks UserAgentRotator: successive getNext() calls return different,
 * valid entries; getRandom() stays within the pool; getCount() matches it.
 */
function testUserAgentRotation(): void {
  section('User Agent Rotation');

  const rotator = new UserAgentRotator();
  const ua1 = rotator.getCurrent();
  const ua2 = rotator.getNext();
  const ua3 = rotator.getNext();

  assert(ua1 !== ua2, 'User agents rotate');
  assert(ua2 !== ua3, 'User agents keep rotating');
  assert(USER_AGENTS.includes(ua1), 'Returns valid UA');
  assert(USER_AGENTS.includes(ua2), 'Returns valid UA');

  // Random selection must still come from the known pool.
  const randomUa = rotator.getRandom();
  assert(USER_AGENTS.includes(randomUa), 'Random returns valid UA');

  // Pool size reported by the rotator matches the exported list.
  assert(rotator.getCount() === USER_AGENTS.length, 'Reports correct count');
}
// ============================================================
// TEST: WithRetry Helper
// ============================================================
/**
 * Checks the withRetry() helper end to end: immediate success, transient
 * failure followed by success, retry exhaustion (RetryExhaustedError), and
 * immediate abort on a non-retryable error.
 */
async function testWithRetryHelper(): Promise<void> {
  section('WithRetry Helper');

  // Case 1: the operation succeeds on the very first call.
  let firstTryCalls = 0;
  const okOutcome = await withRetry(async () => {
    firstTryCalls++;
    return 'success';
  }, { maxRetries: 3 });
  assert(firstTryCalls === 1, 'Succeeds on first try');
  assert(okOutcome.result === 'success', 'Returns result');

  // Case 2: two transient failures, then success on the third call.
  let flakyCalls = 0;
  const flakyOutcome = await withRetry(async () => {
    flakyCalls++;
    if (flakyCalls < 3) {
      throw new Error('temporary error');
    }
    return 'finally succeeded';
  }, { maxRetries: 5, baseBackoffMs: 10 });
  assert(flakyCalls === 3, 'Retries until success');
  assert(flakyOutcome.result === 'finally succeeded', 'Returns final result');
  assert(flakyOutcome.summary.attemptsMade === 3, 'Summary tracks attempts');

  // Case 3: permanent failure exhausts the budget (1 initial + 2 retries).
  let doomedCalls = 0;
  try {
    await withRetry(async () => {
      doomedCalls++;
      throw new Error('always fails');
    }, { maxRetries: 2, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch (error: any) {
    assert(doomedCalls === 3, 'Attempts all retries'); // 1 initial + 2 retries
    assert(error.name === 'RetryExhaustedError', 'Throws RetryExhaustedError');
  }

  // Case 4: a non-retryable error (HTML change) aborts after a single call.
  let fatalCalls = 0;
  try {
    await withRetry(async () => {
      fatalCalls++;
      throw new Error('HTML structure changed - selector not found');
    }, { maxRetries: 3, baseBackoffMs: 10 });
    assert(false, 'Should have thrown');
  } catch {
    assert(fatalCalls === 1, 'Non-retryable stops immediately');
  }
}
// ============================================================
// TEST: Minimum Crawl Gap
// ============================================================
/**
 * Sanity-checks the DEFAULT_CONFIG crawl-gap constants and their
 * millisecond conversion. Actual gap enforcement lives in the DB trigger
 * and application layer, so it is only noted here.
 */
function testMinimumCrawlGap(): void {
  section('Minimum Crawl Gap');

  // Shipped defaults.
  assert(DEFAULT_CONFIG.minCrawlGapMinutes === 2, 'Default gap is 2 minutes');
  assert(DEFAULT_CONFIG.crawlFrequencyMinutes === 240, 'Default frequency is 4 hours');

  // Minutes-to-milliseconds conversion used by enforcement code.
  const gapMillis = DEFAULT_CONFIG.minCrawlGapMinutes * 60 * 1000;
  assert(gapMillis === 120000, 'Gap is 2 minutes in ms');

  console.log(' Note: Gap enforcement is tested at DB level (trigger) and application level');
}
// ============================================================
// TEST: Error Metadata
// ============================================================
/**
 * Checks getErrorMetadata() for three representative error codes:
 * retryability, rotation hints, backoff multiplier, and severity.
 */
function testErrorMetadata(): void {
  section('Error Metadata');

  // Transient rate limiting: retryable with 2x backoff and a proxy rotation.
  const rateLimitMeta = getErrorMetadata(CrawlErrorCode.RATE_LIMITED);
  assert(rateLimitMeta.retryable === true, 'RATE_LIMITED is retryable');
  assert(rateLimitMeta.rotateProxy === true, 'RATE_LIMITED rotates proxy');
  assert(rateLimitMeta.backoffMultiplier === 2.0, 'RATE_LIMITED has 2x backoff');
  assert(rateLimitMeta.severity === 'medium', 'RATE_LIMITED is medium severity');

  // Structural breakage: retrying cannot help; flagged high severity.
  const htmlChangedMeta = getErrorMetadata(CrawlErrorCode.HTML_CHANGED);
  assert(htmlChangedMeta.retryable === false, 'HTML_CHANGED is NOT retryable');
  assert(htmlChangedMeta.severity === 'high', 'HTML_CHANGED is high severity');

  // Bad configuration: never retried and the most severe class.
  const invalidConfigMeta = getErrorMetadata(CrawlErrorCode.INVALID_CONFIG);
  assert(invalidConfigMeta.retryable === false, 'INVALID_CONFIG is NOT retryable');
  assert(invalidConfigMeta.severity === 'critical', 'INVALID_CONFIG is critical');
}
// ============================================================
// MAIN
// ============================================================
/**
 * Dispatch the requested test group(s) and print a pass/fail summary.
 *
 * @param testName - Optional CLI selector; undefined or 'all' runs everything.
 *
 * Exits the process: 0 when every assertion passed, 1 when any failed.
 * Fix: a typo'd or unknown test name previously matched no dispatcher
 * branch, ran zero tests, and exited 0 printing "ALL TESTS PASSED" —
 * now that case is detected and reported as an error.
 */
async function runTests(testName?: string): Promise<void> {
  console.log('\n');
  console.log('╔══════════════════════════════════════════════════════════╗');
  console.log('║ CRAWLER RELIABILITY STRESS TEST - PHASE 1 ║');
  console.log('╚══════════════════════════════════════════════════════════╝');

  // Each group accepts one or more aliases from the CLI.
  const allTests = !testName || testName === 'all';
  if (allTests || testName === 'error' || testName === 'classification') {
    testErrorClassification();
  }
  if (allTests || testName === 'retry') {
    testRetryManager();
  }
  if (allTests || testName === 'backoff') {
    testExponentialBackoff();
  }
  if (allTests || testName === 'status') {
    testStatusTransitions();
  }
  if (allTests || testName === 'validation' || testName === 'store') {
    testStoreValidation();
  }
  if (allTests || testName === 'rotation' || testName === 'ua') {
    testUserAgentRotation();
  }
  if (allTests || testName === 'withRetry' || testName === 'helper') {
    await testWithRetryHelper();
  }
  if (allTests || testName === 'gap') {
    testMinimumCrawlGap();
  }
  if (allTests || testName === 'metadata') {
    testErrorMetadata();
  }

  // Summary
  console.log('\n');
  console.log('═'.repeat(60));
  console.log('SUMMARY');
  console.log('═'.repeat(60));
  console.log(` Passed: ${testsPassed}`);
  console.log(` Failed: ${testsFailed}`);
  console.log(` Total: ${testsPassed + testsFailed}`);

  // Guard: zero tests run means the selector matched nothing.
  if (testsPassed + testsFailed === 0) {
    console.log(`\n❌ UNKNOWN TEST NAME: "${testName}" — no tests were run.`);
    console.log(' Valid names: error, classification, retry, backoff, status, validation, store, rotation, ua, withRetry, helper, gap, metadata, all');
    process.exit(1);
  }

  if (testsFailed > 0) {
    console.log('\n❌ SOME TESTS FAILED\n');
    process.exit(1);
  } else {
    console.log('\n✅ ALL TESTS PASSED\n');
    process.exit(0);
  }
}
// Entry point: the first CLI argument selects which test group to run.
const requestedTest = process.argv[2];
runTests(requestedTest).catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});