package transport

import (
	"context"
	"fmt"
	"math"
	"runtime"
	"sort"
	"sync"
	"sync/atomic"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
	"github.com/fraktal/mev-beta/pkg/security"
)

// BenchmarkSuite provides comprehensive performance testing for the transport layer.
type BenchmarkSuite struct {
	logger     *logger.Logger
	messageBus *UniversalMessageBus
	results    []BenchmarkResult
	config     BenchmarkConfig
	metrics    BenchmarkMetrics
	mu         sync.RWMutex
}

// BenchmarkConfig configures benchmark parameters.
type BenchmarkConfig struct {
	MessageSizes         []int                 // Message payload sizes to test
	Concurrency          []int                 // Concurrency levels to test
	Duration             time.Duration         // Duration of each benchmark
	WarmupDuration       time.Duration         // Warmup period before measurements
	TransportTypes       []TransportType       // Transport types to benchmark
	MessageTypes         []MessageType         // Message types to test
	SerializationFormats []SerializationFormat // Serialization formats to test
	EnableMetrics        bool                  // Whether to collect detailed metrics
	OutputFormat         string                // Output format (json, csv, console)
}

// BenchmarkResult contains the results of a single benchmark run.
type BenchmarkResult struct {
	TestName          string              `json:"test_name"`
	Transport         TransportType       `json:"transport"`
	MessageSize       int                 `json:"message_size"`
	Concurrency       int                 `json:"concurrency"`
	Serialization     SerializationFormat `json:"serialization"`
	Duration          time.Duration       `json:"duration"`
	MessagesSent      int64               `json:"messages_sent"`
	MessagesReceived  int64               `json:"messages_received"`
	BytesSent         int64               `json:"bytes_sent"`
	BytesReceived     int64               `json:"bytes_received"`
	ThroughputMsgSec  float64             `json:"throughput_msg_sec"`
	ThroughputByteSec float64             `json:"throughput_byte_sec"`
	LatencyP50        time.Duration       `json:"latency_p50"`
	LatencyP95        time.Duration       `json:"latency_p95"`
	LatencyP99        time.Duration       `json:"latency_p99"`
	ErrorRate         float64             `json:"error_rate"`
	CPUUsage          float64             `json:"cpu_usage"`
	MemoryUsage       int64               `json:"memory_usage"`
	GCPauses          int64               `json:"gc_pauses"`
	Timestamp         time.Time           `json:"timestamp"`
}

// BenchmarkMetrics tracks overall benchmark statistics.
type BenchmarkMetrics struct {
	TotalTests        int           `json:"total_tests"`
	PassedTests       int           `json:"passed_tests"`
	FailedTests       int           `json:"failed_tests"`
	TotalDuration     time.Duration `json:"total_duration"`
	HighestThroughput float64       `json:"highest_throughput"`
	LowestLatency     time.Duration `json:"lowest_latency"`
	BestTransport     TransportType `json:"best_transport"`
	Timestamp         time.Time     `json:"timestamp"`
}

// LatencyTracker tracks message latencies.
type LatencyTracker struct {
	latencies []time.Duration
	mu        sync.Mutex
}

// NewBenchmarkSuite creates a new benchmark suite with default configuration.
func NewBenchmarkSuite(messageBus *UniversalMessageBus, logger *logger.Logger) *BenchmarkSuite {
	return &BenchmarkSuite{
		logger:     logger,
		messageBus: messageBus,
		results:    make([]BenchmarkResult, 0),
		config: BenchmarkConfig{
			MessageSizes:         []int{64, 256, 1024, 4096, 16384},
			Concurrency:          []int{1, 10, 50, 100},
			Duration:             30 * time.Second,
			WarmupDuration:       5 * time.Second,
			TransportTypes:       []TransportType{TransportMemory, TransportUnixSocket, TransportTCP},
			MessageTypes:         []MessageType{MessageTypeEvent, MessageTypeCommand},
			SerializationFormats: []SerializationFormat{SerializationJSON},
			EnableMetrics:        true,
			OutputFormat:         "console",
		},
	}
}

// SetConfig updates the benchmark configuration.
func (bs *BenchmarkSuite) SetConfig(config BenchmarkConfig) {
	bs.mu.Lock()
	defer bs.mu.Unlock()
	bs.config = config
}
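// Example (an illustrative sketch, not part of the suite's API surface):
// narrowing the default matrix for a quick local smoke run. The values below
// are placeholders, not tuned recommendations.
//
//	suite.SetConfig(BenchmarkConfig{
//		MessageSizes:         []int{256},
//		Concurrency:          []int{1, 8},
//		Duration:             5 * time.Second,
//		WarmupDuration:       time.Second,
//		TransportTypes:       []TransportType{TransportMemory},
//		MessageTypes:         []MessageType{MessageTypeEvent},
//		SerializationFormats: []SerializationFormat{SerializationJSON},
//		EnableMetrics:        true,
//		OutputFormat:         "console",
//	})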
// RunAll executes the full benchmark matrix: every combination of transport,
// message size, concurrency level, and serialization format in the config.
func (bs *BenchmarkSuite) RunAll(ctx context.Context) error {
	bs.mu.Lock()
	defer bs.mu.Unlock()

	startTime := time.Now()
	bs.metrics = BenchmarkMetrics{
		Timestamp: startTime,
	}

	for _, transport := range bs.config.TransportTypes {
		for _, msgSize := range bs.config.MessageSizes {
			for _, concurrency := range bs.config.Concurrency {
				for _, serialization := range bs.config.SerializationFormats {
					result, err := bs.runSingleBenchmark(ctx, transport, msgSize, concurrency, serialization)
					if err != nil {
						bs.logger.Warn("benchmark run failed", "transport", transport, "error", err)
						bs.metrics.FailedTests++
						continue
					}
					bs.results = append(bs.results, result)
					bs.metrics.PassedTests++
					bs.updateBestMetrics(result)
				}
			}
		}
	}

	bs.metrics.TotalTests = bs.metrics.PassedTests + bs.metrics.FailedTests
	bs.metrics.TotalDuration = time.Since(startTime)
	return nil
}

// RunThroughputBenchmark measures message throughput for a single configuration.
func (bs *BenchmarkSuite) RunThroughputBenchmark(ctx context.Context, transport TransportType, messageSize int, concurrency int) (BenchmarkResult, error) {
	return bs.runSingleBenchmark(ctx, transport, messageSize, concurrency, SerializationJSON)
}

// RunLatencyBenchmark measures message latency with a single sender.
func (bs *BenchmarkSuite) RunLatencyBenchmark(ctx context.Context, transport TransportType, messageSize int) (BenchmarkResult, error) {
	return bs.runSingleBenchmark(ctx, transport, messageSize, 1, SerializationJSON)
}

// RunScalabilityBenchmark measures how throughput scales across the configured concurrency levels.
func (bs *BenchmarkSuite) RunScalabilityBenchmark(ctx context.Context, transport TransportType, messageSize int) ([]BenchmarkResult, error) {
	var results []BenchmarkResult
	for _, concurrency := range bs.config.Concurrency {
		result, err := bs.runSingleBenchmark(ctx, transport, messageSize, concurrency, SerializationJSON)
		if err != nil {
			return nil, fmt.Errorf("scalability benchmark failed at concurrency %d: %w", concurrency, err)
		}
		results = append(results, result)
	}
	return results, nil
}

// GetResults returns a copy of all benchmark results.
func (bs *BenchmarkSuite) GetResults() []BenchmarkResult {
	bs.mu.RLock()
	defer bs.mu.RUnlock()
	results := make([]BenchmarkResult, len(bs.results))
	copy(results, bs.results)
	return results
}

// GetMetrics returns the aggregate benchmark metrics.
func (bs *BenchmarkSuite) GetMetrics() BenchmarkMetrics {
	bs.mu.RLock()
	defer bs.mu.RUnlock()
	return bs.metrics
}

// GetBestPerformingTransport returns the transport with the highest observed throughput.
func (bs *BenchmarkSuite) GetBestPerformingTransport() TransportType {
	bs.mu.RLock()
	defer bs.mu.RUnlock()
	return bs.metrics.BestTransport
}

// Private methods

func (bs *BenchmarkSuite) runSingleBenchmark(ctx context.Context, transport TransportType, messageSize int, concurrency int, serialization SerializationFormat) (BenchmarkResult, error) {
	testName := fmt.Sprintf("%s_%db_%dc_%s", transport, messageSize, concurrency, serialization)
	result := BenchmarkResult{
		TestName:      testName,
		Transport:     transport,
		MessageSize:   messageSize,
		Concurrency:   concurrency,
		Serialization: serialization,
		Duration:      bs.config.Duration,
		Timestamp:     time.Now(),
	}

	// Setup test environment.
	latencyTracker := &LatencyTracker{
		latencies: make([]time.Duration, 0),
	}

	// Use per-run atomic counters rather than writing into fields of the
	// result struct: the subscription handler can still fire after the
	// benchmark window closes, and updating result while it is copied out
	// on return would be a data race.
	var received, bytesReceived int64

	// Create a dedicated topic for this test run.
	topic := fmt.Sprintf("benchmark_%s", testName)

	// Subscribe to the topic.
	subscription, err := bs.messageBus.Subscribe(topic, func(ctx context.Context, msg *Message) error {
		if startTime, ok := msg.Metadata["start_time"].(time.Time); ok {
			latencyTracker.AddLatency(time.Since(startTime))
		}
		atomic.AddInt64(&received, 1)
		atomic.AddInt64(&bytesReceived, int64(messageSize))
		return nil
	})
	if err != nil {
		return result, fmt.Errorf("failed to subscribe: %w", err)
	}
	defer bs.messageBus.Unsubscribe(subscription.ID)

	// Warmup phase.
	if bs.config.WarmupDuration > 0 {
		bs.warmup(ctx, topic, messageSize, concurrency, bs.config.WarmupDuration)
	}

	// Start system monitoring.
	var cpuUsage float64
	var memUsageBefore, memUsageAfter runtime.MemStats
	runtime.ReadMemStats(&memUsageBefore)

	monitorCtx, monitorCancel := context.WithCancel(ctx)
	defer monitorCancel()
	go bs.monitorSystemResources(monitorCtx, &cpuUsage)

	// Main benchmark.
	startTime := time.Now()
	benchmarkCtx, cancel := context.WithTimeout(ctx, bs.config.Duration)
	defer cancel()

	// Launch concurrent senders.
	var wg sync.WaitGroup
	var totalSent int64
	var totalErrors int64
	for i := 0; i < concurrency; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			bs.senderWorker(benchmarkCtx, topic, messageSize, &totalSent, &totalErrors)
		}()
	}
	wg.Wait()

	// Measure the send window before draining, so the drain pause does not
	// deflate the throughput figures.
	actualDuration := time.Since(startTime)

	// Give in-flight messages a short window to be processed.
	time.Sleep(100 * time.Millisecond)
	runtime.ReadMemStats(&memUsageAfter)

	// Calculate results.
	result.MessagesSent = totalSent
	result.MessagesReceived = atomic.LoadInt64(&received)
	result.BytesSent = totalSent * int64(messageSize)
	result.BytesReceived = atomic.LoadInt64(&bytesReceived)
	result.ThroughputMsgSec = float64(totalSent) / actualDuration.Seconds()
	result.ThroughputByteSec = float64(result.BytesSent) / actualDuration.Seconds()
	if totalSent > 0 {
		result.ErrorRate = float64(totalErrors) / float64(totalSent) * 100
	}
	result.CPUUsage = cpuUsage

	// Calculate the memory usage difference safely. Alloc can shrink across
	// the run (GC may free more than was allocated), so guard against
	// unsigned underflow before converting.
	if memUsageAfter.Alloc >= memUsageBefore.Alloc {
		memDiff := memUsageAfter.Alloc - memUsageBefore.Alloc
		memDiffInt64, err := security.SafeUint64ToInt64(memDiff)
		if err != nil {
			bs.logger.Warn("Memory usage difference exceeds int64 max", "diff", memDiff, "error", err)
			result.MemoryUsage = math.MaxInt64
		} else {
			result.MemoryUsage = memDiffInt64
		}
	}

	// GC cycles that occurred during the run.
	result.GCPauses = int64(memUsageAfter.NumGC) - int64(memUsageBefore.NumGC)

	// Latency percentiles. GetPercentile locks internally and returns 0 when
	// no samples were recorded, so it is safe to call unconditionally even if
	// stragglers are still being delivered.
	result.LatencyP50 = latencyTracker.GetPercentile(50)
	result.LatencyP95 = latencyTracker.GetPercentile(95)
	result.LatencyP99 = latencyTracker.GetPercentile(99)

	return result, nil
}
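// A note on interpreting the numbers above (explanatory, with an illustrative
// sketch): latency is one-way publish-to-handler time, computed from the
// "start_time" stamp that senderWorker places in message metadata. Sender and
// receiver share a process clock, so no skew correction is needed, and the
// figure includes serialization, transport, and handler-dispatch overhead.
// Because the post-run drain window is only best-effort, MessagesReceived can
// trail MessagesSent:
//
//	res, err := suite.RunLatencyBenchmark(ctx, TransportMemory, 1024)
//	if err == nil && res.MessagesReceived < res.MessagesSent {
//		// some messages were still in flight when the drain window closed
//	}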
func (bs *BenchmarkSuite) warmup(ctx context.Context, topic string, messageSize int, concurrency int, duration time.Duration) {
	warmupCtx, cancel := context.WithTimeout(ctx, duration)
	defer cancel()

	var wg sync.WaitGroup
	for i := 0; i < concurrency; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Discard warmup counters; only steady-state numbers matter.
			var dummySent, dummyErrors int64
			bs.senderWorker(warmupCtx, topic, messageSize, &dummySent, &dummyErrors)
		}()
	}
	wg.Wait()
}

func (bs *BenchmarkSuite) senderWorker(ctx context.Context, topic string, messageSize int, totalSent, totalErrors *int64) {
	// Fill the payload with a repeating byte pattern.
	payload := make([]byte, messageSize)
	for i := range payload {
		payload[i] = byte(i % 256)
	}

	for {
		select {
		case <-ctx.Done():
			return
		default:
			msg := NewMessage(MessageTypeEvent, topic, "benchmark", payload)
			msg.Metadata["start_time"] = time.Now()
			if err := bs.messageBus.Publish(ctx, msg); err != nil {
				atomic.AddInt64(totalErrors, 1)
			} else {
				atomic.AddInt64(totalSent, 1)
			}
		}
	}
}

func (bs *BenchmarkSuite) monitorSystemResources(ctx context.Context, cpuUsage *float64) {
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()

	var samples []float64
	startTime := time.Now()

	for {
		select {
		case <-ctx.Done():
			// Report the average of the collected samples.
			if len(samples) > 0 {
				var total float64
				for _, sample := range samples {
					total += sample
				}
				*cpuUsage = total / float64(len(samples))
			}
			return
		case <-ticker.C:
			var stats runtime.MemStats
			runtime.ReadMemStats(&stats)
			// This is a rough proxy for CPU usage derived from GC activity;
			// production code should use OS-specific CPU accounting instead.
			elapsed := time.Since(startTime).Seconds()
			cpuSample := float64(stats.NumGC) / elapsed * 100
			if cpuSample > 100 {
				cpuSample = 100
			}
			samples = append(samples, cpuSample)
		}
	}
}

func (bs *BenchmarkSuite) updateBestMetrics(result BenchmarkResult) {
	if result.ThroughputMsgSec > bs.metrics.HighestThroughput {
		bs.metrics.HighestThroughput = result.ThroughputMsgSec
		bs.metrics.BestTransport = result.Transport
	}
	if bs.metrics.LowestLatency == 0 || result.LatencyP50 < bs.metrics.LowestLatency {
		bs.metrics.LowestLatency = result.LatencyP50
	}
}

// LatencyTracker methods

func (lt *LatencyTracker) AddLatency(latency time.Duration) {
	lt.mu.Lock()
	defer lt.mu.Unlock()
	lt.latencies = append(lt.latencies, latency)
}

// GetPercentile returns the given percentile of the recorded latencies, or 0
// if no samples have been recorded.
func (lt *LatencyTracker) GetPercentile(percentile int) time.Duration {
	lt.mu.Lock()
	defer lt.mu.Unlock()

	if len(lt.latencies) == 0 {
		return 0
	}

	// Sort a copy so the recorded order is preserved. A 30-second run can
	// collect millions of samples, so use the standard library sort rather
	// than an O(n²) insertion sort.
	sorted := make([]time.Duration, len(lt.latencies))
	copy(sorted, lt.latencies)
	sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] })

	// Nearest-rank style index, clamped to the last element.
	index := int(float64(len(sorted)) * float64(percentile) / 100.0)
	if index >= len(sorted) {
		index = len(sorted) - 1
	}
	return sorted[index]
}

// Benchmark report generation

// GenerateReport generates a comprehensive benchmark report.
func (bs *BenchmarkSuite) GenerateReport() BenchmarkReport {
	bs.mu.RLock()
	defer bs.mu.RUnlock()

	report := BenchmarkReport{
		Summary:   bs.generateSummary(),
		Results:   bs.results,
		Metrics:   bs.metrics,
		Config:    bs.config,
		Timestamp: time.Now(),
	}
	report.Analysis = bs.generateAnalysis()
	return report
}

// BenchmarkReport contains a complete benchmark report.
type BenchmarkReport struct {
	Summary   ReportSummary     `json:"summary"`
	Results   []BenchmarkResult `json:"results"`
	Metrics   BenchmarkMetrics  `json:"metrics"`
	Config    BenchmarkConfig   `json:"config"`
	Analysis  ReportAnalysis    `json:"analysis"`
	Timestamp time.Time         `json:"timestamp"`
}

// ReportSummary provides a high-level summary.
type ReportSummary struct {
	TotalTests           int                `json:"total_tests"`
	Duration             time.Duration      `json:"duration"`
	BestThroughput       float64            `json:"best_throughput"`
	BestLatency          time.Duration      `json:"best_latency"`
	RecommendedTransport TransportType      `json:"recommended_transport"`
	TransportRankings    []TransportRanking `json:"transport_rankings"`
}

// TransportRanking ranks transports by performance.
type TransportRanking struct {
	Transport     TransportType `json:"transport"`
	AvgThroughput float64       `json:"avg_throughput"`
	AvgLatency    time.Duration `json:"avg_latency"`
	Score         float64       `json:"score"`
	Rank          int           `json:"rank"`
}

// ReportAnalysis provides detailed analysis.
type ReportAnalysis struct {
	ScalabilityAnalysis    ScalabilityAnalysis `json:"scalability"`
	PerformanceBottlenecks []PerformanceIssue  `json:"bottlenecks"`
	Recommendations        []Recommendation    `json:"recommendations"`
}

// ScalabilityAnalysis analyzes scaling characteristics.
type ScalabilityAnalysis struct {
	LinearScaling      bool    `json:"linear_scaling"`
	ScalingFactor      float64 `json:"scaling_factor"`
	OptimalConcurrency int     `json:"optimal_concurrency"`
}

// PerformanceIssue identifies a performance problem.
type PerformanceIssue struct {
	Issue      string `json:"issue"`
	Severity   string `json:"severity"`
	Impact     string `json:"impact"`
	Suggestion string `json:"suggestion"`
}

// Recommendation provides an optimization suggestion.
type Recommendation struct {
	Category    string `json:"category"`
	Description string `json:"description"`
	Priority    string `json:"priority"`
	Expected    string `json:"expected_improvement"`
}

func (bs *BenchmarkSuite) generateSummary() ReportSummary {
	rankings := bs.calculateTransportRankings()
	return ReportSummary{
		TotalTests:           bs.metrics.TotalTests,
		Duration:             bs.metrics.TotalDuration,
		BestThroughput:       bs.metrics.HighestThroughput,
		BestLatency:          bs.metrics.LowestLatency,
		RecommendedTransport: bs.metrics.BestTransport,
		TransportRankings:    rankings,
	}
}

func (bs *BenchmarkSuite) calculateTransportRankings() []TransportRanking {
	// Group results by transport.
	transportStats := make(map[TransportType][]BenchmarkResult)
	for _, result := range bs.results {
		transportStats[result.Transport] = append(transportStats[result.Transport], result)
	}

	var rankings []TransportRanking
	for transport, results := range transportStats {
		var totalThroughput float64
		var totalLatency time.Duration
		for _, result := range results {
			totalThroughput += result.ThroughputMsgSec
			totalLatency += result.LatencyP50
		}
		avgThroughput := totalThroughput / float64(len(results))
		avgLatency := totalLatency / time.Duration(len(results))

		// Higher throughput and lower latency yield a better score. Clamp the
		// latency denominator to at least 1µs so sub-microsecond averages
		// (e.g. the in-memory transport) do not divide by zero.
		latencyUS := float64(avgLatency.Microseconds())
		if latencyUS < 1 {
			latencyUS = 1
		}
		score := avgThroughput / latencyUS

		rankings = append(rankings, TransportRanking{
			Transport:     transport,
			AvgThroughput: avgThroughput,
			AvgLatency:    avgLatency,
			Score:         score,
		})
	}

	// Sort by score, descending.
	sort.Slice(rankings, func(i, j int) bool { return rankings[i].Score > rankings[j].Score })

	// Assign ranks.
	for i := range rankings {
		rankings[i].Rank = i + 1
	}
	return rankings
}

func (bs *BenchmarkSuite) generateAnalysis() ReportAnalysis {
	return ReportAnalysis{
		ScalabilityAnalysis:    bs.analyzeScalability(),
		PerformanceBottlenecks: bs.identifyBottlenecks(),
		Recommendations:        bs.generateRecommendations(),
	}
}

func (bs *BenchmarkSuite) analyzeScalability() ScalabilityAnalysis {
	if len(bs.results) < 2 {
		return ScalabilityAnalysis{
			LinearScaling:      false,
			ScalingFactor:      0.0,
			OptimalConcurrency: 1,
		}
	}

	// Collect throughput vs. concurrency pairs.
	var throughputData []float64
	var concurrencyData []int
	for _, result := range bs.results {
		if result.Concurrency > 0 && result.Duration > 0 {
			throughput := float64(result.MessagesReceived) / result.Duration.Seconds()
			throughputData = append(throughputData, throughput)
			concurrencyData = append(concurrencyData, result.Concurrency)
		}
	}
	if len(throughputData) < 2 {
		return ScalabilityAnalysis{
			LinearScaling:      false,
			ScalingFactor:      0.0,
			OptimalConcurrency: 1,
		}
	}

	// Compare the observed throughput gain against ideal linear scaling,
	// using the first sample as the baseline. This is a heuristic: results
	// are not separated by transport or message size here.
	maxThroughput := 0.0
	maxThroughputConcurrency := 1
	baseThroughput := throughputData[0]
	baseConcurrency := float64(concurrencyData[0])

	for i, throughput := range throughputData {
		if throughput > maxThroughput {
			maxThroughput = throughput
			maxThroughputConcurrency = concurrencyData[i]
		}
	}

	// Guard against a zero baseline (a run that received no messages).
	if baseThroughput == 0 {
		return ScalabilityAnalysis{
			LinearScaling:      false,
			ScalingFactor:      0.0,
			OptimalConcurrency: maxThroughputConcurrency,
		}
	}

	// Scaling factor: actual peak throughput vs. an ideal linear
	// extrapolation from the baseline.
	idealThroughput := baseThroughput * float64(maxThroughputConcurrency) / baseConcurrency
	actualScalingFactor := maxThroughput / idealThroughput

	// Treat scaling within 20% of ideal as linear.
	linearScaling := actualScalingFactor >= 0.8

	return ScalabilityAnalysis{
		LinearScaling:      linearScaling,
		ScalingFactor:      actualScalingFactor,
		OptimalConcurrency: maxThroughputConcurrency,
	}
}

func (bs *BenchmarkSuite) identifyBottlenecks() []PerformanceIssue {
	var issues []PerformanceIssue

	// Scan results for common performance problems.
	for _, result := range bs.results {
		if result.ErrorRate > 5.0 {
			issues = append(issues, PerformanceIssue{
				Issue:      fmt.Sprintf("High error rate (%0.2f%%) for %s", result.ErrorRate, result.Transport),
				Severity:   "high",
				Impact:     "Reduced reliability and performance",
				Suggestion: "Check transport configuration and network stability",
			})
		}
		if result.LatencyP99 > 100*time.Millisecond {
			issues = append(issues, PerformanceIssue{
				Issue:      fmt.Sprintf("High P99 latency (%v) for %s", result.LatencyP99, result.Transport),
				Severity:   "medium",
				Impact:     "Poor user experience for latency-sensitive operations",
				Suggestion: "Consider a faster transport or cheaper message serialization",
			})
		}
	}
	return issues
}

func (bs *BenchmarkSuite) generateRecommendations() []Recommendation {
	var recommendations []Recommendation

	recommendations = append(recommendations, Recommendation{
		Category:    "Transport Selection",
		Description: fmt.Sprintf("Use %s for best overall performance", bs.metrics.BestTransport),
		Priority:    "high",
		Expected:    "20-50% improvement in throughput",
	})
	recommendations = append(recommendations, Recommendation{
		Category:    "Concurrency",
		Description: "Optimize concurrency level based on workload characteristics",
		Priority:    "medium",
		Expected:    "10-30% improvement in resource utilization",
	})
	return recommendations
}
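// End-to-end usage (a minimal sketch; how the *UniversalMessageBus and
// *logger.Logger arguments are constructed depends on the surrounding
// packages and is assumed here, not shown):
//
//	suite := NewBenchmarkSuite(bus, log)
//	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
//	defer cancel()
//	if err := suite.RunAll(ctx); err != nil {
//		log.Error("benchmark run failed", "error", err)
//	}
//	report := suite.GenerateReport()
//	fmt.Printf("recommended transport: %s\n", report.Summary.RecommendedTransport)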