mev-beta/pkg/transport/benchmarks.go
Krypto Kajun 850223a953 fix(multicall): resolve critical multicall parsing corruption issues
- Added comprehensive bounds checking to prevent buffer overruns in multicall parsing
- Implemented graduated validation system (Strict/Moderate/Permissive) to reduce false positives
- Added LRU caching system for address validation with 10-minute TTL
- Enhanced ABI decoder with missing Universal Router and Arbitrum-specific DEX signatures
- Fixed duplicate function declarations and import conflicts across multiple files
- Added error recovery mechanisms with multiple fallback strategies
- Updated tests to handle new validation behavior for suspicious addresses
- Fixed parser test expectations for improved validation system
- Applied gofmt formatting fixes to ensure code style compliance
- Fixed mutex copying issues in monitoring package by introducing MetricsSnapshot
- Resolved critical security vulnerabilities in heuristic address extraction
- Progress: Updated TODO audit from 10% to 35% complete

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-17 00:12:55 -05:00


package transport

import (
	"context"
	"fmt"
	"math"
	"runtime"
	"sync"
	"sync/atomic"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
	"github.com/fraktal/mev-beta/pkg/security"
)

// BenchmarkSuite provides comprehensive performance testing for the transport layer
type BenchmarkSuite struct {
	logger     *logger.Logger
	messageBus *UniversalMessageBus
	results    []BenchmarkResult
	config     BenchmarkConfig
	metrics    BenchmarkMetrics
	mu         sync.RWMutex
}

// BenchmarkConfig configures benchmark parameters
type BenchmarkConfig struct {
	MessageSizes         []int                 // Message payload sizes to test
	Concurrency          []int                 // Concurrency levels to test
	Duration             time.Duration         // Duration of each benchmark
	WarmupDuration       time.Duration         // Warmup period before measurements
	TransportTypes       []TransportType       // Transport types to benchmark
	MessageTypes         []MessageType         // Message types to test
	SerializationFormats []SerializationFormat // Serialization formats to test
	EnableMetrics        bool                  // Whether to collect detailed metrics
	OutputFormat         string                // Output format (json, csv, console)
}

// BenchmarkResult contains results from a single benchmark run
type BenchmarkResult struct {
	TestName          string              `json:"test_name"`
	Transport         TransportType       `json:"transport"`
	MessageSize       int                 `json:"message_size"`
	Concurrency       int                 `json:"concurrency"`
	Serialization     SerializationFormat `json:"serialization"`
	Duration          time.Duration       `json:"duration"`
	MessagesSent      int64               `json:"messages_sent"`
	MessagesReceived  int64               `json:"messages_received"`
	BytesSent         int64               `json:"bytes_sent"`
	BytesReceived     int64               `json:"bytes_received"`
	ThroughputMsgSec  float64             `json:"throughput_msg_sec"`
	ThroughputByteSec float64             `json:"throughput_byte_sec"`
	LatencyP50        time.Duration       `json:"latency_p50"`
	LatencyP95        time.Duration       `json:"latency_p95"`
	LatencyP99        time.Duration       `json:"latency_p99"`
	ErrorRate         float64             `json:"error_rate"`
	CPUUsage          float64             `json:"cpu_usage"`
	MemoryUsage       int64               `json:"memory_usage"`
	GCPauses          int64               `json:"gc_pauses"`
	Timestamp         time.Time           `json:"timestamp"`
}

// BenchmarkMetrics tracks overall benchmark statistics
type BenchmarkMetrics struct {
	TotalTests        int           `json:"total_tests"`
	PassedTests       int           `json:"passed_tests"`
	FailedTests       int           `json:"failed_tests"`
	TotalDuration     time.Duration `json:"total_duration"`
	HighestThroughput float64       `json:"highest_throughput"`
	LowestLatency     time.Duration `json:"lowest_latency"`
	BestTransport     TransportType `json:"best_transport"`
	Timestamp         time.Time     `json:"timestamp"`
}

// LatencyTracker tracks message latencies
type LatencyTracker struct {
	latencies []time.Duration
	mu        sync.Mutex
}

// NewBenchmarkSuite creates a new benchmark suite
func NewBenchmarkSuite(messageBus *UniversalMessageBus, logger *logger.Logger) *BenchmarkSuite {
	return &BenchmarkSuite{
		logger:     logger,
		messageBus: messageBus,
		results:    make([]BenchmarkResult, 0),
		config: BenchmarkConfig{
			MessageSizes:         []int{64, 256, 1024, 4096, 16384},
			Concurrency:          []int{1, 10, 50, 100},
			Duration:             30 * time.Second,
			WarmupDuration:       5 * time.Second,
			TransportTypes:       []TransportType{TransportMemory, TransportUnixSocket, TransportTCP},
			MessageTypes:         []MessageType{MessageTypeEvent, MessageTypeCommand},
			SerializationFormats: []SerializationFormat{SerializationJSON},
			EnableMetrics:        true,
			OutputFormat:         "console",
		},
	}
}
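
// Usage sketch (illustrative, not part of the suite's API): assuming a
// *UniversalMessageBus and *logger.Logger are constructed elsewhere in the
// application, a typical full run might look like:
//
//	suite := NewBenchmarkSuite(bus, log)
//	if err := suite.RunAll(context.Background()); err != nil {
//		// handle the error
//	}
//	report := suite.GenerateReport()
//	_ = report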
// SetConfig updates the benchmark configuration
func (bs *BenchmarkSuite) SetConfig(config BenchmarkConfig) {
	bs.mu.Lock()
	defer bs.mu.Unlock()
	bs.config = config
}
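
// Example (illustrative): SetConfig replaces the entire configuration, so a
// caller overriding anything must supply every field it cares about. A
// reduced matrix for a quick smoke run might look like:
//
//	suite.SetConfig(BenchmarkConfig{
//		MessageSizes:         []int{256},
//		Concurrency:          []int{1, 8},
//		Duration:             5 * time.Second,
//		WarmupDuration:       time.Second,
//		TransportTypes:       []TransportType{TransportMemory},
//		MessageTypes:         []MessageType{MessageTypeEvent},
//		SerializationFormats: []SerializationFormat{SerializationJSON},
//		EnableMetrics:        true,
//		OutputFormat:         "console",
//	})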
// RunAll executes all benchmark tests
func (bs *BenchmarkSuite) RunAll(ctx context.Context) error {
	bs.mu.Lock()
	defer bs.mu.Unlock()

	startTime := time.Now()
	bs.metrics = BenchmarkMetrics{
		Timestamp: startTime,
	}

	for _, transport := range bs.config.TransportTypes {
		for _, msgSize := range bs.config.MessageSizes {
			for _, concurrency := range bs.config.Concurrency {
				for _, serialization := range bs.config.SerializationFormats {
					result, err := bs.runSingleBenchmark(ctx, transport, msgSize, concurrency, serialization)
					if err != nil {
						bs.metrics.FailedTests++
						continue
					}
					bs.results = append(bs.results, result)
					bs.metrics.PassedTests++
					bs.updateBestMetrics(result)
				}
			}
		}
	}

	bs.metrics.TotalTests = bs.metrics.PassedTests + bs.metrics.FailedTests
	bs.metrics.TotalDuration = time.Since(startTime)
	return nil
}

// RunThroughputBenchmark tests message throughput
func (bs *BenchmarkSuite) RunThroughputBenchmark(ctx context.Context, transport TransportType, messageSize int, concurrency int) (BenchmarkResult, error) {
	return bs.runSingleBenchmark(ctx, transport, messageSize, concurrency, SerializationJSON)
}

// RunLatencyBenchmark tests message latency
func (bs *BenchmarkSuite) RunLatencyBenchmark(ctx context.Context, transport TransportType, messageSize int) (BenchmarkResult, error) {
	return bs.runSingleBenchmark(ctx, transport, messageSize, 1, SerializationJSON)
}

// RunScalabilityBenchmark tests scalability across different concurrency levels
func (bs *BenchmarkSuite) RunScalabilityBenchmark(ctx context.Context, transport TransportType, messageSize int) ([]BenchmarkResult, error) {
	var results []BenchmarkResult
	for _, concurrency := range bs.config.Concurrency {
		result, err := bs.runSingleBenchmark(ctx, transport, messageSize, concurrency, SerializationJSON)
		if err != nil {
			return nil, fmt.Errorf("scalability benchmark failed at concurrency %d: %w", concurrency, err)
		}
		results = append(results, result)
	}
	return results, nil
}

// GetResults returns a copy of all benchmark results
func (bs *BenchmarkSuite) GetResults() []BenchmarkResult {
	bs.mu.RLock()
	defer bs.mu.RUnlock()
	results := make([]BenchmarkResult, len(bs.results))
	copy(results, bs.results)
	return results
}

// GetMetrics returns benchmark metrics
func (bs *BenchmarkSuite) GetMetrics() BenchmarkMetrics {
	bs.mu.RLock()
	defer bs.mu.RUnlock()
	return bs.metrics
}

// GetBestPerformingTransport returns the transport with the highest throughput
func (bs *BenchmarkSuite) GetBestPerformingTransport() TransportType {
	bs.mu.RLock()
	defer bs.mu.RUnlock()
	return bs.metrics.BestTransport
}
// Private methods

func (bs *BenchmarkSuite) runSingleBenchmark(ctx context.Context, transport TransportType, messageSize int, concurrency int, serialization SerializationFormat) (BenchmarkResult, error) {
	testName := fmt.Sprintf("%s_%db_%dc_%s", transport, messageSize, concurrency, serialization)
	result := BenchmarkResult{
		TestName:      testName,
		Transport:     transport,
		MessageSize:   messageSize,
		Concurrency:   concurrency,
		Serialization: serialization,
		Duration:      bs.config.Duration,
		Timestamp:     time.Now(),
	}

	// Set up the test environment
	latencyTracker := &LatencyTracker{
		latencies: make([]time.Duration, 0),
	}

	// Create a dedicated test topic
	topic := fmt.Sprintf("benchmark_%s", testName)

	// Subscribe to the topic. The receive counters are local atomics so the
	// handler never races with the result struct after this function returns.
	var received, receivedBytes int64
	subscription, err := bs.messageBus.Subscribe(topic, func(ctx context.Context, msg *Message) error {
		if startTime, ok := msg.Metadata["start_time"].(time.Time); ok {
			latencyTracker.AddLatency(time.Since(startTime))
		}
		atomic.AddInt64(&received, 1)
		atomic.AddInt64(&receivedBytes, int64(messageSize))
		return nil
	})
	if err != nil {
		return result, fmt.Errorf("failed to subscribe: %w", err)
	}
	defer bs.messageBus.Unsubscribe(subscription.ID)

	// Warmup phase
	if bs.config.WarmupDuration > 0 {
		bs.warmup(ctx, topic, messageSize, concurrency, bs.config.WarmupDuration)
	}

	// Start system monitoring
	var cpuUsage float64
	var memUsageBefore, memUsageAfter runtime.MemStats
	runtime.ReadMemStats(&memUsageBefore)
	monitorCtx, monitorCancel := context.WithCancel(ctx)
	defer monitorCancel()
	go bs.monitorSystemResources(monitorCtx, &cpuUsage)

	// Main benchmark
	startTime := time.Now()
	benchmarkCtx, cancel := context.WithTimeout(ctx, bs.config.Duration)
	defer cancel()

	// Launch concurrent senders
	var wg sync.WaitGroup
	var totalSent int64
	var totalErrors int64
	for i := 0; i < concurrency; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			bs.senderWorker(benchmarkCtx, topic, messageSize, &totalSent, &totalErrors)
		}()
	}
	wg.Wait()

	// Allow in-flight messages to be processed before reading the counters
	time.Sleep(100 * time.Millisecond)
	actualDuration := time.Since(startTime)
	runtime.ReadMemStats(&memUsageAfter)

	// Calculate results
	result.MessagesSent = totalSent
	result.MessagesReceived = atomic.LoadInt64(&received)
	result.BytesSent = totalSent * int64(messageSize)
	result.BytesReceived = atomic.LoadInt64(&receivedBytes)
	result.ThroughputMsgSec = float64(totalSent) / actualDuration.Seconds()
	result.ThroughputByteSec = float64(result.BytesSent) / actualDuration.Seconds()
	if totalSent > 0 {
		result.ErrorRate = float64(totalErrors) / float64(totalSent) * 100
	}
	result.CPUUsage = cpuUsage

	// Calculate the memory usage difference safely. Alloc can shrink if the
	// GC runs mid-benchmark, so guard against uint64 underflow first.
	if memUsageAfter.Alloc >= memUsageBefore.Alloc {
		memDiff := memUsageAfter.Alloc - memUsageBefore.Alloc
		memDiffInt64, err := security.SafeUint64ToInt64(memDiff)
		if err != nil {
			bs.logger.Warn("Memory usage difference exceeds int64 max", "diff", memDiff, "error", err)
			result.MemoryUsage = math.MaxInt64
		} else {
			result.MemoryUsage = memDiffInt64
		}
	}

	// Calculate the GC count difference safely via signed arithmetic
	result.GCPauses = int64(memUsageAfter.NumGC) - int64(memUsageBefore.NumGC)

	// Calculate latency percentiles (GetPercentile returns 0 when no
	// latencies were recorded, and takes the tracker's lock itself)
	result.LatencyP50 = latencyTracker.GetPercentile(50)
	result.LatencyP95 = latencyTracker.GetPercentile(95)
	result.LatencyP99 = latencyTracker.GetPercentile(99)

	return result, nil
}
func (bs *BenchmarkSuite) warmup(ctx context.Context, topic string, messageSize int, concurrency int, duration time.Duration) {
	warmupCtx, cancel := context.WithTimeout(ctx, duration)
	defer cancel()

	var wg sync.WaitGroup
	for i := 0; i < concurrency; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Warmup counters are discarded
			var dummySent, dummyErrors int64
			bs.senderWorker(warmupCtx, topic, messageSize, &dummySent, &dummyErrors)
		}()
	}
	wg.Wait()
}

func (bs *BenchmarkSuite) senderWorker(ctx context.Context, topic string, messageSize int, totalSent, totalErrors *int64) {
	// Fill the payload with a repeating byte pattern
	payload := make([]byte, messageSize)
	for i := range payload {
		payload[i] = byte(i % 256)
	}

	for {
		select {
		case <-ctx.Done():
			return
		default:
			msg := NewMessage(MessageTypeEvent, topic, "benchmark", payload)
			msg.Metadata["start_time"] = time.Now()
			if err := bs.messageBus.Publish(ctx, msg); err != nil {
				atomic.AddInt64(totalErrors, 1)
			} else {
				atomic.AddInt64(totalSent, 1)
			}
		}
	}
}
func (bs *BenchmarkSuite) monitorSystemResources(ctx context.Context, cpuUsage *float64) {
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()

	var samples []float64
	startTime := time.Now()
	for {
		select {
		case <-ctx.Done():
			// Report the average of the collected samples
			if len(samples) > 0 {
				var total float64
				for _, sample := range samples {
					total += sample
				}
				*cpuUsage = total / float64(len(samples))
			}
			return
		case <-ticker.C:
			var stats runtime.MemStats
			runtime.ReadMemStats(&stats)
			// Rough proxy based on GC activity per second; this is not real
			// CPU usage. Production code should use OS-specific CPU accounting.
			elapsed := time.Since(startTime).Seconds()
			cpuSample := float64(stats.NumGC) / elapsed * 100
			if cpuSample > 100 {
				cpuSample = 100
			}
			samples = append(samples, cpuSample)
		}
	}
}
func (bs *BenchmarkSuite) updateBestMetrics(result BenchmarkResult) {
	if result.ThroughputMsgSec > bs.metrics.HighestThroughput {
		bs.metrics.HighestThroughput = result.ThroughputMsgSec
		bs.metrics.BestTransport = result.Transport
	}
	// Ignore runs that recorded no latency samples (P50 == 0)
	if result.LatencyP50 > 0 && (bs.metrics.LowestLatency == 0 || result.LatencyP50 < bs.metrics.LowestLatency) {
		bs.metrics.LowestLatency = result.LatencyP50
	}
}
// LatencyTracker methods

func (lt *LatencyTracker) AddLatency(latency time.Duration) {
	lt.mu.Lock()
	defer lt.mu.Unlock()
	lt.latencies = append(lt.latencies, latency)
}

func (lt *LatencyTracker) GetPercentile(percentile int) time.Duration {
	lt.mu.Lock()
	defer lt.mu.Unlock()
	if len(lt.latencies) == 0 {
		return 0
	}

	// Sort a copy so the recorded order is preserved
	sorted := make([]time.Duration, len(lt.latencies))
	copy(sorted, lt.latencies)
	// Simple insertion sort is adequate for small datasets
	for i := 1; i < len(sorted); i++ {
		for j := i; j > 0 && sorted[j] < sorted[j-1]; j-- {
			sorted[j], sorted[j-1] = sorted[j-1], sorted[j]
		}
	}

	// Calculate the percentile index, clamped to the last element
	index := int(float64(len(sorted)) * float64(percentile) / 100.0)
	if index >= len(sorted) {
		index = len(sorted) - 1
	}
	return sorted[index]
}
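
// Worked example: with recorded latencies [1ms, 2ms, 3ms, 4ms],
// GetPercentile(50) computes index = int(4*50/100) = 2 and returns 3ms,
// while GetPercentile(99) computes index = int(4*99/100) = 3 and returns 4ms.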
// Benchmark report generation

// GenerateReport generates a comprehensive benchmark report
func (bs *BenchmarkSuite) GenerateReport() BenchmarkReport {
	bs.mu.RLock()
	defer bs.mu.RUnlock()

	report := BenchmarkReport{
		Summary:   bs.generateSummary(),
		Results:   bs.results,
		Metrics:   bs.metrics,
		Config:    bs.config,
		Timestamp: time.Now(),
	}
	report.Analysis = bs.generateAnalysis()
	return report
}

// BenchmarkReport contains a complete benchmark report
type BenchmarkReport struct {
	Summary   ReportSummary     `json:"summary"`
	Results   []BenchmarkResult `json:"results"`
	Metrics   BenchmarkMetrics  `json:"metrics"`
	Config    BenchmarkConfig   `json:"config"`
	Analysis  ReportAnalysis    `json:"analysis"`
	Timestamp time.Time         `json:"timestamp"`
}

// ReportSummary provides a high-level summary
type ReportSummary struct {
	TotalTests           int                `json:"total_tests"`
	Duration             time.Duration      `json:"duration"`
	BestThroughput       float64            `json:"best_throughput"`
	BestLatency          time.Duration      `json:"best_latency"`
	RecommendedTransport TransportType      `json:"recommended_transport"`
	TransportRankings    []TransportRanking `json:"transport_rankings"`
}

// TransportRanking ranks transports by performance
type TransportRanking struct {
	Transport     TransportType `json:"transport"`
	AvgThroughput float64       `json:"avg_throughput"`
	AvgLatency    time.Duration `json:"avg_latency"`
	Score         float64       `json:"score"`
	Rank          int           `json:"rank"`
}

// ReportAnalysis provides detailed analysis
type ReportAnalysis struct {
	ScalabilityAnalysis    ScalabilityAnalysis `json:"scalability"`
	PerformanceBottlenecks []PerformanceIssue  `json:"bottlenecks"`
	Recommendations        []Recommendation    `json:"recommendations"`
}

// ScalabilityAnalysis analyzes scaling characteristics
type ScalabilityAnalysis struct {
	LinearScaling      bool    `json:"linear_scaling"`
	ScalingFactor      float64 `json:"scaling_factor"`
	OptimalConcurrency int     `json:"optimal_concurrency"`
}

// PerformanceIssue identifies performance problems
type PerformanceIssue struct {
	Issue      string `json:"issue"`
	Severity   string `json:"severity"`
	Impact     string `json:"impact"`
	Suggestion string `json:"suggestion"`
}

// Recommendation provides optimization suggestions
type Recommendation struct {
	Category    string `json:"category"`
	Description string `json:"description"`
	Priority    string `json:"priority"`
	Expected    string `json:"expected_improvement"`
}

func (bs *BenchmarkSuite) generateSummary() ReportSummary {
	rankings := bs.calculateTransportRankings()
	return ReportSummary{
		TotalTests:           bs.metrics.TotalTests,
		Duration:             bs.metrics.TotalDuration,
		BestThroughput:       bs.metrics.HighestThroughput,
		BestLatency:          bs.metrics.LowestLatency,
		RecommendedTransport: bs.metrics.BestTransport,
		TransportRankings:    rankings,
	}
}
func (bs *BenchmarkSuite) calculateTransportRankings() []TransportRanking {
	// Group results by transport
	transportStats := make(map[TransportType][]BenchmarkResult)
	for _, result := range bs.results {
		transportStats[result.Transport] = append(transportStats[result.Transport], result)
	}

	var rankings []TransportRanking
	for transport, results := range transportStats {
		var totalThroughput float64
		var totalLatency time.Duration
		for _, result := range results {
			totalThroughput += result.ThroughputMsgSec
			totalLatency += result.LatencyP50
		}
		avgThroughput := totalThroughput / float64(len(results))
		avgLatency := totalLatency / time.Duration(len(results))

		// Higher throughput and lower latency both raise the score. Guard
		// against division by zero when no latencies were recorded.
		var score float64
		if micros := avgLatency.Microseconds(); micros > 0 {
			score = avgThroughput / float64(micros)
		}
		rankings = append(rankings, TransportRanking{
			Transport:     transport,
			AvgThroughput: avgThroughput,
			AvgLatency:    avgLatency,
			Score:         score,
		})
	}

	// Sort by score, descending (selection sort is fine for a handful of transports)
	for i := 0; i < len(rankings); i++ {
		for j := i + 1; j < len(rankings); j++ {
			if rankings[j].Score > rankings[i].Score {
				rankings[i], rankings[j] = rankings[j], rankings[i]
			}
		}
	}

	// Assign ranks
	for i := range rankings {
		rankings[i].Rank = i + 1
	}
	return rankings
}
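
// Worked example of the score: a transport averaging 50,000 msg/s at a 200µs
// P50 latency scores 50000/200 = 250, while one averaging 80,000 msg/s at
// 2ms (2000µs) scores 80000/2000 = 40, so the lower-latency transport ranks
// first despite its lower throughput.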
func (bs *BenchmarkSuite) generateAnalysis() ReportAnalysis {
	return ReportAnalysis{
		ScalabilityAnalysis:    bs.analyzeScalability(),
		PerformanceBottlenecks: bs.identifyBottlenecks(),
		Recommendations:        bs.generateRecommendations(),
	}
}

func (bs *BenchmarkSuite) analyzeScalability() ScalabilityAnalysis {
	if len(bs.results) < 2 {
		return ScalabilityAnalysis{
			LinearScaling:      false,
			ScalingFactor:      0.0,
			OptimalConcurrency: 1,
		}
	}

	// Analyze the throughput vs. concurrency relationship
	var throughputData []float64
	var concurrencyData []int
	for _, result := range bs.results {
		if result.Concurrency > 0 && result.Duration > 0 {
			throughput := float64(result.MessagesReceived) / result.Duration.Seconds()
			throughputData = append(throughputData, throughput)
			concurrencyData = append(concurrencyData, result.Concurrency)
		}
	}
	if len(throughputData) < 2 {
		return ScalabilityAnalysis{
			LinearScaling:      false,
			ScalingFactor:      0.0,
			OptimalConcurrency: 1,
		}
	}

	// Compare the actual throughput improvement against ideal linear scaling,
	// using the first qualifying result as the baseline
	maxThroughput := 0.0
	maxThroughputConcurrency := 1
	baseThroughput := throughputData[0]
	baseConcurrency := float64(concurrencyData[0])
	for i, throughput := range throughputData {
		if throughput > maxThroughput {
			maxThroughput = throughput
			maxThroughputConcurrency = concurrencyData[i]
		}
	}

	// Scaling factor: actual peak throughput vs. the ideal linear projection.
	// Guard against a zero baseline (no messages received).
	idealThroughput := baseThroughput * float64(maxThroughputConcurrency) / baseConcurrency
	if idealThroughput <= 0 {
		return ScalabilityAnalysis{
			LinearScaling:      false,
			ScalingFactor:      0.0,
			OptimalConcurrency: maxThroughputConcurrency,
		}
	}
	actualScalingFactor := maxThroughput / idealThroughput

	// Treat scaling as linear if it is within 20% of ideal
	linearScaling := actualScalingFactor >= 0.8

	return ScalabilityAnalysis{
		LinearScaling:      linearScaling,
		ScalingFactor:      actualScalingFactor,
		OptimalConcurrency: maxThroughputConcurrency,
	}
}
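
// Worked example: if the baseline result does 10,000 msg/s at concurrency 1
// and the best result does 60,000 msg/s at concurrency 10, ideal linear
// scaling would predict 100,000 msg/s, giving a scaling factor of
// 60000/100000 = 0.6; that is below the 0.8 threshold, so scaling is
// reported as non-linear.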
func (bs *BenchmarkSuite) identifyBottlenecks() []PerformanceIssue {
	var issues []PerformanceIssue
	// Scan results for common performance problems
	for _, result := range bs.results {
		if result.ErrorRate > 5.0 {
			issues = append(issues, PerformanceIssue{
				Issue:      fmt.Sprintf("High error rate (%0.2f%%) for %s", result.ErrorRate, result.Transport),
				Severity:   "high",
				Impact:     "Reduced reliability and performance",
				Suggestion: "Check transport configuration and network stability",
			})
		}
		if result.LatencyP99 > 100*time.Millisecond {
			issues = append(issues, PerformanceIssue{
				Issue:      fmt.Sprintf("High P99 latency (%v) for %s", result.LatencyP99, result.Transport),
				Severity:   "medium",
				Impact:     "Poor user experience for latency-sensitive operations",
				Suggestion: "Consider using a faster transport or optimizing message serialization",
			})
		}
	}
	return issues
}
func (bs *BenchmarkSuite) generateRecommendations() []Recommendation {
	var recommendations []Recommendation
	recommendations = append(recommendations, Recommendation{
		Category:    "Transport Selection",
		Description: fmt.Sprintf("Use %s for best overall performance", bs.metrics.BestTransport),
		Priority:    "high",
		Expected:    "20-50% improvement in throughput",
	})
	recommendations = append(recommendations, Recommendation{
		Category:    "Concurrency",
		Description: "Optimize concurrency level based on workload characteristics",
		Priority:    "medium",
		Expected:    "10-30% improvement in resource utilization",
	})
	return recommendations
}
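
// Report consumption sketch (illustrative): after RunAll completes, a caller
// might print the headline numbers and the ranked transports:
//
//	report := suite.GenerateReport()
//	fmt.Printf("recommended: %v at %.0f msg/s\n",
//		report.Summary.RecommendedTransport, report.Summary.BestThroughput)
//	for _, r := range report.Summary.TransportRankings {
//		fmt.Printf("#%d %v  avg %.0f msg/s  score %.2f\n",
//			r.Rank, r.Transport, r.AvgThroughput, r.Score)
//	}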