Files
mev-beta/internal/monitoring/integrity_monitor.go
Krypto Kajun 850223a953 fix(multicall): resolve critical multicall parsing corruption issues
- Added comprehensive bounds checking to prevent buffer overruns in multicall parsing
- Implemented graduated validation system (Strict/Moderate/Permissive) to reduce false positives
- Added LRU caching system for address validation with 10-minute TTL
- Enhanced ABI decoder with missing Universal Router and Arbitrum-specific DEX signatures
- Fixed duplicate function declarations and import conflicts across multiple files
- Added error recovery mechanisms with multiple fallback strategies
- Updated tests to handle new validation behavior for suspicious addresses
- Fixed parser test expectations for improved validation system
- Applied gofmt formatting fixes to ensure code style compliance
- Fixed mutex copying issues in monitoring package by introducing MetricsSnapshot
- Resolved critical security vulnerabilities in heuristic address extraction
- Progress: Updated TODO audit from 10% to 35% complete

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-17 00:12:55 -05:00

503 lines
16 KiB
Go

package monitoring
import (
"context"
"fmt"
"sync"
"time"
"github.com/ethereum/go-ethereum/common"
"github.com/fraktal/mev-beta/internal/logger"
"github.com/fraktal/mev-beta/internal/recovery"
)
// IntegrityMetrics holds the running data-integrity counters and the scores
// derived from them. The methods of IntegrityMonitor in this file take mu
// before reading or writing any of the other fields.
type IntegrityMetrics struct {
	mu sync.RWMutex

	// Address processing and validation counters.
	TotalAddressesProcessed  int64
	CorruptAddressesDetected int64
	AddressValidationPassed  int64
	AddressValidationFailed  int64

	// Contract call outcome counters.
	ContractCallsSucceeded int64
	ContractCallsFailed    int64

	// Recovery counters, incremented by RecordRecoveryAction.
	RetryOperationsTriggered int64
	FallbackOperationsUsed   int64
	CircuitBreakersTripped   int64

	// Corruption statistics, maintained by RecordCorruptionDetected.
	LastCorruptionDetection time.Time
	AverageCorruptionScore  float64
	MaxCorruptionScore      int

	// HealthScore is the most recent weighted score computed by
	// updateHealthScore (clamped to [0, 1]); HighScore is raised whenever a
	// newly computed HealthScore exceeds it.
	HealthScore float64
	HighScore   float64

	// Tallies keyed by recovery action and by error type.
	RecoveryActions map[recovery.RecoveryAction]int64
	ErrorsByType    map[recovery.ErrorType]int64
}
// MetricsSnapshot is a lock-free, JSON-serializable copy of IntegrityMetrics.
// It mirrors every data field of IntegrityMetrics but carries no mutex, so it
// can be returned and passed around by value.
type MetricsSnapshot struct {
	// Address processing and validation counters.
	TotalAddressesProcessed  int64 `json:"total_addresses_processed"`
	CorruptAddressesDetected int64 `json:"corrupt_addresses_detected"`
	AddressValidationPassed  int64 `json:"address_validation_passed"`
	AddressValidationFailed  int64 `json:"address_validation_failed"`

	// Contract call outcome counters.
	ContractCallsSucceeded int64 `json:"contract_calls_succeeded"`
	ContractCallsFailed    int64 `json:"contract_calls_failed"`

	// Recovery counters.
	RetryOperationsTriggered int64 `json:"retry_operations_triggered"`
	FallbackOperationsUsed   int64 `json:"fallback_operations_used"`
	CircuitBreakersTripped   int64 `json:"circuit_breakers_tripped"`

	// Corruption statistics.
	LastCorruptionDetection time.Time `json:"last_corruption_detection"`
	AverageCorruptionScore  float64   `json:"average_corruption_score"`
	MaxCorruptionScore      int       `json:"max_corruption_score"`

	// Health scores.
	HealthScore float64 `json:"health_score"`
	HighScore   float64 `json:"high_score"`

	// Tallies keyed by recovery action and by error type.
	RecoveryActions map[recovery.RecoveryAction]int64 `json:"recovery_actions"`
	ErrorsByType    map[recovery.ErrorType]int64      `json:"errors_by_type"`
}
// CorruptionAlert is the payload delivered to every AlertSubscriber. It is
// produced both for individual corruption detections and for health-score
// threshold breaches; the latter leave Address, CorruptionScore and Source at
// their zero values.
type CorruptionAlert struct {
	Timestamp       time.Time              // when the alert was created
	Address         common.Address         // address the corruption was detected on
	CorruptionScore int                    // score reported for the detection
	Source          string                 // caller-supplied origin of the detection
	Severity        AlertSeverity          // urgency level of the alert
	Message         string                 // human-readable summary
	Context         map[string]interface{} // additional structured details
}
// AlertSeverity ranks how urgent a CorruptionAlert is.
type AlertSeverity int

// Severity levels, in ascending order of urgency.
const (
	AlertSeverityInfo AlertSeverity = iota
	AlertSeverityWarning
	AlertSeverityCritical
	AlertSeverityEmergency
)

// String returns the upper-case label of s, or "UNKNOWN" for any value that
// is not one of the declared severity levels.
func (s AlertSeverity) String() string {
	// Label table indexed by severity value.
	labels := [...]string{
		AlertSeverityInfo:      "INFO",
		AlertSeverityWarning:   "WARNING",
		AlertSeverityCritical:  "CRITICAL",
		AlertSeverityEmergency: "EMERGENCY",
	}
	if s < 0 || int(s) >= len(labels) {
		return "UNKNOWN"
	}
	return labels[s]
}
// IntegrityMonitor collects data-integrity metrics, derives a health score
// from them and notifies subscribers when corruption is detected or the
// health score falls below its threshold.
type IntegrityMonitor struct {
	// mu is taken by Enable, Disable, IsEnabled, SetThreshold and
	// AddAlertSubscriber when they access enabled, alertThresholds and
	// alertSubscribers.
	mu sync.RWMutex

	logger            *logger.Logger
	metrics           *IntegrityMetrics  // counters; protected by their own mutex
	alertThresholds   map[string]float64 // threshold name -> limit
	alertSubscribers  []AlertSubscriber  // receivers of every alert
	healthCheckRunner *HealthCheckRunner // periodic health checks
	enabled           bool               // when false, the Record* methods are no-ops
}
// AlertSubscriber is implemented by anything that wants to receive alerts
// from an IntegrityMonitor. A non-nil error returned from HandleAlert is
// logged by the monitor, which then continues with the next subscriber.
type AlertSubscriber interface {
	// HandleAlert processes a single alert.
	HandleAlert(alert CorruptionAlert) error
}
// NewIntegrityMonitor builds an enabled monitor that logs through logger.
// The returned monitor starts with a perfect health score, the default alert
// thresholds and a health check runner bound to it.
func NewIntegrityMonitor(logger *logger.Logger) *IntegrityMonitor {
	initial := &IntegrityMetrics{
		HealthScore:     1.0,
		HighScore:       1.0,
		RecoveryActions: make(map[recovery.RecoveryAction]int64),
		ErrorsByType:    make(map[recovery.ErrorType]int64),
	}

	im := &IntegrityMonitor{
		logger:          logger,
		metrics:         initial,
		alertThresholds: make(map[string]float64),
		enabled:         true,
	}

	// Thresholds are installed before the runner is created, matching the
	// original initialization order.
	im.setDefaultThresholds()
	im.healthCheckRunner = NewHealthCheckRunner(logger, im)

	return im
}
// setDefaultThresholds installs the built-in alert thresholds, overwriting
// any existing entries with the same names.
// NOTE(review): within this file only "health_score_min" is consulted (by
// updateHealthScore); the other thresholds are presumably read elsewhere.
func (im *IntegrityMonitor) setDefaultThresholds() {
	defaults := map[string]float64{
		"corruption_rate":      0.05, // 5% corruption rate
		"failure_rate":         0.10, // 10% failure rate
		"health_score_min":     0.80, // 80% minimum health
		"max_corruption_score": 70.0, // maximum individual corruption score
		"circuit_breaker_rate": 0.02, // 2% circuit breaker rate
	}
	for name, limit := range defaults {
		im.alertThresholds[name] = limit
	}
}
// RecordAddressProcessed counts one processed address and then refreshes the
// health score. It does nothing while the monitor is disabled.
func (im *IntegrityMonitor) RecordAddressProcessed() {
	// Go through IsEnabled so the read is synchronized with Enable and
	// Disable, which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	im.metrics.TotalAddressesProcessed++
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}
// RecordCorruptionDetected registers one corruption detection for address.
//
// It updates the corruption counter, last-detection time, maximum score and
// running average score, sends a CorruptionAlert whose severity is derived
// from corruptionScore, refreshes the health score and logs a warning. It
// does nothing while the monitor is disabled.
func (im *IntegrityMonitor) RecordCorruptionDetected(address common.Address, corruptionScore int, source string) {
	// Go through IsEnabled so the read is synchronized with Enable and
	// Disable, which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	// A single timestamp keeps the metric, the alert and the alert context
	// in agreement with each other.
	now := time.Now()
	addressHex := address.Hex()

	im.metrics.mu.Lock()
	im.metrics.CorruptAddressesDetected++
	im.metrics.LastCorruptionDetection = now
	if corruptionScore > im.metrics.MaxCorruptionScore {
		im.metrics.MaxCorruptionScore = corruptionScore
	}
	// Fold the new score into the running mean over all detections so far:
	// newAvg = (oldAvg*(n-1) + score) / n.
	total := float64(im.metrics.CorruptAddressesDetected)
	im.metrics.AverageCorruptionScore = ((im.metrics.AverageCorruptionScore * (total - 1)) + float64(corruptionScore)) / total
	im.metrics.mu.Unlock()

	// The metrics lock is released before subscribers are notified.
	severity := im.getCorruptionSeverity(corruptionScore)
	alert := CorruptionAlert{
		Timestamp:       now,
		Address:         address,
		CorruptionScore: corruptionScore,
		Source:          source,
		Severity:        severity,
		Message:         fmt.Sprintf("Corruption detected: address %s, score %d, source %s", addressHex, corruptionScore, source),
		Context: map[string]interface{}{
			"address":          addressHex,
			"corruption_score": corruptionScore,
			"source":           source,
			"timestamp":        now.Unix(),
		},
	}
	im.sendAlert(alert)
	im.updateHealthScore()

	im.logger.Warn("Corruption detected",
		"address", addressHex,
		"corruption_score", corruptionScore,
		"source", source,
		"severity", severity.String())
}
// RecordValidationResult counts one address validation as passed or failed
// and then refreshes the health score. It does nothing while the monitor is
// disabled.
func (im *IntegrityMonitor) RecordValidationResult(passed bool) {
	// Go through IsEnabled so the read is synchronized with Enable and
	// Disable, which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	if passed {
		im.metrics.AddressValidationPassed++
	} else {
		im.metrics.AddressValidationFailed++
	}
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}
// RecordContractCallResult counts one contract call as succeeded or failed
// and then refreshes the health score. It does nothing while the monitor is
// disabled.
func (im *IntegrityMonitor) RecordContractCallResult(succeeded bool) {
	// Go through IsEnabled so the read is synchronized with Enable and
	// Disable, which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	if succeeded {
		im.metrics.ContractCallsSucceeded++
	} else {
		im.metrics.ContractCallsFailed++
	}
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}
// RecordRecoveryAction tallies one use of action and then refreshes the
// health score. Retry, fallback and circuit-breaker actions additionally
// bump their dedicated counters. It does nothing while the monitor is
// disabled.
func (im *IntegrityMonitor) RecordRecoveryAction(action recovery.RecoveryAction) {
	// Go through IsEnabled so the read is synchronized with Enable and
	// Disable, which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	im.metrics.RecoveryActions[action]++
	// Mirror the three actions that have their own summary counters.
	switch action {
	case recovery.ActionRetryWithBackoff:
		im.metrics.RetryOperationsTriggered++
	case recovery.ActionUseFallbackData:
		im.metrics.FallbackOperationsUsed++
	case recovery.ActionCircuitBreaker:
		im.metrics.CircuitBreakersTripped++
	}
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}
// RecordErrorType tallies one occurrence of errorType. Unlike the other
// Record* methods it does not refresh the health score, which does not
// depend on the per-type error tallies. It does nothing while the monitor is
// disabled.
func (im *IntegrityMonitor) RecordErrorType(errorType recovery.ErrorType) {
	// Go through IsEnabled so the read is synchronized with Enable and
	// Disable, which write the flag under im.mu.
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	im.metrics.ErrorsByType[errorType]++
	im.metrics.mu.Unlock()
}
// getCorruptionSeverity maps a corruption score to an alert severity:
// 90 and above is an emergency, 70-89 critical, 40-69 a warning, and
// anything lower informational.
func (im *IntegrityMonitor) getCorruptionSeverity(corruptionScore int) AlertSeverity {
	switch {
	case corruptionScore >= 90:
		return AlertSeverityEmergency
	case corruptionScore >= 70:
		return AlertSeverityCritical
	case corruptionScore >= 40:
		return AlertSeverityWarning
	default:
		return AlertSeverityInfo
	}
}
// updateHealthScore recomputes the overall health score from the current
// counters and sends a critical alert when it is below the
// "health_score_min" threshold.
//
// The score is a weighted sum clamped to [0, 1]: 40% (1 - corruption rate),
// 30% validation success rate and 30% contract call success rate. A success
// rate with no samples counts as 1.0, and with no processed addresses the
// score is reset to 1.0 without any alert.
//
// The metrics lock is released before subscribers are notified, so a
// subscriber may safely call back into the monitor (for example GetMetrics)
// from HandleAlert without deadlocking on that lock.
func (im *IntegrityMonitor) updateHealthScore() {
	im.metrics.mu.Lock()

	if im.metrics.TotalAddressesProcessed == 0 {
		im.metrics.HealthScore = 1.0
		im.metrics.mu.Unlock()
		return
	}

	// Component rates.
	corruptionRate := float64(im.metrics.CorruptAddressesDetected) / float64(im.metrics.TotalAddressesProcessed)

	validationSuccessRate := 1.0
	validationTotal := im.metrics.AddressValidationPassed + im.metrics.AddressValidationFailed
	if validationTotal > 0 {
		validationSuccessRate = float64(im.metrics.AddressValidationPassed) / float64(validationTotal)
	}

	contractCallSuccessRate := 1.0
	contractTotal := im.metrics.ContractCallsSucceeded + im.metrics.ContractCallsFailed
	if contractTotal > 0 {
		contractCallSuccessRate = float64(im.metrics.ContractCallsSucceeded) / float64(contractTotal)
	}

	// Weighted combination of the component rates.
	healthScore := 0.0
	healthScore += (1.0 - corruptionRate) * 0.4  // 40% weight on corruption prevention
	healthScore += validationSuccessRate * 0.3   // 30% weight on validation success
	healthScore += contractCallSuccessRate * 0.3 // 30% weight on contract call success

	// Clamp to [0, 1]; the corruption rate can exceed 1 when detections
	// outnumber processed addresses, which would push the sum below zero.
	if healthScore > 1.0 {
		healthScore = 1.0
	} else if healthScore < 0.0 {
		healthScore = 0.0
	}

	im.metrics.HealthScore = healthScore
	if healthScore > im.metrics.HighScore {
		im.metrics.HighScore = healthScore
	}
	im.metrics.mu.Unlock()

	// Thresholds are written under im.mu by SetThreshold, so read under it.
	im.mu.RLock()
	minHealth := im.alertThresholds["health_score_min"]
	im.mu.RUnlock()

	if healthScore < minHealth {
		alert := CorruptionAlert{
			Timestamp: time.Now(),
			Severity:  AlertSeverityCritical,
			Message:   fmt.Sprintf("System health score dropped to %.2f (threshold: %.2f)", healthScore, minHealth),
			Context: map[string]interface{}{
				"health_score":          healthScore,
				"threshold":             minHealth,
				"corruption_rate":       corruptionRate,
				"validation_success":    validationSuccessRate,
				"contract_call_success": contractCallSuccessRate,
			},
		}
		im.sendAlert(alert)
	}
}
// sendAlert delivers alert to every registered subscriber in registration
// order. A subscriber that returns an error is logged and does not prevent
// delivery to the remaining subscribers.
func (im *IntegrityMonitor) sendAlert(alert CorruptionAlert) {
	// Copy the subscriber list under the lock that AddAlertSubscriber
	// writes it with, then deliver outside the lock so a handler that
	// registers another subscriber cannot block on im.mu.
	im.mu.RLock()
	subscribers := make([]AlertSubscriber, len(im.alertSubscribers))
	copy(subscribers, im.alertSubscribers)
	im.mu.RUnlock()

	for _, subscriber := range subscribers {
		if err := subscriber.HandleAlert(alert); err != nil {
			im.logger.Error("Failed to send alert",
				"subscriber", fmt.Sprintf("%T", subscriber),
				"error", err)
		}
	}
}
// AddAlertSubscriber registers subscriber to receive all future alerts.
// A nil subscriber is ignored: storing it would make every later alert
// delivery panic when HandleAlert is invoked on it.
func (im *IntegrityMonitor) AddAlertSubscriber(subscriber AlertSubscriber) {
	if subscriber == nil {
		return
	}

	im.mu.Lock()
	defer im.mu.Unlock()
	im.alertSubscribers = append(im.alertSubscribers, subscriber)
}
// GetMetrics returns a point-in-time snapshot of the metrics, taken under the
// metrics read lock. The maps in the snapshot are fresh copies, so the caller
// may read or modify them without affecting the monitor.
func (im *IntegrityMonitor) GetMetrics() MetricsSnapshot {
	src := im.metrics
	src.mu.RLock()
	defer src.mu.RUnlock()

	snapshot := MetricsSnapshot{
		TotalAddressesProcessed:  src.TotalAddressesProcessed,
		CorruptAddressesDetected: src.CorruptAddressesDetected,
		AddressValidationPassed:  src.AddressValidationPassed,
		AddressValidationFailed:  src.AddressValidationFailed,
		ContractCallsSucceeded:   src.ContractCallsSucceeded,
		ContractCallsFailed:      src.ContractCallsFailed,
		RetryOperationsTriggered: src.RetryOperationsTriggered,
		FallbackOperationsUsed:   src.FallbackOperationsUsed,
		CircuitBreakersTripped:   src.CircuitBreakersTripped,
		LastCorruptionDetection:  src.LastCorruptionDetection,
		AverageCorruptionScore:   src.AverageCorruptionScore,
		MaxCorruptionScore:       src.MaxCorruptionScore,
		HealthScore:              src.HealthScore,
		HighScore:                src.HighScore,
		RecoveryActions:          make(map[recovery.RecoveryAction]int64),
		ErrorsByType:             make(map[recovery.ErrorType]int64),
	}

	// Duplicate the tallies entry by entry so the snapshot shares no map
	// storage with the live metrics.
	for action, count := range src.RecoveryActions {
		snapshot.RecoveryActions[action] = count
	}
	for errType, count := range src.ErrorsByType {
		snapshot.ErrorsByType[errType] = count
	}

	return snapshot
}
// GetHealthSummary returns a flat, map-based report combining a metrics
// snapshot, the rates derived from it and the monitor's configuration
// (enabled flag, alert thresholds and subscriber count).
//
// The thresholds in the result are a copy, so callers cannot modify the
// monitor's configuration through the returned map.
//
// NOTE: the validation and contract-call success rates are reported as 0
// when there are no samples, whereas updateHealthScore treats "no samples"
// as a rate of 1.0 when computing the health score.
func (im *IntegrityMonitor) GetHealthSummary() map[string]interface{} {
	metrics := im.GetMetrics()

	// rate returns part/whole, or 0 when there is nothing to divide by.
	rate := func(part, whole int64) float64 {
		if whole > 0 {
			return float64(part) / float64(whole)
		}
		return 0.0
	}
	corruptionRate := rate(metrics.CorruptAddressesDetected, metrics.TotalAddressesProcessed)
	validationSuccessRate := rate(metrics.AddressValidationPassed,
		metrics.AddressValidationPassed+metrics.AddressValidationFailed)
	contractCallSuccessRate := rate(metrics.ContractCallsSucceeded,
		metrics.ContractCallsSucceeded+metrics.ContractCallsFailed)

	// Configuration is written under im.mu (Enable, Disable, SetThreshold,
	// AddAlertSubscriber), so it is read under the same lock here.
	im.mu.RLock()
	enabled := im.enabled
	subscriberCount := len(im.alertSubscribers)
	thresholds := make(map[string]float64, len(im.alertThresholds))
	for name, limit := range im.alertThresholds {
		thresholds[name] = limit
	}
	im.mu.RUnlock()

	return map[string]interface{}{
		"enabled":                    enabled,
		"health_score":               metrics.HealthScore,
		"total_addresses_processed":  metrics.TotalAddressesProcessed,
		"corruption_detections":      metrics.CorruptAddressesDetected,
		"corruption_rate":            corruptionRate,
		"validation_success_rate":    validationSuccessRate,
		"contract_call_success_rate": contractCallSuccessRate,
		"average_corruption_score":   metrics.AverageCorruptionScore,
		"max_corruption_score":       metrics.MaxCorruptionScore,
		"retry_operations":           metrics.RetryOperationsTriggered,
		"fallback_operations":        metrics.FallbackOperationsUsed,
		"circuit_breakers_tripped":   metrics.CircuitBreakersTripped,
		"last_corruption":            metrics.LastCorruptionDetection,
		"recovery_actions":           metrics.RecoveryActions,
		"errors_by_type":             metrics.ErrorsByType,
		"alert_thresholds":           thresholds,
		"alert_subscribers":          subscriberCount,
	}
}
// SetThreshold stores value as the alert threshold called name, replacing
// any previous value. The write is performed under the monitor's lock.
func (im *IntegrityMonitor) SetThreshold(name string, value float64) {
	im.mu.Lock()
	defer im.mu.Unlock()

	im.alertThresholds[name] = value
}
// Enable switches the monitor on, so the Record* methods collect metrics
// again, and logs the change.
func (im *IntegrityMonitor) Enable() {
	im.mu.Lock()
	defer im.mu.Unlock()

	im.enabled = true
	im.logger.Info("Integrity monitor enabled")
}
// Disable switches the monitor off, turning the Record* methods into no-ops,
// and logs the change.
func (im *IntegrityMonitor) Disable() {
	im.mu.Lock()
	defer im.mu.Unlock()

	im.enabled = false
	im.logger.Info("Integrity monitor disabled")
}
// IsEnabled reports whether the monitor is currently switched on. The flag
// is read under the monitor's read lock.
func (im *IntegrityMonitor) IsEnabled() bool {
	im.mu.RLock()
	enabled := im.enabled
	im.mu.RUnlock()
	return enabled
}
// StartHealthCheckRunner starts the monitor's health check runner with ctx.
// It is a no-op when the monitor has no runner.
func (im *IntegrityMonitor) StartHealthCheckRunner(ctx context.Context) {
	runner := im.healthCheckRunner
	if runner == nil {
		return
	}
	runner.Start(ctx)
}
// StopHealthCheckRunner stops the monitor's health check runner. It is a
// no-op when the monitor has no runner.
func (im *IntegrityMonitor) StopHealthCheckRunner() {
	runner := im.healthCheckRunner
	if runner == nil {
		return
	}
	runner.Stop()
}
// GetHealthCheckRunner exposes the health check runner owned by this
// monitor; the result is nil if none has been set.
func (im *IntegrityMonitor) GetHealthCheckRunner() *HealthCheckRunner {
	return im.healthCheckRunner
}