feat: create v2-prep branch with comprehensive planning

Restructured project for V2 refactor:

**Structure Changes:**
- Moved all V1 code to orig/ folder (preserved with git mv)
- Created docs/planning/ directory
- Added orig/README_V1.md explaining V1 preservation

**Planning Documents:**
- 00_V2_MASTER_PLAN.md: Complete architecture overview
  - Executive summary of critical V1 issues
  - High-level component architecture diagrams
  - 5-phase implementation roadmap
  - Success metrics and risk mitigation

- 07_TASK_BREAKDOWN.md: Atomic task breakdown
  - 99+ hours of detailed tasks
  - Every task < 2 hours (atomic)
  - Clear dependencies and success criteria
  - Organized by implementation phase

**V2 Key Improvements:**
- Per-exchange parsers (factory pattern)
- Multi-layer strict validation
- Multi-index pool cache
- Background validation pipeline
- Comprehensive observability

**Critical Issues Addressed:**
- Zero address tokens (strict validation + cache enrichment)
- Parsing accuracy (protocol-specific parsers)
- No audit trail (background validation channel)
- Inefficient lookups (multi-index cache)
- Stats disconnection (event-driven metrics)

**Next Steps:**
1. Review planning documents
2. Begin Phase 1: Foundation (P1-001 through P1-010)
3. Implement parsers in Phase 2
4. Build cache system in Phase 3
5. Add validation pipeline in Phase 4
6. Migrate and test in Phase 5

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Administrator
2025-11-10 10:14:26 +01:00
parent 1773daffe7
commit 803de231ba
411 changed files with 20390 additions and 8680 deletions

View File

@@ -0,0 +1,533 @@
package monitoring
import (
"context"
"fmt"
"sync"
"time"
"github.com/ethereum/go-ethereum/common"
"github.com/fraktal/mev-beta/internal/logger"
"github.com/fraktal/mev-beta/internal/recovery"
)
// IntegrityMetrics tracks data integrity statistics.
//
// It holds the raw counters and derived aggregates maintained by
// IntegrityMonitor. The monitor methods in this file acquire mu before
// reading or writing any of the other fields.
type IntegrityMetrics struct {
// mu guards every field below.
mu sync.RWMutex
// TotalAddressesProcessed is incremented by RecordAddressProcessed.
TotalAddressesProcessed int64
// CorruptAddressesDetected is incremented by RecordCorruptionDetected.
CorruptAddressesDetected int64
// AddressValidationPassed and AddressValidationFailed are incremented by
// RecordValidationResult depending on its argument.
AddressValidationPassed int64
AddressValidationFailed int64
// ContractCallsSucceeded and ContractCallsFailed are incremented by
// RecordContractCallResult depending on its argument.
ContractCallsSucceeded int64
ContractCallsFailed int64
// The next three counters are incremented by RecordRecoveryAction for the
// retry-with-backoff, use-fallback-data and circuit-breaker actions.
RetryOperationsTriggered int64
FallbackOperationsUsed int64
CircuitBreakersTripped int64
// LastCorruptionDetection is set to the current time on each
// RecordCorruptionDetected call.
LastCorruptionDetection time.Time
// AverageCorruptionScore is the running mean of the scores passed to
// RecordCorruptionDetected.
AverageCorruptionScore float64
// MaxCorruptionScore is the largest score passed to RecordCorruptionDetected.
MaxCorruptionScore int
// HealthScore is the weighted score computed by updateHealthScore, clamped
// to the range [0, 1].
HealthScore float64
// HighScore is raised by updateHealthScore whenever HealthScore exceeds it;
// NewIntegrityMonitor initialises it to 1.0.
HighScore float64
// RecoveryActions counts RecordRecoveryAction calls per action.
RecoveryActions map[recovery.RecoveryAction]int64
// ErrorsByType counts RecordErrorType calls per error type.
ErrorsByType map[recovery.ErrorType]int64
}
// MetricsSnapshot represents a copy of metrics without mutex for safe external access.
//
// It mirrors the data fields of IntegrityMetrics one-to-one and adds JSON
// tags. GetMetrics fills it while holding the metrics read lock and gives it
// freshly allocated maps, so a caller can keep or modify a snapshot without
// touching the monitor's own state.
type MetricsSnapshot struct {
// Raw event counters.
TotalAddressesProcessed int64 `json:"total_addresses_processed"`
CorruptAddressesDetected int64 `json:"corrupt_addresses_detected"`
AddressValidationPassed int64 `json:"address_validation_passed"`
AddressValidationFailed int64 `json:"address_validation_failed"`
ContractCallsSucceeded int64 `json:"contract_calls_succeeded"`
ContractCallsFailed int64 `json:"contract_calls_failed"`
RetryOperationsTriggered int64 `json:"retry_operations_triggered"`
FallbackOperationsUsed int64 `json:"fallback_operations_used"`
CircuitBreakersTripped int64 `json:"circuit_breakers_tripped"`
// Corruption aggregates.
LastCorruptionDetection time.Time `json:"last_corruption_detection"`
AverageCorruptionScore float64 `json:"average_corruption_score"`
MaxCorruptionScore int `json:"max_corruption_score"`
// Health scores as computed by updateHealthScore.
HealthScore float64 `json:"health_score"`
HighScore float64 `json:"high_score"`
// Per-key counters; these maps are copies, not the monitor's live maps.
RecoveryActions map[recovery.RecoveryAction]int64 `json:"recovery_actions"`
ErrorsByType map[recovery.ErrorType]int64 `json:"errors_by_type"`
}
// CorruptionAlert represents a corruption detection alert.
//
// The monitor creates alerts in two places: RecordCorruptionDetected fills
// every field, while updateHealthScore emits a health-threshold alert that
// sets only Timestamp, Severity, Message and Context (Address,
// CorruptionScore and Source keep their zero values).
type CorruptionAlert struct {
// Timestamp is the time the alert was created.
Timestamp time.Time
// Address is the address reported as corrupt.
Address common.Address
// CorruptionScore is the score reported for Address.
CorruptionScore int
// Source is the caller-supplied origin of the detection.
Source string
// Severity is the alert level.
Severity AlertSeverity
// Message is a human-readable description of the alert.
Message string
// Context carries additional key/value details about the alert.
Context map[string]interface{}
}
// AlertSeverity defines alert severity levels, ordered from least to most
// severe.
type AlertSeverity int

// Severity levels. The numeric values start at zero and increase with
// severity.
const (
	AlertSeverityInfo AlertSeverity = iota
	AlertSeverityWarning
	AlertSeverityCritical
	AlertSeverityEmergency
)

// String returns the upper-case label for the severity, or "UNKNOWN" for a
// value outside the declared constants.
func (s AlertSeverity) String() string {
	// Labels indexed by severity value.
	labels := [...]string{
		AlertSeverityInfo:      "INFO",
		AlertSeverityWarning:   "WARNING",
		AlertSeverityCritical:  "CRITICAL",
		AlertSeverityEmergency: "EMERGENCY",
	}
	if s < 0 || int(s) >= len(labels) {
		return "UNKNOWN"
	}
	return labels[s]
}
// IntegrityMonitor monitors and tracks data integrity metrics.
//
// It accumulates counters in metrics, derives a health score from them, keeps
// a history of generated alerts and forwards each alert to the registered
// subscribers. Construct it with NewIntegrityMonitor, which allocates the
// maps and the metrics struct the methods rely on.
type IntegrityMonitor struct {
// mu is taken by Enable, Disable, IsEnabled, SetThreshold and
// AddAlertSubscriber around their access to the monitor's configuration.
mu sync.RWMutex
// logger receives warnings, errors and state-change messages.
logger *logger.Logger
// metrics holds the counters; it carries its own mutex.
metrics *IntegrityMetrics
// alertThresholds maps a threshold name (e.g. "health_score_min") to its limit.
alertThresholds map[string]float64
// alertSubscribers are invoked by sendAlert for every alert.
alertSubscribers []AlertSubscriber
// healthCheckRunner is created by NewIntegrityMonitor and driven through
// StartHealthCheckRunner / StopHealthCheckRunner.
healthCheckRunner *HealthCheckRunner
// enabled gates the Record* methods; when false they return without recording.
enabled bool
// alerts is the alert history appended to by sendAlert and read by GetRecentAlerts.
alerts []CorruptionAlert
// alertsMutex guards alerts.
alertsMutex sync.RWMutex
}
// AlertSubscriber defines the interface for alert handlers.
//
// Subscribers are registered with AddAlertSubscriber. sendAlert calls
// HandleAlert synchronously for each alert; a non-nil error is logged by the
// monitor and does not stop delivery to the remaining subscribers.
type AlertSubscriber interface {
// HandleAlert processes one alert and reports whether handling failed.
HandleAlert(alert CorruptionAlert) error
}
// NewIntegrityMonitor creates a new integrity monitoring system.
//
// The returned monitor is enabled, starts with health and high scores of 1.0,
// has the default alert thresholds installed, and owns a health check runner
// that is created here but not started.
func NewIntegrityMonitor(logger *logger.Logger) *IntegrityMonitor {
	im := &IntegrityMonitor{
		logger:          logger,
		enabled:         true,
		alertThresholds: make(map[string]float64),
		alerts:          make([]CorruptionAlert, 0, 256),
		metrics: &IntegrityMetrics{
			HealthScore:     1.0,
			HighScore:       1.0,
			RecoveryActions: make(map[recovery.RecoveryAction]int64),
			ErrorsByType:    make(map[recovery.ErrorType]int64),
		},
	}

	// Install the default thresholds before anything can observe the monitor.
	im.setDefaultThresholds()

	// The runner receives a back-reference to the monitor it checks.
	im.healthCheckRunner = NewHealthCheckRunner(logger, im)

	return im
}
// setDefaultThresholds configures default alert thresholds, overwriting any
// existing value stored under the same names.
func (im *IntegrityMonitor) setDefaultThresholds() {
	defaults := map[string]float64{
		"corruption_rate":      0.05, // 5% corruption rate
		"failure_rate":         0.10, // 10% failure rate
		"health_score_min":     0.80, // 80% minimum health
		"max_corruption_score": 70.0, // Maximum individual corruption score
		"circuit_breaker_rate": 0.02, // 2% circuit breaker rate
	}
	for name, limit := range defaults {
		im.alertThresholds[name] = limit
	}
}
// RecordAddressProcessed increments the counter for processed addresses and
// recomputes the health score. It does nothing while the monitor is disabled.
func (im *IntegrityMonitor) RecordAddressProcessed() {
	// Enable/Disable write the flag under im.mu, so read it through the
	// locking accessor instead of touching the field directly (data race).
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	im.metrics.TotalAddressesProcessed++
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}
// RecordCorruptionDetected records a corruption detection event.
//
// It updates the corruption counters and aggregates (count, last detection
// time, maximum and running-average score), emits an alert whose severity is
// derived from corruptionScore, recomputes the health score and logs a
// warning. It does nothing while the monitor is disabled.
//
// Parameters:
//   - address: the address reported as corrupt.
//   - corruptionScore: the score assigned to the corruption.
//   - source: caller-supplied origin of the detection, copied into the alert.
func (im *IntegrityMonitor) RecordCorruptionDetected(address common.Address, corruptionScore int, source string) {
	// Enable/Disable write the flag under im.mu, so read it through the
	// locking accessor instead of touching the field directly (data race).
	if !im.IsEnabled() {
		return
	}

	// Sample the clock and format the address once so the metric, the alert
	// and its context all describe the same instant and the same string.
	now := time.Now()
	addressHex := address.Hex()

	im.metrics.mu.Lock()
	im.metrics.CorruptAddressesDetected++
	im.metrics.LastCorruptionDetection = now

	// Update corruption statistics
	if corruptionScore > im.metrics.MaxCorruptionScore {
		im.metrics.MaxCorruptionScore = corruptionScore
	}

	// Running average over all detections; total is at least 1 here because
	// the counter was just incremented.
	total := float64(im.metrics.CorruptAddressesDetected)
	im.metrics.AverageCorruptionScore = ((im.metrics.AverageCorruptionScore * (total - 1)) + float64(corruptionScore)) / total
	im.metrics.mu.Unlock()

	// Generate alert based on corruption score
	severity := im.getCorruptionSeverity(corruptionScore)
	alert := CorruptionAlert{
		Timestamp:       now,
		Address:         address,
		CorruptionScore: corruptionScore,
		Source:          source,
		Severity:        severity,
		Message:         fmt.Sprintf("Corruption detected: address %s, score %d, source %s", addressHex, corruptionScore, source),
		Context: map[string]interface{}{
			"address":          addressHex,
			"corruption_score": corruptionScore,
			"source":           source,
			"timestamp":        now.Unix(),
		},
	}

	im.sendAlert(alert)
	im.updateHealthScore()

	im.logger.Warn("Corruption detected",
		"address", addressHex,
		"corruption_score", corruptionScore,
		"source", source,
		"severity", severity.String())
}
// RecordValidationResult records address validation results: it increments
// the passed counter when passed is true and the failed counter otherwise,
// then recomputes the health score. It does nothing while the monitor is
// disabled.
func (im *IntegrityMonitor) RecordValidationResult(passed bool) {
	// Enable/Disable write the flag under im.mu, so read it through the
	// locking accessor instead of touching the field directly (data race).
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	if passed {
		im.metrics.AddressValidationPassed++
	} else {
		im.metrics.AddressValidationFailed++
	}
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}
// RecordContractCallResult records contract call success/failure: it
// increments the succeeded counter when succeeded is true and the failed
// counter otherwise, then recomputes the health score. It does nothing while
// the monitor is disabled.
func (im *IntegrityMonitor) RecordContractCallResult(succeeded bool) {
	// Enable/Disable write the flag under im.mu, so read it through the
	// locking accessor instead of touching the field directly (data race).
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	if succeeded {
		im.metrics.ContractCallsSucceeded++
	} else {
		im.metrics.ContractCallsFailed++
	}
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}
// RecordRecoveryAction records recovery action usage.
//
// Every action is counted in the per-action map; retry-with-backoff,
// use-fallback-data and circuit-breaker actions additionally bump their
// dedicated counters. The health score is recomputed afterwards. It does
// nothing while the monitor is disabled.
func (im *IntegrityMonitor) RecordRecoveryAction(action recovery.RecoveryAction) {
	// Enable/Disable write the flag under im.mu, so read it through the
	// locking accessor instead of touching the field directly (data race).
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	im.metrics.RecoveryActions[action]++

	// Track specific metrics
	switch action {
	case recovery.ActionRetryWithBackoff:
		im.metrics.RetryOperationsTriggered++
	case recovery.ActionUseFallbackData:
		im.metrics.FallbackOperationsUsed++
	case recovery.ActionCircuitBreaker:
		im.metrics.CircuitBreakersTripped++
	}
	im.metrics.mu.Unlock()

	im.updateHealthScore()
}
// RecordErrorType records error by type by incrementing the per-type counter.
// Unlike the other Record* methods it does not recompute the health score.
// It does nothing while the monitor is disabled.
func (im *IntegrityMonitor) RecordErrorType(errorType recovery.ErrorType) {
	// Enable/Disable write the flag under im.mu, so read it through the
	// locking accessor instead of touching the field directly (data race).
	if !im.IsEnabled() {
		return
	}

	im.metrics.mu.Lock()
	im.metrics.ErrorsByType[errorType]++
	im.metrics.mu.Unlock()
}
// getCorruptionSeverity determines alert severity based on corruption score:
// 90 and above is an emergency, 70-89 critical, 40-69 a warning, and anything
// lower informational.
func (im *IntegrityMonitor) getCorruptionSeverity(corruptionScore int) AlertSeverity {
	switch {
	case corruptionScore >= 90:
		return AlertSeverityEmergency
	case corruptionScore >= 70:
		return AlertSeverityCritical
	case corruptionScore >= 40:
		return AlertSeverityWarning
	default:
		return AlertSeverityInfo
	}
}
// updateHealthScore calculates overall system health score.
//
// The score is a weighted sum clamped to [0, 1]:
//
//	40% * (1 - corruption rate)
//	30% * validation success rate   (1.0 when no validations were recorded)
//	30% * contract call success rate (1.0 when no calls were recorded)
//
// With no processed addresses the score is reset to 1.0 and no alert is
// raised. Otherwise, when the score is below the "health_score_min"
// threshold, a critical alert is sent on every call.
func (im *IntegrityMonitor) updateHealthScore() {
	// Read the threshold under the monitor lock (SetThreshold writes the map
	// under it) and release it before taking the metrics lock, so the two
	// locks are never held together here.
	im.mu.RLock()
	minHealth := im.alertThresholds["health_score_min"]
	im.mu.RUnlock()

	im.metrics.mu.Lock()
	if im.metrics.TotalAddressesProcessed == 0 {
		im.metrics.HealthScore = 1.0
		im.metrics.mu.Unlock()
		return
	}

	// Calculate component scores
	corruptionRate := float64(im.metrics.CorruptAddressesDetected) / float64(im.metrics.TotalAddressesProcessed)

	var validationSuccessRate float64 = 1.0
	validationTotal := im.metrics.AddressValidationPassed + im.metrics.AddressValidationFailed
	if validationTotal > 0 {
		validationSuccessRate = float64(im.metrics.AddressValidationPassed) / float64(validationTotal)
	}

	var contractCallSuccessRate float64 = 1.0
	contractTotal := im.metrics.ContractCallsSucceeded + im.metrics.ContractCallsFailed
	if contractTotal > 0 {
		contractCallSuccessRate = float64(im.metrics.ContractCallsSucceeded) / float64(contractTotal)
	}

	// Weighted health score calculation
	healthScore := 0.0
	healthScore += (1.0 - corruptionRate) * 0.4  // 40% weight on corruption prevention
	healthScore += validationSuccessRate * 0.3   // 30% weight on validation success
	healthScore += contractCallSuccessRate * 0.3 // 30% weight on contract call success

	// Clamp to [0, 1]; the corruption term goes negative when more
	// corruptions than processed addresses have been recorded.
	if healthScore > 1.0 {
		healthScore = 1.0
	} else if healthScore < 0.0 {
		healthScore = 0.0
	}

	im.metrics.HealthScore = healthScore
	if healthScore > im.metrics.HighScore {
		im.metrics.HighScore = healthScore
	}
	im.metrics.mu.Unlock()

	// Check for health score threshold alerts. This runs after the metrics
	// lock is released: sendAlert invokes subscriber callbacks, and a
	// subscriber that reads metrics or records an event would otherwise
	// deadlock on the non-reentrant lock.
	if healthScore >= minHealth {
		return
	}

	im.sendAlert(CorruptionAlert{
		Timestamp: time.Now(),
		Severity:  AlertSeverityCritical,
		Message:   fmt.Sprintf("System health score dropped to %.2f (threshold: %.2f)", healthScore, minHealth),
		Context: map[string]interface{}{
			"health_score":          healthScore,
			"threshold":             minHealth,
			"corruption_rate":       corruptionRate,
			"validation_success":    validationSuccessRate,
			"contract_call_success": contractCallSuccessRate,
		},
	})
}
// sendAlert sends alerts to all subscribers.
//
// The alert is first appended to the in-memory history, which is capped at
// the most recent 1000 entries, and then delivered synchronously to every
// registered subscriber. A subscriber error is logged and does not prevent
// delivery to the remaining subscribers.
func (im *IntegrityMonitor) sendAlert(alert CorruptionAlert) {
	const maxRetainedAlerts = 1000

	im.alertsMutex.Lock()
	im.alerts = append(im.alerts, alert)
	if excess := len(im.alerts) - maxRetainedAlerts; excess > 0 {
		// Shift the newest entries to the front in place rather than
		// allocating a fresh slice per alert, then zero the vacated tail so
		// dropped alerts (and their Context maps) can be collected.
		kept := copy(im.alerts, im.alerts[excess:])
		for i := kept; i < len(im.alerts); i++ {
			im.alerts[i] = CorruptionAlert{}
		}
		im.alerts = im.alerts[:kept]
	}
	im.alertsMutex.Unlock()

	// Snapshot the subscriber list under the monitor lock
	// (AddAlertSubscriber appends under it) and release the lock before
	// calling out, so handlers never run while a monitor lock is held.
	im.mu.RLock()
	subscribers := make([]AlertSubscriber, len(im.alertSubscribers))
	copy(subscribers, im.alertSubscribers)
	im.mu.RUnlock()

	for _, subscriber := range subscribers {
		if err := subscriber.HandleAlert(alert); err != nil {
			im.logger.Error("Failed to send alert",
				"subscriber", fmt.Sprintf("%T", subscriber),
				"error", err)
		}
	}
}
// AddAlertSubscriber adds an alert subscriber that will receive every
// subsequent alert. A nil subscriber is ignored with a warning, because
// calling HandleAlert on it would panic when the next alert is sent.
func (im *IntegrityMonitor) AddAlertSubscriber(subscriber AlertSubscriber) {
	if subscriber == nil {
		im.logger.Warn("Ignoring nil alert subscriber")
		return
	}

	im.mu.Lock()
	defer im.mu.Unlock()
	im.alertSubscribers = append(im.alertSubscribers, subscriber)
}
// GetMetrics returns a copy of current metrics.
//
// The snapshot is taken under the metrics read lock. Its maps are newly
// allocated copies, so the caller may retain or modify the result freely.
func (im *IntegrityMonitor) GetMetrics() MetricsSnapshot {
	im.metrics.mu.RLock()
	defer im.metrics.mu.RUnlock()

	src := im.metrics

	// Duplicate the maps so the snapshot shares no storage with the monitor.
	recoveryActions := make(map[recovery.RecoveryAction]int64, len(src.RecoveryActions))
	for action, count := range src.RecoveryActions {
		recoveryActions[action] = count
	}
	errorsByType := make(map[recovery.ErrorType]int64, len(src.ErrorsByType))
	for errType, count := range src.ErrorsByType {
		errorsByType[errType] = count
	}

	return MetricsSnapshot{
		TotalAddressesProcessed:  src.TotalAddressesProcessed,
		CorruptAddressesDetected: src.CorruptAddressesDetected,
		AddressValidationPassed:  src.AddressValidationPassed,
		AddressValidationFailed:  src.AddressValidationFailed,
		ContractCallsSucceeded:   src.ContractCallsSucceeded,
		ContractCallsFailed:      src.ContractCallsFailed,
		RetryOperationsTriggered: src.RetryOperationsTriggered,
		FallbackOperationsUsed:   src.FallbackOperationsUsed,
		CircuitBreakersTripped:   src.CircuitBreakersTripped,
		LastCorruptionDetection:  src.LastCorruptionDetection,
		AverageCorruptionScore:   src.AverageCorruptionScore,
		MaxCorruptionScore:       src.MaxCorruptionScore,
		HealthScore:              src.HealthScore,
		HighScore:                src.HighScore,
		RecoveryActions:          recoveryActions,
		ErrorsByType:             errorsByType,
	}
}
// GetHealthSummary returns a comprehensive health summary as a generic map:
// the enabled flag, the current health score, the raw counters, the derived
// corruption / validation / contract-call rates, the per-key counter maps,
// a copy of the alert thresholds and the number of alert subscribers.
//
// Each rate is reported as 0.0 when its denominator is zero.
func (im *IntegrityMonitor) GetHealthSummary() map[string]interface{} {
	metrics := im.GetMetrics()

	// Read the monitor's configuration under its lock, and copy the
	// thresholds so the caller never holds a reference to the live map that
	// SetThreshold mutates.
	im.mu.RLock()
	enabled := im.enabled
	subscriberCount := len(im.alertSubscribers)
	thresholds := make(map[string]float64, len(im.alertThresholds))
	for name, value := range im.alertThresholds {
		thresholds[name] = value
	}
	im.mu.RUnlock()

	corruptionRate := 0.0
	if metrics.TotalAddressesProcessed > 0 {
		corruptionRate = float64(metrics.CorruptAddressesDetected) / float64(metrics.TotalAddressesProcessed)
	}

	validationSuccessRate := 0.0
	totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed
	if totalValidations > 0 {
		validationSuccessRate = float64(metrics.AddressValidationPassed) / float64(totalValidations)
	}

	contractCallSuccessRate := 0.0
	totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed
	if totalCalls > 0 {
		contractCallSuccessRate = float64(metrics.ContractCallsSucceeded) / float64(totalCalls)
	}

	return map[string]interface{}{
		"enabled":                    enabled,
		"health_score":               metrics.HealthScore,
		"total_addresses_processed":  metrics.TotalAddressesProcessed,
		"corruption_detections":      metrics.CorruptAddressesDetected,
		"corruption_rate":            corruptionRate,
		"validation_success_rate":    validationSuccessRate,
		"contract_call_success_rate": contractCallSuccessRate,
		"average_corruption_score":   metrics.AverageCorruptionScore,
		"max_corruption_score":       metrics.MaxCorruptionScore,
		"retry_operations":           metrics.RetryOperationsTriggered,
		"fallback_operations":        metrics.FallbackOperationsUsed,
		"circuit_breakers_tripped":   metrics.CircuitBreakersTripped,
		"last_corruption":            metrics.LastCorruptionDetection,
		"recovery_actions":           metrics.RecoveryActions,
		"errors_by_type":             metrics.ErrorsByType,
		"alert_thresholds":           thresholds,
		"alert_subscribers":          subscriberCount,
	}
}
// GetRecentAlerts returns the most recent corruption alerts up to the specified limit.
//
// A limit that is zero, negative or larger than the stored history selects
// the whole history. The result is always a non-nil copy, ordered oldest to
// newest, that the caller may modify.
func (im *IntegrityMonitor) GetRecentAlerts(limit int) []CorruptionAlert {
	im.alertsMutex.RLock()
	defer im.alertsMutex.RUnlock()

	stored := len(im.alerts)
	count := limit
	if count <= 0 || count > stored {
		count = stored
	}

	// With count == 0 this yields an empty, non-nil slice and copy is a no-op.
	recent := make([]CorruptionAlert, count)
	copy(recent, im.alerts[stored-count:])
	return recent
}
// SetThreshold sets an alert threshold, creating the entry if name is not
// yet present and overwriting it otherwise.
func (im *IntegrityMonitor) SetThreshold(name string, value float64) {
	im.mu.Lock()
	defer im.mu.Unlock()

	im.alertThresholds[name] = value
}
// Enable enables the integrity monitor and logs the state change. The log
// call is made while the monitor lock is still held.
func (im *IntegrityMonitor) Enable() {
	im.mu.Lock()
	defer im.mu.Unlock()

	im.enabled = true
	im.logger.Info("Integrity monitor enabled")
}
// Disable disables the integrity monitor and logs the state change. The log
// call is made while the monitor lock is still held.
func (im *IntegrityMonitor) Disable() {
	im.mu.Lock()
	defer im.mu.Unlock()

	im.enabled = false
	im.logger.Info("Integrity monitor disabled")
}
// IsEnabled returns whether the monitor is enabled, reading the flag under
// the monitor's read lock.
func (im *IntegrityMonitor) IsEnabled() bool {
	im.mu.RLock()
	state := im.enabled
	im.mu.RUnlock()
	return state
}
// StartHealthCheckRunner starts the periodic health check routine by
// forwarding ctx to the runner's Start method. It is a no-op when the monitor
// has no runner.
func (im *IntegrityMonitor) StartHealthCheckRunner(ctx context.Context) {
	if im.healthCheckRunner == nil {
		return
	}
	im.healthCheckRunner.Start(ctx)
}
// StopHealthCheckRunner stops the periodic health check routine by calling
// the runner's Stop method. It is a no-op when the monitor has no runner.
func (im *IntegrityMonitor) StopHealthCheckRunner() {
	if im.healthCheckRunner == nil {
		return
	}
	im.healthCheckRunner.Stop()
}
// GetHealthCheckRunner returns the health check runner owned by this monitor;
// the result is nil if none was set.
func (im *IntegrityMonitor) GetHealthCheckRunner() *HealthCheckRunner {
	runner := im.healthCheckRunner
	return runner
}