feat: create v2-prep branch with comprehensive planning

Restructured project for V2 refactor:

**Structure Changes:**
- Moved all V1 code to orig/ folder (preserved with git mv)
- Created docs/planning/ directory
- Added orig/README_V1.md explaining V1 preservation

**Planning Documents:**
- 00_V2_MASTER_PLAN.md: Complete architecture overview
  - Executive summary of critical V1 issues
  - High-level component architecture diagrams
  - 5-phase implementation roadmap
  - Success metrics and risk mitigation

- 07_TASK_BREAKDOWN.md: Atomic task breakdown
  - 99+ hours of detailed tasks
  - Every task < 2 hours (atomic)
  - Clear dependencies and success criteria
  - Organized by implementation phase

**V2 Key Improvements:**
- Per-exchange parsers (factory pattern; see the sketch after this list)
- Multi-layer strict validation
- Multi-index pool cache
- Background validation pipeline
- Comprehensive observability
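
For illustration only, a minimal sketch of what a factory-pattern, per-exchange parser registry could look like. The names (`SwapParser`, `RegisterParser`, `NewParser`) are placeholders for this sketch, not the interfaces defined in the planning documents:

```go
// Illustrative sketch, not the planned V2 API.
package parsers

import (
	"fmt"

	"github.com/ethereum/go-ethereum/core/types"
)

// SwapParser decodes protocol-specific swap events from a raw log.
type SwapParser interface {
	Protocol() string
	ParseSwap(log types.Log) (*Swap, error)
}

// Swap is a normalized swap record shared by all parsers.
type Swap struct {
	Pool     string
	TokenIn  string
	TokenOut string
}

var parserRegistry = map[string]func() SwapParser{}

// RegisterParser lets each exchange package register its own parser factory.
func RegisterParser(protocol string, factory func() SwapParser) {
	parserRegistry[protocol] = factory
}

// NewParser returns a parser for the given protocol, or an error if unknown.
func NewParser(protocol string) (SwapParser, error) {
	factory, ok := parserRegistry[protocol]
	if !ok {
		return nil, fmt.Errorf("no parser registered for protocol %q", protocol)
	}
	return factory(), nil
}
```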

**Critical Issues Addressed:**
- Zero address tokens (strict validation + cache enrichment)
- Parsing accuracy (protocol-specific parsers)
- No audit trail (background validation channel)
- Inefficient lookups (multi-index cache; see the sketch after this list)
- Stats disconnection (event-driven metrics)
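
As a companion sketch, a multi-index pool cache that also rejects zero-address tokens on insert. All type and method names here are illustrative assumptions, not the planned V2 API:

```go
// Illustrative sketch, not the planned V2 API.
package cache

import (
	"sync"

	"github.com/ethereum/go-ethereum/common"
)

// Pool is a minimal pool record for this example.
type Pool struct {
	Address common.Address
	Token0  common.Address
	Token1  common.Address
}

// PoolCache keeps several indexes over the same pool set so lookups by
// address or by token are O(1) instead of a linear scan.
type PoolCache struct {
	mu        sync.RWMutex
	byAddress map[common.Address]*Pool
	byToken   map[common.Address][]*Pool
}

func NewPoolCache() *PoolCache {
	return &PoolCache{
		byAddress: make(map[common.Address]*Pool),
		byToken:   make(map[common.Address][]*Pool),
	}
}

// Add rejects zero-address entries up front (strict validation), then
// updates every index under one lock.
func (c *PoolCache) Add(p *Pool) bool {
	zero := common.Address{}
	if p.Address == zero || p.Token0 == zero || p.Token1 == zero {
		return false
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	c.byAddress[p.Address] = p
	c.byToken[p.Token0] = append(c.byToken[p.Token0], p)
	c.byToken[p.Token1] = append(c.byToken[p.Token1], p)
	return true
}

// ByToken returns a copy of every cached pool involving the given token.
func (c *PoolCache) ByToken(token common.Address) []*Pool {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return append([]*Pool(nil), c.byToken[token]...)
}
```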

Next Steps:
1. Review planning documents
2. Begin Phase 1: Foundation (P1-001 through P1-010)
3. Implement parsers in Phase 2
4. Build cache system in Phase 3
5. Add validation pipeline in Phase 4
6. Migrate and test in Phase 5

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Commit 803de231ba (parent 1773daffe7) by Administrator, 2025-11-10 10:14:26 +01:00
411 changed files with 20,390 additions and 8,680 deletions

@@ -0,0 +1,400 @@
package monitoring
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strings"
"sync"
"time"
"github.com/fraktal/mev-beta/internal/logger"
)
// LogAlertHandler logs alerts to the application logger
type LogAlertHandler struct {
logger *logger.Logger
}
// NewLogAlertHandler creates a new log-based alert handler
func NewLogAlertHandler(logger *logger.Logger) *LogAlertHandler {
return &LogAlertHandler{
logger: logger,
}
}
// HandleAlert logs the alert using structured logging
func (lah *LogAlertHandler) HandleAlert(alert CorruptionAlert) error {
switch alert.Severity {
case AlertSeverityEmergency:
lah.logger.Error("🚨 EMERGENCY ALERT",
"message", alert.Message,
"severity", alert.Severity.String(),
"timestamp", alert.Timestamp,
"context", alert.Context)
case AlertSeverityCritical:
lah.logger.Error("🔴 CRITICAL ALERT",
"message", alert.Message,
"severity", alert.Severity.String(),
"timestamp", alert.Timestamp,
"context", alert.Context)
case AlertSeverityWarning:
lah.logger.Warn("🟡 WARNING ALERT",
"message", alert.Message,
"severity", alert.Severity.String(),
"timestamp", alert.Timestamp,
"context", alert.Context)
default:
lah.logger.Info(" INFO ALERT",
"message", alert.Message,
"severity", alert.Severity.String(),
"timestamp", alert.Timestamp,
"context", alert.Context)
}
return nil
}
// FileAlertHandler writes alerts to a file in JSON format
type FileAlertHandler struct {
mu sync.Mutex
filePath string
logger *logger.Logger
}
// NewFileAlertHandler creates a new file-based alert handler
func NewFileAlertHandler(filePath string, logger *logger.Logger) *FileAlertHandler {
return &FileAlertHandler{
filePath: filePath,
logger: logger,
}
}
// HandleAlert writes the alert to a file
func (fah *FileAlertHandler) HandleAlert(alert CorruptionAlert) error {
fah.mu.Lock()
defer fah.mu.Unlock()
// Create alert record for file
alertRecord := map[string]interface{}{
"timestamp": alert.Timestamp.Format(time.RFC3339),
"severity": alert.Severity.String(),
"message": alert.Message,
"address": alert.Address.Hex(),
"corruption_score": alert.CorruptionScore,
"source": alert.Source,
"context": alert.Context,
}
// Convert to JSON
alertJSON, err := json.Marshal(alertRecord)
if err != nil {
return fmt.Errorf("failed to marshal alert: %w", err)
}
// Open file for appending
file, err := os.OpenFile(fah.filePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
if err != nil {
return fmt.Errorf("failed to open alert file: %w", err)
}
defer file.Close()
// Write alert with newline
if _, err := file.Write(append(alertJSON, '\n')); err != nil {
return fmt.Errorf("failed to write alert to file: %w", err)
}
fah.logger.Debug("Alert written to file",
"file", fah.filePath,
"severity", alert.Severity.String())
return nil
}
// HTTPAlertHandler sends alerts to an HTTP endpoint (e.g., Slack, Discord, PagerDuty)
type HTTPAlertHandler struct {
mu sync.Mutex
webhookURL string
client *http.Client
logger *logger.Logger
retryCount int
}
// NewHTTPAlertHandler creates a new HTTP-based alert handler
func NewHTTPAlertHandler(webhookURL string, logger *logger.Logger) *HTTPAlertHandler {
return &HTTPAlertHandler{
webhookURL: webhookURL,
client: &http.Client{
Timeout: 10 * time.Second,
},
logger: logger,
retryCount: 3,
}
}
// HandleAlert sends the alert to the configured HTTP endpoint
func (hah *HTTPAlertHandler) HandleAlert(alert CorruptionAlert) error {
if hah.webhookURL == "" {
return fmt.Errorf("webhook URL not configured")
}
// Create payload based on webhook type
payload := hah.createPayload(alert)
// Convert payload to JSON
payloadJSON, err := json.Marshal(payload)
if err != nil {
return fmt.Errorf("failed to marshal webhook payload: %w", err)
}
// Send with retries
for attempt := 1; attempt <= hah.retryCount; attempt++ {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
req, err := http.NewRequestWithContext(ctx, "POST", hah.webhookURL, strings.NewReader(string(payloadJSON)))
if err != nil {
cancel()
return fmt.Errorf("failed to create HTTP request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("User-Agent", "MEV-Bot-AlertHandler/1.0")
resp, err := hah.client.Do(req)
cancel()
if err != nil {
hah.logger.Warn("Failed to send alert to webhook",
"attempt", attempt,
"error", err)
if attempt == hah.retryCount {
return fmt.Errorf("failed to send alert after %d attempts: %w", hah.retryCount, err)
}
time.Sleep(time.Duration(attempt) * time.Second)
continue
}
// Read response body for debugging, then close it so retries do not leak connections
body, _ := io.ReadAll(resp.Body)
resp.Body.Close()
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
hah.logger.Debug("Alert sent successfully",
"webhook_url", hah.webhookURL,
"status_code", resp.StatusCode,
"response", string(body))
return nil
}
hah.logger.Warn("Webhook returned error status",
"attempt", attempt,
"status_code", resp.StatusCode,
"response", string(body))
if attempt == hah.retryCount {
return fmt.Errorf("webhook returned status %d after %d attempts", resp.StatusCode, hah.retryCount)
}
time.Sleep(time.Duration(attempt) * time.Second)
}
return nil
}
// createPayload creates the webhook payload based on the webhook type
func (hah *HTTPAlertHandler) createPayload(alert CorruptionAlert) map[string]interface{} {
// Detect webhook type based on URL
if strings.Contains(hah.webhookURL, "slack.com") {
return hah.createSlackPayload(alert)
} else if strings.Contains(hah.webhookURL, "discord.com") {
return hah.createDiscordPayload(alert)
}
// Generic webhook payload
return map[string]interface{}{
"timestamp": alert.Timestamp.Format(time.RFC3339),
"severity": alert.Severity.String(),
"message": alert.Message,
"address": alert.Address.Hex(),
"corruption_score": alert.CorruptionScore,
"source": alert.Source,
"context": alert.Context,
}
}
// createSlackPayload creates a Slack-compatible webhook payload
func (hah *HTTPAlertHandler) createSlackPayload(alert CorruptionAlert) map[string]interface{} {
color := "good"
switch alert.Severity {
case AlertSeverityWarning:
color = "warning"
case AlertSeverityCritical:
color = "danger"
case AlertSeverityEmergency:
color = "#FF0000" // Bright red for emergency
}
attachment := map[string]interface{}{
"color": color,
"title": fmt.Sprintf("%s Alert - MEV Bot", alert.Severity.String()),
"text": alert.Message,
"timestamp": alert.Timestamp.Unix(),
"fields": []map[string]interface{}{
{
"title": "Address",
"value": alert.Address.Hex(),
"short": true,
},
{
"title": "Corruption Score",
"value": fmt.Sprintf("%d", alert.CorruptionScore),
"short": true,
},
{
"title": "Source",
"value": alert.Source,
"short": true,
},
},
}
return map[string]interface{}{
"text": fmt.Sprintf("MEV Bot Alert: %s", alert.Severity.String()),
"attachments": []map[string]interface{}{attachment},
}
}
// createDiscordPayload creates a Discord-compatible webhook payload
func (hah *HTTPAlertHandler) createDiscordPayload(alert CorruptionAlert) map[string]interface{} {
color := 0x00FF00 // Green
switch alert.Severity {
case AlertSeverityWarning:
color = 0xFFFF00 // Yellow
case AlertSeverityCritical:
color = 0xFF8000 // Orange
case AlertSeverityEmergency:
color = 0xFF0000 // Red
}
embed := map[string]interface{}{
"title": fmt.Sprintf("%s Alert - MEV Bot", alert.Severity.String()),
"description": alert.Message,
"color": color,
"timestamp": alert.Timestamp.Format(time.RFC3339),
"fields": []map[string]interface{}{
{
"name": "Address",
"value": alert.Address.Hex(),
"inline": true,
},
{
"name": "Corruption Score",
"value": fmt.Sprintf("%d", alert.CorruptionScore),
"inline": true,
},
{
"name": "Source",
"value": alert.Source,
"inline": true,
},
},
"footer": map[string]interface{}{
"text": "MEV Bot Integrity Monitor",
},
}
return map[string]interface{}{
"embeds": []map[string]interface{}{embed},
}
}
// MetricsAlertHandler integrates with metrics systems (Prometheus, etc.)
type MetricsAlertHandler struct {
mu sync.Mutex
logger *logger.Logger
counters map[string]int64
}
// NewMetricsAlertHandler creates a new metrics-based alert handler
func NewMetricsAlertHandler(logger *logger.Logger) *MetricsAlertHandler {
return &MetricsAlertHandler{
logger: logger,
counters: make(map[string]int64),
}
}
// HandleAlert updates metrics counters based on alert
func (mah *MetricsAlertHandler) HandleAlert(alert CorruptionAlert) error {
mah.mu.Lock()
defer mah.mu.Unlock()
// Increment counters
mah.counters["total_alerts"]++
mah.counters[fmt.Sprintf("alerts_%s", strings.ToLower(alert.Severity.String()))]++
if alert.CorruptionScore > 0 {
mah.counters["corruption_alerts"]++
}
mah.logger.Debug("Metrics updated for alert",
"severity", alert.Severity.String(),
"total_alerts", mah.counters["total_alerts"])
return nil
}
// GetCounters returns the current alert counters
func (mah *MetricsAlertHandler) GetCounters() map[string]int64 {
mah.mu.Lock()
defer mah.mu.Unlock()
// Return a copy
counters := make(map[string]int64)
for k, v := range mah.counters {
counters[k] = v
}
return counters
}
// CompositeAlertHandler combines multiple alert handlers
type CompositeAlertHandler struct {
handlers []AlertSubscriber
logger *logger.Logger
}
// NewCompositeAlertHandler creates a composite alert handler
func NewCompositeAlertHandler(logger *logger.Logger, handlers ...AlertSubscriber) *CompositeAlertHandler {
return &CompositeAlertHandler{
handlers: handlers,
logger: logger,
}
}
// HandleAlert sends the alert to all configured handlers
func (cah *CompositeAlertHandler) HandleAlert(alert CorruptionAlert) error {
errors := make([]error, 0)
for i, handler := range cah.handlers {
if err := handler.HandleAlert(alert); err != nil {
cah.logger.Error("Alert handler failed",
"handler_index", i,
"handler_type", fmt.Sprintf("%T", handler),
"error", err)
errors = append(errors, fmt.Errorf("handler %d (%T): %w", i, handler, err))
}
}
if len(errors) > 0 {
return fmt.Errorf("alert handler errors: %v", errors)
}
return nil
}
// AddHandler adds a new handler to the composite
func (cah *CompositeAlertHandler) AddHandler(handler AlertSubscriber) {
cah.handlers = append(cah.handlers, handler)
}
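
A brief usage sketch (not part of this commit's diff) showing how these handlers compose. The logger.New arguments mirror the tests later in the commit; the import path for this monitoring package is an assumption:

```go
package main

import (
	"github.com/fraktal/mev-beta/internal/logger"
	"github.com/fraktal/mev-beta/internal/monitoring" // assumed package path
)

func main() {
	log := logger.New("info", "text", "")

	// Fan alerts out to the application log, a JSONL file, and in-memory counters.
	composite := monitoring.NewCompositeAlertHandler(log,
		monitoring.NewLogAlertHandler(log),
		monitoring.NewFileAlertHandler("alerts.jsonl", log),
	)
	composite.AddHandler(monitoring.NewMetricsAlertHandler(log))

	// The IntegrityMonitor defined later in this commit publishes
	// CorruptionAlert values to every registered subscriber.
	monitor := monitoring.NewIntegrityMonitor(log)
	monitor.AddAlertSubscriber(composite)
}
```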

@@ -0,0 +1,549 @@
package monitoring
import (
"encoding/json"
"fmt"
"html/template"
"net/http"
"strconv"
"time"
"github.com/fraktal/mev-beta/internal/logger"
)
// DashboardServer provides a web-based monitoring dashboard
type DashboardServer struct {
logger *logger.Logger
integrityMonitor *IntegrityMonitor
healthChecker *HealthCheckRunner
port int
server *http.Server
}
// NewDashboardServer creates a new dashboard server
func NewDashboardServer(logger *logger.Logger, integrityMonitor *IntegrityMonitor, healthChecker *HealthCheckRunner, port int) *DashboardServer {
return &DashboardServer{
logger: logger,
integrityMonitor: integrityMonitor,
healthChecker: healthChecker,
port: port,
}
}
// Start starts the dashboard HTTP server
func (ds *DashboardServer) Start() error {
mux := http.NewServeMux()
// Register endpoints
mux.HandleFunc("/", ds.handleDashboard)
mux.HandleFunc("/api/health", ds.handleAPIHealth)
mux.HandleFunc("/api/metrics", ds.handleAPIMetrics)
mux.HandleFunc("/api/history", ds.handleAPIHistory)
mux.HandleFunc("/api/alerts", ds.handleAPIAlerts)
mux.HandleFunc("/static/", ds.handleStatic)
ds.server = &http.Server{
Addr: fmt.Sprintf(":%d", ds.port),
Handler: mux,
}
ds.logger.Info("Starting monitoring dashboard",
"port", ds.port,
"url", fmt.Sprintf("http://localhost:%d", ds.port))
return ds.server.ListenAndServe()
}
// Stop stops the dashboard server
func (ds *DashboardServer) Stop() error {
if ds.server != nil {
return ds.server.Close()
}
return nil
}
// handleDashboard serves the main dashboard HTML page
func (ds *DashboardServer) handleDashboard(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
// Get current metrics and health data
metrics := ds.integrityMonitor.GetMetrics()
healthSummary := ds.integrityMonitor.GetHealthSummary()
healthHistory := ds.healthChecker.GetRecentSnapshots(20)
// Render dashboard template
tmpl := ds.getDashboardTemplate()
data := struct {
Metrics MetricsSnapshot
HealthSummary map[string]interface{}
HealthHistory []HealthSnapshot
Timestamp time.Time
}{
Metrics: metrics,
HealthSummary: healthSummary,
HealthHistory: healthHistory,
Timestamp: time.Now(),
}
if err := tmpl.Execute(w, data); err != nil {
ds.logger.Error("Failed to render dashboard", "error", err)
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
return
}
}
// handleAPIHealth provides JSON health endpoint
func (ds *DashboardServer) handleAPIHealth(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
healthSummary := ds.integrityMonitor.GetHealthSummary()
healthCheckerSummary := ds.healthChecker.GetHealthSummary()
// Combine summaries
response := map[string]interface{}{
"integrity_monitor": healthSummary,
"health_checker": healthCheckerSummary,
"timestamp": time.Now(),
}
if err := json.NewEncoder(w).Encode(response); err != nil {
ds.logger.Error("Failed to encode health response", "error", err)
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
}
}
// handleAPIMetrics provides JSON metrics endpoint
func (ds *DashboardServer) handleAPIMetrics(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
metrics := ds.integrityMonitor.GetMetrics()
if err := json.NewEncoder(w).Encode(metrics); err != nil {
ds.logger.Error("Failed to encode metrics response", "error", err)
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
}
}
// handleAPIHistory provides JSON health history endpoint
func (ds *DashboardServer) handleAPIHistory(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
// Get count parameter (default 20)
countStr := r.URL.Query().Get("count")
count := 20
if countStr != "" {
if c, err := strconv.Atoi(countStr); err == nil && c > 0 && c <= 100 {
count = c
}
}
history := ds.healthChecker.GetRecentSnapshots(count)
if err := json.NewEncoder(w).Encode(history); err != nil {
ds.logger.Error("Failed to encode history response", "error", err)
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
}
}
// handleAPIAlerts provides recent alerts for integrity and health monitoring.
func (ds *DashboardServer) handleAPIAlerts(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
limit := 20
if q := r.URL.Query().Get("limit"); q != "" {
if parsed, err := strconv.Atoi(q); err == nil && parsed > 0 && parsed <= 200 {
limit = parsed
}
}
alerts := ds.integrityMonitor.GetRecentAlerts(limit)
payload := map[string]interface{}{
"alerts": alerts,
"count": len(alerts),
"timestamp": time.Now(),
}
if err := json.NewEncoder(w).Encode(payload); err != nil {
ds.logger.Error("Failed to encode alerts response", "error", err)
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
}
}
// handleStatic serves static assets (CSS, JS)
func (ds *DashboardServer) handleStatic(w http.ResponseWriter, r *http.Request) {
// For simplicity, we'll inline CSS and JS in the HTML template
// In a production system, you'd serve actual static files
http.NotFound(w, r)
}
// getDashboardTemplate returns the HTML template for the dashboard
func (ds *DashboardServer) getDashboardTemplate() *template.Template {
htmlTemplate := `
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>MEV Bot - Data Integrity Monitor</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
background-color: #f5f5f5;
color: #333;
line-height: 1.6;
}
.header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 1rem 0;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.container {
max-width: 1200px;
margin: 0 auto;
padding: 0 1rem;
}
.header h1 {
font-size: 2rem;
font-weight: 300;
}
.header .subtitle {
opacity: 0.9;
margin-top: 0.5rem;
}
.dashboard {
padding: 2rem 0;
}
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 1.5rem;
margin-bottom: 2rem;
}
.card {
background: white;
border-radius: 8px;
padding: 1.5rem;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
border-left: 4px solid #667eea;
}
.card h3 {
color: #333;
margin-bottom: 1rem;
font-size: 1.25rem;
}
.metric {
display: flex;
justify-content: space-between;
align-items: center;
padding: 0.5rem 0;
border-bottom: 1px solid #eee;
}
.metric:last-child {
border-bottom: none;
}
.metric-label {
font-weight: 500;
color: #666;
}
.metric-value {
font-weight: 600;
color: #333;
}
.health-score {
font-size: 2rem;
font-weight: bold;
text-align: center;
padding: 1rem;
border-radius: 50%;
width: 100px;
height: 100px;
display: flex;
align-items: center;
justify-content: center;
margin: 0 auto 1rem;
}
.health-excellent { background: #4CAF50; color: white; }
.health-good { background: #8BC34A; color: white; }
.health-fair { background: #FF9800; color: white; }
.health-poor { background: #F44336; color: white; }
.status-indicator {
display: inline-block;
width: 12px;
height: 12px;
border-radius: 50%;
margin-right: 8px;
}
.status-healthy { background: #4CAF50; }
.status-warning { background: #FF9800; }
.status-critical { background: #F44336; }
.chart-container {
background: white;
border-radius: 8px;
padding: 1.5rem;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
margin-top: 1.5rem;
}
.refresh-indicator {
position: fixed;
top: 20px;
right: 20px;
background: #667eea;
color: white;
padding: 0.5rem 1rem;
border-radius: 4px;
font-size: 0.875rem;
}
.timestamp {
text-align: center;
color: #666;
font-size: 0.875rem;
margin-top: 2rem;
}
.recovery-actions {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 1rem;
margin-top: 1rem;
}
.recovery-action {
background: #f8f9fa;
padding: 0.75rem;
border-radius: 4px;
text-align: center;
}
.recovery-action-count {
font-size: 1.5rem;
font-weight: bold;
color: #667eea;
}
.recovery-action-label {
font-size: 0.875rem;
color: #666;
text-transform: uppercase;
}
</style>
</head>
<body>
<div class="header">
<div class="container">
<h1>🤖 MEV Bot - Data Integrity Monitor</h1>
<p class="subtitle">Real-time monitoring of corruption detection and recovery systems</p>
</div>
</div>
<div class="dashboard">
<div class="container">
<div class="grid">
<!-- Health Score Card -->
<div class="card">
<h3>System Health</h3>
<div class="health-score {{.HealthSummary.health_score | healthClass}}">
{{.HealthSummary.health_score | printf "%.1f"}}
</div>
<div class="metric">
<span class="metric-label">Status</span>
<span class="metric-value">
<span class="status-indicator {{.HealthSummary.health_score | statusClass}}"></span>
{{.HealthSummary.health_score | healthStatus}}
</span>
</div>
<div class="metric">
<span class="metric-label">Monitor Enabled</span>
<span class="metric-value">{{if .HealthSummary.enabled}}✅ Yes{{else}}❌ No{{end}}</span>
</div>
</div>
<!-- Processing Statistics -->
<div class="card">
<h3>Processing Statistics</h3>
<div class="metric">
<span class="metric-label">Total Addresses</span>
<span class="metric-value">{{.Metrics.TotalAddressesProcessed | printf "%,d"}}</span>
</div>
<div class="metric">
<span class="metric-label">Corruption Detected</span>
<span class="metric-value">{{.Metrics.CorruptAddressesDetected | printf "%,d"}}</span>
</div>
<div class="metric">
<span class="metric-label">Corruption Rate</span>
<span class="metric-value">{{.HealthSummary.corruption_rate | printf "%.4f%%"}}</span>
</div>
<div class="metric">
<span class="metric-label">Avg Corruption Score</span>
<span class="metric-value">{{.Metrics.AverageCorruptionScore | printf "%.1f"}}</span>
</div>
<div class="metric">
<span class="metric-label">Max Corruption Score</span>
<span class="metric-value">{{.Metrics.MaxCorruptionScore}}</span>
</div>
</div>
<!-- Validation Results -->
<div class="card">
<h3>Validation Results</h3>
<div class="metric">
<span class="metric-label">Validation Passed</span>
<span class="metric-value">{{.Metrics.AddressValidationPassed | printf "%,d"}}</span>
</div>
<div class="metric">
<span class="metric-label">Validation Failed</span>
<span class="metric-value">{{.Metrics.AddressValidationFailed | printf "%,d"}}</span>
</div>
<div class="metric">
<span class="metric-label">Success Rate</span>
<span class="metric-value">{{.HealthSummary.validation_success_rate | printf "%.2f%%"}}</span>
</div>
</div>
<!-- Contract Calls -->
<div class="card">
<h3>Contract Calls</h3>
<div class="metric">
<span class="metric-label">Successful Calls</span>
<span class="metric-value">{{.Metrics.ContractCallsSucceeded | printf "%,d"}}</span>
</div>
<div class="metric">
<span class="metric-label">Failed Calls</span>
<span class="metric-value">{{.Metrics.ContractCallsFailed | printf "%,d"}}</span>
</div>
<div class="metric">
<span class="metric-label">Success Rate</span>
<span class="metric-value">{{.HealthSummary.contract_call_success_rate | printf "%.2f%%"}}</span>
</div>
</div>
</div>
<!-- Recovery Actions -->
<div class="chart-container">
<h3>Recovery System Activity</h3>
<div class="recovery-actions">
<div class="recovery-action">
<div class="recovery-action-count">{{.Metrics.RetryOperationsTriggered}}</div>
<div class="recovery-action-label">Retry Operations</div>
</div>
<div class="recovery-action">
<div class="recovery-action-count">{{.Metrics.FallbackOperationsUsed}}</div>
<div class="recovery-action-label">Fallback Used</div>
</div>
<div class="recovery-action">
<div class="recovery-action-count">{{.Metrics.CircuitBreakersTripped}}</div>
<div class="recovery-action-label">Circuit Breakers</div>
</div>
</div>
</div>
<div class="timestamp">
Last updated: {{.Timestamp.Format "2006-01-02 15:04:05 UTC"}}
<br>
Auto-refresh every 30 seconds
</div>
</div>
</div>
<div class="refresh-indicator">🔄 Live</div>
<script>
// Auto-refresh every 30 seconds
setInterval(function() {
window.location.reload();
}, 30000);
// Add smooth transitions
document.addEventListener('DOMContentLoaded', function() {
const cards = document.querySelectorAll('.card');
cards.forEach((card, index) => {
card.style.animationDelay = (index * 0.1) + 's';
card.style.animation = 'fadeInUp 0.6s ease forwards';
});
});
</script>
<style>
@keyframes fadeInUp {
from {
opacity: 0;
transform: translateY(20px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
</style>
</body>
</html>
`
// Create template with custom functions
funcMap := template.FuncMap{
"healthClass": func(score interface{}) string {
s := score.(float64)
if s >= 0.9 {
return "health-excellent"
} else if s >= 0.7 {
return "health-good"
} else if s >= 0.5 {
return "health-fair"
}
return "health-poor"
},
"statusClass": func(score interface{}) string {
s := score.(float64)
if s >= 0.7 {
return "status-healthy"
} else if s >= 0.5 {
return "status-warning"
}
return "status-critical"
},
"healthStatus": func(score interface{}) string {
s := score.(float64)
if s >= 0.9 {
return "Excellent"
} else if s >= 0.7 {
return "Good"
} else if s >= 0.5 {
return "Fair"
}
return "Poor"
},
}
return template.Must(template.New("dashboard").Funcs(funcMap).Parse(htmlTemplate))
}
// GetDashboardURL returns the dashboard URL
func (ds *DashboardServer) GetDashboardURL() string {
return fmt.Sprintf("http://localhost:%d", ds.port)
}
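
A hedged sketch of running this dashboard alongside the monitor (the package path and port 8080 are assumptions for the example):

```go
package main

import (
	"context"

	"github.com/fraktal/mev-beta/internal/logger"
	"github.com/fraktal/mev-beta/internal/monitoring" // assumed package path
)

func main() {
	log := logger.New("info", "text", "")
	monitor := monitoring.NewIntegrityMonitor(log)

	// Start the periodic health checker that feeds the dashboard history.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	monitor.StartHealthCheckRunner(ctx)

	// Serve the dashboard on :8080 (port is arbitrary for this sketch).
	dash := monitoring.NewDashboardServer(log, monitor, monitor.GetHealthCheckRunner(), 8080)
	log.Info("dashboard available", "url", dash.GetDashboardURL())
	if err := dash.Start(); err != nil {
		log.Error("dashboard stopped", "error", err)
	}
}
```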

@@ -0,0 +1,447 @@
package monitoring
import (
"context"
"fmt"
"sync"
"time"
"github.com/fraktal/mev-beta/internal/logger"
)
// HealthCheckRunner performs periodic health checks and monitoring
type HealthCheckRunner struct {
mu sync.RWMutex
logger *logger.Logger
integrityMonitor *IntegrityMonitor
checkInterval time.Duration
running bool
stopChan chan struct{}
lastHealthCheck time.Time
healthHistory []HealthSnapshot
maxHistorySize int
warmupSamples int
minAddressesForAlerts int64
}
// HealthSnapshot represents a point-in-time health snapshot
type HealthSnapshot struct {
Timestamp time.Time
HealthScore float64
CorruptionRate float64
ValidationSuccess float64
ContractCallSuccess float64
ActiveAlerts int
Trend HealthTrend
}
// HealthTrend indicates the direction of health metrics
type HealthTrend int
const (
HealthTrendUnknown HealthTrend = iota
HealthTrendImproving
HealthTrendStable
HealthTrendDeclining
HealthTrendCritical
)
func (t HealthTrend) String() string {
switch t {
case HealthTrendImproving:
return "IMPROVING"
case HealthTrendStable:
return "STABLE"
case HealthTrendDeclining:
return "DECLINING"
case HealthTrendCritical:
return "CRITICAL"
default:
return "UNKNOWN"
}
}
// NewHealthCheckRunner creates a new health check runner
func NewHealthCheckRunner(logger *logger.Logger, integrityMonitor *IntegrityMonitor) *HealthCheckRunner {
return &HealthCheckRunner{
logger: logger,
integrityMonitor: integrityMonitor,
checkInterval: 30 * time.Second, // Default 30 second intervals
stopChan: make(chan struct{}),
healthHistory: make([]HealthSnapshot, 0),
maxHistorySize: 100, // Keep last 100 snapshots (50 minutes at 30s intervals)
warmupSamples: 3,
minAddressesForAlerts: 25,
}
}
// Start begins the periodic health checking routine
func (hcr *HealthCheckRunner) Start(ctx context.Context) {
hcr.mu.Lock()
if hcr.running {
hcr.mu.Unlock()
return
}
hcr.running = true
hcr.mu.Unlock()
hcr.logger.Info("Starting health check runner",
"interval", hcr.checkInterval)
go hcr.healthCheckLoop(ctx)
}
// Stop stops the health checking routine
func (hcr *HealthCheckRunner) Stop() {
hcr.mu.Lock()
defer hcr.mu.Unlock()
if !hcr.running {
return
}
hcr.running = false
close(hcr.stopChan)
hcr.logger.Info("Health check runner stopped")
}
// healthCheckLoop runs the periodic health checking
func (hcr *HealthCheckRunner) healthCheckLoop(ctx context.Context) {
ticker := time.NewTicker(hcr.checkInterval)
defer ticker.Stop()
// Perform initial health check
hcr.performHealthCheck()
for {
select {
case <-ctx.Done():
hcr.logger.Info("Health check runner stopped due to context cancellation")
return
case <-hcr.stopChan:
hcr.logger.Info("Health check runner stopped")
return
case <-ticker.C:
hcr.performHealthCheck()
}
}
}
// performHealthCheck executes a comprehensive health check
func (hcr *HealthCheckRunner) performHealthCheck() {
start := time.Now()
hcr.lastHealthCheck = start
if !hcr.integrityMonitor.IsEnabled() {
hcr.logger.Debug("Skipping health check - integrity monitor disabled")
return
}
// Get current metrics
metrics := hcr.integrityMonitor.GetMetrics()
healthSummary := hcr.integrityMonitor.GetHealthSummary()
// Calculate rates
corruptionRate := 0.0
if metrics.TotalAddressesProcessed > 0 {
corruptionRate = float64(metrics.CorruptAddressesDetected) / float64(metrics.TotalAddressesProcessed)
}
validationSuccessRate := 0.0
totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed
if totalValidations > 0 {
validationSuccessRate = float64(metrics.AddressValidationPassed) / float64(totalValidations)
}
contractCallSuccessRate := 0.0
totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed
if totalCalls > 0 {
contractCallSuccessRate = float64(metrics.ContractCallsSucceeded) / float64(totalCalls)
}
// Create health snapshot
snapshot := HealthSnapshot{
Timestamp: start,
HealthScore: metrics.HealthScore,
CorruptionRate: corruptionRate,
ValidationSuccess: validationSuccessRate,
ContractCallSuccess: contractCallSuccessRate,
ActiveAlerts: 0, // Will be calculated based on current conditions
Trend: hcr.calculateHealthTrend(metrics.HealthScore),
}
// Add to history
hcr.addHealthSnapshot(snapshot)
// Check for threshold violations and generate alerts
hcr.checkThresholds(healthSummary, snapshot)
// Log health status periodically
hcr.logHealthStatus(snapshot, time.Since(start))
}
// addHealthSnapshot adds a snapshot to the health history
func (hcr *HealthCheckRunner) addHealthSnapshot(snapshot HealthSnapshot) {
hcr.mu.Lock()
defer hcr.mu.Unlock()
hcr.healthHistory = append(hcr.healthHistory, snapshot)
// Trim history if it exceeds max size
if len(hcr.healthHistory) > hcr.maxHistorySize {
hcr.healthHistory = hcr.healthHistory[len(hcr.healthHistory)-hcr.maxHistorySize:]
}
}
// calculateHealthTrend analyzes recent health scores to determine trend
func (hcr *HealthCheckRunner) calculateHealthTrend(currentScore float64) HealthTrend {
hcr.mu.RLock()
defer hcr.mu.RUnlock()
if len(hcr.healthHistory) < 3 {
return HealthTrendUnknown
}
// Get last few scores for trend analysis
recentScores := make([]float64, 0, 5)
start := len(hcr.healthHistory) - 5
if start < 0 {
start = 0
}
for i := start; i < len(hcr.healthHistory); i++ {
recentScores = append(recentScores, hcr.healthHistory[i].HealthScore)
}
recentScores = append(recentScores, currentScore)
// Calculate trend
if currentScore < 0.5 {
return HealthTrendCritical
}
// Simple linear trend calculation
if len(recentScores) >= 3 {
first := recentScores[0]
last := recentScores[len(recentScores)-1]
diff := last - first
if diff > 0.05 {
return HealthTrendImproving
} else if diff < -0.05 {
return HealthTrendDeclining
} else {
return HealthTrendStable
}
}
return HealthTrendUnknown
}
// checkThresholds checks for threshold violations and generates alerts
func (hcr *HealthCheckRunner) checkThresholds(healthSummary map[string]interface{}, snapshot HealthSnapshot) {
if !hcr.readyForAlerts(healthSummary, snapshot) {
hcr.logger.Debug("Health alerts suppressed during warm-up",
"health_score", snapshot.HealthScore,
"total_addresses_processed", safeNumericLookup(healthSummary, "total_addresses_processed"),
"history_size", hcr.historySize())
return
}
// Critical health score alert
if snapshot.HealthScore < 0.5 {
alert := CorruptionAlert{
Timestamp: time.Now(),
Severity: AlertSeverityEmergency,
Message: fmt.Sprintf("CRITICAL: System health score is %.2f (below 0.5)", snapshot.HealthScore),
Context: map[string]interface{}{
"health_score": snapshot.HealthScore,
"corruption_rate": snapshot.CorruptionRate,
"validation_success": snapshot.ValidationSuccess,
"contract_call_success": snapshot.ContractCallSuccess,
"trend": snapshot.Trend.String(),
},
}
hcr.integrityMonitor.sendAlert(alert)
}
// High corruption rate alert
if snapshot.CorruptionRate > 0.10 { // 10% corruption rate
alert := CorruptionAlert{
Timestamp: time.Now(),
Severity: AlertSeverityCritical,
Message: fmt.Sprintf("High corruption rate detected: %.2f%%", snapshot.CorruptionRate*100),
Context: map[string]interface{}{
"corruption_rate": snapshot.CorruptionRate,
"threshold": 0.10,
"addresses_affected": snapshot.CorruptionRate,
},
}
hcr.integrityMonitor.sendAlert(alert)
}
// Declining trend alert
if snapshot.Trend == HealthTrendDeclining || snapshot.Trend == HealthTrendCritical {
alert := CorruptionAlert{
Timestamp: time.Now(),
Severity: AlertSeverityWarning,
Message: fmt.Sprintf("System health trend is %s (current score: %.2f)", snapshot.Trend.String(), snapshot.HealthScore),
Context: map[string]interface{}{
"trend": snapshot.Trend.String(),
"health_score": snapshot.HealthScore,
"recent_snapshots": hcr.getRecentSnapshots(5),
},
}
hcr.integrityMonitor.sendAlert(alert)
}
}
func (hcr *HealthCheckRunner) readyForAlerts(healthSummary map[string]interface{}, snapshot HealthSnapshot) bool {
hcr.mu.RLock()
historyLen := len(hcr.healthHistory)
hcr.mu.RUnlock()
if historyLen < hcr.warmupSamples {
return false
}
totalProcessed := safeNumericLookup(healthSummary, "total_addresses_processed")
if totalProcessed >= 0 && totalProcessed < float64(hcr.minAddressesForAlerts) {
return false
}
// Require at least one validation or contract call attempt before alarming.
if snapshot.ValidationSuccess == 0 && snapshot.ContractCallSuccess == 0 && totalProcessed == 0 {
return false
}
return true
}
func safeNumericLookup(summary map[string]interface{}, key string) float64 {
if summary == nil {
return -1
}
value, ok := summary[key]
if !ok {
return -1
}
switch v := value.(type) {
case int:
return float64(v)
case int32:
return float64(v)
case int64:
return float64(v)
case uint:
return float64(v)
case uint32:
return float64(v)
case uint64:
return float64(v)
case float32:
return float64(v)
case float64:
return v
default:
return -1
}
}
func (hcr *HealthCheckRunner) historySize() int {
hcr.mu.RLock()
defer hcr.mu.RUnlock()
return len(hcr.healthHistory)
}
// logHealthStatus logs periodic health status information
func (hcr *HealthCheckRunner) logHealthStatus(snapshot HealthSnapshot, duration time.Duration) {
// Log detailed status every 5 minutes (10 checks at 30s intervals)
if hcr.historySize()%10 == 0 {
hcr.logger.Info("System health status",
"health_score", snapshot.HealthScore,
"corruption_rate", fmt.Sprintf("%.4f", snapshot.CorruptionRate),
"validation_success", fmt.Sprintf("%.4f", snapshot.ValidationSuccess),
"contract_call_success", fmt.Sprintf("%.4f", snapshot.ContractCallSuccess),
"trend", snapshot.Trend.String(),
"check_duration", duration)
} else {
// Brief status for regular checks
hcr.logger.Debug("Health check completed",
"health_score", snapshot.HealthScore,
"trend", snapshot.Trend.String(),
"duration", duration)
}
}
// GetRecentSnapshots returns the most recent health snapshots
func (hcr *HealthCheckRunner) GetRecentSnapshots(count int) []HealthSnapshot {
return hcr.getRecentSnapshots(count)
}
// getRecentSnapshots internal implementation
func (hcr *HealthCheckRunner) getRecentSnapshots(count int) []HealthSnapshot {
hcr.mu.RLock()
defer hcr.mu.RUnlock()
if len(hcr.healthHistory) == 0 {
return []HealthSnapshot{}
}
start := len(hcr.healthHistory) - count
if start < 0 {
start = 0
}
// Create a copy to avoid external modification
snapshots := make([]HealthSnapshot, len(hcr.healthHistory[start:]))
copy(snapshots, hcr.healthHistory[start:])
return snapshots
}
// GetHealthSummary returns a comprehensive health summary
func (hcr *HealthCheckRunner) GetHealthSummary() map[string]interface{} {
hcr.mu.RLock()
defer hcr.mu.RUnlock()
if len(hcr.healthHistory) == 0 {
return map[string]interface{}{
"running": hcr.running,
"check_interval": hcr.checkInterval.String(),
"history_size": 0,
"last_check": nil,
}
}
lastSnapshot := hcr.healthHistory[len(hcr.healthHistory)-1]
return map[string]interface{}{
"running": hcr.running,
"check_interval": hcr.checkInterval.String(),
"history_size": len(hcr.healthHistory),
"last_check": hcr.lastHealthCheck,
"current_health_score": lastSnapshot.HealthScore,
"current_trend": lastSnapshot.Trend.String(),
"corruption_rate": lastSnapshot.CorruptionRate,
"validation_success": lastSnapshot.ValidationSuccess,
"contract_call_success": lastSnapshot.ContractCallSuccess,
"recent_snapshots": hcr.getRecentSnapshots(10),
}
}
// SetCheckInterval sets the health check interval
func (hcr *HealthCheckRunner) SetCheckInterval(interval time.Duration) {
hcr.mu.Lock()
defer hcr.mu.Unlock()
hcr.checkInterval = interval
hcr.logger.Info("Health check interval updated", "interval", interval)
}
// IsRunning returns whether the health checker is running
func (hcr *HealthCheckRunner) IsRunning() bool {
hcr.mu.RLock()
defer hcr.mu.RUnlock()
return hcr.running
}
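
For reference, a small sketch of tuning the runner through the monitor; the 10-second interval and the sleep are illustrative values, and the package path is an assumption:

```go
package main

import (
	"context"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
	"github.com/fraktal/mev-beta/internal/monitoring" // assumed package path
)

func main() {
	log := logger.New("info", "text", "")
	monitor := monitoring.NewIntegrityMonitor(log)

	// Check health every 10s instead of the 30s default, then start the loop.
	runner := monitor.GetHealthCheckRunner()
	runner.SetCheckInterval(10 * time.Second)
	monitor.StartHealthCheckRunner(context.Background())

	// Later: inspect the rolling window that the dashboard also reads.
	time.Sleep(25 * time.Second)
	for _, snap := range runner.GetRecentSnapshots(5) {
		log.Info("health snapshot", "score", snap.HealthScore, "trend", snap.Trend.String())
	}
	monitor.StopHealthCheckRunner()
}
```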

@@ -0,0 +1,533 @@
package monitoring
import (
"context"
"fmt"
"sync"
"time"
"github.com/ethereum/go-ethereum/common"
"github.com/fraktal/mev-beta/internal/logger"
"github.com/fraktal/mev-beta/internal/recovery"
)
// IntegrityMetrics tracks data integrity statistics
type IntegrityMetrics struct {
mu sync.RWMutex
TotalAddressesProcessed int64
CorruptAddressesDetected int64
AddressValidationPassed int64
AddressValidationFailed int64
ContractCallsSucceeded int64
ContractCallsFailed int64
RetryOperationsTriggered int64
FallbackOperationsUsed int64
CircuitBreakersTripped int64
LastCorruptionDetection time.Time
AverageCorruptionScore float64
MaxCorruptionScore int
HealthScore float64
HighScore float64
RecoveryActions map[recovery.RecoveryAction]int64
ErrorsByType map[recovery.ErrorType]int64
}
// MetricsSnapshot represents a copy of metrics without mutex for safe external access
type MetricsSnapshot struct {
TotalAddressesProcessed int64 `json:"total_addresses_processed"`
CorruptAddressesDetected int64 `json:"corrupt_addresses_detected"`
AddressValidationPassed int64 `json:"address_validation_passed"`
AddressValidationFailed int64 `json:"address_validation_failed"`
ContractCallsSucceeded int64 `json:"contract_calls_succeeded"`
ContractCallsFailed int64 `json:"contract_calls_failed"`
RetryOperationsTriggered int64 `json:"retry_operations_triggered"`
FallbackOperationsUsed int64 `json:"fallback_operations_used"`
CircuitBreakersTripped int64 `json:"circuit_breakers_tripped"`
LastCorruptionDetection time.Time `json:"last_corruption_detection"`
AverageCorruptionScore float64 `json:"average_corruption_score"`
MaxCorruptionScore int `json:"max_corruption_score"`
HealthScore float64 `json:"health_score"`
HighScore float64 `json:"high_score"`
RecoveryActions map[recovery.RecoveryAction]int64 `json:"recovery_actions"`
ErrorsByType map[recovery.ErrorType]int64 `json:"errors_by_type"`
}
// CorruptionAlert represents a corruption detection alert
type CorruptionAlert struct {
Timestamp time.Time
Address common.Address
CorruptionScore int
Source string
Severity AlertSeverity
Message string
Context map[string]interface{}
}
// AlertSeverity defines alert severity levels
type AlertSeverity int
const (
AlertSeverityInfo AlertSeverity = iota
AlertSeverityWarning
AlertSeverityCritical
AlertSeverityEmergency
)
func (s AlertSeverity) String() string {
switch s {
case AlertSeverityInfo:
return "INFO"
case AlertSeverityWarning:
return "WARNING"
case AlertSeverityCritical:
return "CRITICAL"
case AlertSeverityEmergency:
return "EMERGENCY"
default:
return "UNKNOWN"
}
}
// IntegrityMonitor monitors and tracks data integrity metrics
type IntegrityMonitor struct {
mu sync.RWMutex
logger *logger.Logger
metrics *IntegrityMetrics
alertThresholds map[string]float64
alertSubscribers []AlertSubscriber
healthCheckRunner *HealthCheckRunner
enabled bool
alerts []CorruptionAlert
alertsMutex sync.RWMutex
}
// AlertSubscriber defines the interface for alert handlers
type AlertSubscriber interface {
HandleAlert(alert CorruptionAlert) error
}
// NewIntegrityMonitor creates a new integrity monitoring system
func NewIntegrityMonitor(logger *logger.Logger) *IntegrityMonitor {
monitor := &IntegrityMonitor{
logger: logger,
metrics: &IntegrityMetrics{
RecoveryActions: make(map[recovery.RecoveryAction]int64),
ErrorsByType: make(map[recovery.ErrorType]int64),
HealthScore: 1.0,
HighScore: 1.0,
},
alertThresholds: make(map[string]float64),
enabled: true,
alerts: make([]CorruptionAlert, 0, 256),
}
// Set default thresholds
monitor.setDefaultThresholds()
// Initialize health check runner
monitor.healthCheckRunner = NewHealthCheckRunner(logger, monitor)
return monitor
}
// setDefaultThresholds configures default alert thresholds
func (im *IntegrityMonitor) setDefaultThresholds() {
im.alertThresholds["corruption_rate"] = 0.05 // 5% corruption rate
im.alertThresholds["failure_rate"] = 0.10 // 10% failure rate
im.alertThresholds["health_score_min"] = 0.80 // 80% minimum health
im.alertThresholds["max_corruption_score"] = 70.0 // Maximum individual corruption score
im.alertThresholds["circuit_breaker_rate"] = 0.02 // 2% circuit breaker rate
}
// RecordAddressProcessed increments the counter for processed addresses
func (im *IntegrityMonitor) RecordAddressProcessed() {
if !im.enabled {
return
}
im.metrics.mu.Lock()
im.metrics.TotalAddressesProcessed++
im.metrics.mu.Unlock()
im.updateHealthScore()
}
// RecordCorruptionDetected records a corruption detection event
func (im *IntegrityMonitor) RecordCorruptionDetected(address common.Address, corruptionScore int, source string) {
if !im.enabled {
return
}
im.metrics.mu.Lock()
im.metrics.CorruptAddressesDetected++
im.metrics.LastCorruptionDetection = time.Now()
// Update corruption statistics
if corruptionScore > im.metrics.MaxCorruptionScore {
im.metrics.MaxCorruptionScore = corruptionScore
}
// Calculate rolling average corruption score
total := float64(im.metrics.CorruptAddressesDetected)
im.metrics.AverageCorruptionScore = ((im.metrics.AverageCorruptionScore * (total - 1)) + float64(corruptionScore)) / total
im.metrics.mu.Unlock()
// Generate alert based on corruption score
severity := im.getCorruptionSeverity(corruptionScore)
alert := CorruptionAlert{
Timestamp: time.Now(),
Address: address,
CorruptionScore: corruptionScore,
Source: source,
Severity: severity,
Message: fmt.Sprintf("Corruption detected: address %s, score %d, source %s", address.Hex(), corruptionScore, source),
Context: map[string]interface{}{
"address": address.Hex(),
"corruption_score": corruptionScore,
"source": source,
"timestamp": time.Now().Unix(),
},
}
im.sendAlert(alert)
im.updateHealthScore()
im.logger.Warn("Corruption detected",
"address", address.Hex(),
"corruption_score", corruptionScore,
"source", source,
"severity", severity.String())
}
// RecordValidationResult records address validation results
func (im *IntegrityMonitor) RecordValidationResult(passed bool) {
if !im.enabled {
return
}
im.metrics.mu.Lock()
if passed {
im.metrics.AddressValidationPassed++
} else {
im.metrics.AddressValidationFailed++
}
im.metrics.mu.Unlock()
im.updateHealthScore()
}
// RecordContractCallResult records contract call success/failure
func (im *IntegrityMonitor) RecordContractCallResult(succeeded bool) {
if !im.enabled {
return
}
im.metrics.mu.Lock()
if succeeded {
im.metrics.ContractCallsSucceeded++
} else {
im.metrics.ContractCallsFailed++
}
im.metrics.mu.Unlock()
im.updateHealthScore()
}
// RecordRecoveryAction records recovery action usage
func (im *IntegrityMonitor) RecordRecoveryAction(action recovery.RecoveryAction) {
if !im.enabled {
return
}
im.metrics.mu.Lock()
im.metrics.RecoveryActions[action]++
// Track specific metrics
switch action {
case recovery.ActionRetryWithBackoff:
im.metrics.RetryOperationsTriggered++
case recovery.ActionUseFallbackData:
im.metrics.FallbackOperationsUsed++
case recovery.ActionCircuitBreaker:
im.metrics.CircuitBreakersTripped++
}
im.metrics.mu.Unlock()
im.updateHealthScore()
}
// RecordErrorType records error by type
func (im *IntegrityMonitor) RecordErrorType(errorType recovery.ErrorType) {
if !im.enabled {
return
}
im.metrics.mu.Lock()
im.metrics.ErrorsByType[errorType]++
im.metrics.mu.Unlock()
}
// getCorruptionSeverity determines alert severity based on corruption score
func (im *IntegrityMonitor) getCorruptionSeverity(corruptionScore int) AlertSeverity {
if corruptionScore >= 90 {
return AlertSeverityEmergency
} else if corruptionScore >= 70 {
return AlertSeverityCritical
} else if corruptionScore >= 40 {
return AlertSeverityWarning
}
return AlertSeverityInfo
}
// updateHealthScore calculates overall system health score
func (im *IntegrityMonitor) updateHealthScore() {
im.metrics.mu.Lock()
defer im.metrics.mu.Unlock()
if im.metrics.TotalAddressesProcessed == 0 {
im.metrics.HealthScore = 1.0
return
}
// Calculate component scores
corruptionRate := float64(im.metrics.CorruptAddressesDetected) / float64(im.metrics.TotalAddressesProcessed)
var validationSuccessRate float64 = 1.0
validationTotal := im.metrics.AddressValidationPassed + im.metrics.AddressValidationFailed
if validationTotal > 0 {
validationSuccessRate = float64(im.metrics.AddressValidationPassed) / float64(validationTotal)
}
var contractCallSuccessRate float64 = 1.0
contractTotal := im.metrics.ContractCallsSucceeded + im.metrics.ContractCallsFailed
if contractTotal > 0 {
contractCallSuccessRate = float64(im.metrics.ContractCallsSucceeded) / float64(contractTotal)
}
// Weighted health score calculation
healthScore := 0.0
healthScore += (1.0 - corruptionRate) * 0.4 // 40% weight on corruption prevention
healthScore += validationSuccessRate * 0.3 // 30% weight on validation success
healthScore += contractCallSuccessRate * 0.3 // 30% weight on contract call success
// Cap at 1.0 and handle edge cases
if healthScore > 1.0 {
healthScore = 1.0
} else if healthScore < 0.0 {
healthScore = 0.0
}
im.metrics.HealthScore = healthScore
if healthScore > im.metrics.HighScore {
im.metrics.HighScore = healthScore
}
// Check for health score threshold alerts
if healthScore < im.alertThresholds["health_score_min"] {
alert := CorruptionAlert{
Timestamp: time.Now(),
Severity: AlertSeverityCritical,
Message: fmt.Sprintf("System health score dropped to %.2f (threshold: %.2f)", healthScore, im.alertThresholds["health_score_min"]),
Context: map[string]interface{}{
"health_score": healthScore,
"threshold": im.alertThresholds["health_score_min"],
"corruption_rate": corruptionRate,
"validation_success": validationSuccessRate,
"contract_call_success": contractCallSuccessRate,
},
}
im.sendAlert(alert)
}
}
// sendAlert sends alerts to all subscribers
func (im *IntegrityMonitor) sendAlert(alert CorruptionAlert) {
im.alertsMutex.Lock()
im.alerts = append(im.alerts, alert)
if len(im.alerts) > 1000 {
trimmed := make([]CorruptionAlert, 1000)
copy(trimmed, im.alerts[len(im.alerts)-1000:])
im.alerts = trimmed
}
im.alertsMutex.Unlock()
// Snapshot subscribers under the read lock so AddAlertSubscriber cannot race this loop
im.mu.RLock()
subscribers := append([]AlertSubscriber(nil), im.alertSubscribers...)
im.mu.RUnlock()
for _, subscriber := range subscribers {
if err := subscriber.HandleAlert(alert); err != nil {
im.logger.Error("Failed to send alert",
"subscriber", fmt.Sprintf("%T", subscriber),
"error", err)
}
}
}
// AddAlertSubscriber adds an alert subscriber
func (im *IntegrityMonitor) AddAlertSubscriber(subscriber AlertSubscriber) {
im.mu.Lock()
defer im.mu.Unlock()
im.alertSubscribers = append(im.alertSubscribers, subscriber)
}
// GetMetrics returns a copy of current metrics
func (im *IntegrityMonitor) GetMetrics() MetricsSnapshot {
im.metrics.mu.RLock()
defer im.metrics.mu.RUnlock()
// Create a deep copy
metrics := IntegrityMetrics{
TotalAddressesProcessed: im.metrics.TotalAddressesProcessed,
CorruptAddressesDetected: im.metrics.CorruptAddressesDetected,
AddressValidationPassed: im.metrics.AddressValidationPassed,
AddressValidationFailed: im.metrics.AddressValidationFailed,
ContractCallsSucceeded: im.metrics.ContractCallsSucceeded,
ContractCallsFailed: im.metrics.ContractCallsFailed,
RetryOperationsTriggered: im.metrics.RetryOperationsTriggered,
FallbackOperationsUsed: im.metrics.FallbackOperationsUsed,
CircuitBreakersTripped: im.metrics.CircuitBreakersTripped,
LastCorruptionDetection: im.metrics.LastCorruptionDetection,
AverageCorruptionScore: im.metrics.AverageCorruptionScore,
MaxCorruptionScore: im.metrics.MaxCorruptionScore,
HealthScore: im.metrics.HealthScore,
HighScore: im.metrics.HighScore,
RecoveryActions: make(map[recovery.RecoveryAction]int64),
ErrorsByType: make(map[recovery.ErrorType]int64),
}
// Copy maps
for k, v := range im.metrics.RecoveryActions {
metrics.RecoveryActions[k] = v
}
for k, v := range im.metrics.ErrorsByType {
metrics.ErrorsByType[k] = v
}
// Return a safe copy without mutex
return MetricsSnapshot{
TotalAddressesProcessed: metrics.TotalAddressesProcessed,
CorruptAddressesDetected: metrics.CorruptAddressesDetected,
AddressValidationPassed: metrics.AddressValidationPassed,
AddressValidationFailed: metrics.AddressValidationFailed,
ContractCallsSucceeded: metrics.ContractCallsSucceeded,
ContractCallsFailed: metrics.ContractCallsFailed,
RetryOperationsTriggered: metrics.RetryOperationsTriggered,
FallbackOperationsUsed: metrics.FallbackOperationsUsed,
CircuitBreakersTripped: metrics.CircuitBreakersTripped,
LastCorruptionDetection: metrics.LastCorruptionDetection,
AverageCorruptionScore: metrics.AverageCorruptionScore,
MaxCorruptionScore: metrics.MaxCorruptionScore,
HealthScore: metrics.HealthScore,
HighScore: metrics.HighScore,
RecoveryActions: metrics.RecoveryActions,
ErrorsByType: metrics.ErrorsByType,
}
}
// GetHealthSummary returns a comprehensive health summary
func (im *IntegrityMonitor) GetHealthSummary() map[string]interface{} {
metrics := im.GetMetrics()
corruptionRate := 0.0
if metrics.TotalAddressesProcessed > 0 {
corruptionRate = float64(metrics.CorruptAddressesDetected) / float64(metrics.TotalAddressesProcessed)
}
validationSuccessRate := 0.0
totalValidations := metrics.AddressValidationPassed + metrics.AddressValidationFailed
if totalValidations > 0 {
validationSuccessRate = float64(metrics.AddressValidationPassed) / float64(totalValidations)
}
contractCallSuccessRate := 0.0
totalCalls := metrics.ContractCallsSucceeded + metrics.ContractCallsFailed
if totalCalls > 0 {
contractCallSuccessRate = float64(metrics.ContractCallsSucceeded) / float64(totalCalls)
}
return map[string]interface{}{
"enabled": im.enabled,
"health_score": metrics.HealthScore,
"total_addresses_processed": metrics.TotalAddressesProcessed,
"corruption_detections": metrics.CorruptAddressesDetected,
"corruption_rate": corruptionRate,
"validation_success_rate": validationSuccessRate,
"contract_call_success_rate": contractCallSuccessRate,
"average_corruption_score": metrics.AverageCorruptionScore,
"max_corruption_score": metrics.MaxCorruptionScore,
"retry_operations": metrics.RetryOperationsTriggered,
"fallback_operations": metrics.FallbackOperationsUsed,
"circuit_breakers_tripped": metrics.CircuitBreakersTripped,
"last_corruption": metrics.LastCorruptionDetection,
"recovery_actions": metrics.RecoveryActions,
"errors_by_type": metrics.ErrorsByType,
"alert_thresholds": im.alertThresholds,
"alert_subscribers": len(im.alertSubscribers),
}
}
// GetRecentAlerts returns the most recent corruption alerts up to the specified limit.
func (im *IntegrityMonitor) GetRecentAlerts(limit int) []CorruptionAlert {
im.alertsMutex.RLock()
defer im.alertsMutex.RUnlock()
if limit <= 0 || limit > len(im.alerts) {
limit = len(im.alerts)
}
if limit == 0 {
return []CorruptionAlert{}
}
start := len(im.alerts) - limit
alertsCopy := make([]CorruptionAlert, limit)
copy(alertsCopy, im.alerts[start:])
return alertsCopy
}
// SetThreshold sets an alert threshold
func (im *IntegrityMonitor) SetThreshold(name string, value float64) {
im.mu.Lock()
defer im.mu.Unlock()
im.alertThresholds[name] = value
}
// Enable enables the integrity monitor
func (im *IntegrityMonitor) Enable() {
im.mu.Lock()
defer im.mu.Unlock()
im.enabled = true
im.logger.Info("Integrity monitor enabled")
}
// Disable disables the integrity monitor
func (im *IntegrityMonitor) Disable() {
im.mu.Lock()
defer im.mu.Unlock()
im.enabled = false
im.logger.Info("Integrity monitor disabled")
}
// IsEnabled returns whether the monitor is enabled
func (im *IntegrityMonitor) IsEnabled() bool {
im.mu.RLock()
defer im.mu.RUnlock()
return im.enabled
}
// StartHealthCheckRunner starts the periodic health check routine
func (im *IntegrityMonitor) StartHealthCheckRunner(ctx context.Context) {
if im.healthCheckRunner != nil {
im.healthCheckRunner.Start(ctx)
}
}
// StopHealthCheckRunner stops the periodic health check routine
func (im *IntegrityMonitor) StopHealthCheckRunner() {
if im.healthCheckRunner != nil {
im.healthCheckRunner.Stop()
}
}
// GetHealthCheckRunner returns the health check runner
func (im *IntegrityMonitor) GetHealthCheckRunner() *HealthCheckRunner {
return im.healthCheckRunner
}
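
And a brief sketch of the recording API from a caller's perspective; the address, score, and source string are made up for the example, and the package path is an assumption:

```go
package main

import (
	"github.com/ethereum/go-ethereum/common"
	"github.com/fraktal/mev-beta/internal/logger"
	"github.com/fraktal/mev-beta/internal/monitoring" // assumed package path
)

func main() {
	log := logger.New("info", "text", "")
	monitor := monitoring.NewIntegrityMonitor(log)
	monitor.AddAlertSubscriber(monitoring.NewLogAlertHandler(log))

	// Typical flow per decoded address: count it, record validation, and flag
	// anything suspicious so the health score and alerts reflect it.
	addr := common.HexToAddress("0x0000000300000000000000000000000000000000")
	monitor.RecordAddressProcessed()
	monitor.RecordValidationResult(false)
	monitor.RecordCorruptionDetected(addr, 85, "abi_decoder")

	summary := monitor.GetHealthSummary()
	log.Info("integrity summary",
		"health_score", summary["health_score"],
		"corruption_rate", summary["corruption_rate"])
}
```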

@@ -0,0 +1,391 @@
package monitoring
import (
"fmt"
"testing"
"time"
"github.com/ethereum/go-ethereum/common"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/fraktal/mev-beta/internal/logger"
"github.com/fraktal/mev-beta/internal/recovery"
)
// MockAlertSubscriber for testing
type MockAlertSubscriber struct {
alerts []CorruptionAlert
}
func (m *MockAlertSubscriber) HandleAlert(alert CorruptionAlert) error {
m.alerts = append(m.alerts, alert)
return nil
}
func (m *MockAlertSubscriber) GetAlerts() []CorruptionAlert {
return m.alerts
}
func (m *MockAlertSubscriber) Reset() {
m.alerts = nil
}
func TestIntegrityMonitor_RecordCorruptionDetected(t *testing.T) {
log := logger.New("error", "text", "")
monitor := NewIntegrityMonitor(log)
mockSubscriber := &MockAlertSubscriber{}
monitor.AddAlertSubscriber(mockSubscriber)
// Test various corruption scenarios
testCases := []struct {
name string
address string
corruptionScore int
source string
expectedSeverity AlertSeverity
}{
{
name: "Low corruption",
address: "0x1234567890123456789012345678901234567890",
corruptionScore: 30,
source: "test_source",
expectedSeverity: AlertSeverityInfo,
},
{
name: "Medium corruption",
address: "0x1234000000000000000000000000000000000000",
corruptionScore: 50,
source: "token_extraction",
expectedSeverity: AlertSeverityWarning,
},
{
name: "High corruption",
address: "0x0000001000000000000000000000000000000000",
corruptionScore: 80,
source: "abi_decoder",
expectedSeverity: AlertSeverityCritical,
},
{
name: "Critical corruption - TOKEN_0x000000",
address: "0x0000000300000000000000000000000000000000",
corruptionScore: 100,
source: "generic_extraction",
expectedSeverity: AlertSeverityEmergency,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
mockSubscriber.Reset()
addr := common.HexToAddress(tc.address)
monitor.RecordCorruptionDetected(addr, tc.corruptionScore, tc.source)
// Verify metrics were updated
metrics := monitor.GetMetrics()
assert.Greater(t, metrics.CorruptAddressesDetected, int64(0))
assert.GreaterOrEqual(t, metrics.MaxCorruptionScore, tc.corruptionScore)
// Verify alert was generated
alerts := mockSubscriber.GetAlerts()
require.Len(t, alerts, 1)
alert := alerts[0]
assert.Equal(t, tc.expectedSeverity, alert.Severity)
assert.Equal(t, addr, alert.Address)
assert.Equal(t, tc.corruptionScore, alert.CorruptionScore)
assert.Equal(t, tc.source, alert.Source)
assert.Contains(t, alert.Message, "Corruption detected")
})
}
}
func TestIntegrityMonitor_HealthScoreCalculation(t *testing.T) {
log := logger.New("error", "text", "")
monitor := NewIntegrityMonitor(log)
// Test initial health score
metrics := monitor.GetMetrics()
assert.Equal(t, 1.0, metrics.HealthScore) // Perfect health initially
// Record some activity
monitor.RecordAddressProcessed()
monitor.RecordAddressProcessed()
monitor.RecordValidationResult(true)
monitor.RecordValidationResult(true)
monitor.RecordContractCallResult(true)
monitor.RecordContractCallResult(true)
// Health should still be perfect
metrics = monitor.GetMetrics()
assert.Equal(t, 1.0, metrics.HealthScore)
// Introduce some corruption
addr := common.HexToAddress("0x0000000300000000000000000000000000000000")
monitor.RecordCorruptionDetected(addr, 80, "test")
// Health score should decrease
metrics = monitor.GetMetrics()
assert.Less(t, metrics.HealthScore, 1.0)
assert.Greater(t, metrics.HealthScore, 0.0)
// Add validation failures
monitor.RecordValidationResult(false)
monitor.RecordValidationResult(false)
// Health should decrease further
newMetrics := monitor.GetMetrics()
assert.Less(t, newMetrics.HealthScore, metrics.HealthScore)
}
func TestIntegrityMonitor_RecoveryActionTracking(t *testing.T) {
log := logger.New("error", "text", "")
monitor := NewIntegrityMonitor(log)
// Record various recovery actions
monitor.RecordRecoveryAction(recovery.ActionRetryWithBackoff)
monitor.RecordRecoveryAction(recovery.ActionRetryWithBackoff)
monitor.RecordRecoveryAction(recovery.ActionUseFallbackData)
monitor.RecordRecoveryAction(recovery.ActionCircuitBreaker)
metrics := monitor.GetMetrics()
// Verify action counts
assert.Equal(t, int64(2), metrics.RecoveryActions[recovery.ActionRetryWithBackoff])
assert.Equal(t, int64(1), metrics.RecoveryActions[recovery.ActionUseFallbackData])
assert.Equal(t, int64(1), metrics.RecoveryActions[recovery.ActionCircuitBreaker])
// Verify specific counters
assert.Equal(t, int64(2), metrics.RetryOperationsTriggered)
assert.Equal(t, int64(1), metrics.FallbackOperationsUsed)
assert.Equal(t, int64(1), metrics.CircuitBreakersTripped)
}
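
// TestIntegrityMonitor_ErrorTypeTracking verifies that errors are aggregated
// per recovery.ErrorType, including repeated occurrences of the same type.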
func TestIntegrityMonitor_ErrorTypeTracking(t *testing.T) {
log := logger.New("error", "text", "")
monitor := NewIntegrityMonitor(log)
// Record various error types
errorTypes := []recovery.ErrorType{
recovery.ErrorTypeAddressCorruption,
recovery.ErrorTypeContractCallFailed,
recovery.ErrorTypeRPCConnectionFailed,
recovery.ErrorTypeDataParsingFailed,
recovery.ErrorTypeValidationFailed,
		recovery.ErrorTypeAddressCorruption, // Duplicate entry to verify per-type counting
}
for _, errorType := range errorTypes {
monitor.RecordErrorType(errorType)
}
metrics := monitor.GetMetrics()
// Verify error type counts
assert.Equal(t, int64(2), metrics.ErrorsByType[recovery.ErrorTypeAddressCorruption])
assert.Equal(t, int64(1), metrics.ErrorsByType[recovery.ErrorTypeContractCallFailed])
assert.Equal(t, int64(1), metrics.ErrorsByType[recovery.ErrorTypeRPCConnectionFailed])
assert.Equal(t, int64(1), metrics.ErrorsByType[recovery.ErrorTypeDataParsingFailed])
assert.Equal(t, int64(1), metrics.ErrorsByType[recovery.ErrorTypeValidationFailed])
}
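
// TestIntegrityMonitor_GetHealthSummary verifies that the health summary reports
// accurate totals, success rates, and a plausible health score for a mixed workload.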
func TestIntegrityMonitor_GetHealthSummary(t *testing.T) {
log := logger.New("error", "text", "")
monitor := NewIntegrityMonitor(log)
// Generate some activity
for i := 0; i < 100; i++ {
monitor.RecordAddressProcessed()
if i%10 == 0 { // 10% corruption rate
addr := common.HexToAddress(fmt.Sprintf("0x%040d", i))
monitor.RecordCorruptionDetected(addr, 50, "test")
}
monitor.RecordValidationResult(i%20 != 0) // 95% success rate
monitor.RecordContractCallResult(i%10 != 0) // 90% success rate
}
summary := monitor.GetHealthSummary()
// Verify summary structure
assert.True(t, summary["enabled"].(bool))
assert.Equal(t, int64(100), summary["total_addresses_processed"].(int64))
assert.Equal(t, int64(10), summary["corruption_detections"].(int64))
assert.InDelta(t, 0.1, summary["corruption_rate"].(float64), 0.01)
assert.InDelta(t, 0.95, summary["validation_success_rate"].(float64), 0.01)
assert.InDelta(t, 0.9, summary["contract_call_success_rate"].(float64), 0.01)
// Health score should be reasonable
healthScore := summary["health_score"].(float64)
	assert.Greater(t, healthScore, 0.7) // Should remain reasonably high despite the injected failures
assert.Less(t, healthScore, 1.0) // Not perfect due to corruption
}
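
// TestIntegrityMonitor_AlertThresholds verifies that dropping the health score
// below a configured minimum triggers critical health-score alerts.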
func TestIntegrityMonitor_AlertThresholds(t *testing.T) {
log := logger.New("error", "text", "")
monitor := NewIntegrityMonitor(log)
mockSubscriber := &MockAlertSubscriber{}
monitor.AddAlertSubscriber(mockSubscriber)
// Test health score threshold
monitor.SetThreshold("health_score_min", 0.8)
// Generate activity that drops health below threshold
for i := 0; i < 50; i++ {
monitor.RecordAddressProcessed()
// High corruption rate to drop health score
addr := common.HexToAddress(fmt.Sprintf("0x%040d", i))
monitor.RecordCorruptionDetected(addr, 80, "test")
}
	// Dropping below the threshold should trigger at least one health score alert
alerts := mockSubscriber.GetAlerts()
healthAlerts := 0
for _, alert := range alerts {
if alert.Severity == AlertSeverityCritical &&
alert.Context != nil &&
alert.Context["health_score"] != nil {
healthAlerts++
}
}
assert.Greater(t, healthAlerts, 0, "Should have triggered health score alerts")
}
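
// TestIntegrityMonitor_ConcurrentAccess exercises the monitor from many
// goroutines at once and verifies that the final metrics remain consistent.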
func TestIntegrityMonitor_ConcurrentAccess(t *testing.T) {
log := logger.New("error", "text", "")
monitor := NewIntegrityMonitor(log)
const numGoroutines = 50
const operationsPerGoroutine = 100
done := make(chan bool, numGoroutines)
// Launch concurrent operations
for i := 0; i < numGoroutines; i++ {
go func(id int) {
defer func() { done <- true }()
for j := 0; j < operationsPerGoroutine; j++ {
// Perform various operations
monitor.RecordAddressProcessed()
monitor.RecordValidationResult(j%10 != 0)
monitor.RecordContractCallResult(j%5 != 0)
if j%20 == 0 { // Occasional corruption
addr := common.HexToAddress(fmt.Sprintf("0x%020d%020d", id, j))
monitor.RecordCorruptionDetected(addr, 60, fmt.Sprintf("goroutine_%d", id))
}
// Recovery actions
if j%15 == 0 {
monitor.RecordRecoveryAction(recovery.ActionRetryWithBackoff)
}
if j%25 == 0 {
monitor.RecordErrorType(recovery.ErrorTypeAddressCorruption)
}
}
}(i)
}
// Wait for completion
for i := 0; i < numGoroutines; i++ {
select {
case <-done:
// Success
case <-time.After(10 * time.Second):
t.Fatal("Concurrent test timed out")
}
}
// Verify final metrics are consistent
metrics := monitor.GetMetrics()
expectedAddresses := int64(numGoroutines * operationsPerGoroutine)
assert.Equal(t, expectedAddresses, metrics.TotalAddressesProcessed)
// Should have some corruption detections
assert.Greater(t, metrics.CorruptAddressesDetected, int64(0))
// Should have recorded recovery actions
assert.Greater(t, metrics.RetryOperationsTriggered, int64(0))
// Health score should be calculated
assert.GreaterOrEqual(t, metrics.HealthScore, 0.0)
assert.LessOrEqual(t, metrics.HealthScore, 1.0)
t.Logf("Final metrics: Processed=%d, Corrupted=%d, Health=%.3f",
metrics.TotalAddressesProcessed,
metrics.CorruptAddressesDetected,
metrics.HealthScore)
}
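
// TestIntegrityMonitor_DisableEnable verifies that recording is suspended while
// the monitor is disabled and resumes once it is re-enabled.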
func TestIntegrityMonitor_DisableEnable(t *testing.T) {
log := logger.New("error", "text", "")
monitor := NewIntegrityMonitor(log)
// Should be enabled by default
assert.True(t, monitor.IsEnabled())
// Record some activity
monitor.RecordAddressProcessed()
monitor.RecordValidationResult(true)
initialMetrics := monitor.GetMetrics()
assert.Greater(t, initialMetrics.TotalAddressesProcessed, int64(0))
// Disable monitor
monitor.Disable()
assert.False(t, monitor.IsEnabled())
// Activity should not be recorded when disabled
monitor.RecordAddressProcessed()
monitor.RecordValidationResult(true)
disabledMetrics := monitor.GetMetrics()
assert.Equal(t, initialMetrics.TotalAddressesProcessed, disabledMetrics.TotalAddressesProcessed)
// Re-enable
monitor.Enable()
assert.True(t, monitor.IsEnabled())
// Activity should be recorded again
monitor.RecordAddressProcessed()
enabledMetrics := monitor.GetMetrics()
assert.Greater(t, enabledMetrics.TotalAddressesProcessed, disabledMetrics.TotalAddressesProcessed)
}
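
// TestIntegrityMonitor_Performance checks that recording operations stay within
// an acceptable per-operation latency budget and that counters remain accurate.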
func TestIntegrityMonitor_Performance(t *testing.T) {
log := logger.New("error", "text", "")
monitor := NewIntegrityMonitor(log)
const iterations = 10000
// Benchmark recording operations
start := time.Now()
for i := 0; i < iterations; i++ {
monitor.RecordAddressProcessed()
monitor.RecordValidationResult(i%10 != 0)
monitor.RecordContractCallResult(i%5 != 0)
if i%100 == 0 {
addr := common.HexToAddress(fmt.Sprintf("0x%040d", i))
monitor.RecordCorruptionDetected(addr, 50, "benchmark")
}
}
duration := time.Since(start)
avgTime := duration / iterations
t.Logf("Performance: %d operations in %v (avg: %v per operation)",
iterations, duration, avgTime)
	// Recording should be reasonably fast; under 500 microseconds per operation is acceptable
maxTime := 500 * time.Microsecond
assert.Less(t, avgTime.Nanoseconds(), maxTime.Nanoseconds(),
"Recording should be faster than %v per operation (got %v)", maxTime, avgTime)
// Verify metrics are accurate
metrics := monitor.GetMetrics()
assert.Equal(t, int64(iterations), metrics.TotalAddressesProcessed)
assert.Equal(t, int64(100), metrics.CorruptAddressesDetected) // Every 100th iteration
}