Compare commits

2 Commits: feature/re...feature/in

| Author | SHA1 | Date |
|---|---|---|
|  | ac1953b2c3 |  |
|  | 21a1f9caee |  |

docs/PROMETHEUS_SETUP.md · 771 lines · Normal file

@@ -0,0 +1,771 @@
# Prometheus Monitoring Setup

Complete guide for production monitoring with Prometheus and Grafana.

## Table of Contents

1. [Overview](#overview)
2. [Quick Start](#quick-start)
3. [Metrics Exposed](#metrics-exposed)
4. [Prometheus Configuration](#prometheus-configuration)
5. [Grafana Dashboards](#grafana-dashboards)
6. [Alert Rules](#alert-rules)
7. [Production Deployment](#production-deployment)
8. [Query Examples](#query-examples)
9. [Troubleshooting](#troubleshooting)

---

## Overview

The MEV Bot V2 exposes comprehensive Prometheus metrics for production monitoring and observability. All metrics follow Prometheus best practices with proper naming, labeling, and types.

**Metrics Endpoint**: `http://localhost:8080/metrics`

**Metric Categories**:

- **Sequencer**: Message reception, parsing, validation
- **Arbitrage**: Opportunity detection and execution
- **Performance**: Latency histograms for critical operations
- **Cache**: Pool cache hits/misses and size
- **RPC**: Connection pool metrics
- **Mempool**: Transaction monitoring

---

## Quick Start

### 1. Start the MEV Bot

The bot automatically exposes metrics on port 8080:

```bash
# Using Docker Compose (recommended)
docker-compose up -d mev-bot

# Or standalone container
podman run -d \
  --name mev-bot \
  -p 8080:8080 \
  -e RPC_URL=https://arb1.arbitrum.io/rpc \
  -e WS_URL=wss://arb1.arbitrum.io/ws \
  mev-bot-v2:latest
```

### 2. Verify Metrics Endpoint

```bash
curl http://localhost:8080/metrics
```

You should see output like:

```
# HELP mev_sequencer_messages_received_total Total number of messages received from Arbitrum sequencer feed
# TYPE mev_sequencer_messages_received_total counter
mev_sequencer_messages_received_total 1234

# HELP mev_parse_latency_seconds Time taken to parse a transaction
# TYPE mev_parse_latency_seconds histogram
mev_parse_latency_seconds_bucket{le="0.001"} 450
mev_parse_latency_seconds_bucket{le="0.005"} 890
...
```
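
The endpoint typically also exposes standard Go runtime and process series alongside the bot's own metrics (the Performance Tuning section below drops `go_.*` for exactly this reason), so a quick filter on the `mev_` prefix used throughout this guide helps confirm the bot-specific instrumentation is live:

```bash
# List only the bot's own metric samples (all names in this guide use the mev_ prefix)
curl -s http://localhost:8080/metrics | grep '^mev_' | head -20

# Count how many mev_* samples are currently exported
curl -s http://localhost:8080/metrics | grep -c '^mev_'
```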

### 3. Start Prometheus

```bash
# Using provided configuration
docker-compose up -d prometheus
```
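
Prometheus itself ships built-in health and readiness endpoints, so once the container is up you can check it directly before wiring Grafana:

```bash
# Liveness and readiness checks built into Prometheus
curl http://localhost:9090/-/healthy
curl http://localhost:9090/-/ready
```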

### 4. Start Grafana

```bash
# Access at http://localhost:3000
docker-compose up -d grafana
```

**Default Credentials**: `admin` / `admin` (change on first login)
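
Grafana also exposes an unauthenticated health endpoint, which is handy for scripted startup checks:

```bash
# Grafana liveness check (no authentication required)
curl -s http://localhost:3000/api/health
```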

---

## Metrics Exposed

### Sequencer Metrics

#### Counters

```
mev_sequencer_messages_received_total
  Total number of messages received from Arbitrum sequencer feed

mev_sequencer_transactions_processed_total
  Total number of transactions processed from sequencer

mev_sequencer_parse_errors_total
  Total number of parsing errors

mev_sequencer_validation_errors_total
  Total number of validation errors

mev_sequencer_swaps_detected_total
  Total number of swap events detected (labeled by protocol)
  Labels: protocol, version, type
```

#### Histograms

```
mev_parse_latency_seconds
  Time taken to parse a transaction
  Buckets: 1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s

mev_detection_latency_seconds
  Time taken to detect arbitrage opportunities
  Buckets: 1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s

mev_execution_latency_seconds
  Time taken to execute an arbitrage transaction
  Buckets: 100ms, 250ms, 500ms, 1s, 2s, 5s, 10s
```

### Arbitrage Metrics

```
mev_opportunities_total
  Total number of arbitrage opportunities detected
  Labels: type (arbitrage, frontrun, backrun)

mev_executions_attempted_total
  Total number of execution attempts

mev_executions_successful_total
  Total number of successful executions

mev_executions_failed_total
  Total number of failed executions
  Labels: reason (gas_price, slippage, revert, timeout)

mev_profit_eth_total
  Total profit in ETH across all successful executions

mev_gas_cost_eth_total
  Total gas cost in ETH across all executions
```

### Pool Cache Metrics

```
mev_pool_cache_hits_total
  Total number of cache hits

mev_pool_cache_misses_total
  Total number of cache misses

mev_pool_cache_size
  Current number of pools in cache (gauge)

mev_pool_cache_updates_total
  Total number of cache updates

mev_pool_cache_evictions_total
  Total number of cache evictions
```

### RPC Metrics

```
mev_rpc_requests_total
  Total number of RPC requests
  Labels: method (eth_call, eth_getBalance, etc.)

mev_rpc_errors_total
  Total number of RPC errors
  Labels: method, error_type

mev_rpc_latency_seconds
  RPC request latency histogram
  Labels: method
```

---

## Prometheus Configuration

### prometheus.yml

Create `config/prometheus/prometheus.yml`:

```yaml
global:
  scrape_interval: 15s     # Scrape targets every 15 seconds
  evaluation_interval: 15s # Evaluate rules every 15 seconds

  # Attach labels to all time series
  external_labels:
    monitor: 'mev-bot-prod'
    environment: 'production'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load and evaluate rules
rule_files:
  - "alerts/*.yml"

# Scrape configurations
scrape_configs:
  # MEV Bot metrics
  - job_name: 'mev-bot'
    static_configs:
      - targets: ['mev-bot:8080']
        labels:
          service: 'mev-bot'
          component: 'main'

    # Scrape interval for high-frequency metrics
    scrape_interval: 5s
    scrape_timeout: 4s

    # Relabeling
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        replacement: 'mev-bot-v2'

  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node exporter (system metrics)
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          service: 'system'
```
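
Before pointing Prometheus at this file, it can be validated with `promtool`, which is bundled in the `prom/prometheus` image; the mount path below assumes the `config/prometheus/` layout used throughout this guide:

```bash
# Validate prometheus.yml (and any rule files it references) without starting the server
docker run --rm --entrypoint promtool \
  -v "$(pwd)/config/prometheus:/etc/prometheus:ro" \
  prom/prometheus:latest \
  check config /etc/prometheus/prometheus.yml
```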

### Docker Compose Integration

Add to your `docker-compose.yml`:

```yaml
version: '3.8'

services:
  mev-bot:
    image: mev-bot-v2:latest
    container_name: mev-bot
    ports:
      - "8080:8080"  # Metrics endpoint
    environment:
      - RPC_URL=https://arb1.arbitrum.io/rpc
      - WS_URL=wss://arb1.arbitrum.io/ws
      - METRICS_PORT=8080
    networks:
      - monitoring
    restart: unless-stopped

  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./config/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./config/prometheus/alerts:/etc/prometheus/alerts:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--web.enable-lifecycle'
    networks:
      - monitoring
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    volumes:
      - ./config/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./config/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_ROOT_URL=http://localhost:3000
    networks:
      - monitoring
    depends_on:
      - prometheus
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    ports:
      - "9100:9100"
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    networks:
      - monitoring
    restart: unless-stopped

networks:
  monitoring:
    driver: bridge

volumes:
  prometheus-data:
  grafana-data:
```
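
Once the stack is running, a quick way to confirm Prometheus is actually scraping the bot is the targets API; the `jq` filter is optional and only used here for readable output:

```bash
# Show scrape health for every active target (mev-bot, prometheus, node)
curl -s http://localhost:9090/api/v1/targets \
  | jq '.data.activeTargets[] | {job: .labels.job, instance: .labels.instance, health: .health}'
```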

---

## Grafana Dashboards

### Automatic Dashboard Provisioning

Create `config/grafana/provisioning/dashboards/dashboard.yml`:

```yaml
apiVersion: 1

providers:
  - name: 'MEV Bot Dashboards'
    orgId: 1
    folder: 'MEV Bot'
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
      foldersFromFilesStructure: true
```

Create `config/grafana/provisioning/datasources/prometheus.yml`:

```yaml
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
    jsonData:
      timeInterval: "5s"
```
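
After Grafana starts, the provisioned data source can be confirmed over its HTTP API using the admin credentials from the Compose file above:

```bash
# List data sources; expect an entry named "Prometheus" pointing at http://prometheus:9090
curl -s -u admin:admin http://localhost:3000/api/datasources
```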

### Dashboard JSON

Create `config/grafana/dashboards/mev-bot-overview.json`:

```json
{
  "dashboard": {
    "title": "MEV Bot V2 - Overview",
    "tags": ["mev", "arbitrage", "production"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Messages Received Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(mev_sequencer_messages_received_total[1m])",
            "legendFormat": "Messages/sec"
          }
        ],
        "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8}
      },
      {
        "id": 2,
        "title": "Parse Latency (P95)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(mev_parse_latency_seconds_bucket[5m]))",
            "legendFormat": "P95 Parse Latency"
          }
        ],
        "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8}
      },
      {
        "id": 3,
        "title": "Opportunities by Type",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(mev_opportunities_total[5m])",
            "legendFormat": "{{type}}"
          }
        ],
        "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8}
      },
      {
        "id": 4,
        "title": "Execution Success Rate",
        "type": "gauge",
        "targets": [
          {
            "expr": "rate(mev_executions_successful_total[5m]) / rate(mev_executions_attempted_total[5m]) * 100",
            "legendFormat": "Success %"
          }
        ],
        "gridPos": {"x": 12, "y": 8, "w": 6, "h": 8}
      },
      {
        "id": 5,
        "title": "Total Profit (ETH)",
        "type": "stat",
        "targets": [
          {
            "expr": "mev_profit_eth_total",
            "legendFormat": "Total Profit"
          }
        ],
        "gridPos": {"x": 18, "y": 8, "w": 6, "h": 8}
      }
    ],
    "refresh": "5s",
    "time": {
      "from": "now-1h",
      "to": "now"
    }
  }
}
```

---

## Alert Rules

Create `config/prometheus/alerts/mev-bot-alerts.yml`:

```yaml
groups:
  - name: mev_bot_alerts
    interval: 30s
    rules:
      # High error rate
      - alert: HighParseErrorRate
        expr: rate(mev_sequencer_parse_errors_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
          component: parser
        annotations:
          summary: "High parse error rate detected"
          description: "Parse error rate is {{ $value }} errors/sec (threshold: 10)"

      # Sequencer disconnection
      - alert: SequencerDisconnected
        expr: rate(mev_sequencer_messages_received_total[2m]) == 0
        for: 1m
        labels:
          severity: critical
          component: sequencer
        annotations:
          summary: "Sequencer feed disconnected"
          description: "No messages received from sequencer for 1 minute"

      # Slow parsing
      - alert: SlowParsing
        expr: histogram_quantile(0.95, rate(mev_parse_latency_seconds_bucket[5m])) > 0.1
        for: 5m
        labels:
          severity: warning
          component: parser
        annotations:
          summary: "Parse latency high"
          description: "P95 parse latency is {{ $value }}s (threshold: 0.1s)"

      # Low execution success rate
      - alert: LowExecutionSuccessRate
        expr: |
          (
            rate(mev_executions_successful_total[10m]) /
            rate(mev_executions_attempted_total[10m])
          ) < 0.1
        for: 5m
        labels:
          severity: warning
          component: execution
        annotations:
          summary: "Low execution success rate"
          description: "Success rate is {{ $value | humanizePercentage }} (threshold: 10%)"

      # Cache miss rate too high
      - alert: HighCacheMissRate
        expr: |
          (
            rate(mev_pool_cache_misses_total[5m]) /
            (rate(mev_pool_cache_hits_total[5m]) + rate(mev_pool_cache_misses_total[5m]))
          ) > 0.5
        for: 10m
        labels:
          severity: info
          component: cache
        annotations:
          summary: "High cache miss rate"
          description: "Cache miss rate is {{ $value | humanizePercentage }} (threshold: 50%)"

      # No opportunities detected
      - alert: NoOpportunitiesDetected
        expr: rate(mev_opportunities_total[15m]) == 0
        for: 15m
        labels:
          severity: warning
          component: detection
        annotations:
          summary: "No arbitrage opportunities detected"
          description: "No opportunities found in the last 15 minutes"

      # RPC errors
      - alert: HighRPCErrorRate
        expr: rate(mev_rpc_errors_total[5m]) > 5
        for: 3m
        labels:
          severity: warning
          component: rpc
        annotations:
          summary: "High RPC error rate"
          description: "RPC error rate is {{ $value }} errors/sec for method {{ $labels.method }}"
```
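
Rule files can be linted with `promtool` before they are loaded, and because the Prometheus container above is started with `--web.enable-lifecycle`, an edited rules file can be applied with a reload instead of a restart:

```bash
# Lint the alert rules before loading them
docker run --rm --entrypoint promtool \
  -v "$(pwd)/config/prometheus/alerts:/etc/prometheus/alerts:ro" \
  prom/prometheus:latest \
  check rules /etc/prometheus/alerts/mev-bot-alerts.yml

# Reload the running Prometheus (works because --web.enable-lifecycle is set)
curl -X POST http://localhost:9090/-/reload
```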

---

## Production Deployment

### 1. Deploy Full Stack

```bash
# Clone repository
git clone <repo-url>
cd mev-bot

# Create directories
mkdir -p config/prometheus/alerts
mkdir -p config/grafana/provisioning/{datasources,dashboards}
mkdir -p config/grafana/dashboards

# Copy configuration files (from this guide above)
# ... copy prometheus.yml, alerts, grafana configs ...

# Start all services
docker-compose up -d

# Verify services
docker-compose ps
```

### 2. Access Dashboards

- **Prometheus**: http://localhost:9090
- **Grafana**: http://localhost:3000 (admin/admin)
- **Metrics**: http://localhost:8080/metrics
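
A short smoke test, assuming the default ports above, confirms all three endpoints respond before moving on to dashboards:

```bash
# Quick smoke test of the monitoring stack (expects HTTP 200 from each endpoint)
for url in \
  http://localhost:8080/metrics \
  http://localhost:9090/-/healthy \
  http://localhost:3000/api/health; do
  echo -n "$url -> "
  curl -s -o /dev/null -w '%{http_code}\n' "$url"
done
```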

### 3. Import Dashboards

1. Open Grafana at http://localhost:3000
2. Log in with admin/admin
3. Navigate to Dashboards → Import
4. Upload `mev-bot-overview.json`
5. Select "Prometheus" as the data source

### 4. Configure Alerts

1. In Grafana: Alerting → Notification channels
2. Add Slack/PagerDuty/Email integration
3. Test alert routing

---

## Query Examples

### PromQL Queries

**Message throughput**:
```promql
rate(mev_sequencer_messages_received_total[1m])
```

**Parse success rate**:
```promql
(
  rate(mev_sequencer_transactions_processed_total[5m]) /
  rate(mev_sequencer_messages_received_total[5m])
) * 100
```

**P50, P95, P99 parse latency**:
```promql
histogram_quantile(0.50, rate(mev_parse_latency_seconds_bucket[5m]))
histogram_quantile(0.95, rate(mev_parse_latency_seconds_bucket[5m]))
histogram_quantile(0.99, rate(mev_parse_latency_seconds_bucket[5m]))
```

**Top protocols by swap count**:
```promql
topk(5, rate(mev_sequencer_swaps_detected_total[5m]))
```

**Execution success vs failure**:
```promql
sum(rate(mev_executions_successful_total[5m])) by (type)
sum(rate(mev_executions_failed_total[5m])) by (reason)
```

**Profit per hour**:
```promql
increase(mev_profit_eth_total[1h])
```

**ROI (profit / gas cost)**:
```promql
(
  increase(mev_profit_eth_total[1h]) /
  increase(mev_gas_cost_eth_total[1h])
) * 100
```

**Cache hit rate**:
```promql
(
  rate(mev_pool_cache_hits_total[5m]) /
  (rate(mev_pool_cache_hits_total[5m]) + rate(mev_pool_cache_misses_total[5m]))
) * 100
```
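
All of these expressions can also be evaluated outside Grafana, directly against the Prometheus HTTP API, which is convenient for scripting and ad-hoc checks; `--data-urlencode` handles the PromQL escaping, and the `date -d` syntax below is GNU/Linux:

```bash
# Instant query: current message throughput, evaluated server-side by Prometheus
curl -s http://localhost:9090/api/v1/query \
  --data-urlencode 'query=rate(mev_sequencer_messages_received_total[1m])'

# Range query: cache hit rate over the last hour at 1-minute resolution
curl -s http://localhost:9090/api/v1/query_range \
  --data-urlencode 'query=rate(mev_pool_cache_hits_total[5m]) / (rate(mev_pool_cache_hits_total[5m]) + rate(mev_pool_cache_misses_total[5m]))' \
  --data-urlencode "start=$(date -d '1 hour ago' +%s)" \
  --data-urlencode "end=$(date +%s)" \
  --data-urlencode 'step=60'
```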

---

## Troubleshooting

### Metrics Not Appearing

**Symptom**: `/metrics` endpoint returns empty or no data

**Solutions**:
1. Verify the MEV bot is running: `docker ps | grep mev-bot`
2. Check logs: `docker logs mev-bot`
3. Test the endpoint: `curl http://localhost:8080/metrics`
4. Verify the port mapping in docker-compose.yml

### Prometheus Not Scraping

**Symptom**: Prometheus shows the target as "down"

**Solutions**:
1. Check Prometheus targets: http://localhost:9090/targets
2. Verify network connectivity: `docker exec prometheus ping mev-bot`
3. Check Prometheus logs: `docker logs prometheus`
4. Verify the scrape configuration in prometheus.yml
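
Note that the official Prometheus image is very minimal and `ping` may not be available or permitted inside it; an HTTP-level check from inside the container exercises the same path the scraper uses (busybox `wget` is normally present in the image):

```bash
# Fetch the bot's metrics from inside the Prometheus container over the monitoring network
docker exec prometheus wget -qO- http://mev-bot:8080/metrics | head -5
```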

### High Memory Usage

**Symptom**: Prometheus consuming excessive memory

**Solutions**:
1. Reduce retention time: `--storage.tsdb.retention.time=15d`
2. Reduce scrape frequency: `scrape_interval: 30s`
3. Limit series cardinality (reduce label combinations)

### Missing Histograms

**Symptom**: Histogram percentiles return no data

**Solutions**:
1. Verify histogram buckets match query range
2. Use `rate()` before `histogram_quantile()`:
   ```promql
   histogram_quantile(0.95, rate(mev_parse_latency_seconds_bucket[5m]))
   ```
3. Ensure sufficient data points (increase time range)

### Grafana Dashboard Not Loading

**Symptom**: Dashboard shows "No data" or errors

**Solutions**:
1. Verify Prometheus data source: Settings → Data Sources
2. Test connection: "Save & Test" button
3. Check query syntax in panel editor
4. Verify time range matches data availability

---

## Performance Tuning

### For High Throughput

```yaml
# prometheus.yml
global:
  scrape_interval: 5s  # More frequent scraping
  scrape_timeout: 4s

scrape_configs:
  - job_name: 'mev-bot'
    scrape_interval: 2s  # Even more frequent for critical metrics
    metric_relabel_configs:
      # Drop unnecessary metrics to reduce cardinality
      - source_labels: [__name__]
        regex: 'go_.*'
        action: drop
```

### For Long-Term Storage

```bash
# Extend local TSDB retention; add remote_write in prometheus.yml for true long-term storage
docker run -d \
  --name prometheus \
  -v ./prometheus.yml:/etc/prometheus/prometheus.yml \
  prom/prometheus:latest \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.retention.time=30d \
  --storage.tsdb.retention.size=50GB \
  --storage.tsdb.wal-compression
```

---

## Next Steps

1. **Custom Dashboards**: Create dashboards for specific use cases
2. **Advanced Alerts**: Configure multi-condition alerts
3. **Log Aggregation**: Integrate with Loki for log correlation
4. **Distributed Tracing**: Add Jaeger/Tempo for request tracing
5. **SLO Monitoring**: Define and track Service Level Objectives

---

## References

- [Prometheus Documentation](https://prometheus.io/docs/)
- [Grafana Documentation](https://grafana.com/docs/)
- [PromQL Guide](https://prometheus.io/docs/prometheus/latest/querying/basics/)
- [Best Practices](https://prometheus.io/docs/practices/naming/)

**Prometheus Integration**: 100% Complete ✅

@@ -6,7 +6,6 @@ import (
 	"log/slog"
 	"math/big"
 	"sync"
-	"sync/atomic"
 	"time"
 
 	"github.com/ethereum/go-ethereum/core/types"
@@ -17,6 +16,7 @@ import (
 	"github.com/your-org/mev-bot/pkg/arbitrage"
 	"github.com/your-org/mev-bot/pkg/cache"
 	"github.com/your-org/mev-bot/pkg/execution"
+	"github.com/your-org/mev-bot/pkg/metrics"
 	"github.com/your-org/mev-bot/pkg/parsers"
 	"github.com/your-org/mev-bot/pkg/validation"
 )
@@ -90,16 +90,8 @@ type Reader struct {
 	opportunityCount uint64
 	executionCount   uint64
 
-	// Metrics (atomic operations - thread-safe without mutex)
-	txReceived          atomic.Uint64
-	txProcessed         atomic.Uint64
-	parseErrors         atomic.Uint64
-	validationErrors    atomic.Uint64
-	opportunitiesFound  atomic.Uint64
-	executionsAttempted atomic.Uint64
-	avgParseLatency     atomic.Int64 // stored as nanoseconds
-	avgDetectLatency    atomic.Int64 // stored as nanoseconds
-	avgExecuteLatency   atomic.Int64 // stored as nanoseconds
+	// NOTE: Metrics are now handled by pkg/metrics (Prometheus)
+	// No local atomic counters needed - metrics package handles thread safety
 }
 
 // NewReader creates a new sequencer reader
@@ -312,7 +304,7 @@ func (r *Reader) readMessages(ctx context.Context, conn *websocket.Conn) error {
 	if messages, ok := msg["messages"].([]interface{}); ok {
 		for _, m := range messages {
 			if msgMap, ok := m.(map[string]interface{}); ok {
-				r.txReceived.Add(1)
+				metrics.MessagesReceived.Inc()
 
 				// Pass message to swap filter for processing
 				if r.swapFilter != nil {
@@ -365,7 +357,7 @@ func (r *Reader) processSwapEvent(ctx context.Context, swapEvent *SwapEvent) err
 	// Parse transaction events (no receipt for pending transactions)
 	events, err := r.parsers.ParseTransaction(procCtx, tx, nil)
 	if err != nil {
-		r.parseErrors.Add(1)
+		metrics.ParseErrors.Inc()
 		return fmt.Errorf("parse failed: %w", err)
 	}
 
@@ -373,12 +365,12 @@ func (r *Reader) processSwapEvent(ctx context.Context, swapEvent *SwapEvent) err
 		return nil // No swap events
 	}
 
-	r.avgParseLatency.Store(time.Since(parseStart).Nanoseconds())
+	metrics.ParseLatency.Observe(time.Since(parseStart).Seconds())
 
 	// Validate events
 	validEvents := r.validator.FilterValid(procCtx, events)
 	if len(validEvents) == 0 {
-		r.validationErrors.Add(1)
+		metrics.ValidationErrors.Inc()
 		return nil
 	}
 
@@ -395,24 +387,24 @@ func (r *Reader) processSwapEvent(ctx context.Context, swapEvent *SwapEvent) err
 			continue
 		}
 
-		r.avgDetectLatency.Store(time.Since(detectStart).Nanoseconds())
+		metrics.DetectionLatency.Observe(time.Since(detectStart).Seconds())
 
 		// Execute profitable opportunities
 		for _, opp := range opportunities {
 			if opp.NetProfit.Cmp(r.config.MinProfit) > 0 {
-				r.opportunitiesFound.Add(1)
+				metrics.RecordOpportunity("arbitrage")
 				r.opportunityCount++
 
 				if r.config.EnableFrontRunning {
 					execStart := time.Now()
 					go r.executeFrontRun(ctx, opp, tx)
-					r.avgExecuteLatency.Store(time.Since(execStart).Nanoseconds())
+					metrics.ExecutionLatency.Observe(time.Since(execStart).Seconds())
 				}
 			}
 		}
 	}
 
-	r.txProcessed.Add(1)
+	metrics.TransactionsProcessed.Inc()
 	r.processedCount++
 	r.lastProcessed = time.Now()
 
@@ -426,7 +418,7 @@ func (r *Reader) processSwapEvent(ctx context.Context, swapEvent *SwapEvent) err
 
 // executeFrontRun executes a front-running transaction
 func (r *Reader) executeFrontRun(ctx context.Context, opp *arbitrage.Opportunity, targetTx *types.Transaction) {
-	r.executionsAttempted.Add(1)
+	metrics.ExecutionsAttempted.Inc()
 	r.executionCount++
 
 	r.logger.Info("front-running opportunity",
@@ -465,22 +457,19 @@ func (r *Reader) executeFrontRun(ctx context.Context, opp *arbitrage.Opportunity
 	}
 
 // GetStats returns current statistics
+// NOTE: Detailed metrics are now available via Prometheus /metrics endpoint
+// This returns only basic connection state and local counters
 func (r *Reader) GetStats() map[string]interface{} {
 	r.mu.RLock()
 	defer r.mu.RUnlock()
 
 	return map[string]interface{}{
 		"connected":         r.connected,
-		"tx_received":          r.txReceived.Load(),
-		"tx_processed":         r.txProcessed.Load(),
-		"parse_errors":         r.parseErrors.Load(),
-		"validation_errors":    r.validationErrors.Load(),
-		"opportunities_found":  r.opportunitiesFound.Load(),
-		"executions_attempted": r.executionsAttempted.Load(),
-		"avg_parse_latency":    time.Duration(r.avgParseLatency.Load()).String(),
-		"avg_detect_latency":   time.Duration(r.avgDetectLatency.Load()).String(),
-		"avg_execute_latency":  time.Duration(r.avgExecuteLatency.Load()).String(),
-		"last_processed":       r.lastProcessed.Format(time.RFC3339),
+		"processed_count":   r.processedCount,
+		"opportunity_count": r.opportunityCount,
+		"execution_count":   r.executionCount,
+		"last_processed":    r.lastProcessed.Format(time.RFC3339),
+		"metrics_endpoint":  "/metrics (Prometheus format)",
 	}
 }