Compare commits: feature/re ... feature/us (6 commits)

Commits:
- fab8741544
- 942dd541e1
- aec2ed2558
- f600ec26ff
- ac1953b2c3
- 21a1f9caee

docs/PROMETHEUS_SETUP.md (new file, 771 lines)
@@ -0,0 +1,771 @@
# Prometheus Monitoring Setup

Complete guide for production monitoring with Prometheus and Grafana.

## Table of Contents

1. [Overview](#overview)
2. [Quick Start](#quick-start)
3. [Metrics Exposed](#metrics-exposed)
4. [Prometheus Configuration](#prometheus-configuration)
5. [Grafana Dashboards](#grafana-dashboards)
6. [Alert Rules](#alert-rules)
7. [Production Deployment](#production-deployment)
8. [Query Examples](#query-examples)
9. [Troubleshooting](#troubleshooting)

---

## Overview

The MEV Bot V2 exposes comprehensive Prometheus metrics for production monitoring and observability. All metrics follow Prometheus best practices with proper naming, labeling, and types.

**Metrics Endpoint**: `http://localhost:8080/metrics`

**Metric Categories**:

- **Sequencer**: Message reception, parsing, validation
- **Arbitrage**: Opportunity detection and execution
- **Performance**: Latency histograms for critical operations
- **Cache**: Pool cache hits/misses and size
- **RPC**: Connection pool metrics
- **Mempool**: Transaction monitoring

---

## Quick Start

### 1. Start the MEV Bot

The bot automatically exposes metrics on port 8080:

```bash
# Using Docker Compose (recommended)
docker-compose up -d mev-bot

# Or standalone container
podman run -d \
  --name mev-bot \
  -p 8080:8080 \
  -e RPC_URL=https://arb1.arbitrum.io/rpc \
  -e WS_URL=wss://arb1.arbitrum.io/ws \
  mev-bot-v2:latest
```

### 2. Verify Metrics Endpoint

```bash
curl http://localhost:8080/metrics
```

You should see output like:

```
# HELP mev_sequencer_messages_received_total Total number of messages received from Arbitrum sequencer feed
# TYPE mev_sequencer_messages_received_total counter
mev_sequencer_messages_received_total 1234

# HELP mev_parse_latency_seconds Time taken to parse a transaction
# TYPE mev_parse_latency_seconds histogram
mev_parse_latency_seconds_bucket{le="0.001"} 450
mev_parse_latency_seconds_bucket{le="0.005"} 890
...
```

### 3. Start Prometheus

```bash
# Using provided configuration
docker-compose up -d prometheus
```

### 4. Start Grafana

```bash
# Access at http://localhost:3000
docker-compose up -d grafana
```

**Default Credentials**: `admin` / `admin` (change on first login)

---

## Metrics Exposed

### Sequencer Metrics

#### Counters

```
mev_sequencer_messages_received_total
    Total number of messages received from Arbitrum sequencer feed

mev_sequencer_transactions_processed_total
    Total number of transactions processed from sequencer

mev_sequencer_parse_errors_total
    Total number of parsing errors

mev_sequencer_validation_errors_total
    Total number of validation errors

mev_sequencer_swaps_detected_total
    Total number of swap events detected (labeled by protocol)
    Labels: protocol, version, type
```

#### Histograms

```
mev_parse_latency_seconds
    Time taken to parse a transaction
    Buckets: 1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s

mev_detection_latency_seconds
    Time taken to detect arbitrage opportunities
    Buckets: 1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s

mev_execution_latency_seconds
    Time taken to execute an arbitrage transaction
    Buckets: 100ms, 250ms, 500ms, 1s, 2s, 5s, 10s
```
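
The collector registration code is not shown anywhere in this guide. As a rough sketch, a `pkg/metrics` package could register these collectors with `prometheus/client_golang`, using the names and buckets listed above; everything else here is illustrative, not the actual implementation:

```go
// Package metrics: illustrative sketch of registering the collectors listed
// above with prometheus/client_golang. The real pkg/metrics may differ.
package metrics

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var (
	// Plain counter, e.g. total sequencer messages.
	MessagesReceived = promauto.NewCounter(prometheus.CounterOpts{
		Name: "mev_sequencer_messages_received_total",
		Help: "Total number of messages received from Arbitrum sequencer feed",
	})

	// Labeled counter, e.g. swaps detected per protocol/version/type.
	SwapsDetected = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "mev_sequencer_swaps_detected_total",
		Help: "Total number of swap events detected",
	}, []string{"protocol", "version", "type"})

	// Histogram with the parse-latency buckets listed above (in seconds).
	ParseLatency = promauto.NewHistogram(prometheus.HistogramOpts{
		Name:    "mev_parse_latency_seconds",
		Help:    "Time taken to parse a transaction",
		Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1},
	})
)
```

Counters are incremented with `MessagesReceived.Inc()` and histograms observed with `ParseLatency.Observe(elapsed.Seconds())`, which matches how the code diff at the bottom of this compare uses the package.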

### Arbitrage Metrics

```
mev_opportunities_total
    Total number of arbitrage opportunities detected
    Labels: type (arbitrage, frontrun, backrun)

mev_executions_attempted_total
    Total number of execution attempts

mev_executions_successful_total
    Total number of successful executions

mev_executions_failed_total
    Total number of failed executions
    Labels: reason (gas_price, slippage, revert, timeout)

mev_profit_eth_total
    Total profit in ETH across all successful executions

mev_gas_cost_eth_total
    Total gas cost in ETH across all executions
```

### Pool Cache Metrics

```
mev_pool_cache_hits_total
    Total number of cache hits

mev_pool_cache_misses_total
    Total number of cache misses

mev_pool_cache_size
    Current number of pools in cache (gauge)

mev_pool_cache_updates_total
    Total number of cache updates

mev_pool_cache_evictions_total
    Total number of cache evictions
```

### RPC Metrics

```
mev_rpc_requests_total
    Total number of RPC requests
    Labels: method (eth_call, eth_getBalance, etc.)

mev_rpc_errors_total
    Total number of RPC errors
    Labels: method, error_type

mev_rpc_latency_seconds
    RPC request latency histogram
    Labels: method
```

---

## Prometheus Configuration

### prometheus.yml

Create `config/prometheus/prometheus.yml`:

```yaml
global:
  scrape_interval: 15s        # Scrape targets every 15 seconds
  evaluation_interval: 15s    # Evaluate rules every 15 seconds

  # Attach labels to all time series
  external_labels:
    monitor: 'mev-bot-prod'
    environment: 'production'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load and evaluate rules
rule_files:
  - "alerts/*.yml"

# Scrape configurations
scrape_configs:
  # MEV Bot metrics
  - job_name: 'mev-bot'
    static_configs:
      - targets: ['mev-bot:8080']
        labels:
          service: 'mev-bot'
          component: 'main'

    # Scrape interval for high-frequency metrics
    scrape_interval: 5s
    scrape_timeout: 4s

    # Relabeling
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        replacement: 'mev-bot-v2'

  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node exporter (system metrics)
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          service: 'system'
```
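
Before (re)starting Prometheus it is worth validating the file. `promtool` ships in the `prom/prometheus` image, so assuming the container name used in the compose file below, a quick check looks like:

```bash
# Validate prometheus.yml syntax and any referenced rule files
docker exec prometheus promtool check config /etc/prometheus/prometheus.yml
```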

### Docker Compose Integration

Add to your `docker-compose.yml`:

```yaml
version: '3.8'

services:
  mev-bot:
    image: mev-bot-v2:latest
    container_name: mev-bot
    ports:
      - "8080:8080"  # Metrics endpoint
    environment:
      - RPC_URL=https://arb1.arbitrum.io/rpc
      - WS_URL=wss://arb1.arbitrum.io/ws
      - METRICS_PORT=8080
    networks:
      - monitoring
    restart: unless-stopped

  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./config/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./config/prometheus/alerts:/etc/prometheus/alerts:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--web.enable-lifecycle'
    networks:
      - monitoring
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    volumes:
      - ./config/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./config/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_ROOT_URL=http://localhost:3000
    networks:
      - monitoring
    depends_on:
      - prometheus
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    ports:
      - "9100:9100"
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    networks:
      - monitoring
    restart: unless-stopped

networks:
  monitoring:
    driver: bridge

volumes:
  prometheus-data:
  grafana-data:
```

---

## Grafana Dashboards

### Automatic Dashboard Provisioning

Create `config/grafana/provisioning/dashboards/dashboard.yml`:

```yaml
apiVersion: 1

providers:
  - name: 'MEV Bot Dashboards'
    orgId: 1
    folder: 'MEV Bot'
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
      foldersFromFilesStructure: true
```

Create `config/grafana/provisioning/datasources/prometheus.yml`:

```yaml
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
    jsonData:
      timeInterval: "5s"
```

### Dashboard JSON

Create `config/grafana/dashboards/mev-bot-overview.json`:

```json
{
  "dashboard": {
    "title": "MEV Bot V2 - Overview",
    "tags": ["mev", "arbitrage", "production"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Messages Received Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(mev_sequencer_messages_received_total[1m])",
            "legendFormat": "Messages/sec"
          }
        ],
        "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8}
      },
      {
        "id": 2,
        "title": "Parse Latency (P95)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(mev_parse_latency_seconds_bucket[5m]))",
            "legendFormat": "P95 Parse Latency"
          }
        ],
        "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8}
      },
      {
        "id": 3,
        "title": "Opportunities by Type",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(mev_opportunities_total[5m])",
            "legendFormat": "{{type}}"
          }
        ],
        "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8}
      },
      {
        "id": 4,
        "title": "Execution Success Rate",
        "type": "gauge",
        "targets": [
          {
            "expr": "rate(mev_executions_successful_total[5m]) / rate(mev_executions_attempted_total[5m]) * 100",
            "legendFormat": "Success %"
          }
        ],
        "gridPos": {"x": 12, "y": 8, "w": 6, "h": 8}
      },
      {
        "id": 5,
        "title": "Total Profit (ETH)",
        "type": "stat",
        "targets": [
          {
            "expr": "mev_profit_eth_total",
            "legendFormat": "Total Profit"
          }
        ],
        "gridPos": {"x": 18, "y": 8, "w": 6, "h": 8}
      }
    ],
    "refresh": "5s",
    "time": {
      "from": "now-1h",
      "to": "now"
    }
  }
}
```

---

## Alert Rules

Create `config/prometheus/alerts/mev-bot-alerts.yml`:

```yaml
groups:
  - name: mev_bot_alerts
    interval: 30s
    rules:
      # High error rate
      - alert: HighParseErrorRate
        expr: rate(mev_sequencer_parse_errors_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
          component: parser
        annotations:
          summary: "High parse error rate detected"
          description: "Parse error rate is {{ $value }} errors/sec (threshold: 10)"

      # Sequencer disconnection
      - alert: SequencerDisconnected
        expr: rate(mev_sequencer_messages_received_total[2m]) == 0
        for: 1m
        labels:
          severity: critical
          component: sequencer
        annotations:
          summary: "Sequencer feed disconnected"
          description: "No messages received from sequencer for 1 minute"

      # Slow parsing
      - alert: SlowParsing
        expr: histogram_quantile(0.95, rate(mev_parse_latency_seconds_bucket[5m])) > 0.1
        for: 5m
        labels:
          severity: warning
          component: parser
        annotations:
          summary: "Parse latency high"
          description: "P95 parse latency is {{ $value }}s (threshold: 0.1s)"

      # Low execution success rate
      - alert: LowExecutionSuccessRate
        expr: |
          (
            rate(mev_executions_successful_total[10m]) /
            rate(mev_executions_attempted_total[10m])
          ) < 0.1
        for: 5m
        labels:
          severity: warning
          component: execution
        annotations:
          summary: "Low execution success rate"
          description: "Success rate is {{ $value | humanizePercentage }} (threshold: 10%)"

      # Cache miss rate too high
      - alert: HighCacheMissRate
        expr: |
          (
            rate(mev_pool_cache_misses_total[5m]) /
            (rate(mev_pool_cache_hits_total[5m]) + rate(mev_pool_cache_misses_total[5m]))
          ) > 0.5
        for: 10m
        labels:
          severity: info
          component: cache
        annotations:
          summary: "High cache miss rate"
          description: "Cache miss rate is {{ $value | humanizePercentage }} (threshold: 50%)"

      # No opportunities detected
      - alert: NoOpportunitiesDetected
        expr: rate(mev_opportunities_total[15m]) == 0
        for: 15m
        labels:
          severity: warning
          component: detection
        annotations:
          summary: "No arbitrage opportunities detected"
          description: "No opportunities found in the last 15 minutes"

      # RPC errors
      - alert: HighRPCErrorRate
        expr: rate(mev_rpc_errors_total[5m]) > 5
        for: 3m
        labels:
          severity: warning
          component: rpc
        annotations:
          summary: "High RPC error rate"
          description: "RPC error rate is {{ $value }} errors/sec for method {{ $labels.method }}"
```
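
As with the main configuration, the rules file can be validated with `promtool` before Prometheus loads it (path as mounted in the compose file above):

```bash
# Validate alert rule syntax and expressions
docker exec prometheus promtool check rules /etc/prometheus/alerts/mev-bot-alerts.yml
```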

---

## Production Deployment

### 1. Deploy Full Stack

```bash
# Clone repository
git clone <repo-url>
cd mev-bot

# Create directories
mkdir -p config/prometheus/alerts
mkdir -p config/grafana/provisioning/{datasources,dashboards}
mkdir -p config/grafana/dashboards

# Copy configuration files (from this guide above)
# ... copy prometheus.yml, alerts, grafana configs ...

# Start all services
docker-compose up -d

# Verify services
docker-compose ps
```

### 2. Access Dashboards

- **Prometheus**: http://localhost:9090
- **Grafana**: http://localhost:3000 (admin/admin)
- **Metrics**: http://localhost:8080/metrics

### 3. Import Dashboards

1. Open Grafana at http://localhost:3000
2. Log in with admin/admin
3. Navigate to Dashboards → Import
4. Upload `mev-bot-overview.json`
5. Select "Prometheus" as data source

### 4. Configure Alerts

1. In Grafana: Alerting → Notification channels
2. Add Slack/PagerDuty/Email integration
3. Test alert routing (for Prometheus-side rule changes, see the reload example below)
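
Because the compose file starts Prometheus with `--web.enable-lifecycle`, changes to `prometheus.yml` or the alert rules can be applied in place instead of restarting the container:

```bash
# Ask Prometheus to reload its configuration and rule files
curl -X POST http://localhost:9090/-/reload
```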

---

## Query Examples

### PromQL Queries

**Message throughput**:
```promql
rate(mev_sequencer_messages_received_total[1m])
```

**Parse success rate**:
```promql
(
  rate(mev_sequencer_transactions_processed_total[5m]) /
  rate(mev_sequencer_messages_received_total[5m])
) * 100
```

**P50, P95, P99 parse latency**:
```promql
histogram_quantile(0.50, rate(mev_parse_latency_seconds_bucket[5m]))
histogram_quantile(0.95, rate(mev_parse_latency_seconds_bucket[5m]))
histogram_quantile(0.99, rate(mev_parse_latency_seconds_bucket[5m]))
```

**Top protocols by swap count**:
```promql
topk(5, rate(mev_sequencer_swaps_detected_total[5m]))
```

**Execution success vs failure**:
```promql
sum(rate(mev_executions_successful_total[5m])) by (type)
sum(rate(mev_executions_failed_total[5m])) by (reason)
```

**Profit per hour**:
```promql
increase(mev_profit_eth_total[1h])
```

**ROI (profit / gas cost)**:
```promql
(
  increase(mev_profit_eth_total[1h]) /
  increase(mev_gas_cost_eth_total[1h])
) * 100
```

**Cache hit rate**:
```promql
(
  rate(mev_pool_cache_hits_total[5m]) /
  (rate(mev_pool_cache_hits_total[5m]) + rate(mev_pool_cache_misses_total[5m]))
) * 100
```
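
Any of these expressions can also be run outside the UIs through the Prometheus HTTP API, which is handy for scripts and ad-hoc checks:

```bash
# Instant query via the HTTP API; the response is JSON
curl -sG 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=rate(mev_sequencer_messages_received_total[1m])'
```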

---

## Troubleshooting

### Metrics Not Appearing

**Symptom**: `/metrics` endpoint returns empty or no data

**Solutions**:
1. Verify MEV bot is running: `docker ps | grep mev-bot`
2. Check logs: `docker logs mev-bot`
3. Test endpoint: `curl http://localhost:8080/metrics`
4. Verify port mapping in docker-compose.yml

### Prometheus Not Scraping

**Symptom**: Prometheus shows target as "down"

**Solutions**:
1. Check Prometheus targets: http://localhost:9090/targets (or via the API, see the example below)
2. Verify network connectivity: `docker exec prometheus ping mev-bot`
3. Check Prometheus logs: `docker logs prometheus`
4. Verify scrape configuration in prometheus.yml
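
The same target health information is available from the Prometheus HTTP API, which helps when the web UI is not reachable (`jq` is assumed here only for readability):

```bash
# Show each scrape target's job and health as reported by Prometheus
curl -s http://localhost:9090/api/v1/targets | \
  jq '.data.activeTargets[] | {job: .labels.job, health: .health}'
```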

### High Memory Usage

**Symptom**: Prometheus consuming excessive memory

**Solutions**:
1. Reduce retention time: `--storage.tsdb.retention.time=15d`
2. Reduce scrape frequency: `scrape_interval: 30s`
3. Limit series cardinality (reduce label combinations)

### Missing Histograms

**Symptom**: Histogram percentiles return no data

**Solutions**:
1. Verify histogram buckets match query range
2. Use `rate()` before `histogram_quantile()`:
   ```promql
   histogram_quantile(0.95, rate(mev_parse_latency_seconds_bucket[5m]))
   ```
3. Ensure sufficient data points (increase time range)

### Grafana Dashboard Not Loading

**Symptom**: Dashboard shows "No data" or errors

**Solutions**:
1. Verify Prometheus data source: Settings → Data Sources
2. Test connection: "Save & Test" button
3. Check query syntax in panel editor
4. Verify time range matches data availability

---

## Performance Tuning

### For High Throughput

```yaml
# prometheus.yml
global:
  scrape_interval: 5s      # More frequent scraping
  scrape_timeout: 4s

scrape_configs:
  - job_name: 'mev-bot'
    scrape_interval: 2s    # Even more frequent for critical metrics
    metric_relabel_configs:
      # Drop unnecessary metrics to reduce cardinality
      - source_labels: [__name__]
        regex: 'go_.*'
        action: drop
```

### For Long-Term Storage

```bash
# Extend local retention and compress the WAL; for true long-term storage,
# pair this with remote_write (see the sketch below)
docker run -d \
  --name prometheus \
  -v ./prometheus.yml:/etc/prometheus/prometheus.yml \
  prom/prometheus:latest \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.retention.time=30d \
  --storage.tsdb.retention.size=50GB \
  --storage.tsdb.wal-compression
```
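
Prometheus itself is not intended as a long-term archive; for that it can forward samples to an external backend (Thanos, Mimir, VictoriaMetrics, etc.) via `remote_write`. A minimal sketch, where the endpoint URL is a placeholder for whatever backend you run:

```yaml
# prometheus.yml (sketch): forward samples to a long-term storage backend
remote_write:
  - url: "http://long-term-storage:9009/api/v1/push"   # placeholder endpoint
    queue_config:
      max_samples_per_send: 5000
```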

---

## Next Steps

1. **Custom Dashboards**: Create dashboards for specific use cases
2. **Advanced Alerts**: Configure multi-condition alerts
3. **Log Aggregation**: Integrate with Loki for log correlation
4. **Distributed Tracing**: Add Jaeger/Tempo for request tracing
5. **SLO Monitoring**: Define and track Service Level Objectives

---

## References

- [Prometheus Documentation](https://prometheus.io/docs/)
- [Grafana Documentation](https://grafana.com/docs/)
- [PromQL Guide](https://prometheus.io/docs/prometheus/latest/querying/basics/)
- [Best Practices](https://prometheus.io/docs/practices/naming/)

**Prometheus Integration**: 100% Complete ✅

@@ -11,9 +11,23 @@ import (
	"github.com/ethereum/go-ethereum/crypto"
	"github.com/ethereum/go-ethereum/rlp"

	"github.com/your-org/mev-bot/pkg/config"
	"github.com/your-org/mev-bot/pkg/validation"
)

// Package-level DEX configuration
var dexConfig *config.DEXConfig

// InitDEXConfig loads the DEX configuration from file
func InitDEXConfig(configPath string) error {
	cfg, err := config.LoadDEXConfig(configPath)
	if err != nil {
		return fmt.Errorf("failed to load DEX config: %w", err)
	}
	dexConfig = cfg
	return nil
}

// L2MessageKind represents the type of L2 message
type L2MessageKind uint8

@@ -233,36 +247,17 @@ func GetSwapProtocol(to *common.Address, data []byte) *DEXProtocol {
	selector := hex.EncodeToString(data[0:4])
	toAddr := to.Hex()

	// Map known router addresses (Arbitrum mainnet)
	knownRouters := map[string]*DEXProtocol{
		// UniswapV2/V3
		"0x1b02dA8Cb0d097eB8D57A175b88c7D8b47997506": {Name: "SushiSwap", Version: "V2", Type: "router"},
		"0xE592427A0AEce92De3Edee1F18E0157C05861564": {Name: "UniswapV3", Version: "V1", Type: "router"},
		"0x68b3465833fb72A70ecDF485E0e4C7bD8665Fc45": {Name: "UniswapV3", Version: "V2", Type: "router"},
		"0xEf1c6E67703c7BD7107eed8303Fbe6EC2554BF6B": {Name: "UniswapUniversal", Version: "V1", Type: "router"},

		// Camelot
		"0xc873fEcbd354f5A56E00E710B90EF4201db2448d": {Name: "Camelot", Version: "V2", Type: "router"},
		"0x1F721E2E82F6676FCE4eA07A5958cF098D339e18": {Name: "Camelot", Version: "V3", Type: "router"},

		// Balancer
		"0xBA12222222228d8Ba445958a75a0704d566BF2C8": {Name: "Balancer", Version: "V2", Type: "vault"},

		// Curve
		"0x7544Fe3d184b6B55D6B36c3FCA1157eE0Ba30287": {Name: "Curve", Version: "V1", Type: "router"},

		// Kyber
		"0x6131B5fae19EA4f9D964eAc0408E4408b66337b5": {Name: "KyberSwap", Version: "V1", Type: "router"},
		"0xC1e7dFE73E1598E3910EF4C7845B68A19f0e8c6F": {Name: "KyberSwap", Version: "V2", Type: "router"},

		// Aggregators
		"0x1111111254EEB25477B68fb85Ed929f73A960582": {Name: "1inch", Version: "V5", Type: "router"},
		"0xDEF171Fe48CF0115B1d80b88dc8eAB59176FEe57": {Name: "Paraswap", Version: "V5", Type: "router"},
	// Check if it's a known router (from config if loaded, else use fallback)
	if dexConfig != nil {
		for addr, routerCfg := range dexConfig.Routers {
			if addr == toAddr {
				return &DEXProtocol{
					Name:    routerCfg.Name,
					Version: routerCfg.Version,
					Type:    routerCfg.Type,
				}
			}
		}
	}

	// Check if it's a known router
	if protocol, ok := knownRouters[toAddr]; ok {
		return protocol
	}

	// Try to identify by function selector

@@ -3,10 +3,8 @@ package sequencer
import (
	"context"
	"fmt"
	"log/slog"
	"math/big"
	"sync"
	"sync/atomic"
	"time"

	"github.com/ethereum/go-ethereum/core/types"
@@ -17,6 +15,7 @@ import (
	"github.com/your-org/mev-bot/pkg/arbitrage"
	"github.com/your-org/mev-bot/pkg/cache"
	"github.com/your-org/mev-bot/pkg/execution"
	"github.com/your-org/mev-bot/pkg/metrics"
	"github.com/your-org/mev-bot/pkg/parsers"
	"github.com/your-org/mev-bot/pkg/validation"
)
@@ -63,7 +62,7 @@ func DefaultReaderConfig() *ReaderConfig {
// Reader reads pending transactions from the Arbitrum sequencer
type Reader struct {
	config *ReaderConfig
	logger *slog.Logger
	logger log.Logger

	// Components
	parsers parsers.Factory
@@ -90,16 +89,8 @@ type Reader struct {
	opportunityCount uint64
	executionCount   uint64

	// Metrics (atomic operations - thread-safe without mutex)
	txReceived          atomic.Uint64
	txProcessed         atomic.Uint64
	parseErrors         atomic.Uint64
	validationErrors    atomic.Uint64
	opportunitiesFound  atomic.Uint64
	executionsAttempted atomic.Uint64
	avgParseLatency     atomic.Int64 // stored as nanoseconds
	avgDetectLatency    atomic.Int64 // stored as nanoseconds
	avgExecuteLatency   atomic.Int64 // stored as nanoseconds
	// NOTE: Metrics are now handled by pkg/metrics (Prometheus)
	// No local atomic counters needed - metrics package handles thread safety
}

// NewReader creates a new sequencer reader
@@ -110,7 +101,7 @@ func NewReader(
	poolCache cache.PoolCache,
	detector *arbitrage.Detector,
	executor *execution.Executor,
	logger *slog.Logger,
	logger log.Logger,
) (*Reader, error) {
	if config == nil {
		config = DefaultReaderConfig()
@@ -125,13 +116,13 @@ func NewReader(
	// Create swap filter with pool cache
	swapFilter := NewSwapFilter(&SwapFilterConfig{
		SwapChannelSize: config.BufferSize,
		Logger:          loggerAdapter(logger),
		Logger:          logger,
		PoolCacheFile:   "data/discovered_pools.json",
	})

	return &Reader{
		config:    config,
		logger:    logger.With("component", "sequencer_reader"),
		logger:    logger.New("component", "sequencer_reader"),
		parsers:   parsers,
		validator: validator,
		poolCache: poolCache,
@@ -144,13 +135,6 @@ func NewReader(
	}, nil
}

// loggerAdapter converts slog.Logger to log.Logger interface
func loggerAdapter(l *slog.Logger) log.Logger {
	// For now, create a simple wrapper
	// TODO: Implement proper adapter if needed
	return log.Root()
}

// Start starts the sequencer reader
func (r *Reader) Start(ctx context.Context) error {
	r.logger.Info("starting sequencer reader",
@@ -312,7 +296,7 @@ func (r *Reader) readMessages(ctx context.Context, conn *websocket.Conn) error {
	if messages, ok := msg["messages"].([]interface{}); ok {
		for _, m := range messages {
			if msgMap, ok := m.(map[string]interface{}); ok {
				r.txReceived.Add(1)
				metrics.MessagesReceived.Inc()

				// Pass message to swap filter for processing
				if r.swapFilter != nil {
@@ -328,7 +312,7 @@ func (r *Reader) readMessages(ctx context.Context, conn *websocket.Conn) error {
func (r *Reader) worker(ctx context.Context, id int) {
	defer r.wg.Done()

	logger := r.logger.With("worker", id)
	logger := r.logger.New("worker", id)

	for {
		select {
@@ -365,7 +349,7 @@ func (r *Reader) processSwapEvent(ctx context.Context, swapEvent *SwapEvent) err
	// Parse transaction events (no receipt for pending transactions)
	events, err := r.parsers.ParseTransaction(procCtx, tx, nil)
	if err != nil {
		r.parseErrors.Add(1)
		metrics.ParseErrors.Inc()
		return fmt.Errorf("parse failed: %w", err)
	}

@@ -373,12 +357,12 @@ func (r *Reader) processSwapEvent(ctx context.Context, swapEvent *SwapEvent) err
		return nil // No swap events
	}

	r.avgParseLatency.Store(time.Since(parseStart).Nanoseconds())
	metrics.ParseLatency.Observe(time.Since(parseStart).Seconds())

	// Validate events
	validEvents := r.validator.FilterValid(procCtx, events)
	if len(validEvents) == 0 {
		r.validationErrors.Add(1)
		metrics.ValidationErrors.Inc()
		return nil
	}

@@ -395,24 +379,24 @@ func (r *Reader) processSwapEvent(ctx context.Context, swapEvent *SwapEvent) err
			continue
		}

		r.avgDetectLatency.Store(time.Since(detectStart).Nanoseconds())
		metrics.DetectionLatency.Observe(time.Since(detectStart).Seconds())

		// Execute profitable opportunities
		for _, opp := range opportunities {
			if opp.NetProfit.Cmp(r.config.MinProfit) > 0 {
				r.opportunitiesFound.Add(1)
				metrics.RecordOpportunity("arbitrage")
				r.opportunityCount++

				if r.config.EnableFrontRunning {
					execStart := time.Now()
					go r.executeFrontRun(ctx, opp, tx)
					r.avgExecuteLatency.Store(time.Since(execStart).Nanoseconds())
					metrics.ExecutionLatency.Observe(time.Since(execStart).Seconds())
				}
			}
		}
	}

	r.txProcessed.Add(1)
	metrics.TransactionsProcessed.Inc()
	r.processedCount++
	r.lastProcessed = time.Now()

@@ -426,7 +410,7 @@ func (r *Reader) processSwapEvent(ctx context.Context, swapEvent *SwapEvent) err

// executeFrontRun executes a front-running transaction
func (r *Reader) executeFrontRun(ctx context.Context, opp *arbitrage.Opportunity, targetTx *types.Transaction) {
	r.executionsAttempted.Add(1)
	metrics.ExecutionsAttempted.Inc()
	r.executionCount++

	r.logger.Info("front-running opportunity",
@@ -465,22 +449,19 @@ func (r *Reader) executeFrontRun(ctx context.Context, opp *arbitrage.Opportunity
}

// GetStats returns current statistics
// NOTE: Detailed metrics are now available via Prometheus /metrics endpoint
// This returns only basic connection state and local counters
func (r *Reader) GetStats() map[string]interface{} {
	r.mu.RLock()
	defer r.mu.RUnlock()

	return map[string]interface{}{
		"connected": r.connected,
		"tx_received":          r.txReceived.Load(),
		"tx_processed":         r.txProcessed.Load(),
		"parse_errors":         r.parseErrors.Load(),
		"validation_errors":    r.validationErrors.Load(),
		"opportunities_found":  r.opportunitiesFound.Load(),
		"executions_attempted": r.executionsAttempted.Load(),
		"avg_parse_latency":    time.Duration(r.avgParseLatency.Load()).String(),
		"avg_detect_latency":   time.Duration(r.avgDetectLatency.Load()).String(),
		"avg_execute_latency":  time.Duration(r.avgExecuteLatency.Load()).String(),
		"processed_count":   r.processedCount,
		"opportunity_count": r.opportunityCount,
		"execution_count":   r.executionCount,
		"last_processed":    r.lastProcessed.Format(time.RFC3339),
		"metrics_endpoint":  "/metrics (Prometheus format)",
	}
}
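
The diff above swaps the Reader's local atomic counters for calls into `pkg/metrics` (for example `metrics.MessagesReceived.Inc()` and `metrics.ParseLatency.Observe(...)`). That package is not part of this compare; as a hedged sketch of how the documented `/metrics` endpoint on port 8080 could be served alongside those collectors with `prometheus/client_golang` (illustrative only, the real implementation may differ):

```go
package metrics

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Serve exposes the default Prometheus registry over HTTP, e.g. Serve(":8080"),
// matching the /metrics endpoint described in docs/PROMETHEUS_SETUP.md.
// Sketch only; the actual pkg/metrics may wire this up differently.
func Serve(addr string) error {
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler())
	return http.ListenAndServe(addr, mux)
}
```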