feat: create v2-prep branch with comprehensive planning
Restructured project for V2 refactor:

**Structure Changes:**
- Moved all V1 code to orig/ folder (preserved with git mv)
- Created docs/planning/ directory
- Added orig/README_V1.md explaining V1 preservation

**Planning Documents:**
- 00_V2_MASTER_PLAN.md: Complete architecture overview
  - Executive summary of critical V1 issues
  - High-level component architecture diagrams
  - 5-phase implementation roadmap
  - Success metrics and risk mitigation
- 07_TASK_BREAKDOWN.md: Atomic task breakdown
  - 99+ hours of detailed tasks
  - Every task < 2 hours (atomic)
  - Clear dependencies and success criteria
  - Organized by implementation phase

**V2 Key Improvements:**
- Per-exchange parsers (factory pattern)
- Multi-layer strict validation
- Multi-index pool cache
- Background validation pipeline
- Comprehensive observability

**Critical Issues Addressed:**
- Zero-address tokens (strict validation + cache enrichment)
- Parsing accuracy (protocol-specific parsers)
- No audit trail (background validation channel)
- Inefficient lookups (multi-index cache)
- Stats disconnection (event-driven metrics)

**Next Steps:**
1. Review planning documents
2. Begin Phase 1: Foundation (P1-001 through P1-010)
3. Implement parsers in Phase 2
4. Build cache system in Phase 3
5. Add validation pipeline in Phase 4
6. Migrate and test in Phase 5

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
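For a concrete feel of the planned factory-pattern parsers, a minimal sketch under assumed names (ExchangeParser, SwapEvent, and the registry are hypothetical; the actual V2 interfaces are specified in the planning documents):

	// Sketch only: per-exchange parser behind a factory; all names hypothetical.
	package parser

	import "fmt"

	// SwapEvent is a placeholder for a normalized swap record.
	type SwapEvent struct {
		Pool, TokenIn, TokenOut string
	}

	// ExchangeParser decodes raw log data for one protocol.
	type ExchangeParser interface {
		Parse(data []byte) (*SwapEvent, error)
	}

	var registry = map[string]func() ExchangeParser{}

	// Register installs a constructor for a protocol name.
	func Register(name string, ctor func() ExchangeParser) { registry[name] = ctor }

	// New returns a parser for the named protocol, or an error for unknown ones.
	func New(name string) (ExchangeParser, error) {
		ctor, ok := registry[name]
		if !ok {
			return nil, fmt.Errorf("no parser registered for %q", name)
		}
		return ctor(), nil
	}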
orig/pkg/lifecycle/shutdown_manager.go (new file, 876 lines)
@@ -0,0 +1,876 @@
package lifecycle

import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/signal"
	"sync"
	"syscall"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// ShutdownManager handles graceful shutdown of the application
type ShutdownManager struct {
	registry             *ModuleRegistry
	shutdownTasks        []ShutdownTask
	shutdownHooks        []ShutdownHook
	config               ShutdownConfig
	signalChannel        chan os.Signal
	shutdownChannel      chan struct{}
	state                ShutdownState
	startTime            time.Time
	shutdownStarted      time.Time
	mu                   sync.RWMutex
	wg                   sync.WaitGroup
	ctx                  context.Context
	cancel               context.CancelFunc
	logger               *logger.Logger
	shutdownErrors       []error
	shutdownErrorDetails []RecordedError
	errMu                sync.Mutex
	exitFunc             func(code int)
	emergencyHandler     func(ctx context.Context, reason string, err error) error
}

// ShutdownTask represents a task to be executed during shutdown
type ShutdownTask struct {
	Name     string
	Priority int
	Timeout  time.Duration
	Task     func(ctx context.Context) error
	OnError  func(error)
	Critical bool
	Enabled  bool
}
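
// Illustrative registration (flushMetrics is a hypothetical helper):
//
//	sm.AddShutdownTask(ShutdownTask{
//	    Name:     "flush_metrics",
//	    Priority: 95,
//	    Timeout:  2 * time.Second,
//	    Task:     flushMetrics,
//	    Critical: false,
//	    Enabled:  true,
//	})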

// ShutdownHook is called at different stages of shutdown
type ShutdownHook interface {
	OnShutdownStarted(ctx context.Context) error
	OnModulesStopped(ctx context.Context) error
	OnCleanupStarted(ctx context.Context) error
	OnShutdownCompleted(ctx context.Context) error
	OnShutdownFailed(ctx context.Context, err error) error
}
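
// Illustrative: custom hooks can embed DefaultShutdownHook (defined later in
// this file) and override only the stages they care about:
//
//	type metricsHook struct{ DefaultShutdownHook }
//
//	func (h *metricsHook) OnShutdownCompleted(ctx context.Context) error {
//	    // flush final metrics here
//	    return nil
//	}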

// ShutdownConfig configures shutdown behavior
type ShutdownConfig struct {
	GracefulTimeout    time.Duration `json:"graceful_timeout"`
	ForceTimeout       time.Duration `json:"force_timeout"`
	SignalBufferSize   int           `json:"signal_buffer_size"`
	MaxRetries         int           `json:"max_retries"`
	RetryDelay         time.Duration `json:"retry_delay"`
	ParallelShutdown   bool          `json:"parallel_shutdown"`
	SaveState          bool          `json:"save_state"`
	CleanupTempFiles   bool          `json:"cleanup_temp_files"`
	NotifyExternal     bool          `json:"notify_external"`
	WaitForConnections bool          `json:"wait_for_connections"`
	EnableMetrics      bool          `json:"enable_metrics"`
}
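
// Illustrative configuration (values are examples, not recommendations;
// zero-valued fields fall back to the defaults applied in NewShutdownManager):
//
//	cfg := ShutdownConfig{
//	    GracefulTimeout:  30 * time.Second,
//	    ForceTimeout:     60 * time.Second,
//	    ParallelShutdown: true,
//	    SaveState:        true,
//	    CleanupTempFiles: true,
//	}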

// ShutdownState represents the current shutdown state
type ShutdownState string

const (
	ShutdownStateRunning    ShutdownState = "running"
	ShutdownStateInitiated  ShutdownState = "initiated"
	ShutdownStateModuleStop ShutdownState = "stopping_modules"
	ShutdownStateCleanup    ShutdownState = "cleanup"
	ShutdownStateCompleted  ShutdownState = "completed"
	ShutdownStateFailed     ShutdownState = "failed"
	ShutdownStateForced     ShutdownState = "forced"
)
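
// Typical progression during a clean shutdown:
//
//	running → initiated → stopping_modules → cleanup → completed
//
// with failed and forced as terminal states when shutdown goes wrong.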

// ShutdownMetrics tracks shutdown performance
type ShutdownMetrics struct {
	ShutdownInitiated time.Time     `json:"shutdown_initiated"`
	ModuleStopTime    time.Duration `json:"module_stop_time"`
	CleanupTime       time.Duration `json:"cleanup_time"`
	TotalShutdownTime time.Duration `json:"total_shutdown_time"`
	TasksExecuted     int           `json:"tasks_executed"`
	TasksSuccessful   int           `json:"tasks_successful"`
	TasksFailed       int           `json:"tasks_failed"`
	RetryAttempts     int           `json:"retry_attempts"`
	ForceShutdown     bool          `json:"force_shutdown"`
	Signal            string        `json:"signal"`
}

// ShutdownProgress tracks shutdown progress
type ShutdownProgress struct {
	State              ShutdownState `json:"state"`
	Progress           float64       `json:"progress"`
	CurrentTask        string        `json:"current_task"`
	CompletedTasks     int           `json:"completed_tasks"`
	TotalTasks         int           `json:"total_tasks"`
	ElapsedTime        time.Duration `json:"elapsed_time"`
	EstimatedRemaining time.Duration `json:"estimated_remaining"`
	Message            string        `json:"message"`
}

// NewShutdownManager creates a new shutdown manager
func NewShutdownManager(registry *ModuleRegistry, config ShutdownConfig) *ShutdownManager {
	// Apply default configuration before constructing the manager, so that
	// derived values (such as the signal channel buffer) use the defaults
	// rather than the zero value.
	if config.GracefulTimeout == 0 {
		config.GracefulTimeout = 30 * time.Second
	}
	if config.ForceTimeout == 0 {
		config.ForceTimeout = 60 * time.Second
	}
	if config.SignalBufferSize == 0 {
		config.SignalBufferSize = 10
	}
	if config.MaxRetries == 0 {
		config.MaxRetries = 3
	}
	if config.RetryDelay == 0 {
		config.RetryDelay = time.Second
	}

	ctx, cancel := context.WithCancel(context.Background())

	sm := &ShutdownManager{
		registry:             registry,
		shutdownTasks:        make([]ShutdownTask, 0),
		shutdownHooks:        make([]ShutdownHook, 0),
		config:               config,
		signalChannel:        make(chan os.Signal, config.SignalBufferSize),
		shutdownChannel:      make(chan struct{}),
		state:                ShutdownStateRunning,
		startTime:            time.Now(),
		ctx:                  ctx,
		cancel:               cancel,
		shutdownErrors:       make([]error, 0),
		shutdownErrorDetails: make([]RecordedError, 0),
		exitFunc:             os.Exit,
	}

	if err := os.MkdirAll("logs", 0o755); err != nil {
		fmt.Printf("failed to ensure logs directory: %v\n", err)
	}
	sm.logger = logger.New("info", "", "logs/lifecycle_shutdown.log")

	// Setup default shutdown tasks
	sm.setupDefaultTasks()

	// Setup signal handling
	sm.setupSignalHandling()

	return sm
}

// Start starts the shutdown manager
func (sm *ShutdownManager) Start() error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	if sm.state != ShutdownStateRunning {
		return fmt.Errorf("shutdown manager not in running state: %s", sm.state)
	}

	// Start signal monitoring
	go sm.signalHandler()

	return nil
}

// Shutdown initiates graceful shutdown
func (sm *ShutdownManager) Shutdown(ctx context.Context) error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	if sm.state != ShutdownStateRunning {
		return fmt.Errorf("shutdown already initiated: %s", sm.state)
	}

	sm.state = ShutdownStateInitiated
	sm.shutdownStarted = time.Now()

	// Register the shutdown work before signalling, so Wait cannot observe
	// the closed channel before the WaitGroup is armed.
	sm.wg.Add(1)

	// Close shutdown channel to signal shutdown
	close(sm.shutdownChannel)

	err := sm.performShutdown(ctx)

	if combined := sm.combinedShutdownError(); combined != nil {
		return combined
	}

	return err
}

// ForceShutdown forces immediate shutdown
func (sm *ShutdownManager) ForceShutdown(ctx context.Context) error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	sm.state = ShutdownStateForced
	sm.cancel() // Cancel all operations

	// Force stop all modules immediately
	var forceErr error
	if sm.registry != nil {
		forceCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		if err := sm.registry.StopAll(forceCtx); err != nil {
			wrapped := fmt.Errorf("StopAll failed during force shutdown: %w", err)
			sm.recordShutdownError("StopAll failed in force shutdown", wrapped)
			forceErr = errors.Join(forceErr, wrapped)
		}
	}

	if forceErr != nil {
		sm.recordShutdownError("Force shutdown encountered errors", forceErr)
	}

	sm.exitFunc(1)
	return forceErr
}

// AddShutdownTask adds a task to be executed during shutdown
func (sm *ShutdownManager) AddShutdownTask(task ShutdownTask) {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	sm.shutdownTasks = append(sm.shutdownTasks, task)
	sm.sortTasksByPriority()
}

// AddShutdownHook adds a hook to be called during shutdown phases
func (sm *ShutdownManager) AddShutdownHook(hook ShutdownHook) {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	sm.shutdownHooks = append(sm.shutdownHooks, hook)
}

// GetState returns the current shutdown state
func (sm *ShutdownManager) GetState() ShutdownState {
	sm.mu.RLock()
	defer sm.mu.RUnlock()
	return sm.state
}

// GetProgress returns the current shutdown progress
func (sm *ShutdownManager) GetProgress() ShutdownProgress {
	sm.mu.RLock()
	defer sm.mu.RUnlock()

	totalTasks := len(sm.shutdownTasks)
	if sm.registry != nil {
		totalTasks += len(sm.registry.List())
	}

	var progress float64
	var completedTasks int
	var currentTask string

	switch sm.state {
	case ShutdownStateRunning:
		progress = 0
		currentTask = "Running"
	case ShutdownStateInitiated:
		progress = 0.1
		currentTask = "Shutdown initiated"
	case ShutdownStateModuleStop:
		progress = 0.3
		currentTask = "Stopping modules"
		completedTasks = totalTasks / 3
	case ShutdownStateCleanup:
		progress = 0.7
		currentTask = "Cleanup"
		completedTasks = (totalTasks * 2) / 3
	case ShutdownStateCompleted:
		progress = 1.0
		currentTask = "Completed"
		completedTasks = totalTasks
	case ShutdownStateFailed:
		progress = 0.8
		currentTask = "Failed"
	case ShutdownStateForced:
		progress = 1.0
		currentTask = "Forced shutdown"
		completedTasks = totalTasks
	}

	// Before shutdown starts, shutdownStarted is the zero time; report zero
	// elapsed time instead of the time since the epoch.
	var elapsedTime time.Duration
	if !sm.shutdownStarted.IsZero() {
		elapsedTime = time.Since(sm.shutdownStarted)
	}
	var estimatedRemaining time.Duration
	if progress > 0 && progress < 1.0 {
		totalEstimated := time.Duration(float64(elapsedTime) / progress)
		estimatedRemaining = totalEstimated - elapsedTime
	}

	return ShutdownProgress{
		State:              sm.state,
		Progress:           progress,
		CurrentTask:        currentTask,
		CompletedTasks:     completedTasks,
		TotalTasks:         totalTasks,
		ElapsedTime:        elapsedTime,
		EstimatedRemaining: estimatedRemaining,
		Message:            fmt.Sprintf("Shutdown %s", sm.state),
	}
}

// Wait waits for shutdown to complete
func (sm *ShutdownManager) Wait() {
	<-sm.shutdownChannel
	sm.wg.Wait()
}

// WaitWithTimeout waits for shutdown to complete, with a timeout
func (sm *ShutdownManager) WaitWithTimeout(timeout time.Duration) error {
	done := make(chan struct{})
	go func() {
		sm.Wait()
		close(done)
	}()

	select {
	case <-done:
		return nil
	case <-time.After(timeout):
		return fmt.Errorf("shutdown timeout after %v", timeout)
	}
}

// Private methods

func (sm *ShutdownManager) setupSignalHandling() {
	signal.Notify(sm.signalChannel,
		syscall.SIGINT,
		syscall.SIGTERM,
		syscall.SIGQUIT,
		syscall.SIGHUP,
	)
}

func (sm *ShutdownManager) setupDefaultTasks() {
	// Task: Save application state
	sm.shutdownTasks = append(sm.shutdownTasks, ShutdownTask{
		Name:     "save_state",
		Priority: 100,
		Timeout:  10 * time.Second,
		Task: func(ctx context.Context) error {
			if sm.config.SaveState {
				return sm.saveApplicationState(ctx)
			}
			return nil
		},
		Critical: false,
		Enabled:  sm.config.SaveState,
	})

	// Task: Close external connections
	sm.shutdownTasks = append(sm.shutdownTasks, ShutdownTask{
		Name:     "close_connections",
		Priority: 90,
		Timeout:  5 * time.Second,
		Task: func(ctx context.Context) error {
			return sm.closeExternalConnections(ctx)
		},
		Critical: false,
		Enabled:  sm.config.WaitForConnections,
	})

	// Task: Cleanup temporary files
	sm.shutdownTasks = append(sm.shutdownTasks, ShutdownTask{
		Name:     "cleanup_temp_files",
		Priority: 10,
		Timeout:  5 * time.Second,
		Task: func(ctx context.Context) error {
			if sm.config.CleanupTempFiles {
				return sm.cleanupTempFiles(ctx)
			}
			return nil
		},
		Critical: false,
		Enabled:  sm.config.CleanupTempFiles,
	})

	// Task: Notify external systems
	sm.shutdownTasks = append(sm.shutdownTasks, ShutdownTask{
		Name:     "notify_external",
		Priority: 80,
		Timeout:  3 * time.Second,
		Task: func(ctx context.Context) error {
			if sm.config.NotifyExternal {
				return sm.notifyExternalSystems(ctx)
			}
			return nil
		},
		Critical: false,
		Enabled:  sm.config.NotifyExternal,
	})
}

func (sm *ShutdownManager) signalHandler() {
	for {
		select {
		case sig := <-sm.signalChannel:
			switch sig {
			case syscall.SIGINT, syscall.SIGTERM:
				// Graceful shutdown
				ctx, cancel := context.WithTimeout(context.Background(), sm.config.GracefulTimeout)
				if err := sm.Shutdown(ctx); err != nil {
					sm.recordShutdownError("Graceful shutdown failed from signal", err)
					cancel()
					// Force shutdown if graceful fails
					forceCtx, forceCancel := context.WithTimeout(context.Background(), sm.config.ForceTimeout)
					if err := sm.ForceShutdown(forceCtx); err != nil {
						sm.recordShutdownError("Force shutdown error in timeout scenario", err)
						// Escalate force shutdown failure to emergency protocols
						sm.triggerEmergencyShutdown("Force shutdown failed after graceful timeout", err)
					}
					forceCancel()
				}
				cancel()
				return
			case syscall.SIGQUIT:
				// Force shutdown
				ctx, cancel := context.WithTimeout(context.Background(), sm.config.ForceTimeout)
				if err := sm.ForceShutdown(ctx); err != nil {
					sm.recordShutdownError("Force shutdown error in SIGQUIT handler", err)
					// Escalate force shutdown failure to emergency protocols
					sm.triggerEmergencyShutdown("Force shutdown failed on SIGQUIT", err)
				}
				cancel()
				return
			case syscall.SIGHUP:
				// Reload signal, reserved for configuration reload; currently ignored
				continue
			}
		case <-sm.ctx.Done():
			return
		}
	}
}

func (sm *ShutdownManager) performShutdown(ctx context.Context) error {
	defer sm.wg.Done() // Matches the wg.Add(1) in Shutdown

	// Create timeout context for entire shutdown
	shutdownCtx, cancel := context.WithTimeout(ctx, sm.config.GracefulTimeout)
	defer cancel()

	var phaseErrors []error

	// Phase 1: Call shutdown started hooks
	if err := sm.callHooks(shutdownCtx, "OnShutdownStarted", nil); err != nil {
		wrapped := fmt.Errorf("shutdown start hooks failed: %w", err)
		sm.recordShutdownError("Shutdown started hook failure", wrapped)
		phaseErrors = append(phaseErrors, wrapped)
	}

	// Phase 2: Stop modules
	sm.state = ShutdownStateModuleStop
	if sm.registry != nil {
		if err := sm.registry.StopAll(shutdownCtx); err != nil {
			wrapped := fmt.Errorf("failed to stop modules: %w", err)
			sm.recordShutdownError("Module stop failure", wrapped)
			phaseErrors = append(phaseErrors, wrapped)
		}
	}

	// Call modules stopped hooks
	if err := sm.callHooks(shutdownCtx, "OnModulesStopped", nil); err != nil {
		wrapped := fmt.Errorf("modules stopped hooks failed: %w", err)
		sm.recordShutdownError("Modules stopped hook failure", wrapped)
		phaseErrors = append(phaseErrors, wrapped)
	}

	// Phase 3: Execute shutdown tasks
	sm.state = ShutdownStateCleanup
	if err := sm.callHooks(shutdownCtx, "OnCleanupStarted", nil); err != nil {
		wrapped := fmt.Errorf("cleanup hooks failed: %w", err)
		sm.recordShutdownError("Cleanup hook failure", wrapped)
		phaseErrors = append(phaseErrors, wrapped)
	}

	if err := sm.executeShutdownTasks(shutdownCtx); err != nil {
		wrapped := fmt.Errorf("shutdown tasks failed: %w", err)
		sm.recordShutdownError("Shutdown task execution failure", wrapped)
		phaseErrors = append(phaseErrors, wrapped)
	}

	// Phase 4: Final cleanup
	if len(phaseErrors) > 0 {
		finalErr := errors.Join(phaseErrors...)
		sm.state = ShutdownStateFailed
		if err := sm.callHooks(shutdownCtx, "OnShutdownFailed", finalErr); err != nil {
			wrapped := fmt.Errorf("shutdown failed hook error: %w", err)
			sm.recordShutdownError("Shutdown failed hook error", wrapped)
			finalErr = errors.Join(finalErr, wrapped)
			// Escalate the hook failure. performShutdown runs with sm.mu held
			// (via Shutdown) and the mutex is not reentrant, so call the
			// lock-free variant; the state is already ShutdownStateFailed here.
			sm.emergencyShutdownActions("Shutdown failed hook error", wrapped)
		}
		return finalErr
	}

	sm.state = ShutdownStateCompleted
	if err := sm.callHooks(shutdownCtx, "OnShutdownCompleted", nil); err != nil {
		wrapped := fmt.Errorf("shutdown completed hook error: %w", err)
		sm.recordShutdownError("Shutdown completed hook error", wrapped)
		// Log but don't fail shutdown for completion hook errors: these are
		// non-critical notifications that shouldn't prevent successful shutdown.
		sm.logger.Warn("Shutdown completed hook failed", "error", wrapped)
	}

	return nil
}

func (sm *ShutdownManager) executeShutdownTasks(ctx context.Context) error {
	if sm.config.ParallelShutdown {
		return sm.executeTasksParallel(ctx)
	}
	return sm.executeTasksSequential(ctx)
}

func (sm *ShutdownManager) executeTasksSequential(ctx context.Context) error {
	var lastErr error

	for _, task := range sm.shutdownTasks {
		if !task.Enabled {
			continue
		}

		if err := sm.executeTask(ctx, task); err != nil {
			lastErr = err
			if task.Critical {
				return fmt.Errorf("critical task %s failed: %w", task.Name, err)
			}
		}
	}

	return lastErr
}

func (sm *ShutdownManager) executeTasksParallel(ctx context.Context) error {
	// Carry each task's Critical flag alongside its error rather than
	// recovering it afterwards by string matching, which was fragile and
	// could slice past the end of short error messages.
	type taskResult struct {
		err      error
		critical bool
	}

	var wg sync.WaitGroup
	results := make(chan taskResult, len(sm.shutdownTasks))

	// Group tasks by priority
	priorityGroups := sm.groupTasksByPriority()

	// Execute each priority group sequentially, but tasks within a group in parallel
	for _, tasks := range priorityGroups {
		for _, task := range tasks {
			if !task.Enabled {
				continue
			}

			wg.Add(1)
			go func(t ShutdownTask) {
				defer wg.Done()
				if err := sm.executeTask(ctx, t); err != nil {
					results <- taskResult{
						err:      fmt.Errorf("task %s failed: %w", t.Name, err),
						critical: t.Critical,
					}
				}
			}(task)
		}
		wg.Wait()
	}

	close(results)

	// Collect errors; a critical task failure takes precedence
	var criticalErr error
	var lastErr error
	for res := range results {
		lastErr = res.err
		if res.critical && criticalErr == nil {
			criticalErr = res.err
		}
	}

	if criticalErr != nil {
		return criticalErr
	}
	return lastErr
}

func (sm *ShutdownManager) executeTask(ctx context.Context, task ShutdownTask) error {
	// Create timeout context for the task
	taskCtx, cancel := context.WithTimeout(ctx, task.Timeout)
	defer cancel()

	// Execute task with retry
	var lastErr error
	for attempt := 0; attempt <= sm.config.MaxRetries; attempt++ {
		if attempt > 0 {
			select {
			case <-time.After(sm.config.RetryDelay):
			case <-taskCtx.Done():
				return taskCtx.Err()
			}
		}

		err := task.Task(taskCtx)
		if err == nil {
			return nil
		}

		lastErr = err
		attemptNumber := attempt + 1

		sm.recordShutdownError(
			fmt.Sprintf("Shutdown task %s failed", task.Name),
			fmt.Errorf("attempt %d: %w", attemptNumber, err),
			"task", task.Name,
			"attempt", attemptNumber,
		)

		// Call error handler if provided
		if task.OnError != nil {
			task.OnError(err)
		}
	}

	// The loop makes MaxRetries+1 attempts: the initial try plus the retries
	return fmt.Errorf("task failed after %d attempts: %w", sm.config.MaxRetries+1, lastErr)
}

func (sm *ShutdownManager) callHooks(ctx context.Context, hookMethod string, cause error) error {
	var hookErrors []error

	for _, hook := range sm.shutdownHooks {
		hookName := fmt.Sprintf("%T", hook)
		var err error

		switch hookMethod {
		case "OnShutdownStarted":
			err = hook.OnShutdownStarted(ctx)
		case "OnModulesStopped":
			err = hook.OnModulesStopped(ctx)
		case "OnCleanupStarted":
			err = hook.OnCleanupStarted(ctx)
		case "OnShutdownCompleted":
			err = hook.OnShutdownCompleted(ctx)
		case "OnShutdownFailed":
			err = hook.OnShutdownFailed(ctx, cause)
		default:
			// Surface typos in phase names instead of silently doing nothing
			err = fmt.Errorf("unknown hook method %q", hookMethod)
		}

		if err != nil {
			recordContext := fmt.Sprintf("%s hook failure (%s)", hookMethod, hookName)
			sm.recordShutdownError(recordContext, err, "hook", hookName, "phase", hookMethod)
			hookErrors = append(hookErrors, fmt.Errorf("%s: %w", recordContext, err))
		}
	}

	if len(hookErrors) > 0 {
		return errors.Join(hookErrors...)
	}

	return nil
}

func (sm *ShutdownManager) sortTasksByPriority() {
	// Simple in-place selection-style sort by priority (descending)
	for i := 0; i < len(sm.shutdownTasks); i++ {
		for j := i + 1; j < len(sm.shutdownTasks); j++ {
			if sm.shutdownTasks[j].Priority > sm.shutdownTasks[i].Priority {
				sm.shutdownTasks[i], sm.shutdownTasks[j] = sm.shutdownTasks[j], sm.shutdownTasks[i]
			}
		}
	}
}

func (sm *ShutdownManager) groupTasksByPriority() [][]ShutdownTask {
	groups := make(map[int][]ShutdownTask)

	for _, task := range sm.shutdownTasks {
		groups[task.Priority] = append(groups[task.Priority], task)
	}

	// Collect the distinct priorities
	var priorities []int
	for priority := range groups {
		priorities = append(priorities, priority)
	}

	// Sort priorities descending
	for i := 0; i < len(priorities); i++ {
		for j := i + 1; j < len(priorities); j++ {
			if priorities[j] > priorities[i] {
				priorities[i], priorities[j] = priorities[j], priorities[i]
			}
		}
	}

	var result [][]ShutdownTask
	for _, priority := range priorities {
		result = append(result, groups[priority])
	}

	return result
}

// Default task implementations

func (sm *ShutdownManager) saveApplicationState(ctx context.Context) error {
	// Save application state to disk: current configuration, runtime state, etc.
	return nil
}

func (sm *ShutdownManager) closeExternalConnections(ctx context.Context) error {
	// Close database connections, external API connections, etc.
	return nil
}

func (sm *ShutdownManager) cleanupTempFiles(ctx context.Context) error {
	// Remove temporary files, logs, caches, etc.
	return nil
}

func (sm *ShutdownManager) notifyExternalSystems(ctx context.Context) error {
	// Notify external systems that this instance is shutting down
	return nil
}

func (sm *ShutdownManager) recordShutdownError(message string, err error, attrs ...interface{}) {
	if err == nil {
		return
	}

	attrCopy := append([]interface{}{}, attrs...)
	wrapped, txHash, attrsWithTx := enrichErrorWithTxHash(message, err, attrCopy)

	sm.errMu.Lock()
	sm.shutdownErrors = append(sm.shutdownErrors, wrapped)
	sm.shutdownErrorDetails = append(sm.shutdownErrorDetails, RecordedError{
		Err:    wrapped,
		TxHash: txHash,
	})
	sm.errMu.Unlock()

	if sm.logger != nil {
		kv := append([]interface{}{}, attrsWithTx...)
		kv = append(kv, "error", err)
		args := append([]interface{}{message}, kv...)
		sm.logger.Error(args...)
	}
}

func (sm *ShutdownManager) combinedShutdownError() error {
	sm.errMu.Lock()
	defer sm.errMu.Unlock()

	if len(sm.shutdownErrors) == 0 {
		return nil
	}

	errs := make([]error, len(sm.shutdownErrors))
	copy(errs, sm.shutdownErrors)
	return errors.Join(errs...)
}

// ShutdownErrors returns a copy of recorded shutdown errors for diagnostics.
func (sm *ShutdownManager) ShutdownErrors() []error {
	sm.errMu.Lock()
	defer sm.errMu.Unlock()

	if len(sm.shutdownErrors) == 0 {
		return nil
	}

	errs := make([]error, len(sm.shutdownErrors))
	copy(errs, sm.shutdownErrors)
	return errs
}

// ShutdownErrorDetails returns recorded errors with associated metadata such as tx hash.
func (sm *ShutdownManager) ShutdownErrorDetails() []RecordedError {
	sm.errMu.Lock()
	defer sm.errMu.Unlock()

	if len(sm.shutdownErrorDetails) == 0 {
		return nil
	}

	details := make([]RecordedError, len(sm.shutdownErrorDetails))
	copy(details, sm.shutdownErrorDetails)
	return details
}

// DefaultShutdownHook provides a basic no-op implementation of ShutdownHook
// that custom hooks can embed.
type DefaultShutdownHook struct {
	name string
}

// NewDefaultShutdownHook creates a named no-op shutdown hook.
func NewDefaultShutdownHook(name string) *DefaultShutdownHook {
	return &DefaultShutdownHook{name: name}
}

// triggerEmergencyShutdown performs emergency shutdown procedures when
// critical failures occur. It acquires sm.mu to set the failed state, so it
// must not be called while the lock is already held; lock-holding callers use
// emergencyShutdownActions directly.
func (sm *ShutdownManager) triggerEmergencyShutdown(reason string, err error) {
	// Set emergency state
	sm.mu.Lock()
	sm.state = ShutdownStateFailed
	sm.mu.Unlock()

	sm.emergencyShutdownActions(reason, err)
}

// emergencyShutdownActions runs the emergency procedures without touching
// sm.mu, making it safe to call from code paths that already hold the lock.
func (sm *ShutdownManager) emergencyShutdownActions(reason string, err error) {
	sm.logger.Error("EMERGENCY SHUTDOWN TRIGGERED",
		"reason", reason,
		"error", err,
		"state", ShutdownStateFailed,
		"timestamp", time.Now())

	// Attempt to signal all processes to terminate immediately.
	// This is a last-resort mechanism.
	if sm.emergencyHandler != nil {
		go func() {
			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
			defer cancel()

			if handlerErr := sm.emergencyHandler(ctx, reason, err); handlerErr != nil {
				sm.logger.Error("Emergency handler failed", "error", handlerErr)
			}
		}()
	}

	// Log to all available outputs
	sm.recordShutdownError("EMERGENCY_SHUTDOWN", fmt.Errorf("%s: %w", reason, err))

	// Attempt to notify monitoring systems if available. The hook interface
	// defines no dedicated emergency method, so failures are reported through
	// the OnShutdownFailed stage.
	if len(sm.shutdownHooks) > 0 {
		go func() {
			ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
			defer cancel()

			if hookErr := sm.callHooks(ctx, "OnShutdownFailed", fmt.Errorf("%s: %w", reason, err)); hookErr != nil {
				sm.logger.Warn("Failed to call emergency shutdown hooks",
					"error", hookErr,
					"reason", reason)
			}
		}()
	}
}

func (dsh *DefaultShutdownHook) OnShutdownStarted(ctx context.Context) error {
	return nil
}

func (dsh *DefaultShutdownHook) OnModulesStopped(ctx context.Context) error {
	return nil
}

func (dsh *DefaultShutdownHook) OnCleanupStarted(ctx context.Context) error {
	return nil
}

func (dsh *DefaultShutdownHook) OnShutdownCompleted(ctx context.Context) error {
	return nil
}

func (dsh *DefaultShutdownHook) OnShutdownFailed(ctx context.Context, err error) error {
	return nil
}
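
For orientation, a minimal wiring sketch of this API (illustrative only; `registry` stands for an already-constructed ModuleRegistry, and the timeout value is an example):

	func runWithGracefulShutdown(registry *ModuleRegistry) error {
		sm := NewShutdownManager(registry, ShutdownConfig{
			SaveState:        true,
			CleanupTempFiles: true,
		})
		if err := sm.Start(); err != nil { // begins watching SIGINT/SIGTERM/SIGQUIT/SIGHUP
			return err
		}
		sm.AddShutdownHook(NewDefaultShutdownHook("base"))

		// ... run the application ...

		// Block until a signal-driven (or explicit) shutdown completes,
		// giving up after 90 seconds.
		return sm.WaitWithTimeout(90 * time.Second)
	}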