package lifecycle

import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/signal"
	"sort"
	"sync"
	"syscall"
	"time"

	"github.com/fraktal/mev-beta/internal/logger"
)

// ShutdownManager handles graceful shutdown of the application.
//
// state, shutdownStarted, shutdownTasks and shutdownHooks are guarded by mu;
// recorded errors are guarded by errMu. The methods in this file do not hold
// mu while hooks, tasks or module stops run, so those callbacks may call back
// into the manager.
type ShutdownManager struct {
	registry      *ModuleRegistry
	shutdownTasks []ShutdownTask
	shutdownHooks []ShutdownHook
	config        ShutdownConfig

	signalChannel   chan os.Signal
	shutdownChannel chan struct{}
	closeOnce       sync.Once // ensures shutdownChannel is closed exactly once

	state           ShutdownState
	startTime       time.Time
	shutdownStarted time.Time

	mu     sync.RWMutex
	wg     sync.WaitGroup
	ctx    context.Context
	cancel context.CancelFunc
	logger *logger.Logger

	shutdownErrors       []error
	shutdownErrorDetails []RecordedError
	errMu                sync.Mutex

	exitFunc         func(code int)
	emergencyHandler func(ctx context.Context, reason string, err error) error
}

// ShutdownTask represents a task to be executed during shutdown.
type ShutdownTask struct {
	// Name identifies the task in logs and error messages.
	Name string
	// Priority orders execution; higher values run first.
	Priority int
	// Timeout bounds all attempts of the task. A value <= 0 means the task is
	// limited only by the deadline of the surrounding shutdown context.
	Timeout time.Duration
	// Task is the work to perform.
	Task func(ctx context.Context) error
	// OnError, if set, is called after every failed attempt.
	OnError func(error)
	// Critical marks a task whose failure aborts sequential execution and is
	// reported in preference to non-critical failures in parallel execution.
	Critical bool
	// Enabled must be true for the task to run.
	Enabled bool
}

// ShutdownHook is called at different stages of shutdown.
type ShutdownHook interface {
	OnShutdownStarted(ctx context.Context) error
	OnModulesStopped(ctx context.Context) error
	OnCleanupStarted(ctx context.Context) error
	OnShutdownCompleted(ctx context.Context) error
	OnShutdownFailed(ctx context.Context, err error) error
}

// EmergencyShutdownHook is an optional extension of ShutdownHook. Registered
// hooks that also implement it are invoked when callHooks is asked to dispatch
// "OnEmergencyShutdown"; hooks that do not implement it are skipped.
type EmergencyShutdownHook interface {
	OnEmergencyShutdown(ctx context.Context, err error) error
}

// ShutdownConfig configures shutdown behavior.
type ShutdownConfig struct {
	GracefulTimeout    time.Duration `json:"graceful_timeout"`
	ForceTimeout       time.Duration `json:"force_timeout"`
	SignalBufferSize   int           `json:"signal_buffer_size"`
	MaxRetries         int           `json:"max_retries"`
	RetryDelay         time.Duration `json:"retry_delay"`
	ParallelShutdown   bool          `json:"parallel_shutdown"`
	SaveState          bool          `json:"save_state"`
	CleanupTempFiles   bool          `json:"cleanup_temp_files"`
	NotifyExternal     bool          `json:"notify_external"`
	WaitForConnections bool          `json:"wait_for_connections"`
	EnableMetrics      bool          `json:"enable_metrics"`
}

// ShutdownState represents the current shutdown state.
type ShutdownState string

// Shutdown states, in the order a graceful shutdown moves through them, plus
// the failed and forced terminal states.
const (
	ShutdownStateRunning    ShutdownState = "running"
	ShutdownStateInitiated  ShutdownState = "initiated"
	ShutdownStateModuleStop ShutdownState = "stopping_modules"
	ShutdownStateCleanup    ShutdownState = "cleanup"
	ShutdownStateCompleted  ShutdownState = "completed"
	ShutdownStateFailed     ShutdownState = "failed"
	ShutdownStateForced     ShutdownState = "forced"
)

// ShutdownMetrics tracks shutdown performance.
type ShutdownMetrics struct {
	ShutdownInitiated time.Time     `json:"shutdown_initiated"`
	ModuleStopTime    time.Duration `json:"module_stop_time"`
	CleanupTime       time.Duration `json:"cleanup_time"`
	TotalShutdownTime time.Duration `json:"total_shutdown_time"`
	TasksExecuted     int           `json:"tasks_executed"`
	TasksSuccessful   int           `json:"tasks_successful"`
	TasksFailed       int           `json:"tasks_failed"`
	RetryAttempts     int           `json:"retry_attempts"`
	ForceShutdown     bool          `json:"force_shutdown"`
	Signal            string        `json:"signal"`
}

// ShutdownProgress tracks shutdown progress.
type ShutdownProgress struct {
	State              ShutdownState `json:"state"`
	Progress           float64       `json:"progress"`
	CurrentTask        string        `json:"current_task"`
	CompletedTasks     int           `json:"completed_tasks"`
	TotalTasks         int           `json:"total_tasks"`
	ElapsedTime        time.Duration `json:"elapsed_time"`
	EstimatedRemaining time.Duration `json:"estimated_remaining"`
	Message            string        `json:"message"`
}

// NewShutdownManager creates a new shutdown manager.
//
// Zero-valued timeouts, retry settings and signal buffer size are replaced by
// defaults (30s graceful, 60s force, 3 retries, 1s retry delay, buffer of 10).
// The constructor also ensures the "logs" directory exists, opens the
// lifecycle log, registers the default shutdown tasks and subscribes to
// SIGINT, SIGTERM, SIGQUIT and SIGHUP. Signals are only acted upon after Start.
func NewShutdownManager(registry *ModuleRegistry, config ShutdownConfig) *ShutdownManager {
	// Defaults must be applied before the signal channel is allocated:
	// signal.Notify does not block when sending, so an unbuffered channel
	// (SignalBufferSize == 0) could silently drop signals.
	if config.GracefulTimeout == 0 {
		config.GracefulTimeout = 30 * time.Second
	}
	if config.ForceTimeout == 0 {
		config.ForceTimeout = 60 * time.Second
	}
	if config.SignalBufferSize <= 0 {
		config.SignalBufferSize = 10
	}
	if config.MaxRetries == 0 {
		config.MaxRetries = 3
	}
	if config.RetryDelay == 0 {
		config.RetryDelay = time.Second
	}

	ctx, cancel := context.WithCancel(context.Background())
	sm := &ShutdownManager{
		registry:             registry,
		shutdownTasks:        make([]ShutdownTask, 0),
		shutdownHooks:        make([]ShutdownHook, 0),
		config:               config,
		signalChannel:        make(chan os.Signal, config.SignalBufferSize),
		shutdownChannel:      make(chan struct{}),
		state:                ShutdownStateRunning,
		startTime:            time.Now(),
		ctx:                  ctx,
		cancel:               cancel,
		shutdownErrors:       make([]error, 0),
		shutdownErrorDetails: make([]RecordedError, 0),
		exitFunc:             os.Exit,
	}

	if err := os.MkdirAll("logs", 0o755); err != nil {
		fmt.Printf("failed to ensure logs directory: %v\n", err)
	}
	sm.logger = logger.New("info", "", "logs/lifecycle_shutdown.log")

	sm.setupDefaultTasks()
	sm.setupSignalHandling()

	return sm
}

// Start starts the shutdown manager by launching the signal-handling
// goroutine. It returns an error if shutdown has already begun.
func (sm *ShutdownManager) Start() error {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	if sm.state != ShutdownStateRunning {
		return fmt.Errorf("shutdown manager not in running state: %s", sm.state)
	}

	go sm.signalHandler()

	return nil
}

// Shutdown initiates graceful shutdown and blocks until it has finished.
//
// It returns an error if shutdown was already initiated. If the shutdown
// itself fails, the returned error joins every error recorded so far. A
// shutdown that succeeds (for example after a task retry, or with only a
// completion-hook failure) returns nil; the recorded details stay available
// through ShutdownErrors.
func (sm *ShutdownManager) Shutdown(ctx context.Context) error {
	sm.mu.Lock()
	if sm.state != ShutdownStateRunning {
		state := sm.state
		sm.mu.Unlock()
		return fmt.Errorf("shutdown already initiated: %s", state)
	}
	sm.state = ShutdownStateInitiated
	sm.shutdownStarted = time.Now()
	// Register the in-flight shutdown before waking waiters, otherwise Wait
	// could observe the closed channel and pass wg.Wait before the work is
	// accounted for.
	sm.wg.Add(1)
	// mu is released before any hook, task or module runs: sync.RWMutex is not
	// reentrant, and triggerEmergencyShutdown, GetState and AddShutdownTask all
	// acquire it.
	sm.mu.Unlock()
	defer sm.wg.Done()

	// Close shutdown channel to signal shutdown.
	sm.closeOnce.Do(func() { close(sm.shutdownChannel) })

	err := sm.performShutdown(ctx)
	if err == nil {
		return nil
	}
	if combined := sm.combinedShutdownError(); combined != nil {
		return combined
	}
	return err
}

// ForceShutdown forces immediate shutdown: it marks the manager as forced,
// cancels the manager context, stops all modules with a fixed 5s budget and
// then calls the exit function with code 1.
//
// The ctx argument is currently not consulted. The returned error is only
// observable when the exit function returns (for example in tests).
func (sm *ShutdownManager) ForceShutdown(ctx context.Context) error {
	sm.mu.Lock()
	sm.state = ShutdownStateForced
	if sm.shutdownStarted.IsZero() {
		sm.shutdownStarted = time.Now()
	}
	sm.mu.Unlock()

	sm.cancel() // Cancel all operations.
	// Release goroutines blocked in Wait even if Shutdown never ran.
	sm.closeOnce.Do(func() { close(sm.shutdownChannel) })

	// Force stop all modules immediately.
	var forceErr error
	if sm.registry != nil {
		forceCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()

		if err := sm.registry.StopAll(forceCtx); err != nil {
			wrapped := fmt.Errorf("StopAll failed during force shutdown: %w", err)
			sm.recordShutdownError("StopAll failed in force shutdown", wrapped)
			forceErr = errors.Join(forceErr, wrapped)
		}
	}

	if forceErr != nil {
		sm.recordShutdownError("Force shutdown encountered errors", forceErr)
	}

	sm.exitFunc(1)
	return forceErr
}

// AddShutdownTask adds a task to be executed during shutdown and keeps the
// task list ordered by descending priority.
func (sm *ShutdownManager) AddShutdownTask(task ShutdownTask) {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	sm.shutdownTasks = append(sm.shutdownTasks, task)
	sm.sortTasksByPriority()
}

// AddShutdownHook adds a hook to be called during shutdown phases.
func (sm *ShutdownManager) AddShutdownHook(hook ShutdownHook) {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	sm.shutdownHooks = append(sm.shutdownHooks, hook)
}

// GetState returns the current shutdown state.
func (sm *ShutdownManager) GetState() ShutdownState {
	sm.mu.RLock()
	defer sm.mu.RUnlock()

	return sm.state
}

// GetProgress returns the current shutdown progress.
//
// Progress and CompletedTasks are coarse estimates derived from the current
// state, not from per-task accounting. ElapsedTime is zero until a shutdown
// has started.
func (sm *ShutdownManager) GetProgress() ShutdownProgress {
	sm.mu.RLock()
	defer sm.mu.RUnlock()

	totalTasks := len(sm.shutdownTasks)
	if sm.registry != nil {
		totalTasks += len(sm.registry.List())
	}

	var (
		progress       float64
		completedTasks int
		currentTask    string
	)
	switch sm.state {
	case ShutdownStateRunning:
		progress = 0
		currentTask = "Running"
	case ShutdownStateInitiated:
		progress = 0.1
		currentTask = "Shutdown initiated"
	case ShutdownStateModuleStop:
		progress = 0.3
		currentTask = "Stopping modules"
		completedTasks = totalTasks / 3
	case ShutdownStateCleanup:
		progress = 0.7
		currentTask = "Cleanup"
		completedTasks = (totalTasks * 2) / 3
	case ShutdownStateCompleted:
		progress = 1.0
		currentTask = "Completed"
		completedTasks = totalTasks
	case ShutdownStateFailed:
		progress = 0.8
		currentTask = "Failed"
	case ShutdownStateForced:
		progress = 1.0
		currentTask = "Forced shutdown"
		completedTasks = totalTasks
	}

	// shutdownStarted is the zero time until a shutdown begins; time.Since on
	// it would report a meaningless, saturated duration.
	var elapsedTime time.Duration
	if !sm.shutdownStarted.IsZero() {
		elapsedTime = time.Since(sm.shutdownStarted)
	}

	var estimatedRemaining time.Duration
	if progress > 0 && progress < 1.0 {
		totalEstimated := time.Duration(float64(elapsedTime) / progress)
		estimatedRemaining = totalEstimated - elapsedTime
	}

	return ShutdownProgress{
		State:              sm.state,
		Progress:           progress,
		CurrentTask:        currentTask,
		CompletedTasks:     completedTasks,
		TotalTasks:         totalTasks,
		ElapsedTime:        elapsedTime,
		EstimatedRemaining: estimatedRemaining,
		Message:            fmt.Sprintf("Shutdown %s", sm.state),
	}
}

// Wait blocks until a shutdown has been initiated and any in-flight graceful
// shutdown has finished.
func (sm *ShutdownManager) Wait() {
	<-sm.shutdownChannel
	sm.wg.Wait()
}

// WaitWithTimeout waits for shutdown like Wait, but gives up after timeout
// and returns an error.
func (sm *ShutdownManager) WaitWithTimeout(timeout time.Duration) error {
	done := make(chan struct{})
	go func() {
		sm.Wait()
		close(done)
	}()

	timer := time.NewTimer(timeout)
	defer timer.Stop()

	select {
	case <-done:
		return nil
	case <-timer.C:
		return fmt.Errorf("shutdown timeout after %v", timeout)
	}
}

// Private methods

// setupSignalHandling subscribes the signal channel to the termination and
// reload signals handled by signalHandler.
func (sm *ShutdownManager) setupSignalHandling() {
	signal.Notify(sm.signalChannel,
		syscall.SIGINT,
		syscall.SIGTERM,
		syscall.SIGQUIT,
		syscall.SIGHUP,
	)
}

// setupDefaultTasks registers the built-in shutdown tasks. Each task is
// enabled by its corresponding configuration flag.
func (sm *ShutdownManager) setupDefaultTasks() {
	sm.shutdownTasks = append(sm.shutdownTasks,
		// Task: Save application state
		ShutdownTask{
			Name:     "save_state",
			Priority: 100,
			Timeout:  10 * time.Second,
			Task: func(ctx context.Context) error {
				if sm.config.SaveState {
					return sm.saveApplicationState(ctx)
				}
				return nil
			},
			Critical: false,
			Enabled:  sm.config.SaveState,
		},
		// Task: Close external connections
		ShutdownTask{
			Name:     "close_connections",
			Priority: 90,
			Timeout:  5 * time.Second,
			Task: func(ctx context.Context) error {
				return sm.closeExternalConnections(ctx)
			},
			Critical: false,
			Enabled:  sm.config.WaitForConnections,
		},
		// Task: Cleanup temporary files
		ShutdownTask{
			Name:     "cleanup_temp_files",
			Priority: 10,
			Timeout:  5 * time.Second,
			Task: func(ctx context.Context) error {
				if sm.config.CleanupTempFiles {
					return sm.cleanupTempFiles(ctx)
				}
				return nil
			},
			Critical: false,
			Enabled:  sm.config.CleanupTempFiles,
		},
		// Task: Notify external systems
		ShutdownTask{
			Name:     "notify_external",
			Priority: 80,
			Timeout:  3 * time.Second,
			Task: func(ctx context.Context) error {
				if sm.config.NotifyExternal {
					return sm.notifyExternalSystems(ctx)
				}
				return nil
			},
			Critical: false,
			Enabled:  sm.config.NotifyExternal,
		},
	)

	// Sequential execution walks the slice in order, so the defaults must be
	// sorted even when AddShutdownTask is never called.
	sm.sortTasksByPriority()
}

// signalHandler reacts to the first terminating signal: SIGINT/SIGTERM start
// a graceful shutdown (escalating to a forced one on failure), SIGQUIT forces
// shutdown immediately, SIGHUP is ignored. It exits after handling a
// terminating signal or when the manager context is cancelled.
func (sm *ShutdownManager) signalHandler() {
	for {
		select {
		case sig := <-sm.signalChannel:
			switch sig {
			case syscall.SIGINT, syscall.SIGTERM:
				sm.handleGracefulSignal()
				return
			case syscall.SIGQUIT:
				sm.handleForceSignal()
				return
			case syscall.SIGHUP:
				// Reload signal - could be used for configuration reload.
				// For now it is ignored.
			}
		case <-sm.ctx.Done():
			return
		}
	}
}

// handleGracefulSignal runs a graceful shutdown and falls back to a forced
// shutdown, then to the emergency procedure, when each step fails.
func (sm *ShutdownManager) handleGracefulSignal() {
	ctx, cancel := context.WithTimeout(context.Background(), sm.config.GracefulTimeout)
	defer cancel()

	err := sm.Shutdown(ctx)
	if err == nil {
		return
	}
	sm.recordShutdownError("Graceful shutdown failed from signal", err)
	cancel()

	// Force shutdown if graceful fails.
	forceCtx, forceCancel := context.WithTimeout(context.Background(), sm.config.ForceTimeout)
	defer forceCancel()
	if err := sm.ForceShutdown(forceCtx); err != nil {
		sm.recordShutdownError("Force shutdown error in timeout scenario", err)
		// Escalate force shutdown failure to emergency protocols.
		sm.triggerEmergencyShutdown("Force shutdown failed after graceful timeout", err)
	}
}

// handleForceSignal runs a forced shutdown and escalates to the emergency
// procedure when it fails.
func (sm *ShutdownManager) handleForceSignal() {
	ctx, cancel := context.WithTimeout(context.Background(), sm.config.ForceTimeout)
	defer cancel()

	if err := sm.ForceShutdown(ctx); err != nil {
		sm.recordShutdownError("Force shutdown error in SIGQUIT handler", err)
		// Escalate force shutdown failure to emergency protocols.
		sm.triggerEmergencyShutdown("Force shutdown failed on SIGQUIT", err)
	}
}

// setState updates the shutdown state under mu. A forced shutdown is
// terminal: a graceful shutdown that is still unwinding must not overwrite it.
func (sm *ShutdownManager) setState(state ShutdownState) {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	if sm.state == ShutdownStateForced {
		return
	}
	sm.state = state
}

// tasksSnapshot returns a copy of the registered tasks so they can be
// executed without holding mu.
func (sm *ShutdownManager) tasksSnapshot() []ShutdownTask {
	sm.mu.RLock()
	defer sm.mu.RUnlock()

	return append([]ShutdownTask(nil), sm.shutdownTasks...)
}

// hooksSnapshot returns a copy of the registered hooks so they can be invoked
// without holding mu.
func (sm *ShutdownManager) hooksSnapshot() []ShutdownHook {
	sm.mu.RLock()
	defer sm.mu.RUnlock()

	return append([]ShutdownHook(nil), sm.shutdownHooks...)
}

// performShutdown runs the graceful shutdown phases (start hooks, module
// stop, cleanup hooks and tasks, final hooks) under a GracefulTimeout
// deadline. Every phase runs even if an earlier one failed; the joined phase
// errors are returned. It must be called without mu held. In-flight
// accounting on wg is done by Shutdown.
func (sm *ShutdownManager) performShutdown(ctx context.Context) error {
	// Create timeout context for entire shutdown.
	shutdownCtx, cancel := context.WithTimeout(ctx, sm.config.GracefulTimeout)
	defer cancel()

	var phaseErrors []error
	notePhaseError := func(recordMessage string, wrapped error) {
		sm.recordShutdownError(recordMessage, wrapped)
		phaseErrors = append(phaseErrors, wrapped)
	}

	// Phase 1: Call shutdown started hooks.
	if err := sm.callHooks(shutdownCtx, "OnShutdownStarted", nil); err != nil {
		notePhaseError("Shutdown started hook failure", fmt.Errorf("shutdown start hooks failed: %w", err))
	}

	// Phase 2: Stop modules.
	sm.setState(ShutdownStateModuleStop)
	if sm.registry != nil {
		if err := sm.registry.StopAll(shutdownCtx); err != nil {
			notePhaseError("Module stop failure", fmt.Errorf("failed to stop modules: %w", err))
		}
	}
	if err := sm.callHooks(shutdownCtx, "OnModulesStopped", nil); err != nil {
		notePhaseError("Modules stopped hook failure", fmt.Errorf("modules stopped hooks failed: %w", err))
	}

	// Phase 3: Execute shutdown tasks.
	sm.setState(ShutdownStateCleanup)
	if err := sm.callHooks(shutdownCtx, "OnCleanupStarted", nil); err != nil {
		notePhaseError("Cleanup hook failure", fmt.Errorf("cleanup hooks failed: %w", err))
	}
	if err := sm.executeShutdownTasks(shutdownCtx); err != nil {
		notePhaseError("Shutdown task execution failure", fmt.Errorf("shutdown tasks failed: %w", err))
	}

	// Phase 4: Final cleanup.
	if len(phaseErrors) > 0 {
		finalErr := errors.Join(phaseErrors...)
		sm.setState(ShutdownStateFailed)

		if err := sm.callHooks(shutdownCtx, "OnShutdownFailed", finalErr); err != nil {
			wrapped := fmt.Errorf("shutdown failed hook error: %w", err)
			sm.recordShutdownError("Shutdown failed hook error", wrapped)
			finalErr = errors.Join(finalErr, wrapped)
			// Escalate hook failure during shutdown failed state.
			sm.triggerEmergencyShutdown("Shutdown failed hook error", wrapped)
		}
		return finalErr
	}

	sm.setState(ShutdownStateCompleted)
	if err := sm.callHooks(shutdownCtx, "OnShutdownCompleted", nil); err != nil {
		wrapped := fmt.Errorf("shutdown completed hook error: %w", err)
		sm.recordShutdownError("Shutdown completed hook error", wrapped)
		// Completion hooks are non-critical notifications: log the failure but
		// do not fail a shutdown that otherwise succeeded.
		sm.logger.Warn("Shutdown completed hook failed", "error", wrapped)
	}

	return nil
}

// executeShutdownTasks runs the registered tasks in parallel or sequential
// mode according to the configuration.
func (sm *ShutdownManager) executeShutdownTasks(ctx context.Context) error {
	if sm.config.ParallelShutdown {
		return sm.executeTasksParallel(ctx)
	}
	return sm.executeTasksSequential(ctx)
}

// executeTasksSequential runs enabled tasks one after another in list order.
// A failing critical task aborts the run immediately; otherwise the error of
// the last failing task is returned.
func (sm *ShutdownManager) executeTasksSequential(ctx context.Context) error {
	var lastErr error

	for _, task := range sm.tasksSnapshot() {
		if !task.Enabled {
			continue
		}

		if err := sm.executeTask(ctx, task); err != nil {
			lastErr = err
			if task.Critical {
				return fmt.Errorf("critical task %s failed: %w", task.Name, err)
			}
		}
	}

	return lastErr
}

// executeTasksParallel runs priority groups one after another (highest
// first), with the enabled tasks inside each group running concurrently. It
// returns an error from a critical task if any failed, otherwise the last
// error collected.
func (sm *ShutdownManager) executeTasksParallel(ctx context.Context) error {
	// taskFailure carries the criticality alongside the error so it does not
	// have to be recovered by matching error-message prefixes.
	type taskFailure struct {
		err      error
		critical bool
	}

	priorityGroups := sm.groupTasksByPriority()

	total := 0
	for _, group := range priorityGroups {
		total += len(group)
	}
	// Buffered for every task so workers never block on send.
	failures := make(chan taskFailure, total)

	// Execute each priority group sequentially, but tasks within a group in
	// parallel.
	for _, group := range priorityGroups {
		var wg sync.WaitGroup
		for _, task := range group {
			if !task.Enabled {
				continue
			}

			wg.Add(1)
			go func(t ShutdownTask) {
				defer wg.Done()
				if err := sm.executeTask(ctx, t); err != nil {
					failures <- taskFailure{
						err:      fmt.Errorf("task %s failed: %w", t.Name, err),
						critical: t.Critical,
					}
				}
			}(task)
		}
		wg.Wait()
	}
	close(failures)

	var criticalErr, lastErr error
	for failure := range failures {
		lastErr = failure.err
		if failure.critical {
			criticalErr = failure.err
		}
	}

	if criticalErr != nil {
		return criticalErr
	}
	return lastErr
}

// executeTask runs a single task, retrying up to MaxRetries additional times
// with RetryDelay between attempts. Every failed attempt is recorded and
// passed to the task's OnError callback. All attempts share one deadline
// derived from task.Timeout; a Timeout <= 0 leaves only the parent deadline.
func (sm *ShutdownManager) executeTask(ctx context.Context, task ShutdownTask) error {
	if task.Task == nil {
		return fmt.Errorf("shutdown task %s has no task function", task.Name)
	}

	// A non-positive timeout must not be handed to context.WithTimeout: the
	// resulting context is already expired and the task could never succeed.
	taskCtx := ctx
	if task.Timeout > 0 {
		var cancel context.CancelFunc
		taskCtx, cancel = context.WithTimeout(ctx, task.Timeout)
		defer cancel()
	}

	// One initial attempt plus the configured retries, and always at least one.
	attempts := sm.config.MaxRetries + 1
	if attempts < 1 {
		attempts = 1
	}

	var lastErr error
	for attempt := 1; attempt <= attempts; attempt++ {
		if attempt > 1 {
			if err := waitShutdownRetry(taskCtx, sm.config.RetryDelay); err != nil {
				return err
			}
		}

		err := task.Task(taskCtx)
		if err == nil {
			return nil
		}

		lastErr = err
		sm.recordShutdownError(
			fmt.Sprintf("Shutdown task %s failed", task.Name),
			fmt.Errorf("attempt %d: %w", attempt, err),
			"task", task.Name,
			"attempt", attempt,
		)

		// Call error handler if provided.
		if task.OnError != nil {
			task.OnError(err)
		}
	}

	return fmt.Errorf("task failed after %d attempts: %w", attempts, lastErr)
}

// waitShutdownRetry blocks for delay or until ctx is done, whichever comes
// first, returning ctx.Err() in the latter case.
func waitShutdownRetry(ctx context.Context, delay time.Duration) error {
	timer := time.NewTimer(delay)
	defer timer.Stop()

	select {
	case <-timer.C:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// hookInvoker maps a hook method name to a function that invokes it on a
// hook. It returns nil for unknown names. "OnEmergencyShutdown" is dispatched
// only to hooks implementing EmergencyShutdownHook.
func hookInvoker(ctx context.Context, hookMethod string, cause error) func(ShutdownHook) error {
	switch hookMethod {
	case "OnShutdownStarted":
		return func(h ShutdownHook) error { return h.OnShutdownStarted(ctx) }
	case "OnModulesStopped":
		return func(h ShutdownHook) error { return h.OnModulesStopped(ctx) }
	case "OnCleanupStarted":
		return func(h ShutdownHook) error { return h.OnCleanupStarted(ctx) }
	case "OnShutdownCompleted":
		return func(h ShutdownHook) error { return h.OnShutdownCompleted(ctx) }
	case "OnShutdownFailed":
		return func(h ShutdownHook) error { return h.OnShutdownFailed(ctx, cause) }
	case "OnEmergencyShutdown":
		return func(h ShutdownHook) error {
			if eh, ok := h.(EmergencyShutdownHook); ok {
				return eh.OnEmergencyShutdown(ctx, cause)
			}
			return nil
		}
	}
	return nil
}

// callHooks invokes the named method on every registered hook, in
// registration order. All hooks are called even if some fail; failures are
// recorded and returned joined. cause is forwarded to the methods that accept
// an error. An unknown hookMethod is reported as an error instead of being
// silently ignored. Must be called without mu held.
func (sm *ShutdownManager) callHooks(ctx context.Context, hookMethod string, cause error) error {
	invoke := hookInvoker(ctx, hookMethod, cause)
	if invoke == nil {
		return fmt.Errorf("unknown shutdown hook method %q", hookMethod)
	}

	var hookErrors []error
	for _, hook := range sm.hooksSnapshot() {
		if hook == nil {
			continue
		}

		if err := invoke(hook); err != nil {
			hookName := fmt.Sprintf("%T", hook)
			recordContext := fmt.Sprintf("%s hook failure (%s)", hookMethod, hookName)
			sm.recordShutdownError(recordContext, err, "hook", hookName, "phase", hookMethod)
			hookErrors = append(hookErrors, fmt.Errorf("%s: %w", recordContext, err))
		}
	}

	if len(hookErrors) > 0 {
		return errors.Join(hookErrors...)
	}
	return nil
}

// sortTasksByPriority orders shutdownTasks by descending priority. The sort
// is stable, so tasks with equal priority keep their registration order.
// The caller is responsible for synchronisation.
func (sm *ShutdownManager) sortTasksByPriority() {
	sort.SliceStable(sm.shutdownTasks, func(i, j int) bool {
		return sm.shutdownTasks[i].Priority > sm.shutdownTasks[j].Priority
	})
}

// groupTasksByPriority returns the registered tasks grouped by priority, with
// groups ordered from highest to lowest priority. Must be called without mu
// held.
func (sm *ShutdownManager) groupTasksByPriority() [][]ShutdownTask {
	groups := make(map[int][]ShutdownTask)
	for _, task := range sm.tasksSnapshot() {
		groups[task.Priority] = append(groups[task.Priority], task)
	}

	priorities := make([]int, 0, len(groups))
	for priority := range groups {
		priorities = append(priorities, priority)
	}
	sort.Sort(sort.Reverse(sort.IntSlice(priorities)))

	result := make([][]ShutdownTask, 0, len(priorities))
	for _, priority := range priorities {
		result = append(result, groups[priority])
	}
	return result
}

// Default task implementations

// saveApplicationState is the placeholder for persisting application state
// (configuration, runtime state, ...). It currently does nothing.
func (sm *ShutdownManager) saveApplicationState(ctx context.Context) error {
	return nil
}

// closeExternalConnections is the placeholder for closing database and
// external API connections. It currently does nothing.
func (sm *ShutdownManager) closeExternalConnections(ctx context.Context) error {
	return nil
}

// cleanupTempFiles is the placeholder for removing temporary files, logs and
// caches. It currently does nothing.
func (sm *ShutdownManager) cleanupTempFiles(ctx context.Context) error {
	return nil
}

// notifyExternalSystems is the placeholder for telling external systems that
// this instance is shutting down. It currently does nothing.
func (sm *ShutdownManager) notifyExternalSystems(ctx context.Context) error {
	return nil
}

// recordShutdownError stores err (enriched via enrichErrorWithTxHash) in the
// manager's error list and logs it together with the given key/value attrs.
// A nil err is ignored.
func (sm *ShutdownManager) recordShutdownError(message string, err error, attrs ...interface{}) {
	if err == nil {
		return
	}

	// Copy attrs so the helper cannot alias the caller's backing array.
	attrCopy := append([]interface{}{}, attrs...)
	wrapped, txHash, attrsWithTx := enrichErrorWithTxHash(message, err, attrCopy)

	sm.errMu.Lock()
	sm.shutdownErrors = append(sm.shutdownErrors, wrapped)
	sm.shutdownErrorDetails = append(sm.shutdownErrorDetails, RecordedError{
		Err:    wrapped,
		TxHash: txHash,
	})
	sm.errMu.Unlock()

	if sm.logger != nil {
		kv := append([]interface{}{}, attrsWithTx...)
		kv = append(kv, "error", err)
		args := append([]interface{}{message}, kv...)
		sm.logger.Error(args...)
	}
}

// combinedShutdownError joins all recorded shutdown errors into one error, or
// returns nil when none were recorded.
func (sm *ShutdownManager) combinedShutdownError() error {
	sm.errMu.Lock()
	defer sm.errMu.Unlock()

	if len(sm.shutdownErrors) == 0 {
		return nil
	}

	errs := make([]error, len(sm.shutdownErrors))
	copy(errs, sm.shutdownErrors)
	return errors.Join(errs...)
}

// ShutdownErrors returns a copy of recorded shutdown errors for diagnostics.
func (sm *ShutdownManager) ShutdownErrors() []error {
	sm.errMu.Lock()
	defer sm.errMu.Unlock()

	if len(sm.shutdownErrors) == 0 {
		return nil
	}

	errs := make([]error, len(sm.shutdownErrors))
	copy(errs, sm.shutdownErrors)
	return errs
}

// ShutdownErrorDetails returns recorded errors with associated metadata such as tx hash.
func (sm *ShutdownManager) ShutdownErrorDetails() []RecordedError { sm.errMu.Lock() defer sm.errMu.Unlock() if len(sm.shutdownErrorDetails) == 0 { return nil } details := make([]RecordedError, len(sm.shutdownErrorDetails)) copy(details, sm.shutdownErrorDetails) return details } // DefaultShutdownHook provides a basic implementation of ShutdownHook type DefaultShutdownHook struct { name string } func NewDefaultShutdownHook(name string) *DefaultShutdownHook { return &DefaultShutdownHook{name: name} } // triggerEmergencyShutdown performs emergency shutdown procedures when critical failures occur func (sm *ShutdownManager) triggerEmergencyShutdown(reason string, err error) { sm.logger.Error("EMERGENCY SHUTDOWN TRIGGERED", "reason", reason, "error", err, "state", sm.state, "timestamp", time.Now()) // Set emergency state sm.mu.Lock() sm.state = ShutdownStateFailed sm.mu.Unlock() // Attempt to signal all processes to terminate immediately // This is a last-resort mechanism if sm.emergencyHandler != nil { go func() { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() if err := sm.emergencyHandler(ctx, reason, err); err != nil { sm.logger.Error("Emergency handler failed", "error", err) } }() } // Log to all available outputs sm.recordShutdownError("EMERGENCY_SHUTDOWN", fmt.Errorf("%s: %w", reason, err)) // Attempt to notify monitoring systems if available if len(sm.shutdownHooks) > 0 { go func() { ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) defer cancel() // CRITICAL FIX: Log emergency shutdown notification failures if err := sm.callHooks(ctx, "OnEmergencyShutdown", fmt.Errorf("%s: %w", reason, err)); err != nil { sm.logger.Warn("Failed to call emergency shutdown hooks", "error", err, "reason", reason) } }() } } func (dsh *DefaultShutdownHook) OnShutdownStarted(ctx context.Context) error { return nil } func (dsh *DefaultShutdownHook) OnModulesStopped(ctx context.Context) error { return nil } func (dsh 
*DefaultShutdownHook) OnCleanupStarted(ctx context.Context) error { return nil } func (dsh *DefaultShutdownHook) OnShutdownCompleted(ctx context.Context) error { return nil } func (dsh *DefaultShutdownHook) OnShutdownFailed(ctx context.Context, err error) error { return nil }