feat: create v2-prep branch with comprehensive planning
Restructured project for V2 refactor: **Structure Changes:** - Moved all V1 code to orig/ folder (preserved with git mv) - Created docs/planning/ directory - Added orig/README_V1.md explaining V1 preservation **Planning Documents:** - 00_V2_MASTER_PLAN.md: Complete architecture overview - Executive summary of critical V1 issues - High-level component architecture diagrams - 5-phase implementation roadmap - Success metrics and risk mitigation - 07_TASK_BREAKDOWN.md: Atomic task breakdown - 99+ hours of detailed tasks - Every task < 2 hours (atomic) - Clear dependencies and success criteria - Organized by implementation phase **V2 Key Improvements:** - Per-exchange parsers (factory pattern) - Multi-layer strict validation - Multi-index pool cache - Background validation pipeline - Comprehensive observability **Critical Issues Addressed:** - Zero address tokens (strict validation + cache enrichment) - Parsing accuracy (protocol-specific parsers) - No audit trail (background validation channel) - Inefficient lookups (multi-index cache) - Stats disconnection (event-driven metrics) Next Steps: 1. Review planning documents 2. Begin Phase 1: Foundation (P1-001 through P1-010) 3. Implement parsers in Phase 2 4. Build cache system in Phase 3 5. Add validation pipeline in Phase 4 6. Migrate and test in Phase 5 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
591
orig/pkg/transport/dlq.go
Normal file
591
orig/pkg/transport/dlq.go
Normal file
@@ -0,0 +1,591 @@
|
||||
package transport
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// DeadLetterQueue handles failed messages with retry and reprocessing capabilities
|
||||
type DeadLetterQueue struct {
|
||||
messages map[string][]*DLQMessage
|
||||
config DLQConfig
|
||||
metrics DLQMetrics
|
||||
reprocessor MessageReprocessor
|
||||
mu sync.RWMutex
|
||||
cleanupTicker *time.Ticker
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
}
|
||||
|
||||
// DLQMessage represents a message in the dead letter queue
|
||||
type DLQMessage struct {
|
||||
ID string
|
||||
OriginalMessage *Message
|
||||
Topic string
|
||||
FirstFailed time.Time
|
||||
LastAttempt time.Time
|
||||
AttemptCount int
|
||||
MaxRetries int
|
||||
FailureReason string
|
||||
RetryDelay time.Duration
|
||||
NextRetry time.Time
|
||||
Metadata map[string]interface{}
|
||||
Permanent bool
|
||||
}
|
||||
|
||||
// DLQConfig configures dead letter queue behavior
|
||||
type DLQConfig struct {
|
||||
MaxMessages int
|
||||
MaxRetries int
|
||||
RetentionTime time.Duration
|
||||
AutoReprocess bool
|
||||
ReprocessInterval time.Duration
|
||||
BackoffStrategy BackoffStrategy
|
||||
InitialRetryDelay time.Duration
|
||||
MaxRetryDelay time.Duration
|
||||
BackoffMultiplier float64
|
||||
PermanentFailures []string // Error patterns that mark messages as permanently failed
|
||||
ReprocessBatchSize int
|
||||
}
|
||||
|
||||
// BackoffStrategy defines retry delay calculation methods
|
||||
type BackoffStrategy string
|
||||
|
||||
const (
|
||||
BackoffFixed BackoffStrategy = "fixed"
|
||||
BackoffLinear BackoffStrategy = "linear"
|
||||
BackoffExponential BackoffStrategy = "exponential"
|
||||
BackoffCustom BackoffStrategy = "custom"
|
||||
)
|
||||
|
||||
// DLQMetrics tracks dead letter queue statistics
|
||||
type DLQMetrics struct {
|
||||
MessagesAdded int64
|
||||
MessagesReprocessed int64
|
||||
MessagesExpired int64
|
||||
MessagesPermanent int64
|
||||
ReprocessSuccesses int64
|
||||
ReprocessFailures int64
|
||||
QueueSize int64
|
||||
OldestMessage time.Time
|
||||
}
|
||||
|
||||
// MessageReprocessor handles message reprocessing logic
|
||||
type MessageReprocessor interface {
|
||||
Reprocess(ctx context.Context, msg *DLQMessage) error
|
||||
CanReprocess(msg *DLQMessage) bool
|
||||
ShouldRetry(msg *DLQMessage, err error) bool
|
||||
}
|
||||
|
||||
// DefaultMessageReprocessor implements basic reprocessing logic
|
||||
type DefaultMessageReprocessor struct {
|
||||
publisher MessagePublisher
|
||||
}
|
||||
|
||||
// MessagePublisher interface for republishing messages
|
||||
type MessagePublisher interface {
|
||||
Publish(ctx context.Context, msg *Message) error
|
||||
}
|
||||
|
||||
// NewDeadLetterQueue creates a new dead letter queue
|
||||
func NewDeadLetterQueue(config DLQConfig) *DeadLetterQueue {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
dlq := &DeadLetterQueue{
|
||||
messages: make(map[string][]*DLQMessage),
|
||||
config: config,
|
||||
metrics: DLQMetrics{},
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
}
|
||||
|
||||
// Set default configuration values
|
||||
if dlq.config.MaxMessages == 0 {
|
||||
dlq.config.MaxMessages = 10000
|
||||
}
|
||||
if dlq.config.MaxRetries == 0 {
|
||||
dlq.config.MaxRetries = 3
|
||||
}
|
||||
if dlq.config.RetentionTime == 0 {
|
||||
dlq.config.RetentionTime = 24 * time.Hour
|
||||
}
|
||||
if dlq.config.ReprocessInterval == 0 {
|
||||
dlq.config.ReprocessInterval = 5 * time.Minute
|
||||
}
|
||||
if dlq.config.InitialRetryDelay == 0 {
|
||||
dlq.config.InitialRetryDelay = time.Minute
|
||||
}
|
||||
if dlq.config.MaxRetryDelay == 0 {
|
||||
dlq.config.MaxRetryDelay = time.Hour
|
||||
}
|
||||
if dlq.config.BackoffMultiplier == 0 {
|
||||
dlq.config.BackoffMultiplier = 2.0
|
||||
}
|
||||
if dlq.config.BackoffStrategy == "" {
|
||||
dlq.config.BackoffStrategy = BackoffExponential
|
||||
}
|
||||
if dlq.config.ReprocessBatchSize == 0 {
|
||||
dlq.config.ReprocessBatchSize = 10
|
||||
}
|
||||
|
||||
// Start cleanup routine
|
||||
dlq.startCleanupRoutine()
|
||||
|
||||
// Start reprocessing routine if enabled
|
||||
if dlq.config.AutoReprocess {
|
||||
dlq.startReprocessRoutine()
|
||||
}
|
||||
|
||||
return dlq
|
||||
}
|
||||
|
||||
// AddMessage adds a failed message to the dead letter queue
|
||||
func (dlq *DeadLetterQueue) AddMessage(topic string, msg *Message) error {
|
||||
return dlq.AddMessageWithReason(topic, msg, "unknown failure")
|
||||
}
|
||||
|
||||
// AddMessageWithReason adds a failed message with a specific failure reason
|
||||
func (dlq *DeadLetterQueue) AddMessageWithReason(topic string, msg *Message, reason string) error {
|
||||
dlq.mu.Lock()
|
||||
defer dlq.mu.Unlock()
|
||||
|
||||
// Check if we've exceeded max messages
|
||||
totalMessages := dlq.getTotalMessageCount()
|
||||
if totalMessages >= dlq.config.MaxMessages {
|
||||
// Remove oldest message to make room
|
||||
dlq.removeOldestMessage()
|
||||
}
|
||||
|
||||
// Check if this is a permanent failure
|
||||
permanent := dlq.isPermanentFailure(reason)
|
||||
|
||||
dlqMsg := &DLQMessage{
|
||||
ID: fmt.Sprintf("dlq_%s_%d", topic, time.Now().UnixNano()),
|
||||
OriginalMessage: msg,
|
||||
Topic: topic,
|
||||
FirstFailed: time.Now(),
|
||||
LastAttempt: time.Now(),
|
||||
AttemptCount: 1,
|
||||
MaxRetries: dlq.config.MaxRetries,
|
||||
FailureReason: reason,
|
||||
Metadata: make(map[string]interface{}),
|
||||
Permanent: permanent,
|
||||
}
|
||||
|
||||
if !permanent {
|
||||
dlqMsg.RetryDelay = dlq.calculateRetryDelay(dlqMsg)
|
||||
dlqMsg.NextRetry = time.Now().Add(dlqMsg.RetryDelay)
|
||||
}
|
||||
|
||||
// Add to queue
|
||||
if _, exists := dlq.messages[topic]; !exists {
|
||||
dlq.messages[topic] = make([]*DLQMessage, 0)
|
||||
}
|
||||
dlq.messages[topic] = append(dlq.messages[topic], dlqMsg)
|
||||
|
||||
// Update metrics
|
||||
dlq.metrics.MessagesAdded++
|
||||
dlq.metrics.QueueSize++
|
||||
if permanent {
|
||||
dlq.metrics.MessagesPermanent++
|
||||
}
|
||||
dlq.updateOldestMessage()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetMessages returns all messages for a topic
|
||||
func (dlq *DeadLetterQueue) GetMessages(topic string) ([]*DLQMessage, error) {
|
||||
dlq.mu.RLock()
|
||||
defer dlq.mu.RUnlock()
|
||||
|
||||
messages, exists := dlq.messages[topic]
|
||||
if !exists {
|
||||
return []*DLQMessage{}, nil
|
||||
}
|
||||
|
||||
// Return a copy to avoid race conditions
|
||||
result := make([]*DLQMessage, len(messages))
|
||||
copy(result, messages)
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// GetAllMessages returns all messages across all topics
|
||||
func (dlq *DeadLetterQueue) GetAllMessages() map[string][]*DLQMessage {
|
||||
dlq.mu.RLock()
|
||||
defer dlq.mu.RUnlock()
|
||||
|
||||
result := make(map[string][]*DLQMessage)
|
||||
for topic, messages := range dlq.messages {
|
||||
result[topic] = make([]*DLQMessage, len(messages))
|
||||
copy(result[topic], messages)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// ReprocessMessage attempts to reprocess a specific message
|
||||
func (dlq *DeadLetterQueue) ReprocessMessage(messageID string) error {
|
||||
dlq.mu.Lock()
|
||||
defer dlq.mu.Unlock()
|
||||
|
||||
// Find message
|
||||
var dlqMsg *DLQMessage
|
||||
var topic string
|
||||
var index int
|
||||
|
||||
for t, messages := range dlq.messages {
|
||||
for i, msg := range messages {
|
||||
if msg.ID == messageID {
|
||||
dlqMsg = msg
|
||||
topic = t
|
||||
index = i
|
||||
break
|
||||
}
|
||||
}
|
||||
if dlqMsg != nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if dlqMsg == nil {
|
||||
return fmt.Errorf("message not found: %s", messageID)
|
||||
}
|
||||
|
||||
if dlqMsg.Permanent {
|
||||
return fmt.Errorf("message marked as permanent failure: %s", messageID)
|
||||
}
|
||||
|
||||
// Attempt reprocessing
|
||||
err := dlq.attemptReprocess(dlqMsg)
|
||||
if err == nil {
|
||||
// Success - remove from queue
|
||||
dlq.removeMessageByIndex(topic, index)
|
||||
dlq.metrics.ReprocessSuccesses++
|
||||
dlq.metrics.QueueSize--
|
||||
return nil
|
||||
}
|
||||
|
||||
// Failed - update retry information
|
||||
dlqMsg.AttemptCount++
|
||||
dlqMsg.LastAttempt = time.Now()
|
||||
dlqMsg.FailureReason = err.Error()
|
||||
|
||||
if dlqMsg.AttemptCount >= dlqMsg.MaxRetries {
|
||||
dlqMsg.Permanent = true
|
||||
dlq.metrics.MessagesPermanent++
|
||||
} else {
|
||||
dlqMsg.RetryDelay = dlq.calculateRetryDelay(dlqMsg)
|
||||
dlqMsg.NextRetry = time.Now().Add(dlqMsg.RetryDelay)
|
||||
}
|
||||
|
||||
dlq.metrics.ReprocessFailures++
|
||||
return fmt.Errorf("reprocessing failed: %w", err)
|
||||
}
|
||||
|
||||
// PurgeMessages removes all messages for a topic
|
||||
func (dlq *DeadLetterQueue) PurgeMessages(topic string) error {
|
||||
dlq.mu.Lock()
|
||||
defer dlq.mu.Unlock()
|
||||
|
||||
if messages, exists := dlq.messages[topic]; exists {
|
||||
count := len(messages)
|
||||
delete(dlq.messages, topic)
|
||||
dlq.metrics.QueueSize -= int64(count)
|
||||
dlq.updateOldestMessage()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// PurgeAllMessages removes all messages from the queue
|
||||
func (dlq *DeadLetterQueue) PurgeAllMessages() error {
|
||||
dlq.mu.Lock()
|
||||
defer dlq.mu.Unlock()
|
||||
|
||||
dlq.messages = make(map[string][]*DLQMessage)
|
||||
dlq.metrics.QueueSize = 0
|
||||
dlq.metrics.OldestMessage = time.Time{}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetMessageCount returns the total number of messages in the queue
|
||||
func (dlq *DeadLetterQueue) GetMessageCount() int {
|
||||
dlq.mu.RLock()
|
||||
defer dlq.mu.RUnlock()
|
||||
return dlq.getTotalMessageCount()
|
||||
}
|
||||
|
||||
// GetMetrics returns current DLQ metrics
|
||||
func (dlq *DeadLetterQueue) GetMetrics() DLQMetrics {
|
||||
dlq.mu.RLock()
|
||||
defer dlq.mu.RUnlock()
|
||||
return dlq.metrics
|
||||
}
|
||||
|
||||
// SetReprocessor sets the message reprocessor
|
||||
func (dlq *DeadLetterQueue) SetReprocessor(reprocessor MessageReprocessor) {
|
||||
dlq.mu.Lock()
|
||||
defer dlq.mu.Unlock()
|
||||
dlq.reprocessor = reprocessor
|
||||
}
|
||||
|
||||
// Cleanup removes expired messages
|
||||
func (dlq *DeadLetterQueue) Cleanup(maxAge time.Duration) error {
|
||||
dlq.mu.Lock()
|
||||
defer dlq.mu.Unlock()
|
||||
|
||||
cutoff := time.Now().Add(-maxAge)
|
||||
expiredCount := 0
|
||||
|
||||
for topic, messages := range dlq.messages {
|
||||
filtered := make([]*DLQMessage, 0)
|
||||
for _, msg := range messages {
|
||||
if msg.FirstFailed.After(cutoff) {
|
||||
filtered = append(filtered, msg)
|
||||
} else {
|
||||
expiredCount++
|
||||
}
|
||||
}
|
||||
dlq.messages[topic] = filtered
|
||||
|
||||
// Remove empty topics
|
||||
if len(filtered) == 0 {
|
||||
delete(dlq.messages, topic)
|
||||
}
|
||||
}
|
||||
|
||||
dlq.metrics.MessagesExpired += int64(expiredCount)
|
||||
dlq.metrics.QueueSize -= int64(expiredCount)
|
||||
dlq.updateOldestMessage()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop gracefully shuts down the dead letter queue
|
||||
func (dlq *DeadLetterQueue) Stop() error {
|
||||
dlq.cancel()
|
||||
|
||||
if dlq.cleanupTicker != nil {
|
||||
dlq.cleanupTicker.Stop()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Private helper methods
|
||||
|
||||
func (dlq *DeadLetterQueue) getTotalMessageCount() int {
|
||||
count := 0
|
||||
for _, messages := range dlq.messages {
|
||||
count += len(messages)
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
func (dlq *DeadLetterQueue) removeOldestMessage() {
|
||||
var oldestTime time.Time
|
||||
var oldestTopic string
|
||||
var oldestIndex int
|
||||
|
||||
for topic, messages := range dlq.messages {
|
||||
for i, msg := range messages {
|
||||
if oldestTime.IsZero() || msg.FirstFailed.Before(oldestTime) {
|
||||
oldestTime = msg.FirstFailed
|
||||
oldestTopic = topic
|
||||
oldestIndex = i
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !oldestTime.IsZero() {
|
||||
dlq.removeMessageByIndex(oldestTopic, oldestIndex)
|
||||
dlq.metrics.QueueSize--
|
||||
}
|
||||
}
|
||||
|
||||
func (dlq *DeadLetterQueue) removeMessageByIndex(topic string, index int) {
|
||||
messages := dlq.messages[topic]
|
||||
dlq.messages[topic] = append(messages[:index], messages[index+1:]...)
|
||||
|
||||
if len(dlq.messages[topic]) == 0 {
|
||||
delete(dlq.messages, topic)
|
||||
}
|
||||
}
|
||||
|
||||
func (dlq *DeadLetterQueue) isPermanentFailure(reason string) bool {
|
||||
for _, pattern := range dlq.config.PermanentFailures {
|
||||
if pattern == reason {
|
||||
return true
|
||||
}
|
||||
// Simple pattern matching (can be enhanced with regex)
|
||||
if len(pattern) > 0 && pattern[len(pattern)-1] == '*' {
|
||||
prefix := pattern[:len(pattern)-1]
|
||||
if len(reason) >= len(prefix) && reason[:len(prefix)] == prefix {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (dlq *DeadLetterQueue) calculateRetryDelay(msg *DLQMessage) time.Duration {
|
||||
switch dlq.config.BackoffStrategy {
|
||||
case BackoffFixed:
|
||||
return dlq.config.InitialRetryDelay
|
||||
|
||||
case BackoffLinear:
|
||||
delay := time.Duration(msg.AttemptCount) * dlq.config.InitialRetryDelay
|
||||
if delay > dlq.config.MaxRetryDelay {
|
||||
return dlq.config.MaxRetryDelay
|
||||
}
|
||||
return delay
|
||||
|
||||
case BackoffExponential:
|
||||
delay := time.Duration(float64(dlq.config.InitialRetryDelay) *
|
||||
pow(dlq.config.BackoffMultiplier, float64(msg.AttemptCount-1)))
|
||||
if delay > dlq.config.MaxRetryDelay {
|
||||
return dlq.config.MaxRetryDelay
|
||||
}
|
||||
return delay
|
||||
|
||||
default:
|
||||
return dlq.config.InitialRetryDelay
|
||||
}
|
||||
}
|
||||
|
||||
func (dlq *DeadLetterQueue) attemptReprocess(msg *DLQMessage) error {
|
||||
if dlq.reprocessor == nil {
|
||||
return fmt.Errorf("no reprocessor configured")
|
||||
}
|
||||
|
||||
if !dlq.reprocessor.CanReprocess(msg) {
|
||||
return fmt.Errorf("message cannot be reprocessed")
|
||||
}
|
||||
|
||||
return dlq.reprocessor.Reprocess(dlq.ctx, msg)
|
||||
}
|
||||
|
||||
func (dlq *DeadLetterQueue) updateOldestMessage() {
|
||||
var oldest time.Time
|
||||
|
||||
for _, messages := range dlq.messages {
|
||||
for _, msg := range messages {
|
||||
if oldest.IsZero() || msg.FirstFailed.Before(oldest) {
|
||||
oldest = msg.FirstFailed
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dlq.metrics.OldestMessage = oldest
|
||||
}
|
||||
|
||||
func (dlq *DeadLetterQueue) startCleanupRoutine() {
|
||||
dlq.cleanupTicker = time.NewTicker(dlq.config.ReprocessInterval)
|
||||
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-dlq.cleanupTicker.C:
|
||||
dlq.Cleanup(dlq.config.RetentionTime)
|
||||
case <-dlq.ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (dlq *DeadLetterQueue) startReprocessRoutine() {
|
||||
ticker := time.NewTicker(dlq.config.ReprocessInterval)
|
||||
|
||||
go func() {
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
dlq.processRetryableMessages()
|
||||
case <-dlq.ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (dlq *DeadLetterQueue) processRetryableMessages() {
|
||||
dlq.mu.Lock()
|
||||
retryable := dlq.getRetryableMessages()
|
||||
dlq.mu.Unlock()
|
||||
|
||||
// Sort by next retry time
|
||||
sort.Slice(retryable, func(i, j int) bool {
|
||||
return retryable[i].NextRetry.Before(retryable[j].NextRetry)
|
||||
})
|
||||
|
||||
// Process batch
|
||||
batchSize := dlq.config.ReprocessBatchSize
|
||||
if len(retryable) < batchSize {
|
||||
batchSize = len(retryable)
|
||||
}
|
||||
|
||||
for i := 0; i < batchSize; i++ {
|
||||
msg := retryable[i]
|
||||
if time.Now().After(msg.NextRetry) {
|
||||
dlq.ReprocessMessage(msg.ID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (dlq *DeadLetterQueue) getRetryableMessages() []*DLQMessage {
|
||||
var retryable []*DLQMessage
|
||||
|
||||
for _, messages := range dlq.messages {
|
||||
for _, msg := range messages {
|
||||
if !msg.Permanent && msg.AttemptCount < msg.MaxRetries {
|
||||
retryable = append(retryable, msg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return retryable
|
||||
}
|
||||
|
||||
// Implementation of DefaultMessageReprocessor
|
||||
|
||||
func NewDefaultMessageReprocessor(publisher MessagePublisher) *DefaultMessageReprocessor {
|
||||
return &DefaultMessageReprocessor{
|
||||
publisher: publisher,
|
||||
}
|
||||
}
|
||||
|
||||
func (r *DefaultMessageReprocessor) Reprocess(ctx context.Context, msg *DLQMessage) error {
|
||||
if r.publisher == nil {
|
||||
return fmt.Errorf("no publisher configured")
|
||||
}
|
||||
|
||||
return r.publisher.Publish(ctx, msg.OriginalMessage)
|
||||
}
|
||||
|
||||
func (r *DefaultMessageReprocessor) CanReprocess(msg *DLQMessage) bool {
|
||||
return !msg.Permanent && msg.AttemptCount < msg.MaxRetries
|
||||
}
|
||||
|
||||
func (r *DefaultMessageReprocessor) ShouldRetry(msg *DLQMessage, err error) bool {
|
||||
// Simple retry logic - can be enhanced based on error types
|
||||
return msg.AttemptCount < msg.MaxRetries
|
||||
}
|
||||
|
||||
// Helper function for power calculation
|
||||
func pow(base, exp float64) float64 {
|
||||
if exp == 0 {
|
||||
return 1
|
||||
}
|
||||
result := base
|
||||
for i := 1; i < int(exp); i++ {
|
||||
result *= base
|
||||
}
|
||||
return result
|
||||
}
|
||||
Reference in New Issue
Block a user