Error Handling and Cleanup Mechanisms #12

User Story

As a system administrator, I want robust error handling and cleanup so that failed executions don't consume resources or leave the system in an inconsistent state.

Technical Requirements

  • Implement comprehensive error handling for all failure modes
  • Create automatic container cleanup on success/failure/timeout
  • Add resource monitoring and alerting
  • Implement graceful degradation under high load
  • Create error classification and reporting
  • Add automatic retry logic for transient failures

Acceptance Criteria

  • All execution error types properly classified and handled
  • Container cleanup works for success, failure, and timeout scenarios
  • Resource monitoring alerts on high load conditions
  • Retry logic uses exponential backoff with a bounded number of attempts (no infinite retries)
  • Graceful degradation maintains system stability under load
  • Error messages provide actionable information for debugging

Definition of Done

  • Error handling tested for all failure scenarios
  • Cleanup mechanisms verified with integration tests
  • Load testing confirms graceful degradation
  • Monitoring and alerting configured
  • Documentation updated with error codes and handling

Implementation Guide

Error Classification System

type ErrorType string

const (
    ErrorTypeInfrastructure ErrorType = "infrastructure" // Docker daemon issues
    ErrorTypeResource       ErrorType = "resource"       // Out of memory, CPU
    ErrorTypeTimeout        ErrorType = "timeout"        // Execution timeout
    ErrorTypeUserCode       ErrorType = "user_code"      // Script execution error
    ErrorTypeValidation     ErrorType = "validation"     // Input validation
    ErrorTypeSecurity       ErrorType = "security"       // Security violation
)

type ExecutionError struct {
    Type        ErrorType `json:"type"`
    Code        string    `json:"code"`
    Message     string    `json:"message"`
    Details     string    `json:"details,omitempty"`
    Retryable   bool      `json:"retryable"`
    TaskID      string    `json:"task_id"`
    ContainerID string    `json:"container_id,omitempty"`
    Timestamp   time.Time `json:"timestamp"`
}
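
Because ExecuteWithRetry below stores a *ExecutionError in a plain error variable and wraps it with %w, the type must satisfy the error interface. A minimal method (not shown in the original snippet) that does so:

// Error makes *ExecutionError usable wherever a plain error is expected,
// including fmt.Errorf("...: %w", execErr) in the retry wrapper below.
func (e *ExecutionError) Error() string {
    return fmt.Sprintf("[%s/%s] %s", e.Type, e.Code, e.Message)
}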

// ClassifyError maps a raw error from the execution path to a structured
// ExecutionError. The context string (e.g. "task_execution") is folded into
// the details so logs show where the failure surfaced. TaskID and ContainerID
// are filled in by the caller, which has that information.
func ClassifyError(err error, context string) *ExecutionError {
    execErr := &ExecutionError{
        Details:   fmt.Sprintf("%s: %v", context, err),
        Timestamp: time.Now(),
    }

    switch {
    case strings.Contains(err.Error(), "no such container"):
        execErr.Type = ErrorTypeInfrastructure
        execErr.Code = "CONTAINER_NOT_FOUND"
        execErr.Message = "Container not found"
        execErr.Retryable = false
    case strings.Contains(err.Error(), "timeout"):
        execErr.Type = ErrorTypeTimeout
        execErr.Code = "EXECUTION_TIMEOUT"
        execErr.Message = "Task execution timed out"
        execErr.Retryable = true
    default:
        execErr.Type = ErrorTypeInfrastructure
        execErr.Code = "UNKNOWN_ERROR"
        execErr.Message = "Unknown execution error"
        execErr.Retryable = true
    }

    return execErr
}

Comprehensive Cleanup System

type CleanupManager struct {
    dockerClient *client.Client
    logger       *logrus.Logger
    metrics      *MetricsCollector
}

func (cm *CleanupManager) CleanupExecution(ctx context.Context, taskID, containerID string) error {
    start := time.Now()
    defer func() {
        cm.metrics.RecordCleanupDuration(time.Since(start))
    }()
    
    var cleanupErrors []error
    
    // Stop container (if running)
    if containerID != "" {
        if err := cm.stopContainer(ctx, containerID); err != nil {
            cleanupErrors = append(cleanupErrors, fmt.Errorf("stop container: %w", err))
        }
        
        // Remove container
        if err := cm.removeContainer(ctx, containerID); err != nil {
            cleanupErrors = append(cleanupErrors, fmt.Errorf("remove container: %w", err))
        }
    }
    
    // Clean up temporary files
    if err := cm.cleanupTempFiles(taskID); err != nil {
        cleanupErrors = append(cleanupErrors, fmt.Errorf("cleanup temp files: %w", err))
    }
    
    // Update metrics
    cm.metrics.IncrementCleanupAttempts()
    if len(cleanupErrors) == 0 {
        cm.metrics.IncrementCleanupSuccess()
    } else {
        cm.metrics.IncrementCleanupFailures()
    }
    
    if len(cleanupErrors) > 0 {
        return fmt.Errorf("cleanup failed with %d errors: %v", len(cleanupErrors), cleanupErrors)
    }
    
    return nil
}

func (cm *CleanupManager) stopContainer(ctx context.Context, containerID string) error {
    timeout := 10 * time.Second
    return cm.dockerClient.ContainerStop(ctx, containerID, &timeout)
}

func (cm *CleanupManager) removeContainer(ctx context.Context, containerID string) error {
    return cm.dockerClient.ContainerRemove(ctx, containerID, types.ContainerRemoveOptions{
        Force:         true,
        RemoveVolumes: true,
    })
}
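
cleanupTempFiles is referenced above but not shown. A minimal sketch, assuming each task's scratch files live under a per-task directory such as /tmp/executor/<task_id> (the path layout is an assumption, not part of the issue):

import (
    "fmt"
    "os"
    "path/filepath"
)

// cleanupTempFiles removes the task's scratch directory. The /tmp/executor
// base path is an assumed convention; use whatever directory the engine
// actually stages per-task files in.
func (cm *CleanupManager) cleanupTempFiles(taskID string) error {
    dir := filepath.Join("/tmp/executor", taskID)
    if err := os.RemoveAll(dir); err != nil {
        return fmt.Errorf("remove %s: %w", dir, err)
    }
    return nil
}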

Resource Monitoring and Alerting

type ResourceMonitor struct {
    dockerClient   *client.Client
    alertManager   *AlertManager
    thresholds     ResourceThresholds
    checkInterval  time.Duration
}

type ResourceThresholds struct {
    CPUPercent     float64 // 80%
    MemoryPercent  float64 // 85%
    DiskPercent    float64 // 90%
    ContainerCount int     // 1000
}

func (rm *ResourceMonitor) MonitorResources(ctx context.Context) {
    ticker := time.NewTicker(rm.checkInterval)
    defer ticker.Stop()
    
    for {
        select {
        case <-ctx.Done():
            return
        case <-ticker.C:
            if err := rm.checkResourceUsage(ctx); err != nil {
                rm.alertManager.SendAlert("MONITOR_ERROR", err.Error())
            }
        }
    }
}

func (rm *ResourceMonitor) checkResourceUsage(ctx context.Context) error {
    // Check system resources
    stats, err := rm.getSystemStats(ctx)
    if err != nil {
        return fmt.Errorf("failed to get system stats: %w", err)
    }
    
    // CPU usage check
    if stats.CPUPercent > rm.thresholds.CPUPercent {
        rm.alertManager.SendAlert("HIGH_CPU_USAGE", 
            fmt.Sprintf("CPU usage: %.2f%%", stats.CPUPercent))
    }
    
    // Memory usage check
    if stats.MemoryPercent > rm.thresholds.MemoryPercent {
        rm.alertManager.SendAlert("HIGH_MEMORY_USAGE", 
            fmt.Sprintf("Memory usage: %.2f%%", stats.MemoryPercent))
    }
    
    // Container count check
    containers, err := rm.dockerClient.ContainerList(ctx, types.ContainerListOptions{})
    if err != nil {
        return fmt.Errorf("failed to list containers: %w", err)
    }
    
    if len(containers) > rm.thresholds.ContainerCount {
        rm.alertManager.SendAlert("HIGH_CONTAINER_COUNT", 
            fmt.Sprintf("Container count: %d", len(containers)))
    }
    
    return nil
}
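
getSystemStats is called above but not defined; it is assumed to return a small struct with CPUPercent and MemoryPercent fields. One possible sketch using the gopsutil library (the library choice is an assumption, the issue does not prescribe one):

import (
    "context"
    "fmt"
    "time"

    "github.com/shirou/gopsutil/v3/cpu"
    "github.com/shirou/gopsutil/v3/mem"
)

// SystemStats mirrors the fields checked in checkResourceUsage above.
type SystemStats struct {
    CPUPercent    float64
    MemoryPercent float64
}

// getSystemStats samples host CPU and memory usage. This is a sketch; a
// production version might read cgroup limits instead of host-wide stats.
func (rm *ResourceMonitor) getSystemStats(ctx context.Context) (*SystemStats, error) {
    cpuPercents, err := cpu.PercentWithContext(ctx, time.Second, false)
    if err != nil {
        return nil, fmt.Errorf("cpu sample: %w", err)
    }
    if len(cpuPercents) == 0 {
        return nil, fmt.Errorf("cpu sample returned no data")
    }
    vm, err := mem.VirtualMemoryWithContext(ctx)
    if err != nil {
        return nil, fmt.Errorf("memory sample: %w", err)
    }
    return &SystemStats{
        CPUPercent:    cpuPercents[0],
        MemoryPercent: vm.UsedPercent,
    }, nil
}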

Retry Logic with Exponential Backoff

type RetryConfig struct {
    MaxAttempts int
    BaseDelay   time.Duration
    MaxDelay    time.Duration
    Multiplier  float64
}

func (ee *ExecutionEngine) ExecuteWithRetry(ctx context.Context, task *Task) error {
    config := RetryConfig{
        MaxAttempts: 3,
        BaseDelay:   1 * time.Second,
        MaxDelay:    30 * time.Second,
        Multiplier:  2.0,
    }
    
    var lastErr error
    
    for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
        err := ee.executeTask(ctx, task)
        if err == nil {
            return nil
        }
        
        // Classify error
        execErr := ClassifyError(err, "task_execution")
        lastErr = execErr
        
        // Don't retry non-retryable errors
        if !execErr.Retryable {
            return execErr
        }
        
        // Don't retry on last attempt
        if attempt == config.MaxAttempts {
            break
        }
        
        // Calculate delay with exponential backoff
        delay := time.Duration(float64(config.BaseDelay) * math.Pow(config.Multiplier, float64(attempt-1)))
        if delay > config.MaxDelay {
            delay = config.MaxDelay
        }
        
        ee.logger.WithFields(logrus.Fields{
            "task_id": task.ID,
            "attempt": attempt,
            "delay":   delay,
            "error":   err.Error(),
        }).Warn("Task execution failed, retrying")
        
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-time.After(delay):
            // Continue to next attempt
        }
    }
    
    return fmt.Errorf("task execution failed after %d attempts: %w", config.MaxAttempts, lastErr)
}
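
As a usage sketch, a worker could combine the retry wrapper with the cleanup manager so that a task that exhausts its retries is still cleaned up and its failure logged. The ee.cleanup field and task.ContainerID are assumptions for illustration; they are not defined in the issue.

// handleTask is a hypothetical worker entry point: execute with retries,
// always clean up (success, failure, or timeout), then report the outcome.
func (ee *ExecutionEngine) handleTask(ctx context.Context, task *Task) {
    err := ee.ExecuteWithRetry(ctx, task)

    // ee.cleanup is an assumed *CleanupManager field; task.ContainerID is an
    // assumed field recording the container created for this task, if any.
    if cleanupErr := ee.cleanup.CleanupExecution(ctx, task.ID, task.ContainerID); cleanupErr != nil {
        ee.logger.WithError(cleanupErr).WithField("task_id", task.ID).Error("cleanup failed")
    }

    if err != nil {
        ee.logger.WithError(err).WithField("task_id", task.ID).Error("task failed permanently")
    }
}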

Graceful Degradation

  • Circuit breaker pattern for external dependencies
  • Queue size limits with rejection of new tasks (a minimal sketch follows this list)
  • Resource-aware task scheduling
  • Priority-based task queuing during high load
  • Health check endpoints for load balancers
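
A minimal sketch of the queue-size limit with rejection of new tasks; the taskQueue buffered channel and the ErrQueueFull name are illustrative assumptions, not part of the issue:

import "errors"

// ErrQueueFull signals that the engine is saturated and is shedding load
// rather than accepting more work (illustrative name).
var ErrQueueFull = errors.New("task queue full, rejecting new task")

// Submit tries to enqueue without blocking; when the assumed bounded queue
// (e.g. taskQueue = make(chan *Task, 100)) is full, the task is rejected so
// the system degrades gracefully instead of accumulating unbounded work.
func (ee *ExecutionEngine) Submit(task *Task) error {
    select {
    case ee.taskQueue <- task:
        return nil
    default:
        return ErrQueueFull
    }
}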

Error Reporting and Metrics

  • Structured error logging with correlation IDs
  • Error rate metrics by type and code (a collector sketch follows this list)
  • Container cleanup success/failure rates
  • Resource usage trends and alerts
  • Performance degradation indicators
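
A sketch of the MetricsCollector referenced by CleanupManager above, extended with the per-type/per-code error counter. Prometheus is an assumed backend; the issue does not mandate a specific metrics library.

import (
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

// MetricsCollector backs the counters used in CleanupExecution and tracks
// error rates by classified type and code.
type MetricsCollector struct {
    cleanupAttempts prometheus.Counter
    cleanupSuccess  prometheus.Counter
    cleanupFailures prometheus.Counter
    cleanupDuration prometheus.Histogram
    executionErrors *prometheus.CounterVec
}

func NewMetricsCollector(reg prometheus.Registerer) *MetricsCollector {
    mc := &MetricsCollector{
        cleanupAttempts: prometheus.NewCounter(prometheus.CounterOpts{Name: "cleanup_attempts_total"}),
        cleanupSuccess:  prometheus.NewCounter(prometheus.CounterOpts{Name: "cleanup_success_total"}),
        cleanupFailures: prometheus.NewCounter(prometheus.CounterOpts{Name: "cleanup_failures_total"}),
        cleanupDuration: prometheus.NewHistogram(prometheus.HistogramOpts{Name: "cleanup_duration_seconds"}),
        executionErrors: prometheus.NewCounterVec(prometheus.CounterOpts{Name: "execution_errors_total"}, []string{"type", "code"}),
    }
    reg.MustRegister(mc.cleanupAttempts, mc.cleanupSuccess, mc.cleanupFailures, mc.cleanupDuration, mc.executionErrors)
    return mc
}

func (mc *MetricsCollector) IncrementCleanupAttempts() { mc.cleanupAttempts.Inc() }
func (mc *MetricsCollector) IncrementCleanupSuccess()  { mc.cleanupSuccess.Inc() }
func (mc *MetricsCollector) IncrementCleanupFailures() { mc.cleanupFailures.Inc() }

func (mc *MetricsCollector) RecordCleanupDuration(d time.Duration) {
    mc.cleanupDuration.Observe(d.Seconds())
}

// RecordExecutionError counts errors by their classified type and code.
func (mc *MetricsCollector) RecordExecutionError(e *ExecutionError) {
    mc.executionErrors.WithLabelValues(string(e.Type), e.Code).Inc()
}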

Related Epic

Contributes to Epic #8: Container Execution Engine
