Error Handling and Cleanup Mechanisms
User Story
As a system administrator, I want robust error handling and cleanup so that failed executions don't consume resources or leave the system in an inconsistent state.
Technical Requirements
- Implement comprehensive error handling for all failure modes
- Create automatic container cleanup on success/failure/timeout
- Add resource monitoring and alerting
- Implement graceful degradation under high load
- Create error classification and reporting
- Add automatic retry logic for transient failures
Acceptance Criteria
- All execution error types properly classified and handled
- Container cleanup works for success, failure, and timeout scenarios
- Resource monitoring alerts on high load conditions
- Retry logic uses exponential backoff with a capped number of attempts, preventing infinite retry loops
- Graceful degradation maintains system stability under load
- Error messages provide actionable information for debugging
Definition of Done
- Error handling tested for all failure scenarios
- Cleanup mechanisms verified with integration tests
- Load testing confirms graceful degradation
- Monitoring and alerting configured
- Documentation updated with error codes and handling
Implementation Guide
Error Classification System
type ErrorType string

const (
	ErrorTypeInfrastructure ErrorType = "infrastructure" // Docker daemon issues
	ErrorTypeResource       ErrorType = "resource"       // Out of memory, CPU
	ErrorTypeTimeout        ErrorType = "timeout"        // Execution timeout
	ErrorTypeUserCode       ErrorType = "user_code"      // Script execution error
	ErrorTypeValidation     ErrorType = "validation"     // Input validation
	ErrorTypeSecurity       ErrorType = "security"       // Security violation
)
type ExecutionError struct {
	Type        ErrorType `json:"type"`
	Code        string    `json:"code"`
	Message     string    `json:"message"`
	Details     string    `json:"details,omitempty"`
	Retryable   bool      `json:"retryable"`
	TaskID      string    `json:"task_id"`
	ContainerID string    `json:"container_id,omitempty"`
	Timestamp   time.Time `json:"timestamp"`
}

// Error implements the error interface so *ExecutionError can be returned,
// wrapped with %w, and recovered later with errors.As.
func (e *ExecutionError) Error() string {
	return fmt.Sprintf("%s [%s]: %s", e.Type, e.Code, e.Message)
}
// ClassifyError maps a raw error into a structured ExecutionError so callers
// can decide whether to retry and how to report the failure.
func ClassifyError(err error, context string) *ExecutionError {
	details := fmt.Sprintf("%s: %v", context, err)
	switch {
	case strings.Contains(err.Error(), "no such container"):
		return &ExecutionError{
			Type:      ErrorTypeInfrastructure,
			Code:      "CONTAINER_NOT_FOUND",
			Message:   "Container not found",
			Details:   details,
			Retryable: false,
			Timestamp: time.Now(),
		}
	case strings.Contains(err.Error(), "timeout"):
		return &ExecutionError{
			Type:      ErrorTypeTimeout,
			Code:      "EXECUTION_TIMEOUT",
			Message:   "Task execution timed out",
			Details:   details,
			Retryable: true,
			Timestamp: time.Now(),
		}
	default:
		return &ExecutionError{
			Type:      ErrorTypeInfrastructure,
			Code:      "UNKNOWN_ERROR",
			Message:   "Unknown execution error",
			Details:   details,
			Retryable: true,
			Timestamp: time.Now(),
		}
	}
}
Comprehensive Cleanup System
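The CleanupManager below records outcomes through a MetricsCollector that this issue does not define. A minimal in-memory sketch of the assumed type is shown here; the method names simply match the calls below, and a real implementation would more likely export Prometheus counters and a histogram.

// MetricsCollector (assumed type): in-memory counters that satisfy the calls
// made by CleanupManager in this guide. Requires Go 1.19+ for atomic.Int64.
type MetricsCollector struct {
	cleanupAttempts atomic.Int64
	cleanupSuccess  atomic.Int64
	cleanupFailures atomic.Int64

	mu               sync.Mutex
	cleanupDurations []time.Duration
}

func (m *MetricsCollector) RecordCleanupDuration(d time.Duration) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.cleanupDurations = append(m.cleanupDurations, d)
}

func (m *MetricsCollector) IncrementCleanupAttempts() { m.cleanupAttempts.Add(1) }
func (m *MetricsCollector) IncrementCleanupSuccess()  { m.cleanupSuccess.Add(1) }
func (m *MetricsCollector) IncrementCleanupFailures() { m.cleanupFailures.Add(1) }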
type CleanupManager struct {
	dockerClient *client.Client
	logger       *logrus.Logger
	metrics      *MetricsCollector
}

// CleanupExecution tears down everything associated with a finished task,
// regardless of whether it succeeded, failed, or timed out. It collects all
// cleanup errors instead of stopping at the first one.
func (cm *CleanupManager) CleanupExecution(ctx context.Context, taskID, containerID string) error {
	start := time.Now()
	defer func() {
		cm.metrics.RecordCleanupDuration(time.Since(start))
	}()

	var cleanupErrors []error

	// Stop and remove the container (if one was created)
	if containerID != "" {
		if err := cm.stopContainer(ctx, containerID); err != nil {
			cleanupErrors = append(cleanupErrors, fmt.Errorf("stop container: %w", err))
		}
		if err := cm.removeContainer(ctx, containerID); err != nil {
			cleanupErrors = append(cleanupErrors, fmt.Errorf("remove container: %w", err))
		}
	}

	// Clean up temporary files
	if err := cm.cleanupTempFiles(taskID); err != nil {
		cleanupErrors = append(cleanupErrors, fmt.Errorf("cleanup temp files: %w", err))
	}

	// Update metrics
	cm.metrics.IncrementCleanupAttempts()
	if len(cleanupErrors) == 0 {
		cm.metrics.IncrementCleanupSuccess()
	} else {
		cm.metrics.IncrementCleanupFailures()
	}

	if len(cleanupErrors) > 0 {
		return fmt.Errorf("cleanup failed with %d errors: %v", len(cleanupErrors), cleanupErrors)
	}
	return nil
}

func (cm *CleanupManager) stopContainer(ctx context.Context, containerID string) error {
	// 10-second grace period before the daemon kills the container. This is the
	// older Docker SDK signature (*time.Duration); SDKs from v23 onward take a
	// container.StopOptions value instead.
	timeout := 10 * time.Second
	return cm.dockerClient.ContainerStop(ctx, containerID, &timeout)
}

func (cm *CleanupManager) removeContainer(ctx context.Context, containerID string) error {
	return cm.dockerClient.ContainerRemove(ctx, containerID, types.ContainerRemoveOptions{
		Force:         true,
		RemoveVolumes: true,
	})
}
Resource Monitoring and Alerting
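The monitor below depends on an AlertManager and a getSystemStats helper that this issue does not define. One possible sketch follows; the use of github.com/shirou/gopsutil/v3 for host stats and the forwarding of alerts to the logger are assumptions for illustration, not the final design.

// SystemStats is the shape checkResourceUsage below expects.
type SystemStats struct {
	CPUPercent    float64
	MemoryPercent float64
	DiskPercent   float64
}

// getSystemStats gathers host-level usage via gopsutil (assumed dependency).
func (rm *ResourceMonitor) getSystemStats(ctx context.Context) (*SystemStats, error) {
	cpuPercents, err := cpu.Percent(0, false) // usage since the previous call, all CPUs combined
	if err != nil {
		return nil, err
	}
	cpuPct := 0.0
	if len(cpuPercents) > 0 {
		cpuPct = cpuPercents[0]
	}
	vm, err := mem.VirtualMemory()
	if err != nil {
		return nil, err
	}
	du, err := disk.Usage("/")
	if err != nil {
		return nil, err
	}
	return &SystemStats{
		CPUPercent:    cpuPct,
		MemoryPercent: vm.UsedPercent,
		DiskPercent:   du.UsedPercent,
	}, nil
}

// AlertManager is a thin placeholder; SendAlert could forward to Slack,
// PagerDuty, or Alertmanager. Here it just emits a structured warning log.
type AlertManager struct {
	logger *logrus.Logger
}

func (am *AlertManager) SendAlert(code, message string) {
	am.logger.WithField("alert_code", code).Warn(message)
}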
type ResourceMonitor struct {
	dockerClient  *client.Client
	alertManager  *AlertManager
	thresholds    ResourceThresholds
	checkInterval time.Duration
}

type ResourceThresholds struct {
	CPUPercent     float64 // e.g. 80%
	MemoryPercent  float64 // e.g. 85%
	DiskPercent    float64 // e.g. 90%
	ContainerCount int     // e.g. 1000
}

// MonitorResources runs until the context is cancelled, checking resource
// usage on every tick and alerting when a check itself fails.
func (rm *ResourceMonitor) MonitorResources(ctx context.Context) {
	ticker := time.NewTicker(rm.checkInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if err := rm.checkResourceUsage(ctx); err != nil {
				rm.alertManager.SendAlert("MONITOR_ERROR", err.Error())
			}
		}
	}
}

func (rm *ResourceMonitor) checkResourceUsage(ctx context.Context) error {
	// Check system resources
	stats, err := rm.getSystemStats(ctx)
	if err != nil {
		return fmt.Errorf("failed to get system stats: %w", err)
	}

	// CPU usage check
	if stats.CPUPercent > rm.thresholds.CPUPercent {
		rm.alertManager.SendAlert("HIGH_CPU_USAGE",
			fmt.Sprintf("CPU usage: %.2f%%", stats.CPUPercent))
	}

	// Memory usage check
	if stats.MemoryPercent > rm.thresholds.MemoryPercent {
		rm.alertManager.SendAlert("HIGH_MEMORY_USAGE",
			fmt.Sprintf("Memory usage: %.2f%%", stats.MemoryPercent))
	}

	// Disk usage check (the threshold is defined above, so it is checked here as well)
	if stats.DiskPercent > rm.thresholds.DiskPercent {
		rm.alertManager.SendAlert("HIGH_DISK_USAGE",
			fmt.Sprintf("Disk usage: %.2f%%", stats.DiskPercent))
	}

	// Container count check
	containers, err := rm.dockerClient.ContainerList(ctx, types.ContainerListOptions{})
	if err != nil {
		return fmt.Errorf("failed to list containers: %w", err)
	}
	if len(containers) > rm.thresholds.ContainerCount {
		rm.alertManager.SendAlert("HIGH_CONTAINER_COUNT",
			fmt.Sprintf("Container count: %d", len(containers)))
	}

	return nil
}
Retry Logic with Exponential Backoff
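A hypothetical call site for the retry wrapper defined below might look like the following. ExecutionEngine, Task, and the logger are assumed from the rest of the execution engine; errors.As works because ExecutionError implements the error interface.

// Hypothetical call site: run one task with retries and report the classified failure.
func runTask(ctx context.Context, engine *ExecutionEngine, task *Task, logger *logrus.Logger) {
	if err := engine.ExecuteWithRetry(ctx, task); err != nil {
		var execErr *ExecutionError
		if errors.As(err, &execErr) {
			logger.WithFields(logrus.Fields{
				"task_id":   task.ID,
				"type":      execErr.Type,
				"code":      execErr.Code,
				"retryable": execErr.Retryable,
			}).Error("task failed after retries")
			return
		}
		logger.WithError(err).Error("task failed with unclassified error")
	}
}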
type RetryConfig struct {
	MaxAttempts int
	BaseDelay   time.Duration
	MaxDelay    time.Duration
	Multiplier  float64
}

func (ee *ExecutionEngine) ExecuteWithRetry(ctx context.Context, task *Task) error {
	config := RetryConfig{
		MaxAttempts: 3,
		BaseDelay:   1 * time.Second,
		MaxDelay:    30 * time.Second,
		Multiplier:  2.0,
	}

	var lastErr error
	for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
		err := ee.executeTask(ctx, task)
		if err == nil {
			return nil
		}

		// Classify error
		execErr := ClassifyError(err, "task_execution")
		lastErr = execErr

		// Don't retry non-retryable errors
		if !execErr.Retryable {
			return execErr
		}

		// Don't retry on last attempt
		if attempt == config.MaxAttempts {
			break
		}

		// Calculate delay with exponential backoff, capped at MaxDelay
		delay := time.Duration(float64(config.BaseDelay) * math.Pow(config.Multiplier, float64(attempt-1)))
		if delay > config.MaxDelay {
			delay = config.MaxDelay
		}

		ee.logger.WithFields(logrus.Fields{
			"task_id": task.ID,
			"attempt": attempt,
			"delay":   delay,
			"error":   err.Error(),
		}).Warn("Task execution failed, retrying")

		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(delay):
			// Continue to next attempt
		}
	}

	return fmt.Errorf("task execution failed after %d attempts: %w", config.MaxAttempts, lastErr)
}
Graceful Degradation
- Circuit breaker pattern for external dependencies (a sketch follows this list)
- Queue size limits with rejection of new tasks
- Resource-aware task scheduling
- Priority-based task queuing during high load
- Health check endpoints for load balancers
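None of these mechanisms are specified in detail here. As one illustration of the first item, a minimal circuit breaker sketch is shown below; the structure and thresholds are assumptions for discussion, not the final design.

// CircuitBreaker is a minimal sketch: it opens after maxFailures consecutive
// errors and rejects calls until resetTimeout has elapsed, then lets a trial
// call through (half-open) and closes again on success.
type CircuitBreaker struct {
	mu           sync.Mutex
	failures     int
	maxFailures  int
	openedAt     time.Time
	resetTimeout time.Duration
}

var ErrCircuitOpen = errors.New("circuit breaker open: rejecting request")

func (cb *CircuitBreaker) Call(fn func() error) error {
	cb.mu.Lock()
	if cb.failures >= cb.maxFailures && time.Since(cb.openedAt) < cb.resetTimeout {
		cb.mu.Unlock()
		return ErrCircuitOpen
	}
	cb.mu.Unlock()

	err := fn()

	cb.mu.Lock()
	defer cb.mu.Unlock()
	if err != nil {
		cb.failures++
		if cb.failures >= cb.maxFailures {
			cb.openedAt = time.Now() // (re)open the breaker
		}
		return err
	}
	cb.failures = 0 // success closes the breaker
	return nil
}

Wrapping Docker daemon calls in cb.Call would let the engine reject new executions quickly instead of piling up work while the daemon is unhealthy.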
Error Reporting and Metrics
- Structured error logging with correlation IDs (a sketch follows this list)
- Error rate metrics by type and code
- Container cleanup success/failure rates
- Resource usage trends and alerts
- Performance degradation indicators
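As an illustration of the first item above, a correlation ID can be attached to every log line for a given execution. A minimal sketch using logrus, and assuming github.com/google/uuid for ID generation:

// WithCorrelation returns a logger entry that stamps every subsequent log line
// for one execution with the same correlation ID, so retries, cleanup, and
// alerts can be joined together when debugging.
func WithCorrelation(logger *logrus.Logger, taskID string) *logrus.Entry {
	return logger.WithFields(logrus.Fields{
		"correlation_id": uuid.NewString(),
		"task_id":        taskID,
	})
}

The same ID could also be copied into ExecutionError (for example via the Details field or a dedicated field) so error reports and logs share a lookup key.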
Related Epic
Contributes to Epic #8: Container Execution Engine