Skip to content

Commit

Permalink
remove the image if the storage is corrupted
Browse files Browse the repository at this point in the history
When we detect that podman failed with the storage corruption warning
and readlink no such file error, we will remove the image and set the
command result to unavailable error to trigger a retry.
  • Loading branch information
luluz66 committed Apr 20, 2022
1 parent e640b8d commit 9cd41b6
Showing 1 changed file with 44 additions and 4 deletions.
48 changes: 44 additions & 4 deletions enterprise/server/remote_execution/containers/podman/podman.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"io"
"path/filepath"
"regexp"
"strings"
"sync"
"time"
Expand All @@ -23,16 +24,17 @@ import (
)

var (
// podmanExecSIGKILLExitCode is the exit code returned by `podman exec` when the exec
// process is killed due to the parent container being removed.
podmanExecSIGKILLExitCode = 137

// Additional time used to kill the container if the command doesn't exit cleanly
containerFinalizationTimeout = 10 * time.Second

storageErrorRegex = regexp.MustCompile(`(?s)A storage corruption might have occurred.*Error: readlink.*no such file or directory`)
)

const (
podmanInternalExitCode = 125
// podmanExecSIGKILLExitCode is the exit code returned by `podman exec` when the exec
// process is killed due to the parent container being removed.
podmanExecSIGKILLExitCode = 137
)

type PodmanOptions struct {
Expand Down Expand Up @@ -141,6 +143,13 @@ func (c *podmanCommandContainer) Run(ctx context.Context, command *repb.Command,
podmanRunArgs = append(podmanRunArgs, c.image)
podmanRunArgs = append(podmanRunArgs, command.Arguments...)
result = runPodman(ctx, "run", nil, nil, podmanRunArgs...)
if isStorageCorrupted(result) {
result.Error = status.UnavailableError("podman run failed because of storage corruption")
err = removeImage(ctx, c.image)
if err != nil {
log.Warningf("Failed to remove corrupted image: %s", err)
}
}
if exitedCleanly := result.ExitCode >= 0; !exitedCleanly {
err = killContainerIfRunning(ctx, containerName)
}
Expand All @@ -164,6 +173,13 @@ func (c *podmanCommandContainer) Create(ctx context.Context, workDir string) err
if err = createResult.Error; err != nil {
return status.UnavailableErrorf("failed to create container: %s", err)
}
if isStorageCorrupted(createResult) {
createResult.Error = status.UnavailableError("podman create failed because of storage corruption")
err = removeImage(ctx, c.image)
if err != nil {
log.Warningf("Failed to remove corrupted image: %s", err)
}
}

startResult := runPodman(ctx, "start", nil, nil, c.name)
return startResult.Error
Expand Down Expand Up @@ -287,3 +303,27 @@ func killContainerIfRunning(ctx context.Context, containerName string) error {
}
return status.UnknownErrorf("podman kill failed: %s", string(result.Stderr))
}

// Check if the podman run command hit a warning indicating a storage corruption
// More details can be found at
// https://github.com/containers/storage/issues/1136
func isStorageCorrupted(result *interfaces.CommandResult) bool {
if result.ExitCode != podmanInternalExitCode {
return false
}
return storageErrorRegex.MatchString(string(result.Stderr))
}

func removeImage(ctx context.Context, imageName string) error {
ctx, cancel := background.ExtendContextForFinalization(ctx, containerFinalizationTimeout)
defer cancel()

result := runPodman(ctx, "rmi", nil, nil, imageName)
if result.Error != nil {
return result.Error
}
if result.ExitCode == 0 || strings.Contains(string(result.Stderr), "image not known") {
return nil
}
return status.UnknownErrorf("podman rmi failed: %s", string(result.Stderr))
}

0 comments on commit 9cd41b6

Please sign in to comment.