Skip to content

Commit

Permalink
Make sure we always handle the "Failed" pod container event. (#16527)
Browse files Browse the repository at this point in the history
Fixes #16400.

Make sure we always handle the "Failed" pod container event.
Handle it differently from other unrecoverable errors because
this should be handled unconditionally.

Also make sure that we handle the situation where a pod is
declared running but one of the containers is waiting and
has terminated before. This actually can happen without us
first receiving the event about it terminating, possibly
due to timing issues (on the Kubernetes side?).

Also considers container restarts as an error condition during
the workspace startup.

Signed-off-by: Lukas Krejci <lkrejci@redhat.com>
  • Loading branch information
metlos authored Apr 17, 2020
1 parent 2a5e1ea commit eb9a2ab
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,8 @@ che.infra.kubernetes.ingress_start_timeout_min=5

# If during workspace startup an unrecoverable event defined in the property occurs,
# terminate workspace immediately instead of waiting until timeout
# Note that this SHOULD NOT include a mere "Failed" reason, because that might catch events that are not unrecoverable.
# A failed container startup is handled explicitly by Che server.
che.infra.kubernetes.workspace_unrecoverable_events=FailedMount,FailedScheduling,MountVolume.SetUp failed,Failed to pull image,FailedCreate

# Defines whether use the Persistent Volume Claim for che workspace needs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import com.google.common.base.Strings;
import io.fabric8.kubernetes.api.model.ContainerStateTerminated;
import io.fabric8.kubernetes.api.model.ContainerStateWaiting;
import io.fabric8.kubernetes.api.model.ContainerStatus;
import io.fabric8.kubernetes.api.model.DoneablePod;
import io.fabric8.kubernetes.api.model.Event;
Expand Down Expand Up @@ -410,30 +411,58 @@ private void handleStartingPodStatus(CompletableFuture<Void> podRunningFuture, P
if (POD_STATUS_PHASE_RUNNING.equals(podPhase)) {
// check that all the containers are ready...
Map<String, String> terminatedContainers = new HashMap<>();
List<String> restartingContainers = new ArrayList<>();

for (ContainerStatus cs : status.getContainerStatuses()) {
ContainerStateTerminated terminated = cs.getState().getTerminated();

if (terminated == null) {
ContainerStateWaiting waiting = cs.getState().getWaiting();
// we've caught the container waiting for a restart after a failure
if (waiting != null) {
terminated = cs.getLastState().getTerminated();
}
}

if (terminated != null) {
terminatedContainers.put(
cs.getName(),
format(
"reason = '%s', exit code = %d, message = '%s'",
terminated.getReason(), terminated.getExitCode(), terminated.getMessage()));
}

if (cs.getRestartCount() != null && cs.getRestartCount() > 0) {
restartingContainers.add(cs.getName());
}
}

if (terminatedContainers.isEmpty()) {
if (terminatedContainers.isEmpty() && restartingContainers.isEmpty()) {
podRunningFuture.complete(null);
} else {
String errorMessage =
"The following containers have terminated:\n"
+ terminatedContainers
.entrySet()
.stream()
.map(e -> e.getKey() + ": " + e.getValue())
.collect(Collectors.joining("" + "\n"));

podRunningFuture.completeExceptionally(new InfrastructureException(errorMessage));
StringBuilder errorMessage = new StringBuilder();

if (!restartingContainers.isEmpty()) {
errorMessage.append("The following containers have restarted during the startup:\n");
errorMessage.append(String.join(", ", restartingContainers));
}

if (!terminatedContainers.isEmpty()) {
if (errorMessage.length() > 0) {
errorMessage.append("\n");
}

errorMessage.append("The following containers have terminated:\n");
errorMessage.append(
terminatedContainers
.entrySet()
.stream()
.map(e -> e.getKey() + ": " + e.getValue())
.collect(Collectors.joining("" + "\n")));
}

podRunningFuture.completeExceptionally(
new InfrastructureException(errorMessage.toString()));
}

return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public UnrecoverablePodEventListener(

@Override
public void handle(PodEvent event) {
if (isWorkspaceEvent(event) && isUnrecoverable(event)) {
if (isWorkspaceEvent(event) && (isFailedContainer(event) || isUnrecoverable(event))) {
unrecoverableEventHandler.accept(event);
}
}
Expand Down Expand Up @@ -82,4 +82,16 @@ private boolean isUnrecoverable(PodEvent event) {
}
return isUnrecoverable;
}

/**
* This method detects whether the pod event corresponds to a container that failed to start. Note
* that this is handled differently from the the {@link #isUnrecoverable(PodEvent)} because we are
* specifically looking at failed containers and don't consider the event message in the logic.
*
* @param event an event on the pod
* @return true if the event is about a failed container, false otherwise
*/
private boolean isFailedContainer(PodEvent event) {
return "Failed".equals(event.getReason()) && event.getContainerName() != null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,37 @@ public void shouldCompleteFutureForWaitingPodIfStatusIsRunning() {
assertTrue(future.isCompletedExceptionally());
}

@Test
public void
shouldCompleteExceptionallyFutureForWaitingPodIfStatusIsRunningButSomeContainersAreWaitingAndTerminatedBefore() {
// given
ContainerStatus containerStatus = mock(ContainerStatus.class);
when(containerStatus.getName()).thenReturn("FailingContainer");
when(containerStatus.getState())
.thenReturn(
new ContainerStateBuilder().withNewWaiting().withMessage("bah").endWaiting().build());
when(containerStatus.getLastState())
.thenReturn(
new ContainerStateBuilder()
.withNewTerminated()
.withReason("Completed")
.endTerminated()
.build());

when(status.getPhase()).thenReturn(POD_STATUS_PHASE_RUNNING);
when(status.getContainerStatuses()).thenReturn(singletonList(containerStatus));
CompletableFuture<?> future = kubernetesDeployments.waitRunningAsync(POD_NAME);

// when
verify(podResource).watch(watcherCaptor.capture());
Watcher<Pod> watcher = watcherCaptor.getValue();
watcher.eventReceived(Watcher.Action.MODIFIED, pod);

// then
assertTrue(future.isDone());
assertTrue(future.isCompletedExceptionally());
}

@Test
public void shouldCompleteExceptionallyFutureForWaitingPodIfStatusIsSucceeded() throws Exception {
// given
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,24 @@ public void testHandleRegularEvent() throws Exception {
verify(unrecoverableEventConsumer, never()).accept(any());
}

@Test
public void testFailedContainersInWorkspacePodAlwaysHandled() {
// given
PodEvent ev =
mockContainerEvent(
WORKSPACE_POD_NAME,
"Failed",
"bah",
EVENT_CREATION_TIMESTAMP,
getCurrentTimestampWithOneHourShiftAhead());

// when
unrecoverableEventListener.handle(ev);

// then
verify(unrecoverableEventConsumer).accept(any());
}

/**
* Mock a container event, as though it was triggered by the OpenShift API. As workspace Pods are
* created indirectly through deployments, they are given generated names with the provided name
Expand Down

0 comments on commit eb9a2ab

Please sign in to comment.