Make sure we always handle the "Failed" pod container event. (#16527)

Fixes #16400. Make sure we always handle the "Failed" pod container event. Handle it differently from other unrecoverable errors because this should be handled unconditionally. Also make sure that we handle the situation where a pod is declared running but one of the containers is waiting and has terminated before. This actually can happen without us first receiving the event about it terminating, possibly due to timing issues (on the Kubernetes side?). Also considers container restarts as an error condition during the workspace startup. Signed-off-by: Lukas Krejci <lkrejci@redhat.com>
eclipse-che · Apr 17, 2020 · eb9a2ab · eb9a2ab
1 parent 2a5e1ea
commit eb9a2ab
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 11 deletions.
diff --git a/assembly/assembly-wsmaster-war/src/main/webapp/WEB-INF/classes/che/che.properties b/assembly/assembly-wsmaster-war/src/main/webapp/WEB-INF/classes/che/che.properties
@@ -284,6 +284,8 @@ che.infra.kubernetes.ingress_start_timeout_min=5
 
 # If during workspace startup an unrecoverable event defined in the property occurs,
 # terminate workspace immediately instead of waiting until timeout
+# Note that this SHOULD NOT include a mere "Failed" reason, because that might catch events that are not unrecoverable.
+# A failed container startup is handled explicitly by Che server.
 che.infra.kubernetes.workspace_unrecoverable_events=FailedMount,FailedScheduling,MountVolume.SetUp failed,Failed to pull image,FailedCreate
 
 # Defines whether use the Persistent Volume Claim for che workspace needs

diff --git a/.../org/eclipse/che/workspace/infrastructure/kubernetes/namespace/KubernetesDeployments.java b/.../org/eclipse/che/workspace/infrastructure/kubernetes/namespace/KubernetesDeployments.java
@@ -24,6 +24,7 @@
 
 import com.google.common.base.Strings;
 import io.fabric8.kubernetes.api.model.ContainerStateTerminated;
+import io.fabric8.kubernetes.api.model.ContainerStateWaiting;
 import io.fabric8.kubernetes.api.model.ContainerStatus;
 import io.fabric8.kubernetes.api.model.DoneablePod;
 import io.fabric8.kubernetes.api.model.Event;
@@ -410,30 +411,58 @@ private void handleStartingPodStatus(CompletableFuture<Void> podRunningFuture, P
     if (POD_STATUS_PHASE_RUNNING.equals(podPhase)) {
       // check that all the containers are ready...
       Map<String, String> terminatedContainers = new HashMap<>();
+      List<String> restartingContainers = new ArrayList<>();
 
       for (ContainerStatus cs : status.getContainerStatuses()) {
         ContainerStateTerminated terminated = cs.getState().getTerminated();
+
+        if (terminated == null) {
+          ContainerStateWaiting waiting = cs.getState().getWaiting();
+          // we've caught the container waiting for a restart after a failure
+          if (waiting != null) {
+            terminated = cs.getLastState().getTerminated();
+          }
+        }
+
         if (terminated != null) {
           terminatedContainers.put(
               cs.getName(),
               format(
                   "reason = '%s', exit code = %d, message = '%s'",
                   terminated.getReason(), terminated.getExitCode(), terminated.getMessage()));
         }
+
+        if (cs.getRestartCount() != null && cs.getRestartCount() > 0) {
+          restartingContainers.add(cs.getName());
+        }
       }
 
-      if (terminatedContainers.isEmpty()) {
+      if (terminatedContainers.isEmpty() && restartingContainers.isEmpty()) {
         podRunningFuture.complete(null);
       } else {
-        String errorMessage =
-            "The following containers have terminated:\n"
-                + terminatedContainers
-                    .entrySet()
-                    .stream()
-                    .map(e -> e.getKey() + ": " + e.getValue())
-                    .collect(Collectors.joining("" + "\n"));
-
-        podRunningFuture.completeExceptionally(new InfrastructureException(errorMessage));
+        StringBuilder errorMessage = new StringBuilder();
+
+        if (!restartingContainers.isEmpty()) {
+          errorMessage.append("The following containers have restarted during the startup:\n");
+          errorMessage.append(String.join(", ", restartingContainers));
+        }
+
+        if (!terminatedContainers.isEmpty()) {
+          if (errorMessage.length() > 0) {
+            errorMessage.append("\n");
+          }
+
+          errorMessage.append("The following containers have terminated:\n");
+          errorMessage.append(
+              terminatedContainers
+                  .entrySet()
+                  .stream()
+                  .map(e -> e.getKey() + ": " + e.getValue())
+                  .collect(Collectors.joining("" + "\n")));
+        }
+
+        podRunningFuture.completeExceptionally(
+            new InfrastructureException(errorMessage.toString()));
       }
 
       return;

diff --git a/...g/eclipse/che/workspace/infrastructure/kubernetes/util/UnrecoverablePodEventListener.java b/...g/eclipse/che/workspace/infrastructure/kubernetes/util/UnrecoverablePodEventListener.java
@@ -40,7 +40,7 @@ public UnrecoverablePodEventListener(
 
   @Override
   public void handle(PodEvent event) {
-    if (isWorkspaceEvent(event) && isUnrecoverable(event)) {
+    if (isWorkspaceEvent(event) && (isFailedContainer(event) || isUnrecoverable(event))) {
       unrecoverableEventHandler.accept(event);
     }
   }
@@ -82,4 +82,16 @@ private boolean isUnrecoverable(PodEvent event) {
     }
     return isUnrecoverable;
   }
+
+  /**
+   * This method detects whether the pod event corresponds to a container that failed to start. Note
+   * that this is handled differently from the the {@link #isUnrecoverable(PodEvent)} because we are
+   * specifically looking at failed containers and don't consider the event message in the logic.
+   *
+   * @param event an event on the pod
+   * @return true if the event is about a failed container, false otherwise
+   */
+  private boolean isFailedContainer(PodEvent event) {
+    return "Failed".equals(event.getReason()) && event.getContainerName() != null;
+  }
 }
diff --git a/.../eclipse/che/workspace/infrastructure/kubernetes/namespace/KubernetesDeploymentsTest.java b/.../eclipse/che/workspace/infrastructure/kubernetes/namespace/KubernetesDeploymentsTest.java
@@ -309,6 +309,37 @@ public void shouldCompleteFutureForWaitingPodIfStatusIsRunning() {
     assertTrue(future.isCompletedExceptionally());
   }
 
+  @Test
+  public void
+      shouldCompleteExceptionallyFutureForWaitingPodIfStatusIsRunningButSomeContainersAreWaitingAndTerminatedBefore() {
+    // given
+    ContainerStatus containerStatus = mock(ContainerStatus.class);
+    when(containerStatus.getName()).thenReturn("FailingContainer");
+    when(containerStatus.getState())
+        .thenReturn(
+            new ContainerStateBuilder().withNewWaiting().withMessage("bah").endWaiting().build());
+    when(containerStatus.getLastState())
+        .thenReturn(
+            new ContainerStateBuilder()
+                .withNewTerminated()
+                .withReason("Completed")
+                .endTerminated()
+                .build());
+
+    when(status.getPhase()).thenReturn(POD_STATUS_PHASE_RUNNING);
+    when(status.getContainerStatuses()).thenReturn(singletonList(containerStatus));
+    CompletableFuture<?> future = kubernetesDeployments.waitRunningAsync(POD_NAME);
+
+    // when
+    verify(podResource).watch(watcherCaptor.capture());
+    Watcher<Pod> watcher = watcherCaptor.getValue();
+    watcher.eventReceived(Watcher.Action.MODIFIED, pod);
+
+    // then
+    assertTrue(future.isDone());
+    assertTrue(future.isCompletedExceptionally());
+  }
+
   @Test
   public void shouldCompleteExceptionallyFutureForWaitingPodIfStatusIsSucceeded() throws Exception {
     // given

diff --git a/...lipse/che/workspace/infrastructure/kubernetes/util/UnrecoverablePodEventListenerTest.java b/...lipse/che/workspace/infrastructure/kubernetes/util/UnrecoverablePodEventListenerTest.java
@@ -136,6 +136,24 @@ public void testHandleRegularEvent() throws Exception {
     verify(unrecoverableEventConsumer, never()).accept(any());
   }
 
+  @Test
+  public void testFailedContainersInWorkspacePodAlwaysHandled() {
+    // given
+    PodEvent ev =
+        mockContainerEvent(
+            WORKSPACE_POD_NAME,
+            "Failed",
+            "bah",
+            EVENT_CREATION_TIMESTAMP,
+            getCurrentTimestampWithOneHourShiftAhead());
+
+    // when
+    unrecoverableEventListener.handle(ev);
+
+    // then
+    verify(unrecoverableEventConsumer).accept(any());
+  }
+
   /**
    * Mock a container event, as though it was triggered by the OpenShift API. As workspace Pods are
    * created indirectly through deployments, they are given generated names with the provided name