Skip to content

Changes for OWLS-82011 to reflect introspector status in domain status #1832

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jul 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import io.kubernetes.client.openapi.models.V1ObjectMeta;
import io.kubernetes.client.openapi.models.V1ObjectReference;
import io.kubernetes.client.openapi.models.V1Pod;
import io.kubernetes.client.openapi.models.V1PodCondition;
import io.kubernetes.client.openapi.models.V1PodList;
import io.kubernetes.client.openapi.models.V1PodStatus;
import io.kubernetes.client.openapi.models.V1Service;
Expand All @@ -34,7 +35,6 @@
import oracle.kubernetes.operator.helpers.CallBuilder;
import oracle.kubernetes.operator.helpers.ConfigMapHelper;
import oracle.kubernetes.operator.helpers.DomainPresenceInfo;
import oracle.kubernetes.operator.helpers.DomainStatusPatch;
import oracle.kubernetes.operator.helpers.DomainValidationSteps;
import oracle.kubernetes.operator.helpers.JobHelper;
import oracle.kubernetes.operator.helpers.KubernetesUtils;
Expand Down Expand Up @@ -65,7 +65,9 @@
import oracle.kubernetes.weblogic.domain.model.Channel;
import oracle.kubernetes.weblogic.domain.model.Domain;

import static oracle.kubernetes.operator.DomainStatusUpdater.ADMIN_SERVER_STARTING_PROGRESS_REASON;
import static oracle.kubernetes.operator.DomainStatusUpdater.INSPECTING_DOMAIN_PROGRESS_REASON;
import static oracle.kubernetes.operator.DomainStatusUpdater.MANAGED_SERVERS_STARTING_PROGRESS_REASON;
import static oracle.kubernetes.operator.LabelConstants.INTROSPECTION_STATE_LABEL;
import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECT_REQUESTED;
import static oracle.kubernetes.operator.ProcessingConstants.MAKE_RIGHT_DOMAIN_OPERATION;
Expand Down Expand Up @@ -209,7 +211,8 @@ private static Step bringAdminServerUpSteps(
}

private static Step bringManagedServersUp(Step next) {
return new ManagedServersUpStep(next);
return DomainStatusUpdater.createProgressingStep(MANAGED_SERVERS_STARTING_PROGRESS_REASON, true,
new ManagedServersUpStep(next));
}

private FiberGate getMakeRightFiberGate(String ns) {
Expand Down Expand Up @@ -341,7 +344,8 @@ private void processIntrospectorJobPodWatch(V1Pod pod, String watchType) {
switch (watchType) {
case "ADDED":
case "MODIFIED":
new DomainStatusUpdate(info.getDomain(), pod, domainUid).invoke();
PodWatcher.PodStatus podStatus = PodWatcher.getPodStatus(pod);
new DomainStatusUpdate(pod, domainUid, delegate, info, podStatus).invoke();
break;
default:
}
Expand Down Expand Up @@ -783,13 +787,13 @@ public void onThrowable(Packet packet, Throwable throwable) {
}

Step createDomainUpPlan(DomainPresenceInfo info) {
Step managedServerStrategy =
bringManagedServersUp(DomainStatusUpdater.createEndProgressingStep(new TailStep()));
Step managedServerStrategy = bringManagedServersUp(DomainStatusUpdater.createEndProgressingStep(new TailStep()));

Step domainUpStrategy =
Step.chain(
domainIntrospectionSteps(info),
new DomainStatusStep(info, null),
DomainStatusUpdater.createProgressingStep(ADMIN_SERVER_STARTING_PROGRESS_REASON,true, null),
bringAdminServerUp(info, delegate.getPodAwaiterStepFactory(info.getNamespace())),
managedServerStrategy);

Expand Down Expand Up @@ -974,42 +978,87 @@ public NextAction apply(Packet packet) {
}

private static class DomainStatusUpdate {
private final Domain domain;
private final V1Pod pod;
private final String domainUid;
private DomainProcessorDelegate delegate = null;
private DomainPresenceInfo info = null;
private PodWatcher.PodStatus podStatus;

DomainStatusUpdate(Domain domain, V1Pod pod, String domainUid) {
this.domain = domain;
DomainStatusUpdate(V1Pod pod, String domainUid, DomainProcessorDelegate delegate,
DomainPresenceInfo info, PodWatcher.PodStatus podStatus) {
this.pod = pod;
this.domainUid = domainUid;
this.delegate = delegate;
this.info = info;
this.podStatus = podStatus;
}

public void invoke() {
Optional.ofNullable(getMatchingContainerStatus())
.map(V1ContainerStatus::getState)
.map(V1ContainerState::getWaiting)
.ifPresent(waiting -> updateStatus(waiting.getReason(), waiting.getMessage()));
}

private void updateStatus(String reason, String message) {
if (reason == null || message == null) {
return;
private void invoke() {
switch (podStatus) {
case PHASE_FAILED:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do these statuses indicate? I'd have thought that once you selected a status, that indicated what had to be updated in the domain status - and yet this seems to do a fair bit of additional processing. Can you explain your approach?

Copy link
Member Author

@ankedia ankedia Jul 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, Kubernetes seems to have different ways of dealing with various failures, and the reason/message for a failure is captured in a different JSON stanza/object in each case.

  1. The UNSCHEDULABLE status indicates that the pod can't be scheduled and is in the pending state (e.g. due to a missing PV/PVC). The reason/message for this condition is captured in the pod conditions, which are part of the pod status.

  2. The WAITING_NON_NULL_MESSAGE status indicates that the pod container is stuck in the waiting state (e.g. due to an image pull error). The reason/message for this is captured in the waiting state object in the container statuses, which are part of the pod status.

  3. The PHASE_FAILED status indicates that the pod has failed due to a timeout (deadline exceeded). In this case the pod conditions are null and the container statuses carry no waiting or terminated state (please see below). The pod status phase is "Failed" and the reason/message is captured directly as part of the pod status.
    V1PodStatus:
    {
    conditions: null
    containerStatuses: [class V1ContainerStatus {
    containerID: docker://b823ffd3fe05d101b22529485ce9d946b652631487e8ce82e94f2e8ec1c2acb0
    image: model-in-image-1:v1
    imageID: docker://sha256:1cb49428989aec3007cd887d5b31cbf8da6fa0da30998e03d2ebec29ed4bf5ec
    lastState: class V1ContainerState {
    running: null
    terminated: null
    waiting: null
    }
    name: domain1-introspect-domain-job
    ready: false
    restartCount: 0
    started: null
    state: class V1ContainerState {
    running: class V1ContainerStateRunning {
    startedAt: 2020-07-23T23:07:08.000Z
    }
    terminated: null
    waiting: null
    }
    }]
    ephemeralContainerStatuses: null
    hostIP: null
    initContainerStatuses: null
    message: Pod was active on the node longer than the specified deadline
    nominatedNodeName: null
    phase: Failed
    podIP: 192.168.4.110
    podIPs: null
    qosClass: Burstable
    reason: DeadlineExceeded
    startTime: 2020-07-23T23:07:07.000Z
    }

  4. There are other cases where the pod/container is terminated due to an error and the reason/message is captured in the terminated state in the container statuses, which are part of the pod status. This is indicated by TERMINATED_ERROR_REASON.

The earlier logic updated the domain status unconditionally, after receiving a pod added/modified watch event, with the reason/message from the waiting state of the container statuses in the pod status.

Additionally, there are both a job watcher and a pod watcher, and both of them update the domain status. We might have to decide which one gets higher priority, or whether we can update the domain status only in the pod watcher, based on additional testing and the results of the integration tests.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was hoping for a simple way to make this more readable, but I don't see one that is obvious. I may want to revisit in the future.

delegate.runSteps(
DomainStatusUpdater.createFailedStep(
info, pod.getStatus().getReason(), pod.getStatus().getMessage(), null));
break;
case WAITING_NON_NULL_MESSAGE:
Optional.ofNullable(getMatchingContainerStatus())
.map(V1ContainerStatus::getState)
.map(V1ContainerState::getWaiting)
.ifPresent(waiting ->
delegate.runSteps(
DomainStatusUpdater.createFailedStep(
info, waiting.getReason(), waiting.getMessage(), null)));
break;
case TERMINATED_ERROR_REASON:
Optional.ofNullable(getMatchingContainerStatus())
.map(V1ContainerStatus::getState)
.map(V1ContainerState::getTerminated)
.ifPresent(terminated -> delegate.runSteps(
DomainStatusUpdater.createFailedStep(
info, terminated.getReason(), terminated.getMessage(), null)));
break;
case UNSCHEDULABLE:
Optional.ofNullable(getMatchingPodCondition())
.ifPresent(condition ->
delegate.runSteps(
DomainStatusUpdater.createFailedStep(
info, condition.getReason(), condition.getMessage(), null)));
break;
case SUCCESS:
Optional.ofNullable(getMatchingContainerStatus())
.map(V1ContainerStatus::getState)
.map(V1ContainerState::getWaiting)
.ifPresent(waiting ->
delegate.runSteps(
DomainStatusUpdater.createProgressingStep(
info, waiting.getReason(), false, null)));
break;
default:
}

DomainStatusPatch.updateSynchronously(domain, reason, message);
}

private V1ContainerStatus getMatchingContainerStatus() {
return Optional.ofNullable(pod.getStatus())
.map(V1PodStatus::getContainerStatuses)
.flatMap(this::getMatchingContainerStatus)
.orElse(null);
.map(V1PodStatus::getContainerStatuses)
.flatMap(this::getMatchingContainerStatus)
.orElse(null);
}

/**
 * Scans the given container statuses in iteration order and returns the first one
 * accepted by {@code hasInstrospectorJobName}.
 *
 * @param statuses container statuses to search; must not be null
 * @return the first matching status, or an empty Optional when none matches
 */
private Optional<V1ContainerStatus> getMatchingContainerStatus(Collection<V1ContainerStatus> statuses) {
  for (V1ContainerStatus candidate : statuses) {
    if (hasInstrospectorJobName(candidate)) {
      return Optional.of(candidate);
    }
  }
  return Optional.empty();
}

/**
 * Looks up the pod condition to report for the watched pod.
 *
 * @return the condition chosen by {@code getPodCondition}, or null when the pod has no
 *     status, the status has no conditions list, or no condition is chosen
 */
private V1PodCondition getMatchingPodCondition() {
  final V1PodStatus podStatus = pod.getStatus();
  if (podStatus == null) {
    return null;
  }

  final Collection<V1PodCondition> podConditions = podStatus.getConditions();
  if (podConditions == null) {
    return null;
  }

  return getPodCondition(podConditions).orElse(null);
}

/**
 * Selects the pod condition whose reason/message should be reported.
 *
 * <p>A pod status may list several conditions, so the first entry is not guaranteed to be
 * the one that explains a scheduling failure. A condition whose reason contains
 * "Unschedulable" is therefore preferred; when there is none, the first non-null
 * condition is returned so that callers still receive a condition whenever one exists.
 *
 * @param conditions the pod conditions to search; must not be null
 * @return the preferred condition, or an empty Optional when there is no non-null condition
 */
private Optional<V1PodCondition> getPodCondition(Collection<V1PodCondition> conditions) {
  V1PodCondition fallback = null;
  for (V1PodCondition condition : conditions) {
    if (condition == null) {
      continue;
    }
    final String reason = condition.getReason();
    if (reason != null && reason.contains("Unschedulable")) {
      return Optional.of(condition);
    }
    if (fallback == null) {
      // remember the first usable condition in case none is marked unschedulable
      fallback = condition;
    }
  }
  return Optional.ofNullable(fallback);
}

/**
 * Tests whether a container status carries the name that {@code toJobIntrospectorName}
 * produces for this update's domain UID.
 *
 * @param s the container status to test; must not be null
 * @return true if the status name equals the expected name
 */
private boolean hasInstrospectorJobName(V1ContainerStatus s) {
  final String expectedName = toJobIntrospectorName(domainUid);
  return expectedName.equals(s.getName());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import oracle.kubernetes.operator.steps.DefaultResponseStep;
import oracle.kubernetes.operator.wlsconfig.WlsClusterConfig;
import oracle.kubernetes.operator.wlsconfig.WlsDomainConfig;
import oracle.kubernetes.operator.work.Component;
import oracle.kubernetes.operator.work.NextAction;
import oracle.kubernetes.operator.work.Packet;
import oracle.kubernetes.operator.work.Step;
Expand Down Expand Up @@ -65,6 +66,7 @@
@SuppressWarnings("WeakerAccess")
public class DomainStatusUpdater {
public static final String INSPECTING_DOMAIN_PROGRESS_REASON = "InspectingDomainPresence";
public static final String ADMIN_SERVER_STARTING_PROGRESS_REASON = "AdminServerStarting";
public static final String MANAGED_SERVERS_STARTING_PROGRESS_REASON = "ManagedServersStarting";
public static final String SERVERS_READY_REASON = "ServersReady";
public static final String ALL_STOPPED_AVAILABLE_REASON = "AllServersStopped";
Expand Down Expand Up @@ -97,7 +99,21 @@ public static Step createStatusUpdateStep(Step next) {
* @return Step
*/
public static Step createProgressingStep(String reason, boolean isPreserveAvailable, Step next) {
return new ProgressingStep(reason, isPreserveAvailable, next);
return new ProgressingStep(null, reason, isPreserveAvailable, next);
}

/**
 * Creates an asynchronous step that sets the Domain condition to Progressing, using an
 * explicitly supplied domain presence info.
 *
 * @param info Domain presence info handed to the created step
 * @param reason Progressing reason
 * @param isPreserveAvailable true, if an existing Available=True condition should be preserved
 * @param next Next step
 * @return Step
 */
public static Step createProgressingStep(
    DomainPresenceInfo info,
    String reason,
    boolean isPreserveAvailable,
    Step next) {
  return new ProgressingStep(info, reason, isPreserveAvailable, next);
}

/**
Expand Down Expand Up @@ -148,7 +164,8 @@ public static Step createFailedStep(CallResponse<?> callResponse, Step next) {
* @return Step
*/
static Step createFailedStep(Throwable throwable, Step next) {
return createFailedStep("Exception", throwable.getMessage(), next);
return throwable.getMessage() == null ? createFailedStep("Exception", throwable.toString(), next)
: createFailedStep("Exception", throwable.getMessage(), next);
}

/**
Expand All @@ -160,10 +177,24 @@ static Step createFailedStep(Throwable throwable, Step next) {
* @return Step
*/
public static Step createFailedStep(String reason, String message, Step next) {
return new FailedStep(reason, message, next);
return new FailedStep(null, reason, message, next);
}

/**
 * Creates an asynchronous step that sets the Domain condition to Failed, using an
 * explicitly supplied domain presence info.
 *
 * @param info Domain presence info handed to the created step
 * @param reason the reason for the failure
 * @param message a fuller description of the problem
 * @param next Next step
 * @return Step
 */
public static Step createFailedStep(
    DomainPresenceInfo info,
    String reason,
    String message,
    Step next) {
  return new FailedStep(info, reason, message, next);
}

abstract static class DomainStatusUpdaterStep extends Step {
private DomainPresenceInfo info = null;

DomainStatusUpdaterStep(Step next) {
super(next);
Expand All @@ -177,6 +208,14 @@ DomainStatusUpdaterContext createContext(Packet packet) {

@Override
public NextAction apply(Packet packet) {
if ((packet.getSpi(DomainPresenceInfo.class) == null)
&& (info != null)) {
packet
.getComponents()
.put(
ProcessingConstants.DOMAIN_COMPONENT_NAME,
Component.createFor(info));
}
DomainStatusUpdaterContext context = createContext(packet);
DomainStatus newStatus = context.getNewStatus();

Expand Down Expand Up @@ -556,8 +595,9 @@ private static class ProgressingStep extends DomainStatusUpdaterStep {
private final String reason;
private final boolean isPreserveAvailable;

private ProgressingStep(String reason, boolean isPreserveAvailable, Step next) {
private ProgressingStep(DomainPresenceInfo info, String reason, boolean isPreserveAvailable, Step next) {
super(next);
super.info = info;
this.reason = reason;
this.isPreserveAvailable = isPreserveAvailable;
}
Expand Down Expand Up @@ -602,8 +642,9 @@ private static class FailedStep extends DomainStatusUpdaterStep {
private final String reason;
private final String message;

private FailedStep(String reason, String message, Step next) {
private FailedStep(DomainPresenceInfo info, String reason, String message, Step next) {
super(next);
super.info = info;
this.reason = reason;
this.message = message;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,8 @@ void updatePacket(Packet packet, V1Job job) {
// be available for reading
@Override
boolean shouldTerminateFiber(V1Job job) {
return isFailed(job) && "DeadlineExceeded".equals(getFailedReason(job));
return isFailed(job) && ("DeadlineExceeded".equals(getFailedReason(job))
|| "BackoffLimitExceeded".equals(getFailedReason(job)));
}

// create an exception to terminate the fiber
Expand Down
77 changes: 59 additions & 18 deletions operator/src/main/java/oracle/kubernetes/operator/PodWatcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ThreadFactory;
Expand All @@ -21,6 +22,7 @@
import io.kubernetes.client.openapi.models.V1ContainerStatus;
import io.kubernetes.client.openapi.models.V1ObjectMeta;
import io.kubernetes.client.openapi.models.V1Pod;
import io.kubernetes.client.openapi.models.V1PodCondition;
import io.kubernetes.client.openapi.models.V1PodStatus;
import io.kubernetes.client.util.Watch;
import oracle.kubernetes.operator.TuningParameters.WatchTuning;
Expand All @@ -45,6 +47,14 @@ public class PodWatcher extends Watcher<V1Pod> implements WatchListener<V1Pod>,
private final String namespace;
private final WatchListener<V1Pod> listener;

public enum PodStatus {
PHASE_FAILED,
WAITING_NON_NULL_MESSAGE,
TERMINATED_ERROR_REASON,
UNSCHEDULABLE,
SUCCESS
}

// Map of Pod name to callback. Note that since each pod name can be mapped to multiple callback registrations,
// a concurrent map will not suffice; we therefore use an ordinary map and synchronous accesses.
private final Map<String, Collection<Consumer<V1Pod>>> modifiedCallbackRegistrations = new HashMap<>();
Expand Down Expand Up @@ -170,27 +180,58 @@ public void receivedResponse(Watch.Response<V1Pod> item) {
* @param pod pod
* @return true, if failed
*/
private static boolean isFailed(V1Pod pod) {
if (pod == null) {
return false;
}
static boolean isFailed(@Nonnull V1Pod pod) {

V1PodStatus status = pod.getStatus();
LOGGER.fine(
"PodWatcher.isFailed status of pod " + pod.getMetadata().getName() + ": " + status);
if (status != null) {
java.util.List<V1ContainerStatus> conStatuses = status.getContainerStatuses();
if (conStatuses != null) {
for (V1ContainerStatus conStatus : conStatuses) {
if (!isReady(conStatus)
&& (getContainerStateWaitingMessage(conStatus) != null
|| getContainerStateTerminatedReason(conStatus).contains("Error"))) {
return true;
}
}
}
"PodWatcher.isFailed status of pod " + pod.getMetadata().getName() + ": " + pod.getStatus());
return getContainerStatuses(pod).stream().anyMatch(PodWatcher::isPodFailed);
}

static PodStatus getPodStatus(@Nonnull V1Pod pod) {
V1ContainerStatus conStatus = getContainerStatuses(pod)
.stream()
.findFirst()
.orElse(new V1ContainerStatus());
String phase = Optional.ofNullable(pod.getStatus()).map(V1PodStatus::getPhase).orElse("");
if (phase.equals("Failed")) {
return PodStatus.PHASE_FAILED;
} else if (!isReady(conStatus) && getContainerStateWaitingMessage(conStatus) != null) {
return PodStatus.WAITING_NON_NULL_MESSAGE;
} else if (!isReady(conStatus) && getContainerStateTerminatedReason(conStatus).contains("Error")) {
return PodStatus.TERMINATED_ERROR_REASON;
} else if (isUnschedulable(pod)) {
return PodStatus.UNSCHEDULABLE;
}
return false;
return PodStatus.SUCCESS;
}

/**
 * Returns the container statuses recorded in the pod's status.
 *
 * @param pod the pod to inspect
 * @return the pod's container statuses, or an empty list when the pod has no status or
 *     the status has no container statuses
 */
static List<V1ContainerStatus> getContainerStatuses(@Nonnull V1Pod pod) {
  final V1PodStatus podStatus = pod.getStatus();
  if (podStatus == null) {
    return Collections.emptyList();
  }

  final List<V1ContainerStatus> containerStatuses = podStatus.getContainerStatuses();
  if (containerStatuses == null) {
    return Collections.emptyList();
  }
  return containerStatuses;
}

/**
 * Decides whether a single container status indicates a failure: the container is not
 * ready and either has a non-null waiting message or a terminated reason containing "Error".
 *
 * @param conStatus the container status to evaluate
 * @return true if the container status indicates a failure
 */
private static boolean isPodFailed(V1ContainerStatus conStatus) {
  if (isReady(conStatus)) {
    return false;
  }
  if (getContainerStateWaitingMessage(conStatus) != null) {
    return true;
  }
  return getContainerStateTerminatedReason(conStatus).contains("Error");
}

/**
 * Reports whether any of the pod's conditions marks it as unschedulable. The pod's name
 * and status are logged at FINE level first.
 *
 * @param pod the pod to inspect
 * @return true if at least one pod condition is accepted by {@code isPodUnschedulable}
 */
static boolean isUnschedulable(@Nonnull V1Pod pod) {
  LOGGER.fine("PodWatcher.isUnschedulable status of pod " + pod.getMetadata().getName() + ": " + pod.getStatus());

  for (V1PodCondition condition : getPodConditions(pod)) {
    if (isPodUnschedulable(condition)) {
      return true;
    }
  }
  return false;
}

/**
 * Returns the conditions recorded in the pod's status.
 *
 * @param pod the pod to inspect
 * @return the pod's conditions, or an empty list when the pod has no status or the status
 *     has no conditions
 */
private static List<V1PodCondition> getPodConditions(@Nonnull V1Pod pod) {
  final V1PodStatus podStatus = pod.getStatus();
  if (podStatus == null) {
    return Collections.emptyList();
  }

  final List<V1PodCondition> podConditions = podStatus.getConditions();
  if (podConditions == null) {
    return Collections.emptyList();
  }
  return podConditions;
}

/**
 * Tests whether a pod condition's reason contains the text "Unschedulable".
 *
 * @param podCondition the condition to test; a missing reason is treated as an empty string
 * @return true if the reason contains "Unschedulable"
 */
private static boolean isPodUnschedulable(V1PodCondition podCondition) {
  final String reason = getReason(podCondition);
  return reason.contains("Unschedulable");
}

/**
 * Null-safe accessor for a pod condition's reason.
 *
 * @param podCondition the condition to read; may be null
 * @return the condition's reason, or an empty string when the condition or its reason is null
 */
private static String getReason(V1PodCondition podCondition) {
  if (podCondition == null) {
    return "";
  }
  final String reason = podCondition.getReason();
  return reason == null ? "" : reason;
}

private static boolean isReady(V1ContainerStatus conStatus) {
Expand Down
Loading