Skip to content

Detect and shut down stuck server pods #2027

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Nov 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions operator/src/main/java/oracle/kubernetes/operator/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,10 @@ public class Main {
private static final AtomicReference<DateTime> lastFullRecheck =
new AtomicReference<>(DateTime.now());
private static final Semaphore shutdownSignal = new Semaphore(0);
private static final int DEFAULT_STUCK_POD_RECHECK_SECONDS = 30;

private final MainDelegate delegate;
private final StuckPodProcessing stuckPodProcessing;
private NamespaceWatcher namespaceWatcher;
private boolean warnedOfCrdAbsence;

Expand Down Expand Up @@ -291,6 +293,7 @@ DomainNamespaces getDomainNamespaces() {

Main(MainDelegate delegate) {
this.delegate = delegate;
stuckPodProcessing = new StuckPodProcessing(delegate);
}

void startOperator(Runnable completionAction) {
Expand Down Expand Up @@ -336,10 +339,9 @@ private void completeBegin() {

// start periodic retry and recheck
int recheckInterval = TuningParameters.getInstance().getMainTuning().domainNamespaceRecheckIntervalSeconds;
delegate.getEngine()
.getExecutor()
.scheduleWithFixedDelay(
recheckDomains(), recheckInterval, recheckInterval, TimeUnit.SECONDS);
int stuckPodInterval = getStuckPodInterval();
delegate.scheduleWithFixedDelay(recheckDomains(), recheckInterval, recheckInterval, TimeUnit.SECONDS);
delegate.scheduleWithFixedDelay(checkStuckPods(), stuckPodInterval, stuckPodInterval, TimeUnit.SECONDS);

markReadyAndStartLivenessThread();

Expand All @@ -348,6 +350,13 @@ private void completeBegin() {
}
}

private int getStuckPodInterval() {
return Optional.ofNullable(TuningParameters.getInstance())
.map(TuningParameters::getMainTuning)
.map(t -> t.stuckPodRecheckSeconds)
.orElse(DEFAULT_STUCK_POD_RECHECK_SECONDS);
}

NamespaceWatcher getNamespaceWatcher() {
return namespaceWatcher;
}
Expand All @@ -360,6 +369,11 @@ Runnable recheckDomains() {
return () -> delegate.runSteps(createDomainRecheckSteps());
}

Runnable checkStuckPods() {
return () -> getDomainNamespaces().getNamespaces().forEach(stuckPodProcessing::checkStuckPods);
}


Step createDomainRecheckSteps() {
return createDomainRecheckSteps(DateTime.now());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@

package oracle.kubernetes.operator;

import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;

import oracle.kubernetes.operator.helpers.KubernetesVersion;
import oracle.kubernetes.operator.helpers.SemanticVersion;
import oracle.kubernetes.operator.logging.LoggingFacade;
Expand Down Expand Up @@ -36,4 +39,6 @@ default void runSteps(Step firstStep) {
DomainNamespaces getDomainNamespaces();

KubernetesVersion getKubernetesVersion();

ScheduledFuture<?> scheduleWithFixedDelay(Runnable command, long initialDelay, long delay, TimeUnit unit);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
// Copyright (c) 2020, Oracle Corporation and/or its affiliates.
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.

package oracle.kubernetes.operator;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import javax.annotation.Nonnull;

import io.kubernetes.client.openapi.models.V1ObjectMeta;
import io.kubernetes.client.openapi.models.V1Pod;
import io.kubernetes.client.openapi.models.V1PodList;
import oracle.kubernetes.operator.calls.CallResponse;
import oracle.kubernetes.operator.helpers.CallBuilder;
import oracle.kubernetes.operator.helpers.PodHelper;
import oracle.kubernetes.operator.logging.LoggingFacade;
import oracle.kubernetes.operator.logging.LoggingFactory;
import oracle.kubernetes.operator.steps.DefaultResponseStep;
import oracle.kubernetes.operator.work.NextAction;
import oracle.kubernetes.operator.work.Packet;
import oracle.kubernetes.operator.work.Step;
import oracle.kubernetes.utils.SystemClock;
import org.joda.time.DateTime;

import static oracle.kubernetes.operator.logging.MessageKeys.POD_FORCE_DELETED;

/**
* Under certain circumstances, when a Kubernetes node goes down, it may mark its pods as terminating, but never
* actually remove them. This code detects such cases, deletes the pods and triggers the necessary make-right flows.
*/
public class StuckPodProcessing {
private static final LoggingFacade LOGGER = LoggingFactory.getLogger("Operator", "Operator");

private final MainDelegate mainDelegate;

public StuckPodProcessing(MainDelegate mainDelegate) {
this.mainDelegate = mainDelegate;
}

void checkStuckPods(String namespace) {
Step step = new CallBuilder()
.withLabelSelectors(LabelConstants.getCreatedbyOperatorSelector())
.listPodAsync(namespace, new PodListProcessing(namespace, SystemClock.now()));
mainDelegate.runSteps(step);
}

@SuppressWarnings("unchecked")
private List<V1Pod> getStuckPodList(Packet packet) {
return (List<V1Pod>) packet.computeIfAbsent("STUCK_PODS", k -> new ArrayList<>());
}

class PodListProcessing extends DefaultResponseStep<V1PodList> {

private final DateTime now;

public PodListProcessing(String namespace, DateTime dateTime) {
super(new PodActionsStep(namespace));
now = dateTime;
}

@Override
public NextAction onSuccess(Packet packet, CallResponse<V1PodList> callResponse) {
callResponse.getResult().getItems().stream()
.filter(pod -> isStuck(pod, now))
.forEach(pod -> addStuckPodToPacket(packet, pod));

return doContinueListOrNext(callResponse, packet);
}

private boolean isStuck(V1Pod pod, DateTime now) {
return getExpectedDeleteTime(pod).isBefore(now);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • The stuck check potentially involves two different machine clocks that might be out of sync - the operator's and the pod's. I just did some brief research, and it looks like k8s doesn't require strict clock synchronization although it's considered desirable (in addition, WL clusters have fairly tight needs).
  • Should there be an additional grace after a detecting a potentially stuck delete in order to give K8S time to 'naturally' honor its timeout and perhaps to help account for a 'small' amount of potential clock drift?

Copy link
Member Author

@russgold russgold Nov 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That seems a reasonable precaution - how much drift should we be concerned about?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tough question. Off the top my head, maybe account for a few seconds of drift -- I've seen much worse on occasion, but, in general, WL clusters themselves dislike a drift any higher than that in my experience. Plus perhaps account for a few extra seconds to give k8s time to handle the delete 'naturally'. So that would total 10 seconds? And perhaps have an associated configurable?

@rjeberhard Thoughts?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I keep any accounting for drift very small; perhaps just 1 or 2 seconds. It's subjective, but when I tested the solution it still "felt" like I waited a long time before the terminated pod was removed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, what's the harm? If we delete a pod that is going to be deleted anyway, it should still be fine, right?

Copy link

@tbarnes-us tbarnes-us Nov 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's my understanding this change initiates a forced delete - so in-memory-replicated-state data loss is possible if the forced delete occurs too early for the pod to perform its graceful shutdown

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The harm is that instead of catching only a pod that is "stuck" in the Terminating state that we could also detect pods that are in the process of shutting down normally and delete them before they've had a chance to complete normal shut down. This should only happen if the operator's pod is badly skewed from the master and the operator thinks the time is later than the master does.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wish that we could find a way to know that time at the master, but I've not been able to find it.

}

private DateTime getExpectedDeleteTime(V1Pod pod) {
return getDeletionTimeStamp(pod).plusSeconds((int) getDeletionGracePeriodSeconds(pod));
}

private long getDeletionGracePeriodSeconds(V1Pod pod) {
return Optional.of(pod).map(V1Pod::getMetadata).map(V1ObjectMeta::getDeletionGracePeriodSeconds).orElse(1L);
}

private DateTime getDeletionTimeStamp(V1Pod pod) {
return Optional.of(pod).map(V1Pod::getMetadata).map(V1ObjectMeta::getDeletionTimestamp).orElse(SystemClock.now());
}

private void addStuckPodToPacket(Packet packet, V1Pod stuckPod) {
getStuckPodList(packet).add(stuckPod);
}
}

class PodActionsStep extends Step {

private final String namespace;

public PodActionsStep(String namespace) {
this.namespace = namespace;
}

@Override
public NextAction apply(Packet packet) {
final List<V1Pod> stuckPodList = getStuckPodList(packet);
if (stuckPodList.isEmpty()) {
return doNext(packet);
} else {
Collection<StepAndPacket> startDetails = new ArrayList<>();

for (V1Pod pod : stuckPodList) {
startDetails.add(new StepAndPacket(createForcedDeletePodStep(pod), packet.clone()));
}
return doForkJoin(readExistingNamespaces(), packet, startDetails);
}
}

@Nonnull
private Step readExistingNamespaces() {
return mainDelegate.getDomainNamespaces().readExistingResources(namespace, mainDelegate.getDomainProcessor());
}

private Step createForcedDeletePodStep(V1Pod pod) {
return new CallBuilder()
.withGracePeriodSeconds(0)
.deletePodAsync(getName(pod), getNamespace(pod), getDomainUid(pod), null,
new ForcedDeleteResponseStep(getName(pod), getNamespace(pod)));
}

private String getName(V1Pod pod) {
return Objects.requireNonNull(pod.getMetadata()).getName();
}

private String getNamespace(V1Pod pod) {
return Objects.requireNonNull(pod.getMetadata()).getNamespace();
}

private String getDomainUid(V1Pod pod) {
return PodHelper.getPodDomainUid(pod);
}
}

static class ForcedDeleteResponseStep extends DefaultResponseStep<V1Pod> {

private final String name;
private final String namespace;

public ForcedDeleteResponseStep(String name, String namespace) {
this.name = name;
this.namespace = namespace;
}

@Override
public NextAction onSuccess(Packet packet, CallResponse<V1Pod> callResponse) {
LOGGER.info(POD_FORCE_DELETED, name, namespace);
return super.onSuccess(packet, callResponse);
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public static class MainTuning {
public final int domainNamespaceRecheckIntervalSeconds;
public final int statusUpdateTimeoutSeconds;
public final int unchangedCountToDelayStatusRecheck;
public final int stuckPodRecheckSeconds;
public final long initialShortDelay;
public final long eventualLongDelay;

Expand All @@ -48,24 +49,27 @@ public static class MainTuning {
* @param domainNamespaceRecheckIntervalSeconds domain namespace recheck interval
* @param statusUpdateTimeoutSeconds status update timeout
* @param unchangedCountToDelayStatusRecheck unchanged count to delay status recheck
* @param stuckPodRecheckSeconds time between checks for stuck pods
* @param initialShortDelay initial short delay
* @param eventualLongDelay eventual long delay
*/
public MainTuning(
int domainPresenceFailureRetrySeconds,
int domainPresenceFailureRetryMaxCount,
int domainPresenceRecheckIntervalSeconds,
int domainNamespaceRecheckIntervalSeconds,
int statusUpdateTimeoutSeconds,
int unchangedCountToDelayStatusRecheck,
long initialShortDelay,
long eventualLongDelay) {
int domainPresenceFailureRetrySeconds,
int domainPresenceFailureRetryMaxCount,
int domainPresenceRecheckIntervalSeconds,
int domainNamespaceRecheckIntervalSeconds,
int statusUpdateTimeoutSeconds,
int unchangedCountToDelayStatusRecheck,
int stuckPodRecheckSeconds,
long initialShortDelay,
long eventualLongDelay) {
this.domainPresenceFailureRetrySeconds = domainPresenceFailureRetrySeconds;
this.domainPresenceFailureRetryMaxCount = domainPresenceFailureRetryMaxCount;
this.domainPresenceRecheckIntervalSeconds = domainPresenceRecheckIntervalSeconds;
this.domainNamespaceRecheckIntervalSeconds = domainNamespaceRecheckIntervalSeconds;
this.statusUpdateTimeoutSeconds = statusUpdateTimeoutSeconds;
this.unchangedCountToDelayStatusRecheck = unchangedCountToDelayStatusRecheck;
this.stuckPodRecheckSeconds = stuckPodRecheckSeconds;
this.initialShortDelay = initialShortDelay;
this.eventualLongDelay = eventualLongDelay;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ private void update() {
(int) readTuningParameter("domainNamespaceRecheckIntervalSeconds", 3),
(int) readTuningParameter("statusUpdateTimeoutSeconds", 10),
(int) readTuningParameter("statusUpdateUnchangedCountToDelayStatusRecheck", 10),
(int) readTuningParameter("stuckPodRecheckSeconds", 30),
readTuningParameter("statusUpdateInitialShortDelay", 5),
readTuningParameter("statusUpdateEventualLongDelay", 30));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ public AsyncRequestStep(
String labelSelector,
String resourceVersion) {
this(next, requestParams, factory, null, helper, timeoutSeconds, maxRetryCount,
fieldSelector, labelSelector, resourceVersion);
null, fieldSelector, labelSelector, resourceVersion);
}

/**
Expand All @@ -108,6 +108,7 @@ public AsyncRequestStep(
ClientPool helper,
int timeoutSeconds,
int maxRetryCount,
Integer gracePeriodSeconds,
String fieldSelector,
String labelSelector,
String resourceVersion) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ <T> Step createRequestAsync(
ClientPool helper,
int timeoutSeconds,
int maxRetryCount,
Integer gracePeriodSeconds,
String fieldSelector,
String labelSelector,
String resourceVersion);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ public <T> T execute(
private final CallFactory<V1Secret> readSecret =
(requestParams, usage, cont, callback) ->
wrap(readSecretAsync(usage, requestParams.name, requestParams.namespace, callback));
private final Integer gracePeriodSeconds = null;
private Integer gracePeriodSeconds = null;
private final Boolean orphanDependents = null;
private final String propagationPolicy = null;

Expand Down Expand Up @@ -540,6 +540,11 @@ public CallBuilder withTimeoutSeconds(int timeoutSeconds) {
return this;
}

public CallBuilder withGracePeriodSeconds(int gracePeriodSeconds) {
this.gracePeriodSeconds = gracePeriodSeconds;
return this;
}

private void tuning(int limit, int timeoutSeconds, int maxRetryCount) {
this.limit = limit;
this.timeoutSeconds = timeoutSeconds;
Expand Down Expand Up @@ -1909,6 +1914,7 @@ private <T> Step createRequestAsync(
helper,
timeoutSeconds,
maxRetryCount,
gracePeriodSeconds,
fieldSelector,
labelSelector,
resourceVersion);
Expand All @@ -1924,6 +1930,7 @@ private <T> Step createRequestAsync(
helper,
timeoutSeconds,
maxRetryCount,
gracePeriodSeconds,
fieldSelector,
labelSelector,
resourceVersion);
Expand All @@ -1939,6 +1946,7 @@ private <T> Step createRequestAsync(
helper,
timeoutSeconds,
maxRetryCount,
gracePeriodSeconds,
fieldSelector,
labelSelector,
resourceVersion);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ public class MessageKeys {
public static final String INTROSPECTOR_JOB_FAILED_DETAIL = "WLSKO-0176";
public static final String INTROSPECTOR_POD_FAILED = "WLSKO-0177";
public static final String CRD_NOT_INSTALLED = "WLSKO-0178";
public static final String POD_FORCE_DELETED = "WLSKO-0179";

// domain status messages
public static final String DUPLICATE_SERVER_NAME_FOUND = "WLSDO-0001";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@
/** A wrapper for the system clock that facilitates unit testing of time. */
public abstract class SystemClock {

private static SystemClock DELEGATE =
new SystemClock() {
// Leave as non-final; unit tests may replace this value
@SuppressWarnings("FieldMayBeFinal")
private static SystemClock DELEGATE = new SystemClock() {
@Override
public DateTime getCurrentTime() {
return DateTime.now();
Expand Down
1 change: 1 addition & 0 deletions operator/src/main/resources/Operator.properties
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ WLSKO-0175=Job {0} in namespace {1} failed with status {2}. Check log messages \
WLSKO-0176=Job {1} in namespace {0} failed, job details are {2}
WLSKO-0177=Pod {0} in namespace {1} failed, the pod status is {2}
WLSKO-0178=Operator cannot proceed, as the Custom Resource Definition for ''domains.weblogic.oracle'' is not installed.
WLSKO-0179=Pod {0} in namespace {1} detected as stuck, and force-deleted

# Domain status messages

Expand Down
Loading