Skip to content

OWLS-96896 - Restart Evicted Pods #2979

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Apr 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,8 @@ public class MessageKeys {
public static final String MISSING_ELASTIC_SEARCH_SECRET = "WLSKO-0223";
public static final String FLUENTD_CONFIGMAP_CREATED = "WLSKO-0224";
public static final String FLUENTD_CONFIGMAP_REPLACED = "WLSKO-0225";



public static final String POD_EVICTED = "WLSKO-0226";
public static final String POD_EVICTED_NO_RESTART = "WLSKO-0227";

// domain status messages
public static final String DUPLICATE_SERVER_NAME_FOUND = "WLSDO-0001";
Expand Down Expand Up @@ -216,6 +215,8 @@ public class MessageKeys {
public static final String INVALID_MODEL_HOME = "WLSDO-0039";
public static final String PODS_FAILED = "WLSDO-0040";
public static final String PODS_NOT_READY = "WLSDO-0041";
public static final String CYCLING_POD_EVICTED = "WLSDO-0042";
public static final String CYCLING_POD_SPEC_CHANGED = "WLSDO-0043";

// Private constructor: this class only exposes static message-key constants and is not meant to be instantiated.
private MessageKeys() {
}
Expand Down
4 changes: 4 additions & 0 deletions common/src/main/resources/Operator.properties
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ WLSKO-0223=When fluentdSpecification is specified in the domain spec, a secret c
must be specified in {0}
WLSKO-0224=Fluentd configmap created.
WLSKO-0225=Fluentd configmap replaced.
WLSKO-0226=Pod {0} was evicted due to {1}; validating domain
WLSKO-0227=Pod {0} was evicted due to {1} but the operator is configured not to restart it.

# Domain status messages

Expand Down Expand Up @@ -237,6 +239,8 @@ WLSDO-0041=One or more server pods that are supposed to be available are not rea
defined in 'serverPod.maxReadyWaitTimeSeconds' under "domain.spec', 'domain.adminServer', 'managedServer', or 'domain.cluster'. \
Check the server status in the domain status, the server pod status and logs, and WebLogic Server logs for possible reasons. \
Adjust the value of 'serverPod.maxReadyWaitTimeSeconds' setting if needed."
WLSDO-0042=Pod was evicted
WLSDO-0043=Pod spec has changed
oneEnvVar=variable
multipleEnvVars=variables
singularToBe=is
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
import static oracle.kubernetes.operator.helpers.PodHelper.getPodDomainUid;
import static oracle.kubernetes.operator.helpers.PodHelper.getPodName;
import static oracle.kubernetes.operator.helpers.PodHelper.getPodNamespace;
import static oracle.kubernetes.operator.helpers.PodHelper.getPodStatusMessage;
import static oracle.kubernetes.operator.logging.ThreadLoggingContext.setThreadContext;

public class DomainProcessorImpl implements DomainProcessor {
Expand Down Expand Up @@ -405,7 +406,15 @@ private void processServerPodWatch(V1Pod pod, String watchType) {
info.setServerPodBeingDeleted(serverName, Boolean.FALSE);
// fall through
case MODIFIED:
info.setServerPodFromEvent(serverName, pod);
boolean podPreviouslyEvicted = info.setServerPodFromEvent(serverName, pod, PodHelper::isEvicted);
if (PodHelper.isEvicted(pod) && !podPreviouslyEvicted) {
if (PodHelper.shouldRestartEvictedPod(pod)) {
LOGGER.info(MessageKeys.POD_EVICTED, getPodName(pod), getPodStatusMessage(pod));
createMakeRightOperation(info).interrupt().withExplicitRecheck().execute();
} else {
LOGGER.info(MessageKeys.POD_EVICTED_NO_RESTART, getPodName(pod), getPodStatusMessage(pod));
}
}
break;
case DELETED:
boolean removed = info.deleteServerPodFromEvent(serverName, pod);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ public interface KubernetesConstants {
String WEBHOOK_POD_UID_ENV = "WEBHOOK_POD_UID";
String NAMESPACE = "Namespace";
String POD = "Pod";
String EVICTED_REASON = "Evicted";

int DEFAULT_EXPORTER_SIDECAR_PORT = 8080;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -258,12 +258,13 @@ class PodTuning {
public final int livenessProbeFailureThreshold;
public final long introspectorJobActiveDeadlineSeconds;
public final long maxReadyWaitTimeSeconds;
public final boolean restartEvictedPods;

/**
* create pod tuning.
* @param readinessProbeInitialDelaySeconds readiness probe initial delay
* @param readinessProbeTimeoutSeconds readiness probe timeout
* @param readinessProbePeriodSeconds rediness probe period
* @param readinessProbePeriodSeconds readiness probe period
* @param readinessProbeSuccessThreshold readiness probe success threshold
* @param readinessProbeFailureThreshold readiness probe failure threshold
* @param livenessProbeInitialDelaySeconds liveness probe initial delay
Expand All @@ -273,6 +274,7 @@ class PodTuning {
* @param livenessProbeFailureThreshold liveness probe failure threshold
* @param introspectorJobActiveDeadlineSeconds introspector job active deadline
* @param maxReadyWaitTimeSeconds maximum wait time for server pod to reach ready state
* @param restartEvictedPods whether evicted server pods should be restarted
*/
public PodTuning(
int readinessProbeInitialDelaySeconds,
Expand All @@ -286,7 +288,8 @@ public PodTuning(
int livenessProbeSuccessThreshold,
int livenessProbeFailureThreshold,
long introspectorJobActiveDeadlineSeconds,
long maxReadyWaitTimeSeconds) {
long maxReadyWaitTimeSeconds,
boolean restartEvictedPods) {
this.readinessProbeInitialDelaySeconds = readinessProbeInitialDelaySeconds;
this.readinessProbeTimeoutSeconds = readinessProbeTimeoutSeconds;
this.readinessProbePeriodSeconds = readinessProbePeriodSeconds;
Expand All @@ -299,6 +302,7 @@ public PodTuning(
this.livenessProbeFailureThreshold = livenessProbeFailureThreshold;
this.introspectorJobActiveDeadlineSeconds = introspectorJobActiveDeadlineSeconds;
this.maxReadyWaitTimeSeconds = maxReadyWaitTimeSeconds;
this.restartEvictedPods = restartEvictedPods;
}

@Override
Expand All @@ -315,6 +319,7 @@ public String toString() {
.append("livenessProbeSuccessThreshold", livenessProbeSuccessThreshold)
.append("livenessProbeFailureThreshold", livenessProbeFailureThreshold)
.append("maxReadyWaitTimeSeconds", maxReadyWaitTimeSeconds)
.append("restartEvictedPods", restartEvictedPods)
.toString();
}

Expand All @@ -332,6 +337,7 @@ public int hashCode() {
.append(livenessProbeSuccessThreshold)
.append(livenessProbeFailureThreshold)
.append(maxReadyWaitTimeSeconds)
.append(restartEvictedPods)
.toHashCode();
}

Expand All @@ -356,6 +362,7 @@ public boolean equals(Object o) {
.append(livenessProbeSuccessThreshold, pt.livenessProbeSuccessThreshold)
.append(livenessProbeFailureThreshold, pt.livenessProbeFailureThreshold)
.append(maxReadyWaitTimeSeconds, pt.maxReadyWaitTimeSeconds)
.append(restartEvictedPods, pt.restartEvictedPods)
.isEquals();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ private void update() {
(int) readTuningParameter("livenessProbeSuccessThreshold", 1),
(int) readTuningParameter("livenessProbeFailureThreshold", 1),
readTuningParameter("introspectorJobActiveDeadlineSeconds", 120),
readTuningParameter("maxReadyWaitTimeSeconds", 1800));
readTuningParameter("maxReadyWaitTimeSeconds", 1800),
readBooleanTuningParameter("restartEvictedPods", true)
);

FeatureGates featureGates =
new FeatureGates(generateFeatureGates(get("featureGates")));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,16 @@ public long readTuningParameter(String parameter, long defaultValue) {
return defaultValue;
}

/**
 * Reads a tuning parameter and interprets it as a boolean.
 *
 * @param parameter name of the tuning parameter to look up
 * @param defaultValue value returned when the parameter has no configured value
 * @return the configured value parsed with {@link Boolean#parseBoolean(String)}, or
 *     {@code defaultValue} if the parameter is absent
 */
public boolean readBooleanTuningParameter(String parameter, boolean defaultValue) {
  final String configuredValue = get(parameter);
  if (configuredValue == null) {
    return defaultValue;
  }
  return Boolean.parseBoolean(configuredValue);
}

@Override
public int size() {
return Optional.ofNullable(mountPointDir.list()).map(list -> list.length).orElse(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,20 @@ public void setServerPodFromEvent(String serverName, V1Pod event) {
getSko(serverName).getPod().accumulateAndGet(event, this::getNewerPod);
}

/**
 * Applies an add or modify event for a server pod. If the current pod is newer than the one
 * associated with the event, ignores the event.
 *
 * @param serverName the name of the server associated with the event
 * @param event the pod associated with the event
 * @param podPredicate predicate evaluated against the pod that was recorded before the event was applied
 * @return the result of applying {@code podPredicate} to that previously recorded pod
 */
public boolean setServerPodFromEvent(String serverName, V1Pod event, @Nonnull Predicate<V1Pod> podPredicate) {
  updateStatus(serverName, event);
  // Capture the value held before accumulation so the caller can inspect the prior pod state.
  final V1Pod previousPod = getSko(serverName).getPod().getAndAccumulate(event, this::getNewerPod);
  return podPredicate.test(previousPod);
}

private void updateStatus(String serverName, V1Pod event) {
getSko(serverName)
.getLastKnownStatus()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import oracle.kubernetes.weblogic.domain.model.ServerSpec;
import oracle.kubernetes.weblogic.domain.model.Shutdown;

import static oracle.kubernetes.operator.KubernetesConstants.EVICTED_REASON;
import static oracle.kubernetes.operator.LabelConstants.CLUSTERNAME_LABEL;
import static oracle.kubernetes.operator.LabelConstants.SERVERNAME_LABEL;
import static oracle.kubernetes.operator.ProcessingConstants.SERVERS_TO_ROLL;
Expand Down Expand Up @@ -223,6 +224,38 @@ public static boolean isFailed(V1Pod pod) {
return false;
}

/**
 * Checks whether the pod's status reports that it was evicted.
 *
 * @param pod the pod to inspect; may be null
 * @return true if the pod has a status that is in the failed phase with "Evicted" as the reason;
 *     false if the pod or its status is null, or the status does not indicate eviction
 */
public static boolean isEvicted(V1Pod pod) {
  if (pod == null) {
    return false;
  }
  final V1PodStatus podStatus = pod.getStatus();
  if (podStatus == null) {
    return false;
  }
  return isEvicted(podStatus);
}

/**
 * Checks whether the pod status shows that the pod is evicted.
 *
 * @param status pod status to be checked
 * @return true if the status phase is FAILED and its reason equals {@code EVICTED_REASON},
 *     false otherwise
 */
public static boolean isEvicted(@Nonnull V1PodStatus status) {
  if (!V1PodStatus.PhaseEnum.FAILED.equals(status.getPhase())) {
    return false;
  }
  return EVICTED_REASON.equals(status.getReason());
}

/**
 * Decides whether an evicted pod should be restarted.
 *
 * @param pod the pod to inspect
 * @return true only if the pod was evicted and the {@code restartEvictedPods} pod tuning
 *     setting is enabled
 */
public static boolean shouldRestartEvictedPod(V1Pod pod) {
  // The tuning parameters are consulted only for pods that were actually evicted.
  if (!isEvicted(pod)) {
    return false;
  }
  return TuningParameters.getInstance().getPodTuning().restartEvictedPods;
}

/**
* Returns the domain UID associated with the specified pod.
* @param pod the pod
Expand All @@ -245,6 +278,17 @@ public static String getPodServerName(V1Pod pod) {
return null;
}

/**
 * Returns the message recorded in the pod's status.
 *
 * @param pod the pod to inspect; may be null
 * @return the status message, or null if the pod, its status, or the message is absent
 */
public static String getPodStatusMessage(V1Pod pod) {
  if (pod == null) {
    return null;
  }
  final V1PodStatus podStatus = pod.getStatus();
  return podStatus == null ? null : podStatus.getMessage();
}

/**
* Factory for {@link Step} that creates admin server pod.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@

import static oracle.kubernetes.common.CommonConstants.COMPATIBILITY_MODE;
import static oracle.kubernetes.common.helpers.AuxiliaryImageEnvVars.AUXILIARY_IMAGE_MOUNT_PATH;
import static oracle.kubernetes.common.logging.MessageKeys.CYCLING_POD_EVICTED;
import static oracle.kubernetes.common.logging.MessageKeys.CYCLING_POD_SPEC_CHANGED;
import static oracle.kubernetes.operator.DomainStatusUpdater.createKubernetesFailureSteps;
import static oracle.kubernetes.operator.IntrospectorConfigMapConstants.NUM_CONFIG_MAPS;
import static oracle.kubernetes.operator.KubernetesConstants.DEFAULT_EXPORTER_SIDECAR_PORT;
Expand Down Expand Up @@ -457,8 +459,13 @@ private Map<String, String> getNonHashedPodAnnotations() {

abstract String getPodReplacedMessageKey();

/**
 * Creates the step that cycles a pod because it was evicted.
 *
 * @param pod the evicted pod
 * @param next the step to run afterwards
 * @return a {@code CyclePodStep} carrying the formatted CYCLING_POD_EVICTED message
 */
Step cycleEvictedPodStep(V1Pod pod, Step next) {
  final String evictedMessage = LOGGER.formatMessage(CYCLING_POD_EVICTED);
  return new CyclePodStep(pod, next, evictedMessage);
}

/**
 * Creates the steps that cycle a pod because its spec has changed: a start-roll status update
 * followed by the pod cycle itself.
 *
 * @param pod the pod to cycle
 * @param next the step to run afterwards
 * @return the chained steps
 */
Step createCyclePodStep(V1Pod pod, Step next) {
  // CyclePodStep takes the message for the cycle event as its third argument;
  // the superseded two-argument construction has been dropped.
  return Step.chain(DomainStatusUpdater.createStartRollStep(),
      new CyclePodStep(pod, next, LOGGER.formatMessage(CYCLING_POD_SPEC_CHANGED)));
}

private boolean isLegacyAuxImageOperatorVersion(String operatorVersion) {
Expand Down Expand Up @@ -1012,10 +1019,12 @@ private Step getConflictStep() {

public class CyclePodStep extends BaseStep {
private final V1Pod pod;
private final String message;

/**
 * Creates a step that cycles the given pod.
 *
 * @param pod the pod to cycle
 * @param next the step to run afterwards
 * @param message text stored with the step and passed to the pod-cycle-starting event
 */
CyclePodStep(V1Pod pod, Step next, String message) {
  super(next);
  this.pod = pod;
  this.message = message;
}

private ResponseStep<Object> deleteResponse(V1Pod pod, Step next) {
Expand Down Expand Up @@ -1049,7 +1058,7 @@ public NextAction apply(Packet packet) {

private Step createCyclePodEventStep(Step next) {
LOGGER.info(MessageKeys.CYCLING_POD, Objects.requireNonNull(pod.getMetadata()).getName());
return Step.chain(EventHelper.createEventStep(new EventData(POD_CYCLE_STARTING).podName(getPodName())),
return Step.chain(EventHelper.createEventStep(new EventData(POD_CYCLE_STARTING, message).podName(getPodName())),
next);
}
}
Expand Down Expand Up @@ -1221,6 +1230,8 @@ public NextAction apply(Packet packet) {
return doNext(createNewPod(getNext()), packet);
} else if (!canUseCurrentPod(currentPod)) {
return doNext(replaceCurrentPod(currentPod, getNext()), packet);
} else if (PodHelper.shouldRestartEvictedPod(currentPod)) {
return doNext(cycleEvictedPodStep(currentPod, getNext()), packet);
} else if (mustPatchPod(currentPod)) {
return doNext(patchCurrentPod(currentPod, getNext()), packet);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,15 @@ private void addServersToFactory(@Nonnull ServersUpStepFactory factory, @Nonnull
factory.addServerIfNeeded(serverConfig.wlsServerConfig, serverConfig.wlsClusterConfig);
}

info.getServerPods().filter(pod -> !factory.getServers().contains(getPodServerName(pod)))
info.getServerPods().filter(pod -> podShouldNotBeRunning(pod, factory))
.filter(pod -> !getPodServerName(pod).equals(wlsDomainConfig.getAdminServerName()))
.forEach(pod -> shutdownServersNotPresentInDomainConfig(factory, pod));
}

/**
 * Tells whether a pod belongs to a server that the factory does not list among its servers.
 *
 * @param pod the server pod to check
 * @param factory the factory holding the servers
 * @return true if the pod's server name is not among the factory's servers
 */
private boolean podShouldNotBeRunning(V1Pod pod, ServersUpStepFactory factory) {
  final boolean serverIsListed = factory.getServers().contains(getPodServerName(pod));
  return !serverIsListed;
}

private void shutdownServersNotPresentInDomainConfig(ServersUpStepFactory factory, V1Pod pod) {
WlsServerConfig serverConfig = new WlsServerConfig(getPodServerName(pod), PodHelper.getPodName(pod), 0);
factory.addShutdownInfo(new ServerShutdownInfo(serverConfig, pod.getMetadata().getClusterName(), null, false));
Expand Down
Loading