Skip to content

Commit d85f5c9

Browse files
authored
OWLS-96896 - Restart Evicted Pods (oracle#2979)
* restart evicted server pods
1 parent 564c9a0 commit d85f5c9

File tree

13 files changed

+265
-16
lines changed

13 files changed

+265
-16
lines changed

common/src/main/java/oracle/kubernetes/common/logging/MessageKeys.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,8 @@ public class MessageKeys {
175175
public static final String MISSING_ELASTIC_SEARCH_SECRET = "WLSKO-0223";
176176
public static final String FLUENTD_CONFIGMAP_CREATED = "WLSKO-0224";
177177
public static final String FLUENTD_CONFIGMAP_REPLACED = "WLSKO-0225";
178-
179-
180-
178+
public static final String POD_EVICTED = "WLSKO-0226";
179+
public static final String POD_EVICTED_NO_RESTART = "WLSKO-0227";
181180

182181
// domain status messages
183182
public static final String DUPLICATE_SERVER_NAME_FOUND = "WLSDO-0001";
@@ -216,6 +215,8 @@ public class MessageKeys {
216215
public static final String INVALID_MODEL_HOME = "WLSDO-0039";
217216
public static final String PODS_FAILED = "WLSDO-0040";
218217
public static final String PODS_NOT_READY = "WLSDO-0041";
218+
public static final String CYCLING_POD_EVICTED = "WLSDO-0042";
219+
public static final String CYCLING_POD_SPEC_CHANGED = "WLSDO-0043";
219220

220221
private MessageKeys() {
221222
}

common/src/main/resources/Operator.properties

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,8 @@ WLSKO-0223=When fluentdSpecification is specified in the domain spec, a secret c
185185
must be specified in {0}
186186
WKSKO-0224=Fluentd configmap created.
187187
WKSKO-0225=Fluentd configmap replaced.
188+
WLSKO-0226=Pod {0} was evicted due to {1}; validating domain
189+
WLSKO-0227=Pod {0} was evicted due to {1} but the operator is configured not to restart it.
188190

189191
# Domain status messages
190192

@@ -237,6 +239,8 @@ WLSDO-0041=One or more server pods that are supposed to be available are not rea
237239
defined in 'serverPod.maxReadyWaitTimeSeconds' under "domain.spec', 'domain.adminServer', 'managedServer', or 'domain.cluster'. \
238240
Check the server status in the domain status, the server pod status and logs, and WebLogic Server logs for possible reasons. \
239241
Adjust the value of 'serverPod.maxReadyWaitTimeSeconds' setting if needed."
242+
WLSDO-0042=Pod was evicted
243+
WLSDO-0043=Pod spec has changed
240244
oneEnvVar=variable
241245
multipleEnvVars=variables
242246
singularToBe=is

operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@
9191
import static oracle.kubernetes.operator.helpers.PodHelper.getPodDomainUid;
9292
import static oracle.kubernetes.operator.helpers.PodHelper.getPodName;
9393
import static oracle.kubernetes.operator.helpers.PodHelper.getPodNamespace;
94+
import static oracle.kubernetes.operator.helpers.PodHelper.getPodStatusMessage;
9495
import static oracle.kubernetes.operator.logging.ThreadLoggingContext.setThreadContext;
9596

9697
public class DomainProcessorImpl implements DomainProcessor {
@@ -405,7 +406,15 @@ private void processServerPodWatch(V1Pod pod, String watchType) {
405406
info.setServerPodBeingDeleted(serverName, Boolean.FALSE);
406407
// fall through
407408
case MODIFIED:
408-
info.setServerPodFromEvent(serverName, pod);
409+
boolean podPreviouslyEvicted = info.setServerPodFromEvent(serverName, pod, PodHelper::isEvicted);
410+
if (PodHelper.isEvicted(pod) && !podPreviouslyEvicted) {
411+
if (PodHelper.shouldRestartEvictedPod(pod)) {
412+
LOGGER.info(MessageKeys.POD_EVICTED, getPodName(pod), getPodStatusMessage(pod));
413+
createMakeRightOperation(info).interrupt().withExplicitRecheck().execute();
414+
} else {
415+
LOGGER.info(MessageKeys.POD_EVICTED_NO_RESTART, getPodName(pod), getPodStatusMessage(pod));
416+
}
417+
}
409418
break;
410419
case DELETED:
411420
boolean removed = info.deleteServerPodFromEvent(serverName, pod);

operator/src/main/java/oracle/kubernetes/operator/KubernetesConstants.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ public interface KubernetesConstants {
4242
String WEBHOOK_POD_UID_ENV = "WEBHOOK_POD_UID";
4343
String NAMESPACE = "Namespace";
4444
String POD = "Pod";
45+
String EVICTED_REASON = "Evicted";
4546

4647
int DEFAULT_EXPORTER_SIDECAR_PORT = 8080;
4748

operator/src/main/java/oracle/kubernetes/operator/TuningParameters.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,12 +258,13 @@ class PodTuning {
258258
public final int livenessProbeFailureThreshold;
259259
public final long introspectorJobActiveDeadlineSeconds;
260260
public final long maxReadyWaitTimeSeconds;
261+
public final boolean restartEvictedPods;
261262

262263
/**
263264
* create pod tuning.
264265
* @param readinessProbeInitialDelaySeconds readiness probe initial delay
265266
* @param readinessProbeTimeoutSeconds readiness probe timeout
266-
* @param readinessProbePeriodSeconds rediness probe period
267+
* @param readinessProbePeriodSeconds readiness probe period
267268
* @param readinessProbeSuccessThreshold readiness probe success threshold
268269
* @param readinessProbeFailureThreshold readiness probe failure threshold
269270
* @param livenessProbeInitialDelaySeconds liveness probe initial delay
@@ -273,6 +274,7 @@ class PodTuning {
273274
* @param livenessProbeFailureThreshold liveness probe failure threshold
274275
* @param introspectorJobActiveDeadlineSeconds introspector job active deadline
275276
* @param maxReadyWaitTimeSeconds maximum wait time for server pod to reach ready state
277+
* @param restartEvictedPods whether evicted server pods should be restarted
276278
*/
277279
public PodTuning(
278280
int readinessProbeInitialDelaySeconds,
@@ -286,7 +288,8 @@ public PodTuning(
286288
int livenessProbeSuccessThreshold,
287289
int livenessProbeFailureThreshold,
288290
long introspectorJobActiveDeadlineSeconds,
289-
long maxReadyWaitTimeSeconds) {
291+
long maxReadyWaitTimeSeconds,
292+
boolean restartEvictedPods) {
290293
this.readinessProbeInitialDelaySeconds = readinessProbeInitialDelaySeconds;
291294
this.readinessProbeTimeoutSeconds = readinessProbeTimeoutSeconds;
292295
this.readinessProbePeriodSeconds = readinessProbePeriodSeconds;
@@ -299,6 +302,7 @@ public PodTuning(
299302
this.livenessProbeFailureThreshold = livenessProbeFailureThreshold;
300303
this.introspectorJobActiveDeadlineSeconds = introspectorJobActiveDeadlineSeconds;
301304
this.maxReadyWaitTimeSeconds = maxReadyWaitTimeSeconds;
305+
this.restartEvictedPods = restartEvictedPods;
302306
}
303307

304308
@Override
@@ -315,6 +319,7 @@ public String toString() {
315319
.append("livenessProbeSuccessThreshold", livenessProbeSuccessThreshold)
316320
.append("livenessProbeFailureThreshold", livenessProbeFailureThreshold)
317321
.append("maxReadyWaitTimeSeconds", maxReadyWaitTimeSeconds)
322+
.append("restartEvictedPods", restartEvictedPods)
318323
.toString();
319324
}
320325

@@ -332,6 +337,7 @@ public int hashCode() {
332337
.append(livenessProbeSuccessThreshold)
333338
.append(livenessProbeFailureThreshold)
334339
.append(maxReadyWaitTimeSeconds)
340+
.append(restartEvictedPods)
335341
.toHashCode();
336342
}
337343

@@ -356,6 +362,7 @@ public boolean equals(Object o) {
356362
.append(livenessProbeSuccessThreshold, pt.livenessProbeSuccessThreshold)
357363
.append(livenessProbeFailureThreshold, pt.livenessProbeFailureThreshold)
358364
.append(maxReadyWaitTimeSeconds, pt.maxReadyWaitTimeSeconds)
365+
.append(restartEvictedPods, pt.restartEvictedPods)
359366
.isEquals();
360367
}
361368
}

operator/src/main/java/oracle/kubernetes/operator/TuningParametersImpl.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,9 @@ private void update() {
9696
(int) readTuningParameter("livenessProbeSuccessThreshold", 1),
9797
(int) readTuningParameter("livenessProbeFailureThreshold", 1),
9898
readTuningParameter("introspectorJobActiveDeadlineSeconds", 120),
99-
readTuningParameter("maxReadyWaitTimeSeconds", 1800));
99+
readTuningParameter("maxReadyWaitTimeSeconds", 1800),
100+
readBooleanTuningParameter("restartEvictedPods", true)
101+
);
100102

101103
FeatureGates featureGates =
102104
new FeatureGates(generateFeatureGates(get("featureGates")));

operator/src/main/java/oracle/kubernetes/operator/helpers/ConfigMapConsumer.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,16 @@ public long readTuningParameter(String parameter, long defaultValue) {
8282
return defaultValue;
8383
}
8484

85+
/**
86+
* read boolean tuning parameter.
87+
* @param parameter parameter
88+
* @param defaultValue default value
89+
* @return the parsed boolean value of the parameter, or the default value if the parameter is not set
90+
*/
91+
public boolean readBooleanTuningParameter(String parameter, boolean defaultValue) {
92+
return Optional.ofNullable(get(parameter)).map(Boolean::parseBoolean).orElse(defaultValue);
93+
}
94+
8595
@Override
8696
public int size() {
8797
return Optional.ofNullable(mountPointDir.list()).map(list -> list.length).orElse(0);

operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,20 @@ public void setServerPodFromEvent(String serverName, V1Pod event) {
284284
getSko(serverName).getPod().accumulateAndGet(event, this::getNewerPod);
285285
}
286286

287+
/**
288+
* Applies an add or modify event for a server pod. If the current pod is newer than the one
289+
* associated with the event, ignores the event.
290+
*
291+
* @param serverName the name of the server associated with the event
292+
* @param event the pod associated with the event
293+
* @param podPredicate predicate to be applied to the original pod
294+
* @return the result of applying the supplied predicate to the pod that was recorded before this event was applied
295+
*/
296+
public boolean setServerPodFromEvent(String serverName, V1Pod event, @Nonnull Predicate<V1Pod> podPredicate) {
297+
updateStatus(serverName, event);
298+
return podPredicate.test(getSko(serverName).getPod().getAndAccumulate(event, this::getNewerPod));
299+
}
300+
287301
private void updateStatus(String serverName, V1Pod event) {
288302
getSko(serverName)
289303
.getLastKnownStatus()

operator/src/main/java/oracle/kubernetes/operator/helpers/PodHelper.java

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import oracle.kubernetes.weblogic.domain.model.ServerSpec;
4444
import oracle.kubernetes.weblogic.domain.model.Shutdown;
4545

46+
import static oracle.kubernetes.operator.KubernetesConstants.EVICTED_REASON;
4647
import static oracle.kubernetes.operator.LabelConstants.CLUSTERNAME_LABEL;
4748
import static oracle.kubernetes.operator.LabelConstants.SERVERNAME_LABEL;
4849
import static oracle.kubernetes.operator.ProcessingConstants.SERVERS_TO_ROLL;
@@ -223,6 +224,38 @@ public static boolean isFailed(V1Pod pod) {
223224
return false;
224225
}
225226

227+
/**
228+
* Check if pod is in failed state with "Evicted" as the reason.
229+
* @param pod pod
230+
* @return true, if pod is in failed state with "Evicted" as the reason.
231+
*/
232+
public static boolean isEvicted(V1Pod pod) {
233+
return Optional.ofNullable(pod)
234+
.map(V1Pod::getStatus)
235+
.map(PodHelper::isEvicted)
236+
.orElse(false);
237+
}
238+
239+
/**
240+
* Check if the pod status shows that the pod is evicted.
241+
* @param status Pod status to be checked
242+
* @return True if the pod status shows that the pod is evicted, false otherwise
243+
*/
244+
public static boolean isEvicted(@Nonnull V1PodStatus status) {
245+
return V1PodStatus.PhaseEnum.FAILED.equals(status.getPhase())
246+
&& EVICTED_REASON.equals(status.getReason());
247+
}
248+
249+
/**
250+
* Return true if pod was evicted and operator is configured to restart evicted pods.
251+
* @param pod pod
252+
* @return true, if pod was evicted and operator is configured to restart evicted pods
253+
*
254+
*/
255+
public static boolean shouldRestartEvictedPod(V1Pod pod) {
256+
return isEvicted(pod) && TuningParameters.getInstance().getPodTuning().restartEvictedPods;
257+
}
258+
226259
/**
227260
* Returns the domain UID associated with the specified pod.
228261
* @param pod the pod
@@ -245,6 +278,17 @@ public static String getPodServerName(V1Pod pod) {
245278
return null;
246279
}
247280

281+
/**
282+
* Get the message from the pod's status.
283+
* @param pod pod
284+
* @return Message string from the pod's status, or null if the pod, its status, or the message is absent
285+
*/
286+
public static String getPodStatusMessage(V1Pod pod) {
287+
return Optional.ofNullable(pod)
288+
.map(V1Pod::getStatus)
289+
.map(V1PodStatus::getMessage)
290+
.orElse(null);
291+
}
248292

249293
/**
250294
* Factory for {@link Step} that creates admin server pod.

operator/src/main/java/oracle/kubernetes/operator/helpers/PodStepContext.java

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@
7878

7979
import static oracle.kubernetes.common.CommonConstants.COMPATIBILITY_MODE;
8080
import static oracle.kubernetes.common.helpers.AuxiliaryImageEnvVars.AUXILIARY_IMAGE_MOUNT_PATH;
81+
import static oracle.kubernetes.common.logging.MessageKeys.CYCLING_POD_EVICTED;
82+
import static oracle.kubernetes.common.logging.MessageKeys.CYCLING_POD_SPEC_CHANGED;
8183
import static oracle.kubernetes.operator.DomainStatusUpdater.createKubernetesFailureSteps;
8284
import static oracle.kubernetes.operator.IntrospectorConfigMapConstants.NUM_CONFIG_MAPS;
8385
import static oracle.kubernetes.operator.KubernetesConstants.DEFAULT_EXPORTER_SIDECAR_PORT;
@@ -457,8 +459,13 @@ private Map<String, String> getNonHashedPodAnnotations() {
457459

458460
abstract String getPodReplacedMessageKey();
459461

462+
Step cycleEvictedPodStep(V1Pod pod, Step next) {
463+
return new CyclePodStep(pod, next, LOGGER.formatMessage(CYCLING_POD_EVICTED));
464+
}
465+
460466
Step createCyclePodStep(V1Pod pod, Step next) {
461-
return Step.chain(DomainStatusUpdater.createStartRollStep(), new CyclePodStep(pod, next));
467+
return Step.chain(DomainStatusUpdater.createStartRollStep(),
468+
new CyclePodStep(pod, next, LOGGER.formatMessage(CYCLING_POD_SPEC_CHANGED)));
462469
}
463470

464471
private boolean isLegacyAuxImageOperatorVersion(String operatorVersion) {
@@ -1012,10 +1019,12 @@ private Step getConflictStep() {
10121019

10131020
public class CyclePodStep extends BaseStep {
10141021
private final V1Pod pod;
1022+
private final String message;
10151023

1016-
CyclePodStep(V1Pod pod, Step next) {
1024+
CyclePodStep(V1Pod pod, Step next, String message) {
10171025
super(next);
10181026
this.pod = pod;
1027+
this.message = message;
10191028
}
10201029

10211030
private ResponseStep<Object> deleteResponse(V1Pod pod, Step next) {
@@ -1049,7 +1058,7 @@ public NextAction apply(Packet packet) {
10491058

10501059
private Step createCyclePodEventStep(Step next) {
10511060
LOGGER.info(MessageKeys.CYCLING_POD, Objects.requireNonNull(pod.getMetadata()).getName());
1052-
return Step.chain(EventHelper.createEventStep(new EventData(POD_CYCLE_STARTING).podName(getPodName())),
1061+
return Step.chain(EventHelper.createEventStep(new EventData(POD_CYCLE_STARTING, message).podName(getPodName())),
10531062
next);
10541063
}
10551064
}
@@ -1221,6 +1230,8 @@ public NextAction apply(Packet packet) {
12211230
return doNext(createNewPod(getNext()), packet);
12221231
} else if (!canUseCurrentPod(currentPod)) {
12231232
return doNext(replaceCurrentPod(currentPod, getNext()), packet);
1233+
} else if (PodHelper.shouldRestartEvictedPod(currentPod)) {
1234+
return doNext(cycleEvictedPodStep(currentPod, getNext()), packet);
12241235
} else if (mustPatchPod(currentPod)) {
12251236
return doNext(patchCurrentPod(currentPod, getNext()), packet);
12261237
} else {

0 commit comments

Comments
 (0)