Skip to content

Commit f20257e

Browse files
authored
OWLS-91212 - Fix for the introspector retry behavior after the job times out. (#2580)
* Fix for the introspector retry behavior after the job times out
1 parent e1457aa commit f20257e

File tree

17 files changed

+263
-88
lines changed

17 files changed

+263
-88
lines changed

integration-tests/src/test/java/oracle/weblogic/kubernetes/ItServerStartPolicy.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1433,7 +1433,8 @@ private void scalingClusters(String clusterName, String serverPodName, int repli
14331433
executeLifecycleScript(STATUS_CLUSTER_SCRIPT, CLUSTER_LIFECYCLE, clusterName),
14341434
String.format("Failed to run %s", STATUS_CLUSTER_SCRIPT));
14351435

1436-
assertTrue(verifyExecuteResult(result, regex), "The script should scale the given cluster: " + clusterName);
1436+
assertTrue(verifyExecuteResult(result, regex), "The script should scale the given cluster: " + clusterName
1437+
+ ", the result is -> " + result + " . It doesn't match the expected pattern -> " + regex);
14371438
logger.info("The cluster {0} scaled successfully.", clusterName);
14381439
}
14391440

operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java

Lines changed: 25 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -882,7 +882,7 @@ private boolean shouldContinue() {
882882
return true;
883883
} else if (shouldReportAbortedEvent()) {
884884
return true;
885-
} else if (hasExceededRetryCount() && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) {
885+
} else if (hasExceededRetryCount(liveInfo) && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) {
886886
LOGGER.severe(ProcessingConstants.EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG);
887887
return false;
888888
} else if (isFatalIntrospectorError()) {
@@ -893,10 +893,7 @@ private boolean shouldContinue() {
893893
return false; // we have already cached this
894894
} else if (shouldRecheck(cachedInfo)) {
895895

896-
if (hasExceededRetryCount()) {
897-
resetIntrospectorJobFailureCount();
898-
}
899-
if (getCurrentIntrospectFailureRetryCount() > 0) {
896+
if (getCurrentIntrospectFailureRetryCount(liveInfo) > 0) {
900897
logRetryCount(cachedInfo);
901898
ensureRetryingEventPresent();
902899
}
@@ -917,29 +914,10 @@ private void ensureRetryingEventPresent() {
917914
}
918915
}
919916

920-
private void resetIntrospectorJobFailureCount() {
921-
Optional.ofNullable(liveInfo)
922-
.map(DomainPresenceInfo::getDomain)
923-
.map(Domain::getStatus)
924-
.map(DomainStatus::resetIntrospectJobFailureCount);
925-
}
926-
927-
private boolean hasExceededRetryCount() {
928-
return getCurrentIntrospectFailureRetryCount()
929-
>= DomainPresence.getDomainPresenceFailureRetryMaxCount();
930-
}
931-
932-
private Integer getCurrentIntrospectFailureRetryCount() {
933-
return Optional.ofNullable(liveInfo)
934-
.map(DomainPresenceInfo::getDomain)
935-
.map(Domain::getStatus)
936-
.map(DomainStatus::getIntrospectJobFailureCount)
937-
.orElse(0);
938-
}
939917

940918
private void logRetryCount(DomainPresenceInfo cachedInfo) {
941919
LOGGER.info(MessageKeys.INTROSPECT_JOB_FAILED_RETRY_COUNT, cachedInfo.getDomain().getDomainUid(),
942-
getCurrentIntrospectFailureRetryCount(),
920+
getCurrentIntrospectFailureRetryCount(liveInfo),
943921
DomainPresence.getDomainPresenceFailureRetryMaxCount());
944922
}
945923

@@ -1068,11 +1046,24 @@ private static String getIntrospectVersion(DomainPresenceInfo info) {
10681046
.orElse(null);
10691047
}
10701048

1049+
private Integer getCurrentIntrospectFailureRetryCount(DomainPresenceInfo info) {
1050+
return Optional.ofNullable(info)
1051+
.map(DomainPresenceInfo::getDomain)
1052+
.map(Domain::getStatus)
1053+
.map(DomainStatus::getIntrospectJobFailureCount)
1054+
.orElse(0);
1055+
}
1056+
10711057
private static boolean isCachedInfoNewer(DomainPresenceInfo liveInfo, DomainPresenceInfo cachedInfo) {
10721058
return liveInfo.getDomain() != null
10731059
&& KubernetesUtils.isFirstNewer(cachedInfo.getDomain().getMetadata(), liveInfo.getDomain().getMetadata());
10741060
}
10751061

1062+
private boolean hasExceededRetryCount(DomainPresenceInfo info) {
1063+
return getCurrentIntrospectFailureRetryCount(info)
1064+
>= DomainPresence.getDomainPresenceFailureRetryMaxCount();
1065+
}
1066+
10761067
@SuppressWarnings("unused")
10771068
private void runDomainPlan(
10781069
Domain dom,
@@ -1092,11 +1083,11 @@ public void onCompletion(Packet packet) {
10921083
@Override
10931084
public void onThrowable(Packet packet, Throwable throwable) {
10941085
logThrowable(throwable);
1095-
10961086
gate.startFiberIfLastFiberMatches(
10971087
domainUid,
10981088
Fiber.getCurrentIfSet(),
1099-
DomainStatusUpdater.createFailureRelatedSteps(throwable, null),
1089+
Step.chain(DomainStatusUpdater.createFailureCountStep(),
1090+
DomainStatusUpdater.createFailureRelatedSteps(throwable, null)),
11001091
plan.packet,
11011092
new CompletionCallback() {
11021093
@Override
@@ -1119,7 +1110,7 @@ public void onThrowable(Packet packet, Throwable throwable) {
11191110
LoggingContext.setThreadContext().namespace(ns).domainUid(domainUid)) {
11201111
existing.setPopulated(false);
11211112
// proceed only if we have not already retried max number of times
1122-
int retryCount = existing.incrementAndGetFailureCount();
1113+
int retryCount = getCurrentIntrospectFailureRetryCount(existing);
11231114
LOGGER.fine(
11241115
"Failure count for DomainPresenceInfo: "
11251116
+ existing
@@ -1179,6 +1170,11 @@ Step createDomainUpPlan(DomainPresenceInfo info) {
11791170
bringAdminServerUp(info, delegate.getPodAwaiterStepFactory(info.getNamespace())),
11801171
managedServerStrategy);
11811172

1173+
if (hasExceededRetryCount(info) && isImgRestartIntrospectVerChanged(info,
1174+
getExistingDomainPresenceInfo(info.getNamespace(), info.getDomainUid()))) {
1175+
domainUpStrategy = Step.chain(DomainStatusUpdater.createResetFailureCountStep(), domainUpStrategy);
1176+
}
1177+
11821178
return Step.chain(
11831179
createDomainUpInitialStep(info),
11841180
ConfigMapHelper.readExistingIntrospectorConfigMap(info.getNamespace(), info.getDomainUid()),
@@ -1226,8 +1222,7 @@ private static class TailStep extends Step {
12261222

12271223
@Override
12281224
public NextAction apply(Packet packet) {
1229-
packet.getSpi(DomainPresenceInfo.class).complete();
1230-
return doNext(packet);
1225+
return doNext(DomainStatusUpdater.createResetFailureCountStep(), packet);
12311226
}
12321227
}
12331228

operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -842,6 +842,22 @@ private boolean isFatalError(DomainStatus domainStatus) {
842842
}
843843
}
844844

845+
public static Step createResetFailureCountStep() {
846+
return new ResetFailureCountStep();
847+
}
848+
849+
static class ResetFailureCountStep extends DomainStatusUpdaterStep {
850+
851+
public ResetFailureCountStep() {
852+
super(null);
853+
}
854+
855+
@Override
856+
void modifyStatus(DomainStatus domainStatus) {
857+
domainStatus.resetIntrospectJobFailureCount();
858+
}
859+
}
860+
845861
public static Step recordLastIntrospectJobProcessedUid(String lastIntrospectJobProcessedId) {
846862
return new RecordLastIntrospectJobProcessedUidStep(lastIntrospectJobProcessedId);
847863
}

operator/src/main/java/oracle/kubernetes/operator/PodWatcher.java

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -476,13 +476,8 @@ protected DefaultResponseStep<V1Pod> resumeIfReady(Callback callback) {
476476
return new DefaultResponseStep<>(getNext()) {
477477
@Override
478478
public NextAction onSuccess(Packet packet, CallResponse<V1Pod> callResponse) {
479-
480479
DomainPresenceInfo info = packet.getSpi(DomainPresenceInfo.class);
481-
String serverName = (String)packet.get(SERVER_NAME);
482-
if ((info != null) && (callResponse != null) && (callResponse.getResult() == null)) {
483-
info.setServerPod(serverName, null);
484-
}
485-
480+
String serverName = callback.geServerName();
486481
if (isReady(callResponse.getResult(), info, serverName) || callback.didResumeFiber()) {
487482
callback.proceedFromWait(callResponse.getResult());
488483
return null;

operator/src/main/java/oracle/kubernetes/operator/ProcessingConstants.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ public interface ProcessingConstants {
2727

2828
String DOMAIN_TOPOLOGY = "domainTopology";
2929
String JOB_POD_NAME = "jobPodName";
30+
String JOB_POD_CONTAINER_WAITING_REASON = "jobPodContainerWaitingReason";
3031
String DOMAIN_INTROSPECTOR_JOB = "domainIntrospectorJob";
3132
String DOMAIN_INTROSPECTOR_LOG_RESULT = "domainIntrospectorLogResult";
3233
String DOMAIN_INTROSPECT_REQUESTED = "domainIntrospectRequested";

operator/src/main/java/oracle/kubernetes/operator/WaitForReadyStep.java

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -178,7 +178,6 @@ void logWaiting(String name) {
178178
@Override
179179
public final NextAction apply(Packet packet) {
180180
String serverName = (String)packet.get(SERVER_NAME);
181-
DomainPresenceInfo info = packet.getSpi(DomainPresenceInfo.class);
182181
if (shouldTerminateFiber(initialResource)) {
183182
return doTerminate(createTerminationException(initialResource), packet);
184183
} else if (isReady(initialResource, packet.getSpi(DomainPresenceInfo.class), serverName)) {
@@ -280,10 +279,12 @@ class Callback implements Consumer<T> {
280279
private final Packet packet;
281280
private final AtomicBoolean didResume = new AtomicBoolean(false);
282281
private final AtomicInteger recheckCount = new AtomicInteger(0);
282+
private final String serverName;
283283

284284
Callback(AsyncFiber fiber, Packet packet) {
285285
this.fiber = fiber;
286286
this.packet = packet;
287+
this.serverName = (String) packet.get(SERVER_NAME);
287288
}
288289

289290
@Override
@@ -320,6 +321,10 @@ int incrementAndGetRecheckCount() {
320321
int getRecheckCount() {
321322
return recheckCount.get();
322323
}
324+
325+
String geServerName() {
326+
return serverName;
327+
}
323328
}
324329

325330
private void handleResourceReady(AsyncFiber fiber, Packet packet, T resource) {

operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java

Lines changed: 0 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,6 @@
1313
import java.util.concurrent.ConcurrentHashMap;
1414
import java.util.concurrent.ConcurrentMap;
1515
import java.util.concurrent.atomic.AtomicBoolean;
16-
import java.util.concurrent.atomic.AtomicInteger;
1716
import java.util.concurrent.atomic.AtomicReference;
1817
import java.util.concurrent.locks.ReadWriteLock;
1918
import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -54,7 +53,6 @@ public class DomainPresenceInfo {
5453
private final AtomicReference<Domain> domain;
5554
private final AtomicBoolean isDeleting = new AtomicBoolean(false);
5655
private final AtomicBoolean isPopulated = new AtomicBoolean(false);
57-
private final AtomicInteger retryCount = new AtomicInteger(0);
5856
private final AtomicReference<Collection<ServerStartupInfo>> serverStartupInfo;
5957
private final AtomicReference<Collection<ServerShutdownInfo>> serverShutdownInfo;
6058

@@ -547,23 +545,6 @@ public void setPopulated(boolean populated) {
547545
isPopulated.set(populated);
548546
}
549547

550-
private void resetFailureCount() {
551-
retryCount.set(0);
552-
}
553-
554-
public int incrementAndGetFailureCount() {
555-
return retryCount.incrementAndGet();
556-
}
557-
558-
int getRetryCount() {
559-
return retryCount.get();
560-
}
561-
562-
/** Sets the last completion time to now. */
563-
public void complete() {
564-
resetFailureCount();
565-
}
566-
567548
EventItem getLastEventItem() {
568549
return lastEventItem;
569550
}

0 commit comments

Comments
 (0)