Skip to content

Commit 1ee25b6

Browse files
authored
OWLS-93072 - Improve domain status failure message to include retry stage and fatal condition (#2571)
* Improve domain status failure message to include retry stage and fatal condition
1 parent 53b8dd1 commit 1ee25b6

File tree

3 files changed

+150
-39
lines changed

3 files changed

+150
-39
lines changed

operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import java.util.Collection;
88
import java.util.Collections;
99
import java.util.HashSet;
10+
import java.util.List;
1011
import java.util.Map;
1112
import java.util.Objects;
1213
import java.util.Optional;
@@ -54,13 +55,16 @@
5455
import oracle.kubernetes.weblogic.domain.model.OnlineUpdate;
5556
import oracle.kubernetes.weblogic.domain.model.ServerHealth;
5657
import oracle.kubernetes.weblogic.domain.model.ServerStatus;
58+
import org.jetbrains.annotations.NotNull;
5759

5860
import static oracle.kubernetes.operator.LabelConstants.CLUSTERNAME_LABEL;
5961
import static oracle.kubernetes.operator.MIINonDynamicChangesMethod.CommitUpdateOnly;
6062
import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_TOPOLOGY;
6163
import static oracle.kubernetes.operator.ProcessingConstants.EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG;
64+
import static oracle.kubernetes.operator.ProcessingConstants.FATAL_ERROR_DOMAIN_STATUS_MESSAGE;
6265
import static oracle.kubernetes.operator.ProcessingConstants.FATAL_INTROSPECTOR_ERROR;
6366
import static oracle.kubernetes.operator.ProcessingConstants.FATAL_INTROSPECTOR_ERROR_MSG;
67+
import static oracle.kubernetes.operator.ProcessingConstants.INTROSPECTION_ERROR;
6468
import static oracle.kubernetes.operator.ProcessingConstants.MII_DYNAMIC_UPDATE;
6569
import static oracle.kubernetes.operator.ProcessingConstants.MII_DYNAMIC_UPDATE_RESTART_REQUIRED;
6670
import static oracle.kubernetes.operator.ProcessingConstants.SERVER_HEALTH_MAP;
@@ -769,6 +773,52 @@ public FailureCountStep() {
769773
@Override
770774
void modifyStatus(DomainStatus domainStatus) {
771775
domainStatus.incrementIntrospectJobFailureCount();
776+
setStatusMessage(domainStatus);
777+
}
778+
779+
private void setStatusMessage(DomainStatus domainStatus) {
780+
781+
if (hasExceededMaxRetryCount(domainStatus)) {
782+
domainStatus.setMessage(onSeparateLines(domainStatus, EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG));
783+
} else if (isFatalError(domainStatus)) {
784+
domainStatus.setMessage(onSeparateLines(domainStatus, FATAL_ERROR_DOMAIN_STATUS_MESSAGE));
785+
} else {
786+
domainStatus.setMessage(onSeparateLines(domainStatus, getNonFatalErrorStatusMessage(domainStatus)));
787+
}
788+
}
789+
790+
@NotNull
791+
private String onSeparateLines(DomainStatus domainStatus, String exceededIntrospectorMaxRetryCountErrorMsg) {
792+
return String.join(System.lineSeparator(), exceededIntrospectorMaxRetryCountErrorMsg,
793+
INTROSPECTION_ERROR, getNonNullMessage(domainStatus));
794+
}
795+
796+
private String getNonNullMessage(DomainStatus domainStatus) {
797+
return Optional.ofNullable(domainStatus.getMessage()).orElse(getFailedConditionMessage(domainStatus));
798+
799+
}
800+
801+
private String getFailedConditionMessage(DomainStatus domainStatus) {
802+
List<DomainCondition> failedConditions = domainStatus.getConditions().stream()
803+
.filter(c -> c.hasType(Failed) && c.getMessage() != null)
804+
.collect(Collectors.toList());
805+
return failedConditions.size() >= 1 ? failedConditions.get(failedConditions.size() - 1).getMessage()
806+
: "";
807+
}
808+
809+
@NotNull
810+
private String getNonFatalErrorStatusMessage(DomainStatus domainStatus) {
811+
return "Introspection failed on try " + domainStatus.getIntrospectJobFailureCount()
812+
+ " of " + DomainPresence.getDomainPresenceFailureRetryMaxCount() + ".";
813+
}
814+
815+
private boolean hasExceededMaxRetryCount(DomainStatus domainStatus) {
816+
return domainStatus.getIntrospectJobFailureCount() >= DomainPresence.getDomainPresenceFailureRetryMaxCount();
817+
}
818+
819+
private boolean isFatalError(DomainStatus domainStatus) {
820+
return Optional.ofNullable(domainStatus.getMessage())
821+
.map(m -> m.contains(FATAL_INTROSPECTOR_ERROR)).orElse(false);
772822
}
773823
}
774824

operator/src/main/java/oracle/kubernetes/operator/ProcessingConstants.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,17 @@ public interface ProcessingConstants {
5353
String EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG = "Stop introspection retry - "
5454
+ "exceeded configured domainPresenceFailureRetryMaxCount: "
5555
+ DomainPresence.getDomainPresenceFailureRetryMaxCount()
56-
+ " The domainPresenceFailureRetryMaxCount is an operator tuning parameter and can be controlled"
57-
+ " by adding it to the weblogic-operator-cm configmap.";
56+
+ ". The domainPresenceFailureRetryMaxCount is an operator tuning parameter and can be controlled"
57+
+ " by adding it to the weblogic-operator-cm configmap."
58+
+ " To force the introspector to start retrying again, update 'domain.spec.introspectVersion'.";
5859

5960
String FATAL_INTROSPECTOR_ERROR_MSG = "Stop introspection retry - MII Fatal Error: ";
6061
String OPERATOR_EVENT_LABEL_FILTER = LabelConstants.getCreatedByOperatorSelector();
62+
63+
String INTROSPECTION_ERROR = "Introspection Error: ";
64+
65+
String FATAL_ERROR_DOMAIN_STATUS_MESSAGE = "Introspection encountered a fatal error and will not retry automatically."
66+
+ " Please resolve the error and then update 'domain.spec.introspectVersion' to force a retry.";
67+
68+
6169
}

operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java

Lines changed: 90 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import oracle.kubernetes.operator.work.FiberTestSupport;
3737
import oracle.kubernetes.operator.work.Step;
3838
import oracle.kubernetes.operator.work.TerminalStep;
39+
import oracle.kubernetes.utils.SystemClock;
3940
import oracle.kubernetes.utils.TestUtils;
4041
import oracle.kubernetes.weblogic.domain.DomainConfigurator;
4142
import oracle.kubernetes.weblogic.domain.DomainConfiguratorFactory;
@@ -62,6 +63,9 @@
6263
import static oracle.kubernetes.operator.DomainUpPlanTest.StepChainMatcher.hasChainWithStepsInOrder;
6364
import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECTOR_JOB;
6465
import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_TOPOLOGY;
66+
import static oracle.kubernetes.operator.ProcessingConstants.EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG;
67+
import static oracle.kubernetes.operator.ProcessingConstants.FATAL_ERROR_DOMAIN_STATUS_MESSAGE;
68+
import static oracle.kubernetes.operator.ProcessingConstants.INTROSPECTION_ERROR;
6569
import static oracle.kubernetes.operator.ProcessingConstants.JOBWATCHER_COMPONENT_NAME;
6670
import static oracle.kubernetes.operator.ProcessingConstants.JOB_POD_NAME;
6771
import static oracle.kubernetes.operator.helpers.DomainStatusMatcher.hasStatus;
@@ -110,8 +114,11 @@ class DomainIntrospectorJobTest {
110114
IntStream.rangeClosed(1, MAX_SERVERS).mapToObj(n -> MS_PREFIX + n).toArray(String[]::new);
111115
private static final String SEVERE_PROBLEM_1 = "really bad";
112116
private static final String SEVERE_MESSAGE_1 = "@[SEVERE] " + SEVERE_PROBLEM_1;
117+
private static final String FATAL_PROBLEM_1 = "FatalIntrospectorError: really bad";
118+
private static final String FATAL_MESSAGE_1 = "@[SEVERE] " + FATAL_PROBLEM_1;
113119
public static final String TEST_VOLUME_NAME = "test";
114-
public static final String LAST_JOB_PROCESSED_ID = "some-unique-id";
120+
public static final String JOB_UID = "some-unique-id";
121+
public static final String INFO_MESSAGE_1 = "informational message";
115122

116123
private final TerminalStep terminalStep = new TerminalStep();
117124
private final Domain domain = createDomain();
@@ -122,6 +129,7 @@ class DomainIntrospectorJobTest {
122129
private final DomainConfigurator configurator = DomainConfiguratorFactory.forDomain(domain);
123130
private final EventRetryStrategyStub retryStrategy = createStrictStub(EventRetryStrategyStub.class);
124131
private final String jobPodName = LegalNames.toJobIntrospectorName(UID);
132+
private TestUtils.ConsoleHandlerMemento consoleHandlerMemento;
125133

126134
public DomainIntrospectorJobTest() {
127135
}
@@ -133,7 +141,7 @@ private static String getJobName() {
133141
@BeforeEach
134142
public void setUp() throws Exception {
135143
mementos.add(
136-
TestUtils.silenceOperatorLogger()
144+
consoleHandlerMemento = TestUtils.silenceOperatorLogger()
137145
.collectLogMessages(logRecords, getMessageKeys())
138146
.withLogLevel(Level.FINE)
139147
.ignoringLoggedExceptions(ApiException.class));
@@ -532,10 +540,7 @@ void whenIntrospectorJobNotNeeded_doesNotValidatesDomainTopology() throws JsonPr
532540

533541
@Test
534542
void whenJobLogContainsSevereError_logJobInfosOnDelete() {
535-
testSupport.defineResources(
536-
new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS)).status(new V1JobStatus()));
537-
IntrospectionTestUtils.defineResources(testSupport, SEVERE_MESSAGE_1);
538-
testSupport.addToPacket(DOMAIN_INTROSPECTOR_JOB, testSupport.getResourceWithName(JOB, getJobName()));
543+
createIntrospectionLog(SEVERE_MESSAGE_1, false);
539544

540545
testSupport.runSteps(JobHelper.deleteDomainIntrospectorJobStep(terminalStep));
541546

@@ -546,10 +551,7 @@ void whenJobLogContainsSevereError_logJobInfosOnDelete() {
546551

547552
@Test
548553
void whenJobLogContainsSevereError_logJobInfosOnReadPogLog() {
549-
testSupport.defineResources(
550-
new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS)).status(new V1JobStatus()));
551-
IntrospectionTestUtils.defineResources(testSupport, SEVERE_MESSAGE_1);
552-
testSupport.addToPacket(DOMAIN_INTROSPECTOR_JOB, testSupport.getResourceWithName(JOB, getJobName()));
554+
createIntrospectionLog(SEVERE_MESSAGE_1, false);
553555

554556
testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));
555557

@@ -559,57 +561,39 @@ void whenJobLogContainsSevereError_logJobInfosOnReadPogLog() {
559561

560562
@Test
561563
void whenJobLogContainsSevereError_incrementFailureCount() {
562-
testSupport.defineResources(
563-
new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS)).status(new V1JobStatus()));
564-
IntrospectionTestUtils.defineResources(testSupport, SEVERE_MESSAGE_1);
565-
testSupport.addToPacket(DOMAIN_INTROSPECTOR_JOB, testSupport.getResourceWithName(JOB, getJobName()));
564+
createIntrospectionLog(SEVERE_MESSAGE_1);
566565

567566
testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));
568567

569-
final Domain updatedDomain = testSupport.<Domain>getResources(DOMAIN).get(0);
570-
571-
assertThat(updatedDomain.getStatus().getIntrospectJobFailureCount(), equalTo(1));
572-
logRecords.clear();
568+
assertThat(getUpdatedDomain().getStatus().getIntrospectJobFailureCount(), equalTo(1));
573569
}
574570

575571
@Test
576572
void whenReadJobLogCompletesWithSevereError_domainStatusContainsLastProcessedJobId() {
577-
testSupport.defineResources(
578-
new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS).uid(LAST_JOB_PROCESSED_ID))
579-
.status(new V1JobStatus()));
580-
IntrospectionTestUtils.defineResources(testSupport, SEVERE_MESSAGE_1);
581-
testSupport.addToPacket(DOMAIN_INTROSPECTOR_JOB, testSupport.getResourceWithName(JOB, getJobName()));
573+
createIntrospectionLog(SEVERE_MESSAGE_1);
582574

583575
testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));
584576

585-
final Domain updatedDomain = testSupport.<Domain>getResources(DOMAIN).get(0);
586-
587-
assertThat(updatedDomain.getStatus().getLastIntrospectJobProcessedUid(), equalTo(LAST_JOB_PROCESSED_ID));
588-
logRecords.clear();
577+
assertThat(getUpdatedDomain().getStatus().getLastIntrospectJobProcessedUid(), equalTo(JOB_UID));
589578
}
590579

591580
@Test
592581
void whenReadJobLogCompletesWithoutSevereError_domainStatusContainsLastProcessedJobId() {
593-
testSupport.defineResources(
594-
new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS).uid(LAST_JOB_PROCESSED_ID))
595-
.status(new V1JobStatus()));
596-
IntrospectionTestUtils.defineResources(testSupport, "passed");
597-
testSupport.addToPacket(DOMAIN_INTROSPECTOR_JOB, testSupport.getResourceWithName(JOB, getJobName()));
582+
createIntrospectionLog(INFO_MESSAGE_1);
598583

599584
testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));
600585

601586
final Domain updatedDomain = testSupport.<Domain>getResources(DOMAIN).get(0);
602587

603-
assertThat(updatedDomain.getStatus().getLastIntrospectJobProcessedUid(), equalTo(LAST_JOB_PROCESSED_ID));
604-
logRecords.clear();
588+
assertThat(updatedDomain.getStatus().getLastIntrospectJobProcessedUid(), equalTo(JOB_UID));
605589
}
606590

607591
@Test
608592
void whenDomainStatusContainsNullLastIntrospectProcessedJobUid_correctStepsExecuted() {
609593
List<Step> nextSteps = new ArrayList<>();
610594
domainPresenceInfo.getDomain()
611595
.setStatus(new DomainStatus().withLastIntrospectJobProcessedUid(null));
612-
V1Job job = new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS).uid(LAST_JOB_PROCESSED_ID))
596+
V1Job job = new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS).uid(JOB_UID))
613597
.status(new V1JobStatus());
614598
testSupport.defineResources(job);
615599
IntrospectionTestUtils.defineResources(testSupport, "passed");
@@ -626,8 +610,8 @@ void whenDomainStatusContainsNullLastIntrospectProcessedJobUid_correctStepsExecu
626610
void whenDomainStatusContainsProcessedJobIdSameAsCurrentJob_correctStepsExecuted() {
627611
List<Step> nextSteps = new ArrayList<>();
628612
domainPresenceInfo.getDomain()
629-
.setStatus(new DomainStatus().withLastIntrospectJobProcessedUid(LAST_JOB_PROCESSED_ID));
630-
V1Job job = new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS).uid(LAST_JOB_PROCESSED_ID))
613+
.setStatus(new DomainStatus().withLastIntrospectJobProcessedUid(JOB_UID));
614+
V1Job job = new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS).uid(JOB_UID))
631615
.status(new V1JobStatus());
632616
testSupport.defineResources(job);
633617
IntrospectionTestUtils.defineResources(testSupport, "passed");
@@ -667,6 +651,75 @@ void whenJobCreateFailsWith409Error_JobIsCreated() {
667651
logRecords.clear();
668652
}
669653

654+
@Test
655+
void whenJobLogContainsSevereErrorAndRetriesLeft_domainStatusHasExpectedMessage() {
656+
createIntrospectionLog(SEVERE_MESSAGE_1);
657+
658+
testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));
659+
660+
assertThat(getUpdatedDomain().getStatus().getMessage(), equalTo(String.join(System.lineSeparator(),
661+
"Introspection failed on try 1 of 2.", INTROSPECTION_ERROR,
662+
SEVERE_PROBLEM_1)));
663+
}
664+
665+
@Test
666+
void whenJobLogContainsFatalError_domainStatusHasExpectedMessage() {
667+
createIntrospectionLog(FATAL_MESSAGE_1);
668+
669+
testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));
670+
671+
final Domain updatedDomain = testSupport.<Domain>getResources(DOMAIN).get(0);
672+
673+
assertThat(updatedDomain.getStatus().getMessage(), equalTo(String.join(System.lineSeparator(),
674+
FATAL_ERROR_DOMAIN_STATUS_MESSAGE, INTROSPECTION_ERROR,
675+
FATAL_PROBLEM_1)));
676+
}
677+
678+
@Test
679+
void whenJobLogContainsSevereErrorAndNumberOfRetriesExceedsMaxLimit_domainStatusHasExpectedMessage() {
680+
createIntrospectionLog(SEVERE_MESSAGE_1);
681+
682+
getUpdatedDomain().setStatus(createDomainStatusWithIntrospectJobFailureCount(2));
683+
684+
testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));
685+
686+
assertThat(getUpdatedDomain().getStatus().getMessage(), equalTo(String.join(System.lineSeparator(),
687+
EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG, INTROSPECTION_ERROR,
688+
SEVERE_PROBLEM_1)));
689+
}
690+
691+
private void createIntrospectionLog(String logMessage) {
692+
createIntrospectionLog(logMessage, true);
693+
}
694+
695+
private void createIntrospectionLog(String logMessage, boolean ignoreLogMessages) {
696+
if (ignoreLogMessages) {
697+
consoleHandlerMemento.ignoreMessage(getJobFailedMessageKey());
698+
consoleHandlerMemento.ignoreMessage(getJobFailedDetailMessageKey());
699+
}
700+
testSupport.defineResources(createIntrospectorJob());
701+
IntrospectionTestUtils.defineResources(testSupport, logMessage);
702+
testSupport.addToPacket(DOMAIN_INTROSPECTOR_JOB, testSupport.getResourceWithName(JOB, getJobName()));
703+
}
704+
705+
private V1Job createIntrospectorJob() {
706+
return new V1Job().metadata(createJobMetadata()).status(new V1JobStatus());
707+
}
708+
709+
private V1ObjectMeta createJobMetadata() {
710+
return new V1ObjectMeta().name(getJobName()).namespace(NS).creationTimestamp(SystemClock.now()).uid(JOB_UID);
711+
}
712+
713+
private DomainStatus createDomainStatusWithIntrospectJobFailureCount(int failureCount) {
714+
final DomainStatus status = new DomainStatus();
715+
status.withIntrospectJobFailureCount(failureCount);
716+
return status;
717+
}
718+
719+
private Domain getUpdatedDomain() {
720+
return testSupport.<Domain>getResources(DOMAIN).get(0);
721+
}
722+
670723
private Cluster getCluster(String clusterName) {
671724
return domain.getSpec().getClusters().stream()
672725
.filter(c -> clusterName.equals(c.getClusterName()))

0 commit comments

Comments
 (0)