Skip to content

OWLS-93072 - Improve domain status failure message to include retry stage and fatal condition #2571

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
Expand Down Expand Up @@ -54,13 +55,16 @@
import oracle.kubernetes.weblogic.domain.model.OnlineUpdate;
import oracle.kubernetes.weblogic.domain.model.ServerHealth;
import oracle.kubernetes.weblogic.domain.model.ServerStatus;
import org.jetbrains.annotations.NotNull;

import static oracle.kubernetes.operator.LabelConstants.CLUSTERNAME_LABEL;
import static oracle.kubernetes.operator.MIINonDynamicChangesMethod.CommitUpdateOnly;
import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_TOPOLOGY;
import static oracle.kubernetes.operator.ProcessingConstants.EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG;
import static oracle.kubernetes.operator.ProcessingConstants.FATAL_ERROR_DOMAIN_STATUS_MESSAGE;
import static oracle.kubernetes.operator.ProcessingConstants.FATAL_INTROSPECTOR_ERROR;
import static oracle.kubernetes.operator.ProcessingConstants.FATAL_INTROSPECTOR_ERROR_MSG;
import static oracle.kubernetes.operator.ProcessingConstants.INTROSPECTION_ERROR;
import static oracle.kubernetes.operator.ProcessingConstants.MII_DYNAMIC_UPDATE;
import static oracle.kubernetes.operator.ProcessingConstants.MII_DYNAMIC_UPDATE_RESTART_REQUIRED;
import static oracle.kubernetes.operator.ProcessingConstants.SERVER_HEALTH_MAP;
Expand Down Expand Up @@ -769,6 +773,52 @@ public FailureCountStep() {
/**
 * Records one more failed introspection attempt on the given status and then
 * rewrites its message to describe the resulting retry state.
 *
 * @param domainStatus the status object that is modified in place
 */
@Override
void modifyStatus(DomainStatus domainStatus) {
domainStatus.incrementIntrospectJobFailureCount();
// Must follow the increment: the message text is derived from the updated failure count.
setStatusMessage(domainStatus);
}

/**
 * Replaces the status message with a multi-line text whose first line describes
 * the retry state: retries exhausted, fatal error, or "try N of M".
 *
 * <p>The checks are made in that order, so an exhausted retry budget takes
 * precedence over a fatal error.
 *
 * @param domainStatus the status whose message is rewritten
 */
private void setStatusMessage(DomainStatus domainStatus) {
  final String summary;
  if (hasExceededMaxRetryCount(domainStatus)) {
    summary = EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG;
  } else if (isFatalError(domainStatus)) {
    summary = FATAL_ERROR_DOMAIN_STATUS_MESSAGE;
  } else {
    summary = getNonFatalErrorStatusMessage(domainStatus);
  }

  // The previous message is read while composing the new one, so compose before setting.
  domainStatus.setMessage(onSeparateLines(domainStatus, summary));
}

/**
 * Builds the multi-line status text: the supplied summary, the
 * {@code INTROSPECTION_ERROR} label and the underlying error text, separated by
 * the platform line separator.
 *
 * @param domainStatus the status from which the underlying error text is taken
 * @param summary the first line of the resulting message
 * @return the three parts joined on separate lines
 */
@NotNull
private String onSeparateLines(DomainStatus domainStatus, String summary) {
  final String errorDetail = getNonNullMessage(domainStatus);
  return String.join(System.lineSeparator(), summary, INTROSPECTION_ERROR, errorDetail);
}

/**
 * Returns the current status message, or the message of the most recent failed
 * condition when the status has no message.
 *
 * <p>The fallback is supplied lazily so the conditions are only scanned when the
 * status message is actually absent.
 *
 * @param domainStatus the status to read
 * @return the status message or the fallback text from the failed conditions
 */
private String getNonNullMessage(DomainStatus domainStatus) {
  return Optional.ofNullable(domainStatus.getMessage())
      .orElseGet(() -> getFailedConditionMessage(domainStatus));
}

/**
 * Returns the message of the last condition of type {@code Failed} that carries
 * a non-null message, or an empty string when there is no such condition.
 *
 * @param domainStatus the status whose conditions are examined
 * @return the last matching message, never {@code null}
 */
private String getFailedConditionMessage(DomainStatus domainStatus) {
  return domainStatus.getConditions().stream()
      .filter(condition -> condition.hasType(Failed) && condition.getMessage() != null)
      .map(DomainCondition::getMessage)
      // Keep only the later element of each pair, which leaves the last match.
      .reduce((earlier, later) -> later)
      .orElse("");
}

/**
 * Builds the "try N of M" summary line from the failure count on the status and
 * the maximum reported by {@code DomainPresence}.
 *
 * @param domainStatus the status holding the current failure count
 * @return the summary line for a failure that is neither fatal nor the last retry
 */
@NotNull
private String getNonFatalErrorStatusMessage(DomainStatus domainStatus) {
  return new StringBuilder("Introspection failed on try ")
      .append(domainStatus.getIntrospectJobFailureCount())
      .append(" of ")
      .append(DomainPresence.getDomainPresenceFailureRetryMaxCount())
      .append(".")
      .toString();
}

/**
 * Reports whether the introspection failure count on the status has reached the
 * value returned by {@code DomainPresence.getDomainPresenceFailureRetryMaxCount()}.
 *
 * <p>NOTE(review): the comparison is {@code >=}, so a count equal to the maximum
 * already counts as "exceeded" - confirm this matches the intended retry semantics.
 *
 * @param domainStatus the status holding the current failure count
 * @return true when the failure count is at or above the maximum
 */
private boolean hasExceededMaxRetryCount(DomainStatus domainStatus) {
return domainStatus.getIntrospectJobFailureCount() >= DomainPresence.getDomainPresenceFailureRetryMaxCount();
}

/**
 * Reports whether the current status message contains the
 * {@code FATAL_INTROSPECTOR_ERROR} marker. A status without a message is never fatal.
 *
 * @param domainStatus the status whose message is inspected
 * @return true when a message is present and contains the marker
 */
private boolean isFatalError(DomainStatus domainStatus) {
  final String currentMessage = domainStatus.getMessage();
  if (currentMessage == null) {
    return false;
  }
  return currentMessage.contains(FATAL_INTROSPECTOR_ERROR);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,17 @@ public interface ProcessingConstants {
String EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG = "Stop introspection retry - "
+ "exceeded configured domainPresenceFailureRetryMaxCount: "
+ DomainPresence.getDomainPresenceFailureRetryMaxCount()
+ " The domainPresenceFailureRetryMaxCount is an operator tuning parameter and can be controlled"
+ " by adding it to the weblogic-operator-cm configmap.";
+ ". The domainPresenceFailureRetryMaxCount is an operator tuning parameter and can be controlled"
+ " by adding it to the weblogic-operator-cm configmap."
+ " To force the introspector to start retrying again, update 'domain.spec.introspectVersion'.";

// Prefix text announcing that retries stop because of an MII fatal error.
String FATAL_INTROSPECTOR_ERROR_MSG = "Stop introspection retry - MII Fatal Error: ";
// Value returned by LabelConstants.getCreatedByOperatorSelector();
// presumably a label selector for operator-created events - TODO confirm.
String OPERATOR_EVENT_LABEL_FILTER = LabelConstants.getCreatedByOperatorSelector();

// Label placed on its own line between the retry summary and the underlying error text.
String INTROSPECTION_ERROR = "Introspection Error: ";

// Summary line for a fatal introspection error; tells the user how to force a retry.
String FATAL_ERROR_DOMAIN_STATUS_MESSAGE = "Introspection encountered a fatal error and will not retry automatically."
+ " Please resolve the error and then update 'domain.spec.introspectVersion' to force a retry.";


}
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import oracle.kubernetes.operator.work.FiberTestSupport;
import oracle.kubernetes.operator.work.Step;
import oracle.kubernetes.operator.work.TerminalStep;
import oracle.kubernetes.utils.SystemClock;
import oracle.kubernetes.utils.TestUtils;
import oracle.kubernetes.weblogic.domain.DomainConfigurator;
import oracle.kubernetes.weblogic.domain.DomainConfiguratorFactory;
Expand All @@ -62,6 +63,9 @@
import static oracle.kubernetes.operator.DomainUpPlanTest.StepChainMatcher.hasChainWithStepsInOrder;
import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECTOR_JOB;
import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_TOPOLOGY;
import static oracle.kubernetes.operator.ProcessingConstants.EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG;
import static oracle.kubernetes.operator.ProcessingConstants.FATAL_ERROR_DOMAIN_STATUS_MESSAGE;
import static oracle.kubernetes.operator.ProcessingConstants.INTROSPECTION_ERROR;
import static oracle.kubernetes.operator.ProcessingConstants.JOBWATCHER_COMPONENT_NAME;
import static oracle.kubernetes.operator.ProcessingConstants.JOB_POD_NAME;
import static oracle.kubernetes.operator.helpers.DomainStatusMatcher.hasStatus;
Expand Down Expand Up @@ -110,8 +114,11 @@ class DomainIntrospectorJobTest {
IntStream.rangeClosed(1, MAX_SERVERS).mapToObj(n -> MS_PREFIX + n).toArray(String[]::new);
// Problem text and the corresponding introspector log line carrying the "@[SEVERE] " prefix.
private static final String SEVERE_PROBLEM_1 = "really bad";
private static final String SEVERE_MESSAGE_1 = "@[SEVERE] " + SEVERE_PROBLEM_1;
// Same pair for a problem whose text contains "FatalIntrospectorError".
private static final String FATAL_PROBLEM_1 = "FatalIntrospectorError: really bad";
private static final String FATAL_MESSAGE_1 = "@[SEVERE] " + FATAL_PROBLEM_1;
public static final String TEST_VOLUME_NAME = "test";
// Both constants hold the same UID value; they are used as the introspector job UID in the tests.
public static final String LAST_JOB_PROCESSED_ID = "some-unique-id";
public static final String JOB_UID = "some-unique-id";
// Log line without a severity prefix, used where introspection should not report an error.
public static final String INFO_MESSAGE_1 = "informational message";

private final TerminalStep terminalStep = new TerminalStep();
private final Domain domain = createDomain();
Expand All @@ -122,6 +129,7 @@ class DomainIntrospectorJobTest {
private final DomainConfigurator configurator = DomainConfiguratorFactory.forDomain(domain);
private final EventRetryStrategyStub retryStrategy = createStrictStub(EventRetryStrategyStub.class);
private final String jobPodName = LegalNames.toJobIntrospectorName(UID);
private TestUtils.ConsoleHandlerMemento consoleHandlerMemento;

/** Explicit no-argument constructor; it performs no initialization of its own. */
public DomainIntrospectorJobTest() {
}
Expand All @@ -133,7 +141,7 @@ private static String getJobName() {
@BeforeEach
public void setUp() throws Exception {
mementos.add(
TestUtils.silenceOperatorLogger()
consoleHandlerMemento = TestUtils.silenceOperatorLogger()
.collectLogMessages(logRecords, getMessageKeys())
.withLogLevel(Level.FINE)
.ignoringLoggedExceptions(ApiException.class));
Expand Down Expand Up @@ -532,10 +540,7 @@ void whenIntrospectorJobNotNeeded_doesNotValidatesDomainTopology() throws JsonPr

@Test
void whenJobLogContainsSevereError_logJobInfosOnDelete() {
testSupport.defineResources(
new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS)).status(new V1JobStatus()));
IntrospectionTestUtils.defineResources(testSupport, SEVERE_MESSAGE_1);
testSupport.addToPacket(DOMAIN_INTROSPECTOR_JOB, testSupport.getResourceWithName(JOB, getJobName()));
createIntrospectionLog(SEVERE_MESSAGE_1, false);

testSupport.runSteps(JobHelper.deleteDomainIntrospectorJobStep(terminalStep));

Expand All @@ -546,10 +551,7 @@ void whenJobLogContainsSevereError_logJobInfosOnDelete() {

@Test
void whenJobLogContainsSevereError_logJobInfosOnReadPogLog() {
testSupport.defineResources(
new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS)).status(new V1JobStatus()));
IntrospectionTestUtils.defineResources(testSupport, SEVERE_MESSAGE_1);
testSupport.addToPacket(DOMAIN_INTROSPECTOR_JOB, testSupport.getResourceWithName(JOB, getJobName()));
createIntrospectionLog(SEVERE_MESSAGE_1, false);

testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));

Expand All @@ -559,57 +561,39 @@ void whenJobLogContainsSevereError_logJobInfosOnReadPogLog() {

/**
 * A severe error in the introspector job log must raise the introspection
 * failure count on the domain status to 1.
 */
@Test
void whenJobLogContainsSevereError_incrementFailureCount() {
  // Defines the job and its log once, and tells the log handler to ignore the job-failed messages.
  createIntrospectionLog(SEVERE_MESSAGE_1);

  testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));

  assertThat(getUpdatedDomain().getStatus().getIntrospectJobFailureCount(), equalTo(1));
}

/**
 * After reading a job log that contains a severe error, the domain status must
 * record the UID of the introspector job that was processed.
 */
@Test
void whenReadJobLogCompletesWithSevereError_domainStatusContainsLastProcessedJobId() {
  // The helper creates the job with JOB_UID and ignores the expected job-failed log messages.
  createIntrospectionLog(SEVERE_MESSAGE_1);

  testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));

  assertThat(getUpdatedDomain().getStatus().getLastIntrospectJobProcessedUid(), equalTo(JOB_UID));
}

/**
 * After reading a job log without a severe error, the domain status must
 * record the UID of the introspector job that was processed.
 */
@Test
void whenReadJobLogCompletesWithoutSevereError_domainStatusContainsLastProcessedJobId() {
  // The helper creates the job with JOB_UID and supplies the informational log line.
  createIntrospectionLog(INFO_MESSAGE_1);

  testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));

  assertThat(getUpdatedDomain().getStatus().getLastIntrospectJobProcessedUid(), equalTo(JOB_UID));
}

@Test
void whenDomainStatusContainsNullLastIntrospectProcessedJobUid_correctStepsExecuted() {
List<Step> nextSteps = new ArrayList<>();
domainPresenceInfo.getDomain()
.setStatus(new DomainStatus().withLastIntrospectJobProcessedUid(null));
V1Job job = new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS).uid(LAST_JOB_PROCESSED_ID))
V1Job job = new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS).uid(JOB_UID))
.status(new V1JobStatus());
testSupport.defineResources(job);
IntrospectionTestUtils.defineResources(testSupport, "passed");
Expand All @@ -626,8 +610,8 @@ void whenDomainStatusContainsNullLastIntrospectProcessedJobUid_correctStepsExecu
void whenDomainStatusContainsProcessedJobIdSameAsCurrentJob_correctStepsExecuted() {
List<Step> nextSteps = new ArrayList<>();
domainPresenceInfo.getDomain()
.setStatus(new DomainStatus().withLastIntrospectJobProcessedUid(LAST_JOB_PROCESSED_ID));
V1Job job = new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS).uid(LAST_JOB_PROCESSED_ID))
.setStatus(new DomainStatus().withLastIntrospectJobProcessedUid(JOB_UID));
V1Job job = new V1Job().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS).uid(JOB_UID))
.status(new V1JobStatus());
testSupport.defineResources(job);
IntrospectionTestUtils.defineResources(testSupport, "passed");
Expand Down Expand Up @@ -667,6 +651,75 @@ void whenJobCreateFailsWith409Error_JobIsCreated() {
logRecords.clear();
}

/**
 * With a severe error on the first attempt, the status message must consist of
 * the "try 1 of 2" summary, the introspection error label and the problem text.
 */
@Test
void whenJobLogContainsSevereErrorAndRetriesLeft_domainStatusHasExpectedMessage() {
  createIntrospectionLog(SEVERE_MESSAGE_1);

  testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));

  final String actualMessage = getUpdatedDomain().getStatus().getMessage();
  final String expectedMessage = String.join(System.lineSeparator(),
      "Introspection failed on try 1 of 2.",
      INTROSPECTION_ERROR,
      SEVERE_PROBLEM_1);
  assertThat(actualMessage, equalTo(expectedMessage));
}

/**
 * With a fatal error in the job log, the status message must consist of the
 * fatal-error summary, the introspection error label and the problem text.
 */
@Test
void whenJobLogContainsFatalError_domainStatusHasExpectedMessage() {
  createIntrospectionLog(FATAL_MESSAGE_1);

  testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));

  final String actualMessage = getUpdatedDomain().getStatus().getMessage();
  final String expectedMessage = String.join(System.lineSeparator(),
      FATAL_ERROR_DOMAIN_STATUS_MESSAGE,
      INTROSPECTION_ERROR,
      FATAL_PROBLEM_1);
  assertThat(actualMessage, equalTo(expectedMessage));
}

/**
 * When the domain already records two failures and another severe error occurs,
 * the status message must start with the retries-exceeded text, followed by the
 * introspection error label and the problem text.
 */
@Test
void whenJobLogContainsSevereErrorAndNumberOfRetriesExceedsMaxLimit_domainStatusHasExpectedMessage() {
  createIntrospectionLog(SEVERE_MESSAGE_1);

  // Seed the stored domain with an existing failure count before the steps run.
  final DomainStatus seededStatus = createDomainStatusWithIntrospectJobFailureCount(2);
  getUpdatedDomain().setStatus(seededStatus);

  testSupport.runSteps(JobHelper.readDomainIntrospectorPodLog(terminalStep));

  final String actualMessage = getUpdatedDomain().getStatus().getMessage();
  final String expectedMessage = String.join(System.lineSeparator(),
      EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG,
      INTROSPECTION_ERROR,
      SEVERE_PROBLEM_1);
  assertThat(actualMessage, equalTo(expectedMessage));
}

/**
 * Sets up an introspector job whose log contains the given message, with the
 * job-failed log messages ignored.
 *
 * @param logMessage the text to supply as the introspector job log
 */
private void createIntrospectionLog(String logMessage) {
createIntrospectionLog(logMessage, true);
}

/**
 * Sets up an introspector job whose log contains the given message and places
 * that job in the packet under {@code DOMAIN_INTROSPECTOR_JOB}.
 *
 * @param logMessage the text to supply as the introspector job log
 * @param ignoreLogMessages when true, the console handler memento is told to ignore
 *     the messages identified by the job-failed and job-failed-detail keys
 */
private void createIntrospectionLog(String logMessage, boolean ignoreLogMessages) {
if (ignoreLogMessages) {
consoleHandlerMemento.ignoreMessage(getJobFailedMessageKey());
consoleHandlerMemento.ignoreMessage(getJobFailedDetailMessageKey());
}
// The job must be defined before it is looked up by name for the packet below.
testSupport.defineResources(createIntrospectorJob());
IntrospectionTestUtils.defineResources(testSupport, logMessage);
testSupport.addToPacket(DOMAIN_INTROSPECTOR_JOB, testSupport.getResourceWithName(JOB, getJobName()));
}

/**
 * Creates the introspector job used by the tests: the standard test metadata
 * combined with a fresh job status.
 *
 * @return the new job
 */
private V1Job createIntrospectorJob() {
  final V1ObjectMeta jobMetadata = createJobMetadata();
  final V1JobStatus jobStatus = new V1JobStatus();
  return new V1Job().metadata(jobMetadata).status(jobStatus);
}

/**
 * Creates metadata for the introspector job: the job name, the test namespace,
 * a creation timestamp taken from {@code SystemClock.now()} and {@code JOB_UID} as the UID.
 *
 * @return the new metadata
 */
private V1ObjectMeta createJobMetadata() {
return new V1ObjectMeta().name(getJobName()).namespace(NS).creationTimestamp(SystemClock.now()).uid(JOB_UID);
}

/**
 * Creates a new domain status and applies the given introspection failure count to it.
 *
 * @param failureCount the failure count to apply
 * @return the newly created status object
 */
private DomainStatus createDomainStatusWithIntrospectJobFailureCount(int failureCount) {
final DomainStatus status = new DomainStatus();
// The return value is deliberately ignored; presumably the call mutates 'status' in place - TODO confirm.
status.withIntrospectJobFailureCount(failureCount);
return status;
}

/**
 * Returns the first {@code DOMAIN} resource held by the test support.
 *
 * @return the first domain resource
 */
private Domain getUpdatedDomain() {
return testSupport.<Domain>getResources(DOMAIN).get(0);
}

private Cluster getCluster(String clusterName) {
return domain.getSpec().getClusters().stream()
.filter(c -> clusterName.equals(c.getClusterName()))
Expand Down