Skip to content

OWLS-91212 - Fix for the introspector retry behavior after the job times out. #2580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Oct 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1433,7 +1433,8 @@ private void scalingClusters(String clusterName, String serverPodName, int repli
executeLifecycleScript(STATUS_CLUSTER_SCRIPT, CLUSTER_LIFECYCLE, clusterName),
String.format("Failed to run %s", STATUS_CLUSTER_SCRIPT));

assertTrue(verifyExecuteResult(result, regex), "The script should scale the given cluster: " + clusterName);
assertTrue(verifyExecuteResult(result, regex), "The script should scale the given cluster: " + clusterName
+ ", the result is -> " + result + " . It doesn't match the expected pattern -> " + regex);
logger.info("The cluster {0} scaled successfully.", clusterName);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -882,7 +882,7 @@ private boolean shouldContinue() {
return true;
} else if (shouldReportAbortedEvent()) {
return true;
} else if (hasExceededRetryCount() && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) {
} else if (hasExceededRetryCount(liveInfo) && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) {
LOGGER.severe(ProcessingConstants.EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG);
return false;
} else if (isFatalIntrospectorError()) {
Expand All @@ -893,10 +893,7 @@ private boolean shouldContinue() {
return false; // we have already cached this
} else if (shouldRecheck(cachedInfo)) {

if (hasExceededRetryCount()) {
resetIntrospectorJobFailureCount();
}
if (getCurrentIntrospectFailureRetryCount() > 0) {
if (getCurrentIntrospectFailureRetryCount(liveInfo) > 0) {
logRetryCount(cachedInfo);
ensureRetryingEventPresent();
}
Expand All @@ -917,29 +914,10 @@ private void ensureRetryingEventPresent() {
}
}

/**
 * Resets the introspector job failure count held in the status of the live domain.
 * Does nothing when the live info, its domain, or the domain status is absent.
 */
private void resetIntrospectorJobFailureCount() {
  final Domain domain = (liveInfo == null) ? null : liveInfo.getDomain();
  final DomainStatus status = (domain == null) ? null : domain.getStatus();
  if (status != null) {
    status.resetIntrospectJobFailureCount();
  }
}

/**
 * Reports whether the current introspector failure count has reached the configured maximum.
 *
 * @return true if the current count is greater than or equal to the maximum retry count
 */
private boolean hasExceededRetryCount() {
  // The helper falls back to 0, so unboxing here cannot hit a null.
  final int failures = getCurrentIntrospectFailureRetryCount();
  return failures >= DomainPresence.getDomainPresenceFailureRetryMaxCount();
}

/**
 * Reads the introspector job failure count from the status of the live domain.
 *
 * @return the recorded failure count, or 0 when the live info, its domain, its status,
 *     or the count itself is absent
 */
private Integer getCurrentIntrospectFailureRetryCount() {
  final Domain domain = (liveInfo == null) ? null : liveInfo.getDomain();
  final DomainStatus status = (domain == null) ? null : domain.getStatus();
  final Integer failureCount = (status == null) ? null : status.getIntrospectJobFailureCount();
  return (failureCount == null) ? Integer.valueOf(0) : failureCount;
}

/**
 * Logs, at info level, the introspector job failure retry count for a domain together with
 * the configured maximum retry count.
 *
 * @param cachedInfo presence info whose domain supplies the domain UID used in the message
 */
private void logRetryCount(DomainPresenceInfo cachedInfo) {
LOGGER.info(MessageKeys.INTROSPECT_JOB_FAILED_RETRY_COUNT, cachedInfo.getDomain().getDomainUid(),
// NOTE(review): the next two arguments look like the before/after lines of a diff
// (no-arg vs. liveInfo variants of the same lookup) - confirm that only one is intended.
getCurrentIntrospectFailureRetryCount(),
getCurrentIntrospectFailureRetryCount(liveInfo),
DomainPresence.getDomainPresenceFailureRetryMaxCount());
}

Expand Down Expand Up @@ -1068,11 +1046,24 @@ private static String getIntrospectVersion(DomainPresenceInfo info) {
.orElse(null);
}

/**
 * Reads the introspector job failure count from the status of the given presence info's domain.
 *
 * @param info the presence info to inspect; may be null
 * @return the recorded failure count, or 0 when the info, its domain, its status,
 *     or the count itself is absent
 */
private Integer getCurrentIntrospectFailureRetryCount(DomainPresenceInfo info) {
  if (info == null) {
    return 0;
  }
  final Domain domain = info.getDomain();
  if (domain == null) {
    return 0;
  }
  final DomainStatus status = domain.getStatus();
  if (status == null) {
    return 0;
  }
  final Integer failureCount = status.getIntrospectJobFailureCount();
  return (failureCount != null) ? failureCount : Integer.valueOf(0);
}

/**
 * Decides whether the cached presence info describes a newer domain resource than the live one.
 *
 * @param liveInfo the live presence info; a null domain here always yields false
 * @param cachedInfo the cached presence info whose domain metadata is compared
 * @return true if the live info has a domain and KubernetesUtils.isFirstNewer reports the cached
 *     domain's metadata as newer than the live domain's metadata
 */
private static boolean isCachedInfoNewer(DomainPresenceInfo liveInfo, DomainPresenceInfo cachedInfo) {
  if (liveInfo.getDomain() == null) {
    return false;
  }
  return KubernetesUtils.isFirstNewer(
      cachedInfo.getDomain().getMetadata(),
      liveInfo.getDomain().getMetadata());
}

/**
 * Reports whether the introspector failure count of the given presence info has reached
 * the configured maximum.
 *
 * @param info the presence info whose failure count is checked
 * @return true if the count is greater than or equal to the maximum retry count
 */
private boolean hasExceededRetryCount(DomainPresenceInfo info) {
  // The helper falls back to 0, so unboxing here cannot hit a null.
  final int failures = getCurrentIntrospectFailureRetryCount(info);
  return failures >= DomainPresence.getDomainPresenceFailureRetryMaxCount();
}

@SuppressWarnings("unused")
private void runDomainPlan(
Domain dom,
Expand All @@ -1092,11 +1083,11 @@ public void onCompletion(Packet packet) {
@Override
public void onThrowable(Packet packet, Throwable throwable) {
logThrowable(throwable);

gate.startFiberIfLastFiberMatches(
domainUid,
Fiber.getCurrentIfSet(),
DomainStatusUpdater.createFailureRelatedSteps(throwable, null),
Step.chain(DomainStatusUpdater.createFailureCountStep(),
DomainStatusUpdater.createFailureRelatedSteps(throwable, null)),
plan.packet,
new CompletionCallback() {
@Override
Expand All @@ -1119,7 +1110,7 @@ public void onThrowable(Packet packet, Throwable throwable) {
LoggingContext.setThreadContext().namespace(ns).domainUid(domainUid)) {
existing.setPopulated(false);
// proceed only if we have not already retried max number of times
int retryCount = existing.incrementAndGetFailureCount();
int retryCount = getCurrentIntrospectFailureRetryCount(existing);
LOGGER.fine(
"Failure count for DomainPresenceInfo: "
+ existing
Expand Down Expand Up @@ -1179,6 +1170,11 @@ Step createDomainUpPlan(DomainPresenceInfo info) {
bringAdminServerUp(info, delegate.getPodAwaiterStepFactory(info.getNamespace())),
managedServerStrategy);

if (hasExceededRetryCount(info) && isImgRestartIntrospectVerChanged(info,
getExistingDomainPresenceInfo(info.getNamespace(), info.getDomainUid()))) {
domainUpStrategy = Step.chain(DomainStatusUpdater.createResetFailureCountStep(), domainUpStrategy);
}

return Step.chain(
createDomainUpInitialStep(info),
ConfigMapHelper.readExistingIntrospectorConfigMap(info.getNamespace(), info.getDomainUid()),
Expand Down Expand Up @@ -1226,8 +1222,7 @@ private static class TailStep extends Step {

@Override
public NextAction apply(Packet packet) {
packet.getSpi(DomainPresenceInfo.class).complete();
return doNext(packet);
return doNext(DomainStatusUpdater.createResetFailureCountStep(), packet);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,22 @@ private boolean isFatalError(DomainStatus domainStatus) {
}
}

/**
 * Creates a step that resets the introspect job failure count in the domain status.
 *
 * @return a new ResetFailureCountStep
 */
public static Step createResetFailureCountStep() {
  final Step resetStep = new ResetFailureCountStep();
  return resetStep;
}

/**
 * Status-updater step whose status modification resets the introspect job failure count.
 */
static class ResetFailureCountStep extends DomainStatusUpdaterStep {

// NOTE(review): null is handed to the superclass constructor - presumably "no next step";
// confirm against DomainStatusUpdaterStep.
public ResetFailureCountStep() {
super(null);
}

/**
 * Resets the introspect job failure count on the supplied status.
 *
 * @param domainStatus the status to modify
 */
@Override
void modifyStatus(DomainStatus domainStatus) {
domainStatus.resetIntrospectJobFailureCount();
}
}

/**
 * Creates a step that carries the given last-processed introspect job id.
 *
 * @param lastIntrospectJobProcessedId the id passed to the created step
 * @return a new RecordLastIntrospectJobProcessedUidStep
 */
public static Step recordLastIntrospectJobProcessedUid(String lastIntrospectJobProcessedId) {
  final Step recordStep = new RecordLastIntrospectJobProcessedUidStep(lastIntrospectJobProcessedId);
  return recordStep;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -476,13 +476,8 @@ protected DefaultResponseStep<V1Pod> resumeIfReady(Callback callback) {
return new DefaultResponseStep<>(getNext()) {
@Override
public NextAction onSuccess(Packet packet, CallResponse<V1Pod> callResponse) {

DomainPresenceInfo info = packet.getSpi(DomainPresenceInfo.class);
String serverName = (String)packet.get(SERVER_NAME);
if ((info != null) && (callResponse != null) && (callResponse.getResult() == null)) {
info.setServerPod(serverName, null);
}

String serverName = callback.geServerName();
if (isReady(callResponse.getResult(), info, serverName) || callback.didResumeFiber()) {
callback.proceedFromWait(callResponse.getResult());
return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public interface ProcessingConstants {

String DOMAIN_TOPOLOGY = "domainTopology";
String JOB_POD_NAME = "jobPodName";
String JOB_POD_CONTAINER_WAITING_REASON = "jobPodContainerWaitingReason";
String DOMAIN_INTROSPECTOR_JOB = "domainIntrospectorJob";
String DOMAIN_INTROSPECTOR_LOG_RESULT = "domainIntrospectorLogResult";
String DOMAIN_INTROSPECT_REQUESTED = "domainIntrospectRequested";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,6 @@ void logWaiting(String name) {
@Override
public final NextAction apply(Packet packet) {
String serverName = (String)packet.get(SERVER_NAME);
DomainPresenceInfo info = packet.getSpi(DomainPresenceInfo.class);
if (shouldTerminateFiber(initialResource)) {
return doTerminate(createTerminationException(initialResource), packet);
} else if (isReady(initialResource, packet.getSpi(DomainPresenceInfo.class), serverName)) {
Expand Down Expand Up @@ -280,10 +279,12 @@ class Callback implements Consumer<T> {
private final Packet packet;
private final AtomicBoolean didResume = new AtomicBoolean(false);
private final AtomicInteger recheckCount = new AtomicInteger(0);
private final String serverName;

Callback(AsyncFiber fiber, Packet packet) {
this.fiber = fiber;
this.packet = packet;
this.serverName = (String) packet.get(SERVER_NAME);
}

@Override
Expand Down Expand Up @@ -320,6 +321,10 @@ int incrementAndGetRecheckCount() {
/**
 * Returns the current value of the recheck counter.
 *
 * @return the recheck count
 */
int getRecheckCount() {
  final int rechecks = this.recheckCount.get();
  return rechecks;
}

/**
 * Returns the server name cached by this callback.
 *
 * <p>NOTE(review): the method name looks like a typo for {@code getServerName}; it is left
 * unchanged because callers reference it by this name.
 *
 * @return the cached server name
 */
String geServerName() {
  return this.serverName;
}
}

private void handleResourceReady(AsyncFiber fiber, Packet packet, T resource) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
Expand Down Expand Up @@ -54,7 +53,6 @@ public class DomainPresenceInfo {
private final AtomicReference<Domain> domain;
private final AtomicBoolean isDeleting = new AtomicBoolean(false);
private final AtomicBoolean isPopulated = new AtomicBoolean(false);
private final AtomicInteger retryCount = new AtomicInteger(0);
private final AtomicReference<Collection<ServerStartupInfo>> serverStartupInfo;
private final AtomicReference<Collection<ServerShutdownInfo>> serverShutdownInfo;

Expand Down Expand Up @@ -547,23 +545,6 @@ public void setPopulated(boolean populated) {
isPopulated.set(populated);
}

/** Sets the retry counter back to zero. */
private void resetFailureCount() {
  this.retryCount.set(0);
}

/**
 * Atomically increments the retry counter.
 *
 * @return the counter value after the increment
 */
public int incrementAndGetFailureCount() {
  final int updatedCount = this.retryCount.incrementAndGet();
  return updatedCount;
}

/**
 * Returns the current value of the retry counter.
 *
 * @return the retry count
 */
int getRetryCount() {
  final int currentCount = this.retryCount.get();
  return currentCount;
}

/** Marks processing as complete by resetting the failure count. */
public void complete() {
  this.resetFailureCount();
}

EventItem getLastEventItem() {
return lastEventItem;
}
Expand Down
Loading