Skip to content

Retry behavior for synchronous calls during initialization #2118

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public static TuningParameters getInstance() {
public PodTuning getPodTuning();

public static class MainTuning {
public final int initializationRetryDelaySeconds;
public final int domainPresenceFailureRetrySeconds;
public final int domainPresenceFailureRetryMaxCount;
public final int domainPresenceRecheckIntervalSeconds;
Expand All @@ -43,6 +44,7 @@ public static class MainTuning {

/**
* create main tuning.
* @param initializationRetryDelaySeconds initialization retry delay
* @param domainPresenceFailureRetrySeconds domain presence failure retry
* @param domainPresenceFailureRetryMaxCount domain presence failure retry max count
* @param domainPresenceRecheckIntervalSeconds domain presence recheck interval
Expand All @@ -54,6 +56,7 @@ public static class MainTuning {
* @param eventualLongDelay eventual long delay
*/
public MainTuning(
int initializationRetryDelaySeconds,
int domainPresenceFailureRetrySeconds,
int domainPresenceFailureRetryMaxCount,
int domainPresenceRecheckIntervalSeconds,
Expand All @@ -63,6 +66,7 @@ public MainTuning(
int stuckPodRecheckSeconds,
long initialShortDelay,
long eventualLongDelay) {
this.initializationRetryDelaySeconds = initializationRetryDelaySeconds;
this.domainPresenceFailureRetrySeconds = domainPresenceFailureRetrySeconds;
this.domainPresenceFailureRetryMaxCount = domainPresenceFailureRetryMaxCount;
this.domainPresenceRecheckIntervalSeconds = domainPresenceRecheckIntervalSeconds;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ private static void updateTuningParameters() {
private void update() {
MainTuning main =
new MainTuning(
(int) readTuningParameter("initializationRetryDelaySeconds", 5),
(int) readTuningParameter("domainPresenceFailureRetrySeconds", 10),
(int) readTuningParameter("domainPresenceFailureRetryMaxCount", 5),
(int) readTuningParameter("domainPresenceRecheckIntervalSeconds", 120),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.Callable;
import java.util.function.Consumer;
import javax.annotation.Nonnull;

Expand Down Expand Up @@ -66,6 +67,9 @@
import oracle.kubernetes.operator.calls.RetryStrategy;
import oracle.kubernetes.operator.calls.SynchronousCallDispatcher;
import oracle.kubernetes.operator.calls.SynchronousCallFactory;
import oracle.kubernetes.operator.logging.LoggingFacade;
import oracle.kubernetes.operator.logging.LoggingFactory;
import oracle.kubernetes.operator.logging.MessageKeys;
import oracle.kubernetes.operator.work.Step;
import oracle.kubernetes.weblogic.domain.api.WeblogicApi;
import oracle.kubernetes.weblogic.domain.model.Domain;
Expand All @@ -77,6 +81,8 @@
/** Simplifies synchronous and asynchronous call patterns to the Kubernetes API Server. */
@SuppressWarnings({"WeakerAccess", "UnusedReturnValue"})
public class CallBuilder {
private static final LoggingFacade LOGGER = LoggingFactory.getLogger("Operator", "Operator");

/** HTTP status code for "Not Found". */
public static final int NOT_FOUND = 404;

Expand Down Expand Up @@ -594,6 +600,54 @@ private <T> T executeSynchronousCall(
return DISPATCHER.execute(factory, requestParams, helper);
}

/**
* Execute a synchronous call with a retry on failure.
* @param call The call
* @param retryDelaySeconds Retry delay in seconds
* @param <T> Call return type
* @return Results of operation, if successful
* @throws Exception Exception types other than ApiException, which will cause failure
*/
public <T> T executeSynchronousCallWithRetry(Callable<T> call, int retryDelaySeconds) throws Exception {
/*
* Implementation Note: synchronous calls are only allowed during operator initialization.
* All make-right work must be done with the asynchronous calling pattern. Therefore, since
* we know that this method will only be invoked during operator initialization, we've chosen
* not to put a limit on the number of retries. This is acceptable because the liveness probe will
* eventually kill the operator if the initialization sequence does not complete.
*
* This call was specifically added to address the Istio-related use case where the operator attempts
* to initialize prior to the Istio Envoy sidecar completing its initialization as described in this
* Istio bug: https://github.com/istio/istio/issues/11130. However, the pattern will also work for
* use cases where the Kubernetes master happens to temporarily unavailable just as the operator is
* starting.
*/
T result = null;
boolean complete = false;
do {
try {
result = call.call();
complete = true;
} catch (RuntimeException re) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happened if it is a RuntimeException but not an ApiException? Do we want to limit the number of retries or make it retry forever (hopefully the condition can be resolved)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll think about other exceptions... I didn't want to limit the number of retries here because the operator cannot go on until it can connect to the master and the operator will eventually be killed by the liveness probe.

Throwable cause = re.getCause();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the ApiException ever get nested further down? Is there ever a case where you need to loop through the causes looking for ApiException?

if (cause instanceof ApiException) {
LOGGER.warning(MessageKeys.EXCEPTION, cause);
}
} catch (Throwable t) {
LOGGER.warning(MessageKeys.EXCEPTION, t);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a comment here that we expect the liveness probe to cancel this process if it "retries forever". The next coder may not understand the assumption/expectation.

}

if (complete) {
break;
}

Thread.sleep(retryDelaySeconds * 1000L);

// We are intentionally not limiting the number of retries as described in the implementation note above.
} while (true);
return result;
}

/**
* Read namespace.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
import java.util.Map;
import javax.annotation.Nonnull;

import io.kubernetes.client.openapi.ApiException;
import io.kubernetes.client.openapi.models.V1ResourceRule;
import io.kubernetes.client.openapi.models.V1SelfSubjectRulesReview;
import io.kubernetes.client.openapi.models.V1SubjectRulesReviewStatus;
import io.kubernetes.client.openapi.models.VersionInfo;
import oracle.kubernetes.operator.Main;
import oracle.kubernetes.operator.TuningParameters;
import oracle.kubernetes.operator.helpers.AuthorizationProxy.Operation;
import oracle.kubernetes.operator.helpers.AuthorizationProxy.Resource;
import oracle.kubernetes.operator.logging.LoggingFacade;
Expand Down Expand Up @@ -205,9 +205,12 @@ public static KubernetesVersion performK8sVersionCheck() {
LOGGER.fine(MessageKeys.VERIFY_K8S_MIN_VERSION);

try {
return createAndValidateKubernetesVersion(new CallBuilder().readVersionCode());
} catch (ApiException ae) {
LOGGER.warning(MessageKeys.K8S_VERSION_CHECK_FAILURE, ae);
CallBuilder cb = new CallBuilder();
return createAndValidateKubernetesVersion(
cb.executeSynchronousCallWithRetry(() -> cb.readVersionCode(),
TuningParameters.getInstance().getMainTuning().initializationRetryDelaySeconds));
} catch (Throwable t) {
LOGGER.warning(MessageKeys.K8S_VERSION_CHECK_FAILURE, t);
return KubernetesVersion.UNREADABLE;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ private static String toJson(Object object) {

@Before
public void setUp() throws NoSuchFieldException {
mementos.add(TestUtils.silenceOperatorLogger());
mementos.add(TestUtils.silenceOperatorLogger().ignoringLoggedExceptions(ApiException.class));
mementos.add(PseudoServletCallDispatcher.install(getHostPath()));
}

Expand All @@ -78,6 +78,34 @@ public void getVersionCode_returnsAVersionInfo() throws ApiException {
assertThat(callBuilder.readVersionCode(), equalTo(versionInfo));
}

@Test
public void getVersionCode_firstAttemptFailsAndThenReturnsAVersionInfo() throws Exception {
VersionInfo versionInfo = new VersionInfo().major("1").minor("2");
defineHttpGetResponse("/version/", new FailOnceGetServlet(versionInfo, HTTP_BAD_REQUEST));

assertThat(callBuilder.executeSynchronousCallWithRetry(
() -> callBuilder.readVersionCode(), 1), equalTo(versionInfo));
}

static class FailOnceGetServlet extends JsonGetServlet {

final int errorCode;
int numGetResponseCalled = 0;

FailOnceGetServlet(Object returnValue, int errorCode) {
super(returnValue);
this.errorCode = errorCode;
}

@Override
public WebResource getGetResponse() throws IOException {
if (numGetResponseCalled++ > 0) {
return super.getGetResponse();
}
return new WebResource("", errorCode);
}
}

@Test
public void listDomains_returnsListasJson() throws ApiException {
DomainList list = new DomainList().withItems(Arrays.asList(new Domain(), new Domain()));
Expand Down Expand Up @@ -132,6 +160,11 @@ private JsonServlet defineHttpGetResponse(String resourceName, Object response)
return servlet;
}

private void defineHttpGetResponse(
String resourceName, PseudoServlet pseudoServlet) {
defineResource(resourceName, pseudoServlet);
}

private void defineHttpPostResponse(String resourceName, Object response) {
defineResource(resourceName, new JsonPostServlet(response));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ public static void setParameter(String key, String value) {

@Override
public MainTuning getMainTuning() {
return new MainTuning(2, 2, 2, 2, 2, 2, 30, 2L, 2L);
return new MainTuning(5, 2, 2, 2, 2, 2, 2, 30, 2L, 2L);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ public void setUp() throws Exception {
consoleControl = TestUtils.silenceOperatorLogger().collectLogMessages(logRecords, LOG_KEYS);
mementos.add(consoleControl);
mementos.add(ClientFactoryStub.install());
mementos.add(TuningParametersStub.install());
mementos.add(testSupport.installSynchronousCallDispatcher());
}

Expand Down