Skip to content

[7.x] Use query param instead of a system property for opting in for new cluster health response code #79397

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.xcontent.XContentType;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.indices.recovery.RecoverySettings;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.transport.RemoteClusterService;
import org.elasticsearch.transport.SniffConnectionStrategy;
import org.elasticsearch.xcontent.XContentType;

import java.io.IOException;
import java.util.Collections;
Expand Down Expand Up @@ -317,7 +317,7 @@ public void testClusterHealthNotFoundIndex() throws IOException {
assertThat(response.getStatus(), equalTo(ClusterHealthStatus.RED));
assertNoIndices(response);
assertWarnings("The HTTP status code for a cluster health timeout will be changed from 408 to 200 in a " +
"future version. Set the [es.cluster_health.request_timeout_200] system property to [true] to suppress this message and " +
"future version. Set the [return_200_for_cluster_health_timeout] query parameter to [true] to suppress this message and " +
"opt in to the future behaviour now.");
}

Expand Down
5 changes: 5 additions & 0 deletions docs/reference/cluster/health.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=timeoutparms]
provided or better, i.e. `green` > `yellow` > `red`. By default, will not
wait for any status.

`return_200_for_cluster_health_timeout`::
(Optional, Boolean) A boolean value which controls whether to return HTTP 200
status code instead of HTTP 408 in case of a cluster health timeout from
the server side. Defaults to false.

[[cluster-health-api-response-body]]
==== {api-response-body-title}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@
"red"
],
"description":"Wait until cluster is in a specific state"
},
"return_200_for_cluster_health_timeout":{
"type":"boolean",
"description":"Whether to return HTTP 200 instead of 408 in case of a cluster health timeout from the server side"
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,25 @@
- match: { initializing_shards: 0 }
- match: { unassigned_shards: 0 }
- gte: { number_of_pending_tasks: 0 }

---
"cluster health request timeout with 200 response code":
- skip:
version: " - 7.15.99"
reason: "return_200_for_cluster_health_timeout was added in 7.16"
- do:
cluster.health:
timeout: 1ms
wait_for_active_shards: 5
return_200_for_cluster_health_timeout: true

- is_true: cluster_name
- is_true: timed_out
- gte: { number_of_nodes: 1 }
- gte: { number_of_data_nodes: 1 }
- match: { active_primary_shards: 0 }
- match: { active_shards: 0 }
- match: { relocating_shards: 0 }
- match: { initializing_shards: 0 }
- match: { unassigned_shards: 0 }
- gte: { number_of_pending_tasks: 0 }
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ public class ClusterHealthRequest extends MasterNodeReadRequest<ClusterHealthReq
private ActiveShardCount waitForActiveShards = ActiveShardCount.NONE;
private String waitForNodes = "";
private Priority waitForEvents = null;
private boolean return200ForClusterHealthTimeout;

/**
* Only used by the high-level REST Client. Controls the details level of the health information returned.
* The default value is 'cluster'.
Expand Down Expand Up @@ -69,6 +71,9 @@ public ClusterHealthRequest(StreamInput in) throws IOException {
} else {
indicesOptions = IndicesOptions.lenientExpandOpen();
}
if (in.getVersion().onOrAfter(Version.V_7_16_0)) {
return200ForClusterHealthTimeout = in.readBoolean();
}
}

@Override
Expand Down Expand Up @@ -101,6 +106,11 @@ public void writeTo(StreamOutput out) throws IOException {
if (out.getVersion().onOrAfter(Version.V_7_2_0)) {
indicesOptions.writeIndicesOptions(out);
}
if (out.getVersion().onOrAfter(Version.V_7_16_0)) {
out.writeBoolean(return200ForClusterHealthTimeout);
} else if (return200ForClusterHealthTimeout) {
throw new IllegalArgumentException("Can't fix response code in a cluster involving nodes with version " + out.getVersion());
}
}

@Override
Expand Down Expand Up @@ -244,6 +254,18 @@ public Priority waitForEvents() {
return this.waitForEvents;
}

public boolean doesReturn200ForClusterHealthTimeout() {
return return200ForClusterHealthTimeout;
}

/**
* Sets whether to return HTTP 200 status code instead of HTTP 408 in case of a
* cluster health timeout from the server side.
*/
public void return200ForClusterHealthTimeout(boolean return200ForClusterHealthTimeout) {
this.return200ForClusterHealthTimeout = return200ForClusterHealthTimeout;
}

/**
* Set the level of detail for the health information to be returned.
* Only used by the high-level REST Client.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,26 @@

package org.elasticsearch.action.admin.cluster.health;

import org.elasticsearch.Version;
import org.elasticsearch.action.ActionResponse;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.health.ClusterHealthStatus;
import org.elasticsearch.cluster.health.ClusterIndexHealth;
import org.elasticsearch.cluster.health.ClusterStateHealth;
import org.elasticsearch.common.logging.DeprecationCategory;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.xcontent.ParseField;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.logging.DeprecationCategory;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.xcontent.StatusToXContentObject;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.rest.action.search.RestSearchAction;
import org.elasticsearch.xcontent.ConstructingObjectParser;
import org.elasticsearch.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.StatusToXContentObject;
import org.elasticsearch.xcontent.ParseField;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.rest.action.search.RestSearchAction;

import java.io.IOException;
import java.util.HashMap;
Expand Down Expand Up @@ -102,10 +103,10 @@ public class ClusterHealthResponse extends ActionResponse implements StatusToXCo

private static final ObjectParser.NamedObjectParser<ClusterIndexHealth, Void> INDEX_PARSER =
(XContentParser parser, Void context, String index) -> ClusterIndexHealth.innerFromXContent(parser, index);
private static final String ES_CLUSTER_HEALTH_REQUEST_TIMEOUT_200_KEY = "es.cluster_health.request_timeout_200";
static final String ES_CLUSTER_HEALTH_REQUEST_TIMEOUT_200_KEY = "return_200_for_cluster_health_timeout";
static final String CLUSTER_HEALTH_REQUEST_TIMEOUT_DEPRECATION_MSG = "The HTTP status code for a cluster health timeout " +
"will be changed from 408 to 200 in a future version. Set the [" + ES_CLUSTER_HEALTH_REQUEST_TIMEOUT_200_KEY + "] " +
"system property to [true] to suppress this message and opt in to the future behaviour now.";
"query parameter to [true] to suppress this message and opt in to the future behaviour now.";

static {
// ClusterStateHealth fields
Expand Down Expand Up @@ -138,15 +139,7 @@ public class ClusterHealthResponse extends ActionResponse implements StatusToXCo
private boolean timedOut = false;
private ClusterStateHealth clusterStateHealth;
private ClusterHealthStatus clusterHealthStatus;
private boolean esClusterHealthRequestTimeout200 = readEsClusterHealthRequestTimeout200FromProperty();

public ClusterHealthResponse() {
}

/** For the testing of opting in for the 200 status code without setting a system property */
ClusterHealthResponse(boolean esClusterHealthRequestTimeout200) {
this.esClusterHealthRequestTimeout200 = esClusterHealthRequestTimeout200;
}
private boolean return200ForClusterHealthTimeout;

public ClusterHealthResponse(StreamInput in) throws IOException {
super(in);
Expand All @@ -158,22 +151,29 @@ public ClusterHealthResponse(StreamInput in) throws IOException {
numberOfInFlightFetch = in.readInt();
delayedUnassignedShards= in.readInt();
taskMaxWaitingTime = in.readTimeValue();
if (in.getVersion().onOrAfter(Version.V_7_16_0)) {
return200ForClusterHealthTimeout = in.readBoolean();
}
}

/** needed for plugins BWC */
public ClusterHealthResponse(String clusterName, String[] concreteIndices, ClusterState clusterState) {
this(clusterName, concreteIndices, clusterState, -1, -1, -1, TimeValue.timeValueHours(0));
public ClusterHealthResponse(String clusterName, String[] concreteIndices, ClusterState clusterState,
boolean return200ForServerTimeout) {
this(clusterName, concreteIndices, clusterState, -1, -1, -1, TimeValue.timeValueHours(0),
return200ForServerTimeout);
}

public ClusterHealthResponse(String clusterName, String[] concreteIndices, ClusterState clusterState, int numberOfPendingTasks,
int numberOfInFlightFetch, int delayedUnassignedShards, TimeValue taskMaxWaitingTime) {
int numberOfInFlightFetch, int delayedUnassignedShards, TimeValue taskMaxWaitingTime,
boolean return200ForServerTimeout) {
this.clusterName = clusterName;
this.numberOfPendingTasks = numberOfPendingTasks;
this.numberOfInFlightFetch = numberOfInFlightFetch;
this.delayedUnassignedShards = delayedUnassignedShards;
this.taskMaxWaitingTime = taskMaxWaitingTime;
this.clusterStateHealth = new ClusterStateHealth(clusterState, concreteIndices);
this.clusterHealthStatus = clusterStateHealth.getStatus();
this.return200ForClusterHealthTimeout = return200ForServerTimeout;
}

/**
Expand Down Expand Up @@ -305,6 +305,11 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeInt(numberOfInFlightFetch);
out.writeInt(delayedUnassignedShards);
out.writeTimeValue(taskMaxWaitingTime);
if (out.getVersion().onOrAfter(Version.V_7_16_0)) {
out.writeBoolean(return200ForClusterHealthTimeout);
} else if (return200ForClusterHealthTimeout) {
throw new IllegalArgumentException("Can't fix response code in a cluster involving nodes with version " + out.getVersion());
}
}

@Override
Expand All @@ -317,7 +322,7 @@ public RestStatus status() {
if (isTimedOut() == false) {
return RestStatus.OK;
}
if (esClusterHealthRequestTimeout200) {
if (return200ForClusterHealthTimeout) {
return RestStatus.OK;
} else {
deprecationLogger.critical(DeprecationCategory.API,"cluster_health_request_timeout",
Expand Down Expand Up @@ -383,17 +388,4 @@ public int hashCode() {
return Objects.hash(clusterName, numberOfPendingTasks, numberOfInFlightFetch, delayedUnassignedShards, taskMaxWaitingTime,
timedOut, clusterStateHealth, clusterHealthStatus);
}

private static boolean readEsClusterHealthRequestTimeout200FromProperty() {
String property = System.getProperty(ES_CLUSTER_HEALTH_REQUEST_TIMEOUT_200_KEY);
if (property == null) {
return false;
}
if (Boolean.parseBoolean(property)) {
return true;
} else {
throw new IllegalArgumentException(ES_CLUSTER_HEALTH_REQUEST_TIMEOUT_200_KEY + " can only be unset or [true] but was ["
+ property + "]");
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.common.util.CollectionUtils;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.node.NodeClosedException;
import org.elasticsearch.tasks.Task;
Expand Down Expand Up @@ -232,7 +232,8 @@ private enum TimeoutState {

private ClusterHealthResponse getResponse(final ClusterHealthRequest request, ClusterState clusterState,
final int waitFor, TimeoutState timeoutState) {
ClusterHealthResponse response = clusterHealth(request, clusterState, clusterService.getMasterService().numberOfPendingTasks(),
ClusterHealthResponse response = clusterHealth(request, clusterState,
clusterService.getMasterService().numberOfPendingTasks(),
allocationService.getNumberOfInFlightFetches(), clusterService.getMasterService().getMaxTaskWaitTime());
int readyCounter = prepareResponse(request, response, clusterState, indexNameExpressionResolver);
boolean valid = (readyCounter == waitFor);
Expand Down Expand Up @@ -331,8 +332,8 @@ static int prepareResponse(final ClusterHealthRequest request, final ClusterHeal
}


private ClusterHealthResponse clusterHealth(ClusterHealthRequest request, ClusterState clusterState, int numberOfPendingTasks,
int numberOfInFlightFetch, TimeValue pendingTaskTimeInQueue) {
private ClusterHealthResponse clusterHealth(ClusterHealthRequest request, ClusterState clusterState,
int numberOfPendingTasks, int numberOfInFlightFetch, TimeValue pendingTaskTimeInQueue) {
if (logger.isTraceEnabled()) {
logger.trace("Calculating health based on state version [{}]", clusterState.version());
}
Expand All @@ -344,12 +345,13 @@ private ClusterHealthResponse clusterHealth(ClusterHealthRequest request, Cluste
// one of the specified indices is not there - treat it as RED.
ClusterHealthResponse response = new ClusterHealthResponse(clusterState.getClusterName().value(), Strings.EMPTY_ARRAY,
clusterState, numberOfPendingTasks, numberOfInFlightFetch, UnassignedInfo.getNumberOfDelayedUnassigned(clusterState),
pendingTaskTimeInQueue);
pendingTaskTimeInQueue, request.doesReturn200ForClusterHealthTimeout());
response.setStatus(ClusterHealthStatus.RED);
return response;
}

return new ClusterHealthResponse(clusterState.getClusterName().value(), concreteIndices, clusterState, numberOfPendingTasks,
numberOfInFlightFetch, UnassignedInfo.getNumberOfDelayedUnassigned(clusterState), pendingTaskTimeInQueue);
return new ClusterHealthResponse(clusterState.getClusterName().value(), concreteIndices, clusterState,
numberOfPendingTasks, numberOfInFlightFetch, UnassignedInfo.getNumberOfDelayedUnassigned(clusterState), pendingTaskTimeInQueue,
request.doesReturn200ForClusterHealthTimeout());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ public static ClusterHealthRequest fromRequest(final RestRequest request) {
if (request.param("wait_for_events") != null) {
clusterHealthRequest.waitForEvents(Priority.valueOf(request.param("wait_for_events").toUpperCase(Locale.ROOT)));
}
clusterHealthRequest.return200ForClusterHealthTimeout(request.paramAsBoolean(
"return_200_for_cluster_health_timeout",
clusterHealthRequest.doesReturn200ForClusterHealthTimeout()));
return clusterHealthRequest;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ public void testSerialize() throws Exception {
assertThat(cloneRequest.waitForEvents(), equalTo(originalRequest.waitForEvents()));
assertIndicesEquals(cloneRequest.indices(), originalRequest.indices());
assertThat(cloneRequest.indicesOptions(), equalTo(originalRequest.indicesOptions()));
assertThat(cloneRequest.doesReturn200ForClusterHealthTimeout(), equalTo(originalRequest.doesReturn200ForClusterHealthTimeout()));
}

public void testRequestReturnsHiddenIndicesByDefault() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.xcontent.ToXContent;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.test.AbstractSerializingTestCase;
import org.elasticsearch.xcontent.ToXContent;
import org.elasticsearch.xcontent.XContentParser;
import org.hamcrest.Matchers;

import java.io.IOException;
Expand All @@ -43,7 +43,7 @@ public class ClusterHealthResponsesTests extends AbstractSerializingTestCase<Clu
private final ClusterHealthRequest.Level level = randomFrom(ClusterHealthRequest.Level.values());

public void testIsTimeout() {
ClusterHealthResponse res = new ClusterHealthResponse();
ClusterHealthResponse res = new ClusterHealthResponse("", new String[]{}, ClusterState.EMPTY_STATE, false);
for (int i = 0; i < 5; i++) {
res.setTimedOut(randomBoolean());
if (res.isTimedOut()) {
Expand All @@ -56,7 +56,7 @@ public void testIsTimeout() {
}

public void testTimeoutReturns200IfOptedIn() {
ClusterHealthResponse res = new ClusterHealthResponse(true);
ClusterHealthResponse res = new ClusterHealthResponse("", new String[]{}, ClusterState.EMPTY_STATE, true);
for (int i = 0; i < 5; i++) {
res.setTimedOut(randomBoolean());
assertEquals(RestStatus.OK, res.status());
Expand All @@ -70,7 +70,7 @@ public void testClusterHealth() throws IOException {
int delayedUnassigned = randomIntBetween(0, 200);
TimeValue pendingTaskInQueueTime = TimeValue.timeValueMillis(randomIntBetween(1000, 100000));
ClusterHealthResponse clusterHealth = new ClusterHealthResponse("bla", new String[] {Metadata.ALL},
clusterState, pendingTasks, inFlight, delayedUnassigned, pendingTaskInQueueTime);
clusterState, pendingTasks, inFlight, delayedUnassigned, pendingTaskInQueueTime, false);
clusterHealth = maybeSerialize(clusterHealth);
assertClusterHealth(clusterHealth);
assertThat(clusterHealth.getNumberOfPendingTasks(), Matchers.equalTo(pendingTasks));
Expand Down
Loading