Skip to content

xds: add "resource_timer_is_transient_error" server feature #12063

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 15, 2025
9 changes: 6 additions & 3 deletions xds/src/main/java/io/grpc/xds/client/Bootstrapper.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,20 @@ public abstract static class ServerInfo {

public abstract boolean isTrustedXdsServer();

public abstract boolean resourceTimerIsTransientError();

@VisibleForTesting
public static ServerInfo create(String target, @Nullable Object implSpecificConfig) {
return new AutoValue_Bootstrapper_ServerInfo(target, implSpecificConfig, false, false);
return new AutoValue_Bootstrapper_ServerInfo(target, implSpecificConfig,
false, false, false);
}

@VisibleForTesting
public static ServerInfo create(
String target, Object implSpecificConfig, boolean ignoreResourceDeletion,
boolean isTrustedXdsServer) {
boolean isTrustedXdsServer, boolean resourceTimerIsTransientError) {
return new AutoValue_Bootstrapper_ServerInfo(target, implSpecificConfig,
ignoreResourceDeletion, isTrustedXdsServer);
ignoreResourceDeletion, isTrustedXdsServer, resourceTimerIsTransientError);
}
}

Expand Down
13 changes: 12 additions & 1 deletion xds/src/main/java/io/grpc/xds/client/BootstrapperImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ public abstract class BootstrapperImpl extends Bootstrapper {

public static final String GRPC_EXPERIMENTAL_XDS_FALLBACK =
"GRPC_EXPERIMENTAL_XDS_FALLBACK";
public static final String GRPC_EXPERIMENTAL_XDS_DATA_ERROR_HANDLING =
"GRPC_EXPERIMENTAL_XDS_DATA_ERROR_HANDLING";

// Client features.
@VisibleForTesting
Expand All @@ -54,10 +56,15 @@ public abstract class BootstrapperImpl extends Bootstrapper {
// Server features.
private static final String SERVER_FEATURE_IGNORE_RESOURCE_DELETION = "ignore_resource_deletion";
private static final String SERVER_FEATURE_TRUSTED_XDS_SERVER = "trusted_xds_server";
private static final String
SERVER_FEATURE_RESOURCE_TIMER_IS_TRANSIENT_ERROR = "resource_timer_is_transient_error";

@VisibleForTesting
static boolean enableXdsFallback = GrpcUtil.getFlag(GRPC_EXPERIMENTAL_XDS_FALLBACK, true);

static boolean XdsDataErrorHandlingEnabled
= GrpcUtil.getFlag(GRPC_EXPERIMENTAL_XDS_DATA_ERROR_HANDLING, false);

protected final XdsLogger logger;

protected FileReader reader = LocalFileReader.INSTANCE;
Expand Down Expand Up @@ -247,18 +254,22 @@ private List<ServerInfo> parseServerInfos(List<?> rawServerConfigs, XdsLogger lo

Object implSpecificConfig = getImplSpecificConfig(serverConfig, serverUri);

boolean resourceTimerIsTransientError = false;
boolean ignoreResourceDeletion = false;
// "For forward compatibility reasons, the client will ignore any entry in the list that it
// does not understand, regardless of type."
List<?> serverFeatures = JsonUtil.getList(serverConfig, "server_features");
if (serverFeatures != null) {
logger.log(XdsLogLevel.INFO, "Server features: {0}", serverFeatures);
ignoreResourceDeletion = serverFeatures.contains(SERVER_FEATURE_IGNORE_RESOURCE_DELETION);
resourceTimerIsTransientError = XdsDataErrorHandlingEnabled
&& serverFeatures.contains(SERVER_FEATURE_RESOURCE_TIMER_IS_TRANSIENT_ERROR);
}
servers.add(
ServerInfo.create(serverUri, implSpecificConfig, ignoreResourceDeletion,
serverFeatures != null
&& serverFeatures.contains(SERVER_FEATURE_TRUSTED_XDS_SERVER)));
&& serverFeatures.contains(SERVER_FEATURE_TRUSTED_XDS_SERVER),
resourceTimerIsTransientError));
}
return servers.build();
}
Expand Down
6 changes: 5 additions & 1 deletion xds/src/main/java/io/grpc/xds/client/XdsClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,10 @@
return new ResourceMetadata(ResourceMetadataStatus.DOES_NOT_EXIST, "", 0, false, null, null);
}

public static ResourceMetadata newResourceMetadataTimeout() {
return new ResourceMetadata(ResourceMetadataStatus.TIMEOUT, "", 0, false, null, null);

Check warning on line 203 in xds/src/main/java/io/grpc/xds/client/XdsClient.java

View check run for this annotation

Codecov / codecov/patch

xds/src/main/java/io/grpc/xds/client/XdsClient.java#L203

Added line #L203 was not covered by tests
}

public static ResourceMetadata newResourceMetadataAcked(
Any rawResource, String version, long updateTimeNanos) {
checkNotNull(rawResource, "rawResource");
Expand Down Expand Up @@ -256,7 +260,7 @@
* config_dump.proto</a>
*/
public enum ResourceMetadataStatus {
UNKNOWN, REQUESTED, DOES_NOT_EXIST, ACKED, NACKED
UNKNOWN, REQUESTED, DOES_NOT_EXIST, ACKED, NACKED, TIMEOUT
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is used by CSDS, and looks like it crashes if you don't update it:

/**
 * Maps a {@code ResourceMetadataStatus} to the {@code ClientResourceStatus} constant of the
 * same name.
 *
 * <p>Only UNKNOWN, DOES_NOT_EXIST, REQUESTED, ACKED and NACKED have an explicit case; every
 * other status reaches the {@code default} branch.
 *
 * @param status the metadata status to convert
 * @return the {@code ClientResourceStatus} constant matching {@code status}
 * @throws AssertionError if {@code status} has no explicit case in the switch
 */
static ClientResourceStatus metadataStatusToClientStatus(ResourceMetadataStatus status) {
switch (status) {
case UNKNOWN:
return ClientResourceStatus.UNKNOWN;
case DOES_NOT_EXIST:
return ClientResourceStatus.DOES_NOT_EXIST;
case REQUESTED:
return ClientResourceStatus.REQUESTED;
case ACKED:
return ClientResourceStatus.ACKED;
case NACKED:
return ClientResourceStatus.NACKED;
default:
// Any status without a case above (for example a newly added enum constant) ends up here
// and fails loudly instead of being silently mapped to some other value.
throw new AssertionError("Unexpected ResourceMetadataStatus: " + status);
}
}

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So in the case of TIMEOUT, the CSDS service should return either ClientResourceStatus.REQUESTED or ClientResourceStatus.DOES_NOT_EXIST. What do you think? I think DOES_NOT_EXIST fits better.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is clearly not something for us to arbitrarily decide during implementation. That's defined by the gRFC. They map 1:1 with the CSDS enum values.

Those states map directly to the states in the CSDS ClientResourceStatus enum.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. I see now: envoyproxy/envoy@2aaa544. We need to sync the Envoy protos into grpc-java to have these values available.

}

/**
Expand Down
15 changes: 13 additions & 2 deletions xds/src/main/java/io/grpc/xds/client/XdsClientImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static io.grpc.xds.client.BootstrapperImpl.XdsDataErrorHandlingEnabled;
import static io.grpc.xds.client.XdsResourceType.ParsedResource;
import static io.grpc.xds.client.XdsResourceType.ValidatedResourceUpdate;

Expand Down Expand Up @@ -67,6 +68,7 @@ public final class XdsClientImpl extends XdsClient implements ResourceStore {
// Longest time to wait, since the subscription to some resource, for concluding its absence.
@VisibleForTesting
public static final int INITIAL_RESOURCE_FETCH_TIMEOUT_SEC = 15;
public static final int EXTENDED_RESOURCE_FETCH_TIMEOUT_SEC = 30;

private final SynchronizationContext syncContext = new SynchronizationContext(
new Thread.UncaughtExceptionHandler() {
Expand Down Expand Up @@ -739,6 +741,9 @@ void restartTimer() {
// When client becomes ready, it triggers a restartTimer for all relevant subscribers.
return;
}
ServerInfo serverInfo = activeCpc.getServerInfo();
int timeoutSec = XdsDataErrorHandlingEnabled && serverInfo.resourceTimerIsTransientError()
? EXTENDED_RESOURCE_FETCH_TIMEOUT_SEC : INITIAL_RESOURCE_FETCH_TIMEOUT_SEC;

class ResourceNotFound implements Runnable {
@Override
Expand All @@ -762,7 +767,7 @@ public String toString() {
respTimer.cancel();
}
respTimer = syncContext.schedule(
new ResourceNotFound(), INITIAL_RESOURCE_FETCH_TIMEOUT_SEC, TimeUnit.SECONDS,
new ResourceNotFound(), timeoutSec, TimeUnit.SECONDS,
timeService);
}

Expand Down Expand Up @@ -841,6 +846,8 @@ void onAbsent(@Nullable ProcessingTracker processingTracker, ServerInfo serverIn
// Ignore deletion of State of the World resources when this feature is on,
// and the resource is reusable.
boolean ignoreResourceDeletionEnabled = serverInfo.ignoreResourceDeletion();
boolean resourceTimerIsTransientError =
XdsDataErrorHandlingEnabled && serverInfo.resourceTimerIsTransientError();
if (ignoreResourceDeletionEnabled && type.isFullStateOfTheWorld() && data != null) {
if (!resourceDeletionIgnored) {
logger.log(XdsLogLevel.FORCE_WARNING,
Expand All @@ -855,13 +862,17 @@ void onAbsent(@Nullable ProcessingTracker processingTracker, ServerInfo serverIn
if (!absent) {
data = null;
absent = true;
metadata = ResourceMetadata.newResourceMetadataDoesNotExist();
metadata = resourceTimerIsTransientError ? ResourceMetadata.newResourceMetadataTimeout() :
ResourceMetadata.newResourceMetadataDoesNotExist();
for (ResourceWatcher<T> watcher : watchers.keySet()) {
if (processingTracker != null) {
processingTracker.startTask();
}
watchers.get(watcher).execute(() -> {
try {
/*This will go after xdsClient watcher APIs are in.
watcher.onResourceChanged(StatusOr.fromStatus(Status.UNAVAILABLE.withDescription(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can just call onError(). That's equivalent for the current API as there is no data yet.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that a lot of new code has been merged since then. There are a lot of new failures in files like CdsLoadBalancer2Test, GrpcXdsClientImplTestBase, GrpcXdsClientImplV3Test, etc.
The same files and lines were also being changed by the xDS client watcher API changes. Perhaps this should wait until the xDS client watcher API changes go in?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why should there be any new failures? Sounds like you aren't setting resourceTimerIsTransientError correctly to preserve the old behavior for the majority of the tests.

"Resource " + resource + ": timeout obtaining resource from xDS server")));*/
watcher.onResourceDoesNotExist(resource);
} finally {
if (processingTracker != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3505,7 +3505,7 @@ private static Filter buildHttpConnectionManagerFilter(HttpFilter... httpFilters

private XdsResourceType.Args getXdsResourceTypeArgs(boolean isTrustedServer) {
return new XdsResourceType.Args(
ServerInfo.create("http://td", "", false, isTrustedServer), "1.0", null, null, null, null
ServerInfo.create("http://td", "", false, isTrustedServer, false), "1.0", null, null, null, null
);
}
}
4 changes: 2 additions & 2 deletions xds/src/test/java/io/grpc/xds/GrpcXdsClientImplTestBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ public XdsTransport create(ServerInfo serverInfo) {
};

xdsServerInfo = ServerInfo.create(SERVER_URI, CHANNEL_CREDENTIALS, ignoreResourceDeletion(),
true);
true, false);
BootstrapInfo bootstrapInfo =
Bootstrapper.BootstrapInfo.builder()
.servers(Collections.singletonList(xdsServerInfo))
Expand Down Expand Up @@ -4201,7 +4201,7 @@ private XdsClientImpl createXdsClient(String serverUri) {
private BootstrapInfo buildBootStrap(String serverUri) {

ServerInfo xdsServerInfo = ServerInfo.create(serverUri, CHANNEL_CREDENTIALS,
ignoreResourceDeletion(), true);
ignoreResourceDeletion(), true, false);

return Bootstrapper.BootstrapInfo.builder()
.servers(Collections.singletonList(xdsServerInfo))
Expand Down
4 changes: 2 additions & 2 deletions xds/src/test/java/io/grpc/xds/XdsNameResolverTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -367,13 +367,13 @@ public void resolving_targetAuthorityInAuthoritiesMap() {
String serviceAuthority = "[::FFFF:129.144.52.38]:80";
bootstrapInfo = BootstrapInfo.builder()
.servers(ImmutableList.of(ServerInfo.create(
"td.googleapis.com", InsecureChannelCredentials.create(), true, true)))
"td.googleapis.com", InsecureChannelCredentials.create(), true, true, false)))
.node(Node.newBuilder().build())
.authorities(
ImmutableMap.of(targetAuthority, AuthorityInfo.create(
"xdstp://" + targetAuthority + "/envoy.config.listener.v3.Listener/%s?foo=1&bar=2",
ImmutableList.of(ServerInfo.create(
"td.googleapis.com", InsecureChannelCredentials.create(), true, true)))))
"td.googleapis.com", InsecureChannelCredentials.create(), true, true, false)))))
.build();
expectedLdsResourceName = "xdstp://xds.authority.com/envoy.config.listener.v3.Listener/"
+ "%5B::FFFF:129.144.52.38%5D:80?bar=2&foo=1"; // query param canonified
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ public static Bootstrapper.BootstrapInfo buildBootStrap(List<String> serverUris)

List<ServerInfo> serverInfos = new ArrayList<>();
for (String uri : serverUris) {
serverInfos.add(ServerInfo.create(uri, CHANNEL_CREDENTIALS, false, true));
serverInfos.add(ServerInfo.create(uri, CHANNEL_CREDENTIALS, false, true, false));
}
EnvoyProtoData.Node node = EnvoyProtoData.Node.newBuilder().setId("node-id").build();

Expand Down