Skip to content

Commit ae1fb94

Browse files
authored
core: use exponential backoff for name resolution (#4105)
1 parent d45e1ab commit ae1fb94

File tree

6 files changed

+393
-291
lines changed

6 files changed

+393
-291
lines changed

core/src/main/java/io/grpc/NameResolver.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030
* <p>The addresses and attributes of a target may be changed over time, thus the caller registers a
3131
* {@link Listener} to receive continuous updates.
3232
*
33+
* <p>A {@code NameResolver} does not need to automatically re-resolve on failure. Instead, the
34+
* {@link Listener} is responsible for eventually (after an appropriate backoff period) invoking
35+
* {@link #refresh()}.
36+
*
3337
* @since 1.0.0
3438
*/
3539
@ExperimentalApi("https://github.com/grpc/grpc-java/issues/1770")
@@ -136,7 +140,8 @@ public interface Listener {
136140
void onAddresses(List<EquivalentAddressGroup> servers, Attributes attributes);
137141

138142
/**
139-
* Handles an error from the resolver.
143+
* Handles an error from the resolver. The listener is responsible for eventually invoking
144+
* {@link #refresh()} to re-attempt resolution.
140145
*
141146
* @param error a non-OK status
142147
* @since 1.0.0

core/src/main/java/io/grpc/internal/DnsNameResolver.java

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,6 @@
3636
import java.util.Collections;
3737
import java.util.List;
3838
import java.util.concurrent.ExecutorService;
39-
import java.util.concurrent.ScheduledExecutorService;
40-
import java.util.concurrent.ScheduledFuture;
41-
import java.util.concurrent.TimeUnit;
4239
import java.util.logging.Level;
4340
import java.util.logging.Logger;
4441
import java.util.regex.Pattern;
@@ -79,29 +76,22 @@ final class DnsNameResolver extends NameResolver {
7976
private final String authority;
8077
private final String host;
8178
private final int port;
82-
private final Resource<ScheduledExecutorService> timerServiceResource;
8379
private final Resource<ExecutorService> executorResource;
8480
private final ProxyDetector proxyDetector;
8581
@GuardedBy("this")
8682
private boolean shutdown;
8783
@GuardedBy("this")
88-
private ScheduledExecutorService timerService;
89-
@GuardedBy("this")
9084
private ExecutorService executor;
9185
@GuardedBy("this")
92-
private ScheduledFuture<?> resolutionTask;
93-
@GuardedBy("this")
9486
private boolean resolving;
9587
@GuardedBy("this")
9688
private Listener listener;
9789

9890
DnsNameResolver(@Nullable String nsAuthority, String name, Attributes params,
99-
Resource<ScheduledExecutorService> timerServiceResource,
10091
Resource<ExecutorService> executorResource,
10192
ProxyDetector proxyDetector) {
10293
// TODO: if a DNS server is provided as nsAuthority, use it.
10394
// https://www.captechconsulting.com/blogs/accessing-the-dusty-corners-of-dns-with-java
104-
this.timerServiceResource = timerServiceResource;
10595
this.executorResource = executorResource;
10696
// Must prepend a "//" to the name when constructing a URI, otherwise it will be treated as an
10797
// opaque URI, thus the authority and host of the resulted URI would be null.
@@ -131,7 +121,6 @@ public final String getServiceAuthority() {
131121
@Override
132122
public final synchronized void start(Listener listener) {
133123
Preconditions.checkState(this.listener == null, "already started");
134-
timerService = SharedResourceHolder.get(timerServiceResource);
135124
executor = SharedResourceHolder.get(executorResource);
136125
this.listener = Preconditions.checkNotNull(listener, "listener");
137126
resolve();
@@ -148,11 +137,6 @@ public final synchronized void refresh() {
148137
public void run() {
149138
Listener savedListener;
150139
synchronized (DnsNameResolver.this) {
151-
// If this task is started by refresh(), there might already be a scheduled task.
152-
if (resolutionTask != null) {
153-
resolutionTask.cancel(false);
154-
resolutionTask = null;
155-
}
156140
if (shutdown) {
157141
return;
158142
}
@@ -171,16 +155,6 @@ public void run() {
171155
try {
172156
resolvedInetAddrs = delegateResolver.resolve(host);
173157
} catch (Exception e) {
174-
synchronized (DnsNameResolver.this) {
175-
if (shutdown) {
176-
return;
177-
}
178-
// Because timerService is the single-threaded GrpcUtil.TIMER_SERVICE in production,
179-
// we need to delegate the blocking work to the executor
180-
resolutionTask =
181-
timerService.schedule(new LogExceptionRunnable(resolutionRunnableOnExecutor),
182-
1, TimeUnit.MINUTES);
183-
}
184158
savedListener.onError(
185159
Status.UNAVAILABLE.withDescription("Unable to resolve host " + host).withCause(e));
186160
return;
@@ -207,17 +181,6 @@ public void run() {
207181
}
208182
};
209183

210-
private final Runnable resolutionRunnableOnExecutor = new Runnable() {
211-
@Override
212-
public void run() {
213-
synchronized (DnsNameResolver.this) {
214-
if (!shutdown) {
215-
executor.execute(resolutionRunnable);
216-
}
217-
}
218-
}
219-
};
220-
221184
@GuardedBy("this")
222185
private void resolve() {
223186
if (resolving || shutdown) {
@@ -232,12 +195,6 @@ public final synchronized void shutdown() {
232195
return;
233196
}
234197
shutdown = true;
235-
if (resolutionTask != null) {
236-
resolutionTask.cancel(false);
237-
}
238-
if (timerService != null) {
239-
timerService = SharedResourceHolder.release(timerServiceResource, timerService);
240-
}
241198
if (executor != null) {
242199
executor = SharedResourceHolder.release(executorResource, executor);
243200
}

core/src/main/java/io/grpc/internal/DnsNameResolverProvider.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,12 @@ public DnsNameResolver newNameResolver(URI targetUri, Attributes params) {
4747
Preconditions.checkArgument(targetPath.startsWith("/"),
4848
"the path component (%s) of the target (%s) must start with '/'", targetPath, targetUri);
4949
String name = targetPath.substring(1);
50-
return new DnsNameResolver(targetUri.getAuthority(), name, params, GrpcUtil.TIMER_SERVICE,
51-
GrpcUtil.SHARED_CHANNEL_EXECUTOR, GrpcUtil.getProxyDetector());
50+
return new DnsNameResolver(
51+
targetUri.getAuthority(),
52+
name,
53+
params,
54+
GrpcUtil.SHARED_CHANNEL_EXECUTOR,
55+
GrpcUtil.getProxyDetector());
5256
} else {
5357
return null;
5458
}

core/src/main/java/io/grpc/internal/ManagedChannelImpl.java

Lines changed: 100 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,7 @@ private void shutdownNameResolverAndLoadBalancer(boolean verifyActive) {
344344
checkState(lbHelper != null, "lbHelper is null");
345345
}
346346
if (nameResolver != null) {
347+
cancelNameResolverBackoff();
347348
nameResolver.shutdown();
348349
nameResolver = null;
349350
nameResolverStarted = false;
@@ -429,6 +430,46 @@ public void run() {
429430
idleTimeoutMillis, TimeUnit.MILLISECONDS);
430431
}
431432

433+
// Run from channelExecutor
434+
@VisibleForTesting
435+
class NameResolverRefresh implements Runnable {
436+
// Only mutated from channelExecutor
437+
boolean cancelled;
438+
439+
@Override
440+
public void run() {
441+
if (cancelled) {
442+
// Race detected: this task was scheduled on channelExecutor before
443+
// cancelNameResolverBackoff() could cancel the timer.
444+
return;
445+
}
446+
nameResolverRefreshFuture = null;
447+
nameResolverRefresh = null;
448+
if (nameResolver != null) {
449+
nameResolver.refresh();
450+
}
451+
}
452+
}
453+
454+
// Must be used from channelExecutor
455+
@Nullable private ScheduledFuture<?> nameResolverRefreshFuture;
456+
// Must be used from channelExecutor
457+
@Nullable private NameResolverRefresh nameResolverRefresh;
458+
// The policy to control backoff between name resolution attempts. Non-null when an attempt is
459+
// scheduled. Must be used from channelExecutor
460+
@Nullable private BackoffPolicy nameResolverBackoffPolicy;
461+
462+
// Must be run from channelExecutor
463+
private void cancelNameResolverBackoff() {
464+
if (nameResolverRefreshFuture != null) {
465+
nameResolverRefreshFuture.cancel(false);
466+
nameResolverRefresh.cancelled = true;
467+
nameResolverRefreshFuture = null;
468+
nameResolverRefresh = null;
469+
nameResolverBackoffPolicy = null;
470+
}
471+
}
472+
432473
private final ClientTransportProvider transportProvider = new ClientTransportProvider() {
433474
@Override
434475
public ClientTransport get(PickSubchannelArgs args) {
@@ -799,24 +840,28 @@ public void run() {
799840

800841
@Override
801842
public void resetConnectBackoff() {
802-
channelExecutor.executeLater(
803-
new Runnable() {
804-
@Override
805-
public void run() {
806-
if (shutdown.get()) {
807-
return;
808-
}
809-
if (nameResolverStarted) {
810-
nameResolver.refresh();
811-
}
812-
for (InternalSubchannel subchannel : subchannels) {
813-
subchannel.resetConnectBackoff();
814-
}
815-
for (OobChannel oobChannel : oobChannels) {
816-
oobChannel.resetConnectBackoff();
817-
}
818-
}
819-
}).drain();
843+
channelExecutor
844+
.executeLater(
845+
new Runnable() {
846+
@Override
847+
public void run() {
848+
if (shutdown.get()) {
849+
return;
850+
}
851+
if (nameResolverRefreshFuture != null) {
852+
checkState(nameResolverStarted, "name resolver must be started");
853+
cancelNameResolverBackoff();
854+
nameResolver.refresh();
855+
}
856+
for (InternalSubchannel subchannel : subchannels) {
857+
subchannel.resetConnectBackoff();
858+
}
859+
for (OobChannel oobChannel : oobChannels) {
860+
oobChannel.resetConnectBackoff();
861+
}
862+
}
863+
})
864+
.drain();
820865
}
821866

822867
@Override
@@ -1132,6 +1177,8 @@ public void run() {
11321177
return;
11331178
}
11341179

1180+
nameResolverBackoffPolicy = null;
1181+
11351182
try {
11361183
if (retryEnabled) {
11371184
retryPolicies = getRetryPolicies(config);
@@ -1156,16 +1203,41 @@ public void onError(final Status error) {
11561203
checkArgument(!error.isOk(), "the error status must not be OK");
11571204
logger.log(Level.WARNING, "[{0}] Failed to resolve name. status={1}",
11581205
new Object[] {getLogId(), error});
1159-
channelExecutor.executeLater(new Runnable() {
1160-
@Override
1161-
public void run() {
1162-
// Call LB only if it's not shutdown. If LB is shutdown, lbHelper won't match.
1163-
if (NameResolverListenerImpl.this.helper != ManagedChannelImpl.this.lbHelper) {
1164-
return;
1165-
}
1166-
balancer.handleNameResolutionError(error);
1167-
}
1168-
}).drain();
1206+
channelExecutor
1207+
.executeLater(
1208+
new Runnable() {
1209+
@Override
1210+
public void run() {
1211+
// Call LB only if it's not shutdown. If LB is shutdown, lbHelper won't match.
1212+
if (NameResolverListenerImpl.this.helper != ManagedChannelImpl.this.lbHelper) {
1213+
return;
1214+
}
1215+
balancer.handleNameResolutionError(error);
1216+
if (nameResolverRefreshFuture != null) {
1217+
// The name resolver may invoke onError multiple times, but we only want to
1218+
// schedule one backoff attempt
1219+
// TODO(ericgribkoff) Update contract of NameResolver.Listener or decide if we
1220+
// want to reset the backoff interval upon repeated onError() calls
1221+
return;
1222+
}
1223+
if (nameResolverBackoffPolicy == null) {
1224+
nameResolverBackoffPolicy = backoffPolicyProvider.get();
1225+
}
1226+
long delayNanos = nameResolverBackoffPolicy.nextBackoffNanos();
1227+
if (logger.isLoggable(Level.FINE)) {
1228+
logger.log(
1229+
Level.FINE,
1230+
"[{0}] Scheduling DNS resolution backoff for {1} ns",
1231+
new Object[] {logId, delayNanos});
1232+
}
1233+
nameResolverRefresh = new NameResolverRefresh();
1234+
nameResolverRefreshFuture =
1235+
transportFactory
1236+
.getScheduledExecutorService()
1237+
.schedule(nameResolverRefresh, delayNanos, TimeUnit.NANOSECONDS);
1238+
}
1239+
})
1240+
.drain();
11691241
}
11701242
}
11711243

0 commit comments

Comments
 (0)