Commit 017dc09

Add support for retries (grpc#1708)
Motivation: The `ClientRPCExecutor` currently ignores retry and hedging policies. This change adds support for retries.

Modifications:
- Add a retry executor and wire it up to the client RPC executor.
- Add a few missing state transitions to the broadcast sequence.

Result: RPCs can be retried under certain conditions.
1 parent b52f944 commit 017dc09

9 files changed: +955 -85 lines changed
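With these changes, an RPC configured with a retry policy is routed through the new `RetryExecutor` shown below, while RPCs without a retry or hedging policy continue to use the one-shot executor. As a rough sketch of the kind of policy the executor consumes: `maximumAttempts` and `retryableStatusCodes` are referenced by the code in this commit, but the initializer and the backoff-related parameter names below are assumptions drawn from the gRPC retry design, not from this diff.

// Illustrative sketch only. `maximumAttempts` and `retryableStatusCodes` appear
// in the RetryExecutor below; the initializer shape and the backoff parameter
// names are assumed, not the library's confirmed API.
let policy = RetryPolicy(
  maximumAttempts: 3,
  initialBackoff: .milliseconds(100),  // assumed parameter name
  maximumBackoff: .seconds(1),         // assumed parameter name
  backoffMultiplier: 1.6,              // assumed parameter name
  retryableStatusCodes: [.unavailable]
)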

Sources/GRPCCore/Call/Client/Internal/ClientRPCExecutor+OneShotExecutor.swift

Lines changed: 4 additions & 7 deletions
@@ -80,7 +80,8 @@ extension ClientRPCExecutor.OneShotExecutor {
 
       let streamExecutor = ClientStreamExecutor(transport: self.transport)
       group.addTask {
-        return .streamExecutorCompleted(await streamExecutor.run())
+        await streamExecutor.run()
+        return .streamExecutorCompleted
       }
 
       group.addTask {
@@ -103,14 +104,10 @@ extension ClientRPCExecutor.OneShotExecutor {
 
       while let result = await group.next() {
         switch result {
-        case .streamExecutorCompleted(.success):
+        case .streamExecutorCompleted:
           // Stream finished; wait for the response to be handled.
           ()
 
-        case .streamExecutorCompleted(.failure):
-          // Stream execution threw: cancel and wait.
-          group.cancelAll()
-
         case .timedOut(.success):
           // The deadline passed; cancel the ongoing work group.
           group.cancelAll()
@@ -137,7 +134,7 @@ extension ClientRPCExecutor.OneShotExecutor {
 
   @usableFromInline
   enum _OneShotExecutorTask<R> {
-    case streamExecutorCompleted(Result<Void, RPCError>)
+    case streamExecutorCompleted
     case timedOut(Result<Void, Error>)
     case responseHandled(Result<R, Error>)
   }
Lines changed: 306 additions & 0 deletions
@@ -0,0 +1,306 @@
/*
 * Copyright 2023, gRPC Authors All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

@available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *)
extension ClientRPCExecutor {
  @usableFromInline
  struct RetryExecutor<
    Transport: ClientTransport,
    Serializer: MessageSerializer,
    Deserializer: MessageDeserializer
  > {
    @usableFromInline
    typealias Input = Serializer.Message
    @usableFromInline
    typealias Output = Deserializer.Message

    @usableFromInline
    let transport: Transport
    @usableFromInline
    let policy: RetryPolicy
    @usableFromInline
    let timeout: Duration?
    @usableFromInline
    let interceptors: [any ClientInterceptor]
    @usableFromInline
    let serializer: Serializer
    @usableFromInline
    let deserializer: Deserializer
    @usableFromInline
    let bufferSize: Int

    @inlinable
    init(
      transport: Transport,
      policy: RetryPolicy,
      timeout: Duration?,
      interceptors: [any ClientInterceptor],
      serializer: Serializer,
      deserializer: Deserializer,
      bufferSize: Int
    ) {
      self.transport = transport
      self.policy = policy
      self.timeout = timeout
      self.interceptors = interceptors
      self.serializer = serializer
      self.deserializer = deserializer
      self.bufferSize = bufferSize
    }
  }
}

@available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *)
extension ClientRPCExecutor.RetryExecutor {
  @inlinable
  func execute<R: Sendable>(
    request: ClientRequest.Stream<Input>,
    method: MethodDescriptor,
    responseHandler: @Sendable @escaping (ClientResponse.Stream<Output>) async throws -> R
  ) async throws -> R {
    // There's quite a lot going on here...
    //
    // The high level approach is to have two levels of task group. In the outer level tasks are
    // run to:
    // - run a timeout task (if necessary),
    // - run the request producer so that it writes into a broadcast sequence (in this instance we
    //   don't care about broadcasting but the sequence's ability to replay)
    // - run the inner task group.
    //
    // An inner task group is run for each RPC attempt. We might also pause between attempts. The
    // inner group runs two tasks:
    // - a stream executor, and
    // - the unsafe RPC executor which inspects the response, either passing it to the handler or
    //   deciding a retry should be undertaken.
    //
    // It is also worth noting that the server can override the retry delay using "pushback" and
    // retries may be skipped if the throttle is applied.
    let result = await withTaskGroup(
      of: _RetryExecutorTask<R>.self,
      returning: Result<R, Error>.self
    ) { group in
      // Add a task to limit the overall execution time of the RPC.
      if let timeout = self.timeout {
        group.addTask {
          let result = await Result {
            try await Task.sleep(until: .now.advanced(by: timeout), clock: .continuous)
          }
          return .timedOut(result)
        }
      }

      // Play the original request into the broadcast sequence and construct a replayable request.
      let retry = BroadcastAsyncSequence<Input>.makeStream(bufferSize: self.bufferSize)
      group.addTask {
        let result = await Result {
          try await request.producer(RPCWriter(wrapping: retry.continuation))
        }
        retry.continuation.finish(with: result)
        return .outboundFinished(result)
      }

      // The sequence isn't limited by the number of attempts as the iterator is reset when the
      // server applies pushback.
      let delaySequence = RetryDelaySequence(policy: self.policy)
      var delayIterator = delaySequence.makeIterator()

      for attempt in 1 ... self.policy.maximumAttempts {
        group.addTask {
          await withTaskGroup(
            of: _RetryExecutorSubTask<R>.self,
            returning: _RetryExecutorTask<R>.self
          ) { thisAttemptGroup in
            let streamExecutor = ClientStreamExecutor(transport: self.transport)
            thisAttemptGroup.addTask {
              await streamExecutor.run()
              return .streamProcessed
            }

            thisAttemptGroup.addTask {
              let response = await ClientRPCExecutor.unsafeExecute(
                request: ClientRequest.Stream(metadata: request.metadata) {
                  try await $0.write(contentsOf: retry.stream)
                },
                method: method,
                attempt: attempt,
                serializer: self.serializer,
                deserializer: self.deserializer,
                interceptors: self.interceptors,
                streamProcessor: streamExecutor
              )

              let shouldRetry: Bool
              let retryDelayOverride: Duration?

              switch response.accepted {
              case .success:
                // Request was accepted. This counts as success to the throttle and there's no need
                // to retry.
                self.transport.retryThrottle.recordSuccess()
                retryDelayOverride = nil
                shouldRetry = false

              case .failure(let error):
                // The request was rejected. Determine whether a retry should be carried out. The
                // following conditions must be checked:
                //
                // - Whether the status code is retryable.
                // - Whether more attempts are permitted by the config.
                // - Whether the throttle permits another retry to be carried out.
                // - Whether the server pushed back to either stop further retries or to override
                //   the delay before the next retry.
                let code = Status.Code(error.code)
                let isRetryableStatusCode = self.policy.retryableStatusCodes.contains(code)

                if isRetryableStatusCode {
                  // Counted as failure for throttling.
                  let throttled = self.transport.retryThrottle.recordFailure()

                  // Status code can be retried, Did the server send pushback?
                  switch error.metadata.retryPushback {
                  case .retryAfter(let delay):
                    // Pushback: only retry if our config permits it.
                    shouldRetry = (attempt < self.policy.maximumAttempts) && !throttled
                    retryDelayOverride = delay
                  case .stopRetrying:
                    // Server told us to stop trying.
                    shouldRetry = false
                    retryDelayOverride = nil
                  case .none:
                    // No pushback: only retry if our config permits it.
                    shouldRetry = (attempt < self.policy.maximumAttempts) && !throttled
                    retryDelayOverride = nil
                    break
                  }
                } else {
                  // Not-retryable; this is considered a success.
                  self.transport.retryThrottle.recordSuccess()
                  shouldRetry = false
                  retryDelayOverride = nil
                }
              }

              if shouldRetry {
                // Cancel subscribers of the broadcast sequence. This is safe as we are the only
                // subscriber and maximises the chances that 'isKnownSafeForNextSubscriber' will
                // return true.
                //
                // Note: this must only be called if we should retry, otherwise we may cancel a
                // subscriber for an accepted request.
                retry.stream.invalidateAllSubscriptions()

                // Only retry if we know it's safe for the next subscriber, that is, the first
                // element is still in the buffer. It's safe to call this because there's only
                // ever one attempt at a time and the existing subscribers have been invalidated.
                if retry.stream.isKnownSafeForNextSubscriber {
                  return .retry(retryDelayOverride)
                }
              }

              // Not retrying or not safe to retry.
              let result = await Result {
                // Check for cancellation; the RPC may have timed out in which case we should skip
                // the response handler.
                try Task.checkCancellation()
                return try await responseHandler(response)
              }
              return .handledResponse(result)
            }

            while let result = await thisAttemptGroup.next() {
              switch result {
              case .streamProcessed:
                ()  // Continue processing; wait for the response to be handled.

              case .retry(let delayOverride):
                thisAttemptGroup.cancelAll()
                return .retry(delayOverride)

              case .handledResponse(let result):
                thisAttemptGroup.cancelAll()
                return .handledResponse(result)
              }
            }

            fatalError("Internal inconsistency")
          }
        }

        loop: while let next = await group.next() {
          switch next {
          case .handledResponse(let result):
            // A usable response; cancel the remaining work and return the result.
            group.cancelAll()
            return result

          case .retry(let delayOverride):
            // The attempt failed, wait a bit and then retry. The server might have overridden the
            // delay via pushback so preferentially use that value.
            //
            // Any error will come from cancellation: if it happens while we're sleeping we can
            // just loop around, the next attempt will be cancelled immediately and we will return
            // its response to the client.
            if let delayOverride = delayOverride {
              // If the delay is overridden with server pushback then reset the iterator for the
              // next retry.
              delayIterator = delaySequence.makeIterator()
              try? await Task.sleep(until: .now.advanced(by: delayOverride), clock: .continuous)
            } else {
              // The delay iterator never terminates.
              try? await Task.sleep(
                until: .now.advanced(by: delayIterator.next()!),
                clock: .continuous
              )
            }

            break loop  // from the while loop so another attempt can be started.

          case .timedOut(.success), .outboundFinished(.failure):
            // Timeout task fired successfully or failed to process the outbound stream. Cancel and
            // wait for a usable response (which is likely to be an error).
            group.cancelAll()

          case .timedOut(.failure), .outboundFinished(.success):
            // Timeout task failed which means it was cancelled (so no need to cancel again) or the
            // outbound stream was successfully processed (so don't need to do anything).
            ()
          }
        }
      }

      fatalError("Internal inconsistency")
    }

    return try result.get()
  }
}

@available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *)
@usableFromInline
enum _RetryExecutorTask<R> {
  case timedOut(Result<Void, Error>)
  case handledResponse(Result<R, Error>)
  case retry(Duration?)
  case outboundFinished(Result<Void, Error>)
}

@available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *)
@usableFromInline
enum _RetryExecutorSubTask<R> {
  case streamProcessed
  case handledResponse(Result<R, Error>)
  case retry(Duration?)
}
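The delays between attempts come from `RetryDelaySequence(policy:)`, whose implementation is not part of this diff. Per the gRPC retry design, the n-th retry waits a random amount of time in [0, min(initialBackoff × backoffMultiplier^(n−1), maximumBackoff)]; resetting the iterator after server pushback restarts that progression. A minimal sketch of that calculation follows; the backoff property names are assumptions, not names confirmed by this commit.

import Foundation

// Illustrative sketch only: `RetryDelaySequence` is referenced by the executor above
// but not shown in this diff. The backoff property names are assumed.
struct JitteredBackoff {
  var initialBackoff: Duration
  var maximumBackoff: Duration
  var backoffMultiplier: Double

  // Delay before the n-th retry (1-based), per the gRPC retry design:
  // random(0, min(initialBackoff * backoffMultiplier^(n - 1), maximumBackoff)).
  func delay(beforeRetry n: Int) -> Duration {
    let initial = Self.seconds(self.initialBackoff)
    let cap = Self.seconds(self.maximumBackoff)
    let unjittered = Swift.min(initial * pow(self.backoffMultiplier, Double(n - 1)), cap)
    return .seconds(Double.random(in: 0 ... unjittered))
  }

  private static func seconds(_ duration: Duration) -> Double {
    let parts = duration.components
    return Double(parts.seconds) + Double(parts.attoseconds) / 1e18
  }
}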

Sources/GRPCCore/Call/Client/Internal/ClientRPCExecutor.swift

Lines changed: 18 additions & 1 deletion
@@ -58,7 +58,24 @@ enum ClientRPCExecutor {
         responseHandler: handler
       )
 
-    case .retry, .hedge:
+    case .retry(let policy):
+      let retryExecutor = RetryExecutor(
+        transport: transport,
+        policy: policy,
+        timeout: configuration.timeout,
+        interceptors: interceptors,
+        serializer: serializer,
+        deserializer: deserializer,
+        bufferSize: 64  // TODO: the client should have some control over this.
+      )
+
+      return try await retryExecutor.execute(
+        request: request,
+        method: method,
+        responseHandler: handler
+      )
+
+    case .hedge:
       fatalError()
     }
   }
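The `.retryAfter`/`.stopRetrying` pushback values consumed by the retry executor come from response metadata. In the gRPC retry design the server communicates pushback through the `grpc-retry-pushback-ms` trailer: a non-negative integer overrides the delay before the next attempt, and anything else tells the client to stop retrying. Below is a hypothetical decoding helper to illustrate those semantics; the names are illustrative and the real `retryPushback` accessor is not shown in this commit.

// Hypothetical sketch: names below are illustrative, not the library's API.
enum RetryPushback {
  case retryAfter(Duration)
  case stopRetrying
}

// Decodes the `grpc-retry-pushback-ms` trailer per the gRPC retry design:
// a non-negative integer overrides the delay before the next attempt, any
// other value (negative or non-numeric) means retries should stop.
func parsePushback(_ rawValue: String?) -> RetryPushback? {
  guard let rawValue = rawValue else { return nil }  // No pushback sent.
  guard let millis = Int64(rawValue), millis >= 0 else { return .stopRetrying }
  return .retryAfter(.milliseconds(millis))
}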
