Skip to content

Commit 8fc2e5d

Browse files
committed
Add a configurable escalation behaviour
# Motivation The service group is often running multiple services and orchestrates shutdown for them. We have seen that sometimes some services never shut down nor do they respond to task cancellation properly. This can become quite problematic when the whole application is waiting for a service to cancel and otherwise appears to be healthy but in reality can't serve any traffic. # Modification This PR adds a new configuration to escalate both graceful shutdown and cancellation. The escalation order is graceful shutdown -> task cancellation -> `fatalError`. The `fatalError` acts as a last resort to make sure applications are never stuck.
1 parent b71a961 commit 8fc2e5d

File tree

3 files changed

+260
-15
lines changed

3 files changed

+260
-15
lines changed

Sources/ServiceLifecycle/ServiceGroup.swift

Lines changed: 94 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ public actor ServiceGroup: Sendable {
3333
private let logger: Logger
3434
/// The logging configuration.
3535
private let loggingConfiguration: ServiceGroupConfiguration.LoggingConfiguration
36+
/// The escalation configuration.
37+
private let escalationConfiguration: ServiceGroupConfiguration.EscalationBehaviour
3638
/// The signals that lead to graceful shutdown.
3739
private let gracefulShutdownSignals: [UnixSignal]
3840
/// The signals that lead to cancellation.
@@ -57,6 +59,7 @@ public actor ServiceGroup: Sendable {
5759
self.cancellationSignals = configuration.cancellationSignals
5860
self.logger = configuration.logger
5961
self.loggingConfiguration = configuration.logging
62+
self.escalationConfiguration = configuration.escalation
6063
}
6164

6265
/// Initializes a new ``ServiceGroup``.
@@ -94,6 +97,7 @@ public actor ServiceGroup: Sendable {
9497
self.cancellationSignals = configuration.cancellationSignals
9598
self.logger = logger
9699
self.loggingConfiguration = configuration.logging
100+
self.escalationConfiguration = configuration.escalation
97101
}
98102

99103
/// Runs all the services by spinning up a child task per service.
@@ -176,6 +180,8 @@ public actor ServiceGroup: Sendable {
176180
case signalSequenceFinished
177181
case gracefulShutdownCaught
178182
case gracefulShutdownFinished
183+
case gracefulShutdownTimedOut
184+
case cancellationCaught
179185
}
180186

181187
private func _run(
@@ -191,6 +197,10 @@ public actor ServiceGroup: Sendable {
191197
]
192198
)
193199

200+
// A task that is spawned when we got cancelled or
201+
// we cancel the task group to keep track of a timeout.
202+
var cancellationTimeoutTask: Task<Void, Never>?
203+
194204
// Using a result here since we want a task group that has non-throwing child tasks
195205
// but the body itself is throwing
196206
let result = try await withThrowingTaskGroup(of: ChildTaskResult.self, returning: Result<Void, Error>.self) { group in
@@ -267,6 +277,13 @@ public actor ServiceGroup: Sendable {
267277
}
268278
}
269279

280+
group.addTask {
281+
// This child task is waiting forever until the group gets cancelled.
282+
let (stream, _) = AsyncStream.makeStream(of: Void.self)
283+
await stream.first { _ in true }
284+
return .cancellationCaught
285+
}
286+
270287
// We are storing the services in an optional array now. When a slot in the array is
271288
// empty it indicates that the service has been shutdown.
272289
var services = services.map { Optional($0) }
@@ -293,7 +310,7 @@ public actor ServiceGroup: Sendable {
293310
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
294311
]
295312
)
296-
group.cancelAll()
313+
cancellationTimeoutTask = self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group)
297314
return .failure(ServiceGroupError.serviceFinishedUnexpectedly())
298315

299316
case .gracefullyShutdownGroup:
@@ -307,6 +324,7 @@ public actor ServiceGroup: Sendable {
307324
do {
308325
try await self.shutdownGracefully(
309326
services: services,
327+
cancellationTimeoutTask: &cancellationTimeoutTask,
310328
group: &group,
311329
gracefulShutdownManagers: gracefulShutdownManagers
312330
)
@@ -327,7 +345,7 @@ public actor ServiceGroup: Sendable {
327345
self.logger.debug(
328346
"All services finished."
329347
)
330-
group.cancelAll()
348+
cancellationTimeoutTask = self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group)
331349
return .success(())
332350
}
333351
}
@@ -342,7 +360,7 @@ public actor ServiceGroup: Sendable {
342360
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
343361
]
344362
)
345-
group.cancelAll()
363+
cancellationTimeoutTask = self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group)
346364
return .failure(serviceError)
347365

348366
case .gracefullyShutdownGroup:
@@ -358,6 +376,7 @@ public actor ServiceGroup: Sendable {
358376
do {
359377
try await self.shutdownGracefully(
360378
services: services,
379+
cancellationTimeoutTask: &cancellationTimeoutTask,
361380
group: &group,
362381
gracefulShutdownManagers: gracefulShutdownManagers
363382
)
@@ -381,7 +400,7 @@ public actor ServiceGroup: Sendable {
381400
"All services finished."
382401
)
383402

384-
group.cancelAll()
403+
cancellationTimeoutTask = self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group)
385404
return .success(())
386405
}
387406
}
@@ -398,6 +417,7 @@ public actor ServiceGroup: Sendable {
398417
do {
399418
try await self.shutdownGracefully(
400419
services: services,
420+
cancellationTimeoutTask: &cancellationTimeoutTask,
401421
group: &group,
402422
gracefulShutdownManagers: gracefulShutdownManagers
403423
)
@@ -413,7 +433,7 @@ public actor ServiceGroup: Sendable {
413433
]
414434
)
415435

416-
group.cancelAll()
436+
cancellationTimeoutTask = self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group)
417437
}
418438

419439
case .gracefulShutdownCaught:
@@ -423,19 +443,29 @@ public actor ServiceGroup: Sendable {
423443
do {
424444
try await self.shutdownGracefully(
425445
services: services,
446+
cancellationTimeoutTask: &cancellationTimeoutTask,
426447
group: &group,
427448
gracefulShutdownManagers: gracefulShutdownManagers
428449
)
429450
} catch {
430451
return .failure(error)
431452
}
432453

454+
case .cancellationCaught:
455+
// We caught cancellation in our child task so we have to spawn
456+
// our cancellation timeout task if needed
457+
self.logger.debug("Caught cancellation.")
458+
cancellationTimeoutTask = self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group)
459+
433460
case .signalSequenceFinished, .gracefulShutdownFinished:
434461
// This can happen when we are either cancelling everything or
435462
// when the user did not specify any shutdown signals. We just have to tolerate
436463
// this.
437464
continue
438465

466+
case .gracefulShutdownTimedOut:
467+
fatalError("Received gracefulShutdownTimedOut but never triggered a graceful shutdown")
468+
439469
case nil:
440470
fatalError("Invalid result from group.next(). We checked if the group is empty before and still got nil")
441471
}
@@ -447,18 +477,28 @@ public actor ServiceGroup: Sendable {
447477
self.logger.debug(
448478
"Service lifecycle ended"
449479
)
480+
cancellationTimeoutTask?.cancel()
450481
try result.get()
451482
}
452483

453484
private func shutdownGracefully(
454485
services: [ServiceGroupConfiguration.ServiceConfiguration?],
486+
cancellationTimeoutTask: inout Task<Void, Never>?,
455487
group: inout ThrowingTaskGroup<ChildTaskResult, Error>,
456488
gracefulShutdownManagers: [GracefulShutdownManager]
457489
) async throws {
458490
guard case .running = self.state else {
459491
fatalError("Unexpected state")
460492
}
461493

494+
if #available(macOS 13.0, *), let maximumGracefulShutdownDuration = self.escalationConfiguration.maximumGracefulShutdownDuration {
495+
group.addTask {
496+
try await Task.sleep(for: maximumGracefulShutdownDuration)
497+
return .gracefulShutdownTimedOut
498+
}
499+
}
500+
501+
462502
// We are storing the first error of a service that threw here.
463503
var error: Error?
464504

@@ -509,7 +549,7 @@ public actor ServiceGroup: Sendable {
509549
]
510550
)
511551

512-
group.cancelAll()
552+
cancellationTimeoutTask = self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group)
513553
throw ServiceGroupError.serviceFinishedUnexpectedly()
514554
}
515555

@@ -561,9 +601,26 @@ public actor ServiceGroup: Sendable {
561601
]
562602
)
563603

564-
group.cancelAll()
604+
cancellationTimeoutTask = self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group)
565605
}
566606

607+
case .gracefulShutdownTimedOut:
608+
// Gracefully shutting down took longer than the user configured
609+
// so we have to escalate it now.
610+
self.logger.debug(
611+
"Graceful shutdown took longer than allowed by the configuration. Cancelling the group now.",
612+
metadata: [
613+
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
614+
]
615+
)
616+
cancellationTimeoutTask = self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group)
617+
618+
case .cancellationCaught:
619+
// We caught cancellation in our child task so we have to spawn
620+
// our cancellation timeout task if needed
621+
self.logger.debug("Caught cancellation.")
622+
cancellationTimeoutTask = self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group)
623+
567624
case .signalSequenceFinished, .gracefulShutdownCaught, .gracefulShutdownFinished:
568625
// We just have to tolerate this since signals and parent graceful shutdowns can race.
569626
continue
@@ -575,7 +632,9 @@ public actor ServiceGroup: Sendable {
575632

576633
// If we hit this then all services are shutdown. The only thing remaining
577634
// are the tasks that listen to the various graceful shutdown signals. We
578-
// just have to cancel those
635+
// just have to cancel those.
636+
// In this case we don't have to spawn our cancellation timeout task since
637+
// we are sure all other child tasks are handling cancellation appropriately.
579638
group.cancelAll()
580639

581640
// If we saw an error during graceful shutdown from a service that triggers graceful
@@ -584,6 +643,33 @@ public actor ServiceGroup: Sendable {
584643
throw error
585644
}
586645
}
646+
647+
/// Cancels every child task in `group` and, if a maximum cancellation duration
/// is configured, starts a watchdog task that enforces it.
///
/// - Parameter group: The task group whose child tasks are cancelled.
/// - Returns: The watchdog task, so the caller can cancel it once everything has
///   finished, or `nil` when no maximum cancellation duration is configured
///   (or the required OS version is unavailable).
private func cancelGroupAndSpawnTimeoutIfNeeded(
    group: inout ThrowingTaskGroup<ChildTaskResult, Error>
) -> Task<Void, Never>? {
    group.cancelAll()

    guard #available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *),
          let timeout = self.escalationConfiguration.maximumCancellationDuration
    else {
        return nil
    }

    // This has to be an unstructured task: the call to our `run` method might
    // already have been cancelled, and the sleep below must not inherit that
    // cancellation.
    let watchdog = Task<Void, Never> {
        self.logger.debug(
            "Task cancellation timeout task started."
        )

        do {
            try await Task.sleep(for: timeout)
        } catch {
            // The watchdog itself got cancelled, so our services must have finished up.
            return
        }

        self.logger.debug(
            "Cancellation took longer than allowed by the configuration."
        )
        fatalError("Cancellation took longer than allowed by the configuration.")
    }
    return watchdog
}
587673
}
588674

589675
// This should be removed once we support Swift 5.9+

Sources/ServiceLifecycle/ServiceGroupConfiguration.swift

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,65 @@ public struct ServiceGroupConfiguration: Sendable {
9696
}
9797
}
9898

99+
/// The group's escalation configuration.
100+
public struct EscalationBehaviour: Sendable {
    /// The maximum amount of time that graceful shutdown is allowed to take.
    ///
    /// After this time has elapsed graceful shutdown will be escalated to task cancellation.
    /// Assigning `nil` removes the limit and leaves ``maximumCancellationDuration`` untouched.
    @available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *)
    public var maximumGracefulShutdownDuration: Duration? {
        get {
            self._maximumGracefulShutdownDuration.map {
                Duration(
                    secondsComponent: $0.secondsComponent,
                    attosecondsComponent: $0.attosecondsComponent
                )
            }
        }
        set {
            // Map straight into this property's own backing storage so that a `nil`
            // assignment clears the graceful shutdown limit and nothing else.
            self._maximumGracefulShutdownDuration = newValue.map {
                (secondsComponent: $0.components.seconds, attosecondsComponent: $0.components.attoseconds)
            }
        }
    }

    /// The maximum amount of time that task cancellation is allowed to take.
    ///
    /// After this time has elapsed task cancellation will be escalated to a `fatalError`.
    ///
    /// - Important: This setting is useful to guarantee that your application will exit at some point and
    /// should be used to identify APIs that are not properly implementing task cancellation.
    @available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *)
    public var maximumCancellationDuration: Duration? {
        get {
            self._maximumCancellationDuration.map {
                Duration(
                    secondsComponent: $0.secondsComponent,
                    attosecondsComponent: $0.attosecondsComponent
                )
            }
        }
        set {
            self._maximumCancellationDuration = newValue.map {
                (secondsComponent: $0.components.seconds, attosecondsComponent: $0.components.attoseconds)
            }
        }
    }

    // The durations are stored as raw (seconds, attoseconds) components rather than as
    // `Duration` values; presumably because `Duration` is availability-gated and stored
    // properties cannot carry an `@available` annotation.
    private var _maximumGracefulShutdownDuration: (secondsComponent: Int64, attosecondsComponent: Int64)?
    private var _maximumCancellationDuration: (secondsComponent: Int64, attosecondsComponent: Int64)?

    /// Creates an escalation behaviour with no limits configured.
    public init() {}
}
157+
99158
/// The group's service configurations.
100159
public var services: [ServiceConfiguration]
101160

@@ -111,6 +170,9 @@ public struct ServiceGroupConfiguration: Sendable {
111170
/// The group's logging configuration.
112171
public var logging = LoggingConfiguration()
113172

173+
/// The group's escalation configuration.
174+
public var escalation = EscalationBehaviour()
175+
114176
/// Initializes a new ``ServiceGroupConfiguration``.
115177
///
116178
/// - Parameters:

0 commit comments

Comments
 (0)