Skip to content

Commit

Permalink
fix(panic): Stop panicking on async task cancellation on shutdown in …
Browse files Browse the repository at this point in the history
…network and state futures (#7219)

* Add an async-error feature and an initial module structure

* Implement checking for panics in OS threads and async tasks

* Implement waiting for panics in OS threads and async tasks

* Add a TODO to simplify some state request error handling

* Use the new panic-checking methods in zebra-state

* Use new panic-checking methods in zebra-network

* fixup! Implement waiting for panics in OS threads and async tasks

* Replace existing async code with generic panic-checking methods

* Simplify trait to a single method

* Move thread panic code into generic trait impls

* Simplify option handling

Co-authored-by: Arya <aryasolhi@gmail.com>

* Fix comment

Co-authored-by: Arya <aryasolhi@gmail.com>

* Add missing track_caller

---------

Co-authored-by: Arya <aryasolhi@gmail.com>
  • Loading branch information
teor2345 and arya2 authored Jul 18, 2023
1 parent c885de4 commit 3bbe3ce
Show file tree
Hide file tree
Showing 12 changed files with 367 additions and 232 deletions.
12 changes: 9 additions & 3 deletions zebra-chain/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ json-conversion = [
"serde_json",
]

# Async error handling convenience traits
async-error = [
"tokio",
]

# Experimental mining RPC support
getblocktemplate-rpcs = [
"zcash_address",
Expand All @@ -39,7 +44,7 @@ proptest-impl = [
"proptest-derive",
"rand",
"rand_chacha",
"tokio",
"tokio/tracing",
"zebra-test",
]

Expand Down Expand Up @@ -108,6 +113,9 @@ reddsa = "0.5.0"
# Production feature json-conversion
serde_json = { version = "1.0.100", optional = true }

# Production feature async-error and testing feature proptest-impl
tokio = { version = "1.29.1", optional = true }

# Experimental feature getblocktemplate-rpcs
zcash_address = { version = "0.3.0", optional = true }

Expand All @@ -118,8 +126,6 @@ proptest-derive = { version = "0.3.0", optional = true }
rand = { version = "0.8.5", optional = true }
rand_chacha = { version = "0.3.1", optional = true }

tokio = { version = "1.29.1", features = ["tracing"], optional = true }

zebra-test = { path = "../zebra-test/", version = "1.0.0-beta.27", optional = true }

[dev-dependencies]
Expand Down
15 changes: 12 additions & 3 deletions zebra-chain/src/diagnostic.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
//! Tracing the execution time of functions.
//!
//! TODO: also trace polling time for futures, using a `Future` wrapper
//! Diagnostic types and functions for Zebra:
//! - code performance
//! - task handling
//! - errors and panics

pub mod task;

// Tracing the execution time of functions.
//
// TODO:
// - move this to a `timing` submodule
// - also trace polling time for futures, using a `Future` wrapper

use std::time::{Duration, Instant};

Expand Down
47 changes: 47 additions & 0 deletions zebra-chain/src/diagnostic/task.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
//! Diagnostic types and functions for Zebra tasks:
//! - OS thread handling
//! - async future task handling
//! - errors and panics

#[cfg(feature = "async-error")]
pub mod future;

pub mod thread;

/// A trait that checks a task's return value for panics.
pub trait CheckForPanics {
/// The output type, after removing panics from `Self`.
type Output;

/// Check if `self` contains a panic payload or an unexpected termination, then panic.
/// Otherwise, return the non-panic part of `self`.
///
/// # Panics
///
/// If `self` contains a panic payload or an unexpected termination.
#[track_caller]
fn check_for_panics(self) -> Self::Output;
}

/// A trait that waits for a task to finish, then handles panics and cancellations.
pub trait WaitForPanics {
/// The underlying task output, after removing panics and unwrapping termination results.
type Output;

/// Waits for `self` to finish, then check if its output is:
/// - a panic payload: resume that panic,
/// - an unexpected termination: panic with that error,
/// - an expected termination: hang waiting for shutdown.
///
/// Otherwise, returns the task return value of `self`.
///
/// # Panics
///
/// If `self` contains a panic payload or an unexpected termination.
///
/// # Hangs
///
/// If `self` contains an expected termination, and we're shutting down anyway.
#[track_caller]
fn wait_for_panics(self) -> Self::Output;
}
93 changes: 93 additions & 0 deletions zebra-chain/src/diagnostic/task/future.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
//! Diagnostic types and functions for Zebra async future tasks:
//! - task handles
//! - errors and panics

use std::{future, panic};

use futures::future::{BoxFuture, FutureExt};
use tokio::task::{JoinError, JoinHandle};

use crate::shutdown::is_shutting_down;

use super::{CheckForPanics, WaitForPanics};

/// This is the return type of the [`JoinHandle`] future.
impl<T> CheckForPanics for Result<T, JoinError> {
/// The [`JoinHandle`]'s task output, after resuming any panics,
/// and ignoring task cancellations on shutdown.
type Output = Result<T, JoinError>;

/// Returns the task result if the task finished normally.
/// Otherwise, resumes any panics, logs unexpected errors, and ignores any expected errors.
///
/// If the task finished normally, returns `Some(T)`.
/// If the task was cancelled, returns `None`.
#[track_caller]
fn check_for_panics(self) -> Self::Output {
match self {
Ok(task_output) => Ok(task_output),
Err(join_error) => Err(join_error.check_for_panics()),
}
}
}

impl CheckForPanics for JoinError {
/// The [`JoinError`] after resuming any panics, and logging any unexpected task cancellations.
type Output = JoinError;

/// Resume any panics and panic on unexpected task cancellations.
/// Always returns [`JoinError::Cancelled`](JoinError::is_cancelled).
#[track_caller]
fn check_for_panics(self) -> Self::Output {
match self.try_into_panic() {
Ok(panic_payload) => panic::resume_unwind(panic_payload),

// We could ignore this error, but then we'd have to change the return type.
Err(task_cancelled) if is_shutting_down() => {
debug!(
?task_cancelled,
"ignoring cancelled task because Zebra is shutting down"
);

task_cancelled
}

Err(task_cancelled) => {
panic!("task cancelled during normal Zebra operation: {task_cancelled:?}");
}
}
}
}

impl<T> WaitForPanics for JoinHandle<T>
where
T: Send + 'static,
{
type Output = BoxFuture<'static, T>;

/// Returns a future which waits for `self` to finish, then checks if its output is:
/// - a panic payload: resume that panic,
/// - an unexpected termination: panic with that error,
/// - an expected termination: hang waiting for shutdown.
///
/// Otherwise, returns the task return value of `self`.
///
/// # Panics
///
/// If `self` contains a panic payload, or [`JoinHandle::abort()`] has been called on `self`.
///
/// # Hangs
///
/// If `self` contains an expected termination, and we're shutting down anyway.
/// Futures hang by returning `Pending` and not setting a waker, so this uses minimal resources.
#[track_caller]
fn wait_for_panics(self) -> Self::Output {
async move {
match self.await.check_for_panics() {
Ok(task_output) => task_output,
Err(_expected_cancel_error) => future::pending().await,
}
}
.boxed()
}
}
108 changes: 108 additions & 0 deletions zebra-chain/src/diagnostic/task/thread.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
//! Diagnostic types and functions for Zebra OS thread tasks:
//! - task handles
//! - errors and panics

use std::{
panic,
sync::Arc,
thread::{self, JoinHandle},
};

use super::{CheckForPanics, WaitForPanics};

impl<T> CheckForPanics for thread::Result<T> {
type Output = T;

/// Panics if the thread panicked.
///
/// Threads can't be cancelled except by using a panic, so there are no thread errors here.
#[track_caller]
fn check_for_panics(self) -> Self::Output {
match self {
// The value returned by the thread when it finished.
Ok(thread_output) => thread_output,

// A thread error is always a panic.
Err(panic_payload) => panic::resume_unwind(panic_payload),
}
}
}

impl<T> WaitForPanics for JoinHandle<T> {
type Output = T;

/// Waits for the thread to finish, then panics if the thread panicked.
#[track_caller]
fn wait_for_panics(self) -> Self::Output {
self.join().check_for_panics()
}
}

impl<T> WaitForPanics for Arc<JoinHandle<T>> {
type Output = Option<T>;

/// If this is the final `Arc`, waits for the thread to finish, then panics if the thread
/// panicked. Otherwise, returns the thread's return value.
///
/// If this is not the final `Arc`, drops the handle and immediately returns `None`.
#[track_caller]
fn wait_for_panics(self) -> Self::Output {
// If we are the last Arc with a reference to this handle,
// we can wait for it and propagate any panics.
//
// We use into_inner() because it guarantees that exactly one of the tasks gets the
// JoinHandle. try_unwrap() lets us keep the JoinHandle, but it can also miss panics.
//
// This is more readable as an expanded statement.
#[allow(clippy::manual_map)]
if let Some(handle) = Arc::into_inner(self) {
Some(handle.wait_for_panics())
} else {
None
}
}
}

impl<T> CheckForPanics for &mut Option<Arc<JoinHandle<T>>> {
type Output = Option<T>;

/// If this is the final `Arc`, checks if the thread has finished, then panics if the thread
/// panicked. Otherwise, returns the thread's return value.
///
/// If the thread has not finished, or this is not the final `Arc`, returns `None`.
#[track_caller]
fn check_for_panics(self) -> Self::Output {
let handle = self.take()?;

if handle.is_finished() {
// This is the same as calling `self.wait_for_panics()`, but we can't do that,
// because we've taken `self`.
#[allow(clippy::manual_map)]
return handle.wait_for_panics();
}

*self = Some(handle);

None
}
}

impl<T> WaitForPanics for &mut Option<Arc<JoinHandle<T>>> {
type Output = Option<T>;

/// If this is the final `Arc`, waits for the thread to finish, then panics if the thread
/// panicked. Otherwise, returns the thread's return value.
///
/// If this is not the final `Arc`, drops the handle and returns `None`.
#[track_caller]
fn wait_for_panics(self) -> Self::Output {
// This is more readable as an expanded statement.
#[allow(clippy::manual_map)]
if let Some(output) = self.take()?.wait_for_panics() {
Some(output)
} else {
// Some other task has a reference, so we should give up ours to let them use it.
None
}
}
}
2 changes: 1 addition & 1 deletion zebra-network/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ howudoin = { version = "0.1.2", optional = true }
proptest = { version = "1.2.0", optional = true }
proptest-derive = { version = "0.3.0", optional = true }

zebra-chain = { path = "../zebra-chain", version = "1.0.0-beta.27" }
zebra-chain = { path = "../zebra-chain", version = "1.0.0-beta.27", features = ["async-error"] }

[dev-dependencies]
proptest = "1.2.0"
Expand Down
8 changes: 4 additions & 4 deletions zebra-network/src/peer_set/candidate_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use tokio::time::{sleep_until, timeout, Instant};
use tower::{Service, ServiceExt};
use tracing::Span;

use zebra_chain::serialization::DateTime32;
use zebra_chain::{diagnostic::task::WaitForPanics, serialization::DateTime32};

use crate::{
constants, meta_addr::MetaAddrChange, peer_set::set::MorePeers, types::MetaAddr, AddressBook,
Expand Down Expand Up @@ -348,8 +348,8 @@ where
tokio::task::spawn_blocking(move || {
span.in_scope(|| address_book.lock().unwrap().extend(addrs))
})
.wait_for_panics()
.await
.expect("panic in new peers address book update task");
}

/// Returns the next candidate for a connection attempt, if any are available.
Expand Down Expand Up @@ -403,8 +403,8 @@ where
// Correctness: Spawn address book accesses on a blocking thread, to avoid deadlocks (see #1976).
let span = Span::current();
let next_peer = tokio::task::spawn_blocking(move || span.in_scope(next_peer))
.await
.expect("panic in next peer address book task")?;
.wait_for_panics()
.await?;

// Security: rate-limit new outbound peer connections
sleep_until(self.min_next_handshake).await;
Expand Down
Loading

0 comments on commit 3bbe3ce

Please sign in to comment.