From 85386d314a934b7eaa0bf2707f6d5af039e93340 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Thu, 27 Jun 2024 12:19:36 +0300 Subject: [PATCH] fix(object-store): Consider some token source errors transient (#2331) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What ❔ Considers some token source GCS errors transient. ## Why ❔ Considering errors as transient leads to less abnormal application terminations. ## Checklist - [x] PR title corresponds to the body of PR (we generate changelog entries from PRs). - [x] Documentation comments have been added / updated. - [x] Code has been formatted via `zk fmt` and `zk lint`. --- core/lib/object_store/src/gcs.rs | 36 +++++++++++++++++++------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/core/lib/object_store/src/gcs.rs b/core/lib/object_store/src/gcs.rs index 2d4fae77ab8..fd883a53f3e 100644 --- a/core/lib/object_store/src/gcs.rs +++ b/core/lib/object_store/src/gcs.rs @@ -107,21 +107,22 @@ fn is_transient_http_error(err: &reqwest::Error) -> bool { || err.status() == Some(StatusCode::SERVICE_UNAVAILABLE) } -fn has_transient_io_source(mut err: &(dyn StdError + 'static)) -> bool { +fn get_source<'a, T: StdError + 'static>(mut err: &'a (dyn StdError + 'static)) -> Option<&'a T> { loop { - if err.is::() { - // We treat any I/O errors as transient. This isn't always true, but frequently occurring I/O errors - // (e.g., "connection reset by peer") *are* transient, and treating an error as transient is a safer option, - // even if it can lead to unnecessary retries. - return true; + if let Some(err) = err.downcast_ref::() { + return Some(err); } - err = match err.source() { - Some(source) => source, - None => return false, - }; + err = err.source()?; } } +fn has_transient_io_source(err: &(dyn StdError + 'static)) -> bool { + // We treat any I/O errors as transient. This isn't always true, but frequently occurring I/O errors + // (e.g., "connection reset by peer") *are* transient, and treating an error as transient is a safer option, + // even if it can lead to unnecessary retries. + get_source::(err).is_some() +} + impl From for ObjectStoreError { fn from(err: HttpError) -> Self { let is_not_found = match &err { @@ -135,10 +136,17 @@ impl From for ObjectStoreError { if is_not_found { ObjectStoreError::KeyNotFound(err.into()) } else { - let is_transient = matches!( - &err, - HttpError::HttpClient(err) if is_transient_http_error(err) - ); + let is_transient = match &err { + HttpError::HttpClient(err) => is_transient_http_error(err), + HttpError::TokenSource(err) => { + // Token sources are mostly based on the `reqwest` HTTP client, so transient error detection + // can reuse the same logic. + let err = err.as_ref(); + has_transient_io_source(err) + || get_source::(err).is_some_and(is_transient_http_error) + } + HttpError::Response(_) => false, + }; ObjectStoreError::Other { is_transient, source: err.into(),