Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Outlier Detection: use gRPC status code for detecting failures #7942

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion source/common/router/router.cc
Original file line number Diff line number Diff line change
Expand Up @@ -875,6 +875,12 @@ void Filter::onUpstreamReset(Http::StreamResetReason reset_reason,
ENVOY_STREAM_LOG(debug, "upstream reset: reset reason {}", *callbacks_,
Http::Utility::resetReasonToString(reset_reason));

// If gRPC request is finished without grpc-status code, use the one from the
// response headers.
if (grpc_request_ && !grpc_status_code_received_) {
upstream_request.upstream_host_->outlierDetector().putHttpResponseCode(http_status_code_);
}

// TODO: The reset may also come from upstream over the wire. In this case it should be
// treated as external origin error and distinguished from local origin error.
// This matters only when running OutlierDetection with split_external_local_origin_errors config
Expand Down Expand Up @@ -997,13 +1003,34 @@ void Filter::resetOtherUpstreams(UpstreamRequest& upstream_request) {
final_upstream_request->moveIntoList(std::move(final_upstream_request), upstream_requests_);
}

void Filter::maybeSetGrpcStatusToOutlierDetection(
UpstreamRequest& upstream_request, absl::optional<Grpc::Status::GrpcStatus> grpc_status) {
if (grpc_status) {
grpc_status_code_received_ = true;
if (Http::CodeUtility::is5xx(Grpc::Utility::grpcToHttpStatus(grpc_status.value()))) {
// For gRPC request, use the grpc status code for outlier detector when the mapped
// http code is 5xx.
upstream_request.upstream_host_->outlierDetector().putHttpResponseCode(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this future proof, e.g. 4xx codes will never have the same issue here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From outlier_detection.proto, outlier only cares about 5xx status code.

For non-5xx gRPC status code, we use the http status code instead so non-5xx failure's behavior remains unchanged.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you should clarify the intent of this change as

"For gRPC requests, we overwrite the default (200) status for the outlier detector only when the grpc status maps to 5xx HTTP status. TODO: confirm if 4xx status should be recorded with the outlier detector."

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's fine to only consider 5xx today, but can we future proof by always doing grpcToHttpStatus? Shouldn't it be idempotent for 200?

Grpc::Utility::grpcToHttpStatus(grpc_status.value()));
} else {
upstream_request.upstream_host_->outlierDetector().putHttpResponseCode(http_status_code_);
}
}
}

void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& headers,
UpstreamRequest& upstream_request, bool end_stream) {
ENVOY_STREAM_LOG(debug, "upstream headers complete: end_stream={}", *callbacks_, end_stream);

modify_headers_(*headers);

upstream_request.upstream_host_->outlierDetector().putHttpResponseCode(response_code);
if (!grpc_request_) {
ZhouyihaiDing marked this conversation as resolved.
Show resolved Hide resolved
upstream_request.upstream_host_->outlierDetector().putHttpResponseCode(response_code);
} else {
http_status_code_ = response_code;
ZhouyihaiDing marked this conversation as resolved.
Show resolved Hide resolved
ZhouyihaiDing marked this conversation as resolved.
Show resolved Hide resolved
absl::optional<Grpc::Status::GrpcStatus> grpc_status = Grpc::Common::getGrpcStatus(*headers);
maybeSetGrpcStatusToOutlierDetection(upstream_request, grpc_status);
}

if (headers->EnvoyImmediateHealthCheckFail() != nullptr) {
upstream_request.upstream_host_->healthChecker().setUnhealthy();
Expand Down Expand Up @@ -1148,6 +1175,11 @@ void Filter::onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest&
}
}

if (grpc_request_) {
absl::optional<Grpc::Status::GrpcStatus> grpc_status = Grpc::Common::getGrpcStatus(*trailers);
maybeSetGrpcStatusToOutlierDetection(upstream_request, grpc_status);
}

onUpstreamComplete(upstream_request);

callbacks_->encodeTrailers(std::move(trailers));
Expand All @@ -1161,6 +1193,13 @@ void Filter::onUpstreamComplete(UpstreamRequest& upstream_request) {
if (!downstream_end_stream_) {
upstream_request.resetStream();
}

// If gRPC request is finished without grpc-status code, use the one from the
// response headers.
if (grpc_request_ && !grpc_status_code_received_) {
upstream_request.upstream_host_->outlierDetector().putHttpResponseCode(http_status_code_);
}
ZhouyihaiDing marked this conversation as resolved.
Show resolved Hide resolved

callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_);

if (config_.emit_dynamic_stats_ && !callbacks_->streamInfo().healthCheck() &&
Expand Down
7 changes: 6 additions & 1 deletion source/common/router/router.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,8 @@ class Filter : Logger::Loggable<Logger::Id::router>,
Filter(FilterConfig& config)
: config_(config), final_upstream_request_(nullptr), downstream_response_started_(false),
downstream_end_stream_(false), do_shadowing_(false), is_retry_(false),
attempting_internal_redirect_with_complete_stream_(false) {}
attempting_internal_redirect_with_complete_stream_(false),
grpc_status_code_received_(false) {}

~Filter() override;

Expand Down Expand Up @@ -512,6 +513,8 @@ class Filter : Logger::Loggable<Logger::Id::router>,
// downstream if appropriate.
void onUpstreamAbort(Http::Code code, StreamInfo::ResponseFlag response_flag,
absl::string_view body, bool dropped, absl::string_view details);
void maybeSetGrpcStatusToOutlierDetection(UpstreamRequest& upstream_request,
absl::optional<Grpc::Status::GrpcStatus> grpc_status);
void onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& headers,
UpstreamRequest& upstream_request, bool end_stream);
void onUpstreamData(Buffer::Instance& data, UpstreamRequest& upstream_request, bool end_stream);
Expand Down Expand Up @@ -572,6 +575,8 @@ class Filter : Logger::Loggable<Logger::Id::router>,
bool is_retry_ : 1;
bool include_attempt_count_ : 1;
bool attempting_internal_redirect_with_complete_stream_ : 1;
bool grpc_status_code_received_ : 1;
uint64_t http_status_code_{200};
uint32_t attempt_count_{1};
uint32_t pending_retries_{0};
};
Expand Down
29 changes: 27 additions & 2 deletions test/common/router/router_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1199,6 +1199,31 @@ TEST_F(RouterTest, GrpcAlreadyExistsTrailersOnly) {
EXPECT_TRUE(verifyHostUpstreamStats(1, 0));
}

// Validate outlier detections records failure when gRPC response status is Unavailable.
TEST_F(RouterTest, GrpcOutlierDetectionUnavailableStatusCode) {
NiceMock<Http::MockStreamEncoder> encoder1;
Http::StreamDecoder* response_decoder = nullptr;
EXPECT_CALL(cm_.conn_pool_, newStream(_, _))
.WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks)
-> Http::ConnectionPool::Cancellable* {
response_decoder = &decoder;
callbacks.onPoolReady(encoder1, cm_.conn_pool_.host_);
return nullptr;
}));
expectResponseTimerCreate();

Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "20S"}};
HttpTestUtility::addDefaultHeaders(headers);
router_.decodeHeaders(headers, true);

Http::HeaderMapPtr response_headers(
new Http::TestHeaderMapImpl{{":status", "200"}, {"grpc-status", "14"}});
// Outlier detector will use the gRPC response status code.
EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(503));
response_decoder->decodeHeaders(std::move(response_headers), true);
EXPECT_TRUE(verifyHostUpstreamStats(0, 1));
}
ZhouyihaiDing marked this conversation as resolved.
Show resolved Hide resolved

// Validate gRPC Internal response stats are sane when response is trailers only.
TEST_F(RouterTest, GrpcInternalTrailersOnly) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

NiceMock<Http::MockStreamEncoder> encoder1;
Expand All @@ -1218,7 +1243,7 @@ TEST_F(RouterTest, GrpcInternalTrailersOnly) {

Http::HeaderMapPtr response_headers(
new Http::TestHeaderMapImpl{{":status", "200"}, {"grpc-status", "13"}});
EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(200));
EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(500));
response_decoder->decodeHeaders(std::move(response_headers), true);
EXPECT_TRUE(verifyHostUpstreamStats(0, 1));
}
Expand Down Expand Up @@ -1326,10 +1351,10 @@ TEST_F(RouterTest, GrpcInternal) {
router_.decodeHeaders(headers, true);

Http::HeaderMapPtr response_headers(new Http::TestHeaderMapImpl{{":status", "200"}});
EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(200));
response_decoder->decodeHeaders(std::move(response_headers), false);
EXPECT_TRUE(verifyHostUpstreamStats(0, 0));
Http::HeaderMapPtr response_trailers(new Http::TestHeaderMapImpl{{"grpc-status", "13"}});
EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(500));
response_decoder->decodeTrailers(std::move(response_trailers));
EXPECT_TRUE(verifyHostUpstreamStats(0, 1));
}
Expand Down