From 7e2099ae585ba70997d59a8dfe6246b9728c1b24 Mon Sep 17 00:00:00 2001 From: asraa Date: Mon, 8 Feb 2021 21:47:27 -0500 Subject: [PATCH] [server] Fix uncaught exceptions in callbacks that initialize secondary clusters (#14963) Fixes uncaught exceptions in callbacks that can occur after server initialization. Signed-off-by: Asra Ali --- source/server/server.cc | 32 ++- ...erfuzz-testcase-minimized-5146513187667968 | 43 ++++ ...erfuzz-testcase-minimized-6612746286268416 | 232 ++++++++++++++++++ test/server/server_test.cc | 34 ++- .../test_data/server/hds_exception.yaml | 27 ++ .../server/health_check_nullptr.yaml | 23 ++ 6 files changed, 378 insertions(+), 13 deletions(-) create mode 100644 test/server/server_corpus/clusterfuzz-testcase-minimized-5146513187667968 create mode 100644 test/server/server_corpus/clusterfuzz-testcase-minimized-6612746286268416 create mode 100644 test/server/test_data/server/hds_exception.yaml create mode 100644 test/server/test_data/server/health_check_nullptr.yaml diff --git a/source/server/server.cc b/source/server/server.cc index b7afdd253e25..11bca3dc99aa 100644 --- a/source/server/server.cc +++ b/source/server/server.cc @@ -601,21 +601,33 @@ void InstanceImpl::onClusterManagerPrimaryInitializationComplete() { void InstanceImpl::onRuntimeReady() { // Begin initializing secondary clusters after RTDS configuration has been applied. - clusterManager().initializeSecondaryClusters(bootstrap_); + // Initializing can throw exceptions, so catch these. + try { + clusterManager().initializeSecondaryClusters(bootstrap_); + } catch (const EnvoyException& e) { + ENVOY_LOG(warn, "Skipping initialization of secondary cluster: {}", e.what()); + shutdown(); + } if (bootstrap_.has_hds_config()) { const auto& hds_config = bootstrap_.hds_config(); async_client_manager_ = std::make_unique( *config_.clusterManager(), thread_local_, time_source_, *api_, grpc_context_.statNames()); - hds_delegate_ = std::make_unique( - stats_store_, - Config::Utility::factoryForGrpcApiConfigSource(*async_client_manager_, hds_config, - stats_store_, false) - ->create(), - Config::Utility::getAndCheckTransportVersion(hds_config), *dispatcher_, - Runtime::LoaderSingleton::get(), stats_store_, *ssl_context_manager_, info_factory_, - access_log_manager_, *config_.clusterManager(), *local_info_, *admin_, *singleton_manager_, - thread_local_, messageValidationContext().dynamicValidationVisitor(), *api_); + try { + hds_delegate_ = std::make_unique( + stats_store_, + Config::Utility::factoryForGrpcApiConfigSource(*async_client_manager_, hds_config, + stats_store_, false) + ->create(), + Config::Utility::getAndCheckTransportVersion(hds_config), *dispatcher_, + Runtime::LoaderSingleton::get(), stats_store_, *ssl_context_manager_, info_factory_, + access_log_manager_, *config_.clusterManager(), *local_info_, *admin_, + *singleton_manager_, thread_local_, messageValidationContext().dynamicValidationVisitor(), + *api_); + } catch (const EnvoyException& e) { + ENVOY_LOG(warn, "Skipping initialization of HDS cluster: {}", e.what()); + shutdown(); + } } // If there is no global limit to the number of active connections, warn on startup. diff --git a/test/server/server_corpus/clusterfuzz-testcase-minimized-5146513187667968 b/test/server/server_corpus/clusterfuzz-testcase-minimized-5146513187667968 new file mode 100644 index 000000000000..465615da0d72 --- /dev/null +++ b/test/server/server_corpus/clusterfuzz-testcase-minimized-5146513187667968 @@ -0,0 +1,43 @@ +static_resources { + clusters { + name: "ser" + connect_timeout { + nanos: 813 + } + health_checks { + timeout { + nanos: 926363914 + } + interval { + seconds: 165537 + } + unhealthy_threshold { + value: 16 + } + healthy_threshold { + } + http_health_check { + host: "h" + path: "&" + } + } + load_assignment { + cluster_name: "." + endpoints { + lb_endpoints { + endpoint { + address { + pipe { + path: "." + } + } + } + health_status: DRAINING + } + } + } + } +} +hds_config { +} +enable_dispatcher_stats: true diff --git a/test/server/server_corpus/clusterfuzz-testcase-minimized-6612746286268416 b/test/server/server_corpus/clusterfuzz-testcase-minimized-6612746286268416 new file mode 100644 index 000000000000..19ccaf6ecd5c --- /dev/null +++ b/test/server/server_corpus/clusterfuzz-testcase-minimized-6612746286268416 @@ -0,0 +1,232 @@ +static_resources { + clusters { + name: "ser" + connect_timeout { + nanos: 813 + } + per_connection_buffer_limit_bytes { + } + lb_policy: LEAST_REQUEST + health_checks { + timeout { + nanos: 926363914 + } + interval { + seconds: 165537 + nanos: 95 + } + unhealthy_threshold { + } + healthy_threshold { + } + reuse_connection { + value: true + } + http_health_check { + host: "h" + path: "&" + codec_client_type: HTTP2 + } + healthy_edge_interval { + nanos: 54784 + } + } + protocol_selection: USE_DOWNSTREAM_PROTOCOL + ignore_health_on_host_removal: true + load_assignment { + cluster_name: "*" + endpoints { + locality { + sub_zone: "\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\001\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037" + } + lb_endpoints { + endpoint { + address { + pipe { + path: "." + } + } + } + health_status: HEALTHY + } + } + endpoints { + locality { + sub_zone: "\037\037\037\037\037\037\037\037\037\000\037\037\037\037\037\037\037\037\037\037\037\001\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\031\037\037\037\037\037\037\037\037\037\037\037\037" + } + lb_endpoints { + endpoint { + address { + pipe { + path: "^" + } + } + } + health_status: DRAINING + metadata { + } + } + } + endpoints { + priority: 16 + } + endpoints { + } + endpoints { + lb_endpoints { + endpoint { + address { + pipe { + path: "." + } + } + } + health_status: HEALTHY + } + priority: 64 + } + } + use_tcp_for_dns_lookups: true + upstream_http_protocol_options { + auto_sni: true + } + track_timeout_budgets: true + } + clusters { + name: "@" + connect_timeout { + seconds: 8519680 + nanos: 247771776 + } + outlier_detection { + success_rate_minimum_hosts { + value: 536870912 + } + } + ring_hash_lb_config { + } + transport_socket { + name: "raw_buffer" + } + metadata { + filter_metadata { + key: "I" + value { + } + } + } + load_assignment { + cluster_name: "." + endpoints { + locality { + sub_zone: "\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\001\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037" + } + } + endpoints { + locality { + sub_zone: "\037\037\037\037\037\037\037\037\037\000\037\037\037\037\037\037\037\037\037\037\037\001\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\037\031\037\037\037\037\037\037\037\037\037\037\037\037" + } + lb_endpoints { + endpoint { + address { + pipe { + path: "." + } + } + } + health_status: HEALTHY + } + } + endpoints { + priority: 16 + } + endpoints { + lb_endpoints { + endpoint { + address { + pipe { + path: "." + } + } + hostname: "ser" + } + health_status: HEALTHY + } + } + endpoints { + lb_endpoints { + endpoint { + address { + pipe { + path: "." + } + } + } + health_status: HEALTHY + } + priority: 64 + } + } + } + clusters { + name: "7" + type: STRICT_DNS + connect_timeout { + nanos: 926363914 + } + http_protocol_options { + allow_absolute_url { + value: true + } + accept_http_10: true + } + dns_lookup_family: V6_ONLY + transport_socket { + name: "raw_buffer" + } + protocol_selection: USE_DOWNSTREAM_PROTOCOL + common_lb_config { + healthy_panic_threshold { + } + ignore_new_hosts_until_first_hc: true + } + } +} +cluster_manager { + load_stats_config { + api_type: GRPC + grpc_services { + google_grpc { + target_uri: "2" + channel_credentials { + ssl_credentials { + private_key { + filename: "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" + } + } + } + call_credentials { + service_account_jwt_access { + json_key: "\"a" + token_lifetime_seconds: 140737488355326 + } + } + call_credentials { + service_account_jwt_access { + json_key: "\001\000\000\000\"a" + token_lifetime_seconds: 140737488355326 + } + } + stat_prefix: "*" + } + } + request_timeout { + nanos: 14024704 + } + transport_api_version: V3 + } +} +stats_flush_interval { + nanos: 2097152 +} +header_prefix: "\001" \ No newline at end of file diff --git a/test/server/server_test.cc b/test/server/server_test.cc index 24c5869c4526..681bb9cf068e 100644 --- a/test/server/server_test.cc +++ b/test/server/server_test.cc @@ -391,6 +391,31 @@ TEST_P(ServerInstanceImplTest, EmptyShutdownLifecycleNotifications) { EXPECT_EQ(0L, TestUtility::findGauge(stats_store_, "server.state")->value()); } +// Catch exceptions in secondary cluster initialization callbacks. These are not caught in the main +// initialization try/catch. +TEST_P(ServerInstanceImplTest, SecondaryClusterExceptions) { + // Error in reading illegal file path for channel credentials in secondary cluster. + EXPECT_LOG_CONTAINS("warn", + "Skipping initialization of secondary cluster: API configs must have either " + "a gRPC service or a cluster name defined", + { + initialize("test/server/test_data/server/health_check_nullptr.yaml"); + server_->dispatcher().run(Event::Dispatcher::RunType::NonBlock); + }); +} + +// Catch exceptions in HDS cluster initialization callbacks. +TEST_P(ServerInstanceImplTest, HdsClusterException) { + // Error in reading illegal file path for channel credentials in secondary cluster. + EXPECT_LOG_CONTAINS("warn", + "Skipping initialization of HDS cluster: API configs must have either a gRPC " + "service or a cluster name defined", + { + initialize("test/server/test_data/server/hds_exception.yaml"); + server_->dispatcher().run(Event::Dispatcher::RunType::NonBlock); + }); +} + TEST_P(ServerInstanceImplTest, LifecycleNotifications) { bool startup = false, post_init = false, shutdown = false, shutdown_with_completion = false; absl::Notification started, post_init_fired, shutdown_begin, completion_block, completion_done; @@ -812,9 +837,12 @@ TEST_P(ServerInstanceImplTest, // set. TEST_P(ServerInstanceImplTest, DEPRECATED_FEATURE_TEST(FailToLoadV2HdsTransportWithoutExplicitVersion)) { - EXPECT_THROW_WITH_REGEX(initialize("test/server/test_data/server/hds_v2.yaml"), - DeprecatedMajorVersionException, - "V2 .and AUTO. xDS transport protocol versions are deprecated in.*"); + // HDS cluster initialization happens through callbacks after runtime initialization. Exceptions + // are caught and will result in server shutdown. + EXPECT_LOG_CONTAINS("warn", + "Skipping initialization of HDS cluster: V2 (and AUTO) xDS transport " + "protocol versions are deprecated", + initialize("test/server/test_data/server/hds_v2.yaml")); } // Validate that bootstrap v2 is rejected when --bootstrap-version is not set. diff --git a/test/server/test_data/server/hds_exception.yaml b/test/server/test_data/server/hds_exception.yaml new file mode 100644 index 000000000000..bb9816e1f004 --- /dev/null +++ b/test/server/test_data/server/hds_exception.yaml @@ -0,0 +1,27 @@ +staticResources: + clusters: + - name: ser + connectTimeout: 0.000000813s + healthChecks: + - timeout: 0.926363914s + interval: 165537s + unhealthyThreshold: 16 + healthyThreshold: 0 + httpHealthCheck: + host: h + path: '&' + loadAssignment: + clusterName: . + endpoints: + - lbEndpoints: + - endpoint: + address: + pipe: + path: . + healthStatus: DRAINING +hdsConfig: + apiType: GRPC + requestTimeout: 0.014024704s + transportApiVersion: V3 + +enableDispatcherStats: true diff --git a/test/server/test_data/server/health_check_nullptr.yaml b/test/server/test_data/server/health_check_nullptr.yaml new file mode 100644 index 000000000000..e150a3fc717d --- /dev/null +++ b/test/server/test_data/server/health_check_nullptr.yaml @@ -0,0 +1,23 @@ +staticResources: + clusters: + - name: foo + connectTimeout: 0.000000813s + perConnectionBufferLimitBytes: 0 + lbPolicy: LEAST_REQUEST + loadAssignment: + clusterName: '*' + endpoints: + - locality: + subZone: "\x1F\x1F" + lbEndpoints: + - endpoint: + address: + pipe: + path: . + healthStatus: HEALTHY +clusterManager: + loadStatsConfig: + apiType: GRPC + requestTimeout: 0.014024704s + transportApiVersion: V3 +statsFlushInterval: 0.002097152s