From 5661eefc120fca8e747df3eafa987281be3d80e9 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 17 Sep 2024 13:30:18 +0100 Subject: [PATCH] Make collateral fetching retries configurable, and shut down when they are exhausted (#6478) --- .snpcc_canary | 2 +- CHANGELOG.md | 1 + doc/host_config_schema/cchost_config.json | 5 ++ include/ccf/pal/attestation.h | 15 +++++- include/ccf/pal/attestation_sev_snp.h | 25 ++++++--- .../pal/attestation_sev_snp_endorsements.h | 53 ++++++++++++------- src/node/quote_endorsements_client.h | 36 ++++++++++--- tests/infra/network.py | 13 +++++ tests/infra/remote.py | 1 + tests/reconfiguration.py | 11 +++- 10 files changed, 127 insertions(+), 35 deletions(-) diff --git a/.snpcc_canary b/.snpcc_canary index a90204f06158..08708bcac61a 100644 --- a/.snpcc_canary +++ b/.snpcc_canary @@ -3,4 +3,4 @@ O \ o | / /-xXx--//-----x=x--/-xXx--/---x---->>>--/ ... -/\/\d(-_-)b +/\/\d(-_-)b/\/\ diff --git a/CHANGELOG.md b/CHANGELOG.md index e463c55caf7c..af0269301b81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - The `set_jwt_issuer` governance action has been updated, and no longer accepts `key_filter` or `key_policy` arguments (#6450). - Nodes started in `Join` mode will shut down if they receive and unrecoverable condition such as `StartupSeqnoIsOld` when attempting to join (#6471). +- In configuration, `attestation.snp_endorsements_servers` can specify a `max_retries_count`. If the count has been exhausted without success for all configured servers, the node will shut down (#6478). ### Removed diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json index 313e6144af2b..4662da4aad36 100644 --- a/doc/host_config_schema/cchost_config.json +++ b/doc/host_config_schema/cchost_config.json @@ -486,6 +486,11 @@ "url": { "type": "string", "description": "Server URLs used to retrieve attestation report endorsement certificates, e.g. \"kdsintf.amd.com\" (AMD), \"global.acccache.azure.net\" (Azure) or \"169.254.169.254\" (THIM)" + }, + "max_retries_count": { + "type": "integer", + "default": 3, + "description": "Maximum number of retries to fetch endorsements from the server" } }, "required": ["url"], diff --git a/include/ccf/pal/attestation.h b/include/ccf/pal/attestation.h index c3a7ad859e00..2cc25bee6939 100644 --- a/include/ccf/pal/attestation.h +++ b/include/ccf/pal/attestation.h @@ -442,6 +442,19 @@ namespace ccf::pal measurement = claim_measurement.value(); report_data = custom_claim_report_data.value(); } - #endif + + class AttestationCollateralFetchingTimeout : public std::exception + { + private: + std::string msg; + + public: + AttestationCollateralFetchingTimeout(const std::string& msg_) : msg(msg_) {} + + virtual const char* what() const throw() + { + return msg.c_str(); + } + }; } \ No newline at end of file diff --git a/include/ccf/pal/attestation_sev_snp.h b/include/ccf/pal/attestation_sev_snp.h index 5ddee523415e..2f9e40b078cf 100644 --- a/include/ccf/pal/attestation_sev_snp.h +++ b/include/ccf/pal/attestation_sev_snp.h @@ -187,24 +187,31 @@ QPHfbkH0CyPfhl1jWhJFZasCAwEAAQ== auto chip_id_hex = fmt::format("{:02x}", fmt::join(quote.chip_id, "")); auto reported_tcb = fmt::format("{:0x}", *(uint64_t*)("e.reported_tcb)); + constexpr size_t default_max_retries_count = 10; + if (endorsements_servers.empty()) { // Default to Azure server if no servers are specified config.servers.emplace_back(make_azure_endorsements_server( - default_azure_endorsements_endpoint, chip_id_hex, reported_tcb)); + default_azure_endorsements_endpoint, + chip_id_hex, + reported_tcb, + default_max_retries_count)); return config; } for (auto const& server : endorsements_servers) { + size_t max_retries_count = + server.max_retries_count.value_or(default_max_retries_count); switch (server.type) { case EndorsementsEndpointType::Azure: { auto loc = get_endpoint_loc(server, default_azure_endorsements_endpoint); - config.servers.emplace_back( - make_azure_endorsements_server(loc, chip_id_hex, reported_tcb)); + config.servers.emplace_back(make_azure_endorsements_server( + loc, chip_id_hex, reported_tcb, max_retries_count)); break; } case EndorsementsEndpointType::AMD: @@ -217,15 +224,21 @@ QPHfbkH0CyPfhl1jWhJFZasCAwEAAQ== auto loc = get_endpoint_loc(server, default_amd_endorsements_endpoint); config.servers.emplace_back(make_amd_endorsements_server( - loc, chip_id_hex, boot_loader, tee, snp, microcode)); + loc, + chip_id_hex, + boot_loader, + tee, + snp, + microcode, + max_retries_count)); break; } case EndorsementsEndpointType::THIM: { auto loc = get_endpoint_loc(server, default_thim_endorsements_endpoint); - config.servers.emplace_back( - make_thim_endorsements_server(loc, chip_id_hex, reported_tcb)); + config.servers.emplace_back(make_thim_endorsements_server( + loc, chip_id_hex, reported_tcb, max_retries_count)); break; } default: diff --git a/include/ccf/pal/attestation_sev_snp_endorsements.h b/include/ccf/pal/attestation_sev_snp_endorsements.h index 23db1c2e6c7b..b1a41e8cfbeb 100644 --- a/include/ccf/pal/attestation_sev_snp_endorsements.h +++ b/include/ccf/pal/attestation_sev_snp_endorsements.h @@ -47,6 +47,7 @@ namespace ccf::pal::snp bool response_is_thim_json = false; std::map headers = {}; bool tls = true; + size_t max_retries_count = 3; bool operator==(const EndpointInfo&) const = default; }; @@ -73,12 +74,14 @@ namespace ccf::pal::snp { EndorsementsEndpointType type = Azure; std::optional url = std::nullopt; + std::optional max_retries_count = std::nullopt; bool operator==(const EndorsementsServer&) const = default; }; DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(EndorsementsServer); DECLARE_JSON_REQUIRED_FIELDS(EndorsementsServer); - DECLARE_JSON_OPTIONAL_FIELDS(EndorsementsServer, type, url); + DECLARE_JSON_OPTIONAL_FIELDS( + EndorsementsServer, type, url, max_retries_count); using EndorsementsServers = std::vector; struct HostPort @@ -94,15 +97,20 @@ namespace ccf::pal::snp make_azure_endorsements_server( const HostPort& endpoint, const std::string& chip_id_hex, - const std::string& reported_tcb) + const std::string& reported_tcb, + size_t max_retries_count) { std::map params; params["api-version"] = "2020-10-15-preview"; - return { - {endpoint.host, - endpoint.port, - fmt::format("/SevSnpVM/certificates/{}/{}", chip_id_hex, reported_tcb), - params}}; + EndorsementEndpointsConfiguration::EndpointInfo info{ + endpoint.host, + endpoint.port, + fmt::format("/SevSnpVM/certificates/{}/{}", chip_id_hex, reported_tcb), + params}; + + info.max_retries_count = max_retries_count; + + return {info}; } // AMD endorsements endpoints. See @@ -116,7 +124,8 @@ namespace ccf::pal::snp const std::string& boot_loader, const std::string& tee, const std::string& snp, - const std::string& microcode) + const std::string& microcode, + size_t max_retries_count) { std::map params; params["blSPL"] = boot_loader; @@ -125,19 +134,23 @@ namespace ccf::pal::snp params["ucodeSPL"] = microcode; EndorsementEndpointsConfiguration::Server server; - server.push_back({ + EndorsementEndpointsConfiguration::EndpointInfo leaf{ endpoint.host, endpoint.port, fmt::format("/vcek/v1/{}/{}", product_name, chip_id_hex), params, true // DER - }); - server.push_back( - {endpoint.host, - endpoint.port, - fmt::format("/vcek/v1/{}/cert_chain", product_name), - {}}); + }; + leaf.max_retries_count = max_retries_count; + EndorsementEndpointsConfiguration::EndpointInfo chain{ + endpoint.host, + endpoint.port, + fmt::format("/vcek/v1/{}/cert_chain", product_name), + {}}; + chain.max_retries_count = max_retries_count; + server.push_back(leaf); + server.push_back(chain); return server; } @@ -148,12 +161,13 @@ namespace ccf::pal::snp make_thim_endorsements_server( const HostPort& endpoint, const std::string& chip_id_hex, - const std::string& reported_tcb) + const std::string& reported_tcb, + size_t max_retries_count) { std::map params; params["tcbVersion"] = reported_tcb; params["platformId"] = chip_id_hex; - return {{ + EndorsementEndpointsConfiguration::EndpointInfo info{ endpoint.host, endpoint.port, "/metadata/THIM/amd/certification", @@ -162,7 +176,10 @@ namespace ccf::pal::snp true, // But THIM JSON {{"Metadata", "true"}}, false // No TLS - }}; + }; + info.max_retries_count = max_retries_count; + + return {info}; } } diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 2a64b166df22..0d0c2594a39a 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -9,6 +9,21 @@ namespace ccf { using QuoteEndorsementsFetchedCallback = std::function&& endorsements)>; + using Server = pal::snp::EndorsementEndpointsConfiguration::Server; + + static inline size_t max_retries_count(const Server& server) + { + // Each server should contain at least one endpoint definition + if (server.empty()) + { + throw std::logic_error( + "No endpoints defined in SNP attestation collateral server"); + } + + // If multiple endpoints are defined, the max_retries_count of the first + // if the maximum number of retries for the server. + return server.front().max_retries_count; + } // Resilient client to fetch attestation report endorsement certificate. class QuoteEndorsementsClient @@ -17,16 +32,11 @@ namespace ccf private: using EndpointInfo = pal::snp::EndorsementEndpointsConfiguration::EndpointInfo; - using Server = pal::snp::EndorsementEndpointsConfiguration::Server; // Resend request after this interval if no response was received from // remote server static constexpr size_t server_connection_timeout_s = 3; - // Maximum number of retries per remote server before giving up and moving - // on to the next server. - static constexpr size_t max_server_retries_count = 3; - std::shared_ptr rpcsessions; pal::snp::EndorsementEndpointsConfiguration config; @@ -121,9 +131,18 @@ namespace ccf if (msg->data.request_id >= msg->data.self->last_received_request_id) { auto& servers = msg->data.self->config.servers; + // Should always contain at least one server, + // installed by ccf::pal::make_endorsement_endpoint_configuration() + if (servers.empty()) + { + throw std::logic_error( + "No server specified to fetch endorsements"); + } + msg->data.self->server_retries_count++; if ( - msg->data.self->server_retries_count >= max_server_retries_count) + msg->data.self->server_retries_count >= + max_retries_count(servers.front())) { if (servers.size() > 1) { @@ -137,7 +156,10 @@ namespace ccf "Giving up retrying fetching attestation endorsements from " "{} after {} attempts", server.front().host, - max_server_retries_count); + server.front().max_retries_count); + throw ccf::pal::AttestationCollateralFetchingTimeout( + "Timed out fetching attestation endorsements from all " + "configured servers"); return; } } diff --git a/tests/infra/network.py b/tests/infra/network.py index 93abc20e89be..b454435a50bf 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -21,6 +21,7 @@ import http import pprint import functools +import re from datetime import datetime, timedelta, timezone from infra.consortium import slurp_file from infra.snp import IS_SNP @@ -76,6 +77,10 @@ class StartupSeqnoIsOld(Exception): pass +class CollateralFetchTimeout(Exception): + pass + + class ServiceCertificateInvalid(Exception): pass @@ -916,6 +921,9 @@ def run_join_node( errors = [] self.nodes.remove(node) if errors: + giving_up_fetching = re.compile( + "Giving up retrying fetching attestation endorsements from .* after (\d+) attempts" + ) # Throw accurate exceptions if known errors found in for error in errors: if "Quote does not contain known enclave measurement" in error: @@ -924,6 +932,11 @@ def run_join_node( raise StartupSeqnoIsOld(has_stopped) from e if "invalid cert on handshake" in error: raise ServiceCertificateInvalid from e + match = giving_up_fetching.search(error) + if match: + raise CollateralFetchTimeout( + has_stopped, int(match.group(1)) + ) from e raise def join_node( diff --git a/tests/infra/remote.py b/tests/infra/remote.py index 14aad26f109e..0a6fe11e9ef3 100644 --- a/tests/infra/remote.py +++ b/tests/infra/remote.py @@ -745,6 +745,7 @@ def __init__( s = {} s["type"] = server_type s["url"] = url + s["max_retries_count"] = 4 snp_endorsements_servers_list.append(s) # Default snp_security_policy_file if not set diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index f3cada1ab143..c79b3550450e 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -256,10 +256,13 @@ def test_add_node_endorsements_endpoints(network, args): test_vectors = [ (["Azure:global.acccache.azure.net"], True), (["Azure:global.acccache.azure.net:443"], True), + (["Azure:invalid.azure.net:443"], False), (["AMD:kdsintf.amd.com"], True), (["AMD:invalid.amd.com"], False), (["THIM:$Fabric_NodeIPOrFQDN:2377"], True), + (["THIM:invalid:2377"], False), (["Azure:invalid.azure.com", "AMD:kdsintf.amd.com"], True), # Fallback server + (["Azure:invalid.azure.com", "AMD:invalid.amd.com"], False), ] for servers, expected_result in test_vectors: @@ -275,11 +278,15 @@ def test_add_node_endorsements_endpoints(network, args): args_copy, timeout=15, ) - except TimeoutError: - assert not expected_result + except infra.network.CollateralFetchTimeout as e: LOG.info( f"Node with invalid quote endorsement servers {servers} could not join as expected" ) + assert not expected_result + assert e.args == ( + True, + 4, + ), "Node has stopped after timing out on fetching collateral" else: assert ( expected_result