Make collateral fetching retries configurable, and shut down when the…

…y are exhausted (#6478)
microsoft · Sep 17, 2024 · 5661eef · 5661eef
1 parent 36f11f9
commit 5661eef
Show file tree

Hide file tree

Showing 10 changed files with 127 additions and 35 deletions.
diff --git a/.snpcc_canary b/.snpcc_canary
@@ -3,4 +3,4 @@
    O     \     o      |    /
 /-xXx--//-----x=x--/-xXx--/---x---->>>--/
 ...
-/\/\d(-_-)b
+/\/\d(-_-)b/\/\
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 - The `set_jwt_issuer` governance action has been updated, and no longer accepts `key_filter` or `key_policy` arguments (#6450).
 - Nodes started in `Join` mode will shut down if they receive and unrecoverable condition such as `StartupSeqnoIsOld` when attempting to join (#6471).
+- In configuration, `attestation.snp_endorsements_servers` can specify a `max_retries_count`. If the count has been exhausted without success for all configured servers, the node will shut down (#6478).
 
 ### Removed
 

diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json
@@ -486,6 +486,11 @@
               "url": {
                 "type": "string",
                 "description": "Server URLs used to retrieve attestation report endorsement certificates, e.g. \"kdsintf.amd.com\" (AMD), \"global.acccache.azure.net\" (Azure) or \"169.254.169.254\" (THIM)"
+              },
+              "max_retries_count": {
+                "type": "integer",
+                "default": 3,
+                "description": "Maximum number of retries to fetch endorsements from the server"
               }
             },
             "required": ["url"],

diff --git a/include/ccf/pal/attestation.h b/include/ccf/pal/attestation.h
@@ -442,6 +442,19 @@ namespace ccf::pal
     measurement = claim_measurement.value();
     report_data = custom_claim_report_data.value();
   }
-
 #endif
+
+  class AttestationCollateralFetchingTimeout : public std::exception
+  {
+  private:
+    std::string msg;
+
+  public:
+    AttestationCollateralFetchingTimeout(const std::string& msg_) : msg(msg_) {}
+
+    virtual const char* what() const throw()
+    {
+      return msg.c_str();
+    }
+  };
 }
diff --git a/include/ccf/pal/attestation_sev_snp.h b/include/ccf/pal/attestation_sev_snp.h
@@ -187,24 +187,31 @@ QPHfbkH0CyPfhl1jWhJFZasCAwEAAQ==
     auto chip_id_hex = fmt::format("{:02x}", fmt::join(quote.chip_id, ""));
     auto reported_tcb = fmt::format("{:0x}", *(uint64_t*)(&quote.reported_tcb));
 
+    constexpr size_t default_max_retries_count = 10;
+
     if (endorsements_servers.empty())
     {
       // Default to Azure server if no servers are specified
       config.servers.emplace_back(make_azure_endorsements_server(
-        default_azure_endorsements_endpoint, chip_id_hex, reported_tcb));
+        default_azure_endorsements_endpoint,
+        chip_id_hex,
+        reported_tcb,
+        default_max_retries_count));
       return config;
     }
 
     for (auto const& server : endorsements_servers)
     {
+      size_t max_retries_count =
+        server.max_retries_count.value_or(default_max_retries_count);
       switch (server.type)
       {
         case EndorsementsEndpointType::Azure:
         {
           auto loc =
             get_endpoint_loc(server, default_azure_endorsements_endpoint);
-          config.servers.emplace_back(
-            make_azure_endorsements_server(loc, chip_id_hex, reported_tcb));
+          config.servers.emplace_back(make_azure_endorsements_server(
+            loc, chip_id_hex, reported_tcb, max_retries_count));
           break;
         }
         case EndorsementsEndpointType::AMD:
@@ -217,15 +224,21 @@ QPHfbkH0CyPfhl1jWhJFZasCAwEAAQ==
           auto loc =
             get_endpoint_loc(server, default_amd_endorsements_endpoint);
           config.servers.emplace_back(make_amd_endorsements_server(
-            loc, chip_id_hex, boot_loader, tee, snp, microcode));
+            loc,
+            chip_id_hex,
+            boot_loader,
+            tee,
+            snp,
+            microcode,
+            max_retries_count));
           break;
         }
         case EndorsementsEndpointType::THIM:
         {
           auto loc =
             get_endpoint_loc(server, default_thim_endorsements_endpoint);
-          config.servers.emplace_back(
-            make_thim_endorsements_server(loc, chip_id_hex, reported_tcb));
+          config.servers.emplace_back(make_thim_endorsements_server(
+            loc, chip_id_hex, reported_tcb, max_retries_count));
           break;
         }
         default:

diff --git a/include/ccf/pal/attestation_sev_snp_endorsements.h b/include/ccf/pal/attestation_sev_snp_endorsements.h
@@ -47,6 +47,7 @@ namespace ccf::pal::snp
       bool response_is_thim_json = false;
       std::map<std::string, std::string> headers = {};
       bool tls = true;
+      size_t max_retries_count = 3;
 
       bool operator==(const EndpointInfo&) const = default;
     };
@@ -73,12 +74,14 @@ namespace ccf::pal::snp
   {
     EndorsementsEndpointType type = Azure;
     std::optional<std::string> url = std::nullopt;
+    std::optional<size_t> max_retries_count = std::nullopt;
 
     bool operator==(const EndorsementsServer&) const = default;
   };
   DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(EndorsementsServer);
   DECLARE_JSON_REQUIRED_FIELDS(EndorsementsServer);
-  DECLARE_JSON_OPTIONAL_FIELDS(EndorsementsServer, type, url);
+  DECLARE_JSON_OPTIONAL_FIELDS(
+    EndorsementsServer, type, url, max_retries_count);
   using EndorsementsServers = std::vector<EndorsementsServer>;
 
   struct HostPort
@@ -94,15 +97,20 @@ namespace ccf::pal::snp
   make_azure_endorsements_server(
     const HostPort& endpoint,
     const std::string& chip_id_hex,
-    const std::string& reported_tcb)
+    const std::string& reported_tcb,
+    size_t max_retries_count)
   {
     std::map<std::string, std::string> params;
     params["api-version"] = "2020-10-15-preview";
-    return {
-      {endpoint.host,
-       endpoint.port,
-       fmt::format("/SevSnpVM/certificates/{}/{}", chip_id_hex, reported_tcb),
-       params}};
+    EndorsementEndpointsConfiguration::EndpointInfo info{
+      endpoint.host,
+      endpoint.port,
+      fmt::format("/SevSnpVM/certificates/{}/{}", chip_id_hex, reported_tcb),
+      params};
+
+    info.max_retries_count = max_retries_count;
+
+    return {info};
   }
 
   // AMD endorsements endpoints. See
@@ -116,7 +124,8 @@ namespace ccf::pal::snp
     const std::string& boot_loader,
     const std::string& tee,
     const std::string& snp,
-    const std::string& microcode)
+    const std::string& microcode,
+    size_t max_retries_count)
   {
     std::map<std::string, std::string> params;
     params["blSPL"] = boot_loader;
@@ -125,19 +134,23 @@ namespace ccf::pal::snp
     params["ucodeSPL"] = microcode;
 
     EndorsementEndpointsConfiguration::Server server;
-    server.push_back({
+    EndorsementEndpointsConfiguration::EndpointInfo leaf{
       endpoint.host,
       endpoint.port,
       fmt::format("/vcek/v1/{}/{}", product_name, chip_id_hex),
       params,
       true // DER
-    });
-    server.push_back(
-      {endpoint.host,
-       endpoint.port,
-       fmt::format("/vcek/v1/{}/cert_chain", product_name),
-       {}});
+    };
+    leaf.max_retries_count = max_retries_count;
+    EndorsementEndpointsConfiguration::EndpointInfo chain{
+      endpoint.host,
+      endpoint.port,
+      fmt::format("/vcek/v1/{}/cert_chain", product_name),
+      {}};
+    chain.max_retries_count = max_retries_count;
 
+    server.push_back(leaf);
+    server.push_back(chain);
     return server;
   }
 
@@ -148,12 +161,13 @@ namespace ccf::pal::snp
   make_thim_endorsements_server(
     const HostPort& endpoint,
     const std::string& chip_id_hex,
-    const std::string& reported_tcb)
+    const std::string& reported_tcb,
+    size_t max_retries_count)
   {
     std::map<std::string, std::string> params;
     params["tcbVersion"] = reported_tcb;
     params["platformId"] = chip_id_hex;
-    return {{
+    EndorsementEndpointsConfiguration::EndpointInfo info{
       endpoint.host,
       endpoint.port,
       "/metadata/THIM/amd/certification",
@@ -162,7 +176,10 @@ namespace ccf::pal::snp
       true, // But THIM JSON
       {{"Metadata", "true"}},
       false // No TLS
-    }};
+    };
+    info.max_retries_count = max_retries_count;
+
+    return {info};
   }
 }
 

diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h
@@ -9,6 +9,21 @@ namespace ccf
 {
   using QuoteEndorsementsFetchedCallback =
     std::function<void(std::vector<uint8_t>&& endorsements)>;
+  using Server = pal::snp::EndorsementEndpointsConfiguration::Server;
+
+  static inline size_t max_retries_count(const Server& server)
+  {
+    // Each server should contain at least one endpoint definition
+    if (server.empty())
+    {
+      throw std::logic_error(
+        "No endpoints defined in SNP attestation collateral server");
+    }
+
+    // If multiple endpoints are defined, the max_retries_count of the first
+    // if the maximum number of retries for the server.
+    return server.front().max_retries_count;
+  }
 
   // Resilient client to fetch attestation report endorsement certificate.
   class QuoteEndorsementsClient
@@ -17,16 +32,11 @@ namespace ccf
   private:
     using EndpointInfo =
       pal::snp::EndorsementEndpointsConfiguration::EndpointInfo;
-    using Server = pal::snp::EndorsementEndpointsConfiguration::Server;
 
     // Resend request after this interval if no response was received from
     // remote server
     static constexpr size_t server_connection_timeout_s = 3;
 
-    // Maximum number of retries per remote server before giving up and moving
-    // on to the next server.
-    static constexpr size_t max_server_retries_count = 3;
-
     std::shared_ptr<RPCSessions> rpcsessions;
 
     pal::snp::EndorsementEndpointsConfiguration config;
@@ -121,9 +131,18 @@ namespace ccf
           if (msg->data.request_id >= msg->data.self->last_received_request_id)
           {
             auto& servers = msg->data.self->config.servers;
+            // Should always contain at least one server,
+            // installed by ccf::pal::make_endorsement_endpoint_configuration()
+            if (servers.empty())
+            {
+              throw std::logic_error(
+                "No server specified to fetch endorsements");
+            }
+
             msg->data.self->server_retries_count++;
             if (
-              msg->data.self->server_retries_count >= max_server_retries_count)
+              msg->data.self->server_retries_count >=
+              max_retries_count(servers.front()))
             {
               if (servers.size() > 1)
               {
@@ -137,7 +156,10 @@ namespace ccf
                   "Giving up retrying fetching attestation endorsements from "
                   "{} after {} attempts",
                   server.front().host,
-                  max_server_retries_count);
+                  server.front().max_retries_count);
+                throw ccf::pal::AttestationCollateralFetchingTimeout(
+                  "Timed out fetching attestation endorsements from all "
+                  "configured servers");
                 return;
               }
             }

diff --git a/tests/infra/network.py b/tests/infra/network.py
@@ -21,6 +21,7 @@
 import http
 import pprint
 import functools
+import re
 from datetime import datetime, timedelta, timezone
 from infra.consortium import slurp_file
 from infra.snp import IS_SNP
@@ -76,6 +77,10 @@ class StartupSeqnoIsOld(Exception):
     pass
 
 
+class CollateralFetchTimeout(Exception):
+    pass
+
+
 class ServiceCertificateInvalid(Exception):
     pass
 
@@ -916,6 +921,9 @@ def run_join_node(
                     errors = []
                 self.nodes.remove(node)
                 if errors:
+                    giving_up_fetching = re.compile(
+                        "Giving up retrying fetching attestation endorsements from .* after (\d+) attempts"
+                    )
                     # Throw accurate exceptions if known errors found in
                     for error in errors:
                         if "Quote does not contain known enclave measurement" in error:
@@ -924,6 +932,11 @@ def run_join_node(
                             raise StartupSeqnoIsOld(has_stopped) from e
                         if "invalid cert on handshake" in error:
                             raise ServiceCertificateInvalid from e
+                        match = giving_up_fetching.search(error)
+                        if match:
+                            raise CollateralFetchTimeout(
+                                has_stopped, int(match.group(1))
+                            ) from e
                 raise
 
     def join_node(

diff --git a/tests/infra/remote.py b/tests/infra/remote.py
@@ -745,6 +745,7 @@ def __init__(
             s = {}
             s["type"] = server_type
             s["url"] = url
+            s["max_retries_count"] = 4
             snp_endorsements_servers_list.append(s)
 
         # Default snp_security_policy_file if not set

diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py
@@ -256,10 +256,13 @@ def test_add_node_endorsements_endpoints(network, args):
     test_vectors = [
         (["Azure:global.acccache.azure.net"], True),
         (["Azure:global.acccache.azure.net:443"], True),
+        (["Azure:invalid.azure.net:443"], False),
         (["AMD:kdsintf.amd.com"], True),
         (["AMD:invalid.amd.com"], False),
         (["THIM:$Fabric_NodeIPOrFQDN:2377"], True),
+        (["THIM:invalid:2377"], False),
         (["Azure:invalid.azure.com", "AMD:kdsintf.amd.com"], True),  # Fallback server
+        (["Azure:invalid.azure.com", "AMD:invalid.amd.com"], False),
     ]
 
     for servers, expected_result in test_vectors:
@@ -275,11 +278,15 @@ def test_add_node_endorsements_endpoints(network, args):
                 args_copy,
                 timeout=15,
             )
-        except TimeoutError:
-            assert not expected_result
+        except infra.network.CollateralFetchTimeout as e:
             LOG.info(
                 f"Node with invalid quote endorsement servers {servers} could not join as expected"
             )
+            assert not expected_result
+            assert e.args == (
+                True,
+                4,
+            ), "Node has stopped after timing out on fetching collateral"
         else:
             assert (
                 expected_result