Skip to content

Commit

Permalink
Make collateral fetching retries configurable, and shut down when the…
Browse files Browse the repository at this point in the history
…y are exhausted (#6478)
  • Loading branch information
achamayou authored Sep 17, 2024
1 parent 36f11f9 commit 5661eef
Show file tree
Hide file tree
Showing 10 changed files with 127 additions and 35 deletions.
2 changes: 1 addition & 1 deletion .snpcc_canary
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
O \ o | /
/-xXx--//-----x=x--/-xXx--/---x---->>>--/
...
/\/\d(-_-)b
/\/\d(-_-)b/\/\
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

- The `set_jwt_issuer` governance action has been updated, and no longer accepts `key_filter` or `key_policy` arguments (#6450).
- Nodes started in `Join` mode will shut down if they receive an unrecoverable condition such as `StartupSeqnoIsOld` when attempting to join (#6471).
- In configuration, `attestation.snp_endorsements_servers` can specify a `max_retries_count`. If the count has been exhausted without success for all configured servers, the node will shut down (#6478).

### Removed

Expand Down
5 changes: 5 additions & 0 deletions doc/host_config_schema/cchost_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,11 @@
"url": {
"type": "string",
"description": "Server URLs used to retrieve attestation report endorsement certificates, e.g. \"kdsintf.amd.com\" (AMD), \"global.acccache.azure.net\" (Azure) or \"169.254.169.254\" (THIM)"
},
"max_retries_count": {
"type": "integer",
"default": 3,
"description": "Maximum number of retries to fetch endorsements from the server"
}
},
"required": ["url"],
Expand Down
15 changes: 14 additions & 1 deletion include/ccf/pal/attestation.h
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,19 @@ namespace ccf::pal
measurement = claim_measurement.value();
report_data = custom_claim_report_data.value();
}

#endif

// Exception signalling that attestation collateral (endorsements) could not
// be fetched in time. It carries a human-readable message, returned verbatim
// by what().
class AttestationCollateralFetchingTimeout : public std::exception
{
private:
  // Owned copy of the message, so the pointer returned by what() stays valid
  // for the lifetime of the exception object.
  std::string msg;

public:
  // msg_: description of the failure, copied into the exception.
  AttestationCollateralFetchingTimeout(const std::string& msg_) : msg(msg_) {}

  // Returns the message passed at construction. Declared noexcept (rather
  // than the removed dynamic specification throw()) and marked override so
  // the compiler checks it matches std::exception::what().
  const char* what() const noexcept override
  {
    return msg.c_str();
  }
};
}
25 changes: 19 additions & 6 deletions include/ccf/pal/attestation_sev_snp.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,24 +187,31 @@ QPHfbkH0CyPfhl1jWhJFZasCAwEAAQ==
auto chip_id_hex = fmt::format("{:02x}", fmt::join(quote.chip_id, ""));
auto reported_tcb = fmt::format("{:0x}", *(uint64_t*)(&quote.reported_tcb));

constexpr size_t default_max_retries_count = 10;

if (endorsements_servers.empty())
{
// Default to Azure server if no servers are specified
config.servers.emplace_back(make_azure_endorsements_server(
default_azure_endorsements_endpoint, chip_id_hex, reported_tcb));
default_azure_endorsements_endpoint,
chip_id_hex,
reported_tcb,
default_max_retries_count));
return config;
}

for (auto const& server : endorsements_servers)
{
size_t max_retries_count =
server.max_retries_count.value_or(default_max_retries_count);
switch (server.type)
{
case EndorsementsEndpointType::Azure:
{
auto loc =
get_endpoint_loc(server, default_azure_endorsements_endpoint);
config.servers.emplace_back(
make_azure_endorsements_server(loc, chip_id_hex, reported_tcb));
config.servers.emplace_back(make_azure_endorsements_server(
loc, chip_id_hex, reported_tcb, max_retries_count));
break;
}
case EndorsementsEndpointType::AMD:
Expand All @@ -217,15 +224,21 @@ QPHfbkH0CyPfhl1jWhJFZasCAwEAAQ==
auto loc =
get_endpoint_loc(server, default_amd_endorsements_endpoint);
config.servers.emplace_back(make_amd_endorsements_server(
loc, chip_id_hex, boot_loader, tee, snp, microcode));
loc,
chip_id_hex,
boot_loader,
tee,
snp,
microcode,
max_retries_count));
break;
}
case EndorsementsEndpointType::THIM:
{
auto loc =
get_endpoint_loc(server, default_thim_endorsements_endpoint);
config.servers.emplace_back(
make_thim_endorsements_server(loc, chip_id_hex, reported_tcb));
config.servers.emplace_back(make_thim_endorsements_server(
loc, chip_id_hex, reported_tcb, max_retries_count));
break;
}
default:
Expand Down
53 changes: 35 additions & 18 deletions include/ccf/pal/attestation_sev_snp_endorsements.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ namespace ccf::pal::snp
bool response_is_thim_json = false;
std::map<std::string, std::string> headers = {};
bool tls = true;
size_t max_retries_count = 3;

bool operator==(const EndpointInfo&) const = default;
};
Expand All @@ -73,12 +74,14 @@ namespace ccf::pal::snp
{
EndorsementsEndpointType type = Azure;
std::optional<std::string> url = std::nullopt;
std::optional<size_t> max_retries_count = std::nullopt;

bool operator==(const EndorsementsServer&) const = default;
};
DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(EndorsementsServer);
DECLARE_JSON_REQUIRED_FIELDS(EndorsementsServer);
DECLARE_JSON_OPTIONAL_FIELDS(EndorsementsServer, type, url);
DECLARE_JSON_OPTIONAL_FIELDS(
EndorsementsServer, type, url, max_retries_count);
using EndorsementsServers = std::vector<EndorsementsServer>;

struct HostPort
Expand All @@ -94,15 +97,20 @@ namespace ccf::pal::snp
make_azure_endorsements_server(
const HostPort& endpoint,
const std::string& chip_id_hex,
const std::string& reported_tcb)
const std::string& reported_tcb,
size_t max_retries_count)
{
std::map<std::string, std::string> params;
params["api-version"] = "2020-10-15-preview";
return {
{endpoint.host,
endpoint.port,
fmt::format("/SevSnpVM/certificates/{}/{}", chip_id_hex, reported_tcb),
params}};
EndorsementEndpointsConfiguration::EndpointInfo info{
endpoint.host,
endpoint.port,
fmt::format("/SevSnpVM/certificates/{}/{}", chip_id_hex, reported_tcb),
params};

info.max_retries_count = max_retries_count;

return {info};
}

// AMD endorsements endpoints. See
Expand All @@ -116,7 +124,8 @@ namespace ccf::pal::snp
const std::string& boot_loader,
const std::string& tee,
const std::string& snp,
const std::string& microcode)
const std::string& microcode,
size_t max_retries_count)
{
std::map<std::string, std::string> params;
params["blSPL"] = boot_loader;
Expand All @@ -125,19 +134,23 @@ namespace ccf::pal::snp
params["ucodeSPL"] = microcode;

EndorsementEndpointsConfiguration::Server server;
server.push_back({
EndorsementEndpointsConfiguration::EndpointInfo leaf{
endpoint.host,
endpoint.port,
fmt::format("/vcek/v1/{}/{}", product_name, chip_id_hex),
params,
true // DER
});
server.push_back(
{endpoint.host,
endpoint.port,
fmt::format("/vcek/v1/{}/cert_chain", product_name),
{}});
};
leaf.max_retries_count = max_retries_count;
EndorsementEndpointsConfiguration::EndpointInfo chain{
endpoint.host,
endpoint.port,
fmt::format("/vcek/v1/{}/cert_chain", product_name),
{}};
chain.max_retries_count = max_retries_count;

server.push_back(leaf);
server.push_back(chain);
return server;
}

Expand All @@ -148,12 +161,13 @@ namespace ccf::pal::snp
make_thim_endorsements_server(
const HostPort& endpoint,
const std::string& chip_id_hex,
const std::string& reported_tcb)
const std::string& reported_tcb,
size_t max_retries_count)
{
std::map<std::string, std::string> params;
params["tcbVersion"] = reported_tcb;
params["platformId"] = chip_id_hex;
return {{
EndorsementEndpointsConfiguration::EndpointInfo info{
endpoint.host,
endpoint.port,
"/metadata/THIM/amd/certification",
Expand All @@ -162,7 +176,10 @@ namespace ccf::pal::snp
true, // But THIM JSON
{{"Metadata", "true"}},
false // No TLS
}};
};
info.max_retries_count = max_retries_count;

return {info};
}
}

Expand Down
36 changes: 29 additions & 7 deletions src/node/quote_endorsements_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,21 @@ namespace ccf
{
using QuoteEndorsementsFetchedCallback =
std::function<void(std::vector<uint8_t>&& endorsements)>;
using Server = pal::snp::EndorsementEndpointsConfiguration::Server;

static inline size_t max_retries_count(const Server& server)
{
  if (!server.empty())
  {
    // When a server defines several endpoints, the max_retries_count of the
    // first endpoint is the maximum number of retries for the whole server.
    const auto& first_endpoint = server.front();
    return first_endpoint.max_retries_count;
  }

  // Every server is expected to hold at least one endpoint definition
  throw std::logic_error(
    "No endpoints defined in SNP attestation collateral server");
}

// Resilient client to fetch attestation report endorsement certificate.
class QuoteEndorsementsClient
Expand All @@ -17,16 +32,11 @@ namespace ccf
private:
using EndpointInfo =
pal::snp::EndorsementEndpointsConfiguration::EndpointInfo;
using Server = pal::snp::EndorsementEndpointsConfiguration::Server;

// Resend request after this interval if no response was received from
// remote server
static constexpr size_t server_connection_timeout_s = 3;

// Maximum number of retries per remote server before giving up and moving
// on to the next server.
static constexpr size_t max_server_retries_count = 3;

std::shared_ptr<RPCSessions> rpcsessions;

pal::snp::EndorsementEndpointsConfiguration config;
Expand Down Expand Up @@ -121,9 +131,18 @@ namespace ccf
if (msg->data.request_id >= msg->data.self->last_received_request_id)
{
auto& servers = msg->data.self->config.servers;
// Should always contain at least one server,
// installed by ccf::pal::make_endorsement_endpoint_configuration()
if (servers.empty())
{
throw std::logic_error(
"No server specified to fetch endorsements");
}

msg->data.self->server_retries_count++;
if (
msg->data.self->server_retries_count >= max_server_retries_count)
msg->data.self->server_retries_count >=
max_retries_count(servers.front()))
{
if (servers.size() > 1)
{
Expand All @@ -137,7 +156,10 @@ namespace ccf
"Giving up retrying fetching attestation endorsements from "
"{} after {} attempts",
server.front().host,
max_server_retries_count);
server.front().max_retries_count);
throw ccf::pal::AttestationCollateralFetchingTimeout(
"Timed out fetching attestation endorsements from all "
"configured servers");
return;
}
}
Expand Down
13 changes: 13 additions & 0 deletions tests/infra/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import http
import pprint
import functools
import re
from datetime import datetime, timedelta, timezone
from infra.consortium import slurp_file
from infra.snp import IS_SNP
Expand Down Expand Up @@ -76,6 +77,10 @@ class StartupSeqnoIsOld(Exception):
pass


# Raised by run_join_node when a node's error output matches
# "Giving up retrying fetching attestation endorsements from ... after N
# attempts". It is constructed with (has_stopped, N), where N is the attempt
# count parsed from that message.
class CollateralFetchTimeout(Exception):
    pass


class ServiceCertificateInvalid(Exception):
pass

Expand Down Expand Up @@ -916,6 +921,9 @@ def run_join_node(
errors = []
self.nodes.remove(node)
if errors:
giving_up_fetching = re.compile(
"Giving up retrying fetching attestation endorsements from .* after (\d+) attempts"
)
# Throw accurate exceptions if known errors found in
for error in errors:
if "Quote does not contain known enclave measurement" in error:
Expand All @@ -924,6 +932,11 @@ def run_join_node(
raise StartupSeqnoIsOld(has_stopped) from e
if "invalid cert on handshake" in error:
raise ServiceCertificateInvalid from e
match = giving_up_fetching.search(error)
if match:
raise CollateralFetchTimeout(
has_stopped, int(match.group(1))
) from e
raise

def join_node(
Expand Down
1 change: 1 addition & 0 deletions tests/infra/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,7 @@ def __init__(
s = {}
s["type"] = server_type
s["url"] = url
s["max_retries_count"] = 4
snp_endorsements_servers_list.append(s)

# Default snp_security_policy_file if not set
Expand Down
11 changes: 9 additions & 2 deletions tests/reconfiguration.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,13 @@ def test_add_node_endorsements_endpoints(network, args):
test_vectors = [
(["Azure:global.acccache.azure.net"], True),
(["Azure:global.acccache.azure.net:443"], True),
(["Azure:invalid.azure.net:443"], False),
(["AMD:kdsintf.amd.com"], True),
(["AMD:invalid.amd.com"], False),
(["THIM:$Fabric_NodeIPOrFQDN:2377"], True),
(["THIM:invalid:2377"], False),
(["Azure:invalid.azure.com", "AMD:kdsintf.amd.com"], True), # Fallback server
(["Azure:invalid.azure.com", "AMD:invalid.amd.com"], False),
]

for servers, expected_result in test_vectors:
Expand All @@ -275,11 +278,15 @@ def test_add_node_endorsements_endpoints(network, args):
args_copy,
timeout=15,
)
except TimeoutError:
assert not expected_result
except infra.network.CollateralFetchTimeout as e:
LOG.info(
f"Node with invalid quote endorsement servers {servers} could not join as expected"
)
assert not expected_result
assert e.args == (
True,
4,
), "Node has stopped after timing out on fetching collateral"
else:
assert (
expected_result
Expand Down

0 comments on commit 5661eef

Please sign in to comment.