Skip to content

Commit

Permalink
Properly handle crashes/reboots during FabricTable commit
Browse files Browse the repository at this point in the history
- Since project-chip#19819, commits are very small and safer. There is less surface
  to fail during commit. The previous large-scale fail-safe behavior
  stored too much data, for too long and could cause larger reverts
  even if nothing was committed yet. FabricTable data no longer is
  ever persisted without commit.
- The existing code deleted fabrics unwittingly when not required,
  such as when powering off a light during a fail-safe for an
  update when there was nothing committed yet, assuming we still
  committed immediately.

This change:
- Detects failed commits
- Only deletes data on failed commits
- Fixes Thread driver to detect stale data where a backup was done
  (since we cannot prevent internal commits from OpenThread)

Testing done:
- Added unit test to FabricTable to generate the condition
- Did manual testing of all-clusters-app/chip-tool Linux that
  aborted on second commissioning, during commit. Found that
  cleanup occurred as expected on restart
- Integration/cert testing (including Cirque that validates
  fail-safe) still pass
  • Loading branch information
tcarmelveilleux committed Jun 27, 2022
1 parent c38e915 commit ab6ac10
Show file tree
Hide file tree
Showing 22 changed files with 494 additions and 265 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ CHIP_ERROR AccessControlAttribute::ReadAcl(AttributeValueEncoder & aEncoder)

CHIP_ERROR AccessControlAttribute::ReadExtension(AttributeValueEncoder & aEncoder)
{
#if SUPPORT_EXTENSION
auto & storage = Server::GetInstance().GetPersistentStorage();
DefaultStorageKeyAllocator key;

Expand All @@ -174,6 +175,9 @@ CHIP_ERROR AccessControlAttribute::ReadExtension(AttributeValueEncoder & aEncode
}
return CHIP_NO_ERROR;
});
#else
return CHIP_IM_GLOBAL_STATUS(UnsupportedAttribute);
#endif
}

CHIP_ERROR AccessControlAttribute::Write(const ConcreteDataAttributePath & aPath, AttributeValueDecoder & aDecoder)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ bool emberAfAdministratorCommissioningClusterOpenCommissioningWindowCallback(

FabricIndex fabricIndex = commandObj->GetAccessingFabricIndex();
FabricInfo * fabricInfo = Server::GetInstance().GetFabricTable().FindFabricWithIndex(fabricIndex);
auto & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
auto & commissionMgr = Server::GetInstance().GetCommissioningWindowManager();

VerifyOrExit(fabricInfo != nullptr, status.Emplace(StatusCode::EMBER_ZCL_STATUS_CODE_PAKE_PARAMETER_ERROR));
Expand Down Expand Up @@ -169,7 +169,7 @@ bool emberAfAdministratorCommissioningClusterOpenBasicCommissioningWindowCallbac

FabricIndex fabricIndex = commandObj->GetAccessingFabricIndex();
FabricInfo * fabricInfo = Server::GetInstance().GetFabricTable().FindFabricWithIndex(fabricIndex);
auto & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
auto & commissionMgr = Server::GetInstance().GetCommissioningWindowManager();

VerifyOrExit(fabricInfo != nullptr, status.Emplace(StatusCode::EMBER_ZCL_STATUS_CODE_PAKE_PARAMETER_ERROR));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ bool emberAfGeneralCommissioningClusterArmFailSafeCallback(app::CommandHandler *
const Commands::ArmFailSafe::DecodableType & commandData)
{
MATTER_TRACE_EVENT_SCOPE("ArmFailSafe", "GeneralCommissioning");
FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
Commands::ArmFailSafeResponse::Type response;

ChipLogProgress(FailSafe, "GeneralCommissioning: Received ArmFailSafe (%us)",
Expand Down Expand Up @@ -196,9 +196,7 @@ bool emberAfGeneralCommissioningClusterArmFailSafeCallback(app::CommandHandler *
}
else
{
CheckSuccess(
failSafeContext.ArmFailSafe(accessingFabricIndex, System::Clock::Seconds16(commandData.expiryLengthSeconds)),
Failure);
CheckSuccess(failSafeContext.ArmFailSafe(accessingFabricIndex, commandData.expiryLengthSeconds), Failure);
Breadcrumb::Set(commandPath.mEndpointId, commandData.breadcrumb);
response.errorCode = CommissioningError::kOk;
commandObj->AddResponse(commandPath, response);
Expand All @@ -219,9 +217,9 @@ bool emberAfGeneralCommissioningClusterCommissioningCompleteCallback(
{
MATTER_TRACE_EVENT_SCOPE("CommissioningComplete", "GeneralCommissioning");

DeviceControlServer * server = &DeviceLayer::DeviceControlServer::DeviceControlSvr();
auto & failSafe = server->GetFailSafeContext();
auto & fabricTable = Server::GetInstance().GetFabricTable();
DeviceControlServer * devCtrl = &DeviceLayer::DeviceControlServer::DeviceControlSvr();
auto & failSafe = Server::GetInstance().GetFailSafeContext();
auto & fabricTable = Server::GetInstance().GetFabricTable();

ChipLogProgress(FailSafe, "GeneralCommissioning: Received CommissioningComplete");

Expand Down Expand Up @@ -267,7 +265,7 @@ bool emberAfGeneralCommissioningClusterCommissioningCompleteCallback(
*/
failSafe.DisarmFailSafe();
CheckSuccess(
server->PostCommissioningCompleteEvent(handle->AsSecureSession()->GetPeerNodeId(), handle->GetFabricIndex()),
devCtrl->PostCommissioningCompleteEvent(handle->AsSecureSession()->GetPeerNodeId(), handle->GetFabricIndex()),
Failure);

Breadcrumb::Set(commandPath.mEndpointId, 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <app/CommandHandlerInterface.h>
#include <app/InteractionModelEngine.h>
#include <app/clusters/general-commissioning-server/general-commissioning-server.h>
#include <app/server/Server.h>
#include <app/util/attribute-storage.h>
#include <lib/support/SafeInt.h>
#include <lib/support/ThreadOperationalDataset.h>
Expand Down Expand Up @@ -301,7 +302,7 @@ void FillDebugTextAndNetworkIndex(Commands::NetworkConfigResponse::Type & respon

bool CheckFailSafeArmed(CommandHandlerInterface::HandlerContext & ctx)
{
DeviceLayer::FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = chip::Server::GetInstance().GetFailSafeContext();

if (failSafeContext.IsFailSafeArmed(ctx.mCommandHandler.GetAccessingFabricIndex()))
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,13 @@
#include <lib/support/ScopedBuffer.h>
#include <lib/support/logging/CHIPLogging.h>
#include <platform/CHIPDeviceLayer.h>
#include <platform/DeviceControlServer.h>
#include <string.h>
#include <trace/trace.h>
#if CHIP_CRYPTO_HSM
#include <crypto/hsm/CHIPCryptoPALHsm.h>
#endif

using namespace chip;
using namespace ::chip::DeviceLayer;
using namespace ::chip::Transport;
using namespace chip::app;
using namespace chip::app::Clusters;
Expand Down Expand Up @@ -339,14 +337,6 @@ void FailSafeCleanup(const chip::DeviceLayer::ChipDeviceEvent * event)
ChipLogError(Zcl, "OpCreds: failed to delete fabric at index %u: %" CHIP_ERROR_FORMAT, fabricIndex, err.Format());
}
}

// If an UpdateNOC command had been successfully invoked, revert the state of operational key pair, NOC and ICAC for that
// Fabric to the state prior to the Fail-Safe timer being armed, for the Fabric Index that was the subject of the UpdateNOC
// command.
if (event->FailSafeTimerExpired.updateNocCommandHasBeenInvoked)
{
// TODO: Revert the state of operational key pair, NOC and ICAC
}
}

void OnPlatformEventHandler(const chip::DeviceLayer::ChipDeviceEvent * event, intptr_t arg)
Expand Down Expand Up @@ -651,7 +641,7 @@ bool emberAfOperationalCredentialsClusterAddNOCCallback(app::CommandHandler * co
auto & fabricTable = Server::GetInstance().GetFabricTable();

auto * secureSession = commandObj->GetExchangeContext()->GetSessionHandle()->AsSecureSession();
FailSafeContext & failSafeContext = DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();

uint8_t compressed_fabric_id_buffer[sizeof(uint64_t)];
MutableByteSpan compressed_fabric_id(compressed_fabric_id_buffer);
Expand Down Expand Up @@ -737,8 +727,7 @@ bool emberAfOperationalCredentialsClusterAddNOCCallback(app::CommandHandler * co

// The Fabric Index associated with the armed fail-safe context SHALL be updated to match the Fabric
// Index just allocated.
err = failSafeContext.SetAddNocCommandInvoked(newFabricIndex);
VerifyOrExit(err == CHIP_NO_ERROR, nonDefaultStatus = Status::Failure);
failSafeContext.SetAddNocCommandInvoked(newFabricIndex);

// Done all intermediate steps, we are now successful
needRevert = false;
Expand Down Expand Up @@ -816,16 +805,15 @@ bool emberAfOperationalCredentialsClusterUpdateNOCCallback(app::CommandHandler *

auto nocResponse = OperationalCertStatus::kSuccess;
auto nonDefaultStatus = Status::Success;
bool needRevert = false;

CHIP_ERROR err = CHIP_NO_ERROR;
FabricIndex fabricIndex = 0;

ChipLogProgress(Zcl, "OpCreds: Received an UpdateNOC command");

auto & fabricTable = Server::GetInstance().GetFabricTable();
FailSafeContext & failSafeContext = DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
FabricInfo * fabricInfo = RetrieveCurrentFabric(commandObj);
auto & fabricTable = Server::GetInstance().GetFabricTable();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
FabricInfo * fabricInfo = RetrieveCurrentFabric(commandObj);

bool csrWasForUpdateNoc = false; //< Output param of HasPendingOperationalKey
bool hasPendingKey = fabricTable.HasPendingOperationalKey(csrWasForUpdateNoc);
Expand All @@ -849,15 +837,8 @@ bool emberAfOperationalCredentialsClusterUpdateNOCCallback(app::CommandHandler *
err = fabricTable.UpdatePendingFabricWithOperationalKeystore(fabricIndex, NOCValue, ICACValue.ValueOr(ByteSpan{}));
VerifyOrExit(err == CHIP_NO_ERROR, nocResponse = ConvertToNOCResponseStatus(err));

// From here if we error-out, we should revert the fabric table pending updates
needRevert = true;

// Flag on the fail-safe context that the UpdateNOC command was invoked.
err = failSafeContext.SetUpdateNocCommandInvoked();
VerifyOrExit(err == CHIP_NO_ERROR, nocResponse = ConvertToNOCResponseStatus(err));

// Done all intermediate steps, we are now successful
needRevert = false;
failSafeContext.SetUpdateNocCommandInvoked();

// We might have a new operational identity, so we should start advertising
// it right away. Also, we need to withdraw our old operational identity.
Expand All @@ -866,11 +847,6 @@ bool emberAfOperationalCredentialsClusterUpdateNOCCallback(app::CommandHandler *

// Attribute notification was already done by fabric table
exit:
if (needRevert)
{
fabricTable.RevertPendingOpCertsExceptRoot();
}

// We have an NOC response
if (nonDefaultStatus == Status::Success)
{
Expand Down Expand Up @@ -1040,7 +1016,7 @@ bool emberAfOperationalCredentialsClusterCSRRequestCallback(app::CommandHandler
CHIP_ERROR err = CHIP_ERROR_INVALID_ARGUMENT;

auto & fabricTable = Server::GetInstance().GetFabricTable();
FailSafeContext & failSafeContext = DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();

auto & CSRNonce = commandData.CSRNonce;
bool isForUpdateNoc = commandData.isForUpdateNOC.ValueOr(false);
Expand Down Expand Up @@ -1158,7 +1134,7 @@ bool emberAfOperationalCredentialsClusterAddTrustedRootCertificateCallback(
CHIP_ERROR err = CHIP_ERROR_INVALID_ARGUMENT;

auto & rootCertificate = commandData.rootCertificate;
FailSafeContext & failSafeContext = DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();

ChipLogProgress(Zcl, "OpCreds: Received an AddTrustedRootCertificate command");

Expand Down
2 changes: 2 additions & 0 deletions src/app/server/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ static_library("server") {
"Dnssd.h",
"EchoHandler.cpp",
"EchoHandler.h",
"FailSafeContext.cpp",
"FailSafeContext.h",
"OnboardingCodesUtil.cpp",
"OnboardingCodesUtil.h",
"Server.cpp",
Expand Down
8 changes: 4 additions & 4 deletions src/app/server/CommissioningWindowManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ void CommissioningWindowManager::OnSessionEstablished(const SessionHandle & sess

StopAdvertisement(/* aShuttingDown = */ false);

DeviceLayer::FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
// This should never be armed because we don't allow CASE sessions to arm the failsafe when the commissioning window is open and
// we check that the failsafe is not armed before opening the commissioning window. None the less, it is good to double-check.
CHIP_ERROR err = CHIP_NO_ERROR;
Expand All @@ -170,7 +170,7 @@ void CommissioningWindowManager::OnSessionEstablished(const SessionHandle & sess
}
else
{
err = failSafeContext.ArmFailSafe(kUndefinedFabricId, System::Clock::Seconds16(60));
err = failSafeContext.ArmFailSafe(kUndefinedFabricId, 60);
if (err != CHIP_NO_ERROR)
{
ChipLogError(AppServer, "Error arming failsafe on PASE session establishment completion");
Expand All @@ -194,7 +194,7 @@ CHIP_ERROR CommissioningWindowManager::OpenCommissioningWindow(Seconds16 commiss
{
VerifyOrReturnError(commissioningTimeout <= MaxCommissioningTimeout() && commissioningTimeout >= MinCommissioningTimeout(),
CHIP_ERROR_INVALID_ARGUMENT);
DeviceLayer::FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
VerifyOrReturnError(!failSafeContext.IsFailSafeArmed(), CHIP_ERROR_INCORRECT_STATE);

ReturnErrorOnFailure(Dnssd::ServiceAdvertiser::Instance().UpdateCommissionableInstanceName());
Expand Down Expand Up @@ -471,7 +471,7 @@ void CommissioningWindowManager::OnSessionReleased()

void CommissioningWindowManager::ExpireFailSafeIfArmed()
{
DeviceLayer::FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
if (failSafeContext.IsFailSafeArmed())
{
failSafeContext.ForceFailSafeTimerExpiry();
Expand Down
Loading

0 comments on commit ab6ac10

Please sign in to comment.