Skip to content

Commit

Permalink
Fix UpdateNOC session invalidation and opcreds timing (#20461)
Browse files Browse the repository at this point in the history
* Fix UpdateNOC session invalidation and opcreds timing

- UpdateNOC did not clear session of previous fabric like spec intended
  (#20379)
- All OpCreds cluster slow commands did not try to early-ack to avoid
  MRP timeouts (#19132)

Fixes #20379
Fixes #19132

This PR:

- Fixes UpdateNOC expiring all sessions for the updated node
- Adds `FlushAcksRightAwayOnSlowCommand` to CommandHandler to flush acks
  on slow commands
- Adds Python tests for UpdateNOC behavior of session expiring
- Adds `ExpireSessions` to Python for testing

Testing done:

- Unit tests all pass
- Cert tests pass
- With the session clearing, previous Python tests failed, until
  I fixed them with the new `ExpireSessions` API
- Observed standalone acks immediately sent on opcreds cluster commands

* Restyled

* Removed leftover debug
  • Loading branch information
tcarmelveilleux authored Jul 8, 2022
1 parent e6e43fa commit c3cd004
Show file tree
Hide file tree
Showing 12 changed files with 108 additions and 23 deletions.
2 changes: 1 addition & 1 deletion examples/chip-tool/commands/tests/TestCommand.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ CHIP_ERROR TestCommand::WaitForCommissionee(const char * identity,
// or is just starting out fresh outright. Let's make sure we're not re-using any cached CASE sessions
// that will now be stale and mismatched with the peer, causing subsequent interactions to fail.
//
GetCommissioner(identity).SessionMgr()->ExpireAllPairings(chip::ScopedNodeId(value.nodeId, fabricIndex));
GetCommissioner(identity).SessionMgr()->ExpireAllSessions(chip::ScopedNodeId(value.nodeId, fabricIndex));

SetIdentity(identity);
return GetCommissioner(identity).GetConnectedDevice(value.nodeId, &mOnDeviceConnectedCallback,
Expand Down
19 changes: 19 additions & 0 deletions src/app/CommandHandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,25 @@ class CommandHandler
*/
Messaging::ExchangeContext * GetExchangeContext() const { return mpExchangeCtx; }

/**
* @brief Flush acks right away for a slow command
*
* Some commands that do heavy lifting of storage/crypto should
* ack right away to improve reliability and reduce needless retries. This
* method can be manually called in commands that are especially slow to
* immediately schedule an acknowledgement (if needed) since the delayed
* stand-alone ack timer may actually not hit soon enough due to blocking command
* execution.
*
*/
void FlushAcksRightAwayOnSlowCommand()
{
VerifyOrReturn(mpExchangeCtx != nullptr);
auto * msgContext = mpExchangeCtx->GetReliableMessageContext();
VerifyOrReturn(msgContext != nullptr);
msgContext->FlushAcks();
}

Access::SubjectDescriptor GetSubjectDescriptor() const { return mpExchangeCtx->GetSessionHandle()->GetSubjectDescriptor(); }

private:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ CHIP_ERROR DeleteFabricFromTable(FabricIndex fabricIndex)
void CleanupSessionsForFabric(SessionManager & sessionMgr, FabricIndex fabricIndex)
{
InteractionModelEngine::GetInstance()->CloseTransactionsFromFabricIndex(fabricIndex);
sessionMgr.ExpireAllPairingsForFabric(fabricIndex);
sessionMgr.ExpireAllSessionsForFabric(fabricIndex);
}

void FailSafeCleanup(const chip::DeviceLayer::ChipDeviceEvent * event)
Expand All @@ -278,13 +278,16 @@ void FailSafeCleanup(const chip::DeviceLayer::ChipDeviceEvent * event)
// Session Context at the Server.
if (event->FailSafeTimerExpired.addNocCommandHasBeenInvoked || event->FailSafeTimerExpired.updateNocCommandHasBeenInvoked)
{
CASESessionManager * caseSessionManager = Server::GetInstance().GetCASESessionManager();
if (caseSessionManager)
// TODO(#19259): The following scope will no longer need to exist after #19259 is fixed
{
const FabricInfo * fabricInfo = Server::GetInstance().GetFabricTable().FindFabricWithIndex(fabricIndex);
VerifyOrReturn(fabricInfo != nullptr);
CASESessionManager * caseSessionManager = Server::GetInstance().GetCASESessionManager();
if (caseSessionManager)
{
const FabricInfo * fabricInfo = Server::GetInstance().GetFabricTable().FindFabricWithIndex(fabricIndex);
VerifyOrReturn(fabricInfo != nullptr);

caseSessionManager->ReleaseSessionsForFabric(fabricInfo->GetFabricIndex());
caseSessionManager->ReleaseSessionsForFabric(fabricInfo->GetFabricIndex());
}
}

SessionManager & sessionMgr = Server::GetInstance().GetSecureSessionManager();
Expand Down Expand Up @@ -416,6 +419,8 @@ bool emberAfOperationalCredentialsClusterRemoveFabricCallback(app::CommandHandle
return true;
}

commandObj->FlushAcksRightAwayOnSlowCommand();

CHIP_ERROR err = DeleteFabricFromTable(fabricBeingRemoved);
SuccessOrExit(err);

Expand Down Expand Up @@ -638,6 +643,9 @@ bool emberAfOperationalCredentialsClusterAddNOCCallback(app::CommandHandler * co
// Internal error that would prevent IPK from being added
VerifyOrExit(groupDataProvider != nullptr, nonDefaultStatus = Status::Failure);

// Flush acks before really slow work
commandObj->FlushAcksRightAwayOnSlowCommand();

// TODO: Add support for calling AddNOC without a prior AddTrustedRootCertificate if
// the root properly matches an existing one.

Expand Down Expand Up @@ -803,6 +811,9 @@ bool emberAfOperationalCredentialsClusterUpdateNOCCallback(app::CommandHandler *
VerifyOrExit(fabricInfo != nullptr, nocResponse = ConvertToNOCResponseStatus(CHIP_ERROR_INSUFFICIENT_PRIVILEGE));
fabricIndex = fabricInfo->GetFabricIndex();

// Flush acks before really slow work
commandObj->FlushAcksRightAwayOnSlowCommand();

err = fabricTable.UpdatePendingFabricWithOperationalKeystore(fabricIndex, NOCValue, ICACValue.ValueOr(ByteSpan{}));
VerifyOrExit(err == CHIP_NO_ERROR, nocResponse = ConvertToNOCResponseStatus(err));

Expand Down Expand Up @@ -830,6 +841,17 @@ bool emberAfOperationalCredentialsClusterUpdateNOCCallback(app::CommandHandler *
else
{
ChipLogProgress(Zcl, "OpCreds: UpdateNOC successful.");

// On success, revoke all CASE sessions on the fabric hosting the exchange.
// From spec:
//
// All internal data reflecting the prior operational identifier of the Node within the Fabric
// SHALL be revoked and removed, to an outcome equivalent to the disappearance of the prior Node,
// except for the ongoing CASE session context, which SHALL temporarily remain valid until the
// `NOCResponse` has been successfully delivered or until the next transport-layer error, so
// that the response can be received by the Administrator invoking the command.

commandObj->GetExchangeContext()->AbortAllOtherCommunicationOnFabric();
}
}
// No NOC response - Failed constraints
Expand Down Expand Up @@ -921,6 +943,10 @@ bool emberAfOperationalCredentialsClusterAttestationRequestCallback(app::Command
const ByteSpan kEmptyFirmwareInfo;

ChipLogProgress(Zcl, "OpCreds: Received an AttestationRequest command");

// Flush acks before really slow work
commandObj->FlushAcksRightAwayOnSlowCommand();

Credentials::DeviceAttestationCredentialsProvider * dacProvider = Credentials::GetDeviceAttestationCredentialsProvider();

VerifyOrExit(attestationNonce.size() == Credentials::kExpectedAttestationNonceSize, finalStatus = Status::InvalidCommand);
Expand Down Expand Up @@ -1020,6 +1046,9 @@ bool emberAfOperationalCredentialsClusterCSRRequestCallback(app::CommandHandler
VerifyOrExit(failSafeContext.IsFailSafeArmed(commandObj->GetAccessingFabricIndex()), finalStatus = Status::FailsafeRequired);
VerifyOrExit(!failSafeContext.NocCommandHasBeenInvoked(), finalStatus = Status::ConstraintError);

// Flush acks before really slow work
commandObj->FlushAcksRightAwayOnSlowCommand();

// Prepare NOCSRElements structure
{
constexpr size_t csrLength = Crypto::kMAX_CSR_Length;
Expand Down Expand Up @@ -1143,10 +1172,12 @@ bool emberAfOperationalCredentialsClusterAddTrustedRootCertificateCallback(
// be useful in the context.
VerifyOrExit(!failSafeContext.NocCommandHasBeenInvoked(), finalStatus = Status::ConstraintError);

// TODO: Handle checking for byte-to-byte match with existing fabrics
// before allowing the add
// Flush acks before really slow work
commandObj->FlushAcksRightAwayOnSlowCommand();

// TODO(#17208): Handle checking for byte-to-byte match with existing fabrics before allowing the add

// TODO: Validate cert signature prior to setting.
// TODO(#17208): Validate cert signature prior to setting.
err = fabricTable.AddNewPendingTrustedRootCert(rootCertificate);
VerifyOrExit(err != CHIP_ERROR_NO_MEMORY, finalStatus = Status::ResourceExhausted);

Expand Down
2 changes: 1 addition & 1 deletion src/app/server/CommissioningWindowManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ void CommissioningWindowManager::OnPlatformEvent(const DeviceLayer::ChipDeviceEv
DeviceLayer::SystemLayer().CancelTimer(HandleCommissioningWindowTimeout, this);
mCommissioningTimeoutTimerArmed = false;
Cleanup();
mServer->GetSecureSessionManager().ExpireAllPASEPairings();
mServer->GetSecureSessionManager().ExpireAllPASESessions();
// That should have cleared out mPASESession.
#if CONFIG_NETWORK_LAYER_BLE
mServer->GetBleLayerObject()->CloseAllBleConnections();
Expand Down
2 changes: 1 addition & 1 deletion src/controller/CHIPDeviceController.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ void DeviceController::Shutdown()
// sessions. It just shuts down any ongoing CASE session establishment
// we're in the middle of as initiator. Maybe it should shut down
// existing sessions too?
mSystemState->SessionMgr()->ExpireAllPairingsForFabric(mFabricIndex);
mSystemState->SessionMgr()->ExpireAllSessionsForFabric(mFabricIndex);

FabricTable * fabricTable = mSystemState->Fabrics();
if (fabricTable != nullptr)
Expand Down
3 changes: 3 additions & 0 deletions src/controller/CHIPDeviceController.h
Original file line number Diff line number Diff line change
Expand Up @@ -288,11 +288,14 @@ class DLL_EXPORT DeviceController : public AbstractDnssdDiscoveryController
return mSystemState->Fabrics();
}

// TODO(#20452): This should be removed/renamed once #20452 is fixed
void ReleaseOperationalDevice(NodeId remoteNodeId);

OperationalCredentialsDelegate * GetOperationalCredentialsDelegate() { return mOperationalCredentialsDelegate; }

/**
* TODO(#20452): This needs to be refactored to reflect what it actually does (which is not disconnecting anything)
*
* TEMPORARY - DO NOT USE or if you use please request review on why/how to
* officially support such an API.
*
Expand Down
11 changes: 11 additions & 0 deletions src/controller/python/ChipDeviceController-ScriptBinding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,8 @@ ChipError::StorageType pychip_GetConnectedDeviceByNodeId(chip::Controller::Devic
DeviceAvailableFunc callback);
ChipError::StorageType pychip_GetDeviceBeingCommissioned(chip::Controller::DeviceCommissioner * devCtrl, chip::NodeId nodeId,
CommissioneeDeviceProxy ** proxy);
ChipError::StorageType pychip_ExpireSessions(chip::Controller::DeviceCommissioner * devCtrl, chip::NodeId nodeId);

uint64_t pychip_GetCommandSenderHandle(chip::DeviceProxy * device);

chip::ChipError::StorageType pychip_InteractionModel_ShutdownSubscription(SubscriptionId subscriptionId);
Expand Down Expand Up @@ -646,6 +648,15 @@ ChipError::StorageType pychip_GetDeviceBeingCommissioned(chip::Controller::Devic
return devCtrl->GetDeviceBeingCommissioned(nodeId, proxy).AsInteger();
}

// This is a method called VERY seldom, just for RemoveFabric/UpdateNOC
ChipError::StorageType pychip_ExpireSessions(chip::Controller::DeviceCommissioner * devCtrl, chip::NodeId nodeId)
{
VerifyOrReturnError((devCtrl != nullptr) && (devCtrl->SessionMgr() != nullptr), CHIP_ERROR_INVALID_ARGUMENT.AsInteger());
(void) devCtrl->ReleaseOperationalDevice(nodeId);
devCtrl->SessionMgr()->ExpireAllSessions(ScopedNodeId(nodeId, devCtrl->GetFabricIndex()));
return CHIP_NO_ERROR.AsInteger();
}

ChipError::StorageType pychip_DeviceCommissioner_CloseBleConnection(chip::Controller::DeviceCommissioner * devCtrl)
{
#if CONFIG_NETWORK_LAYER_BLE
Expand Down
18 changes: 18 additions & 0 deletions src/controller/python/chip/ChipDeviceCtrl.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,21 @@ def CloseBLEConnection(self):
self.devCtrl)
)

def ExpireSessions(self, nodeid):
"""Close all sessions with `nodeid` (if any existed) so that sessions get re-established.
This is needed to properly handle operations that invalidate a node's state, such as
UpdateNOC.
WARNING: ONLY CALL THIS IF YOU UNDERSTAND THE SIDE-EFFECTS
"""
self.CheckIsActive()

res = self._ChipStack.Call(lambda: self._dmLib.pychip_ExpireSessions(self.devCtrl, nodeid))
if res != 0:
raise self._ChipStack.ErrorToException(res)

# TODO: This needs to be called MarkSessionDefunct
def CloseSession(self, nodeid):
self.CheckIsActive()

Expand Down Expand Up @@ -1106,6 +1121,9 @@ def _InitLib(self):
c_void_p, c_uint64, c_void_p]
self._dmLib.pychip_GetDeviceBeingCommissioned.restype = c_uint32

self._dmLib.pychip_ExpireSessions.argtypes = [c_void_p, c_uint64]
self._dmLib.pychip_ExpireSessions.restype = c_uint32

self._dmLib.pychip_DeviceCommissioner_CloseBleConnection.argtypes = [
c_void_p]
self._dmLib.pychip_DeviceCommissioner_CloseBleConnection.restype = c_uint32
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ async def UpdateNOC(devCtrl, existingNodeId, newNodeId):
await devCtrl.SendCommand(existingNodeId, 0, generalCommissioning.Commands.ArmFailSafe(0), timedRequestTimeoutMs=1000)
return False

# Forget our session since the peer deleted it
devCtrl.ExpireSessions(existingNodeId)

resp = await devCtrl.SendCommand(newNodeId, 0, generalCommissioning.Commands.CommissioningComplete())
if resp.errorCode is not generalCommissioning.Enums.CommissioningError.kOk:
# Expiring the failsafe timer in an attempt to clean up.
Expand Down
2 changes: 1 addition & 1 deletion src/messaging/ExchangeContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ void ExchangeContext::AbortAllOtherCommunicationOnFabric()

SetIgnoreSessionRelease(true);

GetExchangeMgr()->GetSessionManager()->ExpireAllPairingsForFabric(mSession->GetFabricIndex());
GetExchangeMgr()->GetSessionManager()->ExpireAllSessionsForFabric(mSession->GetFabricIndex());

mSession.GrabExpiredSession(session.Value());

Expand Down
14 changes: 7 additions & 7 deletions src/transport/SessionManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ void SessionManager::Shutdown()

/**
* @brief Notification that a fabric was removed.
* This function doesn't call ExpireAllPairingsForFabric
* This function doesn't call ExpireAllSessionsForFabric
* since the CASE session might still be open to send a response
* on the removed fabric.
*/
Expand Down Expand Up @@ -342,7 +342,7 @@ CHIP_ERROR SessionManager::SendPreparedMessage(const SessionHandle & sessionHand
return CHIP_ERROR_INCORRECT_STATE;
}

void SessionManager::ExpireAllPairings(const ScopedNodeId & node)
void SessionManager::ExpireAllSessions(const ScopedNodeId & node)
{
mSecureSessions.ForEachSession([&](auto session) {
if (session->GetPeer() == node)
Expand All @@ -353,21 +353,21 @@ void SessionManager::ExpireAllPairings(const ScopedNodeId & node)
});
}

void SessionManager::ExpireAllPairingsForFabric(FabricIndex fabric)
void SessionManager::ExpireAllSessionsForFabric(FabricIndex fabricIndex)
{
ChipLogDetail(Inet, "Expiring all connections for fabric %d!!", fabric);
ChipLogDetail(Inet, "Expiring all sessions for fabric 0x%x!!", static_cast<unsigned>(fabricIndex));
mSecureSessions.ForEachSession([&](auto session) {
if (session->GetFabricIndex() == fabric)
if (session->GetFabricIndex() == fabricIndex)
{
session->MarkForEviction();
}
return Loop::Continue;
});
}

void SessionManager::ExpireAllPASEPairings()
void SessionManager::ExpireAllPASESessions()
{
ChipLogDetail(Inet, "Expiring all PASE pairings");
ChipLogDetail(Inet, "Expiring all PASE sessions");
mSecureSessions.ForEachSession([&](auto session) {
if (session->GetSecureSessionType() == Transport::SecureSession::Type::kPASE)
{
Expand Down
6 changes: 3 additions & 3 deletions src/transport/SessionManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,9 +174,9 @@ class DLL_EXPORT SessionManager : public TransportMgrDelegate, public FabricTabl
Optional<SessionHandle> AllocateSession(Transport::SecureSession::Type secureSessionType,
const ScopedNodeId & sessionEvictionHint);

void ExpireAllPairings(const ScopedNodeId & node);
void ExpireAllPairingsForFabric(FabricIndex fabric);
void ExpireAllPASEPairings();
void ExpireAllSessions(const ScopedNodeId & node);
void ExpireAllSessionsForFabric(FabricIndex fabricIndex);
void ExpireAllPASESessions();

/**
* @brief
Expand Down

0 comments on commit c3cd004

Please sign in to comment.