From 95030208ac11c6749742b7f36f0f13cf65f22921 Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Fri, 17 Mar 2023 16:06:20 -0400 Subject: [PATCH] Use a longer backoff for CASE retries. (#25736) We have a common failure mode for Thread devices where the commissioner sends Sigma1 prior to sending CommissioningComplete, the commissionee responds, but the Sigma2 (and all its retries) never reach the commissioner. When this happens, our existing CASE retries do not help because they happen too quickly: the commissionee is waiting for Sigma3, and will ignore Sigma1 messages until it gets the Sigma3 or times out. To address this, include the time it would take the commissionee to time out in our CASE retry delay for every other retry. This allows us to retry quickly once, then give the commissionee time to start listening again, then retry quickly one or two times (depending on how many retries were requested), etc. --- src/app/OperationalSessionSetup.cpp | 12 ++++++++++++ src/protocols/secure_channel/CASESession.cpp | 8 ++++++++ src/protocols/secure_channel/CASESession.h | 4 ++++ 3 files changed, 24 insertions(+) diff --git a/src/app/OperationalSessionSetup.cpp b/src/app/OperationalSessionSetup.cpp index 1deeebf8c92f5f..62551d66c22247 100644 --- a/src/app/OperationalSessionSetup.cpp +++ b/src/app/OperationalSessionSetup.cpp @@ -532,6 +532,18 @@ CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt(System::Clock: timerDelay = System::Clock::Seconds16( static_cast(CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_INITIAL_DELAY_SECONDS << min((mAttemptsDone - 1), CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_MAX_BACKOFF))); + if (mAttemptsDone % 2 == 0) + { + // It's possible that the other side received one of our Sigma1 messages + // and then failed to get its Sigma2 back to us. If that's the case, it + // will be waiting for that Sigma2 to time out before it starts + // listening for Sigma1 messages again. + // + // To handle that, on every other retry, add the amount of time it would + // take the other side to time out. + auto additionalTimeout = CASESession::ComputeSigma2ResponseTimeout(GetLocalMRPConfig().ValueOr(GetDefaultMRPConfig())); + timerDelay += std::chrono::duration_cast(additionalTimeout); + } CHIP_ERROR err = mInitParams.exchangeMgr->GetSessionManager()->SystemLayer()->StartTimer(timerDelay, TrySetupAgain, this); // The cast on count() is needed because the type count() returns might not // actually be uint16_t; on some platforms it's int. diff --git a/src/protocols/secure_channel/CASESession.cpp b/src/protocols/secure_channel/CASESession.cpp index ac931c7ba5f135..1abd1f1b1e9e6b 100644 --- a/src/protocols/secure_channel/CASESession.cpp +++ b/src/protocols/secure_channel/CASESession.cpp @@ -1984,4 +1984,12 @@ System::Clock::Timeout CASESession::ComputeSigma1ResponseTimeout(const ReliableM kExpectedSigma1ProcessingTime; } +System::Clock::Timeout CASESession::ComputeSigma2ResponseTimeout(const ReliableMessageProtocolConfig & remoteMrpConfig) +{ + return GetRetransmissionTimeout(remoteMrpConfig.mActiveRetransTimeout, remoteMrpConfig.mIdleRetransTimeout, + // Assume peer is idle, as a worst-case assumption. + System::Clock::kZero, Transport::kMinActiveTime) + + kExpectedHighProcessingTime; +} + } // namespace chip diff --git a/src/protocols/secure_channel/CASESession.h b/src/protocols/secure_channel/CASESession.h index 9cf2bba6ee651d..b6516aa5ca0612 100644 --- a/src/protocols/secure_channel/CASESession.h +++ b/src/protocols/secure_channel/CASESession.h @@ -181,6 +181,10 @@ class DLL_EXPORT CASESession : public Messaging::UnsolicitedMessageHandler, // how long it will take to detect that our Sigma1 did not get through. static System::Clock::Timeout ComputeSigma1ResponseTimeout(const ReliableMessageProtocolConfig & remoteMrpConfig); + // Compute our Sigma2 response timeout. This can give consumers an idea of + // how long it will take to detect that our Sigma1 did not get through. + static System::Clock::Timeout ComputeSigma2ResponseTimeout(const ReliableMessageProtocolConfig & remoteMrpConfig); + // TODO: remove Clear, we should create a new instance instead reset the old instance. /** @brief This function zeroes out and resets the memory used by the object. **/