From 9f1a470e7c368d4d25fbd0e2bb27a7f901827738 Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Thu, 25 May 2023 16:32:06 -0400 Subject: [PATCH] Fix ThreadSanitizer failure in controller factory. (#26824) * Fix ThreadSanitizer failure in controller factory. The failure looks like this: WARNING: ThreadSanitizer: race on NSMutableArray (pid=11619) Read-only access of NSMutableArray at 0x7b0c0005f5b0 by thread T3: #0 -[__NSArrayM countByEnumeratingWithState:objects:count:] :2 (CoreFoundation:x86_64+0x4a338) #1 -[MTRDeviceControllerFactory(InternalMethods) operationalInstanceAdded:] MTRDeviceControllerFactory.mm:855 (Matter:x86_64+0x1fd2a) #2 MTROperationalBrowser::OnBrowse(_DNSServiceRef_t*, unsigned int, unsigned int, int, char const*, char const*, char const*, void*) MTROperationalBrowser.mm:100 (Matter:x86_64+0x20ee63c) #3 handle_browse_response :2 (libsystem_dnssd.dylib:x86_64+0x3733) #4 _dispatch_client_callout :2 (libdispatch.dylib:x86_64+0x3316) Previous modifying access of NSMutableArray at 0x7b0c0005f5b0 by main thread: #0 -[__NSArrayM addObject:] :2 (CoreFoundation:x86_64+0x2457a) #1 -[MTRDeviceControllerFactory createController] MTRDeviceControllerFactory.mm:719 (Matter:x86_64+0x1cee3) #2 -[MTRDeviceControllerFactory createControllerOnExistingFabric:error:] MTRDeviceControllerFactory.mm:534 (Matter:x86_64+0x19792) The basic problem is that we are in the middle of adding an object to _controllers on the API consumer thread when on the Matter thread we get our browse notification. The changes here don't aim to lock around all access to _controllers, but just to make sure that our mutations of it can't race with the access on the Matter thread. More coarse locking would need to be done very carefully, given the amount of dispath_sync to the Matter thread we have going on. * Address review comments. --- .github/workflows/darwin.yaml | 2 +- .../CHIP/MTRDeviceControllerFactory.mm | 64 ++++++++++++++++++- src/platform/Darwin/PlatformManagerImpl.cpp | 9 ++- src/platform/Darwin/PlatformManagerImpl.h | 2 + 4 files changed, 72 insertions(+), 5 deletions(-) diff --git a/.github/workflows/darwin.yaml b/.github/workflows/darwin.yaml index 053eb96af69796..a7d8f31a819124 100644 --- a/.github/workflows/darwin.yaml +++ b/.github/workflows/darwin.yaml @@ -155,7 +155,7 @@ jobs: TEST_RUNNER_ASAN_OPTIONS=__CURRENT_VALUE__:detect_stack_use_after_return=1 xcodebuild test -target "Matter" -scheme "Matter Framework Tests" -sdk macosx -enableAddressSanitizer YES -enableUndefinedBehaviorSanitizer YES OTHER_CFLAGS='${inherited} -Werror -Wconversion -Wno-incomplete-umbrella -Wno-unguarded-availability-new' CHIP_IS_UBSAN=YES > >(tee /tmp/darwin/framework-tests/darwin-tests-asan.log) 2> >(tee /tmp/darwin/framework-tests/darwin-tests-asan-err.log >&2) # -enableThreadSanitizer instruments the code in Matter.framework, # but to instrument the code in the underlying libCHIP we need to pass CHIP_IS_TSAN=YES - #xcodebuild test -target "Matter" -scheme "Matter Framework Tests" -sdk macosx -enableThreadSanitizer YES OTHER_CFLAGS='${inherited} -Werror -Wconversion -Wno-incomplete-umbrella -Wno-unguarded-availability-new' CHIP_IS_TSAN=YES > >(tee /tmp/darwin/framework-tests/darwin-tests-tsan.log) 2> >(tee /tmp/darwin/framework-tests/darwin-tests-tsan-err.log >&2) + xcodebuild test -target "Matter" -scheme "Matter Framework Tests" -sdk macosx -enableThreadSanitizer YES OTHER_CFLAGS='${inherited} -Werror -Wconversion -Wno-incomplete-umbrella -Wno-unguarded-availability-new' CHIP_IS_TSAN=YES > >(tee /tmp/darwin/framework-tests/darwin-tests-tsan.log) 2> >(tee /tmp/darwin/framework-tests/darwin-tests-tsan-err.log >&2) working-directory: src/darwin/Framework - name: Uploading log files uses: actions/upload-artifact@v3 diff --git a/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm b/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm index 93f7abeb91a979..5bd050b0df028e 100644 --- a/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm +++ b/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm @@ -34,6 +34,8 @@ #import "MTRPersistentStorageDelegateBridge.h" #import "NSDataSpanConversion.h" +#import + #include #include #include @@ -88,6 +90,22 @@ @interface MTRDeviceControllerFactory () @property () chip::Credentials::DeviceAttestationVerifier * deviceAttestationVerifier; @property (readonly) BOOL advertiseOperational; @property (nonatomic, readonly) Credentials::IgnoreCertificateValidityPeriodPolicy * certificateValidityPolicy; +// Lock used to serialize access to the "controllers" array, since it needs to +// be touched from both whatever queue is starting controllers and from the +// Matter queue. The way this lock is used assumes that: +// +// 1) The only mutating accesses to the controllers array happen when the +// current queue is not the Matter queue. This is a good assumption, because +// the implementation of the fucntions that mutate the array do sync dispatch +// to the Matter queue, which would deadlock if they were called when that +// queue was the current queue. +// 2) It's our API consumer's responsibility to serialize access to us from +// outside. +// +// This means that we only take the lock around mutations of the array and +// accesses to the array that are from code running on the Matter queue. + +@property (nonatomic, readonly) os_unfair_lock controllersLock; - (BOOL)findMatchingFabric:(FabricTable &)fabricTable params:(MTRDeviceControllerStartupParams *)params @@ -123,6 +141,7 @@ - (instancetype)init _running = NO; _chipWorkQueue = DeviceLayer::PlatformMgrImpl().GetWorkQueue(); _controllerFactory = &DeviceControllerFactory::GetInstance(); + _controllersLock = OS_UNFAIR_LOCK_INIT; _sessionKeystore = new chip::Crypto::RawKeySessionKeystore(); if ([self checkForInitError:(_sessionKeystore != nullptr) logMsg:kErrorSessionKeystoreInit]) { @@ -166,8 +185,15 @@ - (void)dealloc [self cleanupInitObjects]; } +- (void)_assertCurrentQueueIsNotMatterQueue +{ + VerifyOrDie(!DeviceLayer::PlatformMgrImpl().IsWorkQueueCurrentQueue()); +} + - (BOOL)checkIsRunning:(NSError * __autoreleasing *)error { + [self _assertCurrentQueueIsNotMatterQueue]; + if ([self isRunning]) { return YES; } @@ -255,6 +281,8 @@ - (void)cleanupStartupObjects - (nullable NSArray *)knownFabrics { + [self _assertCurrentQueueIsNotMatterQueue]; + if (!self.isRunning) { return nil; } @@ -303,6 +331,8 @@ - (void)cleanupStartupObjects - (BOOL)startControllerFactory:(MTRDeviceControllerFactoryParams *)startupParams error:(NSError * __autoreleasing *)error; { + [self _assertCurrentQueueIsNotMatterQueue]; + if ([self isRunning]) { MTR_LOG_DEBUG("Ignoring duplicate call to startup, Matter controller factory already started..."); return YES; @@ -501,6 +531,8 @@ - (BOOL)startControllerFactory:(MTRDeviceControllerFactoryParams *)startupParams - (void)stopControllerFactory { + [self _assertCurrentQueueIsNotMatterQueue]; + if (![self isRunning]) { return; } @@ -524,6 +556,8 @@ - (void)stopControllerFactory - (MTRDeviceController * _Nullable)createControllerOnExistingFabric:(MTRDeviceControllerStartupParams *)startupParams error:(NSError * __autoreleasing *)error { + [self _assertCurrentQueueIsNotMatterQueue]; + if (![self checkIsRunning:error]) { MTR_LOG_ERROR("Trying to start controller while Matter controller factory is not running"); return nil; @@ -615,6 +649,8 @@ - (MTRDeviceController * _Nullable)createControllerOnExistingFabric:(MTRDeviceCo - (MTRDeviceController * _Nullable)createControllerOnNewFabric:(MTRDeviceControllerStartupParams *)startupParams error:(NSError * __autoreleasing *)error { + [self _assertCurrentQueueIsNotMatterQueue]; + if (![self isRunning]) { MTR_LOG_ERROR("Trying to start controller while Matter controller factory is not running"); return nil; @@ -699,6 +735,8 @@ - (MTRDeviceController * _Nullable)createControllerOnNewFabric:(MTRDeviceControl - (MTRDeviceController * _Nullable)createController { + [self _assertCurrentQueueIsNotMatterQueue]; + MTRDeviceController * controller = [[MTRDeviceController alloc] initWithFactory:self queue:_chipWorkQueue]; if (controller == nil) { MTR_LOG_ERROR("Failed to init controller"); @@ -716,7 +754,9 @@ - (MTRDeviceController * _Nullable)createController // Add the controller to _controllers now, so if we fail partway through its // startup we will still do the right cleanups. + os_unfair_lock_lock(&_controllersLock); [_controllers addObject:controller]; + os_unfair_lock_unlock(&_controllersLock); return controller; } @@ -733,6 +773,8 @@ - (BOOL)findMatchingFabric:(FabricTable &)fabricTable params:(MTRDeviceControllerStartupParams *)params fabric:(const FabricInfo * _Nullable * _Nonnull)fabric { + assertChipStackLockedByCurrentThread(); + CHIP_ERROR err = fabricTable.Init( { .storage = _persistentStorageDelegateBridge, .operationalKeystore = _keystore, .opCertStore = _opCertStore }); if (err != CHIP_NO_ERROR) { @@ -768,6 +810,8 @@ - (BOOL)findMatchingFabric:(FabricTable &)fabricTable // - (MTRDeviceController * _Nullable)maybeInitializeOTAProvider:(MTRDeviceController * _Nonnull)controller { + [self _assertCurrentQueueIsNotMatterQueue]; + VerifyOrReturnValue(_otaProviderDelegateBridge != nil, controller); VerifyOrReturnValue([_controllers count] == 1, controller); @@ -791,6 +835,8 @@ @implementation MTRDeviceControllerFactory (InternalMethods) - (void)controllerShuttingDown:(MTRDeviceController *)controller { + [self _assertCurrentQueueIsNotMatterQueue]; + if (![_controllers containsObject:controller]) { MTR_LOG_ERROR("Controller we don't know about shutting down"); return; @@ -808,7 +854,9 @@ - (void)controllerShuttingDown:(MTRDeviceController *)controller }); } + os_unfair_lock_lock(&_controllersLock); [_controllers removeObject:controller]; + os_unfair_lock_unlock(&_controllersLock); if ([_controllers count] == 0) { dispatch_sync(_chipWorkQueue, ^{ @@ -841,7 +889,13 @@ - (void)controllerShuttingDown:(MTRDeviceController *)controller - (nullable MTRDeviceController *)runningControllerForFabricIndex:(chip::FabricIndex)fabricIndex { - for (MTRDeviceController * existing in _controllers) { + assertChipStackLockedByCurrentThread(); + + os_unfair_lock_lock(&_controllersLock); + NSArray * controllersCopy = [_controllers copy]; + os_unfair_lock_unlock(&_controllersLock); + + for (MTRDeviceController * existing in controllersCopy) { if ([existing fabricIndex] == fabricIndex) { return existing; } @@ -852,7 +906,13 @@ - (nullable MTRDeviceController *)runningControllerForFabricIndex:(chip::FabricI - (void)operationalInstanceAdded:(chip::PeerId &)operationalID { - for (MTRDeviceController * controller in _controllers) { + assertChipStackLockedByCurrentThread(); + + os_unfair_lock_lock(&_controllersLock); + NSArray * controllersCopy = [_controllers copy]; + os_unfair_lock_unlock(&_controllersLock); + + for (MTRDeviceController * controller in controllersCopy) { auto * compressedFabricId = controller.compressedFabricID; if (compressedFabricId != nil && compressedFabricId.unsignedLongLongValue == operationalID.GetCompressedFabricId()) { ChipLogProgress(Controller, "Notifying controller at fabric index %u about new operational node 0x" ChipLogFormatX64, diff --git a/src/platform/Darwin/PlatformManagerImpl.cpp b/src/platform/Darwin/PlatformManagerImpl.cpp index 6e78a8195b004f..f1d5e4841e761b 100644 --- a/src/platform/Darwin/PlatformManagerImpl.cpp +++ b/src/platform/Darwin/PlatformManagerImpl.cpp @@ -88,7 +88,7 @@ CHIP_ERROR PlatformManagerImpl::_StopEventLoopTask() if (!mIsWorkQueueSuspended && !mIsWorkQueueSuspensionPending) { mIsWorkQueueSuspensionPending = true; - if (dispatch_get_current_queue() != mWorkQueue) + if (!IsWorkQueueCurrentQueue()) { // dispatch_sync is used in order to guarantee serialization of the caller with // respect to any tasks that might already be on the queue, or running. @@ -157,10 +157,15 @@ bool PlatformManagerImpl::_IsChipStackLockedByCurrentThread() const { // If we have no work queue, or it's suspended, then we assume our caller // knows what they are doing in terms of their own concurrency. - return !mWorkQueue || mIsWorkQueueSuspended || dispatch_get_current_queue() == mWorkQueue; + return !mWorkQueue || mIsWorkQueueSuspended || IsWorkQueueCurrentQueue(); }; #endif +bool PlatformManagerImpl::IsWorkQueueCurrentQueue() const +{ + return dispatch_get_current_queue() == mWorkQueue; +} + CHIP_ERROR PlatformManagerImpl::StartBleScan(BleScannerDelegate * delegate) { #if CONFIG_NETWORK_LAYER_BLE diff --git a/src/platform/Darwin/PlatformManagerImpl.h b/src/platform/Darwin/PlatformManagerImpl.h index a0aff2ab21e439..79da67a4991d2f 100644 --- a/src/platform/Darwin/PlatformManagerImpl.h +++ b/src/platform/Darwin/PlatformManagerImpl.h @@ -56,6 +56,8 @@ class PlatformManagerImpl final : public PlatformManager, public Internal::Gener return mWorkQueue; } + bool IsWorkQueueCurrentQueue() const; + CHIP_ERROR StartBleScan(BleScannerDelegate * delegate = nullptr); CHIP_ERROR StopBleScan(); CHIP_ERROR PrepareCommissioning();