
Commit 4b79c1c

Ignore lockorder violation on same callsite with different construction
As long as the lock order on such locks is still valid, we should allow them regardless of whether they were constructed at the same location or not. Note that we can only really enforce this if we require one lock call per line, or if we have access to symbol columns (as we do on Linux and macOS). We opt for a smaller patch by relying on the latter.

This was previously triggered by some recent test changes to `test_manager_serialize_deserialize_inconsistent_monitor`. When the test ends and the nodes are dropped, causing each to be persisted, we'd detect a possible lockorder-violation deadlock across three different `Mutex` instances that are held at the same location while serializing our `per_peer_states` in `ChannelManager::write`.

The presumed lockorder violation happens because the first `Mutex` held shares the same construction location with the third one, while the second `Mutex` has a different construction location. When we hold the second, we consider the first a dependency; when we then hold the third, we consider the second a dependency, producing a circular dependency (since the third shares the same construction location as the first). This isn't a lockorder violation that could actually result in a deadlock though, as the inline comment suggests, since we are under a dependent write lock which no one else can have access to.
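For illustration only (this code is not part of the commit, and the names are hypothetical), a minimal sketch of the acquisition pattern described above: several `Mutex`es constructed at a single source location and all locked from one callsite, the shape `ChannelManager::write` takes when serializing `per_peer_states`:

```rust
use std::collections::HashMap;
use std::sync::Mutex;

// Hypothetical stand-in for per-peer state guarded by its own Mutex.
struct PeerState {
    serialized: bool,
}

// Lock every per-peer Mutex from a single callsite, as a serialization pass would.
// Each Mutex was constructed at the same source location (the `insert` below), so a
// lockorder checker keyed only on construction sites can see dependency edges that
// look circular (as in the three-lock case above), even though this nesting cannot
// deadlock on its own.
fn write_all(per_peer_state: &HashMap<u64, Mutex<PeerState>>) {
    let mut guards = Vec::new();
    for (_peer_id, peer) in per_peer_state.iter() {
        guards.push(peer.lock().unwrap());
    }
    for guard in guards.iter_mut() {
        guard.serialized = true;
    }
}

fn main() {
    let mut per_peer_state = HashMap::new();
    for id in 0..3u64 {
        // One construction location shared by every per-peer Mutex.
        per_peer_state.insert(id, Mutex::new(PeerState { serialized: false }));
    }
    write_all(&per_peer_state);
}
```

No deadlock is possible here as long as the locks are only taken together from this one callsite, which is why the checker now tolerates the pattern.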

File tree

1 file changed: +43 −26 lines

lightning/src/sync/debug_sync.rs

Lines changed: 43 additions & 26 deletions
@@ -74,24 +74,31 @@ struct LockDep {
 	_lockdep_trace: Backtrace,
 }
 
+// Locates the earliest frame preceding a `debug_sync` frame in the call stack. This ensures we can
+// properly detect a lock's construction and acquisition callsites, since the latter may contain
+// multiple `debug_sync` frames.
 #[cfg(feature = "backtrace")]
-fn get_construction_location(backtrace: &Backtrace) -> (String, Option<u32>) {
-	// Find the first frame that is after `debug_sync` (or that is in our tests) and use
-	// that as the mutex construction site. Note that the first few frames may be in
-	// the `backtrace` crate, so we have to ignore those.
+fn locate_call_symbol(backtrace: &Backtrace) -> (String, Option<u32>) {
+	// Find the earliest `debug_sync` frame (or that is in our tests) and use the frame preceding it
+	// as the callsite. Note that the first few frames may be in the `backtrace` crate, so we
+	// have to ignore those.
 	let sync_mutex_constr_regex = regex::Regex::new(r"lightning.*debug_sync").unwrap();
 	let mut found_debug_sync = false;
-	for frame in backtrace.frames() {
-		for symbol in frame.symbols() {
-			let symbol_name = symbol.name().unwrap().as_str().unwrap();
-			if !sync_mutex_constr_regex.is_match(symbol_name) {
-				if found_debug_sync {
-					return (format!("{}:{}", symbol.filename().unwrap().display(), symbol.lineno().unwrap()), symbol.colno());
-				}
-			} else { found_debug_sync = true; }
+	let mut symbol_after_latest_debug_sync = None;
+	for frame in backtrace.frames().iter() {
+		for symbol in frame.symbols().iter() {
+			if let Some(symbol_name) = symbol.name().map(|name| name.as_str()).flatten() {
+				if !sync_mutex_constr_regex.is_match(symbol_name) {
+					if found_debug_sync {
+						symbol_after_latest_debug_sync = Some(symbol);
+						found_debug_sync = false;
+					}
+				} else { found_debug_sync = true; }
+			}
 		}
 	}
-	panic!("Couldn't find mutex construction callsite");
+	let symbol = symbol_after_latest_debug_sync.expect("Couldn't find lock call symbol");
+	(format!("{}:{}", symbol.filename().unwrap().display(), symbol.lineno().unwrap()), symbol.colno())
 }
 
 impl LockMetadata {
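As context for the helper renamed above, a hedged sketch (separate from the patch; the function name below is made up) of how the `backtrace` crate exposes the file, line, and column data that `locate_call_symbol` formats into a callsite string:

```rust
use backtrace::Backtrace;

// Sketch only: walk a resolved backtrace and format the first symbol that carries
// file/line info as "file:line", plus the column if the platform reports one
// (Linux and macOS do; Windows backtraces have no column numbers).
fn first_resolved_callsite(bt: &Backtrace) -> Option<(String, Option<u32>)> {
    for frame in bt.frames() {
        for symbol in frame.symbols() {
            if let (Some(file), Some(line)) = (symbol.filename(), symbol.lineno()) {
                return Some((format!("{}:{}", file.display(), line), symbol.colno()));
            }
        }
    }
    None
}

fn main() {
    // `Backtrace::new()` captures the stack and resolves symbols immediately.
    let bt = Backtrace::new();
    if let Some((loc, col)) = first_resolved_callsite(&bt) {
        println!("called from {} (column {:?})", loc, col);
    }
}
```

The real helper additionally skips the frames belonging to `debug_sync` itself, so the reported callsite is the caller's code rather than the lock wrapper.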
@@ -108,13 +115,13 @@ impl LockMetadata {
 		#[cfg(feature = "backtrace")]
 		{
 			let (lock_constr_location, lock_constr_colno) =
-				get_construction_location(&res._lock_construction_bt);
+				locate_call_symbol(&res._lock_construction_bt);
 			LOCKS_INIT.call_once(|| { unsafe { LOCKS = Some(StdMutex::new(HashMap::new())); } });
 			let mut locks = unsafe { LOCKS.as_ref() }.unwrap().lock().unwrap();
 			match locks.entry(lock_constr_location) {
 				hash_map::Entry::Occupied(e) => {
 					assert_eq!(lock_constr_colno,
-						get_construction_location(&e.get()._lock_construction_bt).1,
+						locate_call_symbol(&e.get()._lock_construction_bt).1,
 						"Because Windows doesn't support column number results in backtraces, we cannot construct two mutexes on the same line or we risk lockorder detection false positives.");
 					return Arc::clone(e.get())
 				},
@@ -138,23 +145,33 @@ impl LockMetadata {
 					#[cfg(feature = "backtrace")]
 					debug_assert!(_double_lock_self_allowed,
 						"Tried to acquire a lock while it was held!\nLock constructed at {}",
-						get_construction_location(&this._lock_construction_bt).0);
+						locate_call_symbol(&this._lock_construction_bt).0);
 					#[cfg(not(feature = "backtrace"))]
 					panic!("Tried to acquire a lock while it was held!");
 				}
 			}
 			for (_locked_idx, locked) in held.borrow().iter() {
 				for (locked_dep_idx, _locked_dep) in locked.locked_before.lock().unwrap().iter() {
-					if *locked_dep_idx == this.lock_idx && *locked_dep_idx != locked.lock_idx {
-						#[cfg(feature = "backtrace")]
-						panic!("Tried to violate existing lockorder.\nMutex that should be locked after the current lock was created at the following backtrace.\nNote that to get a backtrace for the lockorder violation, you should set RUST_BACKTRACE=1\nLock being taken constructed at: {} ({}):\n{:?}\nLock constructed at: {} ({})\n{:?}\n\nLock dep created at:\n{:?}\n\n",
-							get_construction_location(&this._lock_construction_bt).0,
-							this.lock_idx, this._lock_construction_bt,
-							get_construction_location(&locked._lock_construction_bt).0,
-							locked.lock_idx, locked._lock_construction_bt,
-							_locked_dep._lockdep_trace);
-						#[cfg(not(feature = "backtrace"))]
-						panic!("Tried to violate existing lockorder. Build with the backtrace feature for more info.");
+					let is_dep_this_lock = *locked_dep_idx == this.lock_idx;
+					let has_same_construction = *locked_dep_idx == locked.lock_idx;
+					if is_dep_this_lock && !has_same_construction {
+						#[allow(unused_mut, unused_assignments)]
+						let mut has_same_callsite = false;
+						#[cfg(feature = "backtrace")] {
+							has_same_callsite = locate_call_symbol(&_locked_dep._lockdep_trace) ==
+								locate_call_symbol(&Backtrace::new());
+						}
+						if !has_same_callsite {
+							#[cfg(feature = "backtrace")]
+							panic!("Tried to violate existing lockorder.\nMutex that should be locked after the current lock was created at the following backtrace.\nNote that to get a backtrace for the lockorder violation, you should set RUST_BACKTRACE=1\nLock being taken constructed at: {} ({}):\n{:?}\nLock constructed at: {} ({})\n{:?}\n\nLock dep created at:\n{:?}\n\n",
+								locate_call_symbol(&this._lock_construction_bt).0,
+								this.lock_idx, this._lock_construction_bt,
+								locate_call_symbol(&locked._lock_construction_bt).0,
+								locked.lock_idx, locked._lock_construction_bt,
+								_locked_dep._lockdep_trace);
+							#[cfg(not(feature = "backtrace"))]
+							panic!("Tried to violate existing lockorder. Build with the backtrace feature for more info.");
+						}
 					}
 				}
 				// Insert any already-held locks in our locked-before set.
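To restate the decision logic added in this hunk in isolation, a condensed and hypothetical (simplified, not the actual helper) version of the new check and the cases from the commit message:

```rust
// Simplified restatement of the new check: a recorded dependency edge only counts as a
// lockorder violation when it points at the lock being taken, the two locks were
// constructed at different locations, and the current acquisition happens at a
// different callsite than the one that originally recorded the edge.
fn is_violation(dep_is_this_lock: bool, same_construction: bool, same_callsite: bool) -> bool {
    dep_is_this_lock && !same_construction && !same_callsite
}

fn main() {
    // The `ChannelManager::write` case: different construction sites, but the edge was
    // recorded at the same callsite we're locking from now -> tolerated.
    assert!(!is_violation(true, false, true));
    // A genuine inversion taken from some other callsite still panics in debug_sync.
    assert!(is_violation(true, false, false));
    // Locks sharing a construction site were already exempt before this change.
    assert!(!is_violation(true, true, false));
}
```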
