Skip to content

Commit ca271f4

Browse files
committed
[lldb-server/linux] Fix waitpid for multithreaded forks
The lldb-server code is currently set up in a way that each NativeProcess instance does its own waitpid handling. This works fine for BSDs, where the code can do a waitpid(process_id), and get information for all threads in that process. The situation is trickier on linux, because waitpid(pid) will only return information for the main thread of the process (one whose tid == pid). For this reason the linux code does a waitpid(-1), to get information for all threads. This was fine while we were supporting just a single process, but becomes a problem when we have multiple processes as they end up stealing each others events. There are two possible solutions to this problem: - call waitpid(-1) centrally, and then dispatch the events to the appropriate process - have each process call waitpid(tid) for all the threads it manages This patch implements the second approach. Besides fitting better into the existing design, it also has the added benefit of ensuring predictable ordering for thread/process creation events (which come in pairs -- one for the parent and one for the child). The first approach OTOH, would make this ordering even more complicated since we would have to keep the half-threads hanging in mid-air until we find the process we should attach them to. The downside to this approach is an increased number of syscalls (one waitpid for each thread), but I think we're pretty far from optimizing things like this, and so the cleanliness of the design is worth it. The included test reproduces the circumstances which should demonstrate the bug (which manifests as a hung test), but I have not been able to get it to fail. The only place I've seen this failure modes are very rare hangs in the thread sanitizer tests (tsan forks an addr2line process to produce its error messages). Differential Revision: https://reviews.llvm.org/D116372
1 parent 730414b commit ca271f4

File tree

3 files changed

+109
-134
lines changed

3 files changed

+109
-134
lines changed

lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp

Lines changed: 70 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -426,22 +426,24 @@ Status NativeProcessLinux::SetDefaultPtraceOpts(lldb::pid_t pid) {
426426
}
427427

428428
// Handles all waitpid events from the inferior process.
429-
void NativeProcessLinux::MonitorCallback(lldb::pid_t pid, WaitStatus status) {
429+
void NativeProcessLinux::MonitorCallback(NativeThreadLinux &thread,
430+
WaitStatus status) {
430431
Log *log(GetLogIfAnyCategoriesSet(LIBLLDB_LOG_PROCESS));
431432

432433
// Certain activities differ based on whether the pid is the tid of the main
433434
// thread.
434-
const bool is_main_thread = (pid == GetID());
435+
const bool is_main_thread = (thread.GetID() == GetID());
435436

436437
// Handle when the thread exits.
437438
if (status.type == WaitStatus::Exit || status.type == WaitStatus::Signal) {
438439
LLDB_LOG(log,
439440
"got exit status({0}) , tid = {1} ({2} main thread), process "
440441
"state = {3}",
441-
status, pid, is_main_thread ? "is" : "is not", GetState());
442+
status, thread.GetID(), is_main_thread ? "is" : "is not",
443+
GetState());
442444

443445
// This is a thread that exited. Ensure we're not tracking it anymore.
444-
StopTrackingThread(pid);
446+
StopTrackingThread(thread);
445447

446448
if (is_main_thread) {
447449
// The main thread exited. We're done monitoring. Report to delegate.
@@ -454,37 +456,15 @@ void NativeProcessLinux::MonitorCallback(lldb::pid_t pid, WaitStatus status) {
454456
}
455457

456458
siginfo_t info;
457-
const auto info_err = GetSignalInfo(pid, &info);
458-
auto thread_sp = GetThreadByID(pid);
459-
460-
if (!thread_sp) {
461-
// Normally, the only situation when we cannot find the thread is if we
462-
// have just received a new thread notification. This is indicated by
463-
// GetSignalInfo() returning si_code == SI_USER and si_pid == 0
464-
LLDB_LOG(log, "received notification about an unknown tid {0}.", pid);
465-
466-
if (info_err.Fail()) {
467-
LLDB_LOG(log,
468-
"(tid {0}) GetSignalInfo failed ({1}). "
469-
"Ingoring this notification.",
470-
pid, info_err);
471-
return;
472-
}
473-
474-
LLDB_LOG(log, "tid {0}, si_code: {1}, si_pid: {2}", pid, info.si_code,
475-
info.si_pid);
476-
477-
MonitorClone(pid, llvm::None);
478-
return;
479-
}
459+
const auto info_err = GetSignalInfo(thread.GetID(), &info);
480460

481461
// Get details on the signal raised.
482462
if (info_err.Success()) {
483463
// We have retrieved the signal info. Dispatch appropriately.
484464
if (info.si_signo == SIGTRAP)
485-
MonitorSIGTRAP(info, *thread_sp);
465+
MonitorSIGTRAP(info, thread);
486466
else
487-
MonitorSignal(info, *thread_sp);
467+
MonitorSignal(info, thread);
488468
} else {
489469
if (info_err.GetError() == EINVAL) {
490470
// This is a group stop reception for this tid. We can reach here if we
@@ -500,9 +480,8 @@ void NativeProcessLinux::MonitorCallback(lldb::pid_t pid, WaitStatus status) {
500480
"received a group stop for pid {0} tid {1}. Transparent "
501481
"handling of group stops not supported, resuming the "
502482
"thread.",
503-
GetID(), pid);
504-
ResumeThread(*thread_sp, thread_sp->GetState(),
505-
LLDB_INVALID_SIGNAL_NUMBER);
483+
GetID(), thread.GetID());
484+
ResumeThread(thread, thread.GetState(), LLDB_INVALID_SIGNAL_NUMBER);
506485
} else {
507486
// ptrace(GETSIGINFO) failed (but not due to group-stop).
508487

@@ -512,12 +491,12 @@ void NativeProcessLinux::MonitorCallback(lldb::pid_t pid, WaitStatus status) {
512491

513492
// Stop tracking the metadata for the thread since it's entirely off the
514493
// system now.
515-
const bool thread_found = StopTrackingThread(pid);
494+
StopTrackingThread(thread);
516495

517496
LLDB_LOG(log,
518497
"GetSignalInfo failed: {0}, tid = {1}, status = {2}, "
519-
"status = {3}, main_thread = {4}, thread_found: {5}",
520-
info_err, pid, status, status, is_main_thread, thread_found);
498+
"status = {3}, main_thread = {4}",
499+
info_err, thread.GetID(), status, status, is_main_thread);
521500

522501
if (is_main_thread) {
523502
// Notify the delegate - our process is not available but appears to
@@ -532,7 +511,7 @@ void NativeProcessLinux::MonitorCallback(lldb::pid_t pid, WaitStatus status) {
532511
"pid {0} tid {1} non-main thread exit occurred, didn't "
533512
"tell delegate anything since thread disappeared out "
534513
"from underneath us",
535-
GetID(), pid);
514+
GetID(), thread.GetID());
536515
}
537516
}
538517
}
@@ -549,29 +528,14 @@ void NativeProcessLinux::WaitForCloneNotification(::pid_t pid) {
549528
pid);
550529
::pid_t wait_pid =
551530
llvm::sys::RetryAfterSignal(-1, ::waitpid, pid, &status, __WALL);
552-
// Since we are waiting on a specific pid, this must be the creation event.
553-
// But let's do some checks just in case.
554-
if (wait_pid != pid) {
555-
LLDB_LOG(log,
556-
"waiting for pid {0} failed. Assuming the pid has "
557-
"disappeared in the meantime",
558-
pid);
559-
// The only way I know of this could happen is if the whole process was
560-
// SIGKILLed in the mean time. In any case, we can't do anything about that
561-
// now.
562-
return;
563-
}
564-
if (WIFEXITED(status)) {
565-
LLDB_LOG(log,
566-
"waiting for pid {0} returned an 'exited' event. Not "
567-
"tracking it.",
568-
pid);
569-
// Also a very improbable event.
570-
m_pending_pid_map.erase(pid);
571-
return;
572-
}
573531

574-
MonitorClone(pid, llvm::None);
532+
// It's theoretically possible to get other events if the entire process was
533+
// SIGKILLed before we got a chance to check this. In that case, we'll just
534+
// clean everything up when we get the process exit event.
535+
536+
LLDB_LOG(log,
537+
"waitpid({0}, &status, __WALL) => {1} (errno: {2}, status = {3})",
538+
pid, wait_pid, errno, WaitStatus::Decode(status));
575539
}
576540

577541
void NativeProcessLinux::MonitorSIGTRAP(const siginfo_t &info,
@@ -598,8 +562,7 @@ void NativeProcessLinux::MonitorSIGTRAP(const siginfo_t &info,
598562
thread.GetID());
599563
ResumeThread(thread, thread.GetState(), LLDB_INVALID_SIGNAL_NUMBER);
600564
} else {
601-
if (!MonitorClone(event_message, {{(info.si_code >> 8), thread.GetID()}}))
602-
WaitForCloneNotification(event_message);
565+
MonitorClone(thread, event_message, info.si_code >> 8);
603566
}
604567

605568
break;
@@ -886,36 +849,15 @@ void NativeProcessLinux::MonitorSignal(const siginfo_t &info,
886849
StopRunningThreads(thread.GetID());
887850
}
888851

889-
bool NativeProcessLinux::MonitorClone(
890-
lldb::pid_t child_pid,
891-
llvm::Optional<NativeProcessLinux::CloneInfo> clone_info) {
852+
bool NativeProcessLinux::MonitorClone(NativeThreadLinux &parent,
853+
lldb::pid_t child_pid, int event) {
892854
Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS));
893-
LLDB_LOG(log, "clone, child_pid={0}, clone info?={1}", child_pid,
894-
clone_info.hasValue());
855+
LLDB_LOG(log, "parent_tid={0}, child_pid={1}, event={2}", parent.GetID(),
856+
child_pid, event);
895857

896-
auto find_it = m_pending_pid_map.find(child_pid);
897-
if (find_it == m_pending_pid_map.end()) {
898-
// not in the map, so this is the first signal for the PID
899-
m_pending_pid_map.insert({child_pid, clone_info});
900-
return false;
901-
}
902-
m_pending_pid_map.erase(find_it);
903-
904-
// second signal for the pid
905-
assert(clone_info.hasValue() != find_it->second.hasValue());
906-
if (!clone_info) {
907-
// child signal does not indicate the event, so grab the one stored
908-
// earlier
909-
clone_info = find_it->second;
910-
}
911-
912-
LLDB_LOG(log, "second signal for child_pid={0}, parent_tid={1}, event={2}",
913-
child_pid, clone_info->parent_tid, clone_info->event);
858+
WaitForCloneNotification(child_pid);
914859

915-
auto *parent_thread = GetThreadByID(clone_info->parent_tid);
916-
assert(parent_thread);
917-
918-
switch (clone_info->event) {
860+
switch (event) {
919861
case PTRACE_EVENT_CLONE: {
920862
// PTRACE_EVENT_CLONE can either mean a new thread or a new process.
921863
// Try to grab the new process' PGID to figure out which one it is.
@@ -930,15 +872,14 @@ bool NativeProcessLinux::MonitorClone(
930872
ThreadWasCreated(child_thread);
931873

932874
// Resume the parent.
933-
ResumeThread(*parent_thread, parent_thread->GetState(),
934-
LLDB_INVALID_SIGNAL_NUMBER);
875+
ResumeThread(parent, parent.GetState(), LLDB_INVALID_SIGNAL_NUMBER);
935876
break;
936877
}
937878
}
938879
LLVM_FALLTHROUGH;
939880
case PTRACE_EVENT_FORK:
940881
case PTRACE_EVENT_VFORK: {
941-
bool is_vfork = clone_info->event == PTRACE_EVENT_VFORK;
882+
bool is_vfork = event == PTRACE_EVENT_VFORK;
942883
std::unique_ptr<NativeProcessLinux> child_process{new NativeProcessLinux(
943884
static_cast<::pid_t>(child_pid), m_terminal_fd, m_delegate, m_arch,
944885
m_main_loop, {static_cast<::pid_t>(child_pid)})};
@@ -949,12 +890,11 @@ bool NativeProcessLinux::MonitorClone(
949890
if (bool(m_enabled_extensions & expected_ext)) {
950891
m_delegate.NewSubprocess(this, std::move(child_process));
951892
// NB: non-vfork clone() is reported as fork
952-
parent_thread->SetStoppedByFork(is_vfork, child_pid);
953-
StopRunningThreads(parent_thread->GetID());
893+
parent.SetStoppedByFork(is_vfork, child_pid);
894+
StopRunningThreads(parent.GetID());
954895
} else {
955896
child_process->Detach();
956-
ResumeThread(*parent_thread, parent_thread->GetState(),
957-
LLDB_INVALID_SIGNAL_NUMBER);
897+
ResumeThread(parent, parent.GetState(), LLDB_INVALID_SIGNAL_NUMBER);
958898
}
959899
break;
960900
}
@@ -1729,24 +1669,19 @@ bool NativeProcessLinux::HasThreadNoLock(lldb::tid_t thread_id) {
17291669
return false;
17301670
}
17311671

1732-
bool NativeProcessLinux::StopTrackingThread(lldb::tid_t thread_id) {
1672+
void NativeProcessLinux::StopTrackingThread(NativeThreadLinux &thread) {
17331673
Log *const log = ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_THREAD);
1734-
LLDB_LOG(log, "tid: {0})", thread_id);
1735-
1736-
bool found = false;
1737-
for (auto it = m_threads.begin(); it != m_threads.end(); ++it) {
1738-
if (*it && ((*it)->GetID() == thread_id)) {
1739-
m_threads.erase(it);
1740-
found = true;
1741-
break;
1742-
}
1743-
}
1674+
lldb::tid_t thread_id = thread.GetID();
1675+
LLDB_LOG(log, "tid: {0}", thread_id);
17441676

1745-
if (found)
1746-
NotifyTracersOfThreadDestroyed(thread_id);
1677+
auto it = llvm::find_if(m_threads, [&](const auto &thread_up) {
1678+
return thread_up.get() == &thread;
1679+
});
1680+
assert(it != m_threads.end());
1681+
m_threads.erase(it);
17471682

1683+
NotifyTracersOfThreadDestroyed(thread_id);
17481684
SignalIfAllThreadsStopped();
1749-
return found;
17501685
}
17511686

17521687
Status NativeProcessLinux::NotifyTracersOfNewThread(lldb::tid_t tid) {
@@ -1945,27 +1880,44 @@ void NativeProcessLinux::ThreadWasCreated(NativeThreadLinux &thread) {
19451880

19461881
void NativeProcessLinux::SigchldHandler() {
19471882
Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS));
1948-
// Process all pending waitpid notifications.
1949-
while (true) {
1883+
1884+
// Threads can appear or disappear as a result of event processing, so gather
1885+
// the events upfront.
1886+
llvm::DenseMap<lldb::tid_t, WaitStatus> tid_events;
1887+
for (const auto &thread_up : m_threads) {
19501888
int status = -1;
1951-
::pid_t wait_pid = llvm::sys::RetryAfterSignal(-1, ::waitpid, -1, &status,
1952-
__WALL | __WNOTHREAD | WNOHANG);
1889+
::pid_t wait_pid =
1890+
llvm::sys::RetryAfterSignal(-1, ::waitpid, thread_up->GetID(), &status,
1891+
__WALL | __WNOTHREAD | WNOHANG);
19531892

19541893
if (wait_pid == 0)
1955-
break; // We are done.
1894+
continue; // Nothing to do for this thread.
19561895

19571896
if (wait_pid == -1) {
19581897
Status error(errno, eErrorTypePOSIX);
1959-
LLDB_LOG(log, "waitpid (-1, &status, _) failed: {0}", error);
1960-
break;
1898+
LLDB_LOG(log, "waitpid({0}, &status, _) failed: {1}", thread_up->GetID(),
1899+
error);
1900+
continue;
19611901
}
19621902

1903+
assert(wait_pid == static_cast<::pid_t>(thread_up->GetID()));
1904+
19631905
WaitStatus wait_status = WaitStatus::Decode(status);
19641906

1965-
LLDB_LOG(log, "waitpid (-1, &status, _) => pid = {0}, status = {1}",
1966-
wait_pid, wait_status);
1907+
LLDB_LOG(log, "waitpid({0}) got status = {1}", thread_up->GetID(),
1908+
wait_status);
1909+
tid_events.try_emplace(thread_up->GetID(), wait_status);
1910+
}
19671911

1968-
MonitorCallback(wait_pid, wait_status);
1912+
for (auto &KV : tid_events) {
1913+
LLDB_LOG(log, "processing {0}({1}) ...", KV.first, KV.second);
1914+
NativeThreadLinux *thread = GetThreadByID(KV.first);
1915+
if (thread) {
1916+
MonitorCallback(*thread, KV.second);
1917+
} else {
1918+
// This can happen if one of the events is an main thread exit.
1919+
LLDB_LOG(log, "... but the thread has disappeared");
1920+
}
19691921
}
19701922
}
19711923

lldb/source/Plugins/Process/Linux/NativeProcessLinux.h

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ class NativeProcessLinux : public NativeProcessELF,
164164

165165
static Status SetDefaultPtraceOpts(const lldb::pid_t);
166166

167-
void MonitorCallback(lldb::pid_t pid, WaitStatus status);
167+
void MonitorCallback(NativeThreadLinux &thread, WaitStatus status);
168168

169169
void WaitForCloneNotification(::pid_t pid);
170170

@@ -180,7 +180,7 @@ class NativeProcessLinux : public NativeProcessELF,
180180

181181
bool HasThreadNoLock(lldb::tid_t thread_id);
182182

183-
bool StopTrackingThread(lldb::tid_t thread_id);
183+
void StopTrackingThread(NativeThreadLinux &thread);
184184

185185
/// Create a new thread.
186186
///
@@ -243,20 +243,9 @@ class NativeProcessLinux : public NativeProcessELF,
243243
/// Manages Intel PT process and thread traces.
244244
IntelPTManager m_intel_pt_manager;
245245

246-
struct CloneInfo {
247-
int event;
248-
lldb::tid_t parent_tid;
249-
};
250-
251-
// Map of child processes that have been signaled once, and we are
252-
// waiting for the second signal.
253-
llvm::DenseMap<lldb::pid_t, llvm::Optional<CloneInfo>> m_pending_pid_map;
254-
255-
// Handle a clone()-like event. If received by parent, clone_info contains
256-
// additional info. Returns true if the event is handled, or false if it
257-
// is pending second notification.
258-
bool MonitorClone(lldb::pid_t child_pid,
259-
llvm::Optional<CloneInfo> clone_info);
246+
// Handle a clone()-like event.
247+
bool MonitorClone(NativeThreadLinux &parent, lldb::pid_t child_pid,
248+
int event);
260249
};
261250

262251
} // namespace process_linux

0 commit comments

Comments
 (0)