From b9a92745b848952d7b3e6b3f3e0f5aa6ed7c5fc4 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Sat, 19 Aug 2023 21:37:23 -0400 Subject: [PATCH] TracyProfiler: Use rr-safe nopl; rdtsc sequence (#7225) Several users have reported masssive slowdowns to rr record/replay in packages that have the tracy client loaded. The issue turns out to be an unlucky false positive in an rr heuristic that determines the patchability of `rdtsc`. In rr master, it is possible to guarantee patchability by using a specific `nopl; rdtsc` sequence. Add a patch to tracy to do just that. See also https://github.com/rr-debugger/rr/pull/3580 https://github.com/JuliaLang/julia/pull/50975 --- T/TracyProfiler/build_tarballs.jl | 1 + .../patches/TracyProfiler-rr-nopl-seq.patch | 43 +++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 T/TracyProfiler/bundled/patches/TracyProfiler-rr-nopl-seq.patch diff --git a/T/TracyProfiler/build_tarballs.jl b/T/TracyProfiler/build_tarballs.jl index 0ec48c36b01..a7591815b06 100644 --- a/T/TracyProfiler/build_tarballs.jl +++ b/T/TracyProfiler/build_tarballs.jl @@ -44,6 +44,7 @@ fi atomic_patch -p1 ../patches/TracyProfiler-nfd-extended-1.0.2.patch atomic_patch -p1 ../patches/TracyProfiler-filter-user-text.patch atomic_patch -p1 ../patches/TracyProfiler-no-divide-zero.patch +atomic_patch -p1 ../patches/TracyProfiler-rr-nopl-seq.patch # Build / install the profiler GUI make -e -j${nproc} -C profiler/build/unix LEGACY=1 IMAGE=tracy release diff --git a/T/TracyProfiler/bundled/patches/TracyProfiler-rr-nopl-seq.patch b/T/TracyProfiler/bundled/patches/TracyProfiler-rr-nopl-seq.patch new file mode 100644 index 00000000000..8ecd9193bf3 --- /dev/null +++ b/T/TracyProfiler/bundled/patches/TracyProfiler-rr-nopl-seq.patch @@ -0,0 +1,43 @@ +commit 21a65e08372a37342e598422a2bd58263857807e +Author: Keno Fischer +Date: Sat Aug 19 01:40:18 2023 +0000 + + Use patchable rdtsc sequence to avoid slowdowns under rr + + We (Julia) ship both support for using tracy to trace julia applications, + as well as using `rr` (https://github.com/rr-debugger/rr) for record-replay debugging. + After our most recent rebuild of tracy, users have been reporting signfificant performance + slowdowns when `rr` recording a session that happens to also load the tracy library + (even if tracing is not enabled). Upon further examination, the recompile happened + to trigger a protective heuristic that disabled rr's patching of tracy's use of + `rdtsc` because an earlier part of the same function happened to look like a + conditional branch into the patch region. See https://github.com/rr-debugger/rr/pull/3580 + for details. To avoid this issue occurring again in future rebuilds of tracy, + adjust tracy's `rdtsc` sequence to be `nopl; rdtsc`, which (as of of the + linked PR) is a sequence that is guaranteed to bypass this heuristic + and not incur the additional overhead when run under rr. + +diff --git a/public/client/TracyProfiler.hpp b/public/client/TracyProfiler.hpp +index 1b825ea3..27f6bbbd 100644 +--- a/public/client/TracyProfiler.hpp ++++ b/public/client/TracyProfiler.hpp +@@ -209,7 +209,18 @@ public: + if( HardwareSupportsInvariantTSC() ) + { + uint64_t rax, rdx; +- asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); ++ // Some external tooling (such as rr) wants to patch our rdtsc and replace it by a ++ // branch to control the external input seen by a program. This kind of patching is ++ // not generally possible depending on the surrounding code and can lead to significant ++ // slowdowns if the compiler generated unlucky code and rr and tracy are used together. ++ // To avoid this, use the rr-safe `nopl 0(%rax, %rax, 1); rdtsc` instruction sequence, ++ // which rr promises will be patchable independent of the surrounding code. ++ asm volatile ( ++ // This is nopl 0(%rax, %rax, 1), but assemblers are inconsistent about whether ++ // they emit that as a 4 or 5 byte sequence and we need to be guaranteed to use ++ // the 5 byte one. ++ ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n\t" ++ "rdtsc" : "=a" (rax), "=d" (rdx) ); + return (int64_t)(( rdx << 32 ) + rax); + } + # else