From 105fc01f1964a32e69a0b035ae30c78e7dbd813a Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 9 Nov 2022 14:10:25 +0000 Subject: [PATCH] Ensure that at most one `gc_tracepoint` is active There's two situations that can happen here (see comments in code), and in either situation we want to ensure that at most one `gc_tracepoint` is active (or none, if disabled). --- .../collectors_cpu_and_wall_time_worker.c | 24 +++++++++++++++---- .../cpu_and_wall_time_worker_spec.rb | 18 ++++++++++++++ 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index 83b05b916d5..a96b346935a 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -205,11 +205,25 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) { struct cpu_and_wall_time_worker_state *state; TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state); - if (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread)) { - rb_raise( - rb_eRuntimeError, - "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread" - ); + if (active_sampler_owner_thread != Qnil) { + if (is_thread_alive(active_sampler_owner_thread)) { + rb_raise( + rb_eRuntimeError, + "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread" + ); + } else { + // The previously active thread seems to have died without cleaning up after itself. + // In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing GC tracepoint + // first as: + // a) If this is a new instance of the CpuAndWallTimeWorker, we don't want the tracepoint from the old instance + // being kept around + // b) If this is the same instance of the CpuAndWallTimeWorker if we call enable on a tracepoint that is already + // enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details. + + struct cpu_and_wall_time_worker_state *old_state; + TypedData_Get_Struct(active_sampler_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, old_state); + rb_tracepoint_disable(old_state->gc_tracepoint); + } } // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 101afa5eadb..cd2adad846a 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -204,6 +204,24 @@ try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(another_instance) } end end + + it 'disables the existing gc_tracepoint before starting another CpuAndWallTimeWorker' do + start + + expect_in_fork do + another_instance = described_class.new( + recorder: Datadog::Profiling::StackRecorder.new, + max_frames: 400, + tracer: nil, + gc_profiling_enabled: gc_profiling_enabled, + ) + another_instance.start + + try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(another_instance) } + + expect(described_class::Testing._native_gc_tracepoint(cpu_and_wall_time_worker)).to_not be_enabled + end + end end end