Skip to content

Commit

Permalink
Merge pull request #2359 from DataDog/ivoanjo/prof-5943-handle-vm-for…
Browse files Browse the repository at this point in the history
…king

[PROF-5953] Ruby VM forking fixes for new profiler
  • Loading branch information
ivoanjo authored Nov 10, 2022
2 parents 4a58009 + eb8fcb6 commit af002cd
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ static VALUE _native_initialize(
static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
static VALUE _native_sampling_loop(VALUE self, VALUE instance);
static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
static VALUE stop(VALUE self_instance, VALUE optional_exception);
static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *));
static void remove_sigprof_signal_handler(void);
static void block_sigprof_signal_handler_from_running_in_current_thread(void);
Expand Down Expand Up @@ -205,11 +206,25 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

if (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread)) {
rb_raise(
rb_eRuntimeError,
"Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
);
if (active_sampler_owner_thread != Qnil) {
if (is_thread_alive(active_sampler_owner_thread)) {
rb_raise(
rb_eRuntimeError,
"Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
);
} else {
// The previously active thread seems to have died without cleaning up after itself.
// In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing GC tracepoint
// first as:
// a) If this is a new instance of the CpuAndWallTimeWorker, we don't want the tracepoint from the old instance
// being kept around
// b) If this is the same instance of the CpuAndWallTimeWorker if we call enable on a tracepoint that is already
// enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details.

struct cpu_and_wall_time_worker_state *old_state;
TypedData_Get_Struct(active_sampler_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, old_state);
rb_tracepoint_disable(old_state->gc_tracepoint);
}
}

// This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
Expand Down Expand Up @@ -243,10 +258,18 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
}

static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
return stop(self_instance, /* optional_exception: */ Qnil);
}

static VALUE stop(VALUE self_instance, VALUE optional_exception) {
struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

state->should_run = false;
state->failure_exception = optional_exception;

// Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
rb_tracepoint_disable(state->gc_tracepoint);

return Qtrue;
}
Expand Down Expand Up @@ -368,18 +391,7 @@ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
safely_call(cpu_and_wall_time_collector_sample, state->cpu_and_wall_time_collector_instance, instance);
}

static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

state->should_run = false;
state->failure_exception = exception;

// Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
rb_tracepoint_disable(state->gc_tracepoint);

return Qnil;
}
static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) { return stop(self_instance, exception); }

// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
// It SHOULD NOT be used for other purposes.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def initialize(

def start
@start_stop_mutex.synchronize do
return if @worker_thread
return if @worker_thread && @worker_thread.alive?

Datadog.logger.debug { "Starting thread for: #{self}" }
@worker_thread = Thread.new do
Expand Down
46 changes: 46 additions & 0 deletions spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,52 @@
current_thread_gc_samples.inject(0) { |sum, sample| sum + sample.fetch(:values).fetch(:'cpu-samples') }
).to be >= invoke_gc_times
end

context 'when the background thread dies without cleaning up (after Ruby forks)' do
it 'allows the CpuAndWallTimeWorker to be restarted' do
start

expect_in_fork do
cpu_and_wall_time_worker.start
wait_until_running
end
end

it 'allows a different instance of the CpuAndWallTimeWorker to be started' do
start

expect_in_fork do
another_instance = described_class.new(
recorder: Datadog::Profiling::StackRecorder.new,
max_frames: 400,
tracer: nil,
gc_profiling_enabled: gc_profiling_enabled,
)
another_instance.start

try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(another_instance) }
end
end

it 'disables the existing gc_tracepoint before starting another CpuAndWallTimeWorker' do
start

expect_in_fork do
another_instance = described_class.new(
recorder: Datadog::Profiling::StackRecorder.new,
max_frames: 400,
tracer: nil,
gc_profiling_enabled: gc_profiling_enabled,
)
another_instance.start

try_wait_until(backoff: 0.01) { described_class::Testing._native_is_running?(another_instance) }

expect(described_class::Testing._native_gc_tracepoint(cpu_and_wall_time_worker)).to_not be_enabled
expect(described_class::Testing._native_gc_tracepoint(another_instance)).to be_enabled
end
end
end
end

describe 'Ractor safety' do
Expand Down

0 comments on commit af002cd

Please sign in to comment.