Skip to content

Commit

Permalink
Merge pull request #2357 from DataDog/ivoanjo/prof-5942-ractor-support
Browse files Browse the repository at this point in the history
[PROF-5942] Disallow sampling ractors in profiler
  • Loading branch information
ivoanjo authored Nov 10, 2022
2 parents 97885d7 + 6e3d2e9 commit 72a7907
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_
// Assumption 2: This function is allowed to raise exceptions. Caller is responsible for handling them, if needed.
// Assumption 3: This function IS NOT called from a signal handler. This function is not async-signal-safe.
// Assumption 4: This function IS NOT called in a reentrant way.
// Assumption 5: This function is called from the main Ractor (if Ruby has support for Ractors).
VALUE cpu_and_wall_time_collector_sample(VALUE self_instance) {
struct cpu_and_wall_time_collector_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
Expand Down Expand Up @@ -384,6 +385,7 @@ VALUE cpu_and_wall_time_collector_sample(VALUE self_instance) {
// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)!
//
// Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
// Assumption 2: This function is called from the main Ractor (if Ruby has support for Ractors).
void cpu_and_wall_time_collector_on_gc_start(VALUE self_instance) {
struct cpu_and_wall_time_collector_state *state;
if (!rb_typeddata_is_kind_of(self_instance, &cpu_and_wall_time_collector_typed_data)) return;
Expand Down Expand Up @@ -428,6 +430,7 @@ void cpu_and_wall_time_collector_on_gc_start(VALUE self_instance) {
// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)!
//
// Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
// Assumption 2: This function is called from the main Ractor (if Ruby has support for Ractors).
void cpu_and_wall_time_collector_on_gc_finish(VALUE self_instance) {
struct cpu_and_wall_time_collector_state *state;
if (!rb_typeddata_is_kind_of(self_instance, &cpu_and_wall_time_collector_typed_data)) return;
Expand Down Expand Up @@ -464,6 +467,7 @@ void cpu_and_wall_time_collector_on_gc_finish(VALUE self_instance) {
// Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
// Assumption 2: This function is allowed to raise exceptions. Caller is responsible for handling them, if needed.
// Assumption 3: Unlike `on_gc_start` and `on_gc_finish`, this method is allowed to allocate memory as needed.
// Assumption 4: This function is called from the main Ractor (if Ruby has support for Ractors).
VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance) {
struct cpu_and_wall_time_collector_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance);
static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused);
static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self);
static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self);

// Global state -- be very careful when accessing or modifying it

Expand Down Expand Up @@ -142,6 +144,8 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
rb_define_singleton_method(testing_module, "_native_remove_testing_signal_handler", _native_remove_testing_signal_handler, 0);
rb_define_singleton_method(testing_module, "_native_trigger_sample", _native_trigger_sample, 0);
rb_define_singleton_method(testing_module, "_native_gc_tracepoint", _native_gc_tracepoint, 1);
rb_define_singleton_method(testing_module, "_native_simulate_handle_sampling_signal", _native_simulate_handle_sampling_signal, 0);
rb_define_singleton_method(testing_module, "_native_simulate_sample_from_postponed_job", _native_simulate_sample_from_postponed_job, 0);
}

// This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
Expand Down Expand Up @@ -301,6 +305,9 @@ static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED si
if (!ruby_thread_has_gvl_p()) {
return; // Not safe to enqueue a sample from this thread
}
if (!ddtrace_rb_ractor_main_p()) {
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
}

// We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
// a) we get triggered using SIGPROF, and the docs state second SIGPROF will not interrupt an existing one
Expand Down Expand Up @@ -348,6 +355,12 @@ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
// This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
if (instance == Qnil) return;

// @ivoanjo: I'm not sure this can ever happen because `handle_sampling_signal` only enqueues this callback if
// it's running on the main Ractor, but just in case...
if (!ddtrace_rb_ractor_main_p()) {
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
}

struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

Expand Down Expand Up @@ -453,6 +466,10 @@ static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance) {
// *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation.
// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)!
static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
if (!ddtrace_rb_ractor_main_p()) {
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
}

int event = rb_tracearg_event_flag(rb_tracearg_from_tracepoint(tracepoint_data));
if (event != RUBY_INTERNAL_EVENT_GC_ENTER && event != RUBY_INTERNAL_EVENT_GC_EXIT) return; // Unknown event

Expand Down Expand Up @@ -498,6 +515,12 @@ static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
// This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
if (instance == Qnil) return;

// @ivoanjo: I'm not sure this can ever happen because `on_gc_event` only enqueues this callback if
// it's running on the main Ractor, but just in case...
if (!ddtrace_rb_ractor_main_p()) {
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
}

struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

Expand All @@ -518,3 +541,17 @@ static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_
0 // Required by API to be the last argument
);
}

// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
// It SHOULD NOT be used for other purposes.
static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self) {
handle_sampling_signal(0, NULL, NULL);
return Qtrue;
}

// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
// It SHOULD NOT be used for other purposes.
static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self) {
sample_from_postponed_job(NULL);
return Qtrue;
}
4 changes: 2 additions & 2 deletions lib/datadog/core/configuration/settings.rb
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,8 @@ def initialize(*_)
# being automatically enabled in the future.
#
# This toggle was added because, although this feature is safe and enabled by default on Ruby 2.x,
# on Ruby 3.x it can break in applications that make use of Ractors due to a Ruby VM bug
# (https://bugs.ruby-lang.org/issues/19112).
# on Ruby 3.x it can break in applications that make use of Ractors due to two Ruby VM bugs:
# https://bugs.ruby-lang.org/issues/19112 AND https://bugs.ruby-lang.org/issues/18464.
#
# If you use Ruby 3.x and your application does not use Ractors (or if your Ruby has been patched), the
# feature is fully safe to enable and this toggle can be used to do so.
Expand Down
64 changes: 64 additions & 0 deletions spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,70 @@
end
end

describe 'Ractor safety' do
before do
skip 'Behavior does not apply to current Ruby version' if RUBY_VERSION < '3.'

# See native_extension_spec.rb for more details on the issues we saw on 3.0
skip 'Ruby 3.0 Ractors are too buggy to run this spec' if RUBY_VERSION.start_with?('3.0.')
end

shared_examples_for 'does not trigger a sample' do |run_ractor|
it 'does not trigger a sample' do
cpu_and_wall_time_worker.start
wait_until_running

run_ractor.call

cpu_and_wall_time_worker.stop

serialization_result = recorder.serialize
raise 'Unexpected: Serialization failed' unless serialization_result

samples_from_ractor =
samples_from_pprof(serialization_result.last)
.select { |it| it.fetch(:labels)[:'thread name'] == 'background ractor' }

expect(samples_from_ractor).to be_empty
end
end

context 'when called from a background ractor' do
# Even though we're not testing it explicitly, the GC profiling hooks can sometimes be called when running these
# specs. Unfortunately, there's a VM crash in that case as well -- https://bugs.ruby-lang.org/issues/18464 --
# so this must be disabled when interacting with Ractors.
let(:gc_profiling_enabled) { false }

describe 'handle_sampling_signal' do
include_examples 'does not trigger a sample',
(
proc do
Ractor.new do
Thread.current.name = 'background ractor'
Datadog::Profiling::Collectors::CpuAndWallTimeWorker::Testing._native_simulate_handle_sampling_signal
end.take
end
)
end

describe 'sample_from_postponed_job' do
include_examples 'does not trigger a sample',
(
proc do
Ractor.new do
Thread.current.name = 'background ractor'
Datadog::Profiling::Collectors::CpuAndWallTimeWorker::Testing._native_simulate_sample_from_postponed_job
end.take
end
)
end

# @ivoanjo: I initially tried to also test the GC callbacks, but it gets a bit hacky to force the thread
# context creation for the ractors, and then simulate a GC. (For instance -- how to prevent against the context
# creation running in parallel with a regular sample?)
end
end

describe '#stop' do
subject(:stop) { cpu_and_wall_time_worker.stop }

Expand Down

0 comments on commit 72a7907

Please sign in to comment.