Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PROF-5942] Disallow sampling ractors in profiler #2357

Merged
merged 2 commits into from
Nov 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_
// Assumption 2: This function is allowed to raise exceptions. Caller is responsible for handling them, if needed.
// Assumption 3: This function IS NOT called from a signal handler. This function is not async-signal-safe.
// Assumption 4: This function IS NOT called in a reentrant way.
// Assumption 5: This function is called from the main Ractor (if Ruby has support for Ractors).
VALUE cpu_and_wall_time_collector_sample(VALUE self_instance) {
struct cpu_and_wall_time_collector_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
Expand Down Expand Up @@ -384,6 +385,7 @@ VALUE cpu_and_wall_time_collector_sample(VALUE self_instance) {
// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)!
//
// Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
// Assumption 2: This function is called from the main Ractor (if Ruby has support for Ractors).
void cpu_and_wall_time_collector_on_gc_start(VALUE self_instance) {
struct cpu_and_wall_time_collector_state *state;
if (!rb_typeddata_is_kind_of(self_instance, &cpu_and_wall_time_collector_typed_data)) return;
Expand Down Expand Up @@ -428,6 +430,7 @@ void cpu_and_wall_time_collector_on_gc_start(VALUE self_instance) {
// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)!
//
// Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
// Assumption 2: This function is called from the main Ractor (if Ruby has support for Ractors).
void cpu_and_wall_time_collector_on_gc_finish(VALUE self_instance) {
struct cpu_and_wall_time_collector_state *state;
if (!rb_typeddata_is_kind_of(self_instance, &cpu_and_wall_time_collector_typed_data)) return;
Expand Down Expand Up @@ -464,6 +467,7 @@ void cpu_and_wall_time_collector_on_gc_finish(VALUE self_instance) {
// Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
// Assumption 2: This function is allowed to raise exceptions. Caller is responsible for handling them, if needed.
// Assumption 3: Unlike `on_gc_start` and `on_gc_finish`, this method is allowed to allocate memory as needed.
// Assumption 4: This function is called from the main Ractor (if Ruby has support for Ractors).
VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance) {
struct cpu_and_wall_time_collector_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_collector_state, &cpu_and_wall_time_collector_typed_data, state);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance);
static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused);
static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self);
static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self);

// Global state -- be very careful when accessing or modifying it

Expand Down Expand Up @@ -142,6 +144,8 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
rb_define_singleton_method(testing_module, "_native_remove_testing_signal_handler", _native_remove_testing_signal_handler, 0);
rb_define_singleton_method(testing_module, "_native_trigger_sample", _native_trigger_sample, 0);
rb_define_singleton_method(testing_module, "_native_gc_tracepoint", _native_gc_tracepoint, 1);
rb_define_singleton_method(testing_module, "_native_simulate_handle_sampling_signal", _native_simulate_handle_sampling_signal, 0);
rb_define_singleton_method(testing_module, "_native_simulate_sample_from_postponed_job", _native_simulate_sample_from_postponed_job, 0);
}

// This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
Expand Down Expand Up @@ -301,6 +305,9 @@ static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED si
if (!ruby_thread_has_gvl_p()) {
return; // Not safe to enqueue a sample from this thread
}
if (!ddtrace_rb_ractor_main_p()) {
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
}

// We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
// a) we get triggered using SIGPROF, and the docs state second SIGPROF will not interrupt an existing one
Expand Down Expand Up @@ -348,6 +355,12 @@ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
// This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
if (instance == Qnil) return;

// @ivoanjo: I'm not sure this can ever happen because `handle_sampling_signal` only enqueues this callback if
// it's running on the main Ractor, but just in case...
if (!ddtrace_rb_ractor_main_p()) {
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
}

struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

Expand Down Expand Up @@ -453,6 +466,10 @@ static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance) {
// *NO ALLOCATION* is allowed. This function, and any it calls must never trigger memory or object allocation.
// This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)!
static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
if (!ddtrace_rb_ractor_main_p()) {
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
}

int event = rb_tracearg_event_flag(rb_tracearg_from_tracepoint(tracepoint_data));
if (event != RUBY_INTERNAL_EVENT_GC_ENTER && event != RUBY_INTERNAL_EVENT_GC_EXIT) return; // Unknown event

Expand Down Expand Up @@ -498,6 +515,12 @@ static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
// This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
if (instance == Qnil) return;

// @ivoanjo: I'm not sure this can ever happen because `on_gc_event` only enqueues this callback if
// it's running on the main Ractor, but just in case...
if (!ddtrace_rb_ractor_main_p()) {
return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
}

struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

Expand All @@ -518,3 +541,17 @@ static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_
0 // Required by API to be the last argument
);
}

// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
// It SHOULD NOT be used for other purposes.
static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self) {
handle_sampling_signal(0, NULL, NULL);
return Qtrue;
}

// This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
// It SHOULD NOT be used for other purposes.
static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self) {
sample_from_postponed_job(NULL);
return Qtrue;
}
4 changes: 2 additions & 2 deletions lib/datadog/core/configuration/settings.rb
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,8 @@ def initialize(*_)
# being automatically enabled in the future.
#
# This toggle was added because, although this feature is safe and enabled by default on Ruby 2.x,
# on Ruby 3.x it can break in applications that make use of Ractors due to a Ruby VM bug
# (https://bugs.ruby-lang.org/issues/19112).
# on Ruby 3.x it can break in applications that make use of Ractors due to two Ruby VM bugs:
# https://bugs.ruby-lang.org/issues/19112 AND https://bugs.ruby-lang.org/issues/18464.
#
# If you use Ruby 3.x and your application does not use Ractors (or if your Ruby has been patched), the
# feature is fully safe to enable and this toggle can be used to do so.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,70 @@
end
end

describe 'Ractor safety' do
before do
skip 'Behavior does not apply to current Ruby version' if RUBY_VERSION < '3.'

# See native_extension_spec.rb for more details on the issues we saw on 3.0
skip 'Ruby 3.0 Ractors are too buggy to run this spec' if RUBY_VERSION.start_with?('3.0.')
end

shared_examples_for 'does not trigger a sample' do |run_ractor|
it 'does not trigger a sample' do
cpu_and_wall_time_worker.start
wait_until_running

run_ractor.call

cpu_and_wall_time_worker.stop

serialization_result = recorder.serialize
raise 'Unexpected: Serialization failed' unless serialization_result

samples_from_ractor =
samples_from_pprof(serialization_result.last)
.select { |it| it.fetch(:labels)[:'thread name'] == 'background ractor' }

expect(samples_from_ractor).to be_empty
end
end

context 'when called from a background ractor' do
# Even though we're not testing it explicitly, the GC profiling hooks can sometimes be called when running these
# specs. Unfortunately, there's a VM crash in that case as well -- https://bugs.ruby-lang.org/issues/18464 --
# so this must be disabled when interacting with Ractors.
let(:gc_profiling_enabled) { false }

describe 'handle_sampling_signal' do
include_examples 'does not trigger a sample',
(
proc do
Ractor.new do
Thread.current.name = 'background ractor'
Datadog::Profiling::Collectors::CpuAndWallTimeWorker::Testing._native_simulate_handle_sampling_signal
end.take
end
)
end

describe 'sample_from_postponed_job' do
include_examples 'does not trigger a sample',
(
proc do
Ractor.new do
Thread.current.name = 'background ractor'
Datadog::Profiling::Collectors::CpuAndWallTimeWorker::Testing._native_simulate_sample_from_postponed_job
end.take
end
)
end

# @ivoanjo: I initially tried to also test the GC callbacks, but it gets a bit hacky to force the thread
# context creation for the ractors, and then simulate a GC. (For instance -- how to prevent against the context
# creation running in parallel with a regular sample?)
end
end

describe '#stop' do
subject(:stop) { cpu_and_wall_time_worker.stop }

Expand Down