diff --git a/ext/ddtrace_profiling_native_extension/collectors_stack.c b/ext/ddtrace_profiling_native_extension/collectors_stack.c index e6ca4114a5..83b427128f 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_stack.c +++ b/ext/ddtrace_profiling_native_extension/collectors_stack.c @@ -22,6 +22,7 @@ typedef struct sampling_buffer { static VALUE _native_sample(VALUE self, VALUE thread, VALUE recorder_instance, VALUE metric_values_hash, VALUE labels_array, VALUE max_frames); void sample(VALUE thread, sampling_buffer* buffer, VALUE recorder_instance, ddprof_ffi_Slice_i64 metric_values, ddprof_ffi_Slice_label labels); +void record_placeholder_stack_in_native_code(VALUE recorder_instance, ddprof_ffi_Slice_i64 metric_values, ddprof_ffi_Slice_label labels); sampling_buffer *sampling_buffer_new(int max_frames); void sampling_buffer_free(sampling_buffer *buffer); @@ -111,6 +112,11 @@ void sample(VALUE thread, sampling_buffer* buffer, VALUE recorder_instance, ddpr VALUE last_ruby_frame = Qnil; int last_ruby_line = 0; + if (captured_frames == PLACEHOLDER_STACK_IN_NATIVE_CODE) { + record_placeholder_stack_in_native_code(recorder_instance, metric_values, labels); + return; + } + for (int i = captured_frames - 1; i >= 0; i--) { VALUE name, filename; int line; @@ -163,6 +169,47 @@ void sample(VALUE thread, sampling_buffer* buffer, VALUE recorder_instance, ddpr ); } +// Our custom rb_profile_frames returning PLACEHOLDER_STACK_IN_NATIVE_CODE is equivalent to when the +// Ruby `Thread#backtrace` API returns an empty array: we know that a thread is alive but we don't know what it's doing: +// +// 1. It can be starting up +// ``` +// > Thread.new { sleep }.backtrace +// => [] # <-- note the thread hasn't actually started running sleep yet, we got there first +// ``` +// 2. It can be running native code +// ``` +// > t = Process.detach(fork { sleep }) +// => # +// > t.backtrace +// => [] # <-- this can happen even minutes later, e.g. it's not a race as in 1. +// ``` +// This effect has been observed in threads created by the Iodine web server and the ffi gem, +// see for instance https://github.com/ffi/ffi/pull/883 and https://github.com/DataDog/dd-trace-rb/pull/1719 . +// +// To give customers visibility into these threads, rather than reporting an empty stack, we replace the empty stack +// with one containing a placeholder frame, so that these threads are properly represented in the UX. +void record_placeholder_stack_in_native_code(VALUE recorder_instance, ddprof_ffi_Slice_i64 metric_values, ddprof_ffi_Slice_label labels) { + ddprof_ffi_Line placeholder_stack_in_native_code_line = { + .function = (ddprof_ffi_Function) { + .name = DDPROF_FFI_CHARSLICE_C(""), + .filename = DDPROF_FFI_CHARSLICE_C("In native code") + }, + .line = 0 + }; + ddprof_ffi_Location placeholder_stack_in_native_code_location = + {.lines = (ddprof_ffi_Slice_line) {.ptr = &placeholder_stack_in_native_code_line, .len = 1}}; + + record_sample( + recorder_instance, + (ddprof_ffi_Sample) { + .locations = (ddprof_ffi_Slice_location) {.ptr = &placeholder_stack_in_native_code_location, .len = 1}, + .values = metric_values, + .labels = labels, + } + ); +} + sampling_buffer *sampling_buffer_new(int max_frames) { sampling_buffer* buffer = xcalloc(1, sizeof(sampling_buffer)); if (buffer == NULL) rb_raise(rb_eNoMemError, "Failed to allocate memory for sampling buffer"); diff --git a/ext/ddtrace_profiling_native_extension/private_vm_api_access.c b/ext/ddtrace_profiling_native_extension/private_vm_api_access.c index ae834aba58..9be16b929e 100644 --- a/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +++ b/ext/ddtrace_profiling_native_extension/private_vm_api_access.c @@ -151,6 +151,9 @@ calc_lineno(const rb_iseq_t *iseq, const VALUE *pc) // * Add `end_cfp == NULL` and `end_cfp <= cfp` safety checks. These are used in a bunch of places in // `vm_backtrace.c` (`backtrace_each`, `backtrace_size`, `rb_ec_partial_backtrace_object`) but are conspicuously // absent from `rb_profile_frames`. Oversight? +// * Distinguish between `end_cfp == NULL` (dead thread or some other error, returns 0) and `end_cfp <= cfp` +// (alive thread which may just be executing native code and has not pushed anything on the Ruby stack, returns +// PLACEHOLDER_STACK_IN_NATIVE_CODE). See comments on `record_placeholder_stack_in_native_code` for more details. // // **IMPORTANT: WHEN CHANGING THIS FUNCTION, CONSIDER IF THE SAME CHANGE ALSO NEEDS TO BE MADE TO THE VARIANT FOR // RUBY 2.2 AND BELOW WHICH IS ALSO PRESENT ON THIS FILE** @@ -205,7 +208,8 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, i // are computed in a different way, so the two calls really are equivalent to one here. end_cfp = RUBY_VM_NEXT_CONTROL_FRAME(end_cfp); - if (end_cfp <= cfp) return 0; + // See comment on `record_placeholder_stack_in_native_code` for a full explanation of what this means (and why we don't just return 0) + if (end_cfp <= cfp) return PLACEHOLDER_STACK_IN_NATIVE_CODE; for (i=0; iiseq && cfp->pc) { /* should be NORMAL_ISEQ */ diff --git a/ext/ddtrace_profiling_native_extension/private_vm_api_access.h b/ext/ddtrace_profiling_native_extension/private_vm_api_access.h index 0426abdfe2..68c03c21c2 100644 --- a/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +++ b/ext/ddtrace_profiling_native_extension/private_vm_api_access.h @@ -27,3 +27,6 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, i #else // Ruby > 3.0, just use the stock functionality #define ddtrace_rb_profile_frame_method_name rb_profile_frame_method_name #endif + +// See comment on `record_placeholder_stack_in_native_code` for a full explanation of what this means (and why we don't just return 0) +#define PLACEHOLDER_STACK_IN_NATIVE_CODE -1 diff --git a/spec/datadog/profiling/collectors/stack_spec.rb b/spec/datadog/profiling/collectors/stack_spec.rb index 969c07d721..c067631c2b 100644 --- a/spec/datadog/profiling/collectors/stack_spec.rb +++ b/spec/datadog/profiling/collectors/stack_spec.rb @@ -97,6 +97,56 @@ end end + + context 'when sampling a thread with empty locations' do + let(:ready_pipe) { IO.pipe } + let(:finish_pipe) { IO.pipe } + + let(:thread_with_empty_locations) do + read_ready_pipe, write_ready_pipe = ready_pipe + read_finish_pipe, write_finish_pipe = finish_pipe + + Process.detach( + fork do + # Signal ready to parent + read_ready_pipe.close + write_ready_pipe.write('ready') + write_ready_pipe.close + + # Wait for parent to signal we can exit + write_finish_pipe.close + read_finish_pipe.read + read_finish_pipe.close + end + ) + end + + before do + thread_with_empty_locations + + # Wait for child to signal ready + read_ready_pipe, write_ready_pipe = ready_pipe + write_ready_pipe.close + expect(read_ready_pipe.read).to eq 'ready' + read_ready_pipe.close + + expect(reference_stack).to be_empty + end + + after do + # Signal child to exit + finish_pipe.map(&:close) + + thread_with_empty_locations.join + end + + let!(:stacks) { {reference: thread_with_empty_locations.backtrace_locations, gathered: sample_and_decode(thread_with_empty_locations)} } + + it 'gathers a one-element stack with a "In native code" placeholder' do + expect(gathered_stack).to contain_exactly({base_label: '', path: 'In native code', lineno: 0}) + end + end + def sample_and_decode(thread) collectors_stack.sample(thread, recorder, metric_values, labels)