Skip to content

Commit

Permalink
[PROF-7307] Wire up allocation sampling into CpuAndWallTimeWorker
Browse files Browse the repository at this point in the history
**What does this PR do?**

In #2657 we added support for the `Collectors::ThreadContext` to
sample a given object allocation.

But until now, this kind of allocation sampling was only ever triggered
from our test suite.

This PR adds the missing part: actually sampling objects as Ruby
allocates them.

In the `Collectors::CpuAndWallTimeWorker`, we already
had a `TracePoint` that is called on every new object allocation
(`RUBY_INTERNAL_EVENT_NEWOBJ`), but we were only using it to
count allocations.

This PR changes the `TracePoint` code to actually call into the
`Collectors::ThreadContext` and thus allow allocation sampling to
happen.

**Motivation:**

This PR is another step towards the allocation profiling feature.

**Additional Notes:**

It's not yet possible to turn on allocation profiling via configuration.

I am reserving that change for later. But it's easy to do:
change the `profiling/component.rb` to use `alloc_samples_enabled: true`
and `allocation_sample_every: 1`.

This will result in a big overhead impact on the application. I'll be
working on reducing it in later PRs.

In particular, the sampling approach we're using
(`allocation_sample_every`), is only a placeholder; it will get
replaced before we get to beta. This is why I don't want to add it
as a real user-available setting -- because it's temporary.

**How to test the change?**

This change includes test coverage. You can also manually check it out,
by following the instructions in the "Additional Notes".

(There's a feature flag needed for this data to show up in the
Datadog UX).
  • Loading branch information
ivoanjo committed Sep 1, 2023
1 parent 737db7b commit 0128298
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ struct cpu_and_wall_time_worker_state {
bool allocation_counting_enabled;
bool no_signals_workaround_enabled;
bool dynamic_sampling_rate_enabled;
int allocation_sample_every; // Temporarily used for development/testing of allocation profiling
VALUE self_instance;
VALUE thread_context_collector_instance;
VALUE idle_sampling_helper_instance;
Expand Down Expand Up @@ -150,7 +151,8 @@ static VALUE _native_initialize(
VALUE idle_sampling_helper_instance,
VALUE allocation_counting_enabled,
VALUE no_signals_workaround_enabled,
VALUE dynamic_sampling_rate_enabled
VALUE dynamic_sampling_rate_enabled,
VALUE allocation_sample_every
);
static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
static VALUE _native_sampling_loop(VALUE self, VALUE instance);
Expand Down Expand Up @@ -183,9 +185,10 @@ static void grab_gvl_and_sample(void);
static void reset_stats(struct cpu_and_wall_time_worker_state *state);
static void sleep_for(uint64_t time_ns);
static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self);
static void on_newobj_event(DDTRACE_UNUSED VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state);
static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self);
static VALUE rescued_sample_allocation(VALUE tracepoint_data);

// Note on sampler global state safety:
//
Expand Down Expand Up @@ -223,7 +226,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
// https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);

rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 7);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 8);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
Expand Down Expand Up @@ -261,6 +264,7 @@ static VALUE _native_new(VALUE klass) {
state->allocation_counting_enabled = false;
state->no_signals_workaround_enabled = false;
state->dynamic_sampling_rate_enabled = true;
state->allocation_sample_every = 0;
state->thread_context_collector_instance = Qnil;
state->idle_sampling_helper_instance = Qnil;
state->owner_thread = Qnil;
Expand All @@ -287,12 +291,14 @@ static VALUE _native_initialize(
VALUE idle_sampling_helper_instance,
VALUE allocation_counting_enabled,
VALUE no_signals_workaround_enabled,
VALUE dynamic_sampling_rate_enabled
VALUE dynamic_sampling_rate_enabled,
VALUE allocation_sample_every
) {
ENFORCE_BOOLEAN(gc_profiling_enabled);
ENFORCE_BOOLEAN(allocation_counting_enabled);
ENFORCE_BOOLEAN(no_signals_workaround_enabled);
ENFORCE_BOOLEAN(dynamic_sampling_rate_enabled);
ENFORCE_TYPE(allocation_sample_every, T_FIXNUM);

struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
Expand All @@ -301,6 +307,12 @@ static VALUE _native_initialize(
state->allocation_counting_enabled = (allocation_counting_enabled == Qtrue);
state->no_signals_workaround_enabled = (no_signals_workaround_enabled == Qtrue);
state->dynamic_sampling_rate_enabled = (dynamic_sampling_rate_enabled == Qtrue);
state->allocation_sample_every = NUM2INT(allocation_sample_every);

if (state->allocation_sample_every < 0) {
rb_raise(rb_eArgError, "Unexpected value for allocation_sample_every: %d. This value must be >= 0.", state->allocation_sample_every);
}

state->thread_context_collector_instance = enforce_thread_context_collector_instance(thread_context_collector_instance);
state->idle_sampling_helper_instance = idle_sampling_helper_instance;
state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);
Expand Down Expand Up @@ -880,7 +892,7 @@ static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self) {

// Implements memory-related profiling events. This function is called by Ruby via the `object_allocation_tracepoint`
// when the RUBY_INTERNAL_EVENT_NEWOBJ event is triggered.
static void on_newobj_event(DDTRACE_UNUSED VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
// Update thread-local allocation count
if (RB_UNLIKELY(allocation_count == UINT64_MAX)) {
allocation_count = 0;
Expand All @@ -907,7 +919,12 @@ static void on_newobj_event(DDTRACE_UNUSED VALUE tracepoint_data, DDTRACE_UNUSED
// defined as not being able to allocate) sets this.
state->during_sample = true;

// TODO: Sampling goes here (calling into `thread_context_collector_sample_allocation`)
// TODO: This is a placeholder sampling decision strategy. We plan to replace it with a better one soon (e.g. before
// beta), and having something here allows us to test the rest of feature, sampling decision aside.
if (state->allocation_sample_every > 0 && ((allocation_count % state->allocation_sample_every) == 0)) {
// Rescue against any exceptions that happen during sampling
safely_call(rescued_sample_allocation, tracepoint_data, state->self_instance);
}

state->during_sample = false;
}
Expand All @@ -929,3 +946,18 @@ static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self) {
return result;
}
}

// Samples the object that was just allocated, as reported by the NEWOBJ tracepoint. This function runs inside
// `safely_call` (backed by `rb_rescue2`, see the caller `on_newobj_event`), so any Ruby exception raised while
// sampling is caught there instead of propagating into the allocation path.
static VALUE rescued_sample_allocation(VALUE tracepoint_data) {
  // Read from global variable, see "sampler global state safety" note above
  struct cpu_and_wall_time_worker_state *state = active_sampler_instance_state;

  // The caller (on_newobj_event) already validated this before invoking us, but let's be defensive anyway
  if (state == NULL) return Qnil;

  VALUE allocated_object = rb_tracearg_object(rb_tracearg_from_tracepoint(tracepoint_data));

  thread_context_collector_sample_allocation(state->thread_context_collector_instance, state->allocation_sample_every, allocated_object);

  // rb_rescue2 requires us to return a VALUE; we have nothing meaningful to report, so Qnil it is
  return Qnil;
}
11 changes: 10 additions & 1 deletion lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,22 @@ def initialize(
idle_sampling_helper: IdleSamplingHelper.new,
# **NOTE**: This should only be used for testing; disabling the dynamic sampling rate will increase the
# profiler overhead!
dynamic_sampling_rate_enabled: true
dynamic_sampling_rate_enabled: true,
allocation_sample_every: 0 # Currently only for testing; Setting this to > 0 can add a lot of overhead!
)
unless dynamic_sampling_rate_enabled
Datadog.logger.warn(
'Profiling dynamic sampling rate disabled. This should only be used for testing, and will increase overhead!'
)
end

if allocation_counting_enabled && allocation_sample_every > 0
Datadog.logger.warn(
"Enabled experimental allocation profiling: allocation_sample_every=#{allocation_sample_every}. This is " \
'experimental, not recommended, and will increase overhead!'
)
end

self.class._native_initialize(
self,
thread_context_collector,
Expand All @@ -37,6 +45,7 @@ def initialize(
allocation_counting_enabled,
no_signals_workaround_enabled,
dynamic_sampling_rate_enabled,
allocation_sample_every,
)
@worker_thread = nil
@failure_exception = nil
Expand Down
1 change: 1 addition & 0 deletions lib/datadog/profiling/component.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
allocation_counting_enabled: settings.profiling.advanced.allocation_counting_enabled,
no_signals_workaround_enabled: no_signals_workaround_enabled,
thread_context_collector: thread_context_collector,
allocation_sample_every: 0,
)
else
load_pprof_support
Expand Down
2 changes: 2 additions & 0 deletions sig/datadog/profiling/collectors/cpu_and_wall_time_worker.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module Datadog
thread_context_collector: Datadog::Profiling::Collectors::ThreadContext,
?idle_sampling_helper: Datadog::Profiling::Collectors::IdleSamplingHelper,
?dynamic_sampling_rate_enabled: bool,
?allocation_sample_every: Integer,
) -> void

def self._native_initialize: (
Expand All @@ -24,6 +25,7 @@ module Datadog
bool allocation_counting_enabled,
bool no_signals_workaround_enabled,
bool dynamic_sampling_rate_enabled,
::Integer allocation_sample_every,
) -> true

def start: () -> bool?
Expand Down
48 changes: 48 additions & 0 deletions spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,54 @@
expect(stats.fetch(:trigger_sample_attempts)).to eq(stats.fetch(:postponed_job_success))
end
end

context 'when allocation sampling is enabled' do
let(:options) { { allocation_sample_every: 1 } }

before do
allow(Datadog.logger).to receive(:warn)
end

it 'logs a warning message mentioning this is experimental' do
expect(Datadog.logger).to receive(:warn).with(/Enabled experimental allocation profiling/)

start
end

it 'records allocated objects' do
stub_const('CpuAndWallTimeWorkerSpec::TestStruct', Struct.new(:foo))

start

123.times { CpuAndWallTimeWorkerSpec::TestStruct.new }
allocation_line = __LINE__ - 1

cpu_and_wall_time_worker.stop

allocation_sample =
samples_for_thread(samples_from_pprof(recorder.serialize!), Thread.current)
.find { |s| s.labels[:'allocation class'] == 'CpuAndWallTimeWorkerSpec::TestStruct' }

expect(allocation_sample.values).to include(:'alloc-samples' => 123)
expect(allocation_sample.locations.first.lineno).to eq allocation_line
end
end

context 'when allocation sampling is disabled' do
let(:options) { { allocation_sample_every: 0 } }

it 'does not record allocations' do
stub_const('CpuAndWallTimeWorkerSpec::TestStruct', Struct.new(:foo))

start

123.times { CpuAndWallTimeWorkerSpec::TestStruct.new }

cpu_and_wall_time_worker.stop

expect(samples_from_pprof(recorder.serialize!).map(&:values)).to all(include(:'alloc-samples' => 0))
end
end
end

describe 'Ractor safety' do
Expand Down
1 change: 1 addition & 0 deletions spec/datadog/profiling/component_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@
allocation_counting_enabled: anything,
no_signals_workaround_enabled: :no_signals_result,
thread_context_collector: instance_of(Datadog::Profiling::Collectors::ThreadContext),
allocation_sample_every: 0,
)

build_profiler_component
Expand Down

0 comments on commit 0128298

Please sign in to comment.