Skip to content

Commit

Permalink
Merge pull request #2635 from DataDog/ivoanjo/prof-7252-profiling-new…
Browse files Browse the repository at this point in the history
…obj-tracepoint

[PROF-7252] Add `Profiling.allocation_count` API for new profiler
  • Loading branch information
ivoanjo authored Feb 22, 2023
2 parents f53e993 + 40100b6 commit 4c0efe4
Show file tree
Hide file tree
Showing 9 changed files with 283 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ struct cpu_and_wall_time_worker_state {
atomic_bool should_run;

bool gc_profiling_enabled;
bool allocation_counting_enabled;
VALUE self_instance;
VALUE cpu_and_wall_time_collector_instance;
VALUE idle_sampling_helper_instance;
Expand All @@ -95,6 +96,8 @@ struct cpu_and_wall_time_worker_state {
// Used to get gc start/finish information
VALUE gc_tracepoint;

VALUE object_allocation_tracepoint;

struct stats {
// How many times we tried to trigger a sample
unsigned int trigger_sample_attempts;
Expand All @@ -121,7 +124,8 @@ static VALUE _native_initialize(
VALUE self_instance,
VALUE cpu_and_wall_time_collector_instance,
VALUE gc_profiling_enabled,
VALUE idle_sampling_helper_instance
VALUE idle_sampling_helper_instance,
VALUE allocation_counting_enabled
);
static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
static VALUE _native_sampling_loop(VALUE self, VALUE instance);
Expand Down Expand Up @@ -153,6 +157,9 @@ void *simulate_sampling_signal_delivery(DDTRACE_UNUSED void *_unused);
static void grab_gvl_and_sample(void);
static void reset_stats(struct cpu_and_wall_time_worker_state *state);
static void sleep_for(uint64_t time_ns);
static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self);
static void on_newobj_event(DDTRACE_UNUSED VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state);

// Note on sampler global state safety:
//
Expand All @@ -163,7 +170,14 @@ static void sleep_for(uint64_t time_ns);
// This global state is needed because a bunch of functions on this file need to access it from situations
// (e.g. signal handler) where it's impossible or just awkward to pass it as an argument.
static VALUE active_sampler_instance = Qnil;
struct cpu_and_wall_time_worker_state *active_sampler_instance_state = NULL;
static struct cpu_and_wall_time_worker_state *active_sampler_instance_state = NULL;

// Used to implement CpuAndWallTimeWorker._native_allocation_count . To be able to use cheap thread-local variables
// (here with `__thread`, see https://gcc.gnu.org/onlinedocs/gcc/Thread-Local.html), this needs to be global.
//
// Carryover of state between profiler instances can happen and is not considered to be a problem -- see expectations for this
// API documented in profiling.rb .
__thread uint64_t allocation_count = 0;

void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
rb_global_variable(&active_sampler_instance);
Expand All @@ -183,11 +197,12 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
// https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);

rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 4);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 5);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stats", _native_stats, 1);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_allocation_count", _native_allocation_count, 0);
rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
Expand Down Expand Up @@ -217,13 +232,15 @@ static VALUE _native_new(VALUE klass) {

atomic_init(&state->should_run, false);
state->gc_profiling_enabled = false;
state->allocation_counting_enabled = false;
state->cpu_and_wall_time_collector_instance = Qnil;
state->idle_sampling_helper_instance = Qnil;
state->owner_thread = Qnil;
dynamic_sampling_rate_init(&state->dynamic_sampling_rate);
state->failure_exception = Qnil;
state->stop_thread = Qnil;
state->gc_tracepoint = Qnil;
state->object_allocation_tracepoint = Qnil;
reset_stats(state);

return state->self_instance = TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
Expand All @@ -234,17 +251,21 @@ static VALUE _native_initialize(
VALUE self_instance,
VALUE cpu_and_wall_time_collector_instance,
VALUE gc_profiling_enabled,
VALUE idle_sampling_helper_instance
VALUE idle_sampling_helper_instance,
VALUE allocation_counting_enabled
) {
ENFORCE_BOOLEAN(gc_profiling_enabled);
ENFORCE_BOOLEAN(allocation_counting_enabled);

struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue);
state->allocation_counting_enabled = (allocation_counting_enabled == Qtrue);
state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
state->idle_sampling_helper_instance = idle_sampling_helper_instance;
state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);
state->object_allocation_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_NEWOBJ, on_newobj_event, NULL /* unused */);

return Qtrue;
}
Expand All @@ -259,6 +280,7 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
rb_gc_mark(state->failure_exception);
rb_gc_mark(state->stop_thread);
rb_gc_mark(state->gc_tracepoint);
rb_gc_mark(state->object_allocation_tracepoint);
}

// Called in a background thread created in CpuAndWallTimeWorker#start
Expand All @@ -275,14 +297,13 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
);
} else {
// The previously active thread seems to have died without cleaning up after itself.
// In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing GC tracepoint
// In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing tracepoint
// first as:
// a) If this is a new instance of the CpuAndWallTimeWorker, we don't want the tracepoint from the old instance
// being kept around
// b) If this is the same instance of the CpuAndWallTimeWorker if we call enable on a tracepoint that is already
// enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details.

rb_tracepoint_disable(old_state->gc_tracepoint);
disable_tracepoints(old_state);
}
}

Expand All @@ -308,7 +329,7 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {

// The sample trigger loop finished (either cleanly or with an error); let's clean up

rb_tracepoint_disable(state->gc_tracepoint);
disable_tracepoints(state);

active_sampler_instance_state = NULL;
active_sampler_instance = Qnil;
Expand Down Expand Up @@ -360,8 +381,8 @@ static VALUE stop(VALUE self_instance, VALUE optional_exception) {
atomic_store(&state->should_run, false);
state->failure_exception = optional_exception;

// Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
rb_tracepoint_disable(state->gc_tracepoint);
// Disable the tracepoints as soon as possible, so the VM doesn't keep on calling them
disable_tracepoints(state);

return Qtrue;
}
Expand Down Expand Up @@ -528,10 +549,11 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

// Final preparations: Setup signal handler and enable tracepoint. We run these here and not in `_native_sampling_loop`
// Final preparations: Setup signal handler and enable tracepoints. We run these here and not in `_native_sampling_loop`
// because they may raise exceptions.
install_sigprof_signal_handler(handle_sampling_signal, "handle_sampling_signal");
if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
if (state->allocation_counting_enabled) rb_tracepoint_enable(state->object_allocation_tracepoint);

rb_thread_call_without_gvl(run_sampling_trigger_loop, state, interrupt_sampling_trigger_loop, state);

Expand Down Expand Up @@ -695,7 +717,7 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE instance)
TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);

// Disable all tracepoints, so that there are no more attempts to mutate the profile
rb_tracepoint_disable(state->gc_tracepoint);
disable_tracepoints(state);

reset_stats(state);

Expand Down Expand Up @@ -775,3 +797,25 @@ static void sleep_for(uint64_t time_ns) {
}
}
}

// Backs CpuAndWallTimeWorker._native_allocation_count. Reports the calling
// thread's allocation counter while a profiler instance is active, and nil
// otherwise (see expectations documented in profiling.rb).
static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self) {
  if (active_sampler_instance_state == NULL) return Qnil;

  return ULL2NUM(allocation_count);
}

// Invoked by the Ruby VM through `object_allocation_tracepoint` each time the
// RUBY_INTERNAL_EVENT_NEWOBJ event fires, i.e. once per object allocation.
static void on_newobj_event(DDTRACE_UNUSED VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
  // Bump this thread's allocation counter. We explicitly wrap back to zero at
  // UINT64_MAX (unsigned arithmetic would wrap anyway, but spelling it out
  // documents the intent).
  allocation_count = RB_UNLIKELY(allocation_count == UINT64_MAX) ? 0 : allocation_count + 1;
}

// Disables both VM tracepoints held by this worker state, so the VM stops
// invoking on_gc_event/on_newobj_event. Used on clean shutdown, on stop with
// error, after fork, and when recovering from a dead previous instance.
// NOTE(review): both fields start as Qnil in _native_new and only become real
// tracepoints in _native_initialize; this assumes every caller runs after
// initialization -- confirm, since rb_tracepoint_disable on Qnil would fail.
static void disable_tracepoints(struct cpu_and_wall_time_worker_state *state) {
  rb_tracepoint_disable(state->gc_tracepoint);
  rb_tracepoint_disable(state->object_allocation_tracepoint);
}
17 changes: 14 additions & 3 deletions lib/datadog/core/configuration/settings.rb
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ def initialize(*_)
# This should never be reduced, as it can cause the resulting profiles to become biased.
# The current default should be enough for most services, allowing 16 threads to be sampled around 30 times
# per second for a 60 second period.
# This setting is ignored when CPU Profiling 2.0 is in use.
option :max_events, default: 32768

# Controls the maximum number of frames for each thread sampled. Can be tuned to avoid omitted frames in the
Expand All @@ -232,8 +233,8 @@ def initialize(*_)
end
end

# Disable gathering of names and versions of gems in use by the service, used to power grouping and
# categorization of stack traces.
# Can be used to disable the gathering of names and versions of gems in use by the service, used to power
# grouping and categorization of stack traces.
option :code_provenance_enabled, default: true

# No longer does anything, and will be removed on dd-trace-rb 2.0.
Expand All @@ -249,7 +250,7 @@ def initialize(*_)
end
end

# Forces enabling the new profiler. We do not yet recommend turning on this option.
# Forces enabling the new CPU Profiling 2.0 profiler (see ddtrace release notes for more details).
#
# Note that setting this to "false" (or not setting it) will not prevent the new profiler from
# being automatically used in the future.
Expand Down Expand Up @@ -279,6 +280,16 @@ def initialize(*_)
o.default { env_to_bool('DD_PROFILING_FORCE_ENABLE_GC', false) }
o.lazy
end

# Can be used to enable/disable the Datadog::Profiling.allocation_count feature.
#
# This feature is safe and enabled by default on Ruby 2.x, but
# on Ruby 3.x it can break in applications that make use of Ractors due to two Ruby VM bugs:
# https://bugs.ruby-lang.org/issues/19112 AND https://bugs.ruby-lang.org/issues/18464.
#
# If you use Ruby 3.x and your application does not use Ractors (or if your Ruby has been patched), the
# feature is fully safe to enable and this toggle can be used to do so.
option :allocation_counting_enabled, default: RUBY_VERSION.start_with?('2.')
end

# @public_api
Expand Down
39 changes: 37 additions & 2 deletions lib/datadog/profiling.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# typed: true
# typed: false

require_relative 'core'
require_relative 'core/environment/variable_helpers'
require_relative 'core/utils/only_once'

module Datadog
# Contains profiler for generating stack profiles, etc.
# Datadog Continuous Profiler implementation: https://docs.datadoghq.com/profiler/
module Profiling
GOOGLE_PROTOBUF_MINIMUM_VERSION = Gem::Version.new('3.0')
private_constant :GOOGLE_PROTOBUF_MINIMUM_VERSION
Expand Down Expand Up @@ -41,6 +41,39 @@ def self.start_if_enabled
!!profiler
end

# Returns a monotonically-increasing count of the allocations the profiler has
# observed on the current thread.
#
# Note 1: The counter does not necessarily start at zero on new threads, so only
# the difference between two reads is meaningful (similar to some OS-based time
# representations):
# ```
# allocations_before = Datadog::Profiling.allocation_count
# do_some_work()
# allocations_after = Datadog::Profiling.allocation_count
# puts "Allocations during do_some_work: #{allocations_after - allocations_before}"
# ```
#
# Note 2: The counter is per-thread, not per-fiber -- every fiber running on a
# given thread shares the same values.
#
# A value is only returned while the profiler is running, the new CPU
# Profiling 2.0 profiler is in use, and allocation-related features have not
# been disabled via configuration.
# For instructions on enabling CPU Profiling 2.0 see the ddtrace release notes.
#
# @return [Integer] number of allocations observed in the current thread.
# @return [nil] when not available.
# @public_api
def self.allocation_count
  # No-op placeholder used when profiling failed to load; swapped for the real
  # native implementation by #replace_noop_allocation_count.
  nil
end

# Replaces the no-op .allocation_count defined above with the real native
# implementation. Only called after all the profiling requires have succeeded
# (see the bottom of this file), so the native collector is known to be loaded.
private_class_method def self.replace_noop_allocation_count
  def self.allocation_count # rubocop:disable Lint/DuplicateMethods, Lint/NestedMethodDefinition (On purpose!)
    Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_allocation_count
  end
end

private_class_method def self.native_library_compilation_skipped?
skipped_reason = try_reading_skipped_reason_file

Expand Down Expand Up @@ -169,6 +202,8 @@ def self.start_if_enabled
require_relative 'profiling/tag_builder'
require_relative 'profiling/http_transport'

replace_noop_allocation_count

true
end

Expand Down
9 changes: 8 additions & 1 deletion lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,17 @@ def initialize(
max_frames:,
tracer:,
gc_profiling_enabled:,
allocation_counting_enabled:,
cpu_and_wall_time_collector: CpuAndWallTime.new(recorder: recorder, max_frames: max_frames, tracer: tracer),
idle_sampling_helper: IdleSamplingHelper.new
)
self.class._native_initialize(self, cpu_and_wall_time_collector, gc_profiling_enabled, idle_sampling_helper)
self.class._native_initialize(
self,
cpu_and_wall_time_collector,
gc_profiling_enabled,
idle_sampling_helper,
allocation_counting_enabled
)
@worker_thread = nil
@failure_exception = nil
@start_stop_mutex = Mutex.new
Expand Down
3 changes: 2 additions & 1 deletion lib/datadog/profiling/component.rb
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ def build_profiler(settings, agent_settings, tracer)
recorder: recorder,
max_frames: settings.profiling.advanced.max_frames,
tracer: tracer,
gc_profiling_enabled: should_enable_gc_profiling?(settings)
gc_profiling_enabled: should_enable_gc_profiling?(settings),
allocation_counting_enabled: settings.profiling.advanced.allocation_counting_enabled,
)
else
trace_identifiers_helper = Profiling::TraceIdentifiers::Helper.new(
Expand Down
29 changes: 29 additions & 0 deletions spec/datadog/core/configuration/components_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,7 @@
max_frames: settings.profiling.advanced.max_frames,
tracer: tracer,
gc_profiling_enabled: anything,
allocation_counting_enabled: anything,
)

build_profiler
Expand Down Expand Up @@ -1096,6 +1097,34 @@
end
end

# Verifies the allocation_counting_enabled setting is forwarded as true to the
# CpuAndWallTimeWorker collector when enabled in configuration.
context 'when allocation_counting_enabled is enabled' do
  before do
    settings.profiling.advanced.allocation_counting_enabled = true
  end

  it 'initializes a CpuAndWallTimeWorker collector with allocation_counting_enabled set to true' do
    expect(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to receive(:new).with hash_including(
      allocation_counting_enabled: true,
    )

    build_profiler
  end
end

# Verifies the allocation_counting_enabled setting is forwarded as false to the
# CpuAndWallTimeWorker collector when disabled in configuration.
context 'when allocation_counting_enabled is disabled' do
  before do
    settings.profiling.advanced.allocation_counting_enabled = false
  end

  it 'initializes a CpuAndWallTimeWorker collector with allocation_counting_enabled set to false' do
    expect(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to receive(:new).with hash_including(
      allocation_counting_enabled: false,
    )

    build_profiler
  end
end

it 'sets up the Profiler with the CpuAndWallTimeWorker collector' do
expect(Datadog::Profiling::Profiler).to receive(:new).with(
[instance_of(Datadog::Profiling::Collectors::CpuAndWallTimeWorker)],
Expand Down
Loading

0 comments on commit 4c0efe4

Please sign in to comment.