Skip to content

Commit

Permalink
[PROF-8917] Add support for the libdatadog crash tracker
Browse files Browse the repository at this point in the history
**What does this PR do?**

This PR adds support for the libdatadog crash tracker feature (off
by default).

The crash tracker works by detecting when the Ruby VM reaches a
segmentation fault, reporting the crash information as a last profile
before the VM dies.

All of the interesting work is in
<DataDog/libdatadog#282>, this PR basically
just wires things up.

**Motivation:**

This will be a useful tool when debugging VM crashes.

**Additional Notes:**

I'm opening this PR as a draft as the libdatadog support has not
yet landed/been released. Also, there's a few open questions on:

* fork handling
* when to shut down

**How to test the change?**

(You'll need to build <<DataDog/libdatadog#282>
until the crash tracker gets included in a libdatadog release)

To test the crash tracker with an actual crash, try running the
following on Ruby 2.6:

```bash
$ DD_PROFILING_ENABLED=true DD_PROFILING_EXPERIMENTAL_CRASH_TRACKING_ENABLED=true DD_TRACE_DEBUG=true bundle exec ddtracerb exec ruby -e 'Process.detach(fork { exit! }).instance_variable_get(:@foo)'
```

This should also work in the future but right now it doesn't work
correctly; still looking into why:

```bash
$ DD_PROFILING_ENABLED=true DD_PROFILING_EXPERIMENTAL_CRASH_TRACKING_ENABLED=true DD_TRACE_DEBUG=true bundle exec ddtracerb exec ruby -e 'Process.kill("SEGV", Process.pid)'
```
  • Loading branch information
ivoanjo committed Jan 16, 2024
1 parent 37b486f commit 05346ba
Show file tree
Hide file tree
Showing 14 changed files with 319 additions and 7 deletions.
1 change: 1 addition & 0 deletions Steepfile
Original file line number Diff line number Diff line change
Expand Up @@ -648,4 +648,5 @@ target :ddtrace do

# TODO: gem 'libddwaf'
library 'libddwaf'
library 'libdatadog'
end
57 changes: 57 additions & 0 deletions ext/ddtrace_profiling_native_extension/crash_tracker.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#include <ruby.h>
#include <datadog/common.h>
#include <libdatadog_helpers.h>

static VALUE _native_start_crashtracker(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self);

// Used to report Ruby VM crashes.
// Once initialized, segfaults will be reported automatically using libdatadog.

void crash_tracker_init(VALUE profiling_module) {
VALUE crash_tracker_class = rb_define_class_under(profiling_module, "CrashTracker", rb_cObject);

rb_define_singleton_method(crash_tracker_class, "_native_start_crashtracker", _native_start_crashtracker, -1);
}

static VALUE _native_start_crashtracker(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self) {
VALUE options;
rb_scan_args(argc, argv, "0:", &options);

VALUE exporter_configuration = rb_hash_fetch(options, ID2SYM(rb_intern("exporter_configuration")));
VALUE path_to_crashtracking_receiver_binary = rb_hash_fetch(options, ID2SYM(rb_intern("path_to_crashtracking_receiver_binary")));
VALUE tags_as_array = rb_hash_fetch(options, ID2SYM(rb_intern("tags_as_array")));

ENFORCE_TYPE(exporter_configuration, T_ARRAY);
ENFORCE_TYPE(tags_as_array, T_ARRAY);
ENFORCE_TYPE(path_to_crashtracking_receiver_binary, T_STRING);

VALUE version = ddtrace_version();
ddog_Endpoint endpoint = endpoint_from(exporter_configuration);

// This needs to come last, after all things that can raise exceptions, as otherwise it can leak
ddog_Vec_Tag tags = convert_tags(tags_as_array);

ddog_prof_Configuration config = {
.create_alt_stack = false, // This breaks the Ruby VM's stack overflow detection
.endpoint = endpoint,
.path_to_receiver_binary = char_slice_from_ruby_string(path_to_crashtracking_receiver_binary),
};

ddog_prof_Metadata metadata = {
.profiling_library_name = DDOG_CHARSLICE_C("dd-trace-rb"),
.profiling_library_version = char_slice_from_ruby_string(version),
.family = DDOG_CHARSLICE_C("ruby"),
.tags = &tags,
};

ddog_prof_Profile_Result result = ddog_prof_crashtracker_init(config, metadata);

// Clean up before potentially raising any exceptions
ddog_Vec_Tag_drop(tags);

if (result.tag == DDOG_PROF_PROFILE_RESULT_ERR) {
rb_raise(rb_eRuntimeError, "Failed to initialize the crash tracker: %"PRIsVALUE, get_error_details_and_drop(&result.err));
}

return Qtrue;
}
2 changes: 2 additions & 0 deletions ext/ddtrace_profiling_native_extension/profiling.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ void collectors_dynamic_sampling_rate_init(VALUE profiling_module);
void collectors_idle_sampling_helper_init(VALUE profiling_module);
void collectors_stack_init(VALUE profiling_module);
void collectors_thread_context_init(VALUE profiling_module);
void crash_tracker_init(VALUE profiling_module);
void http_transport_init(VALUE profiling_module);
void stack_recorder_init(VALUE profiling_module);

Expand Down Expand Up @@ -47,6 +48,7 @@ void DDTRACE_EXPORT Init_ddtrace_profiling_native_extension(void) {
collectors_idle_sampling_helper_init(profiling_module);
collectors_stack_init(profiling_module);
collectors_thread_context_init(profiling_module);
crash_tracker_init(profiling_module);
http_transport_init(profiling_module);
stack_recorder_init(profiling_module);

Expand Down
1 change: 1 addition & 0 deletions lib/datadog/profiling.rb
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def self.allocation_count # rubocop:disable Lint/NestedMethodDefinition (On purp
require_relative 'profiling/collectors/idle_sampling_helper'
require_relative 'profiling/collectors/stack'
require_relative 'profiling/collectors/thread_context'
require_relative 'profiling/crash_tracker'
require_relative 'profiling/diagnostics/environment_logger'
require_relative 'profiling/stack_recorder'
require_relative 'profiling/exporter'
Expand Down
18 changes: 18 additions & 0 deletions lib/datadog/profiling/component.rb
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
exporter = build_profiler_exporter(settings, recorder, internal_metadata: internal_metadata)
transport = build_profiler_transport(settings, agent_settings)
scheduler = Profiling::Scheduler.new(exporter: exporter, transport: transport, interval: upload_period_seconds)
# FIXME: What should the lifetime of the crash tracker be?
build_crash_tracker(settings, transport)

Profiling::Profiler.new(worker: worker, scheduler: scheduler)
end
Expand Down Expand Up @@ -113,6 +115,22 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
)
end

private_class_method def self.build_crash_tracker(settings, transport)
return unless settings.profiling.advanced.experimental_crash_tracking_enabled

unless transport.respond_to?(:exporter_configuration)
Datadog.logger.warn(
'Cannot enable profiling crash tracking as a custom settings.profiling.exporter.transport is configured'
)
return
end

Datadog::Profiling::CrashTracker.build_crash_tracker(
exporter_configuration: transport.exporter_configuration,
tags: Datadog::Profiling::TagBuilder.call(settings: settings),
)
end

private_class_method def self.enable_gc_profiling?(settings)
# See comments on the setting definition for more context on why it exists.
if settings.profiling.advanced.force_enable_gc_profiling
Expand Down
57 changes: 57 additions & 0 deletions lib/datadog/profiling/crash_tracker.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# frozen_string_literal: true

require 'libdatadog'

# FIXME: Move this to libdatadog -- this is only here to facilitate testing
module ::Libdatadog
def self.path_to_crashtracking_receiver_binary
# TODO: Error handling when pkgconfig_folder is not detected correctly
File.absolute_path("#{::Libdatadog.pkgconfig_folder}/../../bin/ddog-crashtracking-receiver")
end
end

module Datadog
module Profiling
# Used to report Ruby VM crashes.
# The interesting bits are implemented as native code and using libdatadog.
#
# Methods prefixed with _native_ are implemented in `crash_tracker.c`
class CrashTracker
def self.build_crash_tracker(
exporter_configuration:,
tags:,
path_to_crashtracking_receiver_binary: Libdatadog.path_to_crashtracking_receiver_binary
)
unless path_to_crashtracking_receiver_binary
Datadog.logger.warn(
'Cannot enable profiling crash tracking as no path_to_crashtracking_receiver_binary was found'
)
return
end

begin
new(
exporter_configuration: exporter_configuration,
path_to_crashtracking_receiver_binary: path_to_crashtracking_receiver_binary,
tags_as_array: tags.to_a,
).tap {
Datadog.logger.debug('Crash tracker enabled')
}
rescue => e
Datadog.logger.error("Failed to initialize crash tracking: #{e.message}")
nil
end
end

private

def initialize(exporter_configuration:, path_to_crashtracking_receiver_binary:, tags_as_array:)
self.class._native_start_crashtracker(
exporter_configuration: exporter_configuration,
path_to_crashtracking_receiver_binary: path_to_crashtracking_receiver_binary,
tags_as_array: tags_as_array,
)
end
end
end
end
12 changes: 7 additions & 5 deletions lib/datadog/profiling/http_transport.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,28 @@ module Profiling
# Used to report profiling data to Datadog.
# Methods prefixed with _native_ are implemented in `http_transport.c`
class HttpTransport
attr_reader :exporter_configuration

def initialize(agent_settings:, site:, api_key:, upload_timeout_seconds:)
@upload_timeout_milliseconds = (upload_timeout_seconds * 1_000).to_i

validate_agent_settings(agent_settings)

@exporter_configuration =
if agentless?(site, api_key)
[:agentless, site, api_key]
[:agentless, site, api_key].freeze
else
[:agent, base_url_from(agent_settings)]
[:agent, base_url_from(agent_settings)].freeze
end

status, result = validate_exporter(@exporter_configuration)
status, result = validate_exporter(exporter_configuration)

raise(ArgumentError, "Failed to initialize transport: #{result}") if status == :error
end

def export(flush)
status, result = do_export(
exporter_configuration: @exporter_configuration,
exporter_configuration: exporter_configuration,
upload_timeout_milliseconds: @upload_timeout_milliseconds,

# why "timespec"?
Expand Down Expand Up @@ -130,7 +132,7 @@ def do_export(
end

def config_without_api_key
[@exporter_configuration[0..1]].to_h
[exporter_configuration[0..1]].to_h
end
end
end
Expand Down
5 changes: 5 additions & 0 deletions sig/datadog/profiling/component.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ module Datadog
Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings agent_settings
) -> untyped

def self.build_crash_tracker: (
untyped settings,
untyped transport,
) -> Datadog::Profiling::CrashTracker?

def self.enable_gc_profiling?: (untyped settings) -> bool
def self.get_allocation_sample_every: (untyped settings) -> ::Integer
def self.enable_allocation_profiling?: (untyped settings, ::Integer allocation_sample_every) -> bool
Expand Down
25 changes: 25 additions & 0 deletions sig/datadog/profiling/crash_tracker.rbs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
module Datadog
module Profiling
class CrashTracker
def self.build_crash_tracker: (
exporter_configuration: [:agentless | :agent, untyped],
tags: ::Hash[::String, ::String],
?path_to_crashtracking_receiver_binary: ::String,
) -> CrashTracker?

private

def initialize: (
exporter_configuration: [:agentless | :agent, untyped],
path_to_crashtracking_receiver_binary: ::String,
tags_as_array: ::Array[[::String, ::String]],
) -> void

def self._native_start_crashtracker: (
exporter_configuration: [:agentless | :agent, untyped],
path_to_crashtracking_receiver_binary: ::String,
tags_as_array: ::Array[[::String, ::String]],
) -> void
end
end
end
4 changes: 2 additions & 2 deletions sig/datadog/profiling/http_transport.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ module Datadog
class HttpTransport
type exporter_configuration_array = [:agentless | :agent, untyped]

attr_reader exporter_configuration: exporter_configuration_array

@upload_timeout_milliseconds: ::Integer
@exporter_configuration: exporter_configuration_array

Expand All @@ -15,8 +17,6 @@ module Datadog

def export: (Datadog::Profiling::Flush flush) -> bool

def self.log_failure_to_process_tag: (::String failure_details) -> void

private

def base_url_from: (Datadog::Core::Configuration::AgentSettingsResolver::AgentSettings agent_settings) -> ::String
Expand Down
44 changes: 44 additions & 0 deletions spec/datadog/profiling/component_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,50 @@
build_profiler_component
end
end

context 'when crash tracking is disabled' do
before { settings.profiling.advanced.experimental_crash_tracking_enabled = false }

it 'does not initialize the crash tracker' do
expect(Datadog::Profiling::CrashTracker).to_not receive(:build_crash_tracker)

build_profiler_component
end
end

context 'when crash tracking is enabled' do
before { settings.profiling.advanced.experimental_crash_tracking_enabled = true }

it 'initializes the crash tracker' do
expect(Datadog::Profiling::CrashTracker).to receive(:build_crash_tracker).with(
exporter_configuration: array_including(:agent),
tags: hash_including('runtime' => 'ruby'),
)

build_profiler_component
end

context 'when a custom transport is provided' do
let(:custom_transport) { double('Custom transport') }

before do
settings.profiling.exporter.transport = custom_transport
allow(Datadog.logger).to receive(:warn)
end

it 'warns that crash tracking will not be enabled' do
expect(Datadog.logger).to receive(:warn).with(/Cannot enable profiling crash tracking/)

build_profiler_component
end

it 'does not initialize the crash tracker' do
expect(Datadog::Profiling::CrashTracker).to_not receive(:build_crash_tracker)

build_profiler_component
end
end
end
end
end

Expand Down
Loading

0 comments on commit 05346ba

Please sign in to comment.