diff --git a/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureProfiling.java b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureProfiling.java
new file mode 100644
index 00000000000..c1001014688
--- /dev/null
+++ b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureProfiling.java
@@ -0,0 +1,45 @@
+package datadog.trace.bootstrap.instrumentation.jfr.backpressure;
+
+import datadog.trace.api.Config;
+import datadog.trace.bootstrap.instrumentation.api.TaskWrapper;
+
+/** Commits sampled {@link BackpressureSampleEvent}s for tasks passed to {@link #process}. */
+public final class BackpressureProfiling {
+
+  /** Holds the singleton so it is only built from {@code Config.get()} on first access. */
+  private static final class Holder {
+    static final BackpressureProfiling INSTANCE = new BackpressureProfiling(Config.get());
+  }
+
+  public static BackpressureProfiling getInstance() {
+    return Holder.INSTANCE;
+  }
+
+  private final BackpressureSampler sampler;
+
+  private BackpressureProfiling(final Config config) {
+    this(new BackpressureSampler(config));
+  }
+
+  BackpressureProfiling(BackpressureSampler sampler) {
+    this.sampler = sampler;
+  }
+
+  /** Starts the underlying sampler. */
+  public void start() {
+    sampler.start();
+  }
+
+  /**
+   * Commits a {@link BackpressureSampleEvent} when the sampler accepts the sample.
+   *
+   * @param backpressureMechanism class recorded as the event's policy
+   * @param task object whose unwrapped type is recorded as the event's task
+   */
+  public void process(Class<?> backpressureMechanism, Object task) {
+    if (sampler.sample()) {
+      new BackpressureSampleEvent(backpressureMechanism, TaskWrapper.getUnwrappedType(task))
+          .commit();
+    }
+  }
+}
diff --git a/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureSampleEvent.java b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureSampleEvent.java
new file mode 100644
index 00000000000..ec3eefd28c3
--- /dev/null
+++ b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureSampleEvent.java
@@ -0,0 +1,45 @@
+package datadog.trace.bootstrap.instrumentation.jfr.backpressure;
+
+import datadog.trace.bootstrap.instrumentation.jfr.ContextualEvent;
+import jdk.jfr.Category;
+import jdk.jfr.Description;
+import jdk.jfr.Event;
+import jdk.jfr.Label;
+import jdk.jfr.Name;
+
+/** JFR event carrying a backpressure policy class, a task class and the span ids set on it. */
+@Name("datadog.BackpressureSample")
+@Label("Backpressure Sample")
+@Description("Datadog backpressure sample event.")
+@Category("Datadog")
+public class BackpressureSampleEvent extends Event implements ContextualEvent {
+  @Label("Policy")
+  private final Class<?> policy;
+
+  @Label("Task")
+  private final Class<?> task;
+
+  @Label("Local Root Span Id")
+  private long localRootSpanId;
+
+  @Label("Span Id")
+  private long spanId;
+
+  /**
+   * Stores both classes on the event and then calls {@code captureContext()}.
+   *
+   * @param policy class recorded in the "Policy" field
+   * @param task class recorded in the "Task" field
+   */
+  public BackpressureSampleEvent(Class<?> policy, Class<?> task) {
+    this.policy = policy;
+    this.task = task;
+    captureContext();
+  }
+
+  @Override
+  public void setContext(long localRootSpanId, long spanId) {
+    this.localRootSpanId = localRootSpanId;
+    this.spanId = spanId;
+  }
+}
diff --git a/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureSampler.java b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureSampler.java
new file mode 100644
index 00000000000..1469b71f28e
--- /dev/null
+++ b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureSampler.java
@@ -0,0 +1,36 @@
+package datadog.trace.bootstrap.instrumentation.jfr.backpressure;
+
+import datadog.trace.api.Config;
+import datadog.trace.bootstrap.instrumentation.jfr.WindowSampler;
+import java.time.Duration;
+import java.time.temporal.ChronoUnit;
+
+/** Window sampler for {@link BackpressureSampleEvent} with limits read from {@link Config}. */
+final class BackpressureSampler extends WindowSampler {
+  /*
+   * Fixed 0.5 second sampling window.
+   * Logic in AdaptiveSampler relies on sampling window being small compared to (in our case) recording duration:
+   * sampler may overshoot on one given window but should average to samplesPerWindow in the long run.
+   */
+  private static final Duration SAMPLING_WINDOW = Duration.of(500, ChronoUnit.MILLIS);
+
+  BackpressureSampler(final Config config) {
+    this(
+        SAMPLING_WINDOW,
+        getSamplesPerWindow(config),
+        samplingWindowsPerRecording(config.getProfilingUploadPeriod(), SAMPLING_WINDOW));
+  }
+
+  BackpressureSampler(Duration windowDuration, int samplesPerWindow, int lookback) {
+    super(windowDuration, samplesPerWindow, lookback, BackpressureSampleEvent.class);
+  }
+
+  /**
+   * Divides the configured backpressure sample limit by the result of {@code
+   * samplingWindowsPerRecording} for the profiling upload period, using integer division.
+   */
+  protected static int getSamplesPerWindow(final Config config) {
+    return config.getProfilingBackPressureSampleLimit()
+        / samplingWindowsPerRecording(config.getProfilingUploadPeriod(), SAMPLING_WINDOW);
+  }
+}
diff --git a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java
index bc48ba6c810..242c8fade2f 100644
--- a/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java
+++ b/dd-java-agent/agent-profiling/profiling-controller-openjdk/src/main/java/com/datadog/profiling/controller/openjdk/OpenJdkController.java
@@ -32,9 +32,11 @@
 import com.datadog.profiling.controller.jfr.JFRAccess;
 import com.datadog.profiling.controller.jfr.JfpUtils;
 import com.datadog.profiling.controller.openjdk.events.AvailableProcessorCoresEvent;
+import datadog.trace.api.Config;
 import datadog.trace.api.Platform;
 import datadog.trace.api.config.ProfilingConfig;
 import datadog.trace.bootstrap.config.provider.ConfigProvider;
+import datadog.trace.bootstrap.instrumentation.jfr.backpressure.BackpressureProfiling;
 import datadog.trace.bootstrap.instrumentation.jfr.exceptions.ExceptionProfiling;
 import datadog.trace.util.PidHelper;
 import de.thetaphi.forbiddenapis.SuppressForbidden;
@@ -231,6 +233,10 @@ && isEventEnabled(recordingSettings, "jdk.NativeMethodSample")) {
       ExceptionProfiling.getInstance().start();
     }
 
+    if (Config.get().isProfilingBackPressureSamplingEnabled()) {
+      BackpressureProfiling.getInstance().start();
+    }
+
     // Register periodic events
     AvailableProcessorCoresEvent.register();
   }
diff --git a/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java b/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java
index 7fef8bc99cd..319f6b12fc4 100644
--- a/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java
+++ b/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java
@@ -96,6 +96,7 @@ public static void onEnter(@Advice.Argument(value = 0, readOnly = false) String[
               + "datadog.trace.bootstrap.instrumentation.java.concurrent.TPEHelper:build_time,"
               + "datadog.trace.bootstrap.instrumentation.jfr.exceptions.ExceptionCountEvent:build_time,"
               + "datadog.trace.bootstrap.instrumentation.jfr.exceptions.ExceptionSampleEvent:build_time,"
+              + "datadog.trace.bootstrap.instrumentation.jfr.backpressure.BackpressureSampleEvent:build_time,"
               + "datadog.trace.bootstrap.instrumentation.jfr.directallocation.DirectAllocationTotalEvent:build_time,"
               + "datadog.trace.logging.LoggingSettingsDescription:build_time,"
               + "datadog.trace.logging.simplelogger.SLCompatFactory:build_time,"
diff --git a/dd-java-agent/instrumentation/java-concurrent/src/main/java/datadog/trace/instrumentation/java/concurrent/RejectedExecutionHandlerInstrumentation.java b/dd-java-agent/instrumentation/java-concurrent/src/main/java/datadog/trace/instrumentation/java/concurrent/RejectedExecutionHandlerInstrumentation.java
index f651ef9d585..65da485f490 100644
--- a/dd-java-agent/instrumentation/java-concurrent/src/main/java/datadog/trace/instrumentation/java/concurrent/RejectedExecutionHandlerInstrumentation.java
+++ b/dd-java-agent/instrumentation/java-concurrent/src/main/java/datadog/trace/instrumentation/java/concurrent/RejectedExecutionHandlerInstrumentation.java
@@ -11,9 +11,11 @@
 import com.google.auto.service.AutoService;
 import datadog.trace.agent.tooling.Instrumenter;
 import datadog.trace.agent.tooling.InstrumenterModule;
+import datadog.trace.api.Config;
 import datadog.trace.bootstrap.InstrumentationContext;
 import datadog.trace.bootstrap.instrumentation.java.concurrent.State;
 import datadog.trace.bootstrap.instrumentation.java.concurrent.Wrapper;
+import datadog.trace.bootstrap.instrumentation.jfr.backpressure.BackpressureProfiling;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
@@ -81,13 +83,19 @@ public static final class Reject {
     // remove our wrapper before calling the handler (save wrapper, so we can cancel it later)
     @Advice.OnMethodEnter(suppress = Throwable.class)
     public static Wrapper handle(
-        @Advice.Argument(readOnly = false, value = 0) Runnable runnable) {
+        @Advice.This Object zis, @Advice.Argument(readOnly = false, value = 0) Runnable runnable) {
+      Wrapper wrapper = null;
       if (runnable instanceof Wrapper) {
-        Wrapper wrapper = (Wrapper) runnable;
+        wrapper = (Wrapper) runnable;
         runnable = wrapper.unwrap();
-        return wrapper;
       }
-      return null;
+      if (Config.get().isProfilingBackPressureSamplingEnabled()) {
+        // record this event before the handler executes, which will help
+        // explain why the task is running on the submitter thread for
+        // rejection policies which run on the caller (CallerRunsPolicy or user-provided)
+        BackpressureProfiling.getInstance().process(zis.getClass(), runnable);
+      }
+      return wrapper;
     }
 
     // must execute after in case the handler actually runs the runnable,
diff --git a/dd-trace-api/src/main/java/datadog/trace/api/config/ProfilingConfig.java b/dd-trace-api/src/main/java/datadog/trace/api/config/ProfilingConfig.java
index e8629e7016e..fb041010d66 100644
--- a/dd-trace-api/src/main/java/datadog/trace/api/config/ProfilingConfig.java
+++ b/dd-trace-api/src/main/java/datadog/trace/api/config/ProfilingConfig.java
@@ -46,6 +46,13 @@ public final class ProfilingConfig {
       "profiling.exception.record.message";
   public static final boolean PROFILING_EXCEPTION_RECORD_MESSAGE_DEFAULT = true;
 
+  public static final String PROFILING_BACKPRESSURE_SAMPLING_ENABLED =
+      "profiling.backpressure.sampling.enabled";
+  public static final boolean PROFILING_BACKPRESSURE_SAMPLING_ENABLED_DEFAULT = false;
+  public static final String PROFILING_BACKPRESSURE_SAMPLE_LIMIT =
+      "profiling.backpressure.sample.limit";
+  public static final int PROFILING_BACKPRESSURE_SAMPLE_LIMIT_DEFAULT = 10_000;
+
   public static final String PROFILING_DIRECT_ALLOCATION_SAMPLE_LIMIT =
       "profiling.direct.allocation.sample.limit";
   public static final int PROFILING_DIRECT_ALLOCATION_SAMPLE_LIMIT_DEFAULT = 2_000;
diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java
index a7de7370bd6..24774d53b84 100644
--- a/internal-api/src/main/java/datadog/trace/api/Config.java
+++ b/internal-api/src/main/java/datadog/trace/api/Config.java
@@ -315,6 +315,10 @@
 import static datadog.trace.api.config.ProfilingConfig.PROFILING_API_KEY_FILE_VERY_OLD;
 import static datadog.trace.api.config.ProfilingConfig.PROFILING_API_KEY_OLD;
 import static datadog.trace.api.config.ProfilingConfig.PROFILING_API_KEY_VERY_OLD;
+import static datadog.trace.api.config.ProfilingConfig.PROFILING_BACKPRESSURE_SAMPLE_LIMIT;
+import static datadog.trace.api.config.ProfilingConfig.PROFILING_BACKPRESSURE_SAMPLE_LIMIT_DEFAULT;
+import static datadog.trace.api.config.ProfilingConfig.PROFILING_BACKPRESSURE_SAMPLING_ENABLED;
+import static datadog.trace.api.config.ProfilingConfig.PROFILING_BACKPRESSURE_SAMPLING_ENABLED_DEFAULT;
 import static datadog.trace.api.config.ProfilingConfig.PROFILING_DATADOG_PROFILER_ENABLED;
 import static datadog.trace.api.config.ProfilingConfig.PROFILING_DIRECT_ALLOCATION_SAMPLE_LIMIT;
 import static datadog.trace.api.config.ProfilingConfig.PROFILING_DIRECT_ALLOCATION_SAMPLE_LIMIT_DEFAULT;
@@ -713,6 +717,8 @@ static class HostNameHolder {
   private final String profilingProxyUsername;
   private final String profilingProxyPassword;
   private final int profilingExceptionSampleLimit;
+  private final int profilingBackPressureSampleLimit;
+  private final boolean profilingBackPressureEnabled;
   private final int profilingDirectAllocationSampleLimit;
   private final int profilingExceptionHistogramTopItems;
   private final int profilingExceptionHistogramMaxCollectionSize;
@@ -1543,6 +1549,13 @@ PROFILING_DATADOG_PROFILER_ENABLED, isDatadogProfilerSafeInCurrentEnvironment())
     profilingExceptionSampleLimit =
         configProvider.getInteger(
             PROFILING_EXCEPTION_SAMPLE_LIMIT, PROFILING_EXCEPTION_SAMPLE_LIMIT_DEFAULT);
+    profilingBackPressureSampleLimit =
+        configProvider.getInteger(
+            PROFILING_BACKPRESSURE_SAMPLE_LIMIT, PROFILING_BACKPRESSURE_SAMPLE_LIMIT_DEFAULT);
+    profilingBackPressureEnabled =
+        configProvider.getBoolean(
+            PROFILING_BACKPRESSURE_SAMPLING_ENABLED,
+            PROFILING_BACKPRESSURE_SAMPLING_ENABLED_DEFAULT);
     profilingDirectAllocationSampleLimit =
         configProvider.getInteger(
             PROFILING_DIRECT_ALLOCATION_SAMPLE_LIMIT,
@@ -2678,6 +2691,14 @@ public int getProfilingDirectAllocationSampleLimit() {
     return profilingDirectAllocationSampleLimit;
   }
 
+  public int getProfilingBackPressureSampleLimit() {
+    return profilingBackPressureSampleLimit;
+  }
+
+  public boolean isProfilingBackPressureSamplingEnabled() {
+    return profilingBackPressureEnabled;
+  }
+
   public int getProfilingExceptionHistogramTopItems() {
     return profilingExceptionHistogramTopItems;
   }