From e8bd4f157f79d28a166689915c160b311967787f Mon Sep 17 00:00:00 2001 From: Andrey Marchenko Date: Fri, 21 Jun 2024 10:37:14 +0200 Subject: [PATCH] limit telemetry app-started event retries --- lib/datadog/core/telemetry/worker.rb | 15 ++++- .../core/utils/only_once_successful.rb | 8 +++ sig/datadog/core/telemetry/worker.rbs | 3 + spec/datadog/core/telemetry/worker_spec.rb | 65 +++++++++++++++++++ 4 files changed, 89 insertions(+), 2 deletions(-) diff --git a/lib/datadog/core/telemetry/worker.rb b/lib/datadog/core/telemetry/worker.rb index 2021e139b0..46d7721ee4 100644 --- a/lib/datadog/core/telemetry/worker.rb +++ b/lib/datadog/core/telemetry/worker.rb @@ -15,8 +15,9 @@ class Worker include Core::Workers::Polling DEFAULT_BUFFER_MAX_SIZE = 1000 + APP_STARTED_EVENT_RETRIES = 10 - TELEMETRY_STARTED_ONCE = Utils::OnlyOnceSuccessful.new + TELEMETRY_STARTED_ONCE = Utils::OnlyOnceSuccessful.new(APP_STARTED_EVENT_RETRIES) def initialize( heartbeat_interval_seconds:, @@ -61,7 +62,11 @@ def enqueue(event) end def sent_started_event? - TELEMETRY_STARTED_ONCE.ran? + TELEMETRY_STARTED_ONCE.success? + end + + def failed_to_start? + TELEMETRY_STARTED_ONCE.failed? end private @@ -94,6 +99,12 @@ def heartbeat! def started! return unless enabled? + if failed_to_start? + Datadog.logger.debug('Telemetry app-started event exhausted retries, disabling telemetry worker') + self.enabled = false + return + end + TELEMETRY_STARTED_ONCE.run do res = send_event(Event::AppStarted.new) diff --git a/lib/datadog/core/utils/only_once_successful.rb b/lib/datadog/core/utils/only_once_successful.rb index 4209cafc1e..ed8b414196 100644 --- a/lib/datadog/core/utils/only_once_successful.rb +++ b/lib/datadog/core/utils/only_once_successful.rb @@ -62,6 +62,14 @@ def check_limit! def limited? !@limit.nil? && @limit.positive? end + + def reset_ran_once_state_for_tests + @mutex.synchronize do + @ran_once = false + @failed = false + @retries = 0 + end + end end end end diff --git a/sig/datadog/core/telemetry/worker.rbs b/sig/datadog/core/telemetry/worker.rbs index 01c5107e99..822b9fece9 100644 --- a/sig/datadog/core/telemetry/worker.rbs +++ b/sig/datadog/core/telemetry/worker.rbs @@ -9,6 +9,7 @@ module Datadog include Core::Workers::Queue TELEMETRY_STARTED_ONCE: Datadog::Core::Utils::OnlyOnceSuccessful + APP_STARTED_EVENT_RETRIES: 10 DEFAULT_BUFFER_MAX_SIZE: 1000 @emitter: Emitter @@ -23,6 +24,8 @@ module Datadog def sent_started_event?: () -> bool + def failed_to_start?: () -> bool + def enqueue: (Event::Base event) -> void def dequeue: () -> Array[Event::Base] diff --git a/spec/datadog/core/telemetry/worker_spec.rb b/spec/datadog/core/telemetry/worker_spec.rb index 35a01e4683..e73fec0888 100644 --- a/spec/datadog/core/telemetry/worker_spec.rb +++ b/spec/datadog/core/telemetry/worker_spec.rb @@ -120,6 +120,71 @@ try_wait_until { sent_hearbeat } end + context 'when app-started event fails' do + it 'retries' do + expect(emitter).to receive(:request).with(an_instance_of(Datadog::Core::Telemetry::Event::AppStarted)) + .and_return( + double( + Datadog::Core::Telemetry::Http::Adapters::Net::Response, + not_found?: false, + ok?: false + ) + ).once + + expect(emitter).to receive(:request).with(an_instance_of(Datadog::Core::Telemetry::Event::AppStarted)) do + @received_started = true + + response + end + + sent_hearbeat = false + allow(emitter).to receive(:request).with(kind_of(Datadog::Core::Telemetry::Event::AppHeartbeat)) do + # app-started was already sent by now + expect(@received_started).to be(true) + + sent_hearbeat = true + + response + end + + worker.start + + try_wait_until { sent_hearbeat } + end + end + + context 'when app-started event exhausted retries' do + let(:heartbeat_interval_seconds) { 0.1 } + + it 'stops retrying, never sends heartbeat, and disables worker' do + expect(emitter).to receive(:request).with(an_instance_of(Datadog::Core::Telemetry::Event::AppStarted)) + .and_return( + double( + Datadog::Core::Telemetry::Http::Adapters::Net::Response, + not_found?: false, + ok?: false + ) + ).exactly(described_class::APP_STARTED_EVENT_RETRIES).times + + sent_hearbeat = false + allow(emitter).to receive(:request).with(kind_of(Datadog::Core::Telemetry::Event::AppHeartbeat)) do + # app-started was already sent by now + expect(@received_started).to be(true) + + sent_hearbeat = true + + response + end + + worker.start + + try_wait_until { !worker.enabled? } + + expect(sent_hearbeat).to be(false) + expect(worker.failed_to_start?).to be(true) + end + end + context 'when dependencies collection enabled' do let(:dependency_collection) { true }