Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Envelopes that fail to send are now flushed when the transport recovers #3438

Merged
merged 25 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
d2d074b
Processing envelopes now get flushed after reconnecting to the network
jamescrosswell Jun 20, 2024
eae7ef2
Update CHANGELOG.md
jamescrosswell Jun 20, 2024
fb1128b
Merge branch 'main' into stuck-envelopes-3384
jamescrosswell Jun 20, 2024
36351d8
Added dead letter queue and only retry for network failures
jamescrosswell Jul 1, 2024
3d1fc34
Moved logit to InnerProcessCacheAsync to deduplicate and removed dead…
jamescrosswell Jul 1, 2024
fe24b77
Added a progressive backoff that can be used to monitor the network s…
jamescrosswell Jul 1, 2024
2e0b986
Refactored
jamescrosswell Jul 1, 2024
80461ca
Update HubTests.CaptureEvent_ActiveTransaction_UnhandledExceptionTran…
jamescrosswell Jul 1, 2024
db9e146
Format code
getsentry-bot Jul 1, 2024
19a81fe
Dispose connectivity monitor
jamescrosswell Jul 1, 2024
ed91bb2
Merge branch 'stuck-envelopes-3384' of github.com:getsentry/sentry-do…
jamescrosswell Jul 1, 2024
9d8e4c0
Merge branch 'main' into stuck-envelopes-3384
jamescrosswell Jul 1, 2024
95bcd42
Update CHANGELOG.md
jamescrosswell Jul 1, 2024
92e45aa
Merge branch 'main' into stuck-envelopes-3384
jamescrosswell Jul 3, 2024
ea92966
Implemented PollingNetworkStatusListener
jamescrosswell Jul 8, 2024
fc2a3d7
Merge branch 'main' into stuck-envelopes-3384
jamescrosswell Jul 8, 2024
07db2f7
Format code
getsentry-bot Jul 8, 2024
a57ed92
Changed from ICMP to TCP to check connectivity
jamescrosswell Jul 9, 2024
465795b
Merge branch 'stuck-envelopes-3384' of github.com:getsentry/sentry-do…
jamescrosswell Jul 9, 2024
49a1893
Merge branch 'main' into stuck-envelopes-3384
bitsandfoxes Jul 9, 2024
238393e
Merge branch 'main' into stuck-envelopes-3384
jamescrosswell Jul 10, 2024
2cd2302
Merge branch 'main' into stuck-envelopes-3384
jamescrosswell Jul 10, 2024
5a426fd
Added comment explaining why we move unprocessed files back to the cache
jamescrosswell Jul 10, 2024
53ef780
Update PollingNetworkStatusListener.cs
jamescrosswell Jul 11, 2024
f359140
Update PollingNetworkStatusListener.cs
jamescrosswell Jul 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## Unreleased

### Fixes

- Fixed envelopes getting stuck in processing when losing network connectivity ([#3438](https://github.com/getsentry/sentry-dotnet/pull/3438))

### Features

- Client reports now include dropped spans ([#3463](https://github.com/getsentry/sentry-dotnet/pull/3463))
Expand Down
23 changes: 20 additions & 3 deletions src/Sentry/Internal/Http/CachingTransport.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ namespace Sentry.Internal.Http;
internal class CachingTransport : ITransport, IDisposable
{
private const string EnvelopeFileExt = "envelope";
private const string ProcessingFolder = "__processing";

private readonly ITransport _innerTransport;
private readonly SentryOptions _options;
Expand Down Expand Up @@ -77,7 +78,7 @@ private CachingTransport(ITransport innerTransport, SentryOptions options, bool
options.TryGetProcessSpecificCacheDirectoryPath() ??
throw new InvalidOperationException("Cache directory or DSN is not set.");

_processingDirectoryPath = Path.Combine(_isolatedCacheDirectoryPath, "__processing");
_processingDirectoryPath = Path.Combine(_isolatedCacheDirectoryPath, ProcessingFolder);
}

private void Initialize(bool startWorker)
Expand Down Expand Up @@ -271,6 +272,12 @@ private async Task ProcessCacheAsync(CancellationToken cancellation)
// Signal that we can start waiting for _options.InitCacheFlushTimeout
_preInitCacheResetEvent?.Set();

// Make sure no files got stuck in the processing directory
if (_options.NetworkStatusListener is { Online: false } listener)
jamescrosswell marked this conversation as resolved.
Show resolved Hide resolved
{
MoveUnprocessedFilesBackToCache();
}

// Process the cache
_options.LogDebug("Flushing cached envelopes.");
while (await TryPrepareNextCacheFileAsync(cancellation).ConfigureAwait(false) is { } file)
Expand All @@ -288,6 +295,13 @@ private async Task ProcessCacheAsync(CancellationToken cancellation)
}
}

private static bool IsNetworkError(Exception exception) =>
exception switch
{
HttpRequestException or WebException or IOException or SocketException => true,
_ => false
};

private async Task InnerProcessCacheAsync(string file, CancellationToken cancellation)
{
if (_options.NetworkStatusListener is { Online: false } listener)
Expand Down Expand Up @@ -327,9 +341,12 @@ private async Task InnerProcessCacheAsync(string file, CancellationToken cancell
// Let the worker catch, log, wait a bit and retry.
throw;
}
catch (Exception ex) when (ex is HttpRequestException or WebException or SocketException
or IOException)
catch (Exception ex) when (IsNetworkError(ex))
{
if (_options.NetworkStatusListener is PollingNetworkStatusListener pollingListener)
{
pollingListener.Online = false;
}
_options.LogError(ex, "Failed to send cached envelope: {0}, retrying after a delay.", file);
// Let the worker catch, log, wait a bit and retry.
throw;
Expand Down
69 changes: 69 additions & 0 deletions src/Sentry/Internal/PollingNetworkStatusListener.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
using Sentry.Extensibility;

namespace Sentry.Internal;

internal class PollingNetworkStatusListener : INetworkStatusListener
{
private long _networkIsUnavailable = 0;

private readonly SentryOptions? _options;
private readonly IPing? _testPing;
internal int _delayInMilliseconds;
private readonly int _maxDelayInMilliseconds;
private readonly Func<int, int> _backoffFunction;

public PollingNetworkStatusListener(SentryOptions options, int initialDelayInMilliseconds = 500,
int maxDelayInMilliseconds = 32_000, Func<int, int>? backoffFunction = null)
{
_options = options;
_delayInMilliseconds = initialDelayInMilliseconds;
_maxDelayInMilliseconds = maxDelayInMilliseconds;
_backoffFunction = backoffFunction ?? (x => x * 2);
}

/// <summary>
/// Overload for testing
/// </summary>
internal PollingNetworkStatusListener(IPing testPing, int initialDelayInMilliseconds = 500,
int maxDelayInMilliseconds = 32_000, Func<int, int>? backoffFunction = null)
{
_testPing = testPing;
_delayInMilliseconds = initialDelayInMilliseconds;
_maxDelayInMilliseconds = maxDelayInMilliseconds;
_backoffFunction = backoffFunction ?? (x => x * 2);
}

private Lazy<IPing> LazyPing => new(() =>
{
if (_testPing != null)
{
return _testPing;
}
var uri = new Uri(_options!.Dsn!);
jamescrosswell marked this conversation as resolved.
Show resolved Hide resolved
return new TcpPing(uri.DnsSafeHost, uri.Port);
});
private IPing Ping => LazyPing.Value;

public bool Online
{
get => Interlocked.Read(ref _networkIsUnavailable) == 0;
set => Interlocked.Exchange(ref _networkIsUnavailable, value ? 0 : 1);
}
jamescrosswell marked this conversation as resolved.
Show resolved Hide resolved
public async Task WaitForNetworkOnlineAsync(CancellationToken cancellationToken = default)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This argument has a default value but I don't see it being tested when a cancellation token is passed.

If one is passed, and triggered, Delay would throw an exception that isnt' handled anywhere here. Is that considered?

I'd recommend remove the argument if we're not using anywhere (at least I can't see it being used).

If we're using it, add a test that does that to verify the behavior

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call. Added test and modified code to ensure a graceful exit if the task is cancelled:

{
while (!cancellationToken.IsCancellationRequested)
{
await Task.Delay(_delayInMilliseconds, cancellationToken).ConfigureAwait(false);
var checkResult = await Ping.IsAvailableAsync().ConfigureAwait(false);
if (checkResult)
{
Online = true;
return;
}
if (_delayInMilliseconds < _maxDelayInMilliseconds)
{
_delayInMilliseconds = _backoffFunction(_delayInMilliseconds);
}
}
}
}
25 changes: 25 additions & 0 deletions src/Sentry/Internal/TcpPing.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
namespace Sentry.Internal;

internal interface IPing
{
Task<bool> IsAvailableAsync();
}

internal class TcpPing(string hostToCheck, int portToCheck = 443) : IPing
{
private readonly Ping _ping = new();

public async Task<bool> IsAvailableAsync()
{
try
{
using var tcpClient = new TcpClient();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I take it the new TcpClient every 500ms is not an issue?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It won't be every 500ms due to the progressive backoff. There would be a first attempt, a subsequent attempts after 500ms, then 1000ms then 2000ms etc.

We could try to use the same TcpClient but it would need to be reset to be usable again after each unsuccessful connection attempt. I could do some benchmarks to check those two different approaches. It's worth noting that there will only be one PollingNetworkStatusListener instance in the application though, so I don't think this is really a hot path.

await tcpClient.ConnectAsync(hostToCheck, portToCheck).ConfigureAwait(false);
return true;
}
catch (Exception)
{
return false;
}
}
}
4 changes: 4 additions & 0 deletions src/Sentry/SentryOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1297,6 +1297,10 @@ public SentryOptions()
UriComponents.SchemeAndServer,
UriFormat.Unescaped)
);

#if PLATFORM_NEUTRAL
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this mean in practice? Did we document in what environments this will work and which ones it wont?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's the flip side to this:

#if !PLATFORM_NEUTRAL
// We can use MAUI's network connectivity information to inform the CachingTransport when we're offline.
options.NetworkStatusListener = new MauiNetworkStatusListener(Connectivity.Current, options);
#endif

With PLATFORM_NEUTRAL being defined here.

In practice, I think that means if you lose network connectivity and then regain it again, and you're targeting one of those platforms specifically but you're not using MAUI, then any envelopes that you tried to send while the network was down won't be sent until you restart the application again.

Another option would be to always initialise the SentryOptions.NetworkStatusListener to a new PollingNetworkStatusListener but let the SentryMauiOptionsSetup override this... or some variant of that. Technically SentryOptions.NetworkStatusListener is public so the user could set this to some custom listener that they've crafted. We don't currently do anything to make sure we don't override the user's listener if they've already set one.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have created #3510 to track this conversation in any case (this PR is closed/merged so easy for the conversation to get lost here).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't currently do anything to make sure we don't override the user's listener if they've already set one.

I think that's the crux. But if we set it up in the option's constructor the user would overwrite it again and we'd be fine, right?

NetworkStatusListener = new PollingNetworkStatusListener(this);
#endif
}

/// <summary>
Expand Down
30 changes: 21 additions & 9 deletions test/Sentry.Tests/Internals/Http/CachingTransportTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -545,14 +545,17 @@ await Assert.ThrowsAnyAsync<Exception>(async () =>
[MemberData(nameof(NetworkTestData))]
public async Task TestNetworkException(Exception exception)
{
// Arrange
// Arrange - network unavailable
using var cacheDirectory = new TempDirectory(_fileSystem);
var pingHost = Substitute.For<IPing>();
pingHost.IsAvailableAsync().Returns(Task.FromResult(true));
var options = new SentryOptions
{
Dsn = ValidDsn,
DiagnosticLogger = _logger,
Debug = true,
CacheDirectoryPath = cacheDirectory.Path,
NetworkStatusListener = new PollingNetworkStatusListener(pingHost),
FileSystem = _fileSystem
};

Expand All @@ -577,16 +580,25 @@ public async Task TestNetworkException(Exception exception)
{
receivedException = he;
}
finally
{
// (transport stops failing)
innerTransport.ClearReceivedCalls();
await transport.FlushAsync();
}

// Assert
Assert.Equal(exception, receivedException);
Assert.True(_fileSystem.EnumerateFiles(cacheDirectory.Path, "*", SearchOption.AllDirectories).Any());
receivedException.Should().Be(exception);
var files = _fileSystem.EnumerateFiles(cacheDirectory.Path, "*", SearchOption.AllDirectories).ToArray();
files.Should().NotBeEmpty();

// Arrange - network recovery
innerTransport.ClearReceivedCalls();
innerTransport
.SendEnvelopeAsync(Arg.Any<Envelope>(), Arg.Any<CancellationToken>())
.Returns(_ => Task.CompletedTask);

// Act
await transport.FlushAsync();

// Assert
receivedException.Should().Be(exception);
files = _fileSystem.EnumerateFiles(cacheDirectory.Path, "*", SearchOption.AllDirectories).ToArray();
files.Should().NotContain(file => file.Contains("__processing", StringComparison.OrdinalIgnoreCase));
}

[Fact]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
namespace Sentry.Tests.Internals;

public class PollingNetworkStatusListenerTest
{
[Fact]
public async Task HostAvailable_CheckOnlyRunsOnce()
{
// Arrange
var initialDelay = 100;
var pingHost = Substitute.For<IPing>();
pingHost
.IsAvailableAsync()
.Returns(Task.FromResult(true));

var pollingListener = new PollingNetworkStatusListener(pingHost, initialDelay);
pollingListener.Online = false;

// Act
var waitForNetwork = pollingListener.WaitForNetworkOnlineAsync();
var timeout = Task.Delay(1000);
var completedTask = await Task.WhenAny(waitForNetwork, timeout);

// Assert
completedTask.Should().Be(waitForNetwork);
pollingListener.Online.Should().Be(true);
await pingHost.Received(1).IsAvailableAsync();
}

[Fact]
public async Task HostUnavailable_ShouldIncreaseDelay()
{
// Arrange
var initialDelay = 100; // set initial delay to ease the testing
var pingHost = Substitute.For<IPing>();
pingHost
.IsAvailableAsync()
.Returns(Task.FromResult(false));

var pollingListener = new PollingNetworkStatusListener(pingHost, initialDelay);
pollingListener.Online = false;

// Act
var waitForNetwork = pollingListener.WaitForNetworkOnlineAsync();
var timeout = Task.Delay(2000);
var completedTask = await Task.WhenAny(waitForNetwork, timeout);

// Assert
completedTask.Should().Be(timeout);
pollingListener.Online.Should().Be(false);
await pingHost.Received().IsAvailableAsync();
pollingListener._delayInMilliseconds.Should().BeGreaterThan(initialDelay);
}
}
Loading