Skip to content

Commit

Permalink
Client Telemetry : Fixes data precision and telemetry handler position (
Browse files Browse the repository at this point in the history
#2684)

1. Improved precision of all the metrics: So we are collecting different kind of metrics, Somewhere using conversion as a precision factor and if value needs not to be converted then using HistogramPrecisionfactor to preserve data precision.
2. Move telemetry handler before retry handler: Telemetry should be collected from the toppest layer of the operation.
3. Test cases improvement : There were chances that test cases started failing due to thread starvation or related issues. So added waiting till 30 sec.
4. Upper Case Connection Mode and Consistency : Just like Java
  • Loading branch information
sourabh1007 authored Sep 8, 2021
1 parent c306993 commit 70ec89c
Show file tree
Hide file tree
Showing 11 changed files with 193 additions and 97 deletions.
12 changes: 6 additions & 6 deletions Microsoft.Azure.Cosmos/src/Handler/ClientPipelineBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,15 @@ private set
/// |
/// +-----------------------------+
/// | |
/// | RetryHandler |-> RetryPolicy -> ResetSessionTokenRetryPolicyFactory -> ClientRetryPolicy -> ResourceThrottleRetryPolicy
/// | TelemetryHandler |-> Trigger a thread to monitor system usage/operation information and sends to an API
/// | |
/// +-----------------------------+
/// |
/// |
/// |
/// +-----------------------------+
/// | |
/// | Telemetry Handler |-> Trigger a thread to monitor system usage/operation information and send it to juno
/// | RetryHandler |-> RetryPolicy -> ResetSessionTokenRetryPolicyFactory -> ClientRetryPolicy -> ResourceThrottleRetryPolicy
/// | |
/// +-----------------------------+
/// |
Expand Down Expand Up @@ -176,16 +176,16 @@ public RequestInvokerHandler Build()
current = current.InnerHandler;
}

Debug.Assert(this.retryHandler != null, nameof(this.retryHandler));
current.InnerHandler = this.retryHandler;
current = current.InnerHandler;

if (this.telemetryHandler != null)
{
current.InnerHandler = this.telemetryHandler;
current = current.InnerHandler;
}

Debug.Assert(this.retryHandler != null, nameof(this.retryHandler));
current.InnerHandler = this.retryHandler;
current = current.InnerHandler;

// Have a router handler
RequestHandler feedHandler = this.CreateDocumentFeedPipeline();

Expand Down
18 changes: 8 additions & 10 deletions Microsoft.Azure.Cosmos/src/Telemetry/ClientTelemetry.cs
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,10 @@ private async Task EnrichAndSendAsync()
return;
}

this.clientTelemetryInfo.DateTimeUtc = DateTime.UtcNow.ToString(ClientTelemetryOptions.DateFormat);

this.RecordSystemUtilization();

this.clientTelemetryInfo.DateTimeUtc = DateTime.UtcNow.ToString(ClientTelemetryOptions.DateFormat);

ConcurrentDictionary<OperationInfo, (LongConcurrentHistogram latency, LongConcurrentHistogram requestcharge)> operationInfoSnapshot
= Interlocked.Exchange(ref this.operationInfoMap, new ConcurrentDictionary<OperationInfo, (LongConcurrentHistogram latency, LongConcurrentHistogram requestcharge)>());

Expand Down Expand Up @@ -216,21 +216,19 @@ internal void Collect(CosmosDiagnostics cosmosDiagnostics,
statusCode: (int)statusCode);

(LongConcurrentHistogram latency, LongConcurrentHistogram requestcharge) = this.operationInfoMap
.GetOrAdd(payloadKey, x => (latency: new LongConcurrentHistogram(1,
.GetOrAdd(payloadKey, x => (latency: new LongConcurrentHistogram(ClientTelemetryOptions.RequestLatencyMin,
ClientTelemetryOptions.RequestLatencyMax,
ClientTelemetryOptions.RequestLatencyPrecision),
requestcharge: new LongConcurrentHistogram(ClientTelemetryOptions.RequestChargeMin,
ClientTelemetryOptions.RequestChargeMax,
ClientTelemetryOptions.RequestChargePrecision)));

long totalElapsedTimeInMs = (long)cosmosDiagnostics.GetClientElapsedTime().TotalMilliseconds;
try
{
latency.RecordValue(totalElapsedTimeInMs * ClientTelemetryOptions.HistogramPrecisionFactor);
latency.RecordValue(cosmosDiagnostics.GetClientElapsedTime().Ticks);
}
catch (Exception ex)
{
DefaultTrace.TraceError("Latency Recording Failed by Telemetry. Latency Value : " + totalElapsedTimeInMs + " Exception : " + ex.Message);
DefaultTrace.TraceError("Latency Recording Failed by Telemetry. Exception : " + ex.Message);
}

long requestChargeToRecord = (long)(requestCharge * ClientTelemetryOptions.HistogramPrecisionFactor);
Expand Down Expand Up @@ -282,11 +280,11 @@ private void RecordSystemUtilization()
{
DefaultTrace.TraceVerbose("Started Recording System Usage for telemetry.");

SystemUsageHistory systemUsageRecorder = this.diagnosticsHelper.GetClientTelemtrySystemHistory();
SystemUsageHistory systemUsageHistory = this.diagnosticsHelper.GetClientTelemtrySystemHistory();

if (systemUsageRecorder != null )
if (systemUsageHistory != null )
{
(SystemInfo cpuUsagePayload, SystemInfo memoryUsagePayload) = ClientTelemetryHelper.RecordSystemUsage(systemUsageRecorder);
(SystemInfo cpuUsagePayload, SystemInfo memoryUsagePayload) = ClientTelemetryHelper.RecordSystemUsage(systemUsageHistory);
if (cpuUsagePayload != null)
{
this.clientTelemetryInfo.SystemInfo.Add(cpuUsagePayload);
Expand Down
25 changes: 13 additions & 12 deletions Microsoft.Azure.Cosmos/src/Telemetry/ClientTelemetryHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -88,51 +88,52 @@ static ValueTask<HttpRequestMessage> CreateRequestMessage()
/// <summary>
/// Record System Usage and return recorded metrics
/// </summary>
/// <param name="systemUsageRecorder"></param>
/// <param name="systemUsageHistory"></param>
/// <returns>ReportPayload</returns>
internal static (SystemInfo cpuInfo, SystemInfo memoryInfo) RecordSystemUsage(SystemUsageHistory systemUsageRecorder)
internal static (SystemInfo cpuInfo, SystemInfo memoryInfo) RecordSystemUsage(SystemUsageHistory systemUsageHistory)
{
LongConcurrentHistogram cpuHistogram = new LongConcurrentHistogram(1,
LongConcurrentHistogram cpuHistogram = new LongConcurrentHistogram(ClientTelemetryOptions.CpuMin,
ClientTelemetryOptions.CpuMax,
ClientTelemetryOptions.CpuPrecision);

LongConcurrentHistogram memoryHistogram = new LongConcurrentHistogram(1,
LongConcurrentHistogram memoryHistogram = new LongConcurrentHistogram(ClientTelemetryOptions.MemoryMin,
ClientTelemetryOptions.MemoryMax,
ClientTelemetryOptions.MemoryPrecision);

if (systemUsageRecorder.Values == null)
if (systemUsageHistory.Values == null)
{
return (null, null);
}

foreach (SystemUsageLoad systemUsage in systemUsageRecorder.Values)
DefaultTrace.TraceInformation("System Usage recorded by telemetry is : " + systemUsageHistory);

foreach (SystemUsageLoad systemUsage in systemUsageHistory.Values)
{
float? cpuValue = systemUsage.CpuUsage;
if (cpuValue.HasValue && !float.IsNaN(cpuValue.Value))
{
cpuHistogram.RecordValue((long)cpuValue);
cpuHistogram.RecordValue((long)(cpuValue * ClientTelemetryOptions.HistogramPrecisionFactor));
}

long? memoryLoad = systemUsage.MemoryUsage;
if (memoryLoad.HasValue)
{
long memoryLoadInMb = memoryLoad.Value / ClientTelemetryOptions.BytesToMb;
memoryHistogram.RecordValue(memoryLoadInMb);
memoryHistogram.RecordValue(memoryLoad.Value);
}
}

SystemInfo memoryInfoPayload = null;
if (memoryHistogram.TotalCount > 0)
{
memoryInfoPayload = new SystemInfo(ClientTelemetryOptions.MemoryName, ClientTelemetryOptions.MemoryUnit);
memoryInfoPayload.SetAggregators(memoryHistogram);
memoryInfoPayload.SetAggregators(memoryHistogram, ClientTelemetryOptions.KbToMbFactor);
}

SystemInfo cpuInfoPayload = null;
if (cpuHistogram.TotalCount > 0)
{
cpuInfoPayload = new SystemInfo(ClientTelemetryOptions.CpuName, ClientTelemetryOptions.CpuUnit);
cpuInfoPayload.SetAggregators(cpuHistogram);
cpuInfoPayload.SetAggregators(cpuHistogram, ClientTelemetryOptions.HistogramPrecisionFactor);
}

return (cpuInfoPayload, memoryInfoPayload);
Expand All @@ -152,7 +153,7 @@ internal static List<OperationInfo> ToListWithMetricsInfo(IDictionary<OperationI
{
OperationInfo payloadForLatency = entry.Key;
payloadForLatency.MetricInfo = new MetricInfo(ClientTelemetryOptions.RequestLatencyName, ClientTelemetryOptions.RequestLatencyUnit);
payloadForLatency.SetAggregators(entry.Value.latency, ClientTelemetryOptions.HistogramPrecisionFactor);
payloadForLatency.SetAggregators(entry.Value.latency, ClientTelemetryOptions.TicksToMsFactor);

payloadWithMetricInformation.Add(payloadForLatency);

Expand Down
35 changes: 22 additions & 13 deletions Microsoft.Azure.Cosmos/src/Telemetry/ClientTelemetryOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,42 @@ namespace Microsoft.Azure.Cosmos.Telemetry

internal class ClientTelemetryOptions
{
internal const String RequestKey = "telemetry";
// ConversionFactor used in Histogram calculation to maintain precision or to collect data in desired unit
internal const double HistogramPrecisionFactor = 100;
internal const double TicksToMsFactor = TimeSpan.TicksPerMillisecond;
internal const int KbToMbFactor = 1024;

internal const long HistogramPrecisionFactor = 100;

internal const int BytesToMb = 1024 * 1024;
internal const int OneKbToBytes = 1024;

internal const int RequestLatencyMax = Int32.MaxValue;
// Expecting histogram to have Minimum Latency of 1 and Maximum Latency of 1 hour (which is never going to happen)
internal const long RequestLatencyMax = TimeSpan.TicksPerHour;
internal const long RequestLatencyMin = 1;
internal const int RequestLatencyPrecision = 5;
internal const string RequestLatencyName = "RequestLatency";
internal const string RequestLatencyUnit = "MilliSecond";

// Expecting histogram to have Minimum Request Charge of 1 and Maximum Request Charge of 9999900
// Maximum precision can be set as 5 so consider maximum value in 5 digit i.e. 99999 of request charge
// So 99999 * HistogramPrecisionFactor = 9999900 is the maximum request charge we have set.
// For all the Document ReadWriteQuery Operations there will be at least 1 request charge.
internal const long RequestChargeMax = 9999900;
internal const long RequestChargeMin = 1;
internal const int RequestChargePrecision = 5;
internal const string RequestChargeName = "RequestCharge";
internal const string RequestChargeUnit = "RU";

internal const int CpuMax = 100;
internal const int CpuPrecision = 3;
// Expecting histogram to have Minimum CPU Usage of .001% and Maximum CPU Usage of 999.99%
internal const long CpuMax = 99999;
internal const long CpuMin = 1;
internal const int CpuPrecision = 5; // 100 (max CPU Usage) * 100(Historam Precision factor) = 10000 which is 5 digit and that is what we can set as maximum precision in histogram
internal const String CpuName = "CPU";
internal const String CpuUnit = "Percentage";

// Expecting histogram to have Minimum Memory Remaining of 1 MB and Maximum Memory Remaining of Long Max Value
internal const long MemoryMax = Int64.MaxValue;
internal const long MemoryMin = 1;
internal const int MemoryPrecision = 5;
internal const String MemoryName = "Memory Remaining";
internal const String MemoryName = "MemoryRemaining";
internal const String MemoryUnit = "MB";

internal const string DefaultVmMetadataUrL = "http://169.254.169.254/metadata/instance?api-version=2020-06-01";
Expand All @@ -58,9 +70,6 @@ internal class ClientTelemetryOptions

internal static readonly JsonSerializerSettings JsonSerializerSettings = new JsonSerializerSettings { NullValueHandling = NullValueHandling.Ignore };

internal static readonly int RequestChargeMax = 99999 * Convert.ToInt32(HistogramPrecisionFactor);
internal static readonly int RequestChargeMin = 1 * Convert.ToInt32(HistogramPrecisionFactor);

private static Uri vmMetadataUrl;
private static TimeSpan scheduledTimeSpan = TimeSpan.Zero;
private static Uri clientTelemetryEndpoint;
Expand Down Expand Up @@ -113,13 +122,13 @@ internal static async Task<AzureVMMetadata> ProcessResponseAsync(HttpResponseMes
return JObject.Parse(jsonVmInfo).ToObject<AzureVMMetadata>();
}

internal static string GetHostInformation(Compute vmInformation)
internal static string GetHostInformation(Compute vmInformation)
{
return String.Concat(vmInformation?.OSType, "|",
vmInformation?.SKU, "|",
vmInformation?.VMSize, "|",
vmInformation?.AzEnvironment);
}
}

internal static Uri GetClientTelemetryEndpoint()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,14 @@ internal ClientTelemetryProperties(string clientId,
this.ProcessId = processId;
this.UserAgent = userAgent;
this.ConnectionModeEnum = connectionMode;
this.ConnectionMode = connectionMode.ToString();
this.ConnectionMode = ClientTelemetryProperties.GetConnectionModeString(connectionMode);
this.SystemInfo = new List<SystemInfo>();
this.PreferredRegions = preferredRegions;
}

/// <summary>
/// Needed by Serializer to deserialize the json
/// </summary>
public ClientTelemetryProperties(string dateTimeUtc,
string clientId,
string processId,
Expand Down Expand Up @@ -99,5 +102,15 @@ public ClientTelemetryProperties(string dateTimeUtc,
this.OperationInfo = operationInfo;
this.PreferredRegions = preferredRegions;
}

private static string GetConnectionModeString(ConnectionMode connectionMode)
{
return connectionMode switch
{
Cosmos.ConnectionMode.Direct => "DIRECT",
Cosmos.ConnectionMode.Gateway => "GATEWAY",
_ => connectionMode.ToString().ToUpper(),
};
}
}
}
12 changes: 6 additions & 6 deletions Microsoft.Azure.Cosmos/src/Telemetry/MetricInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,29 +48,29 @@ public MetricInfo(string metricsName,
internal long Count { get; set; }

[JsonProperty(PropertyName = "min")]
internal long Min { get; set; }
internal double Min { get; set; }

[JsonProperty(PropertyName = "max")]
internal long Max { get; set; }
internal double Max { get; set; }

[JsonProperty(PropertyName = "percentiles")]
internal IReadOnlyDictionary<Double, Double> Percentiles { get; set; }
internal IReadOnlyDictionary<double, double> Percentiles { get; set; }

/// <summary>
/// It will set the current object with the aggregated values from the given histogram
/// </summary>
/// <param name="histogram"></param>
/// <param name="adjustment"></param>
/// <returns>MetricInfo</returns>
internal MetricInfo SetAggregators(LongConcurrentHistogram histogram, long adjustment = 1)
internal MetricInfo SetAggregators(LongConcurrentHistogram histogram, double adjustment = 1)
{
if (histogram != null)
{
this.Count = histogram.TotalCount;
this.Max = histogram.GetMaxValue() / adjustment;
this.Min = histogram.GetMinValue() / adjustment;
this.Mean = histogram.GetMean() / Convert.ToDouble(adjustment);
IReadOnlyDictionary<Double, Double> percentile = new Dictionary<Double, Double>
this.Mean = histogram.GetMean() / adjustment;
IReadOnlyDictionary<double, double> percentile = new Dictionary<double, double>
{
{ ClientTelemetryOptions.Percentile50, histogram.GetValueAtPercentile(ClientTelemetryOptions.Percentile50) / adjustment },
{ ClientTelemetryOptions.Percentile90, histogram.GetValueAtPercentile(ClientTelemetryOptions.Percentile90) / adjustment },
Expand Down
22 changes: 20 additions & 2 deletions Microsoft.Azure.Cosmos/src/Telemetry/OperationInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ internal OperationInfo(string regionsContacted,
{
this.GreaterThan1Kb = responseSizeInBytes > ClientTelemetryOptions.OneKbToBytes;
}
this.Consistency = consistency?.ToString();
this.Consistency = OperationInfo.GetConsistencyString(consistency);
this.DatabaseName = databaseName;
this.ContainerName = containerName;
this.Operation = operation?.ToOperationTypeString();
Expand Down Expand Up @@ -93,6 +93,24 @@ public OperationInfo(string regionsContacted,
this.MetricInfo = metricInfo;
}

private static string GetConsistencyString(Cosmos.ConsistencyLevel? consistency)
{
if (consistency == null)
{
return null;
}

return consistency switch
{
Cosmos.ConsistencyLevel.Strong => "STRONG",
Cosmos.ConsistencyLevel.Session => "SESSION",
Cosmos.ConsistencyLevel.Eventual => "EVENTUAL",
Cosmos.ConsistencyLevel.ConsistentPrefix => "CONSISTENTPREFIX",
Cosmos.ConsistencyLevel.BoundedStaleness => "BOUNDEDSTALENESS",
_ => consistency.ToString().ToUpper(),
};
}

public OperationInfo Copy()
{
return new OperationInfo(this.RegionsContacted,
Expand Down Expand Up @@ -136,7 +154,7 @@ public override bool Equals(object obj)
return isequal;
}

internal void SetAggregators(LongConcurrentHistogram histogram, long adjustment = 1)
internal void SetAggregators(LongConcurrentHistogram histogram, double adjustment = 1)
{
this.MetricInfo.SetAggregators(histogram, adjustment);
}
Expand Down
2 changes: 1 addition & 1 deletion Microsoft.Azure.Cosmos/src/Telemetry/SystemInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public SystemInfo(MetricInfo metricInfo)
this.MetricInfo = metricInfo;
}

internal void SetAggregators(LongConcurrentHistogram histogram, long adjustment = 1)
internal void SetAggregators(LongConcurrentHistogram histogram, double adjustment = 1)
{
this.MetricInfo.SetAggregators(histogram, adjustment);
}
Expand Down
Loading

0 comments on commit 70ec89c

Please sign in to comment.