Skip to content

Commit fc32034

Browse files
MrFreezeex authored and julianwiedmann committed
health: remove self reporting of cluster/node name
Those metrics should be directly inferred by users' Prometheus config. When Cilium installs ServiceMonitor we in fact already add nodes and we can pretty much expect users with multiple clusters to add their own label to differentiate clusters. Signed-off-by: Arthur Outhenin-Chalandre <git@mrfreezeex.fr>
1 parent fdc0786 commit fc32034

File tree

5 files changed

+113
-162
lines changed

5 files changed

+113
-162
lines changed

Documentation/observability/metrics.rst

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -504,12 +504,12 @@ Name Labels
504504
Node Connectivity
505505
~~~~~~~~~~~~~~~~~
506506

507-
============================================= ====================================================================================================================================================================== ========== ==================================================================================================================================================================================================================
508-
Name Labels Default Description
509-
============================================= ====================================================================================================================================================================== ========== ==================================================================================================================================================================================================================
510-
``node_health_connectivity_status`` ``source_cluster``, ``source_node_name``, ``type``, ``status`` Enabled Number of endpoints with last observed status of both ICMP and HTTP connectivity between the current Cilium agent and other Cilium nodes
511-
``node_health_connectivity_latency_seconds`` ``source_cluster``, ``source_node_name``, ``type``, ``address_type``, ``protocol`` Enabled Histogram of the last observed latency between the current Cilium agent and other Cilium nodes in seconds
512-
============================================= ====================================================================================================================================================================== ========== ==================================================================================================================================================================================================================
507+
============================================= ======================================== ========== ==========================================================================================================================================
508+
Name Labels Default Description
509+
============================================= ======================================== ========== ==========================================================================================================================================
510+
``node_health_connectivity_status`` ``type``, ``status`` Enabled Number of endpoints with last observed status of both ICMP and HTTP connectivity between the current Cilium agent and other Cilium nodes
511+
``node_health_connectivity_latency_seconds`` ``type``, ``address_type``, ``protocol`` Enabled Histogram of the last observed latency between the current Cilium agent and other Cilium nodes in seconds
512+
============================================= ======================================== ========== ==========================================================================================================================================
513513

514514
Clustermesh
515515
~~~~~~~~~~~

Documentation/operations/upgrade.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,11 @@ now report per cluster metric instead of a "global" count and were renamed to re
395395
* ``cilium_operator_clustermesh_remote_cluster_services``
396396
* ``cilium_operator_clustermesh_remote_cluster_service_exports``
397397

398+
The following metrics no longer report the ``source_cluster`` and ``source_node_name`` labels:
399+
* ``node_health_connectivity_status``
400+
* ``node_health_connectivity_latency_seconds``
401+
402+
398403
Deprecated Metrics
399404
~~~~~~~~~~~~~~~~~~
400405

pkg/health/server/server.go

Lines changed: 14 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ package server
66
import (
77
"fmt"
88
"log/slog"
9-
"path"
109
"time"
1110

1211
"github.com/cilium/cilium/api/v1/client/daemon"
@@ -16,7 +15,6 @@ import (
1615
"github.com/cilium/cilium/api/v1/models"
1716
"github.com/cilium/cilium/pkg/api"
1817
ciliumPkg "github.com/cilium/cilium/pkg/client"
19-
ciliumDefaults "github.com/cilium/cilium/pkg/defaults"
2018
healthClientPkg "github.com/cilium/cilium/pkg/health/client"
2119
"github.com/cilium/cilium/pkg/health/defaults"
2220
"github.com/cilium/cilium/pkg/health/probe"
@@ -171,8 +169,6 @@ func (s *Server) collectNodeConnectivityMetrics(report *healthReport) {
171169
if s.localStatus == nil || report == nil {
172170
return
173171
}
174-
localClusterName, localNodeName := getClusterNodeName(s.localStatus.Name)
175-
176172
endpointStatuses := make(map[healthClientPkg.ConnectivityStatusType]int)
177173
nodeStatuses := make(map[healthClientPkg.ConnectivityStatusType]int)
178174

@@ -214,70 +210,70 @@ func (s *Server) collectNodeConnectivityMetrics(report *healthReport) {
214210
s.nodesSeen[n.Name] = struct{}{}
215211

216212
// HTTP endpoint primary
217-
collectConnectivityMetric(s.logger, endpointPathStatus.PrimaryAddress.HTTP, localClusterName, localNodeName,
213+
collectConnectivityMetric(s.logger, endpointPathStatus.PrimaryAddress.HTTP,
218214
metrics.LabelPeerEndpoint, metrics.LabelTrafficHTTP, metrics.LabelAddressTypePrimary)
219215

220216
// HTTP endpoint secondary
221217
for _, secondary := range endpointPathStatus.SecondaryAddresses {
222-
collectConnectivityMetric(s.logger, secondary.HTTP, localClusterName, localNodeName,
218+
collectConnectivityMetric(s.logger, secondary.HTTP,
223219
metrics.LabelPeerEndpoint, metrics.LabelTrafficHTTP, metrics.LabelAddressTypeSecondary)
224220
}
225221

226222
// HTTP node primary
227-
collectConnectivityMetric(s.logger, nodePathPrimaryAddress.HTTP, localClusterName, localNodeName,
223+
collectConnectivityMetric(s.logger, nodePathPrimaryAddress.HTTP,
228224
metrics.LabelPeerNode, metrics.LabelTrafficHTTP, metrics.LabelAddressTypePrimary)
229225

230226
// HTTP node secondary
231227
for _, secondary := range nodePathSecondaryAddress {
232-
collectConnectivityMetric(s.logger, secondary.HTTP, localClusterName, localNodeName,
228+
collectConnectivityMetric(s.logger, secondary.HTTP,
233229
metrics.LabelPeerNode, metrics.LabelTrafficHTTP, metrics.LabelAddressTypeSecondary)
234230
}
235231

236232
// ICMP endpoint primary
237-
collectConnectivityMetric(s.logger, endpointPathStatus.PrimaryAddress.Icmp, localClusterName, localNodeName,
233+
collectConnectivityMetric(s.logger, endpointPathStatus.PrimaryAddress.Icmp,
238234
metrics.LabelPeerEndpoint, metrics.LabelTrafficICMP, metrics.LabelAddressTypePrimary)
239235

240236
// ICMP endpoint secondary
241237
for _, secondary := range endpointPathStatus.SecondaryAddresses {
242-
collectConnectivityMetric(s.logger, secondary.Icmp, localClusterName, localNodeName,
238+
collectConnectivityMetric(s.logger, secondary.Icmp,
243239
metrics.LabelPeerEndpoint, metrics.LabelTrafficICMP, metrics.LabelAddressTypeSecondary)
244240
}
245241

246242
// ICMP node primary
247-
collectConnectivityMetric(s.logger, nodePathPrimaryAddress.Icmp, localClusterName, localNodeName,
243+
collectConnectivityMetric(s.logger, nodePathPrimaryAddress.Icmp,
248244
metrics.LabelPeerNode, metrics.LabelTrafficICMP, metrics.LabelAddressTypePrimary)
249245

250246
// ICMP node secondary
251247
for _, secondary := range nodePathSecondaryAddress {
252-
collectConnectivityMetric(s.logger, secondary.Icmp, localClusterName, localNodeName,
248+
collectConnectivityMetric(s.logger, secondary.Icmp,
253249
metrics.LabelPeerNode, metrics.LabelTrafficICMP, metrics.LabelAddressTypeSecondary)
254250
}
255251
}
256252

257253
// Aggregated health statuses for endpoint connectivity
258254
metrics.NodeHealthConnectivityStatus.WithLabelValues(
259-
localClusterName, localNodeName, metrics.LabelPeerEndpoint, metrics.LabelReachable).
255+
metrics.LabelPeerEndpoint, metrics.LabelReachable).
260256
Set(float64(endpointStatuses[healthClientPkg.ConnStatusReachable]))
261257

262258
metrics.NodeHealthConnectivityStatus.WithLabelValues(
263-
localClusterName, localNodeName, metrics.LabelPeerEndpoint, metrics.LabelUnreachable).
259+
metrics.LabelPeerEndpoint, metrics.LabelUnreachable).
264260
Set(float64(endpointStatuses[healthClientPkg.ConnStatusUnreachable]))
265261

266262
metrics.NodeHealthConnectivityStatus.WithLabelValues(
267-
localClusterName, localNodeName, metrics.LabelPeerEndpoint, metrics.LabelUnknown).
263+
metrics.LabelPeerEndpoint, metrics.LabelUnknown).
268264
Set(float64(endpointStatuses[healthClientPkg.ConnStatusUnknown]))
269265

270266
// Aggregated health statuses for node connectivity
271267
metrics.NodeHealthConnectivityStatus.WithLabelValues(
272-
localClusterName, localNodeName, metrics.LabelPeerNode, metrics.LabelReachable).
268+
metrics.LabelPeerNode, metrics.LabelReachable).
273269
Set(float64(nodeStatuses[healthClientPkg.ConnStatusReachable]))
274270

275271
metrics.NodeHealthConnectivityStatus.WithLabelValues(
276-
localClusterName, localNodeName, metrics.LabelPeerNode, metrics.LabelUnreachable).
272+
metrics.LabelPeerNode, metrics.LabelUnreachable).
277273
Set(float64(nodeStatuses[healthClientPkg.ConnStatusUnreachable]))
278274

279275
metrics.NodeHealthConnectivityStatus.WithLabelValues(
280-
localClusterName, localNodeName, metrics.LabelPeerNode, metrics.LabelUnknown).
276+
metrics.LabelPeerNode, metrics.LabelUnknown).
281277
Set(float64(nodeStatuses[healthClientPkg.ConnStatusUnknown]))
282278
}
283279

@@ -292,16 +288,6 @@ func collectConnectivityMetric(logger *slog.Logger, status *healthModels.Connect
292288
}
293289
}
294290

295-
// getClusterNodeName returns the cluster name and node name if possible.
296-
func getClusterNodeName(str string) (string, string) {
297-
clusterName, nodeName := path.Split(str)
298-
if len(clusterName) == 0 {
299-
return ciliumDefaults.ClusterName, nodeName
300-
}
301-
// remove forward slash at the end if any for cluster name
302-
return path.Dir(clusterName), nodeName
303-
}
304-
305291
// GetStatusResponse returns the most recent cluster connectivity status.
306292
func (s *Server) GetStatusResponse() *healthModels.HealthStatusResponse {
307293
s.RLock()

0 commit comments

Comments
 (0)