From 964e32ccecdb65d8c1c649c7743c63cf91976fcd Mon Sep 17 00:00:00 2001 From: Andrei Smirnov Date: Tue, 16 Apr 2024 13:20:54 +0300 Subject: [PATCH] Addressed PR feedback --- cmd/markdown_internal_test.go | 3 + cmd/relay/metrics.go | 90 +++++++++++++++++ cmd/relay/metrics_tracer.go | 159 +++++++++++++++++++++++++++++++ cmd/relay/metrics_tracer_test.go | 78 +++++++++++++++ cmd/relay/p2p.go | 3 +- cmd/relay/relay_internal_test.go | 2 +- docs/metrics.md | 3 + 7 files changed, 335 insertions(+), 3 deletions(-) create mode 100644 cmd/relay/metrics_tracer.go create mode 100644 cmd/relay/metrics_tracer_test.go diff --git a/cmd/markdown_internal_test.go b/cmd/markdown_internal_test.go index b46a95a47..cf614466b 100644 --- a/cmd/markdown_internal_test.go +++ b/cmd/markdown_internal_test.go @@ -91,6 +91,9 @@ All metrics contain the following labels, so they are omitted from the table bel The 'cluster_*' labels uniquely identify a specific node's metrics which is required when storing metrics from multiple nodes or clusters in one Prometheus instance. +Several 'relay_p2p' metrics having 'int_' prefix are produced by libp2p relay implementation. +[These](https://github.com/libp2p/go-libp2p/pull/2154) are not mentioned in this doc. + | Name | Type | Help | Labels | |---|---|---|---| {{ range . }}| '{{ .FQName }}' | {{ .Type }} | {{ .Help }} | '{{ Join .Labels ", " }}' | diff --git a/cmd/relay/metrics.go b/cmd/relay/metrics.go index 8cf34958d..30d10c9b0 100644 --- a/cmd/relay/metrics.go +++ b/cmd/relay/metrics.go @@ -48,6 +48,96 @@ var ( Name: "ping_latency", Help: "Ping latency by peer and cluster", }, []string{"peer", "peer_cluster"}) + + // Relay metrics produced by libp2p. + // These are prefixed with "int_" to avoid conflicts with other metrics. + + intStatus = prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: "relay", + Subsystem: "p2p", + Name: "int_status", + Help: "Relay Status", + }, + ) + + intReservationsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "relay", + Subsystem: "p2p", + Name: "int_reservations_total", + Help: "Relay Reservation Request", + }, + []string{"type"}, + ) + + intReservationRequestResponseStatusTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "relay", + Subsystem: "p2p", + Name: "int_reservation_request_response_status_total", + Help: "Relay Reservation Request Response Status", + }, + []string{"status"}, + ) + + intReservationRejectionsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "relay", + Subsystem: "p2p", + Name: "int_reservation_rejections_total", + Help: "Relay Reservation Rejected Reason", + }, + []string{"reason"}, + ) + + intConnectionsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "relay", + Subsystem: "p2p", + Name: "int_connections_total", + Help: "Relay Connection Total", + }, + []string{"type"}, + ) + + intConnectionRequestResponseStatusTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "relay", + Subsystem: "p2p", + Name: "int_connection_request_response_status_total", + Help: "Relay Connection Request Status", + }, + []string{"status"}, + ) + + intConnectionRejectionsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "relay", + Subsystem: "p2p", + Name: "int_connection_rejections_total", + Help: "Relay Connection Rejected Reason", + }, + []string{"reason"}, + ) + + intConnectionDurationSeconds = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: "relay", + Subsystem: "p2p", + Name: "int_connection_duration_seconds", + Help: "Relay Connection Duration", + }, + ) + + intDataTransferredBytesTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: "relay", + Subsystem: "p2p", + Name: "int_data_transferred_bytes_total", + Help: "Bytes Transferred Total", + }, + ) ) // newBandwidthCounter returns a new bandwidth counter that stops counting when the context is cancelled. diff --git a/cmd/relay/metrics_tracer.go b/cmd/relay/metrics_tracer.go new file mode 100644 index 000000000..8f12e77a6 --- /dev/null +++ b/cmd/relay/metrics_tracer.go @@ -0,0 +1,159 @@ +// Copyright © 2022-2024 Obol Labs Inc. Licensed under the terms of a Business Source License 1.1 + +package relay + +import ( + "time" + + "github.com/libp2p/go-libp2p/p2p/metricshelper" + pbv2 "github.com/libp2p/go-libp2p/p2p/protocol/circuitv2/pb" + "github.com/libp2p/go-libp2p/p2p/protocol/circuitv2/relay" + "github.com/prometheus/client_golang/prometheus" +) + +// This implementation must be kept in sync with metricsTracer in +// https://github.com/libp2p/go-libp2p/blob/master/p2p/protocol/circuitv2/relay/metrics.go + +const ( + requestStatusOK = "ok" + requestStatusRejected = "rejected" + requestStatusError = "error" +) + +var collectors = []prometheus.Collector{ + intStatus, + intReservationsTotal, + intReservationRequestResponseStatusTotal, + intReservationRejectionsTotal, + intConnectionsTotal, + intConnectionRequestResponseStatusTotal, + intConnectionRejectionsTotal, + intConnectionDurationSeconds, + intDataTransferredBytesTotal, +} + +func NewMetricsTracer(promRegisterer prometheus.Registerer) relay.MetricsTracer { + metricshelper.RegisterCollectors(promRegisterer, collectors...) + + return &metricsTracer{} +} + +type metricsTracer struct{} + +func (*metricsTracer) RelayStatus(enabled bool) { + if enabled { + intStatus.Set(1) + } else { + intStatus.Set(0) + } +} + +func (*metricsTracer) ConnectionOpened() { + tags := metricshelper.GetStringSlice() + defer metricshelper.PutStringSlice(tags) + *tags = append(*tags, "opened") + + intConnectionsTotal.WithLabelValues(*tags...).Add(1) +} + +func (*metricsTracer) ConnectionClosed(d time.Duration) { + tags := metricshelper.GetStringSlice() + defer metricshelper.PutStringSlice(tags) + *tags = append(*tags, "closed") + + intConnectionsTotal.WithLabelValues(*tags...).Add(1) + intConnectionDurationSeconds.Observe(d.Seconds()) +} + +func (*metricsTracer) ConnectionRequestHandled(status pbv2.Status) { + tags := metricshelper.GetStringSlice() + defer metricshelper.PutStringSlice(tags) + + respStatus := getResponseStatus(status) + + *tags = append(*tags, respStatus) + intConnectionRequestResponseStatusTotal.WithLabelValues(*tags...).Add(1) + if respStatus == requestStatusRejected { + *tags = (*tags)[:0] + *tags = append(*tags, getRejectionReason(status)) + intConnectionRejectionsTotal.WithLabelValues(*tags...).Add(1) + } +} + +func (*metricsTracer) ReservationAllowed(isRenewal bool) { + tags := metricshelper.GetStringSlice() + defer metricshelper.PutStringSlice(tags) + if isRenewal { + *tags = append(*tags, "renewed") + } else { + *tags = append(*tags, "opened") + } + + intReservationsTotal.WithLabelValues(*tags...).Add(1) +} + +func (*metricsTracer) ReservationClosed(cnt int) { + tags := metricshelper.GetStringSlice() + defer metricshelper.PutStringSlice(tags) + *tags = append(*tags, "closed") + + intReservationsTotal.WithLabelValues(*tags...).Add(float64(cnt)) +} + +func (*metricsTracer) ReservationRequestHandled(status pbv2.Status) { + tags := metricshelper.GetStringSlice() + defer metricshelper.PutStringSlice(tags) + + respStatus := getResponseStatus(status) + + *tags = append(*tags, respStatus) + intReservationRequestResponseStatusTotal.WithLabelValues(*tags...).Add(1) + if respStatus == requestStatusRejected { + *tags = (*tags)[:0] + *tags = append(*tags, getRejectionReason(status)) + intReservationRejectionsTotal.WithLabelValues(*tags...).Add(1) + } +} + +func (*metricsTracer) BytesTransferred(cnt int) { + intDataTransferredBytesTotal.Add(float64(cnt)) +} + +func getResponseStatus(status pbv2.Status) string { + responseStatus := "unknown" + switch status { + case pbv2.Status_RESERVATION_REFUSED, + pbv2.Status_RESOURCE_LIMIT_EXCEEDED, + pbv2.Status_PERMISSION_DENIED, + pbv2.Status_NO_RESERVATION, + pbv2.Status_MALFORMED_MESSAGE: + + responseStatus = requestStatusRejected + case pbv2.Status_UNEXPECTED_MESSAGE, pbv2.Status_CONNECTION_FAILED: + responseStatus = requestStatusError + case pbv2.Status_OK: + responseStatus = requestStatusOK + default: + } + + return responseStatus +} + +func getRejectionReason(status pbv2.Status) string { + reason := "unknown" + switch status { + case pbv2.Status_RESERVATION_REFUSED: + reason = "ip constraint violation" + case pbv2.Status_RESOURCE_LIMIT_EXCEEDED: + reason = "resource limit exceeded" + case pbv2.Status_PERMISSION_DENIED: + reason = "permission denied" + case pbv2.Status_NO_RESERVATION: + reason = "no reservation" + case pbv2.Status_MALFORMED_MESSAGE: + reason = "malformed message" + default: + } + + return reason +} diff --git a/cmd/relay/metrics_tracer_test.go b/cmd/relay/metrics_tracer_test.go new file mode 100644 index 000000000..e9c21f846 --- /dev/null +++ b/cmd/relay/metrics_tracer_test.go @@ -0,0 +1,78 @@ +// Copyright © 2022-2024 Obol Labs Inc. Licensed under the terms of a Business Source License 1.1 + +package relay_test + +import ( + "strings" + "testing" + + pbv2 "github.com/libp2p/go-libp2p/p2p/protocol/circuitv2/pb" + p2p_relay "github.com/libp2p/go-libp2p/p2p/protocol/circuitv2/relay" + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/require" + + "github.com/obolnetwork/charon/app/promauto" + "github.com/obolnetwork/charon/cmd/relay" +) + +func TestMetricsTracer(t *testing.T) { + // The test runs both charon and libp2p MetricsTracer instances + // and compares the metric values produced by both. + // The only difference is metric namespace and name prefixes. + + charonReg, err := promauto.NewRegistry(prometheus.Labels{}) + require.NoError(t, err) + charonMT := relay.NewMetricsTracer(charonReg) + + libp2pReg, err := promauto.NewRegistry(prometheus.Labels{}) + require.NoError(t, err) + libp2pMT := p2p_relay.NewMetricsTracer(p2p_relay.WithRegisterer(libp2pReg)) + + charonMT.RelayStatus(true) + libp2pMT.RelayStatus(true) + charonMT.BytesTransferred(100) + libp2pMT.BytesTransferred(100) + charonMT.ConnectionOpened() + libp2pMT.ConnectionOpened() + charonMT.ConnectionClosed(0) + libp2pMT.ConnectionClosed(0) + charonMT.ReservationRequestHandled(pbv2.Status_CONNECTION_FAILED) + libp2pMT.ReservationRequestHandled(pbv2.Status_CONNECTION_FAILED) + charonMT.ReservationAllowed(true) + libp2pMT.ReservationAllowed(true) + charonMT.ConnectionRequestHandled(pbv2.Status_CONNECTION_FAILED) + libp2pMT.ConnectionRequestHandled(pbv2.Status_CONNECTION_FAILED) + charonMT.ReservationClosed(123) + libp2pMT.ReservationClosed(123) + + charonMetrics, err := charonReg.Gather() + require.NoError(t, err) + + libp2pMetrics, err := libp2pReg.Gather() + require.NoError(t, err) + + libp2pMetricsMap := make(map[string][]string) + for _, lm := range libp2pMetrics { + if strings.HasPrefix(lm.GetName(), "libp2p_relaysvc_") { + name := strings.TrimPrefix(lm.GetName(), "libp2p_relaysvc_") + var vals []string + for _, m := range lm.GetMetric() { + // timing fields go always last, safe to trim as they will differ + vals = append(vals, strings.Split(m.String(), "seconds:")[0]) + } + libp2pMetricsMap[name] = vals + } + } + + for _, cm := range charonMetrics { + if strings.HasPrefix(cm.GetName(), "relay_p2p_int_") { + name := strings.TrimPrefix(cm.GetName(), "relay_p2p_int_") + charonVals := cm.GetMetric() + libp2pVals := libp2pMetricsMap[name] + for i := 0; i < len(charonVals); i++ { + exp := strings.Split(charonVals[i].String(), "seconds:")[0] + require.Equal(t, exp, libp2pVals[i]) + } + } + } +} diff --git a/cmd/relay/p2p.go b/cmd/relay/p2p.go index c8748b744..793d0e7cd 100644 --- a/cmd/relay/p2p.go +++ b/cmd/relay/p2p.go @@ -65,8 +65,7 @@ func startP2P(ctx context.Context, config Config, key *k1.PrivateKey, reporter m relayResources.MaxReservations = config.MaxConns relayResources.MaxCircuits = config.MaxResPerPeer - // This enables relay metrics: https://github.com/libp2p/go-libp2p/blob/master/p2p/protocol/circuitv2/relay/metrics.go - mt := relay.NewMetricsTracer(relay.WithRegisterer(promRegistry)) + mt := NewMetricsTracer(promRegistry) relayService, err := relay.New(tcpNode, relay.WithResources(relayResources), relay.WithMetricsTracer(mt)) if err != nil { return nil, nil, errors.Wrap(err, "new relay service") diff --git a/cmd/relay/relay_internal_test.go b/cmd/relay/relay_internal_test.go index 44b08f5b1..2b3ae31ed 100644 --- a/cmd/relay/relay_internal_test.go +++ b/cmd/relay/relay_internal_test.go @@ -262,6 +262,6 @@ func TestRelayMetricsExported(t *testing.T) { } require.Eventually(t, func() bool { - return strings.Contains(fetchMetrics(), "libp2p_relaysvc_") + return strings.Contains(fetchMetrics(), "relay_p2p_int_status") }, 10*time.Second, time.Second, "waiting for relay service to start") } diff --git a/docs/metrics.md b/docs/metrics.md index 83224ad82..a34f45f91 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -11,6 +11,9 @@ All metrics contain the following labels, so they are omitted from the table bel The `cluster_*` labels uniquely identify a specific node`s metrics which is required when storing metrics from multiple nodes or clusters in one Prometheus instance. +Several `relay_p2p` metrics having `int_` prefix are produced by libp2p relay implementation. +[These](https://github.com/libp2p/go-libp2p/pull/2154) are not mentioned in this doc. + | Name | Type | Help | Labels | |---|---|---|---| | `app_beacon_node_peers` | Gauge | Gauge set to the peer count of the upstream beacon node | |