Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

server,tests: add additional lease metrics and test #18711

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions server/lease/lessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,12 @@ func (le *lessor) SetCheckpointer(cp Checkpointer) {

func (le *lessor) Grant(id LeaseID, ttl int64) (*Lease, error) {
if id == NoLease {
leaseGrantError.WithLabelValues(ErrLeaseNotFound.Error()).Inc()
return nil, ErrLeaseNotFound
}

if ttl > MaxLeaseTTL {
leaseGrantError.WithLabelValues(ErrLeaseExists.Error()).Inc()
return nil, ErrLeaseTTLTooLarge
}

Expand Down Expand Up @@ -329,6 +331,7 @@ func (le *lessor) Revoke(id LeaseID) error {
l := le.leaseMap[id]
if l == nil {
le.mu.Unlock()
leaseRevokeError.WithLabelValues(ErrLeaseNotFound.Error()).Inc()
return ErrLeaseNotFound
}

Expand Down Expand Up @@ -418,12 +421,15 @@ func (le *lessor) Renew(id LeaseID) (int64, error) {
// quorum to be revoked. To be accurate, renew request must wait for the
// deletion to complete.
case <-l.revokec:
leaseRenewError.WithLabelValues(ErrLeaseNotFound.Error()).Inc()
return -1, ErrLeaseNotFound
// The expired lease might fail to be revoked if the primary changes.
// The caller will retry on ErrNotPrimary.
case <-demotec:
leaseRenewError.WithLabelValues(ErrNotPrimary.Error()).Inc()
return -1, ErrNotPrimary
case <-le.stopC:
leaseRenewError.WithLabelValues(ErrNotPrimary.Error()).Inc()
return -1, ErrNotPrimary
}
}
Expand All @@ -433,6 +439,7 @@ func (le *lessor) Renew(id LeaseID) (int64, error) {
// of RAFT entries written per lease to a max of 2 per checkpoint interval.
if clearRemainingTTL {
if err := le.cp(context.Background(), &pb.LeaseCheckpointRequest{Checkpoints: []*pb.LeaseCheckpoint{{ID: int64(l.ID), Remaining_TTL: 0}}}); err != nil {
leaseRenewError.WithLabelValues(err.Error()).Inc()
return -1, err
}
}
Expand Down Expand Up @@ -555,6 +562,7 @@ func (le *lessor) Attach(id LeaseID, items []LeaseItem) error {

l.mu.Lock()
for _, it := range items {
leaseAttached.Inc()
l.itemSet[it] = struct{}{}
le.itemMap[it] = id
}
Expand Down Expand Up @@ -582,6 +590,7 @@ func (le *lessor) Detach(id LeaseID, items []LeaseItem) error {

l.mu.Lock()
for _, it := range items {
leaseDetached.Inc()
delete(l.itemSet, it)
delete(le.itemMap, it)
}
Expand Down Expand Up @@ -821,6 +830,7 @@ func (le *lessor) initAndRecover() {
}
}
le.leaseExpiredNotifier.Init()
initLeaseCount.Add(float64(len(lpbs)))
heap.Init(&le.leaseCheckpointHeap)

le.b.ForceCommit()
Expand Down
47 changes: 47 additions & 0 deletions server/lease/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,58 @@ var (
// 1 second -> 3 months
Buckets: prometheus.ExponentialBuckets(1, 2, 24),
})

leaseAttached = prometheus.NewCounter(prometheus.CounterOpts{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the purpose of metrics about lease attachements? What you are trying to measure? number of keys per lease?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

number of keys per lease?

Precisely, yes.

Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "attach_total",
Help: "The number of leases that are attached to a lease item.",
})

leaseDetached = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "detach_total",
Help: "The number of leases that are detached from a lease item.",
})
Comment on lines +53 to +65
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like we can consolidate the two metrics (leaseAttached and leaseDetached) into one something like below,

	leaseTotalAttachedKeys = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "etcd_debugging",
		Subsystem: "lease",
		Name:      "attached_keys_total",
		Help:      "Total number of attached key for each lease",
	},
		[]string{"lease_id"},
	)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have enough background about cardinality but my understanding is that, adding a key multiplies the cardinalities. But if not, then the idea sounds good. @ahrtr thoughts?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

adding a key multiplies the cardinalities

Yes, it's true. Indeed, It isn't a good idea to add label "lease_id". The prometheus official document clearly clarifies it https://prometheus.io/docs/practices/naming/#labels

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So

  • either keep it as it's (two metrics, leaseAttached and leaseDetached), but shouldn't them counter instead of gauge?
  • or consolidate them into one as mentioned above, but without the label "lease_id". In this case, it should be Gauge for sure.

Copy link
Member

@serathius serathius Oct 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have background in metrics cardinality. label_id is a big no-no. Please note that not everything can be expressed in metrics, they are aggregations that allow you to observe state of program. Some things, especially for debugging purposes are better expressed as events. So write logs.

Copy link
Contributor Author

@vivekpatani vivekpatani Nov 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or consolidate them into one as mentioned above, but without the label "lease_id". In this case, it should be Gauge for sure.

This seems like a good idea. Thoughts? @serathius

Some things, especially for debugging purposes are better expressed as events. So write logs

Logs are great but sometimes it's difficult to miss a trend, when there's log. Makes it easier to setup a trigger based on logs, than the trend of metrics, if that makes sense.


initLeaseCount = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "initial_lease_count",
Help: "Reports an initial lease count.",
})

leaseGrantError = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "grant_errors",
Help: "Error count by type to count for lease grants.",
}, []string{"error"})

leaseRevokeError = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "revoke_errors",
Help: "Error count by type to count for lease revokes.",
}, []string{"error"})

leaseRenewError = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "renew_errors",
Help: "Error count by type to count for lease renewals.",
}, []string{"error"})
)

func init() {
prometheus.MustRegister(leaseGranted)
prometheus.MustRegister(leaseRevoked)
prometheus.MustRegister(leaseRenewed)
prometheus.MustRegister(leaseTotalTTLs)
prometheus.MustRegister(leaseAttached)
prometheus.MustRegister(leaseDetached)
prometheus.MustRegister(leaseGrantError)
prometheus.MustRegister(leaseRevokeError)
prometheus.MustRegister(leaseRenewError)
}
137 changes: 137 additions & 0 deletions tests/integration/v3_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"errors"
"fmt"
"math"
"strconv"
"testing"
"time"

Expand All @@ -38,6 +39,142 @@ import (
gofail "go.etcd.io/gofail/runtime"
)

// TestV3LeaseMetrics
func TestV3LeaseMetrics(t *testing.T) {
integration.BeforeTest(t)

clusterSize := 1

clus := integration.NewCluster(t, &integration.ClusterConfig{
Size: clusterSize,
EnableLeaseCheckpoint: true,
})
defer clus.Terminate(t)
clus.WaitLeader(t)

ctx, cancel := context.WithCancel(context.Background())
defer cancel()

lc := integration.ToGRPC(clus.RandClient()).Lease
kvc := integration.ToGRPC(clus.RandClient()).KV

// lease grant op
lresp, err := lc.LeaseGrant(ctx, &pb.LeaseGrantRequest{TTL: fiveMinTTL})
if err != nil {
t.Fatal(err)
}

// lease grant stats
v, err := clus.Members[0].Metric("etcd_debugging_lease_granted_total")
if err != nil {
t.Errorf("expected to get metric, error %s", err)
}
if v != strconv.Itoa(clusterSize) {
t.Errorf("expected: %d, got %s", clusterSize, v)
}

// attach the keys to the lease
_, err = kvc.Put(context.TODO(), &pb.PutRequest{Key: []byte("test"), Lease: lresp.ID})
if err != nil {
t.Fatal(err)
}

// lease attach stats
v, err = clus.Members[0].Metric("etcd_debugging_lease_attach_total")
if err != nil {
t.Errorf("expected to get metric, error %s", err)
}
if v != strconv.Itoa(clusterSize) {
t.Errorf("expected: %d, got %s", clusterSize, v)
}

// lease revoke op
_, err = lc.LeaseRevoke(ctx, &pb.LeaseRevokeRequest{ID: lresp.ID})
if err != nil {
t.Fatal(err)
}

// list all leases
llresp, err := lc.LeaseLeases(context.TODO(), &pb.LeaseLeasesRequest{})
if err != nil {
t.Fatal(err)
}
if len(llresp.Leases) != 0 {
t.Errorf("expected no leases, got %d", len(llresp.Leases))
}

// lease revoke stats
v, err = clus.Members[0].Metric("etcd_debugging_lease_revoked_total")
if err != nil {
t.Errorf("expected to get metric, error %s", err)
}
if v != strconv.Itoa(clusterSize) {
t.Errorf("expected: %d, got %s", clusterSize, v)
}

// lease detach stats
v, err = clus.Members[0].Metric("etcd_debugging_lease_detach_total")
if err != nil {
t.Errorf("expected to get metric, error %s", err)
}
if v != strconv.Itoa(clusterSize) {
t.Errorf("expected: %d, got %s", clusterSize, v)
}

_, err = lc.LeaseGrant(context.TODO(), &pb.LeaseGrantRequest{TTL: 4327842798472398})
if err == nil || !errors.Is(err, rpctypes.ErrGRPCLeaseTTLTooLarge) {
t.Errorf("expected: %+v, got %+v", rpctypes.ErrGRPCLeaseTTLTooLarge, err)
}

_, err = lc.LeaseRevoke(context.TODO(), &pb.LeaseRevokeRequest{ID: lresp.ID + 1})
if err == nil && !errors.Is(err, rpctypes.ErrLeaseNotFound) {
t.Errorf("expected: %+v, got %+v", clusterSize, err)
}

// lease errors - grant
v, err = clus.Members[0].Metric("etcd_debugging_lease_grant_errors")
if err != nil {
t.Errorf("expected to get metric, error %s", err.Error())
}
if v == "" {
t.Errorf("expected: %s, got %s", "", v)
}

// lease errors - revoke
v, err = clus.Members[0].Metric("etcd_debugging_lease_revoke_errors")
if err != nil {
t.Errorf("expected to get metric, error %s", err.Error())
}
if v == "" {
t.Errorf("expected: %s, got %s", "", v)
}

// restart instance
clus.Members[0].Stop(t)
err = clus.Members[0].Restart(t)
if err != nil {
t.Fatalf("error while restarting etcd member %d, %+v", 0, err)
}
clus.Members[0].WaitOK(t)

// lease initial count
leasesResp, err := lc.LeaseLeases(ctx, &pb.LeaseLeasesRequest{})
if err != nil {
t.Fatalf("error while fetch leases, %+v", err)
}
if len(leasesResp.Leases) != 1 {
t.Errorf("expected lease count: %d, got: %d", 1, len(leasesResp.Leases))
}

v, err = clus.Members[0].Metric("etcd_debugging_lease_initial_lease_count")
if err != nil {
t.Errorf("expected to get metric, error %s", err.Error())
}
if v == "" {
t.Errorf("expected: %s, got %s", "1", v)
}
}

// TestV3LeasePromote ensures the newly elected leader can promote itself
// to the primary lessor, refresh the leases and start to manage leases.
// TODO: use customized clock to make this test go faster?
Expand Down
Loading