From ba35d471dfae1de5fd002069c4d055afb230f142 Mon Sep 17 00:00:00 2001
From: vivekpatani <9080894+vivekpatani@users.noreply.github.com>
Date: Wed, 9 Oct 2024 15:53:10 -0700
Subject: [PATCH] server,tests: add additional lease metrics and test

- metrics to capture lease items attached and detached
- metric to capture initial lease count at startup
- metrics to capture lease grant, revoke and renew errors, labelled by error

Signed-off-by: vivekpatani <9080894+vivekpatani@users.noreply.github.com>
---
 server/lease/lessor.go             |  10 +++
 server/lease/metrics.go            |  48 ++++++++++
 tests/integration/v3_lease_test.go | 145 +++++++++++++++++++++++++++++
 3 files changed, 203 insertions(+)

diff --git a/server/lease/lessor.go b/server/lease/lessor.go
index cf2028933d5..aaa1bfb0f53 100644
--- a/server/lease/lessor.go
+++ b/server/lease/lessor.go
@@ -280,10 +280,12 @@ func (le *lessor) SetCheckpointer(cp Checkpointer) {
 
 func (le *lessor) Grant(id LeaseID, ttl int64) (*Lease, error) {
 	if id == NoLease {
+		leaseGrantError.WithLabelValues(ErrLeaseNotFound.Error()).Inc()
 		return nil, ErrLeaseNotFound
 	}
 
 	if ttl > MaxLeaseTTL {
+		leaseGrantError.WithLabelValues(ErrLeaseTTLTooLarge.Error()).Inc()
 		return nil, ErrLeaseTTLTooLarge
 	}
 
@@ -329,6 +331,7 @@ func (le *lessor) Revoke(id LeaseID) error {
 	l := le.leaseMap[id]
 	if l == nil {
 		le.mu.Unlock()
+		leaseRevokeError.WithLabelValues(ErrLeaseNotFound.Error()).Inc()
 		return ErrLeaseNotFound
 	}
 
@@ -418,12 +421,15 @@ func (le *lessor) Renew(id LeaseID) (int64, error) {
 		// quorum to be revoked. To be accurate, renew request must wait for the
 		// deletion to complete.
 		case <-l.revokec:
+			leaseRenewError.WithLabelValues(ErrLeaseNotFound.Error()).Inc()
 			return -1, ErrLeaseNotFound
 		// The expired lease might fail to be revoked if the primary changes.
 		// The caller will retry on ErrNotPrimary.
 		case <-demotec:
+			leaseRenewError.WithLabelValues(ErrNotPrimary.Error()).Inc()
 			return -1, ErrNotPrimary
 		case <-le.stopC:
+			leaseRenewError.WithLabelValues(ErrNotPrimary.Error()).Inc()
 			return -1, ErrNotPrimary
 		}
 	}
@@ -433,6 +439,7 @@ func (le *lessor) Renew(id LeaseID) (int64, error) {
 	// of RAFT entries written per lease to a max of 2 per checkpoint interval.
 	if clearRemainingTTL {
 		if err := le.cp(context.Background(), &pb.LeaseCheckpointRequest{Checkpoints: []*pb.LeaseCheckpoint{{ID: int64(l.ID), Remaining_TTL: 0}}}); err != nil {
+			leaseRenewError.WithLabelValues(err.Error()).Inc()
 			return -1, err
 		}
 	}
@@ -555,6 +562,7 @@ func (le *lessor) Attach(id LeaseID, items []LeaseItem) error {
 
 	l.mu.Lock()
 	for _, it := range items {
+		leaseAttached.Inc()
 		l.itemSet[it] = struct{}{}
 		le.itemMap[it] = id
 	}
@@ -582,6 +590,7 @@ func (le *lessor) Detach(id LeaseID, items []LeaseItem) error {
 
 	l.mu.Lock()
 	for _, it := range items {
+		leaseDetached.Inc()
 		delete(l.itemSet, it)
 		delete(le.itemMap, it)
 	}
@@ -821,6 +830,7 @@ func (le *lessor) initAndRecover() {
 		}
 	}
 	le.leaseExpiredNotifier.Init()
+	initLeaseCount.Set(float64(len(lpbs)))
 	heap.Init(&le.leaseCheckpointHeap)
 
 	le.b.ForceCommit()
diff --git a/server/lease/metrics.go b/server/lease/metrics.go
index 06f8b58015f..94f9177c89f 100644
--- a/server/lease/metrics.go
+++ b/server/lease/metrics.go
@@ -49,6 +49,48 @@ var (
 			// 1 second -> 3 months
 			Buckets: prometheus.ExponentialBuckets(1, 2, 24),
 		})
+
+	leaseAttached = prometheus.NewCounter(prometheus.CounterOpts{
+		Namespace: "etcd_debugging",
+		Subsystem: "lease",
+		Name:      "attach_total",
+		Help:      "The total number of items attached to leases.",
+	})
+
+	leaseDetached = prometheus.NewCounter(prometheus.CounterOpts{
+		Namespace: "etcd_debugging",
+		Subsystem: "lease",
+		Name:      "detach_total",
+		Help:      "The total number of items detached from leases.",
+	})
+
+	initLeaseCount = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "etcd_debugging",
+		Subsystem: "lease",
+		Name:      "initial_lease_count",
+		Help:      "The number of leases present when the lessor was initialized.",
+	})
+
+	leaseGrantError = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "etcd_debugging",
+		Subsystem: "lease",
+		Name:      "grant_errors",
+		Help:      "The total number of lease grant errors, by error.",
+	}, []string{"error"})
+
+	leaseRevokeError = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "etcd_debugging",
+		Subsystem: "lease",
+		Name:      "revoke_errors",
+		Help:      "The total number of lease revoke errors, by error.",
+	}, []string{"error"})
+
+	leaseRenewError = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "etcd_debugging",
+		Subsystem: "lease",
+		Name:      "renew_errors",
+		Help:      "The total number of lease renew errors, by error.",
+	}, []string{"error"})
 )
 
 func init() {
@@ -56,4 +98,10 @@ func init() {
 	prometheus.MustRegister(leaseRevoked)
 	prometheus.MustRegister(leaseRenewed)
 	prometheus.MustRegister(leaseTotalTTLs)
+	prometheus.MustRegister(leaseAttached)
+	prometheus.MustRegister(leaseDetached)
+	prometheus.MustRegister(initLeaseCount)
+	prometheus.MustRegister(leaseGrantError)
+	prometheus.MustRegister(leaseRevokeError)
+	prometheus.MustRegister(leaseRenewError)
 }
diff --git a/tests/integration/v3_lease_test.go b/tests/integration/v3_lease_test.go
index 339b05a526d..4f1a4f6c218 100644
--- a/tests/integration/v3_lease_test.go
+++ b/tests/integration/v3_lease_test.go
@@ -19,6 +19,7 @@ import (
 	"errors"
 	"fmt"
 	"math"
+	"strconv"
 	"testing"
 	"time"
 
@@ -38,6 +39,150 @@ import (
 	gofail "go.etcd.io/gofail/runtime"
 )
 
+// TestV3LeaseMetrics checks the lease metrics exposed by a single-member
+// cluster: grant, attach, revoke and detach counters, the grant/revoke error
+// counters, and the initial lease count reported after a restart.
+func TestV3LeaseMetrics(t *testing.T) {
+	integration.BeforeTest(t)
+
+	clusterSize := 1
+
+	clus := integration.NewCluster(t, &integration.ClusterConfig{
+		Size:                  clusterSize,
+		EnableLeaseCheckpoint: true,
+	})
+	defer clus.Terminate(t)
+	clus.WaitLeader(t)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	lc := integration.ToGRPC(clus.RandClient()).Lease
+	kvc := integration.ToGRPC(clus.RandClient()).KV
+
+	// lease grant op
+	lresp, err := lc.LeaseGrant(ctx, &pb.LeaseGrantRequest{TTL: fiveMinTTL})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// lease grant stats
+	v, err := clus.Members[0].Metric("etcd_debugging_lease_granted_total")
+	if err != nil {
+		t.Errorf("expected to get metric, error %s", err)
+	}
+	if v != strconv.Itoa(clusterSize) {
+		t.Errorf("expected: %d, got %s", clusterSize, v)
+	}
+
+	// attach the keys to the lease
+	_, err = kvc.Put(context.TODO(), &pb.PutRequest{Key: []byte("test"), Lease: lresp.ID})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// lease attach stats
+	v, err = clus.Members[0].Metric("etcd_debugging_lease_attach_total")
+	if err != nil {
+		t.Errorf("expected to get metric, error %s", err)
+	}
+	if v != strconv.Itoa(clusterSize) {
+		t.Errorf("expected: %d, got %s", clusterSize, v)
+	}
+
+	// lease revoke op
+	_, err = lc.LeaseRevoke(ctx, &pb.LeaseRevokeRequest{ID: lresp.ID})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// list all leases
+	llresp, err := lc.LeaseLeases(context.TODO(), &pb.LeaseLeasesRequest{})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(llresp.Leases) != 0 {
+		t.Errorf("expected no leases, got %d", len(llresp.Leases))
+	}
+
+	// lease revoke stats
+	v, err = clus.Members[0].Metric("etcd_debugging_lease_revoked_total")
+	if err != nil {
+		t.Errorf("expected to get metric, error %s", err)
+	}
+	if v != strconv.Itoa(clusterSize) {
+		t.Errorf("expected: %d, got %s", clusterSize, v)
+	}
+
+	// lease detach stats
+	v, err = clus.Members[0].Metric("etcd_debugging_lease_detach_total")
+	if err != nil {
+		t.Errorf("expected to get metric, error %s", err)
+	}
+	if v != strconv.Itoa(clusterSize) {
+		t.Errorf("expected: %d, got %s", clusterSize, v)
+	}
+
+	_, err = lc.LeaseGrant(context.TODO(), &pb.LeaseGrantRequest{TTL: 4327842798472398})
+	if err == nil || !errors.Is(err, rpctypes.ErrGRPCLeaseTTLTooLarge) {
+		t.Errorf("expected: %+v, got %+v", rpctypes.ErrGRPCLeaseTTLTooLarge, err)
+	}
+
+	_, err = lc.LeaseRevoke(context.TODO(), &pb.LeaseRevokeRequest{ID: lresp.ID + 1})
+	if err == nil || !errors.Is(err, rpctypes.ErrGRPCLeaseNotFound) {
+		t.Errorf("expected: %+v, got %+v", rpctypes.ErrGRPCLeaseNotFound, err)
+	}
+
+	// lease errors - grant
+	v, err = clus.Members[0].Metric("etcd_debugging_lease_grant_errors")
+	if err != nil {
+		t.Errorf("expected to get metric, error %s", err.Error())
+	}
+	if v == "" {
+		t.Errorf("expected a non-empty metric value, got %q", v)
+	}
+
+	// lease errors - revoke
+	v, err = clus.Members[0].Metric("etcd_debugging_lease_revoke_errors")
+	if err != nil {
+		t.Errorf("expected to get metric, error %s", err.Error())
+	}
+	if v == "" {
+		t.Errorf("expected a non-empty metric value, got %q", v)
+	}
+
+	// grant a lease that outlives the restart so exactly one lease is recovered
+	_, err = lc.LeaseGrant(ctx, &pb.LeaseGrantRequest{TTL: fiveMinTTL})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// restart instance
+	clus.Members[0].Stop(t)
+	err = clus.Members[0].Restart(t)
+	if err != nil {
+		t.Fatalf("error while restarting etcd member %d, %+v", 0, err)
+	}
+	clus.Members[0].WaitOK(t)
+
+	// lease initial count
+	leasesResp, err := lc.LeaseLeases(ctx, &pb.LeaseLeasesRequest{})
+	if err != nil {
+		t.Fatalf("error while fetch leases, %+v", err)
+	}
+	if len(leasesResp.Leases) != 1 {
+		t.Errorf("expected lease count: %d, got: %d", 1, len(leasesResp.Leases))
+	}
+
+	v, err = clus.Members[0].Metric("etcd_debugging_lease_initial_lease_count")
+	if err != nil {
+		t.Errorf("expected to get metric, error %s", err.Error())
+	}
+	if v != "1" {
+		t.Errorf("expected: %s, got %s", "1", v)
+	}
+}
+
 // TestV3LeasePromote ensures the newly elected leader can promote itself
 // to the primary lessor, refresh the leases and start to manage leases.
 // TODO: use customized clock to make this test go faster?