Skip to content

Commit

Permalink
server,tests: add additional lease metrics and test
Browse files Browse the repository at this point in the history
- metrics to capture leases attached and detached
- metric to capture initial lease count at startup

Signed-off-by: vivekpatani <[email protected]>
  • Loading branch information
vivekpatani committed Nov 4, 2024
1 parent 995027f commit ba35d47
Show file tree
Hide file tree
Showing 3 changed files with 194 additions and 0 deletions.
10 changes: 10 additions & 0 deletions server/lease/lessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,12 @@ func (le *lessor) SetCheckpointer(cp Checkpointer) {

func (le *lessor) Grant(id LeaseID, ttl int64) (*Lease, error) {
if id == NoLease {
leaseGrantError.WithLabelValues(ErrLeaseNotFound.Error()).Inc()
return nil, ErrLeaseNotFound
}

if ttl > MaxLeaseTTL {
leaseGrantError.WithLabelValues(ErrLeaseExists.Error()).Inc()
return nil, ErrLeaseTTLTooLarge
}

Expand Down Expand Up @@ -329,6 +331,7 @@ func (le *lessor) Revoke(id LeaseID) error {
l := le.leaseMap[id]
if l == nil {
le.mu.Unlock()
leaseRevokeError.WithLabelValues(ErrLeaseNotFound.Error()).Inc()
return ErrLeaseNotFound
}

Expand Down Expand Up @@ -418,12 +421,15 @@ func (le *lessor) Renew(id LeaseID) (int64, error) {
// quorum to be revoked. To be accurate, renew request must wait for the
// deletion to complete.
case <-l.revokec:
leaseRenewError.WithLabelValues(ErrLeaseNotFound.Error()).Inc()
return -1, ErrLeaseNotFound
// The expired lease might fail to be revoked if the primary changes.
// The caller will retry on ErrNotPrimary.
case <-demotec:
leaseRenewError.WithLabelValues(ErrNotPrimary.Error()).Inc()
return -1, ErrNotPrimary
case <-le.stopC:
leaseRenewError.WithLabelValues(ErrNotPrimary.Error()).Inc()
return -1, ErrNotPrimary
}
}
Expand All @@ -433,6 +439,7 @@ func (le *lessor) Renew(id LeaseID) (int64, error) {
// of RAFT entries written per lease to a max of 2 per checkpoint interval.
if clearRemainingTTL {
if err := le.cp(context.Background(), &pb.LeaseCheckpointRequest{Checkpoints: []*pb.LeaseCheckpoint{{ID: int64(l.ID), Remaining_TTL: 0}}}); err != nil {
leaseRenewError.WithLabelValues(err.Error()).Inc()
return -1, err
}
}
Expand Down Expand Up @@ -555,6 +562,7 @@ func (le *lessor) Attach(id LeaseID, items []LeaseItem) error {

l.mu.Lock()
for _, it := range items {
leaseAttached.Inc()
l.itemSet[it] = struct{}{}
le.itemMap[it] = id
}
Expand Down Expand Up @@ -582,6 +590,7 @@ func (le *lessor) Detach(id LeaseID, items []LeaseItem) error {

l.mu.Lock()
for _, it := range items {
leaseDetached.Inc()
delete(l.itemSet, it)
delete(le.itemMap, it)
}
Expand Down Expand Up @@ -821,6 +830,7 @@ func (le *lessor) initAndRecover() {
}
}
le.leaseExpiredNotifier.Init()
initLeaseCount.Add(float64(len(lpbs)))
heap.Init(&le.leaseCheckpointHeap)

le.b.ForceCommit()
Expand Down
47 changes: 47 additions & 0 deletions server/lease/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,58 @@ var (
// 1 second -> 3 months
Buckets: prometheus.ExponentialBuckets(1, 2, 24),
})

leaseAttached = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "attach_total",
Help: "The number of leases that are attached to a lease item.",
})

leaseDetached = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "detach_total",
Help: "The number of leases that are detached from a lease item.",
})

initLeaseCount = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "initial_lease_count",
Help: "Reports an initial lease count.",
})

leaseGrantError = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "grant_errors",
Help: "Error count by type to count for lease grants.",
}, []string{"error"})

leaseRevokeError = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "revoke_errors",
Help: "Error count by type to count for lease revokes.",
}, []string{"error"})

leaseRenewError = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "lease",
Name: "renew_errors",
Help: "Error count by type to count for lease renewals.",
}, []string{"error"})
)

func init() {
prometheus.MustRegister(leaseGranted)
prometheus.MustRegister(leaseRevoked)
prometheus.MustRegister(leaseRenewed)
prometheus.MustRegister(leaseTotalTTLs)
prometheus.MustRegister(leaseAttached)
prometheus.MustRegister(leaseDetached)
prometheus.MustRegister(leaseGrantError)
prometheus.MustRegister(leaseRevokeError)
prometheus.MustRegister(leaseRenewError)
}
137 changes: 137 additions & 0 deletions tests/integration/v3_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"errors"
"fmt"
"math"
"strconv"
"testing"
"time"

Expand All @@ -38,6 +39,142 @@ import (
gofail "go.etcd.io/gofail/runtime"
)

// TestV3LeaseMetrics
func TestV3LeaseMetrics(t *testing.T) {
integration.BeforeTest(t)

clusterSize := 1

clus := integration.NewCluster(t, &integration.ClusterConfig{
Size: clusterSize,
EnableLeaseCheckpoint: true,
})
defer clus.Terminate(t)
clus.WaitLeader(t)

ctx, cancel := context.WithCancel(context.Background())
defer cancel()

lc := integration.ToGRPC(clus.RandClient()).Lease
kvc := integration.ToGRPC(clus.RandClient()).KV

// lease grant op
lresp, err := lc.LeaseGrant(ctx, &pb.LeaseGrantRequest{TTL: fiveMinTTL})
if err != nil {
t.Fatal(err)
}

// lease grant stats
v, err := clus.Members[0].Metric("etcd_debugging_lease_granted_total")
if err != nil {
t.Errorf("expected to get metric, error %s", err)
}
if v != strconv.Itoa(clusterSize) {
t.Errorf("expected: %d, got %s", clusterSize, v)
}

// attach the keys to the lease
_, err = kvc.Put(context.TODO(), &pb.PutRequest{Key: []byte("test"), Lease: lresp.ID})
if err != nil {
t.Fatal(err)
}

// lease attach stats
v, err = clus.Members[0].Metric("etcd_debugging_lease_attach_total")
if err != nil {
t.Errorf("expected to get metric, error %s", err)
}
if v != strconv.Itoa(clusterSize) {
t.Errorf("expected: %d, got %s", clusterSize, v)
}

// lease revoke op
_, err = lc.LeaseRevoke(ctx, &pb.LeaseRevokeRequest{ID: lresp.ID})
if err != nil {
t.Fatal(err)
}

// list all leases
llresp, err := lc.LeaseLeases(context.TODO(), &pb.LeaseLeasesRequest{})
if err != nil {
t.Fatal(err)
}
if len(llresp.Leases) != 0 {
t.Errorf("expected no leases, got %d", len(llresp.Leases))
}

// lease revoke stats
v, err = clus.Members[0].Metric("etcd_debugging_lease_revoked_total")
if err != nil {
t.Errorf("expected to get metric, error %s", err)
}
if v != strconv.Itoa(clusterSize) {
t.Errorf("expected: %d, got %s", clusterSize, v)
}

// lease detach stats
v, err = clus.Members[0].Metric("etcd_debugging_lease_detach_total")
if err != nil {
t.Errorf("expected to get metric, error %s", err)
}
if v != strconv.Itoa(clusterSize) {
t.Errorf("expected: %d, got %s", clusterSize, v)
}

_, err = lc.LeaseGrant(context.TODO(), &pb.LeaseGrantRequest{TTL: 4327842798472398})
if err == nil || !errors.Is(err, rpctypes.ErrGRPCLeaseTTLTooLarge) {
t.Errorf("expected: %+v, got %+v", rpctypes.ErrGRPCLeaseTTLTooLarge, err)
}

_, err = lc.LeaseRevoke(context.TODO(), &pb.LeaseRevokeRequest{ID: lresp.ID + 1})
if err == nil && !errors.Is(err, rpctypes.ErrLeaseNotFound) {
t.Errorf("expected: %+v, got %+v", clusterSize, err)
}

// lease errors - grant
v, err = clus.Members[0].Metric("etcd_debugging_lease_grant_errors")
if err != nil {
t.Errorf("expected to get metric, error %s", err.Error())
}
if v == "" {
t.Errorf("expected: %s, got %s", "", v)
}

// lease errors - revoke
v, err = clus.Members[0].Metric("etcd_debugging_lease_revoke_errors")
if err != nil {
t.Errorf("expected to get metric, error %s", err.Error())
}
if v == "" {
t.Errorf("expected: %s, got %s", "", v)
}

// restart instance
clus.Members[0].Stop(t)
err = clus.Members[0].Restart(t)
if err != nil {
t.Fatalf("error while restarting etcd member %d, %+v", 0, err)
}
clus.Members[0].WaitOK(t)

// lease initial count
leasesResp, err := lc.LeaseLeases(ctx, &pb.LeaseLeasesRequest{})
if err != nil {
t.Fatalf("error while fetch leases, %+v", err)
}
if len(leasesResp.Leases) != 1 {
t.Errorf("expected lease count: %d, got: %d", 1, len(leasesResp.Leases))
}

v, err = clus.Members[0].Metric("etcd_debugging_lease_initial_lease_count")
if err != nil {
t.Errorf("expected to get metric, error %s", err.Error())
}
if v == "" {
t.Errorf("expected: %s, got %s", "1", v)
}
}

// TestV3LeasePromote ensures the newly elected leader can promote itself
// to the primary lessor, refresh the leases and start to manage leases.
// TODO: use customized clock to make this test go faster?
Expand Down

0 comments on commit ba35d47

Please sign in to comment.