Browse Source

etcdserver: add learner metrics

宇慕 6 years ago
parent
commit
0b8727b3f3
3 changed files with 52 additions and 0 deletions
  1. 15 0
      docs/metrics/latest
  2. 23 0
      etcdserver/metrics.go
  3. 14 0
      etcdserver/server.go

+ 15 - 0
docs/metrics/latest

@@ -533,6 +533,21 @@ etcd_server_is_leader
 # type: "counter"
 etcd_server_leader_changes_seen_total
 
+# name: "etcd_server_is_learner"
+# description: "Whether or not this member is a learner. 1 if is, 0 otherwise."
+# type: "gauge"
+etcd_server_is_learner
+
+# name: "etcd_server_learner_promote_failures"
+# description: "The total number of failed learner promotions (likely learner not ready) while this member is leader."
+# type: "counter"
+etcd_server_learner_promote_failures
+
+# name: "etcd_server_learner_promote_successes"
+# description: "The total number of successful learner promotions while this member is leader."
+# type: "counter"
+etcd_server_learner_promote_successes
+
 # name: "etcd_server_proposals_applied_total"
 # description: "The total number of consensus proposals applied."
 # type: "gauge"

+ 23 - 0
etcdserver/metrics.go

@@ -44,6 +44,26 @@ var (
 		Name:      "leader_changes_seen_total",
 		Help:      "The number of leader changes seen.",
 	})
+	isLearner = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "etcd",
+		Subsystem: "server",
+		Name:      "is_learner",
+		Help:      "Whether or not this member is a learner. 1 if is, 0 otherwise.",
+	})
+	learnerPromoteFailed = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "etcd",
+		Subsystem: "server",
+		Name:      "learner_promote_failures",
+		Help:      "The total number of failed learner promotions (likely learner not ready) while this member is leader.",
+	},
+		[]string{"Reason"},
+	)
+	learnerPromoteSucceed = prometheus.NewCounter(prometheus.CounterOpts{
+		Namespace: "etcd",
+		Subsystem: "server",
+		Name:      "learner_promote_successes",
+		Help:      "The total number of successful learner promotions while this member is leader.",
+	})
 	heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: "etcd",
 		Subsystem: "server",
@@ -144,6 +164,9 @@ func init() {
 	prometheus.MustRegister(currentVersion)
 	prometheus.MustRegister(currentGoVersion)
 	prometheus.MustRegister(serverID)
+	prometheus.MustRegister(isLearner)
+	prometheus.MustRegister(learnerPromoteSucceed)
+	prometheus.MustRegister(learnerPromoteFailed)
 
 	currentVersion.With(prometheus.Labels{
 		"server_version": version.Version,

+ 14 - 0
etcdserver/server.go

@@ -1646,7 +1646,12 @@ func (s *EtcdServer) PromoteMember(ctx context.Context, id uint64) ([]*membershi
 	// fails with ErrNotLeader, forward the request to leader node via HTTP. If promoteMember call fails with error
 	// other than ErrNotLeader, return the error.
 	resp, err := s.promoteMember(ctx, id)
+	if err == nil {
+		learnerPromoteSucceed.Inc()
+		return resp, nil
+	}
 	if err != ErrNotLeader {
+		learnerPromoteFailed.WithLabelValues(err.Error()).Inc()
 		return resp, err
 	}
 
@@ -2259,6 +2264,15 @@ func (s *EtcdServer) applyConfChange(cc raftpb.ConfChange, confState *raftpb.Con
 			}
 		}
 
+		// Update the isLearner gauge when the member carried in this conf change is this server: 1 for ConfChangeAddLearnerNode, 0 for any other type.
+		if confChangeContext.Member.ID == s.id {
+			if cc.Type == raftpb.ConfChangeAddLearnerNode {
+				isLearner.Set(1)
+			} else {
+				isLearner.Set(0)
+			}
+		}
+
 	case raftpb.ConfChangeRemoveNode:
 		id := types.ID(cc.NodeID)
 		s.cluster.RemoveMember(id)