Ver código fonte

Merge pull request #5289 from xiang90/has_leader_metrics

*: add has leader metrics
Xiang Li 9 anos atrás
pai
commit
34fbec118a
3 arquivos alterados com 26 adições e 4 exclusões
  1. 9 3
      Documentation/metrics.md
  2. 10 1
      etcdserver/metrics.go
  3. 7 0
      etcdserver/raft.go

+ 9 - 3
Documentation/metrics.md

@@ -20,9 +20,15 @@ These metrics describe the status of the etcd server. In order to detect outages
 
 
 All these metrics are prefixed with `etcd_server_`
 All these metrics are prefixed with `etcd_server_`
 
 
-| Name                      | Description                       | Type    |
-|---------------------------|-----------------------------------|---------|
-| leader_changes_seen_total | The number of leader changes seen | Counter |
+| Name                      | Description                                              | Type    |
+|---------------------------|----------------------------------------------------------|---------|
+| has_leader                | Whether or not a leader exists. 1 is existence, 0 is not.| Gauge   |
+| leader_changes_seen_total | The number of leader changes seen.                       | Counter |
+
+
+`has_leader` indicates whether the member has a leader. If a member does not have a leader, it is
+totally unavailable. If no member in the cluster has a leader, the entire cluster
+is totally unavailable.
 
 
 `leader_changes_seen_total` counts the number of leader changes the member has seen since its start. Rapid leadership changes impact the performance of etcd significantly. It also signals that the leader is unstable, perhaps due to network connectivity issues or excessive load hitting the etcd cluster.
 `leader_changes_seen_total` counts the number of leader changes the member has seen since its start. Rapid leadership changes impact the performance of etcd significantly. It also signals that the leader is unstable, perhaps due to network connectivity issues or excessive load hitting the etcd cluster.
 
 

+ 10 - 1
etcdserver/metrics.go

@@ -44,11 +44,19 @@ var (
 		Name:      "proposals_failed_total",
 		Name:      "proposals_failed_total",
 		Help:      "The total number of failed proposals.",
 		Help:      "The total number of failed proposals.",
 	})
 	})
+
+	// stable metrics for monitoring
+	hasLeader = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "etcd",
+		Subsystem: "server",
+		Name:      "has_leader",
+		Help:      "Whether or not a leader exists. 1 is existence, 0 is not.",
+	})
 	leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
 	leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: "etcd",
 		Namespace: "etcd",
 		Subsystem: "server",
 		Subsystem: "server",
 		Name:      "leader_changes_seen_total",
 		Name:      "leader_changes_seen_total",
-		Help:      "The number of leader changes seen",
+		Help:      "The number of leader changes seen.",
 	})
 	})
 )
 )
 
 
@@ -56,6 +64,7 @@ func init() {
 	prometheus.MustRegister(proposeDurations)
 	prometheus.MustRegister(proposeDurations)
 	prometheus.MustRegister(proposePending)
 	prometheus.MustRegister(proposePending)
 	prometheus.MustRegister(proposeFailed)
 	prometheus.MustRegister(proposeFailed)
+	prometheus.MustRegister(hasLeader)
 	prometheus.MustRegister(leaderChanges)
 	prometheus.MustRegister(leaderChanges)
 }
 }
 
 

+ 7 - 0
etcdserver/raft.go

@@ -159,6 +159,13 @@ func (r *raftNode) start(s *EtcdServer) {
 						r.mu.Unlock()
 						r.mu.Unlock()
 						leaderChanges.Inc()
 						leaderChanges.Inc()
 					}
 					}
+
+					if rd.SoftState.Lead == raft.None {
+						hasLeader.Set(0)
+					} else {
+						hasLeader.Set(1)
+					}
+
 					atomic.StoreUint64(&r.lead, rd.SoftState.Lead)
 					atomic.StoreUint64(&r.lead, rd.SoftState.Lead)
 					if rd.RaftState == raft.StateLeader {
 					if rd.RaftState == raft.StateLeader {
 						islead = true
 						islead = true