Browse Source

etcdserver: add "etcd_server_heartbeat_failures_total"

{"level":"warn","ts":1527101858.4149103,"caller":"etcdserver/raft.go:370","msg":"failed to send out heartbeat; took too long, server is overloaded likely from slow disk","heartbeat-interval":0.1,"expected-duration":0.2,"exceeded-duration":0.025771662}
{"level":"warn","ts":1527101858.4149644,"caller":"etcdserver/raft.go:370","msg":"failed to send out heartbeat; took too long, server is overloaded likely from slow disk","heartbeat-interval":0.1,"expected-duration":0.2,"exceeded-duration":0.034015766}

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
Gyuho Lee 7 years ago
parent
commit
896a5e4a2b
2 changed files with 11 additions and 2 deletions
  1. 7 0
      etcdserver/metrics.go
  2. 4 2
      etcdserver/raft.go

+ 7 - 0
etcdserver/metrics.go

@@ -42,6 +42,12 @@ var (
 		Name:      "leader_changes_seen_total",
 		Help:      "The number of leader changes seen.",
 	})
+	heartbeatFailures = prometheus.NewCounter(prometheus.CounterOpts{
+		Namespace: "etcd",
+		Subsystem: "server",
+		Name:      "heartbeat_failures_total",
+		Help:      "The total number of heartbeat send failures (likely overloaded from slow disk).",
+	})
 	proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",
 		Subsystem: "server",
@@ -85,6 +91,7 @@ func init() {
 	prometheus.MustRegister(hasLeader)
 	prometheus.MustRegister(isLeader)
 	prometheus.MustRegister(leaderChanges)
+	prometheus.MustRegister(heartbeatFailures)
 	prometheus.MustRegister(proposalsCommitted)
 	prometheus.MustRegister(proposalsApplied)
 	prometheus.MustRegister(proposalsPending)

+ 4 - 2
etcdserver/raft.go

@@ -368,14 +368,16 @@ func (r *raftNode) processMessages(ms []raftpb.Message) []raftpb.Message {
 				// TODO: limit request rate.
 				if r.lg != nil {
 					r.lg.Warn(
-						"heartbeat took too long to send out; server is overloaded, likely from slow disk",
-						zap.Duration("exceeded", exceed),
+						"failed to send out heartbeat; took too long, server is overloaded likely from slow disk",
 						zap.Duration("heartbeat-interval", r.heartbeat),
+						zap.Duration("expected-duration", 2*r.heartbeat),
+						zap.Duration("exceeded-duration", exceed),
 					)
 				} else {
 					plog.Warningf("failed to send out heartbeat on time (exceeded the %v timeout for %v)", r.heartbeat, exceed)
 					plog.Warningf("server is likely overloaded")
 				}
+				heartbeatFailures.Inc()
 			}
 		}
 	}