Browse Source

etcdctl: refactor the way to check cluster health

This method uses the raft status exposed at /debug/vars to determine the
health of the cluster. It checks whether the commit index increases to
determine cluster health, and whether each member's match index increases
to determine member health.

This could fix bug #2711, where an unhealthy follower goes undetected,
because the check no longer relies on whether messages are sent over the
long-polling connection.

This health check is stricter than the old one, and reflects whether
followers are healthy from the leader's point of view. For example, a
follower that is receiving a snapshot will be reported as unhealthy
because its match index does not move forward.

`etcdctl cluster-health` will reflect health at the raft level, while
connectivity checks reflect health at the transport level.
Yicheng Qin 10 years ago
parent
commit
f1aaa7a9e3
1 changed files with 42 additions and 19 deletions
  1. 42 19
      etcdctl/command/cluster_health.go

+ 42 - 19
etcdctl/command/cluster_health.go

@@ -11,7 +11,6 @@ import (
 
 	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
 	"github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
-	"github.com/coreos/etcd/etcdserver/stats"
 )
 
 func NewClusterHealthCommand() cli.Command {
@@ -42,7 +41,7 @@ func handleClusterHealth(c *cli.Context) {
 
 	// check the /health endpoint of all members first
 
-	ep, ls0, err := getLeaderStats(tr, cl)
+	ep, rs0, err := getLeaderStatus(tr, cl)
 	if err != nil {
 		fmt.Println("cluster may be unhealthy: failed to connect", cl)
 		os.Exit(1)
@@ -51,27 +50,31 @@ func handleClusterHealth(c *cli.Context) {
 	time.Sleep(time.Second)
 
 	// are all the members makeing progress?
-	_, ls1, err := getLeaderStats(tr, []string{ep})
+	_, rs1, err := getLeaderStatus(tr, []string{ep})
 	if err != nil {
 		fmt.Println("cluster is unhealthy")
 		os.Exit(1)
 	}
 
-	fmt.Println("cluster is healthy")
-	// self is healthy
+	if rs1.Commit > rs0.Commit {
+		fmt.Printf("cluster is healthy: raft is making progress [commit index: %v->%v]\n", rs0.Commit, rs1.Commit)
+	} else {
+		fmt.Printf("cluster is unhealthy: raft is not making progress [commit index: %v]\n", rs0.Commit)
+	}
+	fmt.Printf("leader is %v\n", rs0.Lead)
+
 	var prints []string
 
-	prints = append(prints, fmt.Sprintf("member %s is healthy\n", ls1.Leader))
-	for name, fs0 := range ls0.Followers {
-		fs1, ok := ls1.Followers[name]
+	for id, pr0 := range rs0.Progress {
+		pr1, ok := rs1.Progress[id]
 		if !ok {
 			fmt.Println("Cluster configuration changed during health checking. Please retry.")
 			os.Exit(1)
 		}
-		if fs1.Counts.Success <= fs0.Counts.Success {
-			prints = append(prints, fmt.Sprintf("member %s is unhealthy\n", name))
+		if pr1.Match <= pr0.Match {
+			prints = append(prints, fmt.Sprintf("member %s is unhealthy: raft is not making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
 		} else {
-			prints = append(prints, fmt.Sprintf("member %s is healthy\n", name))
+			prints = append(prints, fmt.Sprintf("member %s is healthy: raft is making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
 		}
 	}
 
@@ -82,15 +85,32 @@ func handleClusterHealth(c *cli.Context) {
 	os.Exit(0)
 }
 
-func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.LeaderStats, error) {
-	// go-etcd does not support cluster stats, use http client for now
-	// TODO: use new etcd client with new member/stats endpoint
+type raftStatus struct {
+	ID        string `json:"id"`
+	Term      uint64 `json:"term"`
+	Vote      string `json:"vote"`
+	Commit    uint64 `json:"commit"`
+	Lead      string `json:"lead"`
+	RaftState string `json:"raftState"`
+	Progress  map[string]struct {
+		Match uint64 `json:"match"`
+		Next  uint64 `json:"next"`
+		State string `json:"state"`
+	} `json:"progress"`
+}
+
+type vars struct {
+	RaftStatus raftStatus `json:"raft.status"`
+}
+
+func getLeaderStatus(tr *http.Transport, endpoints []string) (string, raftStatus, error) {
+	// TODO: use new etcd client
 	httpclient := http.Client{
 		Transport: tr,
 	}
 
 	for _, ep := range endpoints {
-		resp, err := httpclient.Get(ep + "/v2/stats/leader")
+		resp, err := httpclient.Get(ep + "/debug/vars")
 		if err != nil {
 			continue
 		}
@@ -99,13 +119,16 @@ func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.Lead
 			continue
 		}
 
-		ls := &stats.LeaderStats{}
+		vs := &vars{}
 		d := json.NewDecoder(resp.Body)
-		err = d.Decode(ls)
+		err = d.Decode(vs)
 		if err != nil {
 			continue
 		}
-		return ep, ls, nil
+		if vs.RaftStatus.Lead != vs.RaftStatus.ID {
+			continue
+		}
+		return ep, vs.RaftStatus, nil
 	}
-	return "", nil, errors.New("no leader")
+	return "", raftStatus{}, errors.New("no leader")
 }