浏览代码

Merge pull request #3323 from xiang90/cl_health

etcdctl: use health endpoint to greatly simplify health checking
Xiang Li 10 年之前
父节点
当前提交
b0303e948c
共有 1 个文件被更改,包括 38 次插入和 104 次删除
  1. 38 104
      etcdctl/command/cluster_health.go

+ 38 - 104
etcdctl/command/cluster_health.go

@@ -2,12 +2,10 @@ package command
 
 import (
 	"encoding/json"
-	"errors"
 	"fmt"
 	"net/http"
 	"os"
 	"os/signal"
-	"sort"
 	"time"
 
 	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
@@ -42,124 +40,60 @@ func handleClusterHealth(c *cli.Context) {
 		handleError(ExitServerError, err)
 	}
 
-	// TODO: update members when forever is set.
+	hc := http.Client{
+		Transport: tr,
+	}
+
 	mi := mustNewMembersAPI(c)
 	ms, err := mi.List(context.TODO())
 	if err != nil {
 		fmt.Println("cluster may be unhealthy: failed to list members")
 		handleError(ExitServerError, err)
 	}
-	cl := make([]string, 0)
-	for _, m := range ms {
-		cl = append(cl, m.ClientURLs...)
-	}
 
 	for {
-		// check the /health endpoint of all members first
-
-		ep, rs0, err := getLeaderStatus(tr, cl)
-		if err != nil {
-			fmt.Println("cluster may be unhealthy: failed to connect", cl)
-			if forever {
-				time.Sleep(10 * time.Second)
-				continue
+		health := false
+		for _, m := range ms {
+			checked := false
+			for _, url := range m.ClientURLs {
+				resp, err := hc.Get(url + "/health")
+				if err != nil {
+					fmt.Printf("failed to check the health of member %s on %s: %v\n", m.ID, url, err)
+					continue
+				}
+
+				result := struct{ Health string }{}
+				d := json.NewDecoder(resp.Body)
+				err = d.Decode(&result)
+				resp.Body.Close()
+				if err != nil {
+					fmt.Printf("failed to check the health of member %s on %s: %v\n", m.ID, url, err)
+					continue
+				}
+
+				checked = true
+				if result.Health == "true" {
+					checked = true
+					fmt.Printf("member %s is healthy: got healthy result from %s\n", m.ID, url)
+				} else {
+					fmt.Printf("member %s is unhealthy: got unhealthy result from %s\n", m.ID, url)
+				}
+				break
 			}
-			os.Exit(1)
-		}
-
-		time.Sleep(time.Second)
-
-		// are all the members makeing progress?
-		_, rs1, err := getLeaderStatus(tr, []string{ep})
-		if err != nil {
-			fmt.Println("cluster is unhealthy")
-			if forever {
-				time.Sleep(10 * time.Second)
-				continue
+			if !checked {
+				fmt.Printf("member %s is unreachable: %v are all unreachable\n", m.ID, m.ClientURLs)
 			}
-			os.Exit(1)
 		}
-
-		if rs1.Commit > rs0.Commit {
-			fmt.Printf("cluster is healthy: raft is making progress [commit index: %v->%v]\n", rs0.Commit, rs1.Commit)
+		if health {
+			fmt.Println("cluster is healthy")
 		} else {
-			fmt.Printf("cluster is unhealthy: raft is not making progress [commit index: %v]\n", rs0.Commit)
-		}
-		fmt.Printf("leader is %v\n", rs0.Lead)
-
-		var prints []string
-
-		for id, pr0 := range rs0.Progress {
-			pr1, ok := rs1.Progress[id]
-			if !ok {
-				// TODO: forever should handle configuration change.
-				fmt.Println("Cluster configuration changed during health checking. Please retry.")
-				os.Exit(1)
-			}
-			if pr1.Match <= pr0.Match {
-				prints = append(prints, fmt.Sprintf("member %s is unhealthy: raft is not making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
-			} else {
-				prints = append(prints, fmt.Sprintf("member %s is healthy: raft is making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
-			}
-		}
-
-		sort.Strings(prints)
-		for _, p := range prints {
-			fmt.Print(p)
+			fmt.Println("cluster is unhealthy")
 		}
 
 		if !forever {
-			return
+			break
 		}
-
+		fmt.Printf("\nnext check after 10 second...\n\n")
 		time.Sleep(10 * time.Second)
 	}
 }
-
-type raftStatus struct {
-	ID        string `json:"id"`
-	Term      uint64 `json:"term"`
-	Vote      string `json:"vote"`
-	Commit    uint64 `json:"commit"`
-	Lead      string `json:"lead"`
-	RaftState string `json:"raftState"`
-	Progress  map[string]struct {
-		Match uint64 `json:"match"`
-		Next  uint64 `json:"next"`
-		State string `json:"state"`
-	} `json:"progress"`
-}
-
-type vars struct {
-	RaftStatus raftStatus `json:"raft.status"`
-}
-
-func getLeaderStatus(tr *http.Transport, endpoints []string) (string, raftStatus, error) {
-	// TODO: use new etcd client
-	httpclient := http.Client{
-		Transport: tr,
-	}
-
-	for _, ep := range endpoints {
-		resp, err := httpclient.Get(ep + "/debug/vars")
-		if err != nil {
-			continue
-		}
-		defer resp.Body.Close()
-		if resp.StatusCode != http.StatusOK {
-			continue
-		}
-
-		vs := &vars{}
-		d := json.NewDecoder(resp.Body)
-		err = d.Decode(vs)
-		if err != nil {
-			continue
-		}
-		if vs.RaftStatus.Lead != vs.RaftStatus.ID {
-			continue
-		}
-		return ep, vs.RaftStatus, nil
-	}
-	return "", raftStatus{}, errors.New("no leader")
-}