|
@@ -0,0 +1,140 @@
|
|
|
|
|
+package command
|
|
|
|
|
+
|
|
|
|
|
+import (
|
|
|
|
|
+ "encoding/json"
|
|
|
|
|
+ "errors"
|
|
|
|
|
+ "fmt"
|
|
|
|
|
+ "net/http"
|
|
|
|
|
+ "os"
|
|
|
|
|
+ "sort"
|
|
|
|
|
+ "strings"
|
|
|
|
|
+ "time"
|
|
|
|
|
+
|
|
|
|
|
+ "github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
|
|
|
|
|
+ "github.com/coreos/etcd/Godeps/_workspace/src/github.com/coreos/go-etcd/etcd"
|
|
|
|
|
+ "github.com/coreos/etcd/etcdserver/stats"
|
|
|
|
|
+)
|
|
|
|
|
+
|
|
|
|
|
+func NewClusterHealthCommand() cli.Command {
|
|
|
|
|
+ return cli.Command{
|
|
|
|
|
+ Name: "cluster-health",
|
|
|
|
|
+ Usage: "check the health of the etcd cluster",
|
|
|
|
|
+ Flags: []cli.Flag{},
|
|
|
|
|
+ Action: handleClusterHealth,
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+func handleClusterHealth(c *cli.Context) {
|
|
|
|
|
+ endpoints, err := getEndpoints(c)
|
|
|
|
|
+ if err != nil {
|
|
|
|
|
+ handleError(ErrorFromEtcd, err)
|
|
|
|
|
+ }
|
|
|
|
|
+ tr, err := getTransport(c)
|
|
|
|
|
+ if err != nil {
|
|
|
|
|
+ handleError(ErrorFromEtcd, err)
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ client := etcd.NewClient(endpoints)
|
|
|
|
|
+ client.SetTransport(tr)
|
|
|
|
|
+
|
|
|
|
|
+ if c.GlobalBool("debug") {
|
|
|
|
|
+ go dumpCURL(client)
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if ok := client.SyncCluster(); !ok {
|
|
|
|
|
+ handleError(FailedToConnectToHost, errors.New("cannot sync with the cluster using endpoints "+strings.Join(endpoints, ", ")))
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // do we have a leader?
|
|
|
|
|
+ ep, ls0, err := getLeaderStats(tr, client.GetCluster())
|
|
|
|
|
+ if err != nil {
|
|
|
|
|
+ fmt.Println("cluster is unhealthy")
|
|
|
|
|
+ os.Exit(1)
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // is raft stable and making progress?
|
|
|
|
|
+ client = etcd.NewClient([]string{ep})
|
|
|
|
|
+ resp, err := client.Get("/", false, false)
|
|
|
|
|
+ if err != nil {
|
|
|
|
|
+ fmt.Println("cluster is unhealthy")
|
|
|
|
|
+ os.Exit(1)
|
|
|
|
|
+ }
|
|
|
|
|
+ rt0, ri0 := resp.RaftTerm, resp.RaftIndex
|
|
|
|
|
+ time.Sleep(time.Second)
|
|
|
|
|
+
|
|
|
|
|
+ resp, err = client.Get("/", false, false)
|
|
|
|
|
+ if err != nil {
|
|
|
|
|
+ fmt.Println("cluster is unhealthy")
|
|
|
|
|
+ os.Exit(1)
|
|
|
|
|
+ }
|
|
|
|
|
+ rt1, ri1 := resp.RaftTerm, resp.RaftIndex
|
|
|
|
|
+
|
|
|
|
|
+ if rt0 != rt1 {
|
|
|
|
|
+ fmt.Println("cluster is unhealthy")
|
|
|
|
|
+ os.Exit(1)
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if ri1 == ri0 {
|
|
|
|
|
+ fmt.Println("cluster is unhealthy")
|
|
|
|
|
+ os.Exit(1)
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // are all the members makeing progress?
|
|
|
|
|
+ _, ls1, err := getLeaderStats(tr, []string{ep})
|
|
|
|
|
+ if err != nil {
|
|
|
|
|
+ fmt.Println("cluster is unhealthy")
|
|
|
|
|
+ os.Exit(1)
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ fmt.Println("cluster is healthy")
|
|
|
|
|
+ // self is healthy
|
|
|
|
|
+ var prints []string
|
|
|
|
|
+
|
|
|
|
|
+ prints = append(prints, fmt.Sprintf("member %s is healthy\n", ls1.Leader))
|
|
|
|
|
+ for name, fs0 := range ls0.Followers {
|
|
|
|
|
+ fs1, ok := ls1.Followers[name]
|
|
|
|
|
+ if !ok {
|
|
|
|
|
+ fmt.Println("Cluster configuration changed during health checking. Please retry.")
|
|
|
|
|
+ os.Exit(1)
|
|
|
|
|
+ }
|
|
|
|
|
+ if fs1.Counts.Success <= fs0.Counts.Success {
|
|
|
|
|
+ prints = append(prints, fmt.Sprintf("member %s is unhealthy\n", name))
|
|
|
|
|
+ } else {
|
|
|
|
|
+ prints = append(prints, fmt.Sprintf("member %s is healthy\n", name))
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ sort.Strings(prints)
|
|
|
|
|
+ for _, p := range prints {
|
|
|
|
|
+ fmt.Print(p)
|
|
|
|
|
+ }
|
|
|
|
|
+ os.Exit(0)
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.LeaderStats, error) {
|
|
|
|
|
+ // go-etcd does not support cluster stats, use http client for now
|
|
|
|
|
+ // TODO: use new etcd client with new member/stats endpoint
|
|
|
|
|
+ httpclient := http.Client{
|
|
|
|
|
+ Transport: tr,
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ for _, ep := range endpoints {
|
|
|
|
|
+ resp, err := httpclient.Get(ep + "/v2/stats/leader")
|
|
|
|
|
+ if err != nil {
|
|
|
|
|
+ continue
|
|
|
|
|
+ }
|
|
|
|
|
+ defer resp.Body.Close()
|
|
|
|
|
+ if resp.StatusCode != http.StatusOK {
|
|
|
|
|
+ continue
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ ls := &stats.LeaderStats{}
|
|
|
|
|
+ d := json.NewDecoder(resp.Body)
|
|
|
|
|
+ err = d.Decode(ls)
|
|
|
|
|
+ if err != nil {
|
|
|
|
|
+ continue
|
|
|
|
|
+ }
|
|
|
|
|
+ return ep, ls, nil
|
|
|
|
|
+ }
|
|
|
|
|
+ return "", nil, errors.New("no leader")
|
|
|
|
|
+}
|