Browse Source

etcdctl: support healthy checking

Xiang Li 11 years ago
parent
commit
7bbdad9068
2 changed files with 141 additions and 0 deletions
  1. 140 0
      etcdctl/command/cluster_health.go
  2. 1 0
      etcdctl/main.go

+ 140 - 0
etcdctl/command/cluster_health.go

@@ -0,0 +1,140 @@
+package command
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"net/http"
+	"os"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/coreos/go-etcd/etcd"
+	"github.com/coreos/etcd/etcdserver/stats"
+)
+
+// NewClusterHealthCommand builds the cli.Command for the "cluster-health"
+// subcommand. The command declares no flags of its own and dispatches to
+// handleClusterHealth.
+func NewClusterHealthCommand() cli.Command {
+	cmd := cli.Command{
+		Name:  "cluster-health",
+		Usage: "check the health of the etcd cluster",
+	}
+	cmd.Flags = []cli.Flag{}
+	cmd.Action = handleClusterHealth
+	return cmd
+}
+
+// handleClusterHealth implements the "cluster-health" subcommand.
+//
+// It performs three checks and terminates the process with the result:
+//
+//  1. A member answering /v2/stats/leader must be reachable (i.e. there is
+//     a leader).
+//  2. Two reads against that member, one second apart, must report the same
+//     raft term and a different raft index (raft is stable and progressing).
+//  3. Leader stats are sampled a second time; a follower whose successful
+//     append count did not grow between the two samples is reported as
+//     unhealthy.
+//
+// Exit status is 0 when the cluster-level checks pass (individual members
+// may still be listed as unhealthy) and 1 otherwise.
+func handleClusterHealth(c *cli.Context) {
+	// exitUnhealthy reports a cluster-level failure and terminates.
+	exitUnhealthy := func() {
+		fmt.Println("cluster is unhealthy")
+		os.Exit(1)
+	}
+
+	// NOTE(review): handleError is defined elsewhere in this package and is
+	// presumed not to return — confirm.
+	endpoints, err := getEndpoints(c)
+	if err != nil {
+		handleError(ErrorFromEtcd, err)
+	}
+	tr, err := getTransport(c)
+	if err != nil {
+		handleError(ErrorFromEtcd, err)
+	}
+
+	client := etcd.NewClient(endpoints)
+	client.SetTransport(tr)
+
+	if c.GlobalBool("debug") {
+		go dumpCURL(client)
+	}
+
+	if ok := client.SyncCluster(); !ok {
+		handleError(FailedToConnectToHost, errors.New("cannot sync with the cluster using endpoints "+strings.Join(endpoints, ", ")))
+	}
+
+	// do we have a leader?
+	ep, ls0, err := getLeaderStats(tr, client.GetCluster())
+	if err != nil {
+		exitUnhealthy()
+	}
+
+	// is raft stable and making progress?
+	client = etcd.NewClient([]string{ep})
+	// Reuse the configured transport; without it this client would ignore
+	// the transport settings the first client and getLeaderStats use.
+	client.SetTransport(tr)
+	resp, err := client.Get("/", false, false)
+	if err != nil {
+		exitUnhealthy()
+	}
+	rt0, ri0 := resp.RaftTerm, resp.RaftIndex
+	time.Sleep(time.Second)
+
+	resp, err = client.Get("/", false, false)
+	if err != nil {
+		exitUnhealthy()
+	}
+	rt1, ri1 := resp.RaftTerm, resp.RaftIndex
+
+	// A changed term or an unchanged index between the two samples is
+	// treated as an unhealthy cluster.
+	if rt0 != rt1 || ri1 == ri0 {
+		exitUnhealthy()
+	}
+
+	// are all the members making progress?
+	_, ls1, err := getLeaderStats(tr, []string{ep})
+	if err != nil {
+		exitUnhealthy()
+	}
+
+	// self is healthy
+	var prints []string
+
+	prints = append(prints, fmt.Sprintf("member %s is healthy\n", ls1.Leader))
+	for name, fs0 := range ls0.Followers {
+		fs1, ok := ls1.Followers[name]
+		if !ok {
+			fmt.Println("Cluster configuration changed during health checking. Please retry.")
+			os.Exit(1)
+		}
+		if fs1.Counts.Success <= fs0.Counts.Success {
+			prints = append(prints, fmt.Sprintf("member %s is unhealthy\n", name))
+		} else {
+			prints = append(prints, fmt.Sprintf("member %s is healthy\n", name))
+		}
+	}
+
+	// Only announce a healthy cluster once the membership comparison has
+	// completed, so a "Please retry" exit is not preceded by a contradictory
+	// "cluster is healthy" line.
+	fmt.Println("cluster is healthy")
+
+	// Map iteration order is random; sort for deterministic output.
+	sort.Strings(prints)
+	for _, p := range prints {
+		fmt.Print(p)
+	}
+	os.Exit(0)
+}
+
+// getLeaderStats queries each endpoint in order for /v2/stats/leader and
+// returns the first endpoint that answers with HTTP 200 and a decodable
+// body, together with the decoded stats. Endpoints that fail at any step
+// are skipped. If no endpoint succeeds, it returns a "no leader" error.
+func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.LeaderStats, error) {
+	// go-etcd does not support cluster stats, use http client for now
+	// TODO: use new etcd client with new member/stats endpoint
+	httpclient := &http.Client{
+		Transport: tr,
+	}
+
+	for _, ep := range endpoints {
+		ls, err := fetchLeaderStats(httpclient, ep)
+		if err != nil {
+			// Best effort: try the next endpoint.
+			continue
+		}
+		return ep, ls, nil
+	}
+	return "", nil, errors.New("no leader")
+}
+
+// fetchLeaderStats performs a single /v2/stats/leader request against ep.
+// It is a separate function so the deferred Body.Close runs as soon as this
+// endpoint has been handled, rather than piling up until the caller's loop
+// over all endpoints has finished.
+func fetchLeaderStats(hc *http.Client, ep string) (*stats.LeaderStats, error) {
+	resp, err := hc.Get(ep + "/v2/stats/leader")
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return nil, errors.New("unexpected status " + resp.Status)
+	}
+
+	ls := &stats.LeaderStats{}
+	if err := json.NewDecoder(resp.Body).Decode(ls); err != nil {
+		return nil, err
+	}
+	return ls, nil
+}

+ 1 - 0
etcdctl/main.go

@@ -39,6 +39,7 @@ func main() {
 	}
 	}
 	app.Commands = []cli.Command{
 	app.Commands = []cli.Command{
 		command.NewBackupCommand(),
 		command.NewBackupCommand(),
+		command.NewClusterHealthCommand(),
 		command.NewMakeCommand(),
 		command.NewMakeCommand(),
 		command.NewMakeDirCommand(),
 		command.NewMakeDirCommand(),
 		command.NewRemoveCommand(),
 		command.NewRemoveCommand(),