|
|
@@ -2,12 +2,10 @@ package command
|
|
|
|
|
|
import (
|
|
|
"encoding/json"
|
|
|
- "errors"
|
|
|
"fmt"
|
|
|
"net/http"
|
|
|
"os"
|
|
|
"os/signal"
|
|
|
- "sort"
|
|
|
"time"
|
|
|
|
|
|
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
|
|
|
@@ -42,124 +40,60 @@ func handleClusterHealth(c *cli.Context) {
|
|
|
handleError(ExitServerError, err)
|
|
|
}
|
|
|
|
|
|
- // TODO: update members when forever is set.
|
|
|
+ hc := http.Client{
|
|
|
+ Transport: tr,
|
|
|
+ }
|
|
|
+
|
|
|
mi := mustNewMembersAPI(c)
|
|
|
ms, err := mi.List(context.TODO())
|
|
|
if err != nil {
|
|
|
fmt.Println("cluster may be unhealthy: failed to list members")
|
|
|
handleError(ExitServerError, err)
|
|
|
}
|
|
|
- cl := make([]string, 0)
|
|
|
- for _, m := range ms {
|
|
|
- cl = append(cl, m.ClientURLs...)
|
|
|
- }
|
|
|
|
|
|
for {
|
|
|
- // check the /health endpoint of all members first
|
|
|
-
|
|
|
- ep, rs0, err := getLeaderStatus(tr, cl)
|
|
|
- if err != nil {
|
|
|
- fmt.Println("cluster may be unhealthy: failed to connect", cl)
|
|
|
- if forever {
|
|
|
- time.Sleep(10 * time.Second)
|
|
|
- continue
|
|
|
+ health := false
|
|
|
+ for _, m := range ms {
|
|
|
+ checked := false
|
|
|
+ for _, url := range m.ClientURLs {
|
|
|
+ resp, err := hc.Get(url + "/health")
|
|
|
+ if err != nil {
|
|
|
+ fmt.Printf("failed to check the health of member %s on %s: %v\n", m.ID, url, err)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ result := struct{ Health string }{}
|
|
|
+ d := json.NewDecoder(resp.Body)
|
|
|
+ err = d.Decode(&result)
|
|
|
+ resp.Body.Close()
|
|
|
+ if err != nil {
|
|
|
+ fmt.Printf("failed to check the health of member %s on %s: %v\n", m.ID, url, err)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ checked = true
|
|
|
+ if result.Health == "true" {
|
|
|
+ health = true // a member reported healthy; without this the summary below always prints "cluster is unhealthy"
|
|
|
+ fmt.Printf("member %s is healthy: got healthy result from %s\n", m.ID, url)
|
|
|
+ } else {
|
|
|
+ fmt.Printf("member %s is unhealthy: got unhealthy result from %s\n", m.ID, url)
|
|
|
+ }
|
|
|
+ break
|
|
|
}
|
|
|
- os.Exit(1)
|
|
|
- }
|
|
|
-
|
|
|
- time.Sleep(time.Second)
|
|
|
-
|
|
|
- // are all the members makeing progress?
|
|
|
- _, rs1, err := getLeaderStatus(tr, []string{ep})
|
|
|
- if err != nil {
|
|
|
- fmt.Println("cluster is unhealthy")
|
|
|
- if forever {
|
|
|
- time.Sleep(10 * time.Second)
|
|
|
- continue
|
|
|
+ if !checked {
|
|
|
+ fmt.Printf("member %s is unreachable: %v are all unreachable\n", m.ID, m.ClientURLs)
|
|
|
}
|
|
|
- os.Exit(1)
|
|
|
}
|
|
|
-
|
|
|
- if rs1.Commit > rs0.Commit {
|
|
|
- fmt.Printf("cluster is healthy: raft is making progress [commit index: %v->%v]\n", rs0.Commit, rs1.Commit)
|
|
|
+ if health {
|
|
|
+ fmt.Println("cluster is healthy")
|
|
|
} else {
|
|
|
- fmt.Printf("cluster is unhealthy: raft is not making progress [commit index: %v]\n", rs0.Commit)
|
|
|
- }
|
|
|
- fmt.Printf("leader is %v\n", rs0.Lead)
|
|
|
-
|
|
|
- var prints []string
|
|
|
-
|
|
|
- for id, pr0 := range rs0.Progress {
|
|
|
- pr1, ok := rs1.Progress[id]
|
|
|
- if !ok {
|
|
|
- // TODO: forever should handle configuration change.
|
|
|
- fmt.Println("Cluster configuration changed during health checking. Please retry.")
|
|
|
- os.Exit(1)
|
|
|
- }
|
|
|
- if pr1.Match <= pr0.Match {
|
|
|
- prints = append(prints, fmt.Sprintf("member %s is unhealthy: raft is not making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
|
|
|
- } else {
|
|
|
- prints = append(prints, fmt.Sprintf("member %s is healthy: raft is making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- sort.Strings(prints)
|
|
|
- for _, p := range prints {
|
|
|
- fmt.Print(p)
|
|
|
+ fmt.Println("cluster is unhealthy")
|
|
|
}
|
|
|
|
|
|
if !forever {
|
|
|
- return
|
|
|
+ break
|
|
|
}
|
|
|
-
|
|
|
+ fmt.Printf("\nnext check after 10 second...\n\n")
|
|
|
time.Sleep(10 * time.Second)
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
-type raftStatus struct {
|
|
|
- ID string `json:"id"`
|
|
|
- Term uint64 `json:"term"`
|
|
|
- Vote string `json:"vote"`
|
|
|
- Commit uint64 `json:"commit"`
|
|
|
- Lead string `json:"lead"`
|
|
|
- RaftState string `json:"raftState"`
|
|
|
- Progress map[string]struct {
|
|
|
- Match uint64 `json:"match"`
|
|
|
- Next uint64 `json:"next"`
|
|
|
- State string `json:"state"`
|
|
|
- } `json:"progress"`
|
|
|
-}
|
|
|
-
|
|
|
-type vars struct {
|
|
|
- RaftStatus raftStatus `json:"raft.status"`
|
|
|
-}
|
|
|
-
|
|
|
-func getLeaderStatus(tr *http.Transport, endpoints []string) (string, raftStatus, error) {
|
|
|
- // TODO: use new etcd client
|
|
|
- httpclient := http.Client{
|
|
|
- Transport: tr,
|
|
|
- }
|
|
|
-
|
|
|
- for _, ep := range endpoints {
|
|
|
- resp, err := httpclient.Get(ep + "/debug/vars")
|
|
|
- if err != nil {
|
|
|
- continue
|
|
|
- }
|
|
|
- defer resp.Body.Close()
|
|
|
- if resp.StatusCode != http.StatusOK {
|
|
|
- continue
|
|
|
- }
|
|
|
-
|
|
|
- vs := &vars{}
|
|
|
- d := json.NewDecoder(resp.Body)
|
|
|
- err = d.Decode(vs)
|
|
|
- if err != nil {
|
|
|
- continue
|
|
|
- }
|
|
|
- if vs.RaftStatus.Lead != vs.RaftStatus.ID {
|
|
|
- continue
|
|
|
- }
|
|
|
- return ep, vs.RaftStatus, nil
|
|
|
- }
|
|
|
- return "", raftStatus{}, errors.New("no leader")
|
|
|
-}
|