cluster_health.go 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. package command
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "net/http"
  7. "os"
  8. "sort"
  9. "strings"
  10. "time"
  11. "github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
  12. "github.com/coreos/etcd/Godeps/_workspace/src/github.com/coreos/go-etcd/etcd"
  13. "github.com/coreos/etcd/etcdserver/stats"
  14. )
  15. func NewClusterHealthCommand() cli.Command {
  16. return cli.Command{
  17. Name: "cluster-health",
  18. Usage: "check the health of the etcd cluster",
  19. Flags: []cli.Flag{},
  20. Action: handleClusterHealth,
  21. }
  22. }
  23. func handleClusterHealth(c *cli.Context) {
  24. endpoints, err := getEndpoints(c)
  25. if err != nil {
  26. handleError(ErrorFromEtcd, err)
  27. }
  28. tr, err := getTransport(c)
  29. if err != nil {
  30. handleError(ErrorFromEtcd, err)
  31. }
  32. client := etcd.NewClient(endpoints)
  33. client.SetTransport(tr)
  34. if c.GlobalBool("debug") {
  35. go dumpCURL(client)
  36. }
  37. if ok := client.SyncCluster(); !ok {
  38. handleError(FailedToConnectToHost, errors.New("cannot sync with the cluster using endpoints "+strings.Join(endpoints, ", ")))
  39. }
  40. // do we have a leader?
  41. cl := client.GetCluster()
  42. ep, ls0, err := getLeaderStats(tr, cl)
  43. if err != nil {
  44. fmt.Println("cluster may be unhealthy: failed to connect", cl)
  45. os.Exit(1)
  46. }
  47. // is raft stable and making progress?
  48. client = etcd.NewClient([]string{ep})
  49. client.SetTransport(tr)
  50. resp, err := client.Get("/", false, false)
  51. if err != nil {
  52. fmt.Println("cluster is unhealthy")
  53. os.Exit(1)
  54. }
  55. rt0, ri0 := resp.RaftTerm, resp.RaftIndex
  56. time.Sleep(time.Second)
  57. resp, err = client.Get("/", false, false)
  58. if err != nil {
  59. fmt.Println("cluster is unhealthy")
  60. os.Exit(1)
  61. }
  62. rt1, ri1 := resp.RaftTerm, resp.RaftIndex
  63. if rt0 != rt1 {
  64. fmt.Println("cluster is unhealthy")
  65. os.Exit(1)
  66. }
  67. if ri1 == ri0 {
  68. fmt.Println("cluster is unhealthy")
  69. os.Exit(1)
  70. }
  71. // are all the members makeing progress?
  72. _, ls1, err := getLeaderStats(tr, []string{ep})
  73. if err != nil {
  74. fmt.Println("cluster is unhealthy")
  75. os.Exit(1)
  76. }
  77. fmt.Println("cluster is healthy")
  78. // self is healthy
  79. var prints []string
  80. prints = append(prints, fmt.Sprintf("member %s is healthy\n", ls1.Leader))
  81. for name, fs0 := range ls0.Followers {
  82. fs1, ok := ls1.Followers[name]
  83. if !ok {
  84. fmt.Println("Cluster configuration changed during health checking. Please retry.")
  85. os.Exit(1)
  86. }
  87. if fs1.Counts.Success <= fs0.Counts.Success {
  88. prints = append(prints, fmt.Sprintf("member %s is unhealthy\n", name))
  89. } else {
  90. prints = append(prints, fmt.Sprintf("member %s is healthy\n", name))
  91. }
  92. }
  93. sort.Strings(prints)
  94. for _, p := range prints {
  95. fmt.Print(p)
  96. }
  97. os.Exit(0)
  98. }
  99. func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.LeaderStats, error) {
  100. // go-etcd does not support cluster stats, use http client for now
  101. // TODO: use new etcd client with new member/stats endpoint
  102. httpclient := http.Client{
  103. Transport: tr,
  104. }
  105. for _, ep := range endpoints {
  106. resp, err := httpclient.Get(ep + "/v2/stats/leader")
  107. if err != nil {
  108. continue
  109. }
  110. defer resp.Body.Close()
  111. if resp.StatusCode != http.StatusOK {
  112. continue
  113. }
  114. ls := &stats.LeaderStats{}
  115. d := json.NewDecoder(resp.Body)
  116. err = d.Decode(ls)
  117. if err != nil {
  118. continue
  119. }
  120. return ep, ls, nil
  121. }
  122. return "", nil, errors.New("no leader")
  123. }