cluster_health.go 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. package command
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "net/http"
  7. "os"
  8. "sort"
  9. "strings"
  10. "time"
  11. "github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
  12. "github.com/coreos/etcd/Godeps/_workspace/src/github.com/coreos/go-etcd/etcd"
  13. "github.com/coreos/etcd/etcdserver/stats"
  14. )
  15. func NewClusterHealthCommand() cli.Command {
  16. return cli.Command{
  17. Name: "cluster-health",
  18. Usage: "check the health of the etcd cluster",
  19. Flags: []cli.Flag{},
  20. Action: handleClusterHealth,
  21. }
  22. }
  23. func handleClusterHealth(c *cli.Context) {
  24. endpoints, err := getEndpoints(c)
  25. if err != nil {
  26. handleError(ErrorFromEtcd, err)
  27. }
  28. tr, err := getTransport(c)
  29. if err != nil {
  30. handleError(ErrorFromEtcd, err)
  31. }
  32. client := etcd.NewClient(endpoints)
  33. client.SetTransport(tr)
  34. if c.GlobalBool("debug") {
  35. go dumpCURL(client)
  36. }
  37. if ok := client.SyncCluster(); !ok {
  38. handleError(FailedToConnectToHost, errors.New("cannot sync with the cluster using endpoints "+strings.Join(endpoints, ", ")))
  39. }
  40. // do we have a leader?
  41. ep, ls0, err := getLeaderStats(tr, client.GetCluster())
  42. if err != nil {
  43. fmt.Println("cluster is unhealthy")
  44. os.Exit(1)
  45. }
  46. // is raft stable and making progress?
  47. client = etcd.NewClient([]string{ep})
  48. client.SetTransport(tr)
  49. resp, err := client.Get("/", false, false)
  50. if err != nil {
  51. fmt.Println("cluster is unhealthy")
  52. os.Exit(1)
  53. }
  54. rt0, ri0 := resp.RaftTerm, resp.RaftIndex
  55. time.Sleep(time.Second)
  56. resp, err = client.Get("/", false, false)
  57. if err != nil {
  58. fmt.Println("cluster is unhealthy")
  59. os.Exit(1)
  60. }
  61. rt1, ri1 := resp.RaftTerm, resp.RaftIndex
  62. if rt0 != rt1 {
  63. fmt.Println("cluster is unhealthy")
  64. os.Exit(1)
  65. }
  66. if ri1 == ri0 {
  67. fmt.Println("cluster is unhealthy")
  68. os.Exit(1)
  69. }
  70. // are all the members makeing progress?
  71. _, ls1, err := getLeaderStats(tr, []string{ep})
  72. if err != nil {
  73. fmt.Println("cluster is unhealthy")
  74. os.Exit(1)
  75. }
  76. fmt.Println("cluster is healthy")
  77. // self is healthy
  78. var prints []string
  79. prints = append(prints, fmt.Sprintf("member %s is healthy\n", ls1.Leader))
  80. for name, fs0 := range ls0.Followers {
  81. fs1, ok := ls1.Followers[name]
  82. if !ok {
  83. fmt.Println("Cluster configuration changed during health checking. Please retry.")
  84. os.Exit(1)
  85. }
  86. if fs1.Counts.Success <= fs0.Counts.Success {
  87. prints = append(prints, fmt.Sprintf("member %s is unhealthy\n", name))
  88. } else {
  89. prints = append(prints, fmt.Sprintf("member %s is healthy\n", name))
  90. }
  91. }
  92. sort.Strings(prints)
  93. for _, p := range prints {
  94. fmt.Print(p)
  95. }
  96. os.Exit(0)
  97. }
  98. func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.LeaderStats, error) {
  99. // go-etcd does not support cluster stats, use http client for now
  100. // TODO: use new etcd client with new member/stats endpoint
  101. httpclient := http.Client{
  102. Transport: tr,
  103. }
  104. for _, ep := range endpoints {
  105. resp, err := httpclient.Get(ep + "/v2/stats/leader")
  106. if err != nil {
  107. continue
  108. }
  109. defer resp.Body.Close()
  110. if resp.StatusCode != http.StatusOK {
  111. continue
  112. }
  113. ls := &stats.LeaderStats{}
  114. d := json.NewDecoder(resp.Body)
  115. err = d.Decode(ls)
  116. if err != nil {
  117. continue
  118. }
  119. return ep, ls, nil
  120. }
  121. return "", nil, errors.New("no leader")
  122. }