cluster_health.go 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. package command
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "net/http"
  7. "os"
  8. "sort"
  9. "strings"
  10. "time"
  11. "github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
  12. "github.com/coreos/etcd/Godeps/_workspace/src/github.com/coreos/go-etcd/etcd"
  13. "github.com/coreos/etcd/etcdserver/stats"
  14. )
  15. func NewClusterHealthCommand() cli.Command {
  16. return cli.Command{
  17. Name: "cluster-health",
  18. Usage: "check the health of the etcd cluster",
  19. Flags: []cli.Flag{},
  20. Action: handleClusterHealth,
  21. }
  22. }
  23. func handleClusterHealth(c *cli.Context) {
  24. endpoints, err := getEndpoints(c)
  25. if err != nil {
  26. handleError(ErrorFromEtcd, err)
  27. }
  28. tr, err := getTransport(c)
  29. if err != nil {
  30. handleError(ErrorFromEtcd, err)
  31. }
  32. client := etcd.NewClient(endpoints)
  33. client.SetTransport(tr)
  34. if c.GlobalBool("debug") {
  35. go dumpCURL(client)
  36. }
  37. if ok := client.SyncCluster(); !ok {
  38. handleError(FailedToConnectToHost, errors.New("cannot sync with the cluster using endpoints "+strings.Join(endpoints, ", ")))
  39. }
  40. // do we have a leader?
  41. ep, ls0, err := getLeaderStats(tr, client.GetCluster())
  42. if err != nil {
  43. fmt.Println("cluster is unhealthy")
  44. os.Exit(1)
  45. }
  46. // is raft stable and making progress?
  47. client = etcd.NewClient([]string{ep})
  48. resp, err := client.Get("/", false, false)
  49. if err != nil {
  50. fmt.Println("cluster is unhealthy")
  51. os.Exit(1)
  52. }
  53. rt0, ri0 := resp.RaftTerm, resp.RaftIndex
  54. time.Sleep(time.Second)
  55. resp, err = client.Get("/", false, false)
  56. if err != nil {
  57. fmt.Println("cluster is unhealthy")
  58. os.Exit(1)
  59. }
  60. rt1, ri1 := resp.RaftTerm, resp.RaftIndex
  61. if rt0 != rt1 {
  62. fmt.Println("cluster is unhealthy")
  63. os.Exit(1)
  64. }
  65. if ri1 == ri0 {
  66. fmt.Println("cluster is unhealthy")
  67. os.Exit(1)
  68. }
  69. // are all the members makeing progress?
  70. _, ls1, err := getLeaderStats(tr, []string{ep})
  71. if err != nil {
  72. fmt.Println("cluster is unhealthy")
  73. os.Exit(1)
  74. }
  75. fmt.Println("cluster is healthy")
  76. // self is healthy
  77. var prints []string
  78. prints = append(prints, fmt.Sprintf("member %s is healthy\n", ls1.Leader))
  79. for name, fs0 := range ls0.Followers {
  80. fs1, ok := ls1.Followers[name]
  81. if !ok {
  82. fmt.Println("Cluster configuration changed during health checking. Please retry.")
  83. os.Exit(1)
  84. }
  85. if fs1.Counts.Success <= fs0.Counts.Success {
  86. prints = append(prints, fmt.Sprintf("member %s is unhealthy\n", name))
  87. } else {
  88. prints = append(prints, fmt.Sprintf("member %s is healthy\n", name))
  89. }
  90. }
  91. sort.Strings(prints)
  92. for _, p := range prints {
  93. fmt.Print(p)
  94. }
  95. os.Exit(0)
  96. }
  97. func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.LeaderStats, error) {
  98. // go-etcd does not support cluster stats, use http client for now
  99. // TODO: use new etcd client with new member/stats endpoint
  100. httpclient := http.Client{
  101. Transport: tr,
  102. }
  103. for _, ep := range endpoints {
  104. resp, err := httpclient.Get(ep + "/v2/stats/leader")
  105. if err != nil {
  106. continue
  107. }
  108. defer resp.Body.Close()
  109. if resp.StatusCode != http.StatusOK {
  110. continue
  111. }
  112. ls := &stats.LeaderStats{}
  113. d := json.NewDecoder(resp.Body)
  114. err = d.Decode(ls)
  115. if err != nil {
  116. continue
  117. }
  118. return ep, ls, nil
  119. }
  120. return "", nil, errors.New("no leader")
  121. }