tester.go 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package main
  15. import (
  16. "log"
  17. "sync"
  18. "time"
  19. "github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
  20. "github.com/coreos/etcd/Godeps/_workspace/src/google.golang.org/grpc"
  21. pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
  22. )
  23. type tester struct {
  24. failures []failure
  25. cluster *cluster
  26. limit int
  27. status Status
  28. }
  29. func (tt *tester) runLoop() {
  30. tt.status.Since = time.Now()
  31. tt.status.RoundLimit = tt.limit
  32. tt.status.cluster = tt.cluster
  33. for _, f := range tt.failures {
  34. tt.status.Failures = append(tt.status.Failures, f.Desc())
  35. }
  36. for i := 0; i < tt.limit; i++ {
  37. tt.status.setRound(i)
  38. for j, f := range tt.failures {
  39. tt.status.setCase(j)
  40. if err := tt.cluster.WaitHealth(); err != nil {
  41. log.Printf("etcd-tester: [round#%d case#%d] wait full health error: %v", i, j, err)
  42. if err := tt.cleanup(i, j); err != nil {
  43. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  44. return
  45. }
  46. continue
  47. }
  48. log.Printf("etcd-tester: [round#%d case#%d] start failure %s", i, j, f.Desc())
  49. log.Printf("etcd-tester: [round#%d case#%d] start injecting failure...", i, j)
  50. if err := f.Inject(tt.cluster, i); err != nil {
  51. log.Printf("etcd-tester: [round#%d case#%d] injection error: %v", i, j, err)
  52. if err := tt.cleanup(i, j); err != nil {
  53. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  54. return
  55. }
  56. continue
  57. }
  58. log.Printf("etcd-tester: [round#%d case#%d] start recovering failure...", i, j)
  59. if err := f.Recover(tt.cluster, i); err != nil {
  60. log.Printf("etcd-tester: [round#%d case#%d] recovery error: %v", i, j, err)
  61. if err := tt.cleanup(i, j); err != nil {
  62. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  63. return
  64. }
  65. continue
  66. }
  67. if tt.cluster.v2Only {
  68. log.Printf("etcd-tester: [round#%d case#%d] succeed!", i, j)
  69. continue
  70. }
  71. log.Printf("etcd-tester: [round#%d case#%d] canceling the stressers...", i, j)
  72. for _, s := range tt.cluster.Stressers {
  73. s.Cancel()
  74. }
  75. ok := false
  76. for k := 0; k < 5; k++ {
  77. time.Sleep(time.Second)
  78. log.Printf("etcd-tester: [round#%d case#%d.%d] checking current revisions...", i, j, k)
  79. revs, err := tt.cluster.getRevision()
  80. if err != nil {
  81. if e := tt.cleanup(i, j); e != nil {
  82. log.Printf("etcd-tester: [round#%d case#%d.%d] cleanup error: %v", i, j, k, e)
  83. return
  84. }
  85. log.Printf("etcd-tester: [round#%d case#%d.%d] failed to get revisions (%v)", i, j, k, err)
  86. continue
  87. }
  88. if ok = isSameValueInMap(revs); ok {
  89. log.Printf("etcd-tester: [round#%d case#%d.%d] checking current revisions succeed!", i, j, k)
  90. break
  91. } else {
  92. log.Printf("etcd-tester: [round#%d case#%d.%d] current revisions %+v", i, j, k, revs)
  93. }
  94. }
  95. if !ok {
  96. log.Printf("etcd-tester: [round#%d case#%d] checking current revisions failure...", i, j)
  97. if err := tt.cleanup(i, j); err != nil {
  98. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  99. return
  100. }
  101. continue
  102. }
  103. log.Printf("etcd-tester: [round#%d case#%d] checking current storage hashes...", i, j)
  104. hashes, err := tt.cluster.getKVHash()
  105. if err != nil {
  106. log.Printf("etcd-tester: [round#%d case#%d] getKVHash error (%v)", i, j, err)
  107. if err := tt.cleanup(i, j); err != nil {
  108. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  109. return
  110. }
  111. }
  112. if !isSameValueInMap(hashes) {
  113. if err := tt.cleanup(i, j); err != nil {
  114. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  115. return
  116. }
  117. continue
  118. }
  119. log.Printf("etcd-tester: [round#%d case#%d] all members are consistent!", i, j)
  120. log.Printf("etcd-tester: [round#%d case#%d] succeed!", i, j)
  121. log.Printf("etcd-tester: [round#%d case#%d] restarting the stressers...", i, j)
  122. for _, s := range tt.cluster.Stressers {
  123. go s.Stress()
  124. }
  125. }
  126. }
  127. }
  128. func (tt *tester) cleanup(i, j int) error {
  129. log.Printf("etcd-tester: [round#%d case#%d] cleaning up...", i, j)
  130. if err := tt.cluster.Cleanup(); err != nil {
  131. return err
  132. }
  133. return tt.cluster.Bootstrap()
  134. }
  135. type Status struct {
  136. Since time.Time
  137. Failures []string
  138. RoundLimit int
  139. Cluster ClusterStatus
  140. cluster *cluster
  141. mu sync.Mutex // guards Round and Case
  142. Round int
  143. Case int
  144. }
  145. // get gets a copy of status
  146. func (s *Status) get() Status {
  147. s.mu.Lock()
  148. got := *s
  149. cluster := s.cluster
  150. s.mu.Unlock()
  151. got.Cluster = cluster.Status()
  152. return got
  153. }
  154. func (s *Status) setRound(r int) {
  155. s.mu.Lock()
  156. defer s.mu.Unlock()
  157. s.Round = r
  158. }
  159. func (s *Status) setCase(c int) {
  160. s.mu.Lock()
  161. defer s.mu.Unlock()
  162. s.Case = c
  163. }
  164. func (c *cluster) getRevision() (map[string]int64, error) {
  165. revs := make(map[string]int64)
  166. for _, u := range c.GRPCURLs {
  167. conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
  168. if err != nil {
  169. return nil, err
  170. }
  171. kvc := pb.NewKVClient(conn)
  172. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  173. resp, err := kvc.Range(ctx, &pb.RangeRequest{Key: []byte("foo")})
  174. if err != nil {
  175. return nil, err
  176. }
  177. cancel()
  178. revs[u] = resp.Header.Revision
  179. }
  180. return revs, nil
  181. }
  182. func (c *cluster) getKVHash() (map[string]int64, error) {
  183. hashes := make(map[string]int64)
  184. for _, u := range c.GRPCURLs {
  185. conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
  186. if err != nil {
  187. return nil, err
  188. }
  189. kvc := pb.NewKVClient(conn)
  190. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  191. resp, err := kvc.Hash(ctx, &pb.HashRequest{})
  192. if resp != nil && err != nil {
  193. return nil, err
  194. }
  195. cancel()
  196. hashes[u] = int64(resp.Hash)
  197. }
  198. return hashes, nil
  199. }
  200. func isSameValueInMap(hashes map[string]int64) bool {
  201. var rv int64
  202. ok := true
  203. for _, v := range hashes {
  204. if rv == 0 {
  205. rv = v
  206. }
  207. if rv != v {
  208. ok = false
  209. break
  210. }
  211. }
  212. return ok
  213. }