tester.go 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
  14. package main
  15. import (
  16. "log"
  17. "sync"
  18. "time"
  19. "github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
  20. "github.com/coreos/etcd/Godeps/_workspace/src/google.golang.org/grpc"
  21. pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
  22. )
  23. type tester struct {
  24. failures []failure
  25. cluster *cluster
  26. limit int
  27. status Status
  28. }
  29. func (tt *tester) runLoop() {
  30. tt.status.Since = time.Now()
  31. tt.status.RoundLimit = tt.limit
  32. tt.status.cluster = tt.cluster
  33. for _, f := range tt.failures {
  34. tt.status.Failures = append(tt.status.Failures, f.Desc())
  35. }
  36. for i := 0; i < tt.limit; i++ {
  37. tt.status.setRound(i)
  38. for j, f := range tt.failures {
  39. tt.status.setCase(j)
  40. if err := tt.cluster.WaitHealth(); err != nil {
  41. log.Printf("etcd-tester: [round#%d case#%d] wait full health error: %v", i, j, err)
  42. if err := tt.cleanup(i, j); err != nil {
  43. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  44. return
  45. }
  46. continue
  47. }
  48. log.Printf("etcd-tester: [round#%d case#%d] start failure %s", i, j, f.Desc())
  49. log.Printf("etcd-tester: [round#%d case#%d] start injecting failure...", i, j)
  50. if err := f.Inject(tt.cluster, i); err != nil {
  51. log.Printf("etcd-tester: [round#%d case#%d] injection error: %v", i, j, err)
  52. if err := tt.cleanup(i, j); err != nil {
  53. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  54. return
  55. }
  56. continue
  57. }
  58. log.Printf("etcd-tester: [round#%d case#%d] injected failure", i, j)
  59. log.Printf("etcd-tester: [round#%d case#%d] start recovering failure...", i, j)
  60. if err := f.Recover(tt.cluster, i); err != nil {
  61. log.Printf("etcd-tester: [round#%d case#%d] recovery error: %v", i, j, err)
  62. if err := tt.cleanup(i, j); err != nil {
  63. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  64. return
  65. }
  66. continue
  67. }
  68. log.Printf("etcd-tester: [round#%d case#%d] recovered failure", i, j)
  69. if tt.cluster.v2Only {
  70. log.Printf("etcd-tester: [round#%d case#%d] succeed!", i, j)
  71. continue
  72. }
  73. log.Printf("etcd-tester: [round#%d case#%d] canceling the stressers...", i, j)
  74. for _, s := range tt.cluster.Stressers {
  75. s.Cancel()
  76. }
  77. log.Printf("etcd-tester: [round#%d case#%d] canceled stressers", i, j)
  78. log.Printf("etcd-tester: [round#%d case#%d] checking current revisions...", i, j)
  79. ok := false
  80. var currentRevision int64
  81. for k := 0; k < 5; k++ {
  82. time.Sleep(time.Second)
  83. revs, err := tt.cluster.getRevision()
  84. if err != nil {
  85. if e := tt.cleanup(i, j); e != nil {
  86. log.Printf("etcd-tester: [round#%d case#%d.%d] cleanup error: %v", i, j, k, e)
  87. return
  88. }
  89. log.Printf("etcd-tester: [round#%d case#%d.%d] failed to get current revisions (%v)", i, j, k, err)
  90. continue
  91. }
  92. if currentRevision, ok = getSameValue(revs); ok {
  93. break
  94. } else {
  95. log.Printf("etcd-tester: [round#%d case#%d.%d] inconsistent current revisions %+v", i, j, k, revs)
  96. }
  97. }
  98. if !ok {
  99. log.Printf("etcd-tester: [round#%d case#%d] checking current revisions failure...", i, j)
  100. if err := tt.cleanup(i, j); err != nil {
  101. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  102. return
  103. }
  104. continue
  105. }
  106. log.Printf("etcd-tester: [round#%d case#%d] all members are consistent with current revisions", i, j)
  107. log.Printf("etcd-tester: [round#%d case#%d] checking current storage hashes...", i, j)
  108. hashes, err := tt.cluster.getKVHash()
  109. if err != nil {
  110. log.Printf("etcd-tester: [round#%d case#%d] getKVHash error (%v)", i, j, err)
  111. if err := tt.cleanup(i, j); err != nil {
  112. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  113. return
  114. }
  115. }
  116. if _, ok = getSameValue(hashes); !ok {
  117. if err := tt.cleanup(i, j); err != nil {
  118. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  119. return
  120. }
  121. continue
  122. }
  123. log.Printf("etcd-tester: [round#%d case#%d] all members are consistent with storage hashes", i, j)
  124. revToCompact := max(0, currentRevision-10000)
  125. log.Printf("etcd-tester: [round#%d case#%d] compacting storage at %d (current revision %d)", i, j, revToCompact, currentRevision)
  126. if err := tt.cluster.compactKV(revToCompact); err != nil {
  127. log.Printf("etcd-tester: [round#%d case#%d] compactKV error (%v)", i, j, err)
  128. if err := tt.cleanup(i, j); err != nil {
  129. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  130. return
  131. }
  132. }
  133. log.Printf("etcd-tester: [round#%d case#%d] compacted storage", i, j)
  134. log.Printf("etcd-tester: [round#%d case#%d] restarting the stressers...", i, j)
  135. for _, s := range tt.cluster.Stressers {
  136. go s.Stress()
  137. }
  138. log.Printf("etcd-tester: [round#%d case#%d] succeed!", i, j)
  139. }
  140. }
  141. }
  142. func (tt *tester) cleanup(i, j int) error {
  143. log.Printf("etcd-tester: [round#%d case#%d] cleaning up...", i, j)
  144. if err := tt.cluster.Cleanup(); err != nil {
  145. return err
  146. }
  147. return tt.cluster.Bootstrap()
  148. }
  149. type Status struct {
  150. Since time.Time
  151. Failures []string
  152. RoundLimit int
  153. Cluster ClusterStatus
  154. cluster *cluster
  155. mu sync.Mutex // guards Round and Case
  156. Round int
  157. Case int
  158. }
  159. // get gets a copy of status
  160. func (s *Status) get() Status {
  161. s.mu.Lock()
  162. got := *s
  163. cluster := s.cluster
  164. s.mu.Unlock()
  165. got.Cluster = cluster.Status()
  166. return got
  167. }
  168. func (s *Status) setRound(r int) {
  169. s.mu.Lock()
  170. defer s.mu.Unlock()
  171. s.Round = r
  172. }
  173. func (s *Status) setCase(c int) {
  174. s.mu.Lock()
  175. defer s.mu.Unlock()
  176. s.Case = c
  177. }
  178. func (c *cluster) getRevision() (map[string]int64, error) {
  179. revs := make(map[string]int64)
  180. for _, u := range c.GRPCURLs {
  181. conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
  182. if err != nil {
  183. return nil, err
  184. }
  185. kvc := pb.NewKVClient(conn)
  186. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  187. resp, err := kvc.Range(ctx, &pb.RangeRequest{Key: []byte("foo")})
  188. if err != nil {
  189. return nil, err
  190. }
  191. cancel()
  192. revs[u] = resp.Header.Revision
  193. }
  194. return revs, nil
  195. }
  196. func (c *cluster) getKVHash() (map[string]int64, error) {
  197. hashes := make(map[string]int64)
  198. for _, u := range c.GRPCURLs {
  199. conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
  200. if err != nil {
  201. return nil, err
  202. }
  203. kvc := pb.NewKVClient(conn)
  204. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  205. resp, err := kvc.Hash(ctx, &pb.HashRequest{})
  206. if resp != nil && err != nil {
  207. return nil, err
  208. }
  209. cancel()
  210. hashes[u] = int64(resp.Hash)
  211. }
  212. return hashes, nil
  213. }
  214. func getSameValue(hashes map[string]int64) (int64, bool) {
  215. var rv int64
  216. ok := true
  217. for _, v := range hashes {
  218. if rv == 0 {
  219. rv = v
  220. }
  221. if rv != v {
  222. ok = false
  223. break
  224. }
  225. }
  226. return rv, ok
  227. }
  228. func max(n1, n2 int64) int64 {
  229. if n1 > n2 {
  230. return n1
  231. }
  232. return n2
  233. }
  234. func (c *cluster) compactKV(rev int64) error {
  235. var (
  236. conn *grpc.ClientConn
  237. err error
  238. )
  239. for _, u := range c.GRPCURLs {
  240. conn, err = grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
  241. if err != nil {
  242. continue
  243. }
  244. kvc := pb.NewKVClient(conn)
  245. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  246. _, err = kvc.Compact(ctx, &pb.CompactionRequest{Revision: rev})
  247. cancel()
  248. if err == nil {
  249. return nil
  250. }
  251. }
  252. return err
  253. }