tester.go 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package main
  15. import (
  16. "log"
  17. "sync"
  18. "time"
  19. "github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
  20. "github.com/coreos/etcd/Godeps/_workspace/src/google.golang.org/grpc"
  21. pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
  22. )
  23. type tester struct {
  24. failures []failure
  25. cluster *cluster
  26. limit int
  27. status Status
  28. }
  29. func (tt *tester) runLoop() {
  30. tt.status.Since = time.Now()
  31. tt.status.RoundLimit = tt.limit
  32. tt.status.cluster = tt.cluster
  33. for _, f := range tt.failures {
  34. tt.status.Failures = append(tt.status.Failures, f.Desc())
  35. }
  36. for i := 0; i < tt.limit; i++ {
  37. tt.status.setRound(i)
  38. var currentRevision int64
  39. for j, f := range tt.failures {
  40. tt.status.setCase(j)
  41. if err := tt.cluster.WaitHealth(); err != nil {
  42. log.Printf("etcd-tester: [round#%d case#%d] wait full health error: %v", i, j, err)
  43. if err := tt.cleanup(i, j); err != nil {
  44. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  45. return
  46. }
  47. continue
  48. }
  49. log.Printf("etcd-tester: [round#%d case#%d] start failure %s", i, j, f.Desc())
  50. log.Printf("etcd-tester: [round#%d case#%d] start injecting failure...", i, j)
  51. if err := f.Inject(tt.cluster, i); err != nil {
  52. log.Printf("etcd-tester: [round#%d case#%d] injection error: %v", i, j, err)
  53. if err := tt.cleanup(i, j); err != nil {
  54. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  55. return
  56. }
  57. continue
  58. }
  59. log.Printf("etcd-tester: [round#%d case#%d] injected failure", i, j)
  60. log.Printf("etcd-tester: [round#%d case#%d] start recovering failure...", i, j)
  61. if err := f.Recover(tt.cluster, i); err != nil {
  62. log.Printf("etcd-tester: [round#%d case#%d] recovery error: %v", i, j, err)
  63. if err := tt.cleanup(i, j); err != nil {
  64. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  65. return
  66. }
  67. continue
  68. }
  69. log.Printf("etcd-tester: [round#%d case#%d] recovered failure", i, j)
  70. if tt.cluster.v2Only {
  71. log.Printf("etcd-tester: [round#%d case#%d] succeed!", i, j)
  72. continue
  73. }
  74. log.Printf("etcd-tester: [round#%d case#%d] canceling the stressers...", i, j)
  75. for _, s := range tt.cluster.Stressers {
  76. s.Cancel()
  77. }
  78. log.Printf("etcd-tester: [round#%d case#%d] canceled stressers", i, j)
  79. log.Printf("etcd-tester: [round#%d case#%d] checking current revisions...", i, j)
  80. var (
  81. revs map[string]int64
  82. rerr error
  83. ok bool
  84. )
  85. for k := 0; k < 5; k++ {
  86. time.Sleep(time.Second)
  87. revs, rerr = tt.cluster.getRevision()
  88. if rerr != nil {
  89. log.Printf("etcd-tester: [round#%d case#%d.%d] failed to get current revisions (%v)", i, j, k, rerr)
  90. continue
  91. }
  92. if currentRevision, ok = getSameValue(revs); ok {
  93. break
  94. }
  95. log.Printf("etcd-tester: [round#%d case#%d.%d] inconsistent current revisions %+v", i, j, k, revs)
  96. }
  97. if !ok || rerr != nil {
  98. log.Printf("etcd-tester: [round#%d case#%d] checking current revisions failed (%v)", i, j, revs)
  99. if err := tt.cleanup(i, j); err != nil {
  100. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  101. return
  102. }
  103. continue
  104. }
  105. log.Printf("etcd-tester: [round#%d case#%d] all members are consistent with current revisions", i, j)
  106. log.Printf("etcd-tester: [round#%d case#%d] checking current storage hashes...", i, j)
  107. hashes, err := tt.cluster.getKVHash()
  108. if err != nil {
  109. log.Printf("etcd-tester: [round#%d case#%d] getKVHash error (%v)", i, j, err)
  110. if err := tt.cleanup(i, j); err != nil {
  111. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  112. return
  113. }
  114. }
  115. if _, ok = getSameValue(hashes); !ok {
  116. log.Printf("etcd-tester: [round#%d case#%d] checking current storage hashes failed (%v)", i, j, hashes)
  117. if err := tt.cleanup(i, j); err != nil {
  118. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  119. return
  120. }
  121. continue
  122. }
  123. log.Printf("etcd-tester: [round#%d case#%d] all members are consistent with storage hashes", i, j)
  124. log.Printf("etcd-tester: [round#%d case#%d] restarting the stressers...", i, j)
  125. for _, s := range tt.cluster.Stressers {
  126. go s.Stress()
  127. }
  128. log.Printf("etcd-tester: [round#%d case#%d] succeed!", i, j)
  129. }
  130. revToCompact := max(0, currentRevision-10000)
  131. log.Printf("etcd-tester: [round#%d] compacting storage at %d (current revision %d)", i, revToCompact, currentRevision)
  132. if err := tt.cluster.compactKV(revToCompact); err != nil {
  133. log.Printf("etcd-tester: [round#%d] compactKV error (%v)", i, err)
  134. if err := tt.cleanup(i, 0); err != nil {
  135. log.Printf("etcd-tester: [round#%d] cleanup error: %v", i, err)
  136. return
  137. }
  138. }
  139. log.Printf("etcd-tester: [round#%d] compacted storage", i)
  140. // TODO: make sure compaction is finished
  141. time.Sleep(30 * time.Second)
  142. }
  143. }
  144. func (tt *tester) cleanup(i, j int) error {
  145. log.Printf("etcd-tester: [round#%d case#%d] cleaning up...", i, j)
  146. if err := tt.cluster.Cleanup(); err != nil {
  147. return err
  148. }
  149. return tt.cluster.Bootstrap()
  150. }
  151. type Status struct {
  152. Since time.Time
  153. Failures []string
  154. RoundLimit int
  155. Cluster ClusterStatus
  156. cluster *cluster
  157. mu sync.Mutex // guards Round and Case
  158. Round int
  159. Case int
  160. }
  161. // get gets a copy of status
  162. func (s *Status) get() Status {
  163. s.mu.Lock()
  164. got := *s
  165. cluster := s.cluster
  166. s.mu.Unlock()
  167. got.Cluster = cluster.Status()
  168. return got
  169. }
  170. func (s *Status) setRound(r int) {
  171. s.mu.Lock()
  172. defer s.mu.Unlock()
  173. s.Round = r
  174. }
  175. func (s *Status) setCase(c int) {
  176. s.mu.Lock()
  177. defer s.mu.Unlock()
  178. s.Case = c
  179. }
  180. func (c *cluster) getRevision() (map[string]int64, error) {
  181. revs := make(map[string]int64)
  182. for _, u := range c.GRPCURLs {
  183. conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
  184. if err != nil {
  185. return nil, err
  186. }
  187. kvc := pb.NewKVClient(conn)
  188. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  189. resp, err := kvc.Range(ctx, &pb.RangeRequest{Key: []byte("foo")})
  190. if err != nil {
  191. return nil, err
  192. }
  193. cancel()
  194. revs[u] = resp.Header.Revision
  195. }
  196. return revs, nil
  197. }
  198. func (c *cluster) getKVHash() (map[string]int64, error) {
  199. hashes := make(map[string]int64)
  200. for _, u := range c.GRPCURLs {
  201. conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
  202. if err != nil {
  203. return nil, err
  204. }
  205. kvc := pb.NewKVClient(conn)
  206. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  207. resp, err := kvc.Hash(ctx, &pb.HashRequest{})
  208. if err != nil {
  209. return nil, err
  210. }
  211. cancel()
  212. hashes[u] = int64(resp.Hash)
  213. }
  214. return hashes, nil
  215. }
  216. func (c *cluster) compactKV(rev int64) error {
  217. var (
  218. conn *grpc.ClientConn
  219. err error
  220. )
  221. for _, u := range c.GRPCURLs {
  222. conn, err = grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
  223. if err != nil {
  224. continue
  225. }
  226. kvc := pb.NewKVClient(conn)
  227. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  228. _, err = kvc.Compact(ctx, &pb.CompactionRequest{Revision: rev})
  229. cancel()
  230. if err == nil {
  231. return nil
  232. }
  233. }
  234. return err
  235. }