tester.go 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package main
  15. import (
  16. "log"
  17. "sync"
  18. "time"
  19. "github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
  20. "github.com/coreos/etcd/Godeps/_workspace/src/google.golang.org/grpc"
  21. pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
  22. )
  23. type tester struct {
  24. failures []failure
  25. cluster *cluster
  26. limit int
  27. status Status
  28. }
  29. func (tt *tester) runLoop() {
  30. tt.status.Since = time.Now()
  31. tt.status.RoundLimit = tt.limit
  32. tt.status.cluster = tt.cluster
  33. for _, f := range tt.failures {
  34. tt.status.Failures = append(tt.status.Failures, f.Desc())
  35. }
  36. for i := 0; i < tt.limit; i++ {
  37. tt.status.setRound(i)
  38. var currentRevision int64
  39. for j, f := range tt.failures {
  40. tt.status.setCase(j)
  41. if err := tt.cluster.WaitHealth(); err != nil {
  42. log.Printf("etcd-tester: [round#%d case#%d] wait full health error: %v", i, j, err)
  43. if err := tt.cleanup(i, j); err != nil {
  44. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  45. return
  46. }
  47. continue
  48. }
  49. log.Printf("etcd-tester: [round#%d case#%d] start failure %s", i, j, f.Desc())
  50. log.Printf("etcd-tester: [round#%d case#%d] start injecting failure...", i, j)
  51. if err := f.Inject(tt.cluster, i); err != nil {
  52. log.Printf("etcd-tester: [round#%d case#%d] injection error: %v", i, j, err)
  53. if err := tt.cleanup(i, j); err != nil {
  54. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  55. return
  56. }
  57. continue
  58. }
  59. log.Printf("etcd-tester: [round#%d case#%d] injected failure", i, j)
  60. log.Printf("etcd-tester: [round#%d case#%d] start recovering failure...", i, j)
  61. if err := f.Recover(tt.cluster, i); err != nil {
  62. log.Printf("etcd-tester: [round#%d case#%d] recovery error: %v", i, j, err)
  63. if err := tt.cleanup(i, j); err != nil {
  64. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  65. return
  66. }
  67. continue
  68. }
  69. log.Printf("etcd-tester: [round#%d case#%d] recovered failure", i, j)
  70. if tt.cluster.v2Only {
  71. log.Printf("etcd-tester: [round#%d case#%d] succeed!", i, j)
  72. continue
  73. }
  74. log.Printf("etcd-tester: [round#%d case#%d] canceling the stressers...", i, j)
  75. for _, s := range tt.cluster.Stressers {
  76. s.Cancel()
  77. }
  78. log.Printf("etcd-tester: [round#%d case#%d] canceled stressers", i, j)
  79. log.Printf("etcd-tester: [round#%d case#%d] checking current revisions...", i, j)
  80. var (
  81. revs map[string]int64
  82. rerr error
  83. ok bool
  84. )
  85. for k := 0; k < 5; k++ {
  86. time.Sleep(time.Second)
  87. revs, rerr = tt.cluster.getRevision()
  88. if rerr != nil {
  89. log.Printf("etcd-tester: [round#%d case#%d.%d] failed to get current revisions (%v)", i, j, k, rerr)
  90. continue
  91. }
  92. if currentRevision, ok = getSameValue(revs); ok {
  93. break
  94. }
  95. log.Printf("etcd-tester: [round#%d case#%d.%d] inconsistent current revisions %+v", i, j, k, revs)
  96. }
  97. if !ok || rerr != nil {
  98. log.Printf("etcd-tester: [round#%d case#%d] checking current revisions failed (%v)", i, j, revs)
  99. if err := tt.cleanup(i, j); err != nil {
  100. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  101. return
  102. }
  103. continue
  104. }
  105. log.Printf("etcd-tester: [round#%d case#%d] all members are consistent with current revisions", i, j)
  106. log.Printf("etcd-tester: [round#%d case#%d] checking current storage hashes...", i, j)
  107. hashes, err := tt.cluster.getKVHash()
  108. if err != nil {
  109. log.Printf("etcd-tester: [round#%d case#%d] getKVHash error (%v)", i, j, err)
  110. if err := tt.cleanup(i, j); err != nil {
  111. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  112. return
  113. }
  114. continue
  115. }
  116. if _, ok = getSameValue(hashes); !ok {
  117. log.Printf("etcd-tester: [round#%d case#%d] checking current storage hashes failed (%v)", i, j, hashes)
  118. if err := tt.cleanup(i, j); err != nil {
  119. log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
  120. return
  121. }
  122. continue
  123. }
  124. log.Printf("etcd-tester: [round#%d case#%d] all members are consistent with storage hashes", i, j)
  125. log.Printf("etcd-tester: [round#%d case#%d] restarting the stressers...", i, j)
  126. for _, s := range tt.cluster.Stressers {
  127. go s.Stress()
  128. }
  129. log.Printf("etcd-tester: [round#%d case#%d] succeed!", i, j)
  130. }
  131. revToCompact := max(0, currentRevision-10000)
  132. log.Printf("etcd-tester: [round#%d] compacting storage at %d (current revision %d)", i, revToCompact, currentRevision)
  133. if err := tt.cluster.compactKV(revToCompact); err != nil {
  134. log.Printf("etcd-tester: [round#%d] compactKV error (%v)", i, err)
  135. if err := tt.cleanup(i, 0); err != nil {
  136. log.Printf("etcd-tester: [round#%d] cleanup error: %v", i, err)
  137. return
  138. }
  139. continue
  140. }
  141. log.Printf("etcd-tester: [round#%d] compacted storage", i)
  142. // TODO: make sure compaction is finished
  143. time.Sleep(30 * time.Second)
  144. }
  145. }
  146. func (tt *tester) cleanup(i, j int) error {
  147. log.Printf("etcd-tester: [round#%d case#%d] cleaning up...", i, j)
  148. if err := tt.cluster.Cleanup(); err != nil {
  149. return err
  150. }
  151. return tt.cluster.Bootstrap()
  152. }
  153. type Status struct {
  154. Since time.Time
  155. Failures []string
  156. RoundLimit int
  157. Cluster ClusterStatus
  158. cluster *cluster
  159. mu sync.Mutex // guards Round and Case
  160. Round int
  161. Case int
  162. }
  163. // get gets a copy of status
  164. func (s *Status) get() Status {
  165. s.mu.Lock()
  166. got := *s
  167. cluster := s.cluster
  168. s.mu.Unlock()
  169. got.Cluster = cluster.Status()
  170. return got
  171. }
  172. func (s *Status) setRound(r int) {
  173. s.mu.Lock()
  174. defer s.mu.Unlock()
  175. s.Round = r
  176. }
  177. func (s *Status) setCase(c int) {
  178. s.mu.Lock()
  179. defer s.mu.Unlock()
  180. s.Case = c
  181. }
  182. func (c *cluster) getRevision() (map[string]int64, error) {
  183. revs := make(map[string]int64)
  184. for _, u := range c.GRPCURLs {
  185. conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
  186. if err != nil {
  187. return nil, err
  188. }
  189. kvc := pb.NewKVClient(conn)
  190. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  191. resp, err := kvc.Range(ctx, &pb.RangeRequest{Key: []byte("foo")})
  192. if err != nil {
  193. return nil, err
  194. }
  195. cancel()
  196. revs[u] = resp.Header.Revision
  197. }
  198. return revs, nil
  199. }
  200. func (c *cluster) getKVHash() (map[string]int64, error) {
  201. hashes := make(map[string]int64)
  202. for _, u := range c.GRPCURLs {
  203. conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
  204. if err != nil {
  205. return nil, err
  206. }
  207. kvc := pb.NewKVClient(conn)
  208. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  209. resp, err := kvc.Hash(ctx, &pb.HashRequest{})
  210. if err != nil {
  211. return nil, err
  212. }
  213. cancel()
  214. hashes[u] = int64(resp.Hash)
  215. }
  216. return hashes, nil
  217. }
  218. func (c *cluster) compactKV(rev int64) error {
  219. var (
  220. conn *grpc.ClientConn
  221. err error
  222. )
  223. for _, u := range c.GRPCURLs {
  224. conn, err = grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
  225. if err != nil {
  226. continue
  227. }
  228. kvc := pb.NewKVClient(conn)
  229. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  230. _, err = kvc.Compact(ctx, &pb.CompactionRequest{Revision: rev})
  231. cancel()
  232. if err == nil {
  233. return nil
  234. }
  235. }
  236. return err
  237. }