tester.go 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package main
  15. import (
  16. "sync"
  17. "time"
  18. )
  19. type tester struct {
  20. failures []failure
  21. cluster *cluster
  22. limit int
  23. consistencyCheck bool
  24. status Status
  25. }
  26. func (tt *tester) runLoop() {
  27. tt.status.Since = time.Now()
  28. tt.status.RoundLimit = tt.limit
  29. tt.status.cluster = tt.cluster
  30. for _, f := range tt.failures {
  31. tt.status.Failures = append(tt.status.Failures, f.Desc())
  32. }
  33. for i := 0; i < tt.limit; i++ {
  34. tt.status.setRound(i)
  35. roundTotalCounter.Inc()
  36. var (
  37. currentRevision int64
  38. failed bool
  39. )
  40. for j, f := range tt.failures {
  41. caseTotalCounter.WithLabelValues(f.Desc()).Inc()
  42. tt.status.setCase(j)
  43. if err := tt.cluster.WaitHealth(); err != nil {
  44. plog.Printf("[round#%d case#%d] wait full health error: %v", i, j, err)
  45. if err := tt.cleanup(i, j); err != nil {
  46. plog.Printf("[round#%d case#%d] cleanup error: %v", i, j, err)
  47. return
  48. }
  49. failed = true
  50. break
  51. }
  52. plog.Printf("[round#%d case#%d] start failure %s", i, j, f.Desc())
  53. plog.Printf("[round#%d case#%d] start injecting failure...", i, j)
  54. if err := f.Inject(tt.cluster, i); err != nil {
  55. plog.Printf("[round#%d case#%d] injection error: %v", i, j, err)
  56. if err := tt.cleanup(i, j); err != nil {
  57. plog.Printf("[round#%d case#%d] cleanup error: %v", i, j, err)
  58. return
  59. }
  60. failed = true
  61. break
  62. }
  63. plog.Printf("[round#%d case#%d] injected failure", i, j)
  64. plog.Printf("[round#%d case#%d] start recovering failure...", i, j)
  65. if err := f.Recover(tt.cluster, i); err != nil {
  66. plog.Printf("[round#%d case#%d] recovery error: %v", i, j, err)
  67. if err := tt.cleanup(i, j); err != nil {
  68. plog.Printf("[round#%d case#%d] cleanup error: %v", i, j, err)
  69. return
  70. }
  71. failed = true
  72. break
  73. }
  74. plog.Printf("[round#%d case#%d] recovered failure", i, j)
  75. if !tt.consistencyCheck {
  76. continue
  77. }
  78. if tt.cluster.v2Only {
  79. plog.Printf("[round#%d case#%d] succeed!", i, j)
  80. continue
  81. }
  82. plog.Printf("[round#%d case#%d] canceling the stressers...", i, j)
  83. for _, s := range tt.cluster.Stressers {
  84. s.Cancel()
  85. }
  86. plog.Printf("[round#%d case#%d] canceled stressers", i, j)
  87. plog.Printf("[round#%d case#%d] checking current revisions...", i, j)
  88. var (
  89. revs map[string]int64
  90. hashes map[string]int64
  91. rerr error
  92. ok bool
  93. )
  94. for k := 0; k < 5; k++ {
  95. time.Sleep(time.Second)
  96. revs, hashes, rerr = tt.cluster.getRevisionHash()
  97. if rerr != nil {
  98. plog.Printf("[round#%d case#%d.%d] failed to get current revisions (%v)", i, j, k, rerr)
  99. continue
  100. }
  101. if currentRevision, ok = getSameValue(revs); ok {
  102. break
  103. }
  104. plog.Printf("[round#%d case#%d.%d] inconsistent current revisions %+v", i, j, k, revs)
  105. }
  106. if !ok || rerr != nil {
  107. plog.Printf("[round#%d case#%d] checking current revisions failed [revisions: %v]", i, j, revs)
  108. if err := tt.cleanup(i, j); err != nil {
  109. plog.Printf("[round#%d case#%d] cleanup error: %v", i, j, err)
  110. return
  111. }
  112. failed = true
  113. break
  114. }
  115. plog.Printf("[round#%d case#%d] all members are consistent with current revisions [revisions: %v]", i, j, revs)
  116. plog.Printf("[round#%d case#%d] checking current storage hashes...", i, j)
  117. if _, ok = getSameValue(hashes); !ok {
  118. plog.Printf("[round#%d case#%d] checking current storage hashes failed [hashes: %v]", i, j, hashes)
  119. if err := tt.cleanup(i, j); err != nil {
  120. plog.Printf("[round#%d case#%d] cleanup error: %v", i, j, err)
  121. return
  122. }
  123. failed = true
  124. break
  125. }
  126. plog.Printf("[round#%d case#%d] all members are consistent with storage hashes", i, j)
  127. plog.Printf("[round#%d case#%d] restarting the stressers...", i, j)
  128. for _, s := range tt.cluster.Stressers {
  129. go s.Stress()
  130. }
  131. plog.Printf("[round#%d case#%d] succeed!", i, j)
  132. }
  133. if failed {
  134. continue
  135. }
  136. revToCompact := max(0, currentRevision-10000)
  137. plog.Printf("[round#%d] compacting storage at %d (current revision %d)", i, revToCompact, currentRevision)
  138. if err := tt.cluster.compactKV(revToCompact); err != nil {
  139. plog.Printf("[round#%d] compactKV error (%v)", i, err)
  140. if err := tt.cleanup(i, 0); err != nil {
  141. plog.Printf("[round#%d] cleanup error: %v", i, err)
  142. return
  143. }
  144. continue
  145. }
  146. plog.Printf("[round#%d] compacted storage", i)
  147. plog.Printf("[round#%d] check compaction at %d", i, revToCompact)
  148. if err := tt.cluster.checkCompact(revToCompact); err != nil {
  149. plog.Printf("[round#%d] checkCompact error (%v)", i, err)
  150. if err := tt.cleanup(i, 0); err != nil {
  151. plog.Printf("[round#%d] cleanup error: %v", i, err)
  152. return
  153. }
  154. }
  155. plog.Printf("[round#%d] confirmed compaction at %d", i, revToCompact)
  156. if i > 0 && i%500 == 0 { // every 500 rounds
  157. plog.Printf("[round#%d] canceling the stressers...", i)
  158. for _, s := range tt.cluster.Stressers {
  159. s.Cancel()
  160. }
  161. plog.Printf("[round#%d] canceled stressers", i)
  162. plog.Printf("[round#%d] deframenting...", i)
  163. if err := tt.cluster.defrag(); err != nil {
  164. plog.Printf("[round#%d] defrag error (%v)", i, err)
  165. if err := tt.cleanup(i, 0); err != nil {
  166. plog.Printf("[round#%d] cleanup error: %v", i, err)
  167. return
  168. }
  169. }
  170. plog.Printf("[round#%d] deframented...", i)
  171. plog.Printf("[round#%d] restarting the stressers...", i)
  172. for _, s := range tt.cluster.Stressers {
  173. go s.Stress()
  174. }
  175. }
  176. }
  177. }
  178. func (tt *tester) cleanup(i, j int) error {
  179. roundFailedTotalCounter.Inc()
  180. caseFailedTotalCounter.WithLabelValues(tt.failures[j].Desc()).Inc()
  181. plog.Printf("[round#%d case#%d] cleaning up...", i, j)
  182. if err := tt.cluster.Cleanup(); err != nil {
  183. return err
  184. }
  185. return tt.cluster.Bootstrap()
  186. }
  187. type Status struct {
  188. Since time.Time
  189. Failures []string
  190. RoundLimit int
  191. Cluster ClusterStatus
  192. cluster *cluster
  193. mu sync.Mutex // guards Round and Case
  194. Round int
  195. Case int
  196. }
  197. func (s *Status) setRound(r int) {
  198. s.mu.Lock()
  199. defer s.mu.Unlock()
  200. s.Round = r
  201. }
  202. func (s *Status) setCase(c int) {
  203. s.mu.Lock()
  204. defer s.mu.Unlock()
  205. s.Case = c
  206. }