cluster_tester.go 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "fmt"
  17. "os"
  18. "time"
  19. "github.com/coreos/etcd/tools/functional-tester/rpcpb"
  20. "go.uber.org/zap"
  21. )
  22. // compactQPS is rough number of compact requests per second.
  23. // Previous tests showed etcd can compact about 60,000 entries per second.
  24. const compactQPS = 50000
  25. // StartTester starts tester.
  26. func (clus *Cluster) StartTester() {
  27. // TODO: upate status
  28. var preModifiedKey int64
  29. for round := 0; round < int(clus.Tester.RoundLimit) || clus.Tester.RoundLimit == -1; round++ {
  30. roundTotalCounter.Inc()
  31. clus.rd = round
  32. if err := clus.doRound(); err != nil {
  33. clus.lg.Warn(
  34. "doRound failed; returning",
  35. zap.Int("round", clus.rd),
  36. zap.Int("case", clus.cs),
  37. zap.Error(err),
  38. )
  39. if clus.cleanup() != nil {
  40. return
  41. }
  42. // reset preModifiedKey after clean up
  43. preModifiedKey = 0
  44. continue
  45. }
  46. // -1 so that logPrefix doesn't print out 'case'
  47. clus.cs = -1
  48. revToCompact := max(0, clus.currentRevision-10000)
  49. currentModifiedKey := clus.stresser.ModifiedKeys()
  50. modifiedKey := currentModifiedKey - preModifiedKey
  51. preModifiedKey = currentModifiedKey
  52. timeout := 10 * time.Second
  53. timeout += time.Duration(modifiedKey/compactQPS) * time.Second
  54. clus.lg.Info(
  55. "compacting",
  56. zap.Int("round", clus.rd),
  57. zap.Int("case", clus.cs),
  58. zap.Duration("timeout", timeout),
  59. )
  60. if err := clus.compact(revToCompact, timeout); err != nil {
  61. clus.lg.Warn(
  62. "compact failed",
  63. zap.Int("round", clus.rd),
  64. zap.Int("case", clus.cs),
  65. zap.Error(err),
  66. )
  67. if err = clus.cleanup(); err != nil {
  68. clus.lg.Warn(
  69. "cleanup failed",
  70. zap.Int("round", clus.rd),
  71. zap.Int("case", clus.cs),
  72. zap.Error(err),
  73. )
  74. return
  75. }
  76. // reset preModifiedKey after clean up
  77. preModifiedKey = 0
  78. }
  79. if round > 0 && round%500 == 0 { // every 500 rounds
  80. if err := clus.defrag(); err != nil {
  81. clus.lg.Warn(
  82. "defrag failed; returning",
  83. zap.Int("round", clus.rd),
  84. zap.Int("case", clus.cs),
  85. zap.Error(err),
  86. )
  87. clus.failed()
  88. return
  89. }
  90. }
  91. }
  92. clus.lg.Info(
  93. "functional-tester passed",
  94. zap.Int("round", clus.rd),
  95. zap.Int("case", clus.cs),
  96. )
  97. }
  98. func (clus *Cluster) doRound() error {
  99. if clus.Tester.FailureShuffle {
  100. clus.shuffleFailures()
  101. }
  102. clus.lg.Info(
  103. "starting round",
  104. zap.Int("round", clus.rd),
  105. zap.Strings("failures", clus.failureStrings()),
  106. )
  107. for i, fa := range clus.failures {
  108. clus.cs = i
  109. caseTotalCounter.WithLabelValues(fa.Desc()).Inc()
  110. clus.lg.Info(
  111. "failure case START",
  112. zap.Int("round", clus.rd),
  113. zap.Int("case", clus.cs),
  114. zap.String("desc", fa.Desc()),
  115. )
  116. clus.lg.Info("wait health before injecting failures")
  117. if err := clus.WaitHealth(); err != nil {
  118. return fmt.Errorf("wait full health error: %v", err)
  119. }
  120. stressStarted := false
  121. if fa.FailureCase() != rpcpb.FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS {
  122. clus.lg.Info(
  123. "starting stressers before injecting failures",
  124. zap.Int("round", clus.rd),
  125. zap.Int("case", clus.cs),
  126. zap.String("desc", fa.Desc()),
  127. )
  128. if err := clus.stresser.Stress(); err != nil {
  129. return fmt.Errorf("start stresser error: %v", err)
  130. }
  131. stressStarted = true
  132. }
  133. clus.lg.Info(
  134. "injecting",
  135. zap.Int("round", clus.rd),
  136. zap.Int("case", clus.cs),
  137. zap.String("desc", fa.Desc()),
  138. )
  139. if err := fa.Inject(clus); err != nil {
  140. return fmt.Errorf("injection error: %v", err)
  141. }
  142. // if run local, recovering server may conflict
  143. // with stressing client ports
  144. // TODO: use unix for local tests
  145. clus.lg.Info(
  146. "recovering",
  147. zap.Int("round", clus.rd),
  148. zap.Int("case", clus.cs),
  149. zap.String("desc", fa.Desc()),
  150. )
  151. if err := fa.Recover(clus); err != nil {
  152. return fmt.Errorf("recovery error: %v", err)
  153. }
  154. if stressStarted {
  155. clus.lg.Info("pausing stresser after failure recovery, before wait health")
  156. clus.stresser.Pause()
  157. }
  158. clus.lg.Info("wait health after recover")
  159. if err := clus.WaitHealth(); err != nil {
  160. return fmt.Errorf("wait full health error: %v", err)
  161. }
  162. clus.lg.Info("check consistency after recover")
  163. if err := clus.checkConsistency(); err != nil {
  164. return fmt.Errorf("tt.checkConsistency error (%v)", err)
  165. }
  166. clus.lg.Info(
  167. "failure case PASS",
  168. zap.Int("round", clus.rd),
  169. zap.Int("case", clus.cs),
  170. zap.String("desc", fa.Desc()),
  171. )
  172. }
  173. clus.lg.Info(
  174. "finished round",
  175. zap.Int("round", clus.rd),
  176. zap.Strings("failures", clus.failureStrings()),
  177. )
  178. return nil
  179. }
  180. func (clus *Cluster) updateRevision() error {
  181. revs, _, err := clus.getRevisionHash()
  182. for _, rev := range revs {
  183. clus.currentRevision = rev
  184. break // just need get one of the current revisions
  185. }
  186. clus.lg.Info(
  187. "updated current revision",
  188. zap.Int64("current-revision", clus.currentRevision),
  189. )
  190. return err
  191. }
  192. func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) {
  193. clus.lg.Info(
  194. "compacting storage",
  195. zap.Int64("current-revision", clus.currentRevision),
  196. zap.Int64("compact-revision", rev),
  197. )
  198. if err = clus.compactKV(rev, timeout); err != nil {
  199. return err
  200. }
  201. clus.lg.Info(
  202. "compacted storage",
  203. zap.Int64("current-revision", clus.currentRevision),
  204. zap.Int64("compact-revision", rev),
  205. )
  206. clus.lg.Info(
  207. "checking compaction",
  208. zap.Int64("current-revision", clus.currentRevision),
  209. zap.Int64("compact-revision", rev),
  210. )
  211. if err = clus.checkCompact(rev); err != nil {
  212. clus.lg.Warn(
  213. "checkCompact failed",
  214. zap.Int64("current-revision", clus.currentRevision),
  215. zap.Int64("compact-revision", rev),
  216. zap.Error(err),
  217. )
  218. return err
  219. }
  220. clus.lg.Info(
  221. "confirmed compaction",
  222. zap.Int64("current-revision", clus.currentRevision),
  223. zap.Int64("compact-revision", rev),
  224. )
  225. return nil
  226. }
  227. func (clus *Cluster) failed() {
  228. if !clus.Tester.ExitOnFailure {
  229. return
  230. }
  231. clus.lg.Info(
  232. "exiting on failure",
  233. zap.Int("round", clus.rd),
  234. zap.Int("case", clus.cs),
  235. )
  236. clus.DestroyEtcdAgents()
  237. os.Exit(2)
  238. }
  239. func (clus *Cluster) cleanup() error {
  240. defer clus.failed()
  241. roundFailedTotalCounter.Inc()
  242. desc := "compact/defrag"
  243. if clus.cs != -1 {
  244. desc = clus.failures[clus.cs].Desc()
  245. }
  246. caseFailedTotalCounter.WithLabelValues(desc).Inc()
  247. clus.lg.Info(
  248. "closing stressers before archiving failure data",
  249. zap.Int("round", clus.rd),
  250. zap.Int("case", clus.cs),
  251. )
  252. clus.stresser.Close()
  253. if err := clus.FailArchive(); err != nil {
  254. clus.lg.Warn(
  255. "cleanup failed",
  256. zap.Int("round", clus.rd),
  257. zap.Int("case", clus.cs),
  258. zap.Error(err),
  259. )
  260. return err
  261. }
  262. if err := clus.Restart(); err != nil {
  263. clus.lg.Warn(
  264. "restart failed",
  265. zap.Int("round", clus.rd),
  266. zap.Int("case", clus.cs),
  267. zap.Error(err),
  268. )
  269. return err
  270. }
  271. clus.updateStresserChecker()
  272. return nil
  273. }