cluster_tester.go 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "fmt"
  17. "os"
  18. "time"
  19. "github.com/coreos/etcd/tools/functional-tester/rpcpb"
  20. "go.uber.org/zap"
  21. )
  // compactQPS is a rough estimate of the number of compact requests served
  // per second. Previous tests showed etcd can compact about 60,000 entries
  // per second; the value used here sits below that figure.
  // StartTester divides the number of keys modified since the previous
  // compaction by compactQPS to extend the compaction timeout.
  const compactQPS = 50000
  25. // StartTester starts tester.
  26. func (clus *Cluster) StartTester() {
  27. // TODO: upate status
  28. clus.startStresser()
  29. var preModifiedKey int64
  30. for round := 0; round < int(clus.Tester.RoundLimit) || clus.Tester.RoundLimit == -1; round++ {
  31. roundTotalCounter.Inc()
  32. clus.rd = round
  33. if err := clus.doRound(); err != nil {
  34. clus.lg.Warn(
  35. "doRound failed; returning",
  36. zap.Int("round", clus.rd),
  37. zap.Int("case", clus.cs),
  38. zap.Error(err),
  39. )
  40. if clus.cleanup() != nil {
  41. return
  42. }
  43. // reset preModifiedKey after clean up
  44. preModifiedKey = 0
  45. continue
  46. }
  47. // -1 so that logPrefix doesn't print out 'case'
  48. clus.cs = -1
  49. revToCompact := max(0, clus.currentRevision-10000)
  50. currentModifiedKey := clus.stresser.ModifiedKeys()
  51. modifiedKey := currentModifiedKey - preModifiedKey
  52. preModifiedKey = currentModifiedKey
  53. timeout := 10 * time.Second
  54. timeout += time.Duration(modifiedKey/compactQPS) * time.Second
  55. clus.lg.Info(
  56. "compacting",
  57. zap.Int("round", clus.rd),
  58. zap.Int("case", clus.cs),
  59. zap.Duration("timeout", timeout),
  60. )
  61. if err := clus.compact(revToCompact, timeout); err != nil {
  62. clus.lg.Warn(
  63. "compact failed",
  64. zap.Int("round", clus.rd),
  65. zap.Int("case", clus.cs),
  66. zap.Error(err),
  67. )
  68. if err = clus.cleanup(); err != nil {
  69. clus.lg.Warn(
  70. "cleanup failed",
  71. zap.Int("round", clus.rd),
  72. zap.Int("case", clus.cs),
  73. zap.Error(err),
  74. )
  75. return
  76. }
  77. // reset preModifiedKey after clean up
  78. preModifiedKey = 0
  79. }
  80. if round > 0 && round%500 == 0 { // every 500 rounds
  81. if err := clus.defrag(); err != nil {
  82. clus.lg.Warn(
  83. "defrag failed; returning",
  84. zap.Int("round", clus.rd),
  85. zap.Int("case", clus.cs),
  86. zap.Error(err),
  87. )
  88. clus.failed()
  89. return
  90. }
  91. }
  92. }
  93. clus.lg.Info(
  94. "functional-tester passed",
  95. zap.Int("round", clus.rd),
  96. zap.Int("case", clus.cs),
  97. )
  98. }
  99. func (clus *Cluster) doRound() error {
  100. if clus.Tester.FailureShuffle {
  101. clus.shuffleFailures()
  102. }
  103. clus.lg.Info(
  104. "starting round",
  105. zap.Int("round", clus.rd),
  106. zap.Strings("failures", clus.failureStrings()),
  107. )
  108. for i, fa := range clus.failures {
  109. clus.cs = i
  110. caseTotalCounter.WithLabelValues(fa.Desc()).Inc()
  111. clus.lg.Info("wait health before injecting failures")
  112. if err := clus.WaitHealth(); err != nil {
  113. return fmt.Errorf("wait full health error: %v", err)
  114. }
  115. if fa.FailureCase() == rpcpb.FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS {
  116. clus.lg.Info("pausing stresser after before injecting failures")
  117. clus.pauseStresser()
  118. }
  119. clus.lg.Info(
  120. "injecting failure",
  121. zap.Int("round", clus.rd),
  122. zap.Int("case", clus.cs),
  123. zap.String("desc", fa.Desc()),
  124. )
  125. if err := fa.Inject(clus); err != nil {
  126. return fmt.Errorf("injection error: %v", err)
  127. }
  128. clus.lg.Info(
  129. "injected failure",
  130. zap.Int("round", clus.rd),
  131. zap.Int("case", clus.cs),
  132. zap.String("desc", fa.Desc()),
  133. )
  134. // if run local, recovering server may conflict
  135. // with stressing client ports
  136. // TODO: use unix for local tests
  137. clus.lg.Info(
  138. "recovering failure",
  139. zap.Int("round", clus.rd),
  140. zap.Int("case", clus.cs),
  141. zap.String("desc", fa.Desc()),
  142. )
  143. if err := fa.Recover(clus); err != nil {
  144. return fmt.Errorf("recovery error: %v", err)
  145. }
  146. clus.lg.Info(
  147. "recovered failure",
  148. zap.Int("round", clus.rd),
  149. zap.Int("case", clus.cs),
  150. zap.String("desc", fa.Desc()),
  151. )
  152. if fa.FailureCase() != rpcpb.FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS {
  153. clus.lg.Info("pausing stresser after failure recovery, before wait health")
  154. clus.pauseStresser()
  155. }
  156. clus.lg.Info("wait health after recovering failures")
  157. if err := clus.WaitHealth(); err != nil {
  158. return fmt.Errorf("wait full health error: %v", err)
  159. }
  160. clus.lg.Info("check consistency after recovering failures")
  161. if err := clus.checkConsistency(); err != nil {
  162. return fmt.Errorf("tt.checkConsistency error (%v)", err)
  163. }
  164. clus.lg.Info(
  165. "failure case passed",
  166. zap.Int("round", clus.rd),
  167. zap.Int("case", clus.cs),
  168. zap.String("desc", fa.Desc()),
  169. )
  170. }
  171. clus.lg.Info(
  172. "finished round",
  173. zap.Int("round", clus.rd),
  174. zap.Strings("failures", clus.failureStrings()),
  175. )
  176. return nil
  177. }
  178. func (clus *Cluster) updateRevision() error {
  179. revs, _, err := clus.getRevisionHash()
  180. for _, rev := range revs {
  181. clus.currentRevision = rev
  182. break // just need get one of the current revisions
  183. }
  184. clus.lg.Info(
  185. "updated current revision",
  186. zap.Int64("current-revision", clus.currentRevision),
  187. )
  188. return err
  189. }
  190. func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) {
  191. clus.lg.Info("pausing stresser before compact")
  192. clus.pauseStresser()
  193. defer func() {
  194. if err == nil {
  195. err = clus.startStresser()
  196. }
  197. }()
  198. clus.lg.Info(
  199. "compacting storage",
  200. zap.Int64("current-revision", clus.currentRevision),
  201. zap.Int64("compact-revision", rev),
  202. )
  203. if err = clus.compactKV(rev, timeout); err != nil {
  204. return err
  205. }
  206. clus.lg.Info(
  207. "compacted storage",
  208. zap.Int64("current-revision", clus.currentRevision),
  209. zap.Int64("compact-revision", rev),
  210. )
  211. clus.lg.Info(
  212. "checking compaction",
  213. zap.Int64("current-revision", clus.currentRevision),
  214. zap.Int64("compact-revision", rev),
  215. )
  216. if err = clus.checkCompact(rev); err != nil {
  217. clus.lg.Warn(
  218. "checkCompact failed",
  219. zap.Int64("current-revision", clus.currentRevision),
  220. zap.Int64("compact-revision", rev),
  221. zap.Error(err),
  222. )
  223. return err
  224. }
  225. clus.lg.Info(
  226. "confirmed compaction",
  227. zap.Int64("current-revision", clus.currentRevision),
  228. zap.Int64("compact-revision", rev),
  229. )
  230. return nil
  231. }
  232. func (clus *Cluster) failed() {
  233. if !clus.Tester.ExitOnFailure {
  234. return
  235. }
  236. clus.lg.Info(
  237. "exiting on failure",
  238. zap.Int("round", clus.rd),
  239. zap.Int("case", clus.cs),
  240. )
  241. clus.DestroyEtcdAgents()
  242. os.Exit(2)
  243. }
  244. func (clus *Cluster) cleanup() error {
  245. defer clus.failed()
  246. roundFailedTotalCounter.Inc()
  247. desc := "compact/defrag"
  248. if clus.cs != -1 {
  249. desc = clus.failures[clus.cs].Desc()
  250. }
  251. caseFailedTotalCounter.WithLabelValues(desc).Inc()
  252. clus.closeStresser()
  253. if err := clus.FailArchive(); err != nil {
  254. clus.lg.Warn(
  255. "cleanup failed",
  256. zap.Int("round", clus.rd),
  257. zap.Int("case", clus.cs),
  258. zap.Error(err),
  259. )
  260. return err
  261. }
  262. if err := clus.Restart(); err != nil {
  263. clus.lg.Warn(
  264. "restart failed",
  265. zap.Int("round", clus.rd),
  266. zap.Int("case", clus.cs),
  267. zap.Error(err),
  268. )
  269. return err
  270. }
  271. clus.updateStresserChecker()
  272. return nil
  273. }