tester.go 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "fmt"
  17. "os"
  18. "time"
  19. "go.uber.org/zap"
  20. )
  // compactQPS is a rough estimate of how many compact requests etcd handles
  // per second. Earlier test runs measured about 60,000 compacted entries per
  // second; the value used here is set below that figure.
  const compactQPS = 50_000
  24. // StartTester starts tester.
  25. func (clus *Cluster) StartTester() {
  26. // TODO: upate status
  27. clus.startStresser()
  28. var preModifiedKey int64
  29. for round := 0; round < int(clus.Tester.RoundLimit) || clus.Tester.RoundLimit == -1; round++ {
  30. roundTotalCounter.Inc()
  31. clus.rd = round
  32. if err := clus.doRound(round); err != nil {
  33. clus.logger.Warn(
  34. "doRound failed; returning",
  35. zap.Int("round", clus.rd),
  36. zap.Int("case", clus.cs),
  37. zap.Error(err),
  38. )
  39. if clus.cleanup() != nil {
  40. return
  41. }
  42. // reset preModifiedKey after clean up
  43. preModifiedKey = 0
  44. continue
  45. }
  46. // -1 so that logPrefix doesn't print out 'case'
  47. clus.cs = -1
  48. revToCompact := max(0, clus.currentRevision-10000)
  49. currentModifiedKey := clus.stresser.ModifiedKeys()
  50. modifiedKey := currentModifiedKey - preModifiedKey
  51. preModifiedKey = currentModifiedKey
  52. timeout := 10 * time.Second
  53. timeout += time.Duration(modifiedKey/compactQPS) * time.Second
  54. clus.logger.Info(
  55. "compacting",
  56. zap.Int("round", clus.rd),
  57. zap.Int("case", clus.cs),
  58. zap.Duration("timeout", timeout),
  59. )
  60. if err := clus.compact(revToCompact, timeout); err != nil {
  61. clus.logger.Warn(
  62. "compact failed",
  63. zap.Int("round", clus.rd),
  64. zap.Int("case", clus.cs),
  65. zap.Error(err),
  66. )
  67. if err = clus.cleanup(); err != nil {
  68. clus.logger.Warn(
  69. "cleanup failed",
  70. zap.Int("round", clus.rd),
  71. zap.Int("case", clus.cs),
  72. zap.Error(err),
  73. )
  74. return
  75. }
  76. // reset preModifiedKey after clean up
  77. preModifiedKey = 0
  78. }
  79. if round > 0 && round%500 == 0 { // every 500 rounds
  80. if err := clus.defrag(); err != nil {
  81. clus.logger.Warn(
  82. "defrag failed; returning",
  83. zap.Int("round", clus.rd),
  84. zap.Int("case", clus.cs),
  85. zap.Error(err),
  86. )
  87. clus.failed()
  88. return
  89. }
  90. }
  91. }
  92. clus.logger.Info(
  93. "functional-tester is finished",
  94. zap.Int("round", clus.rd),
  95. zap.Int("case", clus.cs),
  96. )
  97. }
  98. func (clus *Cluster) doRound(round int) error {
  99. for i, f := range clus.failures {
  100. clus.cs = i
  101. caseTotalCounter.WithLabelValues(f.Desc()).Inc()
  102. if err := clus.WaitHealth(); err != nil {
  103. return fmt.Errorf("wait full health error: %v", err)
  104. }
  105. clus.logger.Info(
  106. "injecting failure",
  107. zap.Int("round", clus.rd),
  108. zap.Int("case", clus.cs),
  109. zap.String("desc", f.Desc()),
  110. )
  111. if err := f.Inject(clus, round); err != nil {
  112. return fmt.Errorf("injection error: %v", err)
  113. }
  114. clus.logger.Info(
  115. "injected failure",
  116. zap.Int("round", clus.rd),
  117. zap.Int("case", clus.cs),
  118. zap.String("desc", f.Desc()),
  119. )
  120. clus.logger.Info(
  121. "recovering failure",
  122. zap.Int("round", clus.rd),
  123. zap.Int("case", clus.cs),
  124. zap.String("desc", f.Desc()),
  125. )
  126. if err := f.Recover(clus, round); err != nil {
  127. return fmt.Errorf("recovery error: %v", err)
  128. }
  129. clus.logger.Info(
  130. "recovered failure",
  131. zap.Int("round", clus.rd),
  132. zap.Int("case", clus.cs),
  133. zap.String("desc", f.Desc()),
  134. )
  135. clus.pauseStresser()
  136. if err := clus.WaitHealth(); err != nil {
  137. return fmt.Errorf("wait full health error: %v", err)
  138. }
  139. if err := clus.checkConsistency(); err != nil {
  140. return fmt.Errorf("tt.checkConsistency error (%v)", err)
  141. }
  142. clus.logger.Info(
  143. "success",
  144. zap.Int("round", clus.rd),
  145. zap.Int("case", clus.cs),
  146. zap.String("desc", f.Desc()),
  147. )
  148. }
  149. return nil
  150. }
  151. func (clus *Cluster) updateRevision() error {
  152. revs, _, err := clus.getRevisionHash()
  153. for _, rev := range revs {
  154. clus.currentRevision = rev
  155. break // just need get one of the current revisions
  156. }
  157. clus.logger.Info(
  158. "updated current revision",
  159. zap.Int64("current-revision", clus.currentRevision),
  160. )
  161. return err
  162. }
  163. func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) {
  164. clus.pauseStresser()
  165. defer func() {
  166. if err == nil {
  167. err = clus.startStresser()
  168. }
  169. }()
  170. clus.logger.Info(
  171. "compacting storage",
  172. zap.Int64("current-revision", clus.currentRevision),
  173. zap.Int64("compact-revision", rev),
  174. )
  175. if err = clus.compactKV(rev, timeout); err != nil {
  176. return err
  177. }
  178. clus.logger.Info(
  179. "compacted storage",
  180. zap.Int64("current-revision", clus.currentRevision),
  181. zap.Int64("compact-revision", rev),
  182. )
  183. clus.logger.Info(
  184. "checking compaction",
  185. zap.Int64("current-revision", clus.currentRevision),
  186. zap.Int64("compact-revision", rev),
  187. )
  188. if err = clus.checkCompact(rev); err != nil {
  189. clus.logger.Warn(
  190. "checkCompact failed",
  191. zap.Int64("current-revision", clus.currentRevision),
  192. zap.Int64("compact-revision", rev),
  193. zap.Error(err),
  194. )
  195. return err
  196. }
  197. clus.logger.Info(
  198. "confirmed compaction",
  199. zap.Int64("current-revision", clus.currentRevision),
  200. zap.Int64("compact-revision", rev),
  201. )
  202. return nil
  203. }
  204. func (clus *Cluster) failed() {
  205. if !clus.Tester.ExitOnFailure {
  206. return
  207. }
  208. clus.logger.Info(
  209. "exiting on failure",
  210. zap.Int("round", clus.rd),
  211. zap.Int("case", clus.cs),
  212. )
  213. clus.DestroyEtcdAgents()
  214. os.Exit(2)
  215. }
  216. func (clus *Cluster) cleanup() error {
  217. defer clus.failed()
  218. roundFailedTotalCounter.Inc()
  219. desc := "compact/defrag"
  220. if clus.cs != -1 {
  221. desc = clus.failures[clus.cs].Desc()
  222. }
  223. caseFailedTotalCounter.WithLabelValues(desc).Inc()
  224. clus.closeStresser()
  225. if err := clus.FailArchive(); err != nil {
  226. clus.logger.Warn(
  227. "Cleanup failed",
  228. zap.Int("round", clus.rd),
  229. zap.Int("case", clus.cs),
  230. zap.Error(err),
  231. )
  232. return err
  233. }
  234. if err := clus.Restart(); err != nil {
  235. clus.logger.Warn(
  236. "Restart failed",
  237. zap.Int("round", clus.rd),
  238. zap.Int("case", clus.cs),
  239. zap.Error(err),
  240. )
  241. return err
  242. }
  243. clus.updateStresserChecker()
  244. return nil
  245. }