cluster_tester.go 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "fmt"
  17. "os"
  18. "time"
  19. "github.com/coreos/etcd/tools/functional-tester/rpcpb"
  20. "go.uber.org/zap"
  21. )
  22. // compactQPS is rough number of compact requests per second.
  23. // Previous tests showed etcd can compact about 60,000 entries per second.
  24. const compactQPS = 50000
  25. // StartTester starts tester.
  26. func (clus *Cluster) StartTester() {
  27. // TODO: upate status
  28. var preModifiedKey int64
  29. for round := 0; round < int(clus.Tester.RoundLimit) || clus.Tester.RoundLimit == -1; round++ {
  30. roundTotalCounter.Inc()
  31. clus.rd = round
  32. if err := clus.doRound(); err != nil {
  33. clus.lg.Warn(
  34. "round FAIL",
  35. zap.Int("round", clus.rd),
  36. zap.Int("case", clus.cs),
  37. zap.Error(err),
  38. )
  39. if clus.cleanup() != nil {
  40. return
  41. }
  42. // reset preModifiedKey after clean up
  43. preModifiedKey = 0
  44. continue
  45. }
  46. // -1 so that logPrefix doesn't print out 'case'
  47. clus.cs = -1
  48. revToCompact := max(0, clus.currentRevision-10000)
  49. currentModifiedKey := clus.stresser.ModifiedKeys()
  50. modifiedKey := currentModifiedKey - preModifiedKey
  51. preModifiedKey = currentModifiedKey
  52. timeout := 10 * time.Second
  53. timeout += time.Duration(modifiedKey/compactQPS) * time.Second
  54. clus.lg.Info(
  55. "compact START",
  56. zap.Int("round", clus.rd),
  57. zap.Int("case", clus.cs),
  58. zap.Duration("timeout", timeout),
  59. )
  60. if err := clus.compact(revToCompact, timeout); err != nil {
  61. clus.lg.Warn(
  62. "compact FAIL",
  63. zap.Int("round", clus.rd),
  64. zap.Int("case", clus.cs),
  65. zap.Error(err),
  66. )
  67. if err = clus.cleanup(); err != nil {
  68. clus.lg.Warn(
  69. "cleanup FAIL",
  70. zap.Int("round", clus.rd),
  71. zap.Int("case", clus.cs),
  72. zap.Error(err),
  73. )
  74. return
  75. }
  76. // reset preModifiedKey after clean up
  77. preModifiedKey = 0
  78. }
  79. if round > 0 && round%500 == 0 { // every 500 rounds
  80. if err := clus.defrag(); err != nil {
  81. clus.failed()
  82. return
  83. }
  84. }
  85. }
  86. clus.lg.Info(
  87. "functional-tester PASS",
  88. zap.Int("round", clus.rd),
  89. zap.Int("case", clus.cs),
  90. )
  91. }
  92. func (clus *Cluster) doRound() error {
  93. if clus.Tester.FailureShuffle {
  94. clus.shuffleFailures()
  95. }
  96. roundNow := time.Now()
  97. clus.lg.Info(
  98. "round START",
  99. zap.Int("round", clus.rd),
  100. zap.Strings("failures", clus.failureStrings()),
  101. )
  102. for i, fa := range clus.failures {
  103. clus.cs = i
  104. caseTotalCounter.WithLabelValues(fa.Desc()).Inc()
  105. caseNow := time.Now()
  106. clus.lg.Info(
  107. "case START",
  108. zap.Int("round", clus.rd),
  109. zap.Int("case", clus.cs),
  110. zap.String("desc", fa.Desc()),
  111. )
  112. clus.lg.Info("wait health before injecting failures")
  113. if err := clus.WaitHealth(); err != nil {
  114. return fmt.Errorf("wait full health error: %v", err)
  115. }
  116. stressStarted := false
  117. fcase := fa.FailureCase()
  118. if fcase != rpcpb.FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS {
  119. clus.lg.Info(
  120. "stresser START",
  121. zap.Int("round", clus.rd),
  122. zap.Int("case", clus.cs),
  123. zap.String("desc", fa.Desc()),
  124. )
  125. if err := clus.stresser.Stress(); err != nil {
  126. return fmt.Errorf("start stresser error: %v", err)
  127. }
  128. stressStarted = true
  129. }
  130. clus.lg.Info(
  131. "inject START",
  132. zap.Int("round", clus.rd),
  133. zap.Int("case", clus.cs),
  134. zap.String("desc", fa.Desc()),
  135. )
  136. if err := fa.Inject(clus); err != nil {
  137. return fmt.Errorf("injection error: %v", err)
  138. }
  139. // if run local, recovering server may conflict
  140. // with stressing client ports
  141. // TODO: use unix for local tests
  142. clus.lg.Info(
  143. "recover START",
  144. zap.Int("round", clus.rd),
  145. zap.Int("case", clus.cs),
  146. zap.String("desc", fa.Desc()),
  147. )
  148. if err := fa.Recover(clus); err != nil {
  149. return fmt.Errorf("recovery error: %v", err)
  150. }
  151. if stressStarted {
  152. clus.lg.Info("stresser PAUSE")
  153. ems := clus.stresser.Pause()
  154. if fcase == rpcpb.FailureCase_NO_FAIL_WITH_STRESS && len(ems) > 0 {
  155. ess := make([]string, 0, len(ems))
  156. cnt := 0
  157. for k, v := range ems {
  158. ess = append(ess, fmt.Sprintf("%s (count: %d)", k, v))
  159. cnt += v
  160. }
  161. clus.lg.Warn(
  162. "expected no errors",
  163. zap.String("desc", fa.Desc()),
  164. zap.Strings("errors", ess),
  165. )
  166. // with network delay, some ongoing requests may fail
  167. // only return error, if more than 10% of QPS requests fail
  168. if cnt > int(clus.Tester.StressQPS)/10 {
  169. return fmt.Errorf("expected no error in %q, got %q", fcase.String(), ess)
  170. }
  171. }
  172. }
  173. clus.lg.Info("health check START")
  174. if err := clus.WaitHealth(); err != nil {
  175. return fmt.Errorf("wait full health error: %v", err)
  176. }
  177. clus.lg.Info("consistency check START")
  178. if err := clus.checkConsistency(); err != nil {
  179. return fmt.Errorf("consistency check error (%v)", err)
  180. }
  181. clus.lg.Info(
  182. "case PASS",
  183. zap.Int("round", clus.rd),
  184. zap.Int("case", clus.cs),
  185. zap.String("desc", fa.Desc()),
  186. zap.Duration("took", time.Since(caseNow)),
  187. )
  188. }
  189. clus.lg.Info(
  190. "round ALL PASS",
  191. zap.Int("round", clus.rd),
  192. zap.Strings("failures", clus.failureStrings()),
  193. zap.Duration("took", time.Since(roundNow)),
  194. )
  195. return nil
  196. }
  197. func (clus *Cluster) updateRevision() error {
  198. revs, _, err := clus.getRevisionHash()
  199. for _, rev := range revs {
  200. clus.currentRevision = rev
  201. break // just need get one of the current revisions
  202. }
  203. clus.lg.Info(
  204. "updated current revision",
  205. zap.Int64("current-revision", clus.currentRevision),
  206. )
  207. return err
  208. }
  209. func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) {
  210. if err = clus.compactKV(rev, timeout); err != nil {
  211. clus.lg.Warn(
  212. "compact FAIL",
  213. zap.Int64("current-revision", clus.currentRevision),
  214. zap.Int64("compact-revision", rev),
  215. zap.Error(err),
  216. )
  217. return err
  218. }
  219. clus.lg.Info(
  220. "compact DONE",
  221. zap.Int64("current-revision", clus.currentRevision),
  222. zap.Int64("compact-revision", rev),
  223. )
  224. if err = clus.checkCompact(rev); err != nil {
  225. clus.lg.Warn(
  226. "check compact FAIL",
  227. zap.Int64("current-revision", clus.currentRevision),
  228. zap.Int64("compact-revision", rev),
  229. zap.Error(err),
  230. )
  231. return err
  232. }
  233. clus.lg.Info(
  234. "check compact DONE",
  235. zap.Int64("current-revision", clus.currentRevision),
  236. zap.Int64("compact-revision", rev),
  237. )
  238. return nil
  239. }
  240. func (clus *Cluster) failed() {
  241. if !clus.Tester.ExitOnFailure {
  242. return
  243. }
  244. clus.lg.Info(
  245. "functional-tester FAIL",
  246. zap.Int("round", clus.rd),
  247. zap.Int("case", clus.cs),
  248. )
  249. clus.DestroyEtcdAgents()
  250. os.Exit(2)
  251. }
  252. func (clus *Cluster) cleanup() error {
  253. defer clus.failed()
  254. roundFailedTotalCounter.Inc()
  255. desc := "compact/defrag"
  256. if clus.cs != -1 {
  257. desc = clus.failures[clus.cs].Desc()
  258. }
  259. caseFailedTotalCounter.WithLabelValues(desc).Inc()
  260. clus.lg.Info(
  261. "closing stressers before archiving failure data",
  262. zap.Int("round", clus.rd),
  263. zap.Int("case", clus.cs),
  264. )
  265. clus.stresser.Close()
  266. if err := clus.FailArchive(); err != nil {
  267. clus.lg.Warn(
  268. "cleanup FAIL",
  269. zap.Int("round", clus.rd),
  270. zap.Int("case", clus.cs),
  271. zap.Error(err),
  272. )
  273. return err
  274. }
  275. if err := clus.Restart(); err != nil {
  276. clus.lg.Warn(
  277. "restart FAIL",
  278. zap.Int("round", clus.rd),
  279. zap.Int("case", clus.cs),
  280. zap.Error(err),
  281. )
  282. return err
  283. }
  284. clus.updateStresserChecker()
  285. return nil
  286. }