cluster_tester.go 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "fmt"
  17. "os"
  18. "time"
  19. "github.com/coreos/etcd/pkg/fileutil"
  20. "github.com/coreos/etcd/tools/functional-tester/rpcpb"
  21. "go.uber.org/zap"
  22. )
// compactQPS is a rough estimate of how many entries etcd can compact per
// second. Previous tests showed etcd can compact about 60,000 entries per
// second. StartTester divides the number of newly modified keys by this
// value to extend the compaction timeout.
const compactQPS = 50000
  26. // StartTester starts tester.
  27. func (clus *Cluster) StartTester() {
  28. if err := fileutil.TouchDirAll(clus.Tester.TesterDataDir); err != nil {
  29. clus.lg.Panic(
  30. "failed to create test data directory",
  31. zap.String("dir", clus.Tester.TesterDataDir),
  32. zap.Error(err),
  33. )
  34. }
  35. var preModifiedKey int64
  36. for round := 0; round < int(clus.Tester.RoundLimit) || clus.Tester.RoundLimit == -1; round++ {
  37. roundTotalCounter.Inc()
  38. clus.rd = round
  39. if err := clus.doRound(); err != nil {
  40. clus.lg.Warn(
  41. "round FAIL",
  42. zap.Int("round", clus.rd),
  43. zap.Int("case", clus.cs),
  44. zap.Error(err),
  45. )
  46. if clus.cleanup() != nil {
  47. return
  48. }
  49. // reset preModifiedKey after clean up
  50. preModifiedKey = 0
  51. continue
  52. }
  53. // -1 so that logPrefix doesn't print out 'case'
  54. clus.cs = -1
  55. revToCompact := max(0, clus.currentRevision-10000)
  56. currentModifiedKey := clus.stresser.ModifiedKeys()
  57. modifiedKey := currentModifiedKey - preModifiedKey
  58. preModifiedKey = currentModifiedKey
  59. timeout := 10 * time.Second
  60. timeout += time.Duration(modifiedKey/compactQPS) * time.Second
  61. clus.lg.Info(
  62. "compact START",
  63. zap.Int("round", clus.rd),
  64. zap.Int("case", clus.cs),
  65. zap.Duration("timeout", timeout),
  66. )
  67. if err := clus.compact(revToCompact, timeout); err != nil {
  68. clus.lg.Warn(
  69. "compact FAIL",
  70. zap.Int("round", clus.rd),
  71. zap.Int("case", clus.cs),
  72. zap.Error(err),
  73. )
  74. if err = clus.cleanup(); err != nil {
  75. clus.lg.Warn(
  76. "cleanup FAIL",
  77. zap.Int("round", clus.rd),
  78. zap.Int("case", clus.cs),
  79. zap.Error(err),
  80. )
  81. return
  82. }
  83. // reset preModifiedKey after clean up
  84. preModifiedKey = 0
  85. }
  86. if round > 0 && round%500 == 0 { // every 500 rounds
  87. if err := clus.defrag(); err != nil {
  88. clus.failed()
  89. return
  90. }
  91. }
  92. }
  93. clus.lg.Info(
  94. "functional-tester PASS",
  95. zap.Int("round", clus.rd),
  96. zap.Int("case", clus.cs),
  97. )
  98. }
  99. func (clus *Cluster) doRound() error {
  100. if clus.Tester.FailureShuffle {
  101. clus.shuffleFailures()
  102. }
  103. roundNow := time.Now()
  104. clus.lg.Info(
  105. "round START",
  106. zap.Int("round", clus.rd),
  107. zap.Strings("failures", clus.failureStrings()),
  108. zap.Int("total-failures", len(clus.failures)),
  109. )
  110. for i, fa := range clus.failures {
  111. clus.cs = i
  112. caseTotalCounter.WithLabelValues(fa.Desc()).Inc()
  113. caseNow := time.Now()
  114. clus.lg.Info(
  115. "case START",
  116. zap.Int("round", clus.rd),
  117. zap.Int("case", clus.cs),
  118. zap.String("desc", fa.Desc()),
  119. zap.Int("total-failures", len(clus.failures)),
  120. )
  121. clus.lg.Info("wait health before injecting failures")
  122. if err := clus.WaitHealth(); err != nil {
  123. return fmt.Errorf("wait full health error: %v", err)
  124. }
  125. stressStarted := false
  126. fcase := fa.FailureCase()
  127. if fcase != rpcpb.FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS {
  128. clus.lg.Info(
  129. "stresser START",
  130. zap.Int("round", clus.rd),
  131. zap.Int("case", clus.cs),
  132. zap.String("desc", fa.Desc()),
  133. )
  134. if err := clus.stresser.Stress(); err != nil {
  135. return fmt.Errorf("start stresser error: %v", err)
  136. }
  137. stressStarted = true
  138. }
  139. clus.lg.Info(
  140. "inject START",
  141. zap.Int("round", clus.rd),
  142. zap.Int("case", clus.cs),
  143. zap.String("desc", fa.Desc()),
  144. )
  145. if err := fa.Inject(clus); err != nil {
  146. return fmt.Errorf("injection error: %v", err)
  147. }
  148. // if run local, recovering server may conflict
  149. // with stressing client ports
  150. // TODO: use unix for local tests
  151. clus.lg.Info(
  152. "recover START",
  153. zap.Int("round", clus.rd),
  154. zap.Int("case", clus.cs),
  155. zap.String("desc", fa.Desc()),
  156. )
  157. if err := fa.Recover(clus); err != nil {
  158. return fmt.Errorf("recovery error: %v", err)
  159. }
  160. if stressStarted {
  161. clus.lg.Info("stresser PAUSE")
  162. ems := clus.stresser.Pause()
  163. if fcase == rpcpb.FailureCase_NO_FAIL_WITH_STRESS && len(ems) > 0 {
  164. ess := make([]string, 0, len(ems))
  165. cnt := 0
  166. for k, v := range ems {
  167. ess = append(ess, fmt.Sprintf("%s (count: %d)", k, v))
  168. cnt += v
  169. }
  170. clus.lg.Warn(
  171. "expected no errors",
  172. zap.String("desc", fa.Desc()),
  173. zap.Strings("errors", ess),
  174. )
  175. // with network delay, some ongoing requests may fail
  176. // only return error, if more than 10% of QPS requests fail
  177. if cnt > int(clus.Tester.StressQPS)/10 {
  178. return fmt.Errorf("expected no error in %q, got %q", fcase.String(), ess)
  179. }
  180. }
  181. }
  182. clus.lg.Info("health check START")
  183. if err := clus.WaitHealth(); err != nil {
  184. return fmt.Errorf("wait full health error: %v", err)
  185. }
  186. clus.lg.Info("consistency check START")
  187. if err := clus.checkConsistency(); err != nil {
  188. return fmt.Errorf("consistency check error (%v)", err)
  189. }
  190. clus.lg.Info(
  191. "case PASS",
  192. zap.Int("round", clus.rd),
  193. zap.Int("case", clus.cs),
  194. zap.String("desc", fa.Desc()),
  195. zap.Int("total-failures", len(clus.failures)),
  196. zap.Duration("took", time.Since(caseNow)),
  197. )
  198. }
  199. clus.lg.Info(
  200. "round ALL PASS",
  201. zap.Int("round", clus.rd),
  202. zap.Strings("failures", clus.failureStrings()),
  203. zap.Int("total-failures", len(clus.failures)),
  204. zap.Duration("took", time.Since(roundNow)),
  205. )
  206. return nil
  207. }
  208. func (clus *Cluster) updateRevision() error {
  209. revs, _, err := clus.getRevisionHash()
  210. for _, rev := range revs {
  211. clus.currentRevision = rev
  212. break // just need get one of the current revisions
  213. }
  214. clus.lg.Info(
  215. "updated current revision",
  216. zap.Int64("current-revision", clus.currentRevision),
  217. )
  218. return err
  219. }
  220. func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) {
  221. if err = clus.compactKV(rev, timeout); err != nil {
  222. clus.lg.Warn(
  223. "compact FAIL",
  224. zap.Int64("current-revision", clus.currentRevision),
  225. zap.Int64("compact-revision", rev),
  226. zap.Error(err),
  227. )
  228. return err
  229. }
  230. clus.lg.Info(
  231. "compact DONE",
  232. zap.Int64("current-revision", clus.currentRevision),
  233. zap.Int64("compact-revision", rev),
  234. )
  235. if err = clus.checkCompact(rev); err != nil {
  236. clus.lg.Warn(
  237. "check compact FAIL",
  238. zap.Int64("current-revision", clus.currentRevision),
  239. zap.Int64("compact-revision", rev),
  240. zap.Error(err),
  241. )
  242. return err
  243. }
  244. clus.lg.Info(
  245. "check compact DONE",
  246. zap.Int64("current-revision", clus.currentRevision),
  247. zap.Int64("compact-revision", rev),
  248. )
  249. return nil
  250. }
  251. func (clus *Cluster) failed() {
  252. if !clus.Tester.ExitOnFailure {
  253. return
  254. }
  255. clus.lg.Info(
  256. "functional-tester FAIL",
  257. zap.Int("round", clus.rd),
  258. zap.Int("case", clus.cs),
  259. )
  260. clus.DestroyEtcdAgents()
  261. os.Exit(2)
  262. }
  263. func (clus *Cluster) cleanup() error {
  264. defer clus.failed()
  265. roundFailedTotalCounter.Inc()
  266. desc := "compact/defrag"
  267. if clus.cs != -1 {
  268. desc = clus.failures[clus.cs].Desc()
  269. }
  270. caseFailedTotalCounter.WithLabelValues(desc).Inc()
  271. clus.lg.Info(
  272. "closing stressers before archiving failure data",
  273. zap.Int("round", clus.rd),
  274. zap.Int("case", clus.cs),
  275. )
  276. clus.stresser.Close()
  277. if err := clus.FailArchive(); err != nil {
  278. clus.lg.Warn(
  279. "cleanup FAIL",
  280. zap.Int("round", clus.rd),
  281. zap.Int("case", clus.cs),
  282. zap.Error(err),
  283. )
  284. return err
  285. }
  286. if err := clus.Restart(); err != nil {
  287. clus.lg.Warn(
  288. "restart FAIL",
  289. zap.Int("round", clus.rd),
  290. zap.Int("case", clus.cs),
  291. zap.Error(err),
  292. )
  293. return err
  294. }
  295. clus.updateStresserChecker()
  296. return nil
  297. }