cluster_run.go 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "fmt"
  17. "os"
  18. "time"
  19. "go.etcd.io/etcd/functional/rpcpb"
  20. "go.etcd.io/etcd/pkg/fileutil"
  21. "go.uber.org/zap"
  22. )
  23. // compactQPS is rough number of compact requests per second.
  24. // Previous tests showed etcd can compact about 60,000 entries per second.
  25. const compactQPS = 50000
  26. // Run starts tester.
  27. func (clus *Cluster) Run() {
  28. defer printReport()
  29. if err := fileutil.TouchDirAll(clus.Tester.DataDir); err != nil {
  30. clus.lg.Panic(
  31. "failed to create test data directory",
  32. zap.String("dir", clus.Tester.DataDir),
  33. zap.Error(err),
  34. )
  35. }
  36. var preModifiedKey int64
  37. for round := 0; round < int(clus.Tester.RoundLimit) || clus.Tester.RoundLimit == -1; round++ {
  38. roundTotalCounter.Inc()
  39. clus.rd = round
  40. if err := clus.doRound(); err != nil {
  41. clus.lg.Warn(
  42. "round FAIL",
  43. zap.Int("round", clus.rd),
  44. zap.Int("case", clus.cs),
  45. zap.Int("case-total", len(clus.cases)),
  46. zap.Error(err),
  47. )
  48. if clus.cleanup() != nil {
  49. return
  50. }
  51. // reset preModifiedKey after clean up
  52. preModifiedKey = 0
  53. continue
  54. }
  55. // -1 so that logPrefix doesn't print out 'case'
  56. clus.cs = -1
  57. revToCompact := max(0, clus.currentRevision-10000)
  58. currentModifiedKey := clus.stresser.ModifiedKeys()
  59. modifiedKey := currentModifiedKey - preModifiedKey
  60. preModifiedKey = currentModifiedKey
  61. timeout := 10 * time.Second
  62. timeout += time.Duration(modifiedKey/compactQPS) * time.Second
  63. clus.lg.Info(
  64. "compact START",
  65. zap.Int("round", clus.rd),
  66. zap.Int("case", clus.cs),
  67. zap.Int("case-total", len(clus.cases)),
  68. zap.Duration("timeout", timeout),
  69. )
  70. if err := clus.compact(revToCompact, timeout); err != nil {
  71. clus.lg.Warn(
  72. "compact FAIL",
  73. zap.Int("round", clus.rd),
  74. zap.Int("case", clus.cs),
  75. zap.Int("case-total", len(clus.cases)),
  76. zap.Error(err),
  77. )
  78. if err = clus.cleanup(); err != nil {
  79. clus.lg.Warn(
  80. "cleanup FAIL",
  81. zap.Int("round", clus.rd),
  82. zap.Int("case", clus.cs),
  83. zap.Int("case-total", len(clus.cases)),
  84. zap.Error(err),
  85. )
  86. return
  87. }
  88. // reset preModifiedKey after clean up
  89. preModifiedKey = 0
  90. }
  91. if round > 0 && round%500 == 0 { // every 500 rounds
  92. if err := clus.defrag(); err != nil {
  93. clus.failed()
  94. return
  95. }
  96. }
  97. }
  98. clus.lg.Info(
  99. "functional-tester PASS",
  100. zap.Int("round", clus.rd),
  101. zap.Int("case", clus.cs),
  102. zap.Int("case-total", len(clus.cases)),
  103. )
  104. }
  105. func (clus *Cluster) doRound() error {
  106. if clus.Tester.CaseShuffle {
  107. clus.shuffleCases()
  108. }
  109. roundNow := time.Now()
  110. clus.lg.Info(
  111. "round START",
  112. zap.Int("round", clus.rd),
  113. zap.Int("case", clus.cs),
  114. zap.Int("case-total", len(clus.cases)),
  115. zap.Strings("cases", clus.listCases()),
  116. )
  117. for i, fa := range clus.cases {
  118. clus.cs = i
  119. caseTotal[fa.Desc()]++
  120. caseTotalCounter.WithLabelValues(fa.Desc()).Inc()
  121. caseNow := time.Now()
  122. clus.lg.Info(
  123. "case START",
  124. zap.Int("round", clus.rd),
  125. zap.Int("case", clus.cs),
  126. zap.Int("case-total", len(clus.cases)),
  127. zap.String("desc", fa.Desc()),
  128. )
  129. clus.lg.Info("wait health before injecting failures")
  130. if err := clus.WaitHealth(); err != nil {
  131. return fmt.Errorf("wait full health error: %v", err)
  132. }
  133. stressStarted := false
  134. fcase := fa.TestCase()
  135. if fcase != rpcpb.Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS {
  136. clus.lg.Info(
  137. "stress START",
  138. zap.Int("round", clus.rd),
  139. zap.Int("case", clus.cs),
  140. zap.Int("case-total", len(clus.cases)),
  141. zap.String("desc", fa.Desc()),
  142. )
  143. if err := clus.stresser.Stress(); err != nil {
  144. return fmt.Errorf("start stresser error: %v", err)
  145. }
  146. stressStarted = true
  147. }
  148. clus.lg.Info(
  149. "inject START",
  150. zap.Int("round", clus.rd),
  151. zap.Int("case", clus.cs),
  152. zap.Int("case-total", len(clus.cases)),
  153. zap.String("desc", fa.Desc()),
  154. )
  155. if err := fa.Inject(clus); err != nil {
  156. return fmt.Errorf("injection error: %v", err)
  157. }
  158. // if run local, recovering server may conflict
  159. // with stressing client ports
  160. // TODO: use unix for local tests
  161. clus.lg.Info(
  162. "recover START",
  163. zap.Int("round", clus.rd),
  164. zap.Int("case", clus.cs),
  165. zap.Int("case-total", len(clus.cases)),
  166. zap.String("desc", fa.Desc()),
  167. )
  168. if err := fa.Recover(clus); err != nil {
  169. return fmt.Errorf("recovery error: %v", err)
  170. }
  171. if stressStarted {
  172. clus.lg.Info(
  173. "stress PAUSE",
  174. zap.Int("round", clus.rd),
  175. zap.Int("case", clus.cs),
  176. zap.Int("case-total", len(clus.cases)),
  177. zap.String("desc", fa.Desc()),
  178. )
  179. ems := clus.stresser.Pause()
  180. if fcase == rpcpb.Case_NO_FAIL_WITH_STRESS && len(ems) > 0 {
  181. ess := make([]string, 0, len(ems))
  182. cnt := 0
  183. for k, v := range ems {
  184. ess = append(ess, fmt.Sprintf("%s (count: %d)", k, v))
  185. cnt += v
  186. }
  187. clus.lg.Warn(
  188. "expected no errors",
  189. zap.String("desc", fa.Desc()),
  190. zap.Strings("errors", ess),
  191. )
  192. // with network delay, some ongoing requests may fail
  193. // only return error, if more than 30% of QPS requests fail
  194. if cnt > int(float64(clus.Tester.StressQPS)*0.3) {
  195. return fmt.Errorf("expected no error in %q, got %q", fcase.String(), ess)
  196. }
  197. }
  198. }
  199. clus.lg.Info(
  200. "health check START",
  201. zap.Int("round", clus.rd),
  202. zap.Int("case", clus.cs),
  203. zap.Int("case-total", len(clus.cases)),
  204. zap.String("desc", fa.Desc()),
  205. )
  206. if err := clus.WaitHealth(); err != nil {
  207. return fmt.Errorf("wait full health error: %v", err)
  208. }
  209. checkerFailExceptions := []rpcpb.Checker{}
  210. switch fcase {
  211. case rpcpb.Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH:
  212. // TODO: restore from snapshot
  213. checkerFailExceptions = append(checkerFailExceptions, rpcpb.Checker_LEASE_EXPIRE)
  214. }
  215. clus.lg.Info(
  216. "consistency check START",
  217. zap.Int("round", clus.rd),
  218. zap.Int("case", clus.cs),
  219. zap.Int("case-total", len(clus.cases)),
  220. zap.String("desc", fa.Desc()),
  221. )
  222. if err := clus.runCheckers(checkerFailExceptions...); err != nil {
  223. return fmt.Errorf("consistency check error (%v)", err)
  224. }
  225. clus.lg.Info(
  226. "consistency check PASS",
  227. zap.Int("round", clus.rd),
  228. zap.Int("case", clus.cs),
  229. zap.Int("case-total", len(clus.cases)),
  230. zap.String("desc", fa.Desc()),
  231. zap.Duration("took", time.Since(caseNow)),
  232. )
  233. }
  234. clus.lg.Info(
  235. "round ALL PASS",
  236. zap.Int("round", clus.rd),
  237. zap.Strings("cases", clus.listCases()),
  238. zap.Int("case-total", len(clus.cases)),
  239. zap.Duration("took", time.Since(roundNow)),
  240. )
  241. return nil
  242. }
  243. func (clus *Cluster) updateRevision() error {
  244. revs, _, err := clus.getRevisionHash()
  245. for _, rev := range revs {
  246. clus.currentRevision = rev
  247. break // just need get one of the current revisions
  248. }
  249. clus.lg.Info(
  250. "updated current revision",
  251. zap.Int64("current-revision", clus.currentRevision),
  252. )
  253. return err
  254. }
  255. func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) {
  256. if err = clus.compactKV(rev, timeout); err != nil {
  257. clus.lg.Warn(
  258. "compact FAIL",
  259. zap.Int64("current-revision", clus.currentRevision),
  260. zap.Int64("compact-revision", rev),
  261. zap.Error(err),
  262. )
  263. return err
  264. }
  265. clus.lg.Info(
  266. "compact DONE",
  267. zap.Int64("current-revision", clus.currentRevision),
  268. zap.Int64("compact-revision", rev),
  269. )
  270. if err = clus.checkCompact(rev); err != nil {
  271. clus.lg.Warn(
  272. "check compact FAIL",
  273. zap.Int64("current-revision", clus.currentRevision),
  274. zap.Int64("compact-revision", rev),
  275. zap.Error(err),
  276. )
  277. return err
  278. }
  279. clus.lg.Info(
  280. "check compact DONE",
  281. zap.Int64("current-revision", clus.currentRevision),
  282. zap.Int64("compact-revision", rev),
  283. )
  284. return nil
  285. }
  286. func (clus *Cluster) failed() {
  287. clus.lg.Info(
  288. "functional-tester FAIL",
  289. zap.Int("round", clus.rd),
  290. zap.Int("case", clus.cs),
  291. zap.Int("case-total", len(clus.cases)),
  292. )
  293. clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()
  294. os.Exit(2)
  295. }
  296. func (clus *Cluster) cleanup() error {
  297. if clus.Tester.ExitOnCaseFail {
  298. defer clus.failed()
  299. }
  300. roundFailedTotalCounter.Inc()
  301. desc := "compact/defrag"
  302. if clus.cs != -1 {
  303. desc = clus.cases[clus.cs].Desc()
  304. }
  305. caseFailedTotalCounter.WithLabelValues(desc).Inc()
  306. clus.lg.Info(
  307. "closing stressers before archiving failure data",
  308. zap.Int("round", clus.rd),
  309. zap.Int("case", clus.cs),
  310. zap.Int("case-total", len(clus.cases)),
  311. )
  312. clus.stresser.Close()
  313. if err := clus.send_SIGQUIT_ETCD_AND_ARCHIVE_DATA(); err != nil {
  314. clus.lg.Warn(
  315. "cleanup FAIL",
  316. zap.Int("round", clus.rd),
  317. zap.Int("case", clus.cs),
  318. zap.Int("case-total", len(clus.cases)),
  319. zap.Error(err),
  320. )
  321. return err
  322. }
  323. if err := clus.send_RESTART_ETCD(); err != nil {
  324. clus.lg.Warn(
  325. "restart FAIL",
  326. zap.Int("round", clus.rd),
  327. zap.Int("case", clus.cs),
  328. zap.Int("case-total", len(clus.cases)),
  329. zap.Error(err),
  330. )
  331. return err
  332. }
  333. clus.setStresserChecker()
  334. return nil
  335. }