failure.go 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package main
  15. import (
  16. "fmt"
  17. "math/rand"
  18. "time"
  19. )
  // snapshotCount is the number of committed requests the long-kill failures
  // wait for before assuming that a new snapshot has been created.
  const snapshotCount = 10000

  // Settings used by the slow-network failures.
  const (
  	// slowNetworkLatency is the latency, in milliseconds, added to a member.
  	slowNetworkLatency = 500

  	// randomVariation is passed to SetLatency together with
  	// slowNetworkLatency.
  	// NOTE(review): presumably the latency jitter in milliseconds; confirm
  	// against the agent's SetLatency.
  	randomVariation = 50

  	// waitRecover is the extra pause taken after the latency is removed.
  	// The network layer needs time to propagate the traffic control
  	// (tc command) change; otherwise we get different hash values from the
  	// previous revision.
  	// For more detail, please see https://github.com/coreos/etcd/issues/5121.
  	waitRecover = 5 * time.Second
  )
  30. type failure interface {
  31. // Inject injeccts the failure into the testing cluster at the given
  32. // round. When calling the function, the cluster should be in health.
  33. Inject(c *cluster, round int) error
  34. // Recover recovers the injected failure caused by the injection of the
  35. // given round and wait for the recovery of the testing cluster.
  36. Recover(c *cluster, round int) error
  37. // Desc returns a description of the failure
  38. Desc() string
  39. }
  // description is a string carrying the Desc method. The failure types in
  // this file embed it, which gives each of them a Desc method.
  type description string

  // Desc returns the description as a plain string.
  func (d description) Desc() string {
  	text := string(d)
  	return text
  }
  42. type failureKillAll struct {
  43. description
  44. }
  45. func newFailureKillAll() *failureKillAll {
  46. return &failureKillAll{
  47. description: "kill all members",
  48. }
  49. }
  50. func (f *failureKillAll) Inject(c *cluster, round int) error {
  51. for _, a := range c.Agents {
  52. if err := a.Stop(); err != nil {
  53. return err
  54. }
  55. }
  56. return nil
  57. }
  58. func (f *failureKillAll) Recover(c *cluster, round int) error {
  59. for _, a := range c.Agents {
  60. if _, err := a.Restart(); err != nil {
  61. return err
  62. }
  63. }
  64. return c.WaitHealth()
  65. }
  66. type failureKillMajority struct {
  67. description
  68. }
  69. func newFailureKillMajority() *failureKillMajority {
  70. return &failureKillMajority{
  71. description: "kill majority of the cluster",
  72. }
  73. }
  74. func (f *failureKillMajority) Inject(c *cluster, round int) error {
  75. for i := range getToKillMap(c.Size, round) {
  76. if err := c.Agents[i].Stop(); err != nil {
  77. return err
  78. }
  79. }
  80. return nil
  81. }
  82. func (f *failureKillMajority) Recover(c *cluster, round int) error {
  83. for i := range getToKillMap(c.Size, round) {
  84. if _, err := c.Agents[i].Restart(); err != nil {
  85. return err
  86. }
  87. }
  88. return c.WaitHealth()
  89. }
  // getToKillMap selects the members to kill: it returns a set holding
  // size/2+1 distinct indexes in [0, size), keyed by index with every value
  // set to true.
  //
  // The indexes are drawn from a generator seeded with seed, so calling the
  // function again with the same size and seed returns the same set.
  //
  // A size of zero or less returns an empty map; without this guard
  // rand.Intn would panic.
  func getToKillMap(size int, seed int) map[int]bool {
  	if size <= 0 {
  		return map[int]bool{}
  	}

  	majority := size/2 + 1
  	m := make(map[int]bool, majority)
  	r := rand.New(rand.NewSource(int64(seed)))
  	// A repeated draw only overwrites an existing key, so keep drawing
  	// until enough distinct indexes have been collected.
  	for len(m) < majority {
  		m[r.Intn(size)] = true
  	}
  	return m
  }
  101. type failureKillOne struct {
  102. description
  103. }
  104. func newFailureKillOne() *failureKillOne {
  105. return &failureKillOne{
  106. description: "kill one random member",
  107. }
  108. }
  109. func (f *failureKillOne) Inject(c *cluster, round int) error {
  110. i := round % c.Size
  111. return c.Agents[i].Stop()
  112. }
  113. func (f *failureKillOne) Recover(c *cluster, round int) error {
  114. i := round % c.Size
  115. if _, err := c.Agents[i].Restart(); err != nil {
  116. return err
  117. }
  118. return c.WaitHealth()
  119. }
  120. type failureKillLeader struct {
  121. description
  122. idx int
  123. }
  124. func newFailureKillLeader() *failureKillLeader {
  125. return &failureKillLeader{
  126. description: "kill leader member",
  127. }
  128. }
  129. func (f *failureKillLeader) Inject(c *cluster, round int) error {
  130. idx, err := c.GetLeader()
  131. if err != nil {
  132. return err
  133. }
  134. f.idx = idx
  135. return c.Agents[idx].Stop()
  136. }
  137. func (f *failureKillLeader) Recover(c *cluster, round int) error {
  138. if _, err := c.Agents[f.idx].Restart(); err != nil {
  139. return err
  140. }
  141. return c.WaitHealth()
  142. }
  143. // failureKillOneForLongTime kills one member for long time, and restart
  144. // after a snapshot is required.
  145. type failureKillOneForLongTime struct {
  146. description
  147. }
  148. func newFailureKillOneForLongTime() *failureKillOneForLongTime {
  149. return &failureKillOneForLongTime{
  150. description: "kill one member for long time and expect it to recover from incoming snapshot",
  151. }
  152. }
  153. func (f *failureKillOneForLongTime) Inject(c *cluster, round int) error {
  154. i := round % c.Size
  155. if err := c.Agents[i].Stop(); err != nil {
  156. return err
  157. }
  158. if c.Size >= 3 {
  159. start, _ := c.Report()
  160. var end int
  161. // Normal healthy cluster could accept 1000req/s at least.
  162. // Give it 3-times time to create a new snapshot.
  163. retry := snapshotCount / 1000 * 3
  164. for j := 0; j < retry; j++ {
  165. end, _ = c.Report()
  166. // If the number of proposals committed is bigger than snapshot count,
  167. // a new snapshot should have been created.
  168. if end-start > snapshotCount {
  169. return nil
  170. }
  171. time.Sleep(time.Second)
  172. }
  173. return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry)
  174. }
  175. return nil
  176. }
  177. func (f *failureKillOneForLongTime) Recover(c *cluster, round int) error {
  178. i := round % c.Size
  179. if _, err := c.Agents[i].Restart(); err != nil {
  180. return err
  181. }
  182. return c.WaitHealth()
  183. }
  184. // failureKillLeaderForLongTime kills the leader for long time, and restart
  185. // after a snapshot is required.
  186. type failureKillLeaderForLongTime struct {
  187. description
  188. idx int
  189. }
  190. func newFailureKillLeaderForLongTime() *failureKillLeaderForLongTime {
  191. return &failureKillLeaderForLongTime{
  192. description: "kill the leader for long time and expect it to recover from incoming snapshot",
  193. }
  194. }
  195. func (f *failureKillLeaderForLongTime) Inject(c *cluster, round int) error {
  196. idx, err := c.GetLeader()
  197. if err != nil {
  198. return err
  199. }
  200. f.idx = idx
  201. if err := c.Agents[idx].Stop(); err != nil {
  202. return err
  203. }
  204. if c.Size >= 3 {
  205. start, _ := c.Report()
  206. var end int
  207. retry := snapshotCount / 1000 * 3
  208. for j := 0; j < retry; j++ {
  209. end, _ = c.Report()
  210. if end-start > snapshotCount {
  211. return nil
  212. }
  213. time.Sleep(time.Second)
  214. }
  215. return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry)
  216. }
  217. return nil
  218. }
  219. func (f *failureKillLeaderForLongTime) Recover(c *cluster, round int) error {
  220. if _, err := c.Agents[f.idx].Restart(); err != nil {
  221. return err
  222. }
  223. return c.WaitHealth()
  224. }
  225. type failureIsolate struct {
  226. description
  227. }
  228. func newFailureIsolate() *failureIsolate {
  229. return &failureIsolate{
  230. description: "isolate one member",
  231. }
  232. }
  233. func (f *failureIsolate) Inject(c *cluster, round int) error {
  234. i := round % c.Size
  235. return c.Agents[i].DropPort(peerURLPort)
  236. }
  237. func (f *failureIsolate) Recover(c *cluster, round int) error {
  238. i := round % c.Size
  239. if err := c.Agents[i].RecoverPort(peerURLPort); err != nil {
  240. return err
  241. }
  242. return c.WaitHealth()
  243. }
  244. type failureIsolateAll struct {
  245. description
  246. }
  247. func newFailureIsolateAll() *failureIsolateAll {
  248. return &failureIsolateAll{
  249. description: "isolate all members",
  250. }
  251. }
  252. func (f *failureIsolateAll) Inject(c *cluster, round int) error {
  253. for _, a := range c.Agents {
  254. if err := a.DropPort(peerURLPort); err != nil {
  255. return err
  256. }
  257. }
  258. return nil
  259. }
  260. func (f *failureIsolateAll) Recover(c *cluster, round int) error {
  261. for _, a := range c.Agents {
  262. if err := a.RecoverPort(peerURLPort); err != nil {
  263. return err
  264. }
  265. }
  266. return c.WaitHealth()
  267. }
  268. type failureSlowNetworkOneMember struct {
  269. description
  270. }
  271. func newFailureSlowNetworkOneMember() *failureSlowNetworkOneMember {
  272. desc := fmt.Sprintf("slow down one member's network by adding %d ms latency", slowNetworkLatency)
  273. return &failureSlowNetworkOneMember{
  274. description: description(desc),
  275. }
  276. }
  277. func (f *failureSlowNetworkOneMember) Inject(c *cluster, round int) error {
  278. i := round % c.Size
  279. if err := c.Agents[i].SetLatency(slowNetworkLatency, randomVariation); err != nil {
  280. c.Agents[i].RemoveLatency() // roll back
  281. return err
  282. }
  283. return nil
  284. }
  285. func (f *failureSlowNetworkOneMember) Recover(c *cluster, round int) error {
  286. i := round % c.Size
  287. if err := c.Agents[i].RemoveLatency(); err != nil {
  288. return err
  289. }
  290. time.Sleep(waitRecover)
  291. return c.WaitHealth()
  292. }
  293. type failureSlowNetworkLeader struct {
  294. description
  295. idx int
  296. }
  297. func newFailureSlowNetworkLeader() *failureSlowNetworkLeader {
  298. desc := fmt.Sprintf("slow down leader's network by adding %d ms latency", slowNetworkLatency)
  299. return &failureSlowNetworkLeader{
  300. description: description(desc),
  301. }
  302. }
  303. func (f *failureSlowNetworkLeader) Inject(c *cluster, round int) error {
  304. idx, err := c.GetLeader()
  305. if err != nil {
  306. return err
  307. }
  308. f.idx = idx
  309. if err := c.Agents[idx].SetLatency(slowNetworkLatency, randomVariation); err != nil {
  310. c.Agents[idx].RemoveLatency() // roll back
  311. return err
  312. }
  313. return nil
  314. }
  315. func (f *failureSlowNetworkLeader) Recover(c *cluster, round int) error {
  316. if err := c.Agents[f.idx].RemoveLatency(); err != nil {
  317. return err
  318. }
  319. time.Sleep(waitRecover)
  320. return c.WaitHealth()
  321. }
  322. type failureSlowNetworkAll struct {
  323. description
  324. }
  325. func newFailureSlowNetworkAll() *failureSlowNetworkAll {
  326. return &failureSlowNetworkAll{
  327. description: "slow down all members' network",
  328. }
  329. }
  330. func (f *failureSlowNetworkAll) Inject(c *cluster, round int) error {
  331. for i, a := range c.Agents {
  332. if err := a.SetLatency(slowNetworkLatency, randomVariation); err != nil {
  333. for j := 0; j < i; j++ { // roll back
  334. c.Agents[j].RemoveLatency()
  335. }
  336. return err
  337. }
  338. }
  339. return nil
  340. }
  341. func (f *failureSlowNetworkAll) Recover(c *cluster, round int) error {
  342. for _, a := range c.Agents {
  343. if err := a.RemoveLatency(); err != nil {
  344. return err
  345. }
  346. }
  347. time.Sleep(waitRecover)
  348. return c.WaitHealth()
  349. }