failure.go 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package main
  15. import (
  16. "fmt"
  17. "math/rand"
  18. "time"
  19. )
  20. const (
  21. snapshotCount = 10000
  22. slowNetworkLatency = 1000 // 1-second
  23. randomVariation = 50
  24. )
  25. type failure interface {
  26. // Inject injeccts the failure into the testing cluster at the given
  27. // round. When calling the function, the cluster should be in health.
  28. Inject(c *cluster, round int) error
  29. // Recover recovers the injected failure caused by the injection of the
  30. // given round and wait for the recovery of the testing cluster.
  31. Recover(c *cluster, round int) error
  32. // return a description of the failure
  33. Desc() string
  34. }
  35. type description string
  36. func (d description) Desc() string { return string(d) }
  37. type failureKillAll struct {
  38. description
  39. }
  40. func newFailureKillAll() *failureKillAll {
  41. return &failureKillAll{
  42. description: "kill all members",
  43. }
  44. }
  45. func (f *failureKillAll) Inject(c *cluster, round int) error {
  46. for _, a := range c.Agents {
  47. if err := a.Stop(); err != nil {
  48. return err
  49. }
  50. }
  51. return nil
  52. }
  53. func (f *failureKillAll) Recover(c *cluster, round int) error {
  54. for _, a := range c.Agents {
  55. if _, err := a.Restart(); err != nil {
  56. return err
  57. }
  58. }
  59. return c.WaitHealth()
  60. }
  61. type failureKillMajority struct {
  62. description
  63. }
  64. func newFailureKillMajority() *failureKillMajority {
  65. return &failureKillMajority{
  66. description: "kill majority of the cluster",
  67. }
  68. }
  69. func (f *failureKillMajority) Inject(c *cluster, round int) error {
  70. for i := range getToKillMap(c.Size, round) {
  71. if err := c.Agents[i].Stop(); err != nil {
  72. return err
  73. }
  74. }
  75. return nil
  76. }
  77. func (f *failureKillMajority) Recover(c *cluster, round int) error {
  78. for i := range getToKillMap(c.Size, round) {
  79. if _, err := c.Agents[i].Restart(); err != nil {
  80. return err
  81. }
  82. }
  83. return c.WaitHealth()
  84. }
  85. func getToKillMap(size int, seed int) map[int]bool {
  86. m := make(map[int]bool)
  87. r := rand.New(rand.NewSource(int64(seed)))
  88. majority := size/2 + 1
  89. for {
  90. m[r.Intn(size)] = true
  91. if len(m) >= majority {
  92. return m
  93. }
  94. }
  95. }
  96. type failureKillOne struct {
  97. description
  98. }
  99. func newFailureKillOne() *failureKillOne {
  100. return &failureKillOne{
  101. description: "kill one random member",
  102. }
  103. }
  104. func (f *failureKillOne) Inject(c *cluster, round int) error {
  105. i := round % c.Size
  106. return c.Agents[i].Stop()
  107. }
  108. func (f *failureKillOne) Recover(c *cluster, round int) error {
  109. i := round % c.Size
  110. if _, err := c.Agents[i].Restart(); err != nil {
  111. return err
  112. }
  113. return c.WaitHealth()
  114. }
  115. type failureKillLeader struct {
  116. description
  117. idx int
  118. }
  119. func newFailureKillLeader() *failureKillLeader {
  120. return &failureKillLeader{
  121. description: "kill leader member",
  122. }
  123. }
  124. func (f *failureKillLeader) Inject(c *cluster, round int) error {
  125. idx, err := c.GetLeader()
  126. if err != nil {
  127. return err
  128. }
  129. f.idx = idx
  130. return c.Agents[idx].Stop()
  131. }
  132. func (f *failureKillLeader) Recover(c *cluster, round int) error {
  133. if _, err := c.Agents[f.idx].Restart(); err != nil {
  134. return err
  135. }
  136. return c.WaitHealth()
  137. }
  138. // failureKillOneForLongTime kills one member for long time, and restart
  139. // after a snapshot is required.
  140. type failureKillOneForLongTime struct {
  141. description
  142. }
  143. func newFailureKillOneForLongTime() *failureKillOneForLongTime {
  144. return &failureKillOneForLongTime{
  145. description: "kill one member for long time and expect it to recover from incoming snapshot",
  146. }
  147. }
  148. func (f *failureKillOneForLongTime) Inject(c *cluster, round int) error {
  149. i := round % c.Size
  150. if err := c.Agents[i].Stop(); err != nil {
  151. return err
  152. }
  153. if c.Size >= 3 {
  154. start, _ := c.Report()
  155. var end int
  156. // Normal healthy cluster could accept 1000req/s at least.
  157. // Give it 3-times time to create a new snapshot.
  158. retry := snapshotCount / 1000 * 3
  159. for j := 0; j < retry; j++ {
  160. end, _ = c.Report()
  161. // If the number of proposals committed is bigger than snapshot count,
  162. // a new snapshot should have been created.
  163. if end-start > snapshotCount {
  164. return nil
  165. }
  166. time.Sleep(time.Second)
  167. }
  168. return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry)
  169. }
  170. return nil
  171. }
  172. func (f *failureKillOneForLongTime) Recover(c *cluster, round int) error {
  173. i := round % c.Size
  174. if _, err := c.Agents[i].Restart(); err != nil {
  175. return err
  176. }
  177. return c.WaitHealth()
  178. }
  179. // failureKillLeaderForLongTime kills the leader for long time, and restart
  180. // after a snapshot is required.
  181. type failureKillLeaderForLongTime struct {
  182. description
  183. idx int
  184. }
  185. func newFailureKillLeaderForLongTime() *failureKillLeaderForLongTime {
  186. return &failureKillLeaderForLongTime{
  187. description: "kill the leader for long time and expect it to recover from incoming snapshot",
  188. }
  189. }
  190. func (f *failureKillLeaderForLongTime) Inject(c *cluster, round int) error {
  191. idx, err := c.GetLeader()
  192. if err != nil {
  193. return err
  194. }
  195. f.idx = idx
  196. if err := c.Agents[idx].Stop(); err != nil {
  197. return err
  198. }
  199. if c.Size >= 3 {
  200. start, _ := c.Report()
  201. var end int
  202. retry := snapshotCount / 1000 * 3
  203. for j := 0; j < retry; j++ {
  204. end, _ = c.Report()
  205. if end-start > snapshotCount {
  206. return nil
  207. }
  208. time.Sleep(time.Second)
  209. }
  210. return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry)
  211. }
  212. return nil
  213. }
  214. func (f *failureKillLeaderForLongTime) Recover(c *cluster, round int) error {
  215. if _, err := c.Agents[f.idx].Restart(); err != nil {
  216. return err
  217. }
  218. return c.WaitHealth()
  219. }
  220. type failureIsolate struct {
  221. description
  222. }
  223. func newFailureIsolate() *failureIsolate {
  224. return &failureIsolate{
  225. description: "isolate one member",
  226. }
  227. }
  228. func (f *failureIsolate) Inject(c *cluster, round int) error {
  229. i := round % c.Size
  230. return c.Agents[i].DropPort(peerURLPort)
  231. }
  232. func (f *failureIsolate) Recover(c *cluster, round int) error {
  233. i := round % c.Size
  234. if err := c.Agents[i].RecoverPort(peerURLPort); err != nil {
  235. return err
  236. }
  237. return c.WaitHealth()
  238. }
  239. type failureIsolateAll struct {
  240. description
  241. }
  242. func newFailureIsolateAll() *failureIsolateAll {
  243. return &failureIsolateAll{
  244. description: "isolate all members",
  245. }
  246. }
  247. func (f *failureIsolateAll) Inject(c *cluster, round int) error {
  248. for _, a := range c.Agents {
  249. if err := a.DropPort(peerURLPort); err != nil {
  250. return err
  251. }
  252. }
  253. return nil
  254. }
  255. func (f *failureIsolateAll) Recover(c *cluster, round int) error {
  256. for _, a := range c.Agents {
  257. if err := a.RecoverPort(peerURLPort); err != nil {
  258. return err
  259. }
  260. }
  261. return c.WaitHealth()
  262. }
  263. type failureSlowNetworkOneMember struct {
  264. description
  265. }
  266. func newFailureSlowNetworkOneMember() *failureSlowNetworkOneMember {
  267. desc := fmt.Sprintf("slow down one member's network by adding %d ms latency", slowNetworkLatency)
  268. return &failureSlowNetworkOneMember{
  269. description: description(desc),
  270. }
  271. }
  272. func (f *failureSlowNetworkOneMember) Inject(c *cluster, round int) error {
  273. i := round % c.Size
  274. if err := c.Agents[i].SetLatency(slowNetworkLatency, randomVariation); err != nil {
  275. c.Agents[i].RemoveLatency() // roll back
  276. return err
  277. }
  278. return nil
  279. }
  280. func (f *failureSlowNetworkOneMember) Recover(c *cluster, round int) error {
  281. i := round % c.Size
  282. if err := c.Agents[i].RemoveLatency(); err != nil {
  283. return err
  284. }
  285. return c.WaitHealth()
  286. }
  287. type failureSlowNetworkLeader struct {
  288. description
  289. idx int
  290. }
  291. func newFailureSlowNetworkLeader() *failureSlowNetworkLeader {
  292. desc := fmt.Sprintf("slow down leader's network by adding %d ms latency", slowNetworkLatency)
  293. return &failureSlowNetworkLeader{
  294. description: description(desc),
  295. }
  296. }
  297. func (f *failureSlowNetworkLeader) Inject(c *cluster, round int) error {
  298. idx, err := c.GetLeader()
  299. if err != nil {
  300. return err
  301. }
  302. f.idx = idx
  303. if err := c.Agents[idx].SetLatency(slowNetworkLatency, randomVariation); err != nil {
  304. c.Agents[idx].RemoveLatency() // roll back
  305. return err
  306. }
  307. return nil
  308. }
  309. func (f *failureSlowNetworkLeader) Recover(c *cluster, round int) error {
  310. if err := c.Agents[f.idx].RemoveLatency(); err != nil {
  311. return err
  312. }
  313. return c.WaitHealth()
  314. }
  315. type failureSlowNetworkAll struct {
  316. description
  317. }
  318. func newFailureSlowNetworkAll() *failureSlowNetworkAll {
  319. return &failureSlowNetworkAll{
  320. description: "slow down all members' network",
  321. }
  322. }
  323. func (f *failureSlowNetworkAll) Inject(c *cluster, round int) error {
  324. for i, a := range c.Agents {
  325. if err := a.SetLatency(slowNetworkLatency, randomVariation); err != nil {
  326. for j := 0; j < i; j++ { // roll back
  327. c.Agents[j].RemoveLatency()
  328. }
  329. return err
  330. }
  331. }
  332. return nil
  333. }
  334. func (f *failureSlowNetworkAll) Recover(c *cluster, round int) error {
  335. for _, a := range c.Agents {
  336. if err := a.RemoveLatency(); err != nil {
  337. return err
  338. }
  339. }
  340. return c.WaitHealth()
  341. }