failure.go 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
  14. package main
  15. import (
  16. "fmt"
  17. "math/rand"
  18. "time"
  19. )
// snapshotCount is the number of committed requests after which the
// "for long time" kill failures expect the cluster to have created a new
// snapshot.
// NOTE(review): presumably this has to agree with the snapshot count the
// tested members are started with — confirm against the cluster setup.
const snapshotCount = 10000
// failure is a fault that can be injected into the testing cluster and
// recovered from afterwards.
type failure interface {
	// Inject injects the failure into the testing cluster at the given
	// round. The cluster should be healthy when this function is called.
	Inject(c *cluster, round int) error
	// Recover recovers from the failure caused by the injection of the
	// given round and waits for the testing cluster to recover.
	Recover(c *cluster, round int) error
	// Desc returns a description of the failure.
	Desc() string
}
  31. type description string
  32. func (d description) Desc() string { return string(d) }
  33. type failureKillAll struct {
  34. description
  35. }
  36. func newFailureKillAll() *failureKillAll {
  37. return &failureKillAll{
  38. description: "kill all members",
  39. }
  40. }
  41. func (f *failureKillAll) Inject(c *cluster, round int) error {
  42. for _, a := range c.Agents {
  43. if err := a.Stop(); err != nil {
  44. return err
  45. }
  46. }
  47. return nil
  48. }
  49. func (f *failureKillAll) Recover(c *cluster, round int) error {
  50. for _, a := range c.Agents {
  51. if _, err := a.Restart(); err != nil {
  52. return err
  53. }
  54. }
  55. return c.WaitHealth()
  56. }
  57. type failureKillMajority struct {
  58. description
  59. }
  60. func newFailureKillMajority() *failureKillMajority {
  61. return &failureKillMajority{
  62. description: "kill majority of the cluster",
  63. }
  64. }
  65. func (f *failureKillMajority) Inject(c *cluster, round int) error {
  66. for i := range getToKillMap(c.Size, round) {
  67. if err := c.Agents[i].Stop(); err != nil {
  68. return err
  69. }
  70. }
  71. return nil
  72. }
  73. func (f *failureKillMajority) Recover(c *cluster, round int) error {
  74. for i := range getToKillMap(c.Size, round) {
  75. if _, err := c.Agents[i].Restart(); err != nil {
  76. return err
  77. }
  78. }
  79. return c.WaitHealth()
  80. }
  81. func getToKillMap(size int, seed int) map[int]bool {
  82. m := make(map[int]bool)
  83. r := rand.New(rand.NewSource(int64(seed)))
  84. majority := size/2 + 1
  85. for {
  86. m[r.Intn(size)] = true
  87. if len(m) >= majority {
  88. return m
  89. }
  90. }
  91. }
  92. type failureKillOne struct {
  93. description
  94. }
  95. func newFailureKillOne() *failureKillOne {
  96. return &failureKillOne{
  97. description: "kill one random member",
  98. }
  99. }
  100. func (f *failureKillOne) Inject(c *cluster, round int) error {
  101. i := round % c.Size
  102. return c.Agents[i].Stop()
  103. }
  104. func (f *failureKillOne) Recover(c *cluster, round int) error {
  105. i := round % c.Size
  106. if _, err := c.Agents[i].Restart(); err != nil {
  107. return err
  108. }
  109. return c.WaitHealth()
  110. }
  111. type failureKillLeader struct {
  112. description
  113. idx int
  114. }
  115. func newFailureKillLeader() *failureKillLeader {
  116. return &failureKillLeader{
  117. description: "kill leader member",
  118. }
  119. }
  120. func (f *failureKillLeader) Inject(c *cluster, round int) error {
  121. idx, err := c.GetLeader()
  122. if err != nil {
  123. return err
  124. }
  125. f.idx = idx
  126. return c.Agents[idx].Stop()
  127. }
  128. func (f *failureKillLeader) Recover(c *cluster, round int) error {
  129. if _, err := c.Agents[f.idx].Restart(); err != nil {
  130. return err
  131. }
  132. return c.WaitHealth()
  133. }
  134. // failureKillOneForLongTime kills one member for long time, and restart
  135. // after a snapshot is required.
  136. type failureKillOneForLongTime struct {
  137. description
  138. }
  139. func newFailureKillOneForLongTime() *failureKillOneForLongTime {
  140. return &failureKillOneForLongTime{
  141. description: "kill one member for long time and expect it to recover from incoming snapshot",
  142. }
  143. }
  144. func (f *failureKillOneForLongTime) Inject(c *cluster, round int) error {
  145. i := round % c.Size
  146. if err := c.Agents[i].Stop(); err != nil {
  147. return err
  148. }
  149. if c.Size >= 3 {
  150. start, _ := c.Report()
  151. var end int
  152. // Normal healthy cluster could accept 1000req/s at least.
  153. // Give it 3-times time to create a new snapshot.
  154. retry := snapshotCount / 1000 * 3
  155. for j := 0; j < retry; j++ {
  156. end, _ = c.Report()
  157. // If the number of proposals committed is bigger than snapshot count,
  158. // a new snapshot should have been created.
  159. if end-start > snapshotCount {
  160. return nil
  161. }
  162. time.Sleep(time.Second)
  163. }
  164. return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry)
  165. }
  166. return nil
  167. }
  168. func (f *failureKillOneForLongTime) Recover(c *cluster, round int) error {
  169. i := round % c.Size
  170. if _, err := c.Agents[i].Restart(); err != nil {
  171. return err
  172. }
  173. return c.WaitHealth()
  174. }
  175. // failureKillLeaderForLongTime kills the leader for long time, and restart
  176. // after a snapshot is required.
  177. type failureKillLeaderForLongTime struct {
  178. description
  179. idx int
  180. }
  181. func newFailureKillLeaderForLongTime() *failureKillLeaderForLongTime {
  182. return &failureKillLeaderForLongTime{
  183. description: "kill the leader for long time and expect it to recover from incoming snapshot",
  184. }
  185. }
  186. func (f *failureKillLeaderForLongTime) Inject(c *cluster, round int) error {
  187. idx, err := c.GetLeader()
  188. if err != nil {
  189. return err
  190. }
  191. f.idx = idx
  192. if err := c.Agents[idx].Stop(); err != nil {
  193. return err
  194. }
  195. if c.Size >= 3 {
  196. start, _ := c.Report()
  197. var end int
  198. retry := snapshotCount / 1000 * 3
  199. for j := 0; j < retry; j++ {
  200. end, _ = c.Report()
  201. if end-start > snapshotCount {
  202. return nil
  203. }
  204. time.Sleep(time.Second)
  205. }
  206. return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry)
  207. }
  208. return nil
  209. }
  210. func (f *failureKillLeaderForLongTime) Recover(c *cluster, round int) error {
  211. if _, err := c.Agents[f.idx].Restart(); err != nil {
  212. return err
  213. }
  214. return c.WaitHealth()
  215. }
  216. type failureIsolate struct {
  217. description
  218. }
  219. func newFailureIsolate() *failureIsolate {
  220. return &failureIsolate{
  221. description: "isolate one member",
  222. }
  223. }
  224. func (f *failureIsolate) Inject(c *cluster, round int) error {
  225. i := round % c.Size
  226. if err := c.Agents[i].DropPort(peerURLPort); err != nil {
  227. return err
  228. }
  229. return nil
  230. }
  231. func (f *failureIsolate) Recover(c *cluster, round int) error {
  232. i := round % c.Size
  233. if err := c.Agents[i].RecoverPort(peerURLPort); err != nil {
  234. return err
  235. }
  236. return c.WaitHealth()
  237. }
  238. type failureIsolateAll struct {
  239. description
  240. }
  241. func newFailureIsolateAll() *failureIsolateAll {
  242. return &failureIsolateAll{
  243. description: "isolate all members",
  244. }
  245. }
  246. func (f *failureIsolateAll) Inject(c *cluster, round int) error {
  247. for _, a := range c.Agents {
  248. if err := a.DropPort(peerURLPort); err != nil {
  249. return err
  250. }
  251. }
  252. return nil
  253. }
  254. func (f *failureIsolateAll) Recover(c *cluster, round int) error {
  255. for _, a := range c.Agents {
  256. if err := a.RecoverPort(peerURLPort); err != nil {
  257. return err
  258. }
  259. }
  260. return c.WaitHealth()
  261. }