failure.go 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package main
  15. import (
  16. "fmt"
  17. "math/rand"
  18. "time"
  19. )
// snapshotCount is the number of committed proposals after which
// failureKillOneForLongTime.Inject assumes a new snapshot has been created.
// NOTE(review): presumably this has to match the snapshot count the tested
// cluster is configured with — confirm against the cluster setup.
const snapshotCount = 10000
// failure is a fault that can be injected into the testing cluster and
// later recovered from.
type failure interface {
	// Inject injects the failure into the testing cluster at the given
	// round. When calling the function, the cluster should be healthy.
	Inject(c *cluster, round int) error
	// Recover recovers the injected failure caused by the injection of the
	// given round and waits for the recovery of the testing cluster.
	Recover(c *cluster, round int) error
	// Desc returns a description of the failure.
	Desc() string
}
  31. type description string
  32. func (d description) Desc() string { return string(d) }
  33. type failureKillAll struct {
  34. description
  35. }
  36. func newFailureKillAll() *failureKillAll {
  37. return &failureKillAll{
  38. description: "kill all members",
  39. }
  40. }
  41. func (f *failureKillAll) Inject(c *cluster, round int) error {
  42. for _, a := range c.Agents {
  43. if err := a.Stop(); err != nil {
  44. return err
  45. }
  46. }
  47. return nil
  48. }
  49. func (f *failureKillAll) Recover(c *cluster, round int) error {
  50. for _, a := range c.Agents {
  51. if _, err := a.Restart(); err != nil {
  52. return err
  53. }
  54. }
  55. return c.WaitHealth()
  56. }
  57. type failureKillMajority struct {
  58. description
  59. }
  60. func newFailureKillMajority() *failureKillMajority {
  61. return &failureKillMajority{
  62. description: "kill majority of the cluster",
  63. }
  64. }
  65. func (f *failureKillMajority) Inject(c *cluster, round int) error {
  66. for i := range getToKillMap(c.Size, round) {
  67. if err := c.Agents[i].Stop(); err != nil {
  68. return err
  69. }
  70. }
  71. return nil
  72. }
  73. func (f *failureKillMajority) Recover(c *cluster, round int) error {
  74. for i := range getToKillMap(c.Size, round) {
  75. if _, err := c.Agents[i].Restart(); err != nil {
  76. return err
  77. }
  78. }
  79. return c.WaitHealth()
  80. }
  81. func getToKillMap(size int, seed int) map[int]bool {
  82. m := make(map[int]bool)
  83. r := rand.New(rand.NewSource(int64(seed)))
  84. majority := size/2 + 1
  85. for {
  86. m[r.Intn(size)] = true
  87. if len(m) >= majority {
  88. return m
  89. }
  90. }
  91. }
  92. type failureKillOne struct {
  93. description
  94. }
  95. func newFailureKillOne() *failureKillOne {
  96. return &failureKillOne{
  97. description: "kill one random member",
  98. }
  99. }
  100. func (f *failureKillOne) Inject(c *cluster, round int) error {
  101. i := round % c.Size
  102. return c.Agents[i].Stop()
  103. }
  104. func (f *failureKillOne) Recover(c *cluster, round int) error {
  105. i := round % c.Size
  106. if _, err := c.Agents[i].Restart(); err != nil {
  107. return err
  108. }
  109. return c.WaitHealth()
  110. }
  111. // failureKillOneForLongTime kills one member for long time, and restart
  112. // after a snapshot is required.
  113. type failureKillOneForLongTime struct {
  114. description
  115. }
  116. func newFailureKillOneForLongTime() *failureKillOneForLongTime {
  117. return &failureKillOneForLongTime{
  118. description: "kill one member for long time and expect it to recover from incoming snapshot",
  119. }
  120. }
  121. func (f *failureKillOneForLongTime) Inject(c *cluster, round int) error {
  122. i := round % c.Size
  123. if err := c.Agents[i].Stop(); err != nil {
  124. return err
  125. }
  126. if c.Size >= 3 {
  127. start, _ := c.Report()
  128. var end int
  129. // Normal healthy cluster could accept 1000req/s at least.
  130. // Give it 3-times time to create a new snapshot.
  131. retry := snapshotCount / 1000 * 3
  132. for j := 0; j < retry; j++ {
  133. end, _ = c.Report()
  134. // If the number of proposals committed is bigger than snapshot count,
  135. // a new snapshot should have been created.
  136. if end-start > snapshotCount {
  137. return nil
  138. }
  139. time.Sleep(time.Second)
  140. }
  141. return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry)
  142. }
  143. return nil
  144. }
  145. func (f *failureKillOneForLongTime) Recover(c *cluster, round int) error {
  146. i := round % c.Size
  147. if _, err := c.Agents[i].Restart(); err != nil {
  148. return err
  149. }
  150. return c.WaitHealth()
  151. }
  152. type failureIsolate struct {
  153. description
  154. }
  155. func newFailureIsolate() *failureIsolate {
  156. return &failureIsolate{
  157. description: "isolate one member",
  158. }
  159. }
  160. func (f *failureIsolate) Inject(c *cluster, round int) error {
  161. i := round % c.Size
  162. if err := c.Agents[i].DropPort(peerURLPort); err != nil {
  163. return err
  164. }
  165. return nil
  166. }
  167. func (f *failureIsolate) Recover(c *cluster, round int) error {
  168. i := round % c.Size
  169. if err := c.Agents[i].RecoverPort(peerURLPort); err != nil {
  170. return err
  171. }
  172. return c.WaitHealth()
  173. }
  174. type failureIsolateAll struct {
  175. description
  176. }
  177. func newFailureIsolateAll() *failureIsolateAll {
  178. return &failureIsolateAll{
  179. description: "isolate all members",
  180. }
  181. }
  182. func (f *failureIsolateAll) Inject(c *cluster, round int) error {
  183. for _, a := range c.Agents {
  184. if err := a.DropPort(peerURLPort); err != nil {
  185. return err
  186. }
  187. }
  188. return nil
  189. }
  190. func (f *failureIsolateAll) Recover(c *cluster, round int) error {
  191. for _, a := range c.Agents {
  192. if err := a.RecoverPort(peerURLPort); err != nil {
  193. return err
  194. }
  195. }
  196. return c.WaitHealth()
  197. }