failure_agent.go 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. // Copyright 2016 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package main
  15. import (
  16. "fmt"
  17. "time"
  18. )
  // Tunables shared by the failure constructors in this file.
  const (
  	// snapshotCount is not referenced in this part of the file.
  	// NOTE(review): presumably the snapshot threshold used by the
  	// "until snapshot" failures — confirm against its users.
  	snapshotCount = 10000

  	// slowNetworkLatency is the latency, in milliseconds, handed to
  	// Agent.SetLatency when a slow network is simulated.
  	slowNetworkLatency = 500
  	// randomVariation is passed to Agent.SetLatency together with
  	// slowNetworkLatency.
  	// NOTE(review): presumably jitter in milliseconds — confirm.
  	randomVariation = 50

  	// triggerElectionDur is the delay applied by the failureDelay
  	// wrappers. It is meant to be long enough to trigger a leader
  	// election (the default election timeout is 1s).
  	triggerElectionDur = 5 * time.Second

  	// waitRecover is how long recoverLatency sleeps after removing the
  	// latency. The network layer needs extra time to propagate the
  	// traffic control (tc command) change; without the wait, hash values
  	// can differ from the previous revision.
  	// See https://github.com/coreos/etcd/issues/5121 for details.
  	waitRecover = 5 * time.Second
  )
  31. func injectStop(m *member) error { return m.Agent.Stop() }
  32. func recoverStop(m *member) error {
  33. _, err := m.Agent.Restart()
  34. return err
  35. }
  36. func newFailureKillAll() failure {
  37. return &failureAll{
  38. description: "kill all members",
  39. injectMember: injectStop,
  40. recoverMember: recoverStop,
  41. }
  42. }
  43. func newFailureKillMajority() failure {
  44. return &failureMajority{
  45. description: "kill majority of the cluster",
  46. injectMember: injectStop,
  47. recoverMember: recoverStop,
  48. }
  49. }
  50. func newFailureKillOne() failure {
  51. return &failureOne{
  52. description: "kill one random member",
  53. injectMember: injectStop,
  54. recoverMember: recoverStop,
  55. }
  56. }
  57. func newFailureKillLeader() failure {
  58. ff := failureByFunc{
  59. description: "kill leader member",
  60. injectMember: injectStop,
  61. recoverMember: recoverStop,
  62. }
  63. return &failureLeader{ff, 0}
  64. }
  65. func newFailureKillOneForLongTime() failure {
  66. return &failureUntilSnapshot{newFailureKillOne()}
  67. }
  68. func newFailureKillLeaderForLongTime() failure {
  69. return &failureUntilSnapshot{newFailureKillLeader()}
  70. }
  71. func injectDropPort(m *member) error { return m.Agent.DropPort(m.peerPort()) }
  72. func recoverDropPort(m *member) error { return m.Agent.RecoverPort(m.peerPort()) }
  73. func newFailureIsolate() failure {
  74. f := &failureOne{
  75. description: "isolate one member",
  76. injectMember: injectDropPort,
  77. recoverMember: recoverDropPort,
  78. }
  79. return &failureDelay{
  80. failure: f,
  81. delayDuration: triggerElectionDur,
  82. }
  83. }
  84. func newFailureIsolateAll() failure {
  85. f := &failureAll{
  86. description: "isolate all members",
  87. injectMember: injectDropPort,
  88. recoverMember: recoverDropPort,
  89. }
  90. return &failureDelay{
  91. failure: f,
  92. delayDuration: triggerElectionDur,
  93. }
  94. }
  95. func injectLatency(m *member) error {
  96. if err := m.Agent.SetLatency(slowNetworkLatency, randomVariation); err != nil {
  97. m.Agent.RemoveLatency()
  98. return err
  99. }
  100. return nil
  101. }
  102. func recoverLatency(m *member) error {
  103. if err := m.Agent.RemoveLatency(); err != nil {
  104. return err
  105. }
  106. time.Sleep(waitRecover)
  107. return nil
  108. }
  109. func newFailureSlowNetworkOneMember() failure {
  110. desc := fmt.Sprintf("slow down one member's network by adding %d ms latency", slowNetworkLatency)
  111. f := &failureOne{
  112. description: description(desc),
  113. injectMember: injectLatency,
  114. recoverMember: recoverLatency,
  115. }
  116. return &failureDelay{
  117. failure: f,
  118. delayDuration: triggerElectionDur,
  119. }
  120. }
  121. func newFailureSlowNetworkLeader() failure {
  122. desc := fmt.Sprintf("slow down leader's network by adding %d ms latency", slowNetworkLatency)
  123. ff := failureByFunc{
  124. description: description(desc),
  125. injectMember: injectLatency,
  126. recoverMember: recoverLatency,
  127. }
  128. f := &failureLeader{ff, 0}
  129. return &failureDelay{
  130. failure: f,
  131. delayDuration: triggerElectionDur,
  132. }
  133. }
  134. func newFailureSlowNetworkAll() failure {
  135. f := &failureAll{
  136. description: "slow down all members' network",
  137. injectMember: injectLatency,
  138. recoverMember: recoverLatency,
  139. }
  140. return &failureDelay{
  141. failure: f,
  142. delayDuration: triggerElectionDur,
  143. }
  144. }
  145. func newFailureNop() failure {
  146. return &failureNop{
  147. description: "no failure",
  148. }
  149. }
  150. func newFailureExternal(scriptPath string) failure {
  151. return &failureExternal{
  152. description: fmt.Sprintf("external fault injector (script: %s)", scriptPath),
  153. scriptPath: scriptPath,
  154. }
  155. }