failure_agent.go 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. // Copyright 2016 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package main
  15. import (
  16. "fmt"
  17. "time"
  18. )
  // Tunables shared by the failure cases declared in this file.
  const (
  	// snapshotCount is not referenced by the code visible in this file.
  	// NOTE(review): presumably the snapshot threshold used by the
  	// "until snapshot" failure cases; confirm against its users.
  	snapshotCount = 10000

  	// slowNetworkLatency is the latency, in milliseconds, passed to
  	// Agent.SetLatency when a member's network is slowed down.
  	slowNetworkLatency = 500

  	// randomVariation is passed to Agent.SetLatency alongside
  	// slowNetworkLatency.
  	// NOTE(review): presumably a jitter in milliseconds; confirm.
  	randomVariation = 50

  	// waitRecover is how long recoverLatency sleeps after the latency
  	// has been removed. The network layer needs extra time to propagate
  	// the traffic control (tc command) change; without the wait, hash
  	// values differ from the previous revision.
  	// See https://github.com/coreos/etcd/issues/5121 for details.
  	waitRecover = 5 * time.Second
  )
  29. func injectStop(m *member) error { return m.Agent.Stop() }
  30. func recoverStop(m *member) error {
  31. _, err := m.Agent.Restart()
  32. return err
  33. }
  34. func newFailureKillAll() failure {
  35. return &failureAll{
  36. description: "kill all members",
  37. injectMember: injectStop,
  38. recoverMember: recoverStop,
  39. }
  40. }
  41. func newFailureKillMajority() failure {
  42. return &failureMajority{
  43. description: "kill majority of the cluster",
  44. injectMember: injectStop,
  45. recoverMember: recoverStop,
  46. }
  47. }
  48. func newFailureKillOne() failure {
  49. return &failureOne{
  50. description: "kill one random member",
  51. injectMember: injectStop,
  52. recoverMember: recoverStop,
  53. }
  54. }
  55. func newFailureKillLeader() failure {
  56. ff := failureByFunc{
  57. description: "kill leader member",
  58. injectMember: injectStop,
  59. recoverMember: recoverStop,
  60. }
  61. return &failureLeader{ff, 0}
  62. }
  63. func newFailureKillOneForLongTime() failure {
  64. return &failureUntilSnapshot{newFailureKillOne()}
  65. }
  66. func newFailureKillLeaderForLongTime() failure {
  67. return &failureUntilSnapshot{newFailureKillLeader()}
  68. }
  69. func injectDropPort(m *member) error { return m.Agent.DropPort(m.peerPort()) }
  70. func recoverDropPort(m *member) error { return m.Agent.RecoverPort(m.peerPort()) }
  71. func newFailureIsolate() failure {
  72. return &failureOne{
  73. description: "isolate one member",
  74. injectMember: injectDropPort,
  75. recoverMember: recoverDropPort,
  76. }
  77. }
  78. func newFailureIsolateAll() failure {
  79. return &failureAll{
  80. description: "isolate all members",
  81. injectMember: injectDropPort,
  82. recoverMember: recoverDropPort,
  83. }
  84. }
  85. func injectLatency(m *member) error {
  86. if err := m.Agent.SetLatency(slowNetworkLatency, randomVariation); err != nil {
  87. m.Agent.RemoveLatency()
  88. return err
  89. }
  90. return nil
  91. }
  92. func recoverLatency(m *member) error {
  93. if err := m.Agent.RemoveLatency(); err != nil {
  94. return err
  95. }
  96. time.Sleep(waitRecover)
  97. return nil
  98. }
  99. func newFailureSlowNetworkOneMember() failure {
  100. desc := fmt.Sprintf("slow down one member's network by adding %d ms latency", slowNetworkLatency)
  101. return &failureOne{
  102. description: description(desc),
  103. injectMember: injectLatency,
  104. recoverMember: recoverLatency,
  105. }
  106. }
  107. func newFailureSlowNetworkLeader() failure {
  108. desc := fmt.Sprintf("slow down leader's network by adding %d ms latency", slowNetworkLatency)
  109. ff := failureByFunc{
  110. description: description(desc),
  111. injectMember: injectLatency,
  112. recoverMember: recoverLatency,
  113. }
  114. return &failureLeader{ff, 0}
  115. }
  116. func newFailureSlowNetworkAll() failure {
  117. return &failureAll{
  118. description: "slow down all members' network",
  119. injectMember: injectLatency,
  120. recoverMember: recoverLatency,
  121. }
  122. }
  123. func newFailureNop() failure {
  124. return &failureNop{
  125. description: "no failure",
  126. }
  127. }
  128. func newFailureExternal(scriptPath string) failure {
  129. return &failureExternal{
  130. description: fmt.Sprintf("external fault injector (script: %s)", scriptPath),
  131. scriptPath: scriptPath,
  132. }
  133. }