case_sigquit_remove.go 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "context"
  17. "fmt"
  18. "sort"
  19. "strings"
  20. "time"
  21. "github.com/coreos/etcd/clientv3"
  22. "github.com/coreos/etcd/functional/rpcpb"
  23. "go.uber.org/zap"
  24. )
  25. func inject_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error {
  26. cli1, err := clus.Members[idx1].CreateEtcdClient()
  27. if err != nil {
  28. return err
  29. }
  30. defer cli1.Close()
  31. var mresp *clientv3.MemberListResponse
  32. mresp, err = cli1.MemberList(context.Background())
  33. mss := []string{}
  34. if err == nil && mresp != nil {
  35. mss = describeMembers(mresp)
  36. }
  37. clus.lg.Info(
  38. "member list before disastrous machine failure",
  39. zap.String("request-to", clus.Members[idx1].EtcdClientEndpoint),
  40. zap.Strings("members", mss),
  41. zap.Error(err),
  42. )
  43. if err != nil {
  44. return err
  45. }
  46. sresp, serr := cli1.Status(context.Background(), clus.Members[idx1].EtcdClientEndpoint)
  47. if serr != nil {
  48. return serr
  49. }
  50. id1 := sresp.Header.MemberId
  51. is1 := fmt.Sprintf("%016x", id1)
  52. clus.lg.Info(
  53. "disastrous machine failure START",
  54. zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
  55. zap.String("target-member-id", is1),
  56. zap.Error(err),
  57. )
  58. err = clus.sendOp(idx1, rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA)
  59. clus.lg.Info(
  60. "disastrous machine failure END",
  61. zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
  62. zap.String("target-member-id", is1),
  63. zap.Error(err),
  64. )
  65. if err != nil {
  66. return err
  67. }
  68. time.Sleep(2 * time.Second)
  69. idx2 := (idx1 + 1) % len(clus.Members)
  70. var cli2 *clientv3.Client
  71. cli2, err = clus.Members[idx2].CreateEtcdClient()
  72. if err != nil {
  73. return err
  74. }
  75. defer cli2.Close()
  76. // FIXME(bug): this may block forever during
  77. // "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT"
  78. // is the new leader too busy with snapshotting?
  79. // is raft proposal dropped?
  80. // enable client keepalive for failover?
  81. clus.lg.Info(
  82. "member remove after disaster START",
  83. zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
  84. zap.String("target-member-id", is1),
  85. zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
  86. )
  87. ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
  88. _, err = cli2.MemberRemove(ctx, id1)
  89. cancel()
  90. clus.lg.Info(
  91. "member remove after disaster END",
  92. zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
  93. zap.String("target-member-id", is1),
  94. zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
  95. zap.Error(err),
  96. )
  97. if err != nil {
  98. return err
  99. }
  100. time.Sleep(2 * time.Second)
  101. mresp, err = cli2.MemberList(context.Background())
  102. mss = []string{}
  103. if err == nil && mresp != nil {
  104. mss = describeMembers(mresp)
  105. }
  106. clus.lg.Info(
  107. "member list after member remove",
  108. zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
  109. zap.Strings("members", mss),
  110. zap.Error(err),
  111. )
  112. return err
  113. }
  114. func recover_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error {
  115. idx2 := (idx1 + 1) % len(clus.Members)
  116. cli2, err := clus.Members[idx2].CreateEtcdClient()
  117. if err != nil {
  118. return err
  119. }
  120. defer cli2.Close()
  121. _, err = cli2.MemberAdd(context.Background(), clus.Members[idx1].Etcd.AdvertisePeerURLs)
  122. clus.lg.Info(
  123. "member add before fresh restart",
  124. zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
  125. zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
  126. zap.Error(err),
  127. )
  128. if err != nil {
  129. return err
  130. }
  131. time.Sleep(2 * time.Second)
  132. clus.Members[idx1].Etcd.InitialClusterState = "existing"
  133. err = clus.sendOp(idx1, rpcpb.Operation_RESTART_ETCD)
  134. clus.lg.Info(
  135. "fresh restart after member add",
  136. zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
  137. zap.Error(err),
  138. )
  139. if err != nil {
  140. return err
  141. }
  142. time.Sleep(2 * time.Second)
  143. var mresp *clientv3.MemberListResponse
  144. mresp, err = cli2.MemberList(context.Background())
  145. mss := []string{}
  146. if err == nil && mresp != nil {
  147. mss = describeMembers(mresp)
  148. }
  149. clus.lg.Info(
  150. "member list after member add",
  151. zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
  152. zap.Strings("members", mss),
  153. zap.Error(err),
  154. )
  155. return err
  156. }
  157. func new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus *Cluster) Case {
  158. cc := caseByFunc{
  159. rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER,
  160. injectMember: inject_SIGQUIT_ETCD_AND_REMOVE_DATA,
  161. recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA,
  162. }
  163. c := &caseFollower{cc, -1, -1}
  164. return &caseDelay{
  165. Case: c,
  166. delayDuration: clus.GetCaseDelayDuration(),
  167. }
  168. }
  169. func new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case {
  170. return &caseUntilSnapshot{
  171. rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
  172. Case: new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus),
  173. }
  174. }
  175. func new_Case_SIGQUIT_AND_REMOVE_LEADER(clus *Cluster) Case {
  176. cc := caseByFunc{
  177. rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_LEADER,
  178. injectMember: inject_SIGQUIT_ETCD_AND_REMOVE_DATA,
  179. recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA,
  180. }
  181. c := &caseLeader{cc, -1, -1}
  182. return &caseDelay{
  183. Case: c,
  184. delayDuration: clus.GetCaseDelayDuration(),
  185. }
  186. }
  187. func new_Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case {
  188. return &caseUntilSnapshot{
  189. rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT,
  190. Case: new_Case_SIGQUIT_AND_REMOVE_LEADER(clus),
  191. }
  192. }
  193. func describeMembers(mresp *clientv3.MemberListResponse) (ss []string) {
  194. ss = make([]string, len(mresp.Members))
  195. for i, m := range mresp.Members {
  196. ss[i] = fmt.Sprintf("Name %s / ID %016x / ClientURLs %s / PeerURLs %s",
  197. m.Name,
  198. m.ID,
  199. strings.Join(m.ClientURLs, ","),
  200. strings.Join(m.PeerURLs, ","),
  201. )
  202. }
  203. sort.Strings(ss)
  204. return ss
  205. }