failure_case_sigquit_remove.go 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "context"
  17. "fmt"
  18. "sort"
  19. "strings"
  20. "time"
  21. "github.com/coreos/etcd/clientv3"
  22. "github.com/coreos/etcd/functional/rpcpb"
  23. "go.uber.org/zap"
  24. )
  25. func inject_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error {
  26. cli1, err := clus.Members[idx1].CreateEtcdClient()
  27. if err != nil {
  28. return err
  29. }
  30. defer cli1.Close()
  31. var mresp *clientv3.MemberListResponse
  32. mresp, err = cli1.MemberList(context.Background())
  33. mss := []string{}
  34. if err == nil && mresp != nil {
  35. mss = describeMembers(mresp)
  36. }
  37. clus.lg.Info(
  38. "member list before disastrous machine failure",
  39. zap.String("request-to", clus.Members[idx1].EtcdClientEndpoint),
  40. zap.Strings("members", mss),
  41. zap.Error(err),
  42. )
  43. if err != nil {
  44. return err
  45. }
  46. sresp, serr := cli1.Status(context.Background(), clus.Members[idx1].EtcdClientEndpoint)
  47. if serr != nil {
  48. return serr
  49. }
  50. id1 := sresp.Header.MemberId
  51. is1 := fmt.Sprintf("%016x", id1)
  52. err = clus.sendOp(idx1, rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA)
  53. clus.lg.Info(
  54. "disastrous machine failure",
  55. zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
  56. zap.String("target-member-id", is1),
  57. zap.Error(err),
  58. )
  59. if err != nil {
  60. return err
  61. }
  62. time.Sleep(3 * time.Second)
  63. idx2 := (idx1 + 1) % len(clus.Members)
  64. var cli2 *clientv3.Client
  65. cli2, err = clus.Members[idx2].CreateEtcdClient()
  66. if err != nil {
  67. return err
  68. }
  69. defer cli2.Close()
  70. _, err = cli2.MemberRemove(context.Background(), id1)
  71. clus.lg.Info(
  72. "member remove after disaster",
  73. zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
  74. zap.String("target-member-id", is1),
  75. zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
  76. zap.Error(err),
  77. )
  78. if err != nil {
  79. return err
  80. }
  81. time.Sleep(5 * time.Second)
  82. mresp, err = cli2.MemberList(context.Background())
  83. mss = []string{}
  84. if err == nil && mresp != nil {
  85. mss = describeMembers(mresp)
  86. }
  87. clus.lg.Info(
  88. "member list after member remove",
  89. zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
  90. zap.Strings("members", mss),
  91. zap.Error(err),
  92. )
  93. return err
  94. }
  95. func recover_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error {
  96. idx2 := (idx1 + 1) % len(clus.Members)
  97. cli2, err := clus.Members[idx2].CreateEtcdClient()
  98. if err != nil {
  99. return err
  100. }
  101. defer cli2.Close()
  102. _, err = cli2.MemberAdd(context.Background(), clus.Members[idx1].Etcd.AdvertisePeerURLs)
  103. clus.lg.Info(
  104. "member add before fresh restart",
  105. zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
  106. zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
  107. zap.Error(err),
  108. )
  109. if err != nil {
  110. return err
  111. }
  112. time.Sleep(3 * time.Second)
  113. clus.Members[idx1].Etcd.InitialClusterState = "existing"
  114. err = clus.sendOp(idx1, rpcpb.Operation_RESTART_ETCD)
  115. clus.lg.Info(
  116. "fresh restart after member add",
  117. zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
  118. zap.Error(err),
  119. )
  120. if err != nil {
  121. return err
  122. }
  123. time.Sleep(3 * time.Second)
  124. var mresp *clientv3.MemberListResponse
  125. mresp, err = cli2.MemberList(context.Background())
  126. mss := []string{}
  127. if err == nil && mresp != nil {
  128. mss = describeMembers(mresp)
  129. }
  130. clus.lg.Info(
  131. "member list after member add",
  132. zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
  133. zap.Strings("members", mss),
  134. zap.Error(err),
  135. )
  136. return err
  137. }
  138. func new_FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus *Cluster) Failure {
  139. ff := failureByFunc{
  140. failureCase: rpcpb.FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER,
  141. injectMember: inject_SIGQUIT_ETCD_AND_REMOVE_DATA,
  142. recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA,
  143. }
  144. f := &failureFollower{ff, -1, -1}
  145. return &failureDelay{
  146. Failure: f,
  147. delayDuration: clus.GetFailureDelayDuration(),
  148. }
  149. }
  150. func describeMembers(mresp *clientv3.MemberListResponse) (ss []string) {
  151. ss = make([]string, len(mresp.Members))
  152. for i, m := range mresp.Members {
  153. ss[i] = fmt.Sprintf("Name %s / ID %016x / ClientURLs %s / PeerURLs %s",
  154. m.Name,
  155. m.ID,
  156. strings.Join(m.ClientURLs, ","),
  157. strings.Join(m.PeerURLs, ","),
  158. )
  159. }
  160. sort.Strings(ss)
  161. return ss
  162. }