// Copyright 2018 The etcd Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tester import ( "context" "fmt" "sort" "strings" "time" "github.com/coreos/etcd/clientv3" "github.com/coreos/etcd/functional/rpcpb" "go.uber.org/zap" ) func inject_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error { cli1, err := clus.Members[idx1].CreateEtcdClient() if err != nil { return err } defer cli1.Close() var mresp *clientv3.MemberListResponse mresp, err = cli1.MemberList(context.Background()) mss := []string{} if err == nil && mresp != nil { mss = describeMembers(mresp) } clus.lg.Info( "member list before disastrous machine failure", zap.String("request-to", clus.Members[idx1].EtcdClientEndpoint), zap.Strings("members", mss), zap.Error(err), ) if err != nil { return err } sresp, serr := cli1.Status(context.Background(), clus.Members[idx1].EtcdClientEndpoint) if serr != nil { return serr } id1 := sresp.Header.MemberId is1 := fmt.Sprintf("%016x", id1) clus.lg.Info( "disastrous machine failure START", zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), zap.String("target-member-id", is1), zap.Error(err), ) err = clus.sendOp(idx1, rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA) clus.lg.Info( "disastrous machine failure END", zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), zap.String("target-member-id", is1), zap.Error(err), ) if err != nil { return err } time.Sleep(2 * time.Second) idx2 := (idx1 + 1) % len(clus.Members) var cli2 *clientv3.Client cli2, err = clus.Members[idx2].CreateEtcdClient() if err != nil { return err } defer cli2.Close() // FIXME(bug): this may block forever during // "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT" // is the new leader too busy with snapshotting? // is raft proposal dropped? // enable client keepalive for failover? clus.lg.Info( "member remove after disaster START", zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), zap.String("target-member-id", is1), zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), ) ctx, cancel := context.WithTimeout(context.Background(), time.Minute) _, err = cli2.MemberRemove(ctx, id1) cancel() clus.lg.Info( "member remove after disaster END", zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), zap.String("target-member-id", is1), zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), zap.Error(err), ) if err != nil { return err } time.Sleep(2 * time.Second) mresp, err = cli2.MemberList(context.Background()) mss = []string{} if err == nil && mresp != nil { mss = describeMembers(mresp) } clus.lg.Info( "member list after member remove", zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), zap.Strings("members", mss), zap.Error(err), ) return err } func recover_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error { idx2 := (idx1 + 1) % len(clus.Members) cli2, err := clus.Members[idx2].CreateEtcdClient() if err != nil { return err } defer cli2.Close() _, err = cli2.MemberAdd(context.Background(), clus.Members[idx1].Etcd.AdvertisePeerURLs) clus.lg.Info( "member add before fresh restart", zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), zap.Error(err), ) if err != nil { return err } time.Sleep(2 * time.Second) clus.Members[idx1].Etcd.InitialClusterState = "existing" err = clus.sendOp(idx1, rpcpb.Operation_RESTART_ETCD) clus.lg.Info( "fresh restart after member add", zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), zap.Error(err), ) if err != nil { return err } time.Sleep(2 * time.Second) var mresp *clientv3.MemberListResponse mresp, err = cli2.MemberList(context.Background()) mss := []string{} if err == nil && mresp != nil { mss = describeMembers(mresp) } clus.lg.Info( "member list after member add", zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), zap.Strings("members", mss), zap.Error(err), ) return err } func new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus *Cluster) Case { cc := caseByFunc{ rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER, injectMember: inject_SIGQUIT_ETCD_AND_REMOVE_DATA, recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA, } c := &caseFollower{cc, -1, -1} return &caseDelay{ Case: c, delayDuration: clus.GetCaseDelayDuration(), } } func new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case { return &caseUntilSnapshot{ rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT, Case: new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus), } } func new_Case_SIGQUIT_AND_REMOVE_LEADER(clus *Cluster) Case { cc := caseByFunc{ rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_LEADER, injectMember: inject_SIGQUIT_ETCD_AND_REMOVE_DATA, recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA, } c := &caseLeader{cc, -1, -1} return &caseDelay{ Case: c, delayDuration: clus.GetCaseDelayDuration(), } } func new_Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case { return &caseUntilSnapshot{ rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT, Case: new_Case_SIGQUIT_AND_REMOVE_LEADER(clus), } } func describeMembers(mresp *clientv3.MemberListResponse) (ss []string) { ss = make([]string, len(mresp.Members)) for i, m := range mresp.Members { ss[i] = fmt.Sprintf("Name %s / ID %016x / ClientURLs %s / PeerURLs %s", m.Name, m.ID, strings.Join(m.ClientURLs, ","), strings.Join(m.PeerURLs, ","), ) } sort.Strings(ss) return ss }