Browse Source

functional: add "SIGQUIT_AND_REMOVE_ONE_FOLLOWER"

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
Gyuho Lee 7 years ago
parent
commit
9057253d8c

+ 1 - 0
functional.yaml

@@ -128,6 +128,7 @@ tester-config:
   - SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT
   - SIGTERM_QUORUM
   - SIGTERM_ALL
+  - SIGQUIT_AND_REMOVE_ONE_FOLLOWER
   - BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER
   - BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
   - BLACKHOLE_PEER_PORT_TX_RX_LEADER

+ 10 - 8
functional/tester/cluster.go

@@ -161,6 +161,10 @@ func (clus *Cluster) updateFailures() {
 			clus.failures = append(clus.failures,
 				new_FailureCase_SIGTERM_ALL(clus))
 
+		case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER":
+			clus.failures = append(clus.failures,
+				new_FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus))
+
 		case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER":
 			clus.failures = append(clus.failures,
 				new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus))
@@ -377,14 +381,12 @@ func (clus *Cluster) broadcast(op rpcpb.Operation) error {
 }
 
 func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error {
-	if op == rpcpb.Operation_INITIAL_START_ETCD {
-		clus.agentRequests[idx] = &rpcpb.Request{
-			Operation: op,
-			Member:    clus.Members[idx],
-			Tester:    clus.Tester,
-		}
-	} else {
-		clus.agentRequests[idx].Operation = op
+	// maintain the initial member object
+	// throughout the test time
+	clus.agentRequests[idx] = &rpcpb.Request{
+		Operation: op,
+		Member:    clus.Members[idx],
+		Tester:    clus.Tester,
 	}
 
 	err := clus.agentStreams[idx].Send(clus.agentRequests[idx])

+ 7 - 0
functional/tester/cluster_read_config.go

@@ -68,6 +68,13 @@ func read(lg *zap.Logger, fpath string) (*Cluster, error) {
 			clus.Members[i].Etcd.WALDir = filepath.Join(mem.Etcd.DataDir, "member", "wal")
 		}
 
+		switch mem.Etcd.InitialClusterState {
+		case "new":
+		case "existing":
+		default:
+			return nil, fmt.Errorf("'--initial-cluster-state' got %q", mem.Etcd.InitialClusterState)
+		}
+
 		if mem.Etcd.HeartbeatIntervalMs == 0 {
 			return nil, fmt.Errorf("'--heartbeat-interval' cannot be 0 (got %+v)", mem.Etcd)
 		}

+ 1 - 0
functional/tester/cluster_test.go

@@ -162,6 +162,7 @@ func Test_read(t *testing.T) {
 				"SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT",
 				"SIGTERM_QUORUM",
 				"SIGTERM_ALL",
+				"SIGQUIT_AND_REMOVE_ONE_FOLLOWER",
 				"BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER",
 				"BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
 				"BLACKHOLE_PEER_PORT_TX_RX_LEADER",

+ 182 - 0
functional/tester/failure_case_sigquit_remove.go

@@ -0,0 +1,182 @@
+// Copyright 2018 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tester
+
+import (
+	"context"
+	"fmt"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/functional/rpcpb"
+	"go.uber.org/zap"
+)
+
+func inject_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error {
+	cli1, err := clus.Members[idx1].CreateEtcdClient()
+	if err != nil {
+		return err
+	}
+	defer cli1.Close()
+
+	var mresp *clientv3.MemberListResponse
+	mresp, err = cli1.MemberList(context.Background())
+	mss := []string{}
+	if err == nil && mresp != nil {
+		mss = describeMembers(mresp)
+	}
+	clus.lg.Info(
+		"member list before disastrous machine failure",
+		zap.String("request-to", clus.Members[idx1].EtcdClientEndpoint),
+		zap.Strings("members", mss),
+		zap.Error(err),
+	)
+	if err != nil {
+		return err
+	}
+
+	sresp, serr := cli1.Status(context.Background(), clus.Members[idx1].EtcdClientEndpoint)
+	if serr != nil {
+		return serr
+	}
+	id1 := sresp.Header.MemberId
+	is1 := fmt.Sprintf("%016x", id1)
+
+	err = clus.sendOp(idx1, rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA)
+	clus.lg.Info(
+		"disastrous machine failure",
+		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
+		zap.String("target-member-id", is1),
+		zap.Error(err),
+	)
+	if err != nil {
+		return err
+	}
+
+	time.Sleep(3 * time.Second)
+
+	idx2 := (idx1 + 1) % len(clus.Members)
+	var cli2 *clientv3.Client
+	cli2, err = clus.Members[idx2].CreateEtcdClient()
+	if err != nil {
+		return err
+	}
+	defer cli2.Close()
+
+	_, err = cli2.MemberRemove(context.Background(), id1)
+	clus.lg.Info(
+		"member remove after disaster",
+		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
+		zap.String("target-member-id", is1),
+		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
+		zap.Error(err),
+	)
+	if err != nil {
+		return err
+	}
+
+	time.Sleep(5 * time.Second)
+
+	mresp, err = cli2.MemberList(context.Background())
+	mss = []string{}
+	if err == nil && mresp != nil {
+		mss = describeMembers(mresp)
+	}
+	clus.lg.Info(
+		"member list after member remove",
+		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
+		zap.Strings("members", mss),
+		zap.Error(err),
+	)
+	return err
+}
+
+func recover_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error {
+	idx2 := (idx1 + 1) % len(clus.Members)
+	cli2, err := clus.Members[idx2].CreateEtcdClient()
+	if err != nil {
+		return err
+	}
+	defer cli2.Close()
+
+	_, err = cli2.MemberAdd(context.Background(), clus.Members[idx1].Etcd.AdvertisePeerURLs)
+	clus.lg.Info(
+		"member add before fresh restart",
+		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
+		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
+		zap.Error(err),
+	)
+	if err != nil {
+		return err
+	}
+
+	time.Sleep(3 * time.Second)
+
+	clus.Members[idx1].Etcd.InitialClusterState = "existing"
+	err = clus.sendOp(idx1, rpcpb.Operation_RESTART_ETCD)
+	clus.lg.Info(
+		"fresh restart after member add",
+		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
+		zap.Error(err),
+	)
+	if err != nil {
+		return err
+	}
+
+	time.Sleep(3 * time.Second)
+
+	var mresp *clientv3.MemberListResponse
+	mresp, err = cli2.MemberList(context.Background())
+	mss := []string{}
+	if err == nil && mresp != nil {
+		mss = describeMembers(mresp)
+	}
+	clus.lg.Info(
+		"member list after member add",
+		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
+		zap.Strings("members", mss),
+		zap.Error(err),
+	)
+	return err
+}
+
+func new_FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus *Cluster) Failure {
+	ff := failureByFunc{
+		failureCase:   rpcpb.FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER,
+		injectMember:  inject_SIGQUIT_ETCD_AND_REMOVE_DATA,
+		recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA,
+	}
+	f := &failureFollower{ff, -1, -1}
+	return &failureDelay{
+		Failure:       f,
+		delayDuration: clus.GetFailureDelayDuration(),
+	}
+}
+
+func describeMembers(mresp *clientv3.MemberListResponse) (ss []string) {
+	ss = make([]string, len(mresp.Members))
+	for i, m := range mresp.Members {
+		ss[i] = fmt.Sprintf("Name %s / ID %016x / ClientURLs %s / PeerURLs %s",
+			m.Name,
+			m.ID,
+			strings.Join(m.ClientURLs, ","),
+			strings.Join(m.PeerURLs, ","),
+		)
+	}
+	sort.Strings(ss)
+	return ss
+}