Browse Source

etcdserver: adjust StrictReconfigCheck

Adjust StrictReconfigCheck logic to accommodate learner members in the
cluster.
Jingyi Hu 6 years ago
parent
commit
76a63f9f7d
3 changed files with 141 additions and 42 deletions
  1. 17 4
      etcdserver/api/membership/cluster.go
  2. 73 4
      etcdserver/api/membership/cluster_test.go
  3. 51 34
      etcdserver/server.go

+ 17 - 4
etcdserver/api/membership/cluster.go

@@ -123,6 +123,19 @@ func (c *RaftCluster) Member(id types.ID) *Member {
 	return c.members[id].Clone()
 }
 
+func (c *RaftCluster) VotingMembers() []*Member {
+	c.Lock()
+	defer c.Unlock()
+	var ms MembersByID
+	for _, m := range c.members {
+		if !m.IsLearner {
+			ms = append(ms, m.Clone())
+		}
+	}
+	sort.Sort(ms)
+	return []*Member(ms)
+}
+
 // MemberByName returns a Member with the given name if exists.
 // If more than one member has the given name, it will panic.
 func (c *RaftCluster) MemberByName(name string) *Member {
@@ -551,11 +564,11 @@ func (c *RaftCluster) SetVersion(ver *semver.Version, onSet func(*zap.Logger, *s
 	onSet(c.lg, ver)
 }
 
-func (c *RaftCluster) IsReadyToAddNewMember() bool {
+func (c *RaftCluster) IsReadyToAddVotingMember() bool {
 	nmembers := 1
 	nstarted := 0
 
-	for _, member := range c.members {
+	for _, member := range c.VotingMembers() {
 		if member.IsStarted() {
 			nstarted++
 		}
@@ -592,11 +605,11 @@ func (c *RaftCluster) IsReadyToAddNewMember() bool {
 	return true
 }
 
-func (c *RaftCluster) IsReadyToRemoveMember(id uint64) bool {
+func (c *RaftCluster) IsReadyToRemoveVotingMember(id uint64) bool {
 	nmembers := 0
 	nstarted := 0
 
-	for _, member := range c.members {
+	for _, member := range c.VotingMembers() {
 		if uint64(member.ID) == id {
 			continue
 		}

+ 73 - 4
etcdserver/api/membership/cluster_test.go

@@ -626,7 +626,7 @@ func newTestCluster(membs []*Member) *RaftCluster {
 
 func stringp(s string) *string { return &s }
 
-func TestIsReadyToAddNewMember(t *testing.T) {
+func TestIsReadyToAddVotingMember(t *testing.T) {
 	tests := []struct {
 		members []*Member
 		want    bool
@@ -697,16 +697,38 @@ func TestIsReadyToAddNewMember(t *testing.T) {
 			[]*Member{},
 			false,
 		},
+		{
+			// 2 voting members ready in cluster with 2 voting members and 2 unstarted learner member, should succeed
+			// (the status of learner members does not affect the readiness of adding voting member)
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "2", nil),
+				newTestMemberAsLearner(3, nil, "", nil),
+				newTestMemberAsLearner(4, nil, "", nil),
+			},
+			true,
+		},
+		{
+			// 1 voting member ready in cluster with 2 voting members and 2 ready learner member, should fail
+			// (the status of learner members does not affect the readiness of adding voting member)
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "", nil),
+				newTestMemberAsLearner(3, nil, "3", nil),
+				newTestMemberAsLearner(4, nil, "4", nil),
+			},
+			false,
+		},
 	}
 	for i, tt := range tests {
 		c := newTestCluster(tt.members)
-		if got := c.IsReadyToAddNewMember(); got != tt.want {
+		if got := c.IsReadyToAddVotingMember(); got != tt.want {
 			t.Errorf("%d: isReadyToAddNewMember returned %t, want %t", i, got, tt.want)
 		}
 	}
 }
 
-func TestIsReadyToRemoveMember(t *testing.T) {
+func TestIsReadyToRemoveVotingMember(t *testing.T) {
 	tests := []struct {
 		members  []*Member
 		removeID uint64
@@ -782,10 +804,57 @@ func TestIsReadyToRemoveMember(t *testing.T) {
 			4,
 			true,
 		},
+		{
+			// 1 voting members ready in cluster with 1 voting member and 1 ready learner,
+			// removing voting member should fail
+			// (the status of learner members does not affect the readiness of removing voting member)
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMemberAsLearner(2, nil, "2", nil),
+			},
+			1,
+			false,
+		},
+		{
+			// 1 voting members ready in cluster with 2 voting member and 1 ready learner,
+			// removing ready voting member should fail
+			// (the status of learner members does not affect the readiness of removing voting member)
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "", nil),
+				newTestMemberAsLearner(3, nil, "3", nil),
+			},
+			1,
+			false,
+		},
+		{
+			// 1 voting members ready in cluster with 2 voting member and 1 ready learner,
+			// removing unstarted voting member should be fine. (Actual operation will fail)
+			// (the status of learner members does not affect the readiness of removing voting member)
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "", nil),
+				newTestMemberAsLearner(3, nil, "3", nil),
+			},
+			2,
+			true,
+		},
+		{
+			// 1 voting members ready in cluster with 2 voting member and 1 unstarted learner,
+			// removing not-ready voting member should be fine. (Actual operation will fail)
+			// (the status of learner members does not affect the readiness of removing voting member)
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "", nil),
+				newTestMemberAsLearner(3, nil, "", nil),
+			},
+			2,
+			true,
+		},
 	}
 	for i, tt := range tests {
 		c := newTestCluster(tt.members)
-		if got := c.IsReadyToRemoveMember(tt.removeID); got != tt.want {
+		if got := c.IsReadyToRemoveVotingMember(tt.removeID); got != tt.want {
 			t.Errorf("%d: isReadyToAddNewMember returned %t, want %t", i, got, tt.want)
 		}
 	}

+ 51 - 34
etcdserver/server.go

@@ -1553,43 +1553,17 @@ func (s *EtcdServer) AddMember(ctx context.Context, memb membership.Member) ([]*
 		return nil, err
 	}
 
-	// TODO: might switch to less strict check when adding raft learner
-	if s.Cfg.StrictReconfigCheck {
-		// by default StrictReconfigCheck is enabled; reject new members if unhealthy
-		if !s.cluster.IsReadyToAddNewMember() {
-			if lg := s.getLogger(); lg != nil {
-				lg.Warn(
-					"rejecting member add request; not enough healthy members",
-					zap.String("local-member-id", s.ID().String()),
-					zap.String("requested-member-add", fmt.Sprintf("%+v", memb)),
-					zap.Error(ErrNotEnoughStartedMembers),
-				)
-			} else {
-				plog.Warningf("not enough started members, rejecting member add %+v", memb)
-			}
-			return nil, ErrNotEnoughStartedMembers
-		}
-
-		if !isConnectedFullySince(s.r.transport, time.Now().Add(-HealthInterval), s.ID(), s.cluster.Members()) {
-			if lg := s.getLogger(); lg != nil {
-				lg.Warn(
-					"rejecting member add request; local member has not been connected to all peers, reconfigure breaks active quorum",
-					zap.String("local-member-id", s.ID().String()),
-					zap.String("requested-member-add", fmt.Sprintf("%+v", memb)),
-					zap.Error(ErrUnhealthy),
-				)
-			} else {
-				plog.Warningf("not healthy for reconfigure, rejecting member add %+v", memb)
-			}
-			return nil, ErrUnhealthy
-		}
-	}
-
 	// TODO: move Member to protobuf type
 	b, err := json.Marshal(memb)
 	if err != nil {
 		return nil, err
 	}
+
+	// by default StrictReconfigCheck is enabled; reject new members if unhealthy.
+	if err := s.mayAddMember(memb); err != nil {
+		return nil, err
+	}
+
 	cc := raftpb.ConfChange{
 		Type:    raftpb.ConfChangeAddNode,
 		NodeID:  uint64(memb.ID),
@@ -1603,6 +1577,43 @@ func (s *EtcdServer) AddMember(ctx context.Context, memb membership.Member) ([]*
 	return s.configure(ctx, cc)
 }
 
+func (s *EtcdServer) mayAddMember(memb membership.Member) error {
+	if !s.Cfg.StrictReconfigCheck {
+		return nil
+	}
+
+	// protect quorum when adding voting member
+	if !memb.IsLearner && !s.cluster.IsReadyToAddVotingMember() {
+		if lg := s.getLogger(); lg != nil {
+			lg.Warn(
+				"rejecting member add request; not enough healthy members",
+				zap.String("local-member-id", s.ID().String()),
+				zap.String("requested-member-add", fmt.Sprintf("%+v", memb)),
+				zap.Error(ErrNotEnoughStartedMembers),
+			)
+		} else {
+			plog.Warningf("not enough started members, rejecting member add %+v", memb)
+		}
+		return ErrNotEnoughStartedMembers
+	}
+
+	if !isConnectedFullySince(s.r.transport, time.Now().Add(-HealthInterval), s.ID(), s.cluster.VotingMembers()) {
+		if lg := s.getLogger(); lg != nil {
+			lg.Warn(
+				"rejecting member add request; local member has not been connected to all peers, reconfigure breaks active quorum",
+				zap.String("local-member-id", s.ID().String()),
+				zap.String("requested-member-add", fmt.Sprintf("%+v", memb)),
+				zap.Error(ErrUnhealthy),
+			)
+		} else {
+			plog.Warningf("not healthy for reconfigure, rejecting member add %+v", memb)
+		}
+		return ErrUnhealthy
+	}
+
+	return nil
+}
+
 func (s *EtcdServer) RemoveMember(ctx context.Context, id uint64) ([]*membership.Member, error) {
 	if err := s.checkMembershipOperationPermission(ctx); err != nil {
 		return nil, err
@@ -1667,7 +1678,13 @@ func (s *EtcdServer) mayRemoveMember(id types.ID) error {
 		return nil
 	}
 
-	if !s.cluster.IsReadyToRemoveMember(uint64(id)) {
+	isLearner := s.cluster.IsMemberExist(id) && s.cluster.Member(id).IsLearner
+	// no need to check quorum when removing non-voting member
+	if isLearner {
+		return nil
+	}
+
+	if !s.cluster.IsReadyToRemoveVotingMember(uint64(id)) {
 		if lg := s.getLogger(); lg != nil {
 			lg.Warn(
 				"rejecting member remove request; not enough healthy members",
@@ -1687,7 +1704,7 @@ func (s *EtcdServer) mayRemoveMember(id types.ID) error {
 	}
 
 	// protect quorum if some members are down
-	m := s.cluster.Members()
+	m := s.cluster.VotingMembers()
 	active := numConnectedSince(s.r.transport, time.Now().Add(-HealthInterval), s.ID(), m)
 	if (active - 1) < 1+((len(m)-1)/2) {
 		if lg := s.getLogger(); lg != nil {