Browse Source

etcdserver: forbid removing started member if quorum cannot be preserved in strict reconfig mode

Like the commit 6974fc63ed87, this commit lets etcdserver forbid
removing started member if quorum cannot be preserved after
reconfiguration if the option -strict-reconfig-check is passed to
etcd. The removal can cause deadlock if unstarted members have wrong
peer URLs.
Hitoshi Mitake 10 years ago
parent
commit
f8859a980d
4 changed files with 118 additions and 1 deletions
  1. 24 0
      etcdserver/cluster.go
  2. 85 0
      etcdserver/cluster_test.go
  3. 6 0
      etcdserver/server.go
  4. 3 1
      etcdserver/server_test.go

+ 24 - 0
etcdserver/cluster.go

@@ -396,6 +396,30 @@ func (c *cluster) isReadyToAddNewMember() bool {
 	return true
 	return true
 }
 }
 
 
+func (c *cluster) isReadyToRemoveMember(id uint64) bool {
+	nmembers := 0
+	nstarted := 0
+
+	for _, member := range c.members {
+		if uint64(member.ID) == id {
+			continue
+		}
+
+		if member.IsStarted() {
+			nstarted++
+		}
+		nmembers++
+	}
+
+	nquorum := nmembers/2 + 1
+	if nstarted < nquorum {
+		plog.Warningf("Reject remove member request: the number of started member (%d) will be less than the quorum number of the cluster (%d)", nstarted, nquorum)
+		return false
+	}
+
+	return true
+}
+
 func membersFromStore(st store.Store) (map[types.ID]*Member, map[types.ID]bool) {
 func membersFromStore(st store.Store) (map[types.ID]*Member, map[types.ID]bool) {
 	members := make(map[types.ID]*Member)
 	members := make(map[types.ID]*Member)
 	removed := make(map[types.ID]bool)
 	removed := make(map[types.ID]bool)

+ 85 - 0
etcdserver/cluster_test.go

@@ -646,3 +646,88 @@ func TestIsReadyToAddNewMember(t *testing.T) {
 		}
 		}
 	}
 	}
 }
 }
+
+func TestIsReadyToRemoveMember(t *testing.T) {
+	tests := []struct {
+		members  []*Member
+		removeID uint64
+		want     bool
+	}{
+		{
+			// 1/1 members ready, should fail
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+			},
+			1,
+			false,
+		},
+		{
+			// 0/3 members ready, should fail
+			[]*Member{
+				newTestMember(1, nil, "", nil),
+				newTestMember(2, nil, "", nil),
+				newTestMember(3, nil, "", nil),
+			},
+			1,
+			false,
+		},
+		{
+			// 1/2 members ready, should be fine to remove unstarted member
+			// (iReadyToRemoveMember() logic should return success, but operation itself would fail)
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "", nil),
+			},
+			2,
+			true,
+		},
+		{
+			// 2/3 members ready, should fail
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "2", nil),
+				newTestMember(3, nil, "", nil),
+			},
+			2,
+			false,
+		},
+		{
+			// 3/3 members ready, should be fine to remove one member and retain quorum
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "2", nil),
+				newTestMember(3, nil, "3", nil),
+			},
+			3,
+			true,
+		},
+		{
+			// 3/4 members ready, should be fine to remove one member
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "2", nil),
+				newTestMember(3, nil, "3", nil),
+				newTestMember(4, nil, "", nil),
+			},
+			3,
+			true,
+		},
+		{
+			// 3/4 members ready, should be fine to remove unstarted member
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "2", nil),
+				newTestMember(3, nil, "3", nil),
+				newTestMember(4, nil, "", nil),
+			},
+			4,
+			true,
+		},
+	}
+	for i, tt := range tests {
+		c := newTestCluster(tt.members)
+		if got := c.isReadyToRemoveMember(tt.removeID); got != tt.want {
+			t.Errorf("%d: isReadyToAddNewMember returned %t, want %t", i, got, tt.want)
+		}
+	}
+}

+ 6 - 0
etcdserver/server.go

@@ -635,6 +635,12 @@ func (s *EtcdServer) AddMember(ctx context.Context, memb Member) error {
 }
 }
 
 
 func (s *EtcdServer) RemoveMember(ctx context.Context, id uint64) error {
 func (s *EtcdServer) RemoveMember(ctx context.Context, id uint64) error {
+	if s.cfg.StrictReconfigCheck && !s.cluster.isReadyToRemoveMember(id) {
+		// If s.cfg.StrictReconfigCheck is false, it means the option -strict-reconfig-check isn't passed to etcd.
+		// In such a case removing a member is allowed unconditionally
+		return ErrNotEnoughStartedMembers
+	}
+
 	cc := raftpb.ConfChange{
 	cc := raftpb.ConfChange{
 		Type:   raftpb.ConfChangeRemoveNode,
 		Type:   raftpb.ConfChangeRemoveNode,
 		NodeID: id,
 		NodeID: id,

+ 3 - 1
etcdserver/server_test.go

@@ -451,6 +451,7 @@ func TestApplyConfChangeError(t *testing.T) {
 		srv := &EtcdServer{
 		srv := &EtcdServer{
 			r:       raftNode{Node: n},
 			r:       raftNode{Node: n},
 			cluster: cl,
 			cluster: cl,
+			cfg:     &ServerConfig{},
 		}
 		}
 		_, err := srv.applyConfChange(tt.cc, nil)
 		_, err := srv.applyConfChange(tt.cc, nil)
 		if err != tt.werr {
 		if err != tt.werr {
@@ -863,10 +864,10 @@ func TestAddMember(t *testing.T) {
 			storage:     &storageRecorder{},
 			storage:     &storageRecorder{},
 			transport:   &nopTransporter{},
 			transport:   &nopTransporter{},
 		},
 		},
+		cfg:      &ServerConfig{},
 		store:    st,
 		store:    st,
 		cluster:  cl,
 		cluster:  cl,
 		reqIDGen: idutil.NewGenerator(0, time.Time{}),
 		reqIDGen: idutil.NewGenerator(0, time.Time{}),
-		cfg:      &ServerConfig{},
 	}
 	}
 	s.start()
 	s.start()
 	m := Member{ID: 1234, RaftAttributes: RaftAttributes{PeerURLs: []string{"foo"}}}
 	m := Member{ID: 1234, RaftAttributes: RaftAttributes{PeerURLs: []string{"foo"}}}
@@ -903,6 +904,7 @@ func TestRemoveMember(t *testing.T) {
 			storage:     &storageRecorder{},
 			storage:     &storageRecorder{},
 			transport:   &nopTransporter{},
 			transport:   &nopTransporter{},
 		},
 		},
+		cfg:      &ServerConfig{},
 		store:    st,
 		store:    st,
 		cluster:  cl,
 		cluster:  cl,
 		reqIDGen: idutil.NewGenerator(0, time.Time{}),
 		reqIDGen: idutil.NewGenerator(0, time.Time{}),