Browse Source

Merge pull request #3479 from mitake/membership

etcdserver: avoid deadlock caused by adding members with wrong peer URLs
Yicheng Qin 10 years ago
parent
commit
0ca800fbac

+ 2 - 0
etcdmain/config.go

@@ -95,6 +95,7 @@ type config struct {
 	fallback            *flags.StringsFlag
 	initialCluster      string
 	initialClusterToken string
+	strictReconfigCheck bool
 
 	// proxy
 	proxy                  *flags.StringsFlag
@@ -177,6 +178,7 @@ func NewConfig() *config {
 		// Should never happen.
 		plog.Panicf("unexpected error setting up clusterStateFlag: %v", err)
 	}
+	fs.BoolVar(&cfg.strictReconfigCheck, "strict-reconfig-check", false, "Reject reconfiguration that might cause quorum loss")
 
 	// proxy
 	fs.Var(cfg.proxy, "proxy", fmt.Sprintf("Valid values include %s", strings.Join(cfg.proxy.Values, ", ")))

+ 1 - 0
etcdmain/etcd.go

@@ -265,6 +265,7 @@ func startEtcd(cfg *config) (<-chan struct{}, error) {
 		TickMs:              cfg.TickMs,
 		ElectionTicks:       cfg.electionTicks(),
 		V3demo:              cfg.v3demo,
+		StrictReconfigCheck: cfg.strictReconfigCheck,
 	}
 	var s *etcdserver.EtcdServer
 	s, err = etcdserver.NewServer(srvcfg)

+ 28 - 0
etcdserver/cluster.go

@@ -368,6 +368,34 @@ func (c *cluster) SetVersion(ver *semver.Version) {
 	MustDetectDowngrade(c.version)
 }
 
+// isReadyToAddNewMember reports whether the cluster can accept one more
+// member without risking quorum loss. The prospective member is counted up
+// front (nmembers starts at 1), so the quorum test is computed against the
+// size of the cluster *after* the addition.
+func (c *cluster) isReadyToAddNewMember() bool {
+	nmembers := 1 // pre-count the member that is about to be added
+	nstarted := 0
+
+	// Tally started members; every registered member (started or not)
+	// contributes to the post-addition cluster size.
+	for _, member := range c.members {
+		if member.IsStarted() {
+			nstarted++
+		}
+		nmembers++
+	}
+
+	if nstarted == 1 && nmembers == 2 {
+		// The case of adding a new node to a 1-member cluster for restoring
+		// cluster data. Allowed unconditionally, since the strict quorum check
+		// below would otherwise block this documented recovery procedure.
+		// https://github.com/coreos/etcd/blob/master/Documentation/admin_guide.md#restoring-the-cluster
+
+		plog.Debugf("The number of started member is 1. This cluster can accept add member request.")
+		return true
+	}
+
+	// Reject if the started members would fall below the majority of the
+	// enlarged cluster — the add-member entry could never be committed.
+	nquorum := nmembers/2 + 1
+	if nstarted < nquorum {
+		plog.Warningf("Reject add member request: the number of started member (%d) will be less than the quorum number of the cluster (%d)", nstarted, nquorum)
+		return false
+	}
+
+	return true
+}
+
 func membersFromStore(st store.Store) (map[types.ID]*Member, map[types.ID]bool) {
 	members := make(map[types.ID]*Member)
 	removed := make(map[types.ID]bool)

+ 80 - 0
etcdserver/cluster_test.go

@@ -566,3 +566,83 @@ func newTestCluster(membs []*Member) *cluster {
 }
 
 func stringp(s string) *string { return &s }
+
+// TestIsReadyToAddNewMember exercises the quorum-safety check for member
+// addition across cluster sizes: clusters with too few started members must
+// be rejected, healthy clusters accepted, and the single-started-member
+// restore scenario allowed as a special case. A member is "started" here
+// when newTestMember is given a non-empty name.
+func TestIsReadyToAddNewMember(t *testing.T) {
+	tests := []struct {
+		members []*Member
+		want    bool
+	}{
+		{
+			// 0/3 members ready, should fail
+			[]*Member{
+				newTestMember(1, nil, "", nil),
+				newTestMember(2, nil, "", nil),
+				newTestMember(3, nil, "", nil),
+			},
+			false,
+		},
+		{
+			// 1/2 members ready, should fail
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "", nil),
+			},
+			false,
+		},
+		{
+			// 1/3 members ready, should fail
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "", nil),
+				newTestMember(3, nil, "", nil),
+			},
+			false,
+		},
+		{
+			// 1/1 members ready, should succeed (special case of 1-member cluster for recovery)
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+			},
+			true,
+		},
+		{
+			// 2/3 members ready, should fail
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "2", nil),
+				newTestMember(3, nil, "", nil),
+			},
+			false,
+		},
+		{
+			// 3/3 members ready, should be fine to add one member and retain quorum
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "2", nil),
+				newTestMember(3, nil, "3", nil),
+			},
+			true,
+		},
+		{
+			// 3/4 members ready, should be fine to add one member and retain quorum
+			[]*Member{
+				newTestMember(1, nil, "1", nil),
+				newTestMember(2, nil, "2", nil),
+				newTestMember(3, nil, "3", nil),
+				newTestMember(4, nil, "", nil),
+			},
+			true,
+		},
+		{
+			// empty cluster, it is impossible but should fail
+			[]*Member{},
+			false,
+		},
+	}
+	for i, tt := range tests {
+		c := newTestCluster(tt.members)
+		if got := c.isReadyToAddNewMember(); got != tt.want {
+			t.Errorf("%d: isReadyToAddNewMember returned %t, want %t", i, got, tt.want)
+		}
+	}
+}

+ 2 - 0
etcdserver/config.go

@@ -49,6 +49,8 @@ type ServerConfig struct {
 	ElectionTicks int
 
 	V3demo bool
+
+	StrictReconfigCheck bool
 }
 
 // VerifyBootstrapConfig sanity-checks the initial config for bootstrap case

+ 1 - 0
etcdserver/errors.go

@@ -31,6 +31,7 @@ var (
 	ErrTimeout                    = errors.New("etcdserver: request timed out")
 	ErrTimeoutDueToLeaderFail     = errors.New("etcdserver: request timed out, possibly due to previous leader failure")
 	ErrTimeoutDueToConnectionLost = errors.New("etcdserver: request timed out, possibly due to connection lost")
+	ErrNotEnoughStartedMembers    = errors.New("etcdserver: re-configuration failed due to not enough started members")
 )
 
 func isKeyNotFound(err error) bool {

+ 4 - 0
etcdserver/member.go

@@ -105,6 +105,10 @@ func (m *Member) Clone() *Member {
 	return mm
 }
 
+// IsStarted reports whether the member is considered started, using a
+// non-empty Name as the indicator. NOTE(review): this presumes Name is set
+// only once a member has started — confirmed by how the tests above mark
+// started members, but worth verifying against where Name is assigned.
+func (m *Member) IsStarted() bool {
+	return len(m.Name) != 0
+}
+
 func memberStoreKey(id types.ID) string {
 	return path.Join(storeMembersPrefix, id.String())
 }

+ 6 - 0
etcdserver/server.go

@@ -603,6 +603,12 @@ func (s *EtcdServer) LeaderStats() []byte {
 func (s *EtcdServer) StoreStats() []byte { return s.store.JsonStats() }
 
 func (s *EtcdServer) AddMember(ctx context.Context, memb Member) error {
+	if s.cfg.StrictReconfigCheck && !s.cluster.isReadyToAddNewMember() {
+		// If s.cfg.StrictReconfigCheck is false, it means the option -strict-reconfig-check isn't passed to etcd.
+		// In such a case adding a new member is allowed unconditionally
+		return ErrNotEnoughStartedMembers
+	}
+
 	// TODO: move Member to protobuf type
 	b, err := json.Marshal(memb)
 	if err != nil {

+ 1 - 0
etcdserver/server_test.go

@@ -866,6 +866,7 @@ func TestAddMember(t *testing.T) {
 		store:    st,
 		cluster:  cl,
 		reqIDGen: idutil.NewGenerator(0, time.Time{}),
+		cfg:      &ServerConfig{},
 	}
 	s.start()
 	m := Member{ID: 1234, RaftAttributes: RaftAttributes{PeerURLs: []string{"foo"}}}