Browse Source

Merge pull request #2453 from yichengq/334

tools/etcd-tester: add kill one member tests
Yicheng Qin 10 years ago
parent
commit
9a9d00b482

+ 30 - 2
tools/functional-tester/etcd-tester/cluster.go

@@ -32,6 +32,7 @@ type cluster struct {
 
 	Size       int
 	Agents     []client.Agent
+	Stressers  []Stresser
 	Names      []string
 	ClientURLs []string
 }
@@ -98,8 +99,19 @@ func (c *cluster) Bootstrap() error {
 		}
 	}
 
+	stressers := make([]Stresser, len(clientURLs))
+	for i, u := range clientURLs {
+		s := &stresser{
+			Endpoint: u,
+			N:        200,
+		}
+		go s.Stress()
+		stressers[i] = s
+	}
+
 	c.Size = size
 	c.Agents = agents
+	c.Stressers = stressers
 	c.Names = names
 	c.ClientURLs = clientURLs
 	return nil
@@ -117,19 +129,35 @@ func (c *cluster) WaitHealth() error {
 	return err
 }
 
+func (c *cluster) Report() (success, failure int) {
+	for _, stress := range c.Stressers {
+		s, f := stress.Report()
+		success += s
+		failure += f
+	}
+	return
+}
+
 func (c *cluster) Cleanup() error {
+	var lasterr error
 	for _, a := range c.Agents {
 		if err := a.Cleanup(); err != nil {
-			return err
+			lasterr = err
 		}
 	}
-	return nil
+	for _, s := range c.Stressers {
+		s.Cancel()
+	}
+	return lasterr
 }
 
 func (c *cluster) Terminate() {
 	for _, a := range c.Agents {
 		a.Terminate()
 	}
+	for _, s := range c.Stressers {
+		s.Cancel()
+	}
 }
 
 // setHealthKey sets health key on all given urls.

+ 144 - 0
tools/functional-tester/etcd-tester/failure.go

@@ -14,6 +14,14 @@
 
 package main
 
+import (
+	"fmt"
+	"math/rand"
+	"time"
+)
+
+const snapshotCount = 10000
+
 type failure interface {
 	// Inject injects the failure into the testing cluster at the given
 	// round. When calling the function, the cluster should be healthy.
@@ -28,3 +36,139 @@ type failure interface {
 type description string
 
 func (d description) Desc() string { return string(d) }
+
+type failureKillAll struct {
+	description
+}
+
+func newFailureKillAll() *failureKillAll {
+	return &failureKillAll{
+		description: "kill all members",
+	}
+}
+
+func (f *failureKillAll) Inject(c *cluster, round int) error {
+	for _, a := range c.Agents {
+		if err := a.Stop(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (f *failureKillAll) Recover(c *cluster, round int) error {
+	for _, a := range c.Agents {
+		if _, err := a.Restart(); err != nil {
+			return err
+		}
+	}
+	return c.WaitHealth()
+}
+
+type failureKillMajority struct {
+	description
+}
+
+func newFailureKillMajority() *failureKillMajority {
+	return &failureKillMajority{
+		description: "kill majority of the cluster",
+	}
+}
+
+func (f *failureKillMajority) Inject(c *cluster, round int) error {
+	for i := range getToKillMap(c.Size, round) {
+		if err := c.Agents[i].Stop(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (f *failureKillMajority) Recover(c *cluster, round int) error {
+	for i := range getToKillMap(c.Size, round) {
+		if _, err := c.Agents[i].Restart(); err != nil {
+			return err
+		}
+	}
+	return c.WaitHealth()
+}
+
+func getToKillMap(size int, seed int) map[int]bool {
+	m := make(map[int]bool)
+	r := rand.New(rand.NewSource(int64(seed)))
+	majority := size/2 + 1
+	for {
+		m[r.Intn(size)] = true
+		if len(m) >= majority {
+			return m
+		}
+	}
+}
+
+type failureKillOne struct {
+	description
+}
+
+func newFailureKillOne() *failureKillOne {
+	return &failureKillOne{
+		description: "kill one random member",
+	}
+}
+
+func (f *failureKillOne) Inject(c *cluster, round int) error {
+	i := round % c.Size
+	return c.Agents[i].Stop()
+}
+
+func (f *failureKillOne) Recover(c *cluster, round int) error {
+	i := round % c.Size
+	if _, err := c.Agents[i].Restart(); err != nil {
+		return err
+	}
+	return c.WaitHealth()
+}
+
+// failureKillOneForLongTime kills one member for a long time, and restarts
+// it only after the cluster has committed enough requests that the member
+// is expected to recover from an incoming snapshot.
+type failureKillOneForLongTime struct {
+	description
+}
+
+func newFailureKillOneForLongTime() *failureKillOneForLongTime {
+	return &failureKillOneForLongTime{
+		description: "kill one member for long time and expect it to recover from incoming snapshot",
+	}
+}
+
+func (f *failureKillOneForLongTime) Inject(c *cluster, round int) error {
+	i := round % c.Size
+	if err := c.Agents[i].Stop(); err != nil {
+		return err
+	}
+	if c.Size >= 3 {
+		start, _ := c.Report()
+		var end int
+		// A normal healthy cluster can accept at least 1000 req/s.
+		// Allow it three times the expected duration to create a new snapshot.
+		retry := snapshotCount / 1000 * 3
+		for j := 0; j < retry; j++ {
+			end, _ = c.Report()
+			// If the number of proposals committed is bigger than snapshot count,
+			// a new snapshot should have been created.
+			if end-start > snapshotCount {
+				return nil
+			}
+			time.Sleep(time.Second)
+		}
+		return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry)
+	}
+	return nil
+}
+
+func (f *failureKillOneForLongTime) Recover(c *cluster, round int) error {
+	i := round % c.Size
+	if _, err := c.Agents[i].Restart(); err != nil {
+		return err
+	}
+	return c.WaitHealth()
+}

+ 0 - 43
tools/functional-tester/etcd-tester/failure_killall.go

@@ -1,43 +0,0 @@
-// Copyright 2015 CoreOS, Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-type failureKillAll struct {
-	description
-}
-
-func newFailureKillAll() *failureKillAll {
-	return &failureKillAll{
-		description: "kill all members",
-	}
-}
-
-func (f *failureKillAll) Inject(c *cluster, round int) error {
-	for _, a := range c.Agents {
-		if err := a.Stop(); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-func (f *failureKillAll) Recover(c *cluster, round int) error {
-	for _, a := range c.Agents {
-		if _, err := a.Restart(); err != nil {
-			return err
-		}
-	}
-	return c.WaitHealth()
-}

+ 0 - 57
tools/functional-tester/etcd-tester/failure_killmaj.go

@@ -1,57 +0,0 @@
-// Copyright 2015 CoreOS, Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import "math/rand"
-
-type failureKillMajority struct {
-	description
-}
-
-func newFailureKillMajority() *failureKillMajority {
-	return &failureKillMajority{
-		description: "kill majority of the cluster",
-	}
-}
-
-func (f *failureKillMajority) Inject(c *cluster, round int) error {
-	for i := range getToKillMap(c.Size, round) {
-		if err := c.Agents[i].Stop(); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-func (f *failureKillMajority) Recover(c *cluster, round int) error {
-	for i := range getToKillMap(c.Size, round) {
-		if _, err := c.Agents[i].Restart(); err != nil {
-			return err
-		}
-	}
-	return c.WaitHealth()
-}
-
-func getToKillMap(size int, seed int) map[int]bool {
-	m := make(map[int]bool)
-	r := rand.New(rand.NewSource(int64(seed)))
-	majority := size/2 + 1
-	for {
-		m[r.Intn(size)] = true
-		if len(m) >= majority {
-			return m
-		}
-	}
-}

+ 0 - 29
tools/functional-tester/etcd-tester/failure_no.go

@@ -1,29 +0,0 @@
-// Copyright 2015 CoreOS, Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-type failureBase struct {
-	description
-}
-
-func newFailureBase() *failureBase {
-	return &failureBase{
-		description: "do nothing",
-	}
-}
-
-func (f *failureBase) Inject(c *cluster, round int) error { return nil }
-
-func (f *failureBase) Recover(c *cluster, round int) error { return nil }

+ 8 - 17
tools/functional-tester/etcd-tester/main.go

@@ -33,24 +33,15 @@ func main() {
 	}
 	defer c.Terminate()
 
-	stressers := make([]Stresser, len(c.ClientURLs))
-	for i, u := range c.ClientURLs {
-		s := &stresser{
-			Endpoint: u,
-			N:        200,
-		}
-		go s.Stress()
-		stressers[i] = s
-	}
-
 	t := &tester{
-		failures: []failure{newFailureBase(), newFailureKillAll(), newFailureKillMajority()},
-		cluster:  c,
-		limit:    *limit,
+		failures: []failure{
+			newFailureKillAll(),
+			newFailureKillMajority(),
+			newFailureKillOne(),
+			newFailureKillOneForLongTime(),
+		},
+		cluster: c,
+		limit:   *limit,
 	}
 	t.runLoop()
-
-	for _, s := range stressers {
-		s.Cancel()
-	}
 }

+ 2 - 1
tools/functional-tester/etcd-tester/stresser.go

@@ -65,8 +65,9 @@ func (s *stresser) Stress() error {
 				s.mu.Lock()
 				if err != nil {
 					s.failure++
+				} else {
+					s.success++
 				}
-				s.success++
 				s.mu.Unlock()
 			}
 		}()