소스 검색

functional-tester/tester: refactor "Failure" to support liveness mode

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
Gyuho Lee 7 년 전
부모
커밋
b3fea7ed53

+ 68 - 27
tools/functional-tester/tester/failure.go

@@ -18,6 +18,8 @@ import (
 	"fmt"
 	"math/rand"
 	"time"
+
+	"github.com/coreos/etcd/tools/functional-tester/rpcpb"
 )
 
 // Failure defines failure injection interface.
@@ -33,28 +35,32 @@ type Failure interface {
 	Recover(clus *Cluster) error
 	// Desc returns a description of the failure
 	Desc() string
+	// FailureCase returns "rpcpb.FailureCase" enum type.
+	FailureCase() rpcpb.FailureCase
 }
 
-type description string
-
-func (d description) Desc() string { return string(d) }
-
 type injectMemberFunc func(*Cluster, int) error
 type recoverMemberFunc func(*Cluster, int) error
 
+// failureByFunc holds what a function-driven failure needs: an optional
+// description, the failure case it represents, and the callbacks that inject
+// into or recover one member (both take the cluster and an int index).
 type failureByFunc struct {
-	description
+	desc
+	failureCase   rpcpb.FailureCase
 	injectMember  injectMemberFunc
 	recoverMember recoverMemberFunc
 }
 
-type failureFollower struct {
-	failureByFunc
-	last int
-	lead int
+// Desc returns the explicitly configured description when it is non-empty,
+// and otherwise falls back to the string form of the failure case.
+func (f *failureByFunc) Desc() string {
+	if string(f.desc) != "" {
+		return string(f.desc)
+	}
+	return f.failureCase.String()
 }
 
-type failureLeader struct {
+// FailureCase returns the "rpcpb.FailureCase" enum value stored on f.
+func (f *failureByFunc) FailureCase() rpcpb.FailureCase {
+	return f.failureCase
+}
+
+type failureFollower struct {
 	failureByFunc
 	last int
 	lead int
@@ -82,22 +88,6 @@ func (f *failureFollower) updateIndex(clus *Cluster) error {
 	return nil
 }
 
-func (f *failureLeader) updateIndex(clus *Cluster) error {
-	idx, err := clus.GetLeader()
-	if err != nil {
-		return err
-	}
-	f.lead = idx
-	f.last = idx
-	return nil
-}
-
-type failureQuorum failureByFunc
-type failureAll failureByFunc
-
-// failureUntilSnapshot injects a failure and waits for a snapshot event
-type failureUntilSnapshot struct{ Failure }
-
 func (f *failureFollower) Inject(clus *Cluster) error {
 	if err := f.updateIndex(clus); err != nil {
 		return err
@@ -109,6 +99,24 @@ func (f *failureFollower) Recover(clus *Cluster) error {
 	return f.recoverMember(clus, f.last)
 }
 
+// FailureCase returns the "rpcpb.FailureCase" enum value stored on f.
+func (f *failureFollower) FailureCase() rpcpb.FailureCase { return f.failureCase }
+
+// failureLeader is a failureByFunc aimed at the member index reported by
+// "clus.GetLeader"; updateIndex stores that index in both last and lead.
+type failureLeader struct {
+	failureByFunc
+	last int
+	lead int
+}
+
+// updateIndex asks the cluster for its leader index and records it in both
+// f.lead and f.last. An error from "clus.GetLeader" is returned unchanged
+// and leaves both fields untouched.
+func (f *failureLeader) updateIndex(clus *Cluster) error {
+	idx, err := clus.GetLeader()
+	if err != nil {
+		return err
+	}
+	f.lead = idx
+	f.last = idx
+	return nil
+}
+
 func (f *failureLeader) Inject(clus *Cluster) error {
 	if err := f.updateIndex(clus); err != nil {
 		return err
@@ -120,6 +128,12 @@ func (f *failureLeader) Recover(clus *Cluster) error {
 	return f.recoverMember(clus, f.last)
 }
 
+// FailureCase returns the "rpcpb.FailureCase" enum value stored on f.
+func (f *failureLeader) FailureCase() rpcpb.FailureCase {
+	return f.failureCase
+}
+
+// failureQuorum is failureByFunc under a distinct type name. Its Inject
+// applies injectMember to every member index yielded by
+// "killMap(len(clus.Members), clus.rd)".
+type failureQuorum failureByFunc
+
 func (f *failureQuorum) Inject(clus *Cluster) error {
 	for i := range killMap(len(clus.Members), clus.rd) {
 		if err := f.injectMember(clus, i); err != nil {
@@ -138,6 +152,10 @@ func (f *failureQuorum) Recover(clus *Cluster) error {
 	return nil
 }
 
+// FailureCase returns the "rpcpb.FailureCase" enum value stored on f.
+func (f *failureQuorum) FailureCase() rpcpb.FailureCase { return f.failureCase }
+
+// Desc returns the explicit description when one is set and otherwise the
+// string form of the failure case. "failureQuorum" is a distinct defined
+// type, so it does not get "(*failureByFunc).Desc"; without this method only
+// the promoted "desc.Desc" applies, which is empty for cases that set only
+// "failureCase".
+func (f *failureQuorum) Desc() string {
+	if f.desc.Desc() != "" {
+		return f.desc.Desc()
+	}
+	return f.failureCase.String()
+}
+
+// failureAll is failureByFunc under a distinct type name. Its Inject applies
+// injectMember to every index of "clus.Members".
+type failureAll failureByFunc
+
 func (f *failureAll) Inject(clus *Cluster) error {
 	for i := range clus.Members {
 		if err := f.injectMember(clus, i); err != nil {
@@ -156,6 +174,18 @@ func (f *failureAll) Recover(clus *Cluster) error {
 	return nil
 }
 
+// FailureCase returns the "rpcpb.FailureCase" enum value stored on f.
+func (f *failureAll) FailureCase() rpcpb.FailureCase {
+	return f.failureCase
+}
+
+// Desc returns the explicit description when one is set and otherwise the
+// string form of the failure case. "failureAll" is a distinct defined type,
+// so it does not get "(*failureByFunc).Desc"; without this method only the
+// promoted "desc.Desc" applies, which is empty for cases that set only
+// "failureCase".
+func (f *failureAll) Desc() string {
+	if f.desc.Desc() != "" {
+		return f.desc.Desc()
+	}
+	return f.failureCase.String()
+}
+
+// failureUntilSnapshot injects a failure and waits for a snapshot event.
+// It wraps another Failure; desc, when non-empty, is what Desc reports, and
+// failureCase is what FailureCase reports.
+type failureUntilSnapshot struct {
+	desc        desc
+	failureCase rpcpb.FailureCase
+
+	Failure
+}
+
 const snapshotCount = 10000
 
 func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
@@ -190,7 +220,14 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
 }
 
+// Desc returns the explicit description when one is set; otherwise it is the
+// failure case name followed by " (to trigger snapshot)".
 func (f *failureUntilSnapshot) Desc() string {
-	return f.Failure.Desc() + " for a long time and expect it to recover from an incoming snapshot"
+	if f.desc.Desc() != "" {
+		return f.desc.Desc()
+	}
+	return f.failureCase.String() + " (to trigger snapshot)"
+}
+
+// FailureCase returns the failure case stored on the wrapper itself.
+func (f *failureUntilSnapshot) FailureCase() rpcpb.FailureCase {
+	return f.failureCase
 }
 
 func killMap(size int, seed int) map[int]bool {
@@ -204,3 +241,7 @@ func killMap(size int, seed int) map[int]bool {
 		}
 	}
 }
+
+// desc is a plain string description.
+type desc string
+
+// Desc returns d converted to a string.
+func (d desc) Desc() string { return string(d) }

+ 15 - 4
tools/functional-tester/tester/failure_case_external.go

@@ -17,13 +17,17 @@ package tester
 import (
 	"fmt"
 	"os/exec"
+
+	"github.com/coreos/etcd/tools/functional-tester/rpcpb"
 )
 
+// failureExternal is a failure driven by the script at scriptPath. desc and
+// failureCase are the values returned by Desc and FailureCase.
 type failureExternal struct {
 	Failure
 
-	description string
-	scriptPath  string
+	desc        string
+	failureCase rpcpb.FailureCase
+
+	scriptPath string
 }
 
 func (f *failureExternal) Inject(clus *Cluster) error {
@@ -34,11 +38,18 @@ func (f *failureExternal) Recover(clus *Cluster) error {
 	return exec.Command(f.scriptPath, "disable", fmt.Sprintf("%d", clus.rd)).Run()
 }
 
-func (f *failureExternal) Desc() string { return f.description }
+// Desc returns the description stored on f.
+func (f *failureExternal) Desc() string {
+	return f.desc
+}
+
+// FailureCase returns the "rpcpb.FailureCase" enum value stored on f.
+func (f *failureExternal) FailureCase() rpcpb.FailureCase {
+	return f.failureCase
+}
 
+// newFailureExternal builds an external-script failure for scriptPath, tagged
+// with "rpcpb.FailureCase_EXTERNAL" and described with the quoted script path.
 func newFailureExternal(scriptPath string) Failure {
 	return &failureExternal{
-		description: fmt.Sprintf("external fault injector (script: %q)", scriptPath),
+		desc:        fmt.Sprintf("external fault injector (script: %q)", scriptPath),
+		failureCase: rpcpb.FailureCase_EXTERNAL,
 		scriptPath:  scriptPath,
 	}
 }

+ 21 - 6
tools/functional-tester/tester/failure_case_failpoints.go

@@ -21,6 +21,8 @@ import (
 	"strings"
 	"sync"
 	"time"
+
+	"github.com/coreos/etcd/tools/functional-tester/rpcpb"
 )
 
 type failpointStats struct {
@@ -42,14 +44,23 @@ func failpointFailures(clus *Cluster) (ret []Failure, err error) {
 		if len(fp) == 0 {
 			continue
 		}
+
 		fpFails := failuresFromFailpoint(fp, clus.Tester.FailpointCommands)
+
 		// wrap in delays so failpoint has time to trigger
 		for i, fpf := range fpFails {
 			if strings.Contains(fp, "Snap") {
 				// hack to trigger snapshot failpoints
-				fpFails[i] = &failureUntilSnapshot{fpf}
+				fpFails[i] = &failureUntilSnapshot{
+					desc:        desc(fpf.Desc()),
+					failureCase: rpcpb.FailureCase_FAILPOINTS,
+					Failure:     fpf,
+				}
 			} else {
-				fpFails[i] = &failureDelay{fpf, 3 * time.Second}
+				fpFails[i] = &failureDelay{
+					Failure:       fpf,
+					delayDuration: 3 * time.Second,
+				}
 			}
 		}
 		ret = append(ret, fpFails...)
@@ -85,7 +96,8 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure)
 		fs = append(fs, []Failure{
 			&failureFollower{
 				failureByFunc: failureByFunc{
-					description:   description(fmt.Sprintf("failpoint %s (one: %s)", fp, fcmd)),
+					desc:          desc(fmt.Sprintf("failpoint %q (one: %q)", fp, fcmd)),
+					failureCase:   rpcpb.FailureCase_FAILPOINTS,
 					injectMember:  inject,
 					recoverMember: recov,
 				},
@@ -94,7 +106,8 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure)
 			},
 			&failureLeader{
 				failureByFunc: failureByFunc{
-					description:   description(fmt.Sprintf("failpoint %s (leader: %s)", fp, fcmd)),
+					desc:          desc(fmt.Sprintf("failpoint %q (leader: %q)", fp, fcmd)),
+					failureCase:   rpcpb.FailureCase_FAILPOINTS,
 					injectMember:  inject,
 					recoverMember: recov,
 				},
@@ -102,12 +115,14 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure)
 				lead: -1,
 			},
 			&failureQuorum{
-				description:   description(fmt.Sprintf("failpoint %s (quorum: %s)", fp, fcmd)),
+				desc:          desc(fmt.Sprintf("failpoint %q (quorum: %q)", fp, fcmd)),
+				failureCase:   rpcpb.FailureCase_FAILPOINTS,
 				injectMember:  inject,
 				recoverMember: recov,
 			},
 			&failureAll{
-				description:   description(fmt.Sprintf("failpoint %s (all: %s)", fp, fcmd)),
+				desc:          desc(fmt.Sprintf("failpoint %q (all: %q)", fp, fcmd)),
+				failureCase:   rpcpb.FailureCase_FAILPOINTS,
 				injectMember:  inject,
 				recoverMember: recov,
 			},

+ 12 - 6
tools/functional-tester/tester/failure_case_kill.go

@@ -26,7 +26,7 @@ func recoverKill(clus *Cluster, idx int) error {
 
 func newFailureKillOneFollower() Failure {
 	ff := failureByFunc{
-		description:   "kill one follower",
+		failureCase:   rpcpb.FailureCase_KILL_ONE_FOLLOWER,
 		injectMember:  injectKill,
 		recoverMember: recoverKill,
 	}
@@ -35,7 +35,7 @@ func newFailureKillOneFollower() Failure {
 
 func newFailureKillLeader() Failure {
 	ff := failureByFunc{
-		description:   "kill leader",
+		failureCase:   rpcpb.FailureCase_KILL_LEADER,
 		injectMember:  injectKill,
 		recoverMember: recoverKill,
 	}
@@ -44,7 +44,7 @@ func newFailureKillLeader() Failure {
 
 func newFailureKillQuorum() Failure {
 	return &failureQuorum{
-		description:   "kill quorum",
+		failureCase:   rpcpb.FailureCase_KILL_QUORUM,
 		injectMember:  injectKill,
 		recoverMember: recoverKill,
 	}
@@ -52,16 +52,22 @@ func newFailureKillQuorum() Failure {
 
+// newFailureKillAll returns a "failureAll" for the KILL_ALL case that uses
+// injectKill and recoverKill as its per-member callbacks.
 func newFailureKillAll() Failure {
 	return &failureAll{
-		description:   "kill all",
+		failureCase:   rpcpb.FailureCase_KILL_ALL,
 		injectMember:  injectKill,
 		recoverMember: recoverKill,
 	}
 }
 
+// newFailureKillOneFollowerForLongTime wraps the kill-one-follower failure in
+// a "failureUntilSnapshot" tagged KILL_ONE_FOLLOWER_FOR_LONG.
 func newFailureKillOneFollowerForLongTime() Failure {
-	return &failureUntilSnapshot{newFailureKillOneFollower()}
+	return &failureUntilSnapshot{
+		failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER_FOR_LONG,
+		Failure:     newFailureKillOneFollower(),
+	}
 }
 
+// newFailureKillLeaderForLongTime wraps the kill-leader failure in a
+// "failureUntilSnapshot" tagged KILL_LEADER_FOR_LONG.
 func newFailureKillLeaderForLongTime() Failure {
-	return &failureUntilSnapshot{newFailureKillLeader()}
+	return &failureUntilSnapshot{
+		failureCase: rpcpb.FailureCase_KILL_LEADER_FOR_LONG,
+		Failure:     newFailureKillLeader(),
+	}
 }

+ 3 - 3
tools/functional-tester/tester/failure_case_network_blackhole.go

@@ -26,7 +26,7 @@ func recoverBlackholePeerPortTxRx(clus *Cluster, idx int) error {
 
 func newFailureBlackholePeerPortTxRxOneFollower() Failure {
 	ff := failureByFunc{
-		description:   "blackhole peer port on one follower",
+		failureCase:   rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER,
 		injectMember:  injectBlackholePeerPortTxRx,
 		recoverMember: recoverBlackholePeerPortTxRx,
 	}
@@ -39,7 +39,7 @@ func newFailureBlackholePeerPortTxRxOneFollower() Failure {
 
 func newFailureBlackholePeerPortTxRxLeader() Failure {
 	ff := failureByFunc{
-		description:   "blackhole peer port on leader",
+		failureCase:   rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER,
 		injectMember:  injectBlackholePeerPortTxRx,
 		recoverMember: recoverBlackholePeerPortTxRx,
 	}
@@ -52,7 +52,7 @@ func newFailureBlackholePeerPortTxRxLeader() Failure {
 
 func newFailureBlackholePeerPortTxRxAll() Failure {
 	f := &failureAll{
-		description:   "blackhole peer port on all",
+		failureCase:   rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL,
 		injectMember:  injectBlackholePeerPortTxRx,
 		recoverMember: recoverBlackholePeerPortTxRx,
 	}

+ 3 - 7
tools/functional-tester/tester/failure_case_network_slow.go

@@ -15,7 +15,6 @@
 package tester
 
 import (
-	"fmt"
 	"time"
 
 	"github.com/coreos/etcd/tools/functional-tester/rpcpb"
@@ -43,9 +42,8 @@ func recoverDelayPeerPortTxRx(clus *Cluster, idx int) error {
 }
 
 func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure {
-	desc := fmt.Sprintf("delay follower peer port by %d ms", clus.Tester.DelayLatencyMs)
 	ff := failureByFunc{
-		description:   description(desc),
+		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER,
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
@@ -57,9 +55,8 @@ func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure {
 }
 
 func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure {
-	desc := fmt.Sprintf("delay leader peer port by %d ms", clus.Tester.DelayLatencyMs)
 	ff := failureByFunc{
-		description:   description(desc),
+		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER,
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
@@ -71,9 +68,8 @@ func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure {
 }
 
 func newFailureDelayPeerPortTxRxAll(clus *Cluster) Failure {
-	desc := fmt.Sprintf("delay all peer port by %d ms", clus.Tester.DelayLatencyMs)
 	f := &failureAll{
-		description:   description(desc),
+		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ALL,
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}

+ 15 - 4
tools/functional-tester/tester/failure_case_no_op.go

@@ -14,13 +14,24 @@
 
 package tester
 
+import (
+	"time"
+
+	"github.com/coreos/etcd/tools/functional-tester/rpcpb"
+)
+
 type failureNoOp failureByFunc
 
-func (f *failureNoOp) Inject(clus *Cluster) error  { return nil }
-func (f *failureNoOp) Recover(clus *Cluster) error { return nil }
+func (f *failureNoOp) Inject(clus *Cluster) error     { return nil }
+func (f *failureNoOp) Recover(clus *Cluster) error    { return nil }
+func (f *failureNoOp) FailureCase() rpcpb.FailureCase { return f.failureCase }
 
+// newFailureNoOp returns the no-op failure (case NO_FAIL_WITH_STRESS) wrapped
+// in a "failureDelay" whose delayDuration is 5 seconds.
 func newFailureNoOp() Failure {
-	return &failureNoOp{
-		description: "no failure",
+	f := &failureNoOp{
+		failureCase: rpcpb.FailureCase_NO_FAIL_WITH_STRESS,
+	}
+	return &failureDelay{
+		Failure:       f,
+		delayDuration: 5 * time.Second,
 	}
 }