|
|
@@ -14,7 +14,13 @@
|
|
|
|
|
|
package main
|
|
|
|
|
|
-import "math/rand"
|
|
|
+import (
|
|
|
+ "fmt"
|
|
|
+ "math/rand"
|
|
|
+ "time"
|
|
|
+)
|
|
|
+
|
|
|
+const snapshotCount = 10000
|
|
|
|
|
|
type failure interface {
|
|
|
// Inject injeccts the failure into the testing cluster at the given
|
|
|
@@ -98,3 +104,71 @@ func getToKillMap(size int, seed int) map[int]bool {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+type failureKillOne struct {
|
|
|
+ description
|
|
|
+}
|
|
|
+
|
|
|
+func newFailureKillOne() *failureKillOne {
|
|
|
+ return &failureKillOne{
|
|
|
+ description: "kill one random member",
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func (f *failureKillOne) Inject(c *cluster, round int) error {
|
|
|
+ i := round % c.Size
|
|
|
+ return c.Agents[i].Stop()
|
|
|
+}
|
|
|
+
|
|
|
+func (f *failureKillOne) Recover(c *cluster, round int) error {
|
|
|
+ i := round % c.Size
|
|
|
+ if _, err := c.Agents[i].Restart(); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ return c.WaitHealth()
|
|
|
+}
|
|
|
+
|
|
|
+// failureKillOneForLongTime kills one member for long time, and restart
|
|
|
+// after a snapshot is required.
|
|
|
+type failureKillOneForLongTime struct {
|
|
|
+ description
|
|
|
+}
|
|
|
+
|
|
|
+func newFailureKillOneForLongTime() *failureKillOneForLongTime {
|
|
|
+ return &failureKillOneForLongTime{
|
|
|
+ description: "kill one member for long time and expect it to recover from incoming snapshot",
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func (f *failureKillOneForLongTime) Inject(c *cluster, round int) error {
|
|
|
+ i := round % c.Size
|
|
|
+ if err := c.Agents[i].Stop(); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ if c.Size >= 3 {
|
|
|
+ start, _ := c.Report()
|
|
|
+ var end int
|
|
|
+ // Normal healthy cluster could accept 1000req/s at least.
|
|
|
+ // Give it 3-times time to create a new snapshot.
|
|
|
+ retry := snapshotCount / 1000 * 3
|
|
|
+ for j := 0; j < retry; j++ {
|
|
|
+ end, _ = c.Report()
|
|
|
+ // If the number of proposals committed is bigger than snapshot count,
|
|
|
+ // a new snapshot should have been created.
|
|
|
+ if end-start > snapshotCount {
|
|
|
+ return nil
|
|
|
+ }
|
|
|
+ time.Sleep(time.Second)
|
|
|
+ }
|
|
|
+ return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry)
|
|
|
+ }
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+func (f *failureKillOneForLongTime) Recover(c *cluster, round int) error {
|
|
|
+ i := round % c.Size
|
|
|
+ if _, err := c.Agents[i].Restart(); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ return c.WaitHealth()
|
|
|
+}
|