
functional: add back, travis

Signed-off-by: Gyuho Lee <leegyuho@amazon.com>
Gyuho Lee 6 years ago
parent commit e16b21be7b
58 changed files with 563 additions and 1532 deletions
  1. .travis.yml (+4, -0)
  2. functional.yaml (+15, -50)
  3. functional/Dockerfile (+1, -1)
  4. functional/Procfile-proxy (+0, -0)
  5. functional/README.md (+6, -6)
  6. functional/agent/doc.go (+0, -0)
  7. functional/agent/handler.go (+130, -199)
  8. functional/agent/server.go (+10, -14)
  9. functional/agent/utils.go (+16, -14)
  10. functional/agent/utils_test.go (+0, -0)
  11. functional/cmd/etcd-agent/main.go (+0, -0)
  12. functional/cmd/etcd-proxy/main.go (+4, -7)
  13. functional/cmd/etcd-runner/main.go (+0, -0)
  14. functional/cmd/etcd-tester/main.go (+0, -0)
  15. functional/rpcpb/etcd_config.go (+11, -3)
  16. functional/rpcpb/etcd_config_test.go (+10, -17)
  17. functional/rpcpb/member.go (+1, -11)
  18. functional/rpcpb/rpc.pb.go (+168, -636)
  19. functional/rpcpb/rpc.proto (+22, -32)
  20. functional/runner/election_command.go (+0, -0)
  21. functional/runner/error.go (+0, -0)
  22. functional/runner/global.go (+1, -1)
  23. functional/runner/help.go (+0, -0)
  24. functional/runner/lease_renewer_command.go (+0, -0)
  25. functional/runner/lock_racer_command.go (+0, -0)
  26. functional/runner/root.go (+0, -0)
  27. functional/runner/watch_command.go (+0, -0)
  28. functional/scripts/docker-local-agent.sh (+2, -2)
  29. functional/scripts/docker-local-tester.sh (+2, -2)
  30. functional/scripts/genproto.sh (+2, -2)
  31. functional/tester/case.go (+6, -17)
  32. functional/tester/case_delay.go (+0, -0)
  33. functional/tester/case_external.go (+0, -0)
  34. functional/tester/case_failpoints.go (+0, -0)
  35. functional/tester/case_network_blackhole.go (+0, -0)
  36. functional/tester/case_network_delay.go (+1, -1)
  37. functional/tester/case_no_fail.go (+0, -0)
  38. functional/tester/case_sigquit_remove.go (+0, -0)
  39. functional/tester/case_sigquit_remove_quorum.go (+0, -0)
  40. functional/tester/case_sigterm.go (+0, -0)
  41. functional/tester/checker.go (+0, -0)
  42. functional/tester/checker_kv_hash.go (+0, -0)
  43. functional/tester/checker_lease_expire.go (+0, -0)
  44. functional/tester/checker_no_check.go (+0, -0)
  45. functional/tester/checker_runner.go (+0, -0)
  46. functional/tester/cluster.go (+6, -8)
  47. functional/tester/cluster_read_config.go (+42, -60)
  48. functional/tester/cluster_run.go (+2, -2)
  49. functional/tester/cluster_shuffle.go (+0, -0)
  50. functional/tester/cluster_test.go (+0, -304)
  51. functional/tester/doc.go (+0, -0)
  52. functional/tester/metrics_report.go (+0, -0)
  53. functional/tester/stresser.go (+35, -59)
  54. functional/tester/stresser_composite.go (+0, -0)
  55. functional/tester/stresser_key.go (+63, -80)
  56. functional/tester/stresser_lease.go (+1, -1)
  57. functional/tester/stresser_runner.go (+2, -3)
  58. functional/tester/utils.go (+0, -0)

+ 4 - 0
.travis.yml

@@ -19,6 +19,7 @@ env:
   matrix:
   - TARGET=linux-amd64-integration-1-cpu
   - TARGET=linux-amd64-integration-4-cpu
+  - TARGET=linux-amd64-functional
   - TARGET=linux-amd64-unit
   - TARGET=linux-amd64-e2e
   - TARGET=all-build
@@ -43,6 +44,9 @@ script:
       linux-amd64-integration-4-cpu)
         GOARCH=amd64 CPU=4 PASSES='integration' ./test
         ;;
+      linux-amd64-functional)
+        GOARCH=amd64 PASSES='functional' ./test
+        ;;
       linux-amd64-unit)
         GOARCH=amd64 PASSES='unit' ./test
         ;;

+ 15 - 50
functional.yaml

@@ -1,8 +1,9 @@
 agent-configs:
-- etcd-exec: ./bin/etcd
+- etcd-exec-path: ./bin/etcd
   agent-addr: 127.0.0.1:19027
   failpoint-http-addr: http://127.0.0.1:7381
   base-dir: /tmp/etcd-functional-1
+  etcd-log-path: /tmp/etcd-functional-1/etcd.log
   etcd-client-proxy: false
   etcd-peer-proxy: true
   etcd-client-endpoint: 127.0.0.1:1379
@@ -29,7 +30,7 @@ agent-configs:
     initial-cluster: s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381
     initial-cluster-state: new
     initial-cluster-token: tkn
-    snapshot-count: 2000
+    snapshot-count: 10000
     quota-backend-bytes: 10740000000 # 10 GiB
     pre-vote: true
     initial-corrupt-check: true
@@ -47,10 +48,11 @@ agent-configs:
   peer-trusted-ca-path: ""
   snapshot-path: /tmp/etcd-functional-1.snapshot.db

-- etcd-exec: ./bin/etcd
+- etcd-exec-path: ./bin/etcd
   agent-addr: 127.0.0.1:29027
   failpoint-http-addr: http://127.0.0.1:7382
   base-dir: /tmp/etcd-functional-2
+  etcd-log-path: /tmp/etcd-functional-2/etcd.log
   etcd-client-proxy: false
   etcd-peer-proxy: true
   etcd-client-endpoint: 127.0.0.1:2379
@@ -77,7 +79,7 @@ agent-configs:
     initial-cluster: s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381
     initial-cluster-state: new
     initial-cluster-token: tkn
-    snapshot-count: 2000
+    snapshot-count: 10000
     quota-backend-bytes: 10740000000 # 10 GiB
     pre-vote: true
     initial-corrupt-check: true
@@ -95,10 +97,11 @@ agent-configs:
   peer-trusted-ca-path: ""
   snapshot-path: /tmp/etcd-functional-2.snapshot.db

-- etcd-exec: ./bin/etcd
+- etcd-exec-path: ./bin/etcd
   agent-addr: 127.0.0.1:39027
   failpoint-http-addr: http://127.0.0.1:7383
   base-dir: /tmp/etcd-functional-3
+  etcd-log-path: /tmp/etcd-functional-3/etcd.log
   etcd-client-proxy: false
   etcd-peer-proxy: true
   etcd-client-endpoint: 127.0.0.1:3379
@@ -125,7 +128,7 @@ agent-configs:
     initial-cluster: s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381
     initial-cluster-state: new
     initial-cluster-token: tkn
-    snapshot-count: 2000
+    snapshot-count: 10000
     quota-backend-bytes: 10740000000 # 10 GiB
     pre-vote: true
     initial-corrupt-check: true
@@ -160,7 +163,7 @@ tester-config:
   case-shuffle: true

   # For full descriptions,
-  # https://godoc.org/github.com/etcd-io/etcd/functional/rpcpb#Case
+  # https://godoc.org/github.com/coreos/etcd/functional/rpcpb#Case
   cases:
   - SIGTERM_ONE_FOLLOWER
   - SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
@@ -170,62 +173,24 @@ tester-config:
   - SIGTERM_ALL
   - SIGQUIT_AND_REMOVE_ONE_FOLLOWER
   - SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
+  - BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER
+  - BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
   - BLACKHOLE_PEER_PORT_TX_RX_LEADER
   - BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
   - BLACKHOLE_PEER_PORT_TX_RX_QUORUM
-  - BLACKHOLE_PEER_PORT_TX_RX_ALL
+  - DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER
   - DELAY_PEER_PORT_TX_RX_LEADER
-  - RANDOM_DELAY_PEER_PORT_TX_RX_LEADER
-  - DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
-  - RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
   - DELAY_PEER_PORT_TX_RX_QUORUM
-  - RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM
-  - DELAY_PEER_PORT_TX_RX_ALL
-  - RANDOM_DELAY_PEER_PORT_TX_RX_ALL
-  - NO_FAIL_WITH_STRESS
-  - NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS
-
-  # TODO: use iptables for discarding outbound rafthttp traffic to peer port
-  # - BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER
-  # - BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
-  # - DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER
-  # - RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER
-  # - DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
-  # - RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
-  # - SIGQUIT_AND_REMOVE_LEADER
-  # - SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT
-  # - SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH

   failpoint-commands:
   - panic("etcd-tester")
-  # - panic("etcd-tester"),1*sleep(1000)

   runner-exec-path: ./bin/etcd-runner
   external-exec-path: ""

-  # make up ±70% of workloads with writes
   stressers:
-  - type: KV_WRITE_SMALL
-    weight: 0.35
-  - type: KV_WRITE_LARGE
-    weight: 0.002
-  - type: KV_READ_ONE_KEY
-    weight: 0.07
-  - type: KV_READ_RANGE
-    weight: 0.07
-  - type: KV_DELETE_ONE_KEY
-    weight: 0.07
-  - type: KV_DELETE_RANGE
-    weight: 0.07
-  - type: KV_TXN_WRITE_DELETE
-    weight: 0.35
-  - type: LEASE
-    weight: 0.0
-
-  # - ELECTION_RUNNER
-  # - WATCH_RUNNER
-  # - LOCK_RACER_RUNNER
-  # - LEASE_RUNNER
+  - KV
+  - LEASE

   checkers:
   - KV_HASH

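The tester loads this YAML into its cluster configuration at startup. As a rough, hypothetical sketch (not the actual tester types), the plain-string lists such as `cases`, `stressers`, and `checkers` above map onto string slices through yaml struct tags:

```go
// Minimal sketch, assuming gopkg.in/yaml.v2; field and type names are illustrative.
package main

import (
	"fmt"
	"log"

	yaml "gopkg.in/yaml.v2"
)

type testerConfig struct {
	Cases     []string `yaml:"cases"`
	Stressers []string `yaml:"stressers"`
	Checkers  []string `yaml:"checkers"`
}

func main() {
	doc := `
cases:
- SIGTERM_ONE_FOLLOWER
stressers:
- KV
- LEASE
checkers:
- KV_HASH
`
	var cfg testerConfig
	if err := yaml.Unmarshal([]byte(doc), &cfg); err != nil {
		log.Fatal(err)
	}
	fmt.Println(cfg.Stressers) // [KV LEASE]
}
```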
+ 1 - 1
functional/Dockerfile

@@ -39,4 +39,4 @@ RUN go get -v github.com/coreos/gofail \
   && cp ./bin/etcd-tester /bin/etcd-tester \
   && go build -v -o /bin/benchmark ./tools/benchmark \
   && popd \
-  && rm -rf ${GOPATH}/src/github.com/coreos/etcd
+  && rm -rf ${GOPATH}/src/github.com/coreos/etcd

+ 0 - 0
functional/Procfile-proxy


+ 6 - 6
functional/README.md

@@ -4,7 +4,7 @@

 See [`rpcpb.Case`](https://godoc.org/github.com/coreos/etcd/functional/rpcpb#Case) for all failure cases.

-See [functional.yaml](https://github.com/etcd-io/etcd/blob/master/functional.yaml) for an example configuration.
+See [functional.yaml](https://github.com/coreos/etcd/blob/master/functional.yaml) for an example configuration.

 ### Run locally

@@ -16,7 +16,7 @@ PASSES=functional ./test

 ```bash
 pushd ..
-make build-docker-functional push-docker-functional pull-docker-functional
+make build-docker-functional
 popd
 ```

@@ -24,12 +24,12 @@ And run [example scripts](./scripts).

 ```bash
 # run 3 agents for 3-node local etcd cluster
-./functional/scripts/docker-local-agent.sh 1
-./functional/scripts/docker-local-agent.sh 2
-./functional/scripts/docker-local-agent.sh 3
+./scripts/docker-local-agent.sh 1
+./scripts/docker-local-agent.sh 2
+./scripts/docker-local-agent.sh 3

 # to run only 1 tester round
-./functional/scripts/docker-local-tester.sh
+./scripts/docker-local-tester.sh
 ```

 ## etcd Proxy

+ 0 - 0
functional/agent/doc.go


+ 130 - 199
functional/agent/handler.go

@@ -70,13 +70,13 @@ func (srv *Server) handleTesterRequest(req *rpcpb.Request) (resp *rpcpb.Response
 		return srv.handle_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()

 	case rpcpb.Operation_BLACKHOLE_PEER_PORT_TX_RX:
-		return srv.handle_BLACKHOLE_PEER_PORT_TX_RX(), nil
+		return srv.handle_BLACKHOLE_PEER_PORT_TX_RX()
 	case rpcpb.Operation_UNBLACKHOLE_PEER_PORT_TX_RX:
-		return srv.handle_UNBLACKHOLE_PEER_PORT_TX_RX(), nil
+		return srv.handle_UNBLACKHOLE_PEER_PORT_TX_RX()
 	case rpcpb.Operation_DELAY_PEER_PORT_TX_RX:
-		return srv.handle_DELAY_PEER_PORT_TX_RX(), nil
+		return srv.handle_DELAY_PEER_PORT_TX_RX()
 	case rpcpb.Operation_UNDELAY_PEER_PORT_TX_RX:
-		return srv.handle_UNDELAY_PEER_PORT_TX_RX(), nil
+		return srv.handle_UNDELAY_PEER_PORT_TX_RX()

 	default:
 		msg := fmt.Sprintf("operation not found (%v)", req.Operation)
@@ -84,125 +84,50 @@ func (srv *Server) handleTesterRequest(req *rpcpb.Request) (resp *rpcpb.Response
 	}
 }
 
-// just archive the first file
-func (srv *Server) createEtcdLogFile() error {
-	var err error
-	srv.etcdLogFile, err = os.Create(srv.Member.Etcd.LogOutputs[0])
-	if err != nil {
-		return err
+func (srv *Server) handle_INITIAL_START_ETCD(req *rpcpb.Request) (*rpcpb.Response, error) {
+	if srv.last != rpcpb.Operation_NOT_STARTED {
+		return &rpcpb.Response{
+			Success: false,
+			Status:  fmt.Sprintf("%q is not valid; last server operation was %q", rpcpb.Operation_INITIAL_START_ETCD.String(), srv.last.String()),
+			Member:  req.Member,
+		}, nil
 	}
-	srv.lg.Info("created etcd log file", zap.String("path", srv.Member.Etcd.LogOutputs[0]))
-	return nil
-}
 
-func (srv *Server) creatEtcd(fromSnapshot bool) error {
-	if !fileutil.Exist(srv.Member.EtcdExec) {
-		return fmt.Errorf("unknown etcd exec path %q does not exist", srv.Member.EtcdExec)
+	err := fileutil.TouchDirAll(srv.Member.BaseDir)
+	if err != nil {
+		return nil, err
 	}
+	srv.lg.Info("created base directory", zap.String("path", srv.Member.BaseDir))
 
-	etcdPath, etcdFlags := srv.Member.EtcdExec, srv.Member.Etcd.Flags()
-	if fromSnapshot {
-		etcdFlags = srv.Member.EtcdOnSnapshotRestore.Flags()
+	if err = srv.createEtcdLogFile(); err != nil {
+		return nil, err
 	}
-	u, _ := url.Parse(srv.Member.FailpointHTTPAddr)
-	srv.lg.Info(
-		"creating etcd command",
-		zap.String("etcd-exec", etcdPath),
-		zap.Strings("etcd-flags", etcdFlags),
-		zap.String("failpoint-http-addr", srv.Member.FailpointHTTPAddr),
-		zap.String("failpoint-addr", u.Host),
-	)
-	srv.etcdCmd = exec.Command(etcdPath, etcdFlags...)
-	srv.etcdCmd.Env = []string{"GOFAIL_HTTP=" + u.Host}
-	srv.etcdCmd.Stdout = srv.etcdLogFile
-	srv.etcdCmd.Stderr = srv.etcdLogFile
-	return nil
-}
 
-// start but do not wait for it to complete
-func (srv *Server) runEtcd() error {
-	errc := make(chan error)
-	go func() {
-		time.Sleep(5 * time.Second)
-		// server advertise client/peer listener had to start first
-		// before setting up proxy listener
-		errc <- srv.startProxy()
-	}()
+	srv.creatEtcdCmd(false)
 
-	if srv.etcdCmd != nil {
-		srv.lg.Info(
-			"starting etcd command",
-			zap.String("command-path", srv.etcdCmd.Path),
-		)
-		err := srv.etcdCmd.Start()
-		perr := <-errc
-		srv.lg.Info(
-			"started etcd command",
-			zap.String("command-path", srv.etcdCmd.Path),
-			zap.Errors("errors", []error{err, perr}),
-		)
-		if err != nil {
-			return err
-		}
-		return perr
+	if err = srv.saveTLSAssets(); err != nil {
+		return nil, err
 	}
-
-	select {
-	case <-srv.etcdServer.Server.ReadyNotify():
-		srv.lg.Info("embedded etcd is ready")
-	case <-time.After(time.Minute):
-		srv.etcdServer.Close()
-		return fmt.Errorf("took too long to start %v", <-srv.etcdServer.Err())
+	if err = srv.startEtcdCmd(); err != nil {
+		return nil, err
+	}
+	srv.lg.Info("started etcd", zap.String("command-path", srv.etcdCmd.Path))
+	if err = srv.loadAutoTLSAssets(); err != nil {
+		return nil, err
 	}
-	return <-errc
-}
-
-// SIGQUIT to exit with stackstrace
-func (srv *Server) stopEtcd(sig os.Signal) error {
-	srv.stopProxy()
-
-	if srv.etcdCmd != nil {
-		srv.lg.Info(
-			"stopping etcd command",
-			zap.String("command-path", srv.etcdCmd.Path),
-			zap.String("signal", sig.String()),
-		)
-
-		err := srv.etcdCmd.Process.Signal(sig)
-		if err != nil {
-			return err
-		}
-
-		errc := make(chan error)
-		go func() {
-			_, ew := srv.etcdCmd.Process.Wait()
-			errc <- ew
-			close(errc)
-		}()
-
-		select {
-		case <-time.After(5 * time.Second):
-			srv.etcdCmd.Process.Kill()
-		case e := <-errc:
-			return e
-		}
-
-		err = <-errc
 
-		srv.lg.Info(
-			"stopped etcd command",
-			zap.String("command-path", srv.etcdCmd.Path),
-			zap.String("signal", sig.String()),
-			zap.Error(err),
-		)
-		return err
+	// wait some time for etcd listener start
+	// before setting up proxy
+	time.Sleep(time.Second)
+	if err = srv.startProxy(); err != nil {
+		return nil, err
 	}
 
-	srv.lg.Info("stopping embedded etcd")
-	srv.etcdServer.Server.HardStop()
-	srv.etcdServer.Close()
-	srv.lg.Info("stopped embedded etcd")
-	return nil
+	return &rpcpb.Response{
+		Success: true,
+		Status:  "start etcd PASS",
+		Member:  srv.Member,
+	}, nil
 }

 func (srv *Server) startProxy() error {
@@ -216,7 +141,6 @@ func (srv *Server) startProxy() error {
 			return err
 		}

-		srv.lg.Info("starting proxy on client traffic", zap.String("url", advertiseClientURL.String()))
 		srv.advertiseClientPortToProxy[advertiseClientURLPort] = proxy.NewServer(proxy.ServerConfig{
 			Logger: srv.lg,
 			From:   *advertiseClientURL,
@@ -240,7 +164,6 @@ func (srv *Server) startProxy() error {
 			return err
 		}

-		srv.lg.Info("starting proxy on peer traffic", zap.String("url", advertisePeerURL.String()))
 		srv.advertisePeerPortToProxy[advertisePeerURLPort] = proxy.NewServer(proxy.ServerConfig{
 			Logger: srv.lg,
 			From:   *advertisePeerURL,
@@ -299,6 +222,34 @@ func (srv *Server) stopProxy() {
 	}
 }
 
+func (srv *Server) createEtcdLogFile() error {
+	var err error
+	srv.etcdLogFile, err = os.Create(srv.Member.EtcdLogPath)
+	if err != nil {
+		return err
+	}
+	srv.lg.Info("created etcd log file", zap.String("path", srv.Member.EtcdLogPath))
+	return nil
+}
+
+func (srv *Server) creatEtcdCmd(fromSnapshot bool) {
+	etcdPath, etcdFlags := srv.Member.EtcdExecPath, srv.Member.Etcd.Flags()
+	if fromSnapshot {
+		etcdFlags = srv.Member.EtcdOnSnapshotRestore.Flags()
+	}
+	u, _ := url.Parse(srv.Member.FailpointHTTPAddr)
+	srv.lg.Info("creating etcd command",
+		zap.String("etcd-exec-path", etcdPath),
+		zap.Strings("etcd-flags", etcdFlags),
+		zap.String("failpoint-http-addr", srv.Member.FailpointHTTPAddr),
+		zap.String("failpoint-addr", u.Host),
+	)
+	srv.etcdCmd = exec.Command(etcdPath, etcdFlags...)
+	srv.etcdCmd.Env = []string{"GOFAIL_HTTP=" + u.Host}
+	srv.etcdCmd.Stdout = srv.etcdLogFile
+	srv.etcdCmd.Stderr = srv.etcdLogFile
+}
+
 // if started with manual TLS, stores TLS assets
 // from tester/client to disk before starting etcd process
 func (srv *Server) saveTLSAssets() error {
@@ -371,6 +322,7 @@ func (srv *Server) saveTLSAssets() error {
 			zap.String("client-trusted-ca", srv.Member.ClientTrustedCAPath),
 		)
 	}
+
 	return nil
 }
 
@@ -460,45 +412,9 @@ func (srv *Server) loadAutoTLSAssets() error {
 	return nil
 }
 
-func (srv *Server) handle_INITIAL_START_ETCD(req *rpcpb.Request) (*rpcpb.Response, error) {
-	if srv.last != rpcpb.Operation_NOT_STARTED {
-		return &rpcpb.Response{
-			Success: false,
-			Status:  fmt.Sprintf("%q is not valid; last server operation was %q", rpcpb.Operation_INITIAL_START_ETCD.String(), srv.last.String()),
-			Member:  req.Member,
-		}, nil
-	}
-
-	err := fileutil.TouchDirAll(srv.Member.BaseDir)
-	if err != nil {
-		return nil, err
-	}
-	srv.lg.Info("created base directory", zap.String("path", srv.Member.BaseDir))
-
-	if srv.etcdServer == nil {
-		if err = srv.createEtcdLogFile(); err != nil {
-			return nil, err
-		}
-	}
-
-	if err = srv.saveTLSAssets(); err != nil {
-		return nil, err
-	}
-	if err = srv.creatEtcd(false); err != nil {
-		return nil, err
-	}
-	if err = srv.runEtcd(); err != nil {
-		return nil, err
-	}
-	if err = srv.loadAutoTLSAssets(); err != nil {
-		return nil, err
-	}
-
-	return &rpcpb.Response{
-		Success: true,
-		Status:  "start etcd PASS",
-		Member:  srv.Member,
-	}, nil
+// start but do not wait for it to complete
+func (srv *Server) startEtcdCmd() error {
+	return srv.etcdCmd.Start()
 }

 func (srv *Server) handle_RESTART_ETCD() (*rpcpb.Response, error) {
@@ -510,16 +426,25 @@ func (srv *Server) handle_RESTART_ETCD() (*rpcpb.Response, error) {
 		}
 	}

+	srv.creatEtcdCmd(false)
+
 	if err = srv.saveTLSAssets(); err != nil {
 		return nil, err
 	}
-	if err = srv.creatEtcd(false); err != nil {
+	if err = srv.startEtcdCmd(); err != nil {
 		return nil, err
 	}
-	if err = srv.runEtcd(); err != nil {
+	srv.lg.Info("restarted etcd", zap.String("command-path", srv.etcdCmd.Path))
+	if err = srv.loadAutoTLSAssets(); err != nil {
 		return nil, err
 	}
-	if err = srv.loadAutoTLSAssets(); err != nil {
+
+	// wait some time for etcd listener start
+	// before setting up proxy
+	// TODO: local tests should handle port conflicts
+	// with clients on restart
+	time.Sleep(time.Second)
+	if err = srv.startProxy(); err != nil {
 		return nil, err
 	}
 
@@ -531,15 +456,13 @@ func (srv *Server) handle_RESTART_ETCD() (*rpcpb.Response, error) {
 }

 func (srv *Server) handle_SIGTERM_ETCD() (*rpcpb.Response, error) {
-	if err := srv.stopEtcd(syscall.SIGTERM); err != nil {
-		return nil, err
-	}
+	srv.stopProxy()

-	if srv.etcdServer != nil {
-		// srv.etcdServer.GetLogger().Sync()
-	} else {
-		srv.etcdLogFile.Sync()
+	err := stopWithSig(srv.etcdCmd, syscall.SIGTERM)
+	if err != nil {
+		return nil, err
 	}
+	srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGTERM.String()))

 	return &rpcpb.Response{
 		Success: true,
@@ -548,17 +471,16 @@ func (srv *Server) handle_SIGTERM_ETCD() (*rpcpb.Response, error) {
 }

 func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA() (*rpcpb.Response, error) {
-	err := srv.stopEtcd(syscall.SIGQUIT)
+	srv.stopProxy()
+
+	err := stopWithSig(srv.etcdCmd, syscall.SIGQUIT)
 	if err != nil {
 		return nil, err
 	}
+	srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGQUIT.String()))

-	if srv.etcdServer != nil {
-		// srv.etcdServer.GetLogger().Sync()
-	} else {
-		srv.etcdLogFile.Sync()
-		srv.etcdLogFile.Close()
-	}
+	srv.etcdLogFile.Sync()
+	srv.etcdLogFile.Close()

 	// for debugging purposes, rename instead of removing
 	if err = os.RemoveAll(srv.Member.BaseDir + ".backup"); err != nil {
@@ -580,6 +502,9 @@ func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA() (*rpcpb.Response, error
 			return nil, err
 		}
 	}
+	if err = srv.createEtcdLogFile(); err != nil {
+		return nil, err
+	}

 	return &rpcpb.Response{
 		Success: true,
@@ -612,16 +537,25 @@ func (srv *Server) handle_RESTORE_RESTART_FROM_SNAPSHOT() (resp *rpcpb.Response,
 }

 func (srv *Server) handle_RESTART_FROM_SNAPSHOT() (resp *rpcpb.Response, err error) {
+	srv.creatEtcdCmd(true)
+
 	if err = srv.saveTLSAssets(); err != nil {
 		return nil, err
 	}
-	if err = srv.creatEtcd(true); err != nil {
+	if err = srv.startEtcdCmd(); err != nil {
 		return nil, err
 	}
-	if err = srv.runEtcd(); err != nil {
+	srv.lg.Info("restarted etcd", zap.String("command-path", srv.etcdCmd.Path))
+	if err = srv.loadAutoTLSAssets(); err != nil {
 		return nil, err
 	}
-	if err = srv.loadAutoTLSAssets(); err != nil {
+
+	// wait some time for etcd listener start
+	// before setting up proxy
+	// TODO: local tests should handle port conflicts
+	// with clients on restart
+	time.Sleep(time.Second)
+	if err = srv.startProxy(); err != nil {
 		return nil, err
 	}
 
@@ -633,32 +567,30 @@ func (srv *Server) handle_RESTART_FROM_SNAPSHOT() (resp *rpcpb.Response, err err
 }

 func (srv *Server) handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA() (*rpcpb.Response, error) {
-	err := srv.stopEtcd(syscall.SIGQUIT)
+	srv.stopProxy()
+
+	// exit with stackstrace
+	err := stopWithSig(srv.etcdCmd, syscall.SIGQUIT)
 	if err != nil {
 		return nil, err
 	}
+	srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGQUIT.String()))

-	if srv.etcdServer != nil {
-		// srv.etcdServer.GetLogger().Sync()
-	} else {
-		srv.etcdLogFile.Sync()
-		srv.etcdLogFile.Close()
-	}
+	srv.etcdLogFile.Sync()
+	srv.etcdLogFile.Close()

 	// TODO: support separate WAL directory
 	if err = archive(
 		srv.Member.BaseDir,
-		srv.Member.Etcd.LogOutputs[0],
+		srv.Member.EtcdLogPath,
 		srv.Member.Etcd.DataDir,
 	); err != nil {
 		return nil, err
 	}
 	srv.lg.Info("archived data", zap.String("base-dir", srv.Member.BaseDir))

-	if srv.etcdServer == nil {
-		if err = srv.createEtcdLogFile(); err != nil {
-			return nil, err
-		}
+	if err = srv.createEtcdLogFile(); err != nil {
+		return nil, err
 	}

 	srv.lg.Info("cleaning up page cache")
@@ -675,17 +607,16 @@ func (srv *Server) handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA() (*rpcpb.Response, erro
 
 // stop proxy, etcd, delete data directory
 func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() (*rpcpb.Response, error) {
-	err := srv.stopEtcd(syscall.SIGQUIT)
+	srv.stopProxy()
+
+	err := stopWithSig(srv.etcdCmd, syscall.SIGQUIT)
 	if err != nil {
 		return nil, err
 	}
+	srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGQUIT.String()))

-	if srv.etcdServer != nil {
-		// srv.etcdServer.GetLogger().Sync()
-	} else {
-		srv.etcdLogFile.Sync()
-		srv.etcdLogFile.Close()
-	}
+	srv.etcdLogFile.Sync()
+	srv.etcdLogFile.Close()

 	err = os.RemoveAll(srv.Member.BaseDir)
 	if err != nil {
@@ -702,7 +633,7 @@ func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() (*rpcpb.
 	}, nil
 }

-func (srv *Server) handle_BLACKHOLE_PEER_PORT_TX_RX() *rpcpb.Response {
+func (srv *Server) handle_BLACKHOLE_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
 	for port, px := range srv.advertisePeerPortToProxy {
 		srv.lg.Info("blackholing", zap.Int("peer-port", port))
 		px.BlackholeTx()
@@ -712,10 +643,10 @@ func (srv *Server) handle_BLACKHOLE_PEER_PORT_TX_RX() *rpcpb.Response {
 	return &rpcpb.Response{
 		Success: true,
 		Status:  "blackholed peer port tx/rx",
-	}
+	}, nil
 }

-func (srv *Server) handle_UNBLACKHOLE_PEER_PORT_TX_RX() *rpcpb.Response {
+func (srv *Server) handle_UNBLACKHOLE_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
 	for port, px := range srv.advertisePeerPortToProxy {
 		srv.lg.Info("unblackholing", zap.Int("peer-port", port))
 		px.UnblackholeTx()
@@ -725,10 +656,10 @@ func (srv *Server) handle_UNBLACKHOLE_PEER_PORT_TX_RX() *rpcpb.Response {
 	return &rpcpb.Response{
 		Success: true,
 		Status:  "unblackholed peer port tx/rx",
-	}
+	}, nil
 }

-func (srv *Server) handle_DELAY_PEER_PORT_TX_RX() *rpcpb.Response {
+func (srv *Server) handle_DELAY_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
 	lat := time.Duration(srv.Tester.UpdatedDelayLatencyMs) * time.Millisecond
 	rv := time.Duration(srv.Tester.DelayLatencyMsRv) * time.Millisecond
 
@@ -750,10 +681,10 @@ func (srv *Server) handle_DELAY_PEER_PORT_TX_RX() *rpcpb.Response {
 	return &rpcpb.Response{
 		Success: true,
 		Status:  "delayed peer port tx/rx",
-	}
+	}, nil
 }

-func (srv *Server) handle_UNDELAY_PEER_PORT_TX_RX() *rpcpb.Response {
+func (srv *Server) handle_UNDELAY_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
 	for port, px := range srv.advertisePeerPortToProxy {
 		srv.lg.Info("undelaying", zap.Int("peer-port", port))
 		px.UndelayTx()
@@ -763,5 +694,5 @@ func (srv *Server) handle_UNDELAY_PEER_PORT_TX_RX() *rpcpb.Response {
 	return &rpcpb.Response{
 		Success: true,
 		Status:  "undelayed peer port tx/rx",
-	}
+	}, nil
 }

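The handlers above now all return `(*rpcpb.Response, error)` and act on every advertised peer-port proxy. A simplified illustration of that toggle pattern, using a stand-in interface rather than the real `pkg/proxy` server:

```go
// Illustrative sketch only; peerProxy and fakeProxy are hypothetical stand-ins.
package main

import "fmt"

type peerProxy interface {
	BlackholeTx()
	UnblackholeTx()
}

type fakeProxy struct{ port int }

func (p *fakeProxy) BlackholeTx()   { fmt.Println("blackholed peer port", p.port) }
func (p *fakeProxy) UnblackholeTx() { fmt.Println("unblackholed peer port", p.port) }

// blackholePeers mirrors the handler shape: act on every advertised peer
// port, then report a status and a nil error back to the tester.
func blackholePeers(proxies map[int]peerProxy) (string, error) {
	for _, px := range proxies {
		px.BlackholeTx()
	}
	return "blackholed peer port tx/rx", nil
}

func main() {
	status, err := blackholePeers(map[int]peerProxy{2380: &fakeProxy{port: 2380}})
	fmt.Println(status, err)
}
```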
+ 10 - 14
functional/agent/server.go

@@ -21,7 +21,6 @@ import (
 	"os/exec"
 	"strings"

-	"github.com/coreos/etcd/embed"
 	"github.com/coreos/etcd/functional/rpcpb"
 	"github.com/coreos/etcd/pkg/proxy"

@@ -34,9 +33,8 @@ import (
 // no need to lock fields since request operations are
 // serialized in tester-side
 type Server struct {
-	lg *zap.Logger
-
 	grpcServer *grpc.Server
+	lg         *zap.Logger

 	network string
 	address string
@@ -48,7 +46,6 @@ type Server struct {
 	*rpcpb.Member
 	*rpcpb.Tester

-	etcdServer  *embed.Etcd
 	etcdCmd     *exec.Cmd
 	etcdLogFile *os.File

@@ -64,10 +61,10 @@ func NewServer(
 	address string,
 ) *Server {
 	return &Server{
-		lg:                         lg,
-		network:                    network,
-		address:                    address,
-		last:                       rpcpb.Operation_NOT_STARTED,
+		lg:      lg,
+		network: network,
+		address: address,
+		last:    rpcpb.Operation_NOT_STARTED,
 		advertiseClientPortToProxy: make(map[int]proxy.Server),
 		advertisePeerPortToProxy:   make(map[int]proxy.Server),
 	}
@@ -126,12 +123,11 @@ func (srv *Server) Stop() {
 }

 // Transport communicates with etcd tester.
-func (srv *Server) Transport(stream rpcpb.Transport_TransportServer) (reterr error) {
-	errc := make(chan error, 1)
+func (srv *Server) Transport(stream rpcpb.Transport_TransportServer) (err error) {
+	errc := make(chan error)
 	go func() {
 		for {
 			var req *rpcpb.Request
-			var err error
 			req, err = stream.Recv()
 			if err != nil {
 				errc <- err
@@ -162,9 +158,9 @@ func (srv *Server) Transport(stream rpcpb.Transport_TransportServer) (reterr err
 	}()

 	select {
-	case reterr = <-errc:
+	case err = <-errc:
 	case <-stream.Context().Done():
-		reterr = stream.Context().Err()
+		err = stream.Context().Err()
 	}
-	return reterr
+	return err
 }

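The `Transport` stream handler follows a receive-loop-plus-select pattern: a goroutine forwards the first receive error to a channel, and the caller returns on that error or on stream-context cancellation. A minimal, self-contained sketch of the same shape (names here are illustrative, not the agent's API):

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

// serve keeps calling recv until it fails, and returns on either the first
// receive error or cancellation of the caller's context.
func serve(ctx context.Context, recv func() error) error {
	errc := make(chan error)
	go func() {
		for {
			if err := recv(); err != nil {
				errc <- err
				return
			}
		}
	}()

	select {
	case err := <-errc:
		return err
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	err := serve(context.Background(), func() error { return errors.New("stream closed") })
	fmt.Println(err)
}
```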
+ 16 - 14
functional/agent/utils.go

@@ -15,7 +15,6 @@
 package agent

 import (
-	"io"
 	"net"
 	"net/url"
 	"os"
@@ -37,8 +36,7 @@ func archive(baseDir, etcdLogPath, dataDir string) error {
 		return err
 	}

-	dst := filepath.Join(dir, "etcd.log")
-	if err := copyFile(etcdLogPath, dst); err != nil {
+	if err := os.Rename(etcdLogPath, filepath.Join(dir, "etcd.log")); err != nil {
 		if !os.IsNotExist(err) {
 			return err
 		}
@@ -81,23 +79,27 @@ func getURLAndPort(addr string) (urlAddr *url.URL, port int, err error) {
 	return urlAddr, port, err
 }

-func copyFile(src, dst string) error {
-	f, err := os.Open(src)
+func stopWithSig(cmd *exec.Cmd, sig os.Signal) error {
+	err := cmd.Process.Signal(sig)
 	if err != nil {
 		return err
 	}
-	defer f.Close()

-	w, err := os.Create(dst)
-	if err != nil {
-		return err
-	}
-	defer w.Close()
+	errc := make(chan error)
+	go func() {
+		_, ew := cmd.Process.Wait()
+		errc <- ew
+		close(errc)
+	}()

-	if _, err = io.Copy(w, f); err != nil {
-		return err
+	select {
+	case <-time.After(5 * time.Second):
+		cmd.Process.Kill()
+	case e := <-errc:
+		return e
 	}
-	return w.Sync()
+	err = <-errc
+	return err
 }

 func cleanPageCache() error {

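`stopWithSig` replaces the old embedded-etcd stop path: signal the process, wait up to five seconds for it to exit, then force a kill. A standalone sketch of that pattern (the command and timeout below are example values, not the agent's):

```go
package main

import (
	"fmt"
	"os"
	"os/exec"
	"syscall"
	"time"
)

func stopWithSig(cmd *exec.Cmd, sig os.Signal) error {
	if err := cmd.Process.Signal(sig); err != nil {
		return err
	}

	errc := make(chan error, 1) // buffered so the waiter never blocks
	go func() {
		_, ew := cmd.Process.Wait()
		errc <- ew
	}()

	select {
	case <-time.After(5 * time.Second):
		cmd.Process.Kill() // did not exit in time; force it
	case e := <-errc:
		return e
	}
	return <-errc
}

func main() {
	cmd := exec.Command("sleep", "60")
	if err := cmd.Start(); err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(stopWithSig(cmd, syscall.SIGTERM))
}
```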
+ 0 - 0
functional/agent/utils_test.go


+ 0 - 0
functional/cmd/etcd-agent/main.go


+ 4 - 7
functional/cmd/etcd-proxy/main.go

@@ -19,8 +19,6 @@ import (
 	"context"
 	"flag"
 	"fmt"
-	"io/ioutil"
-	"log"
 	"net/http"
 	"net/url"
 	"os"
@@ -64,8 +62,8 @@ $ make build-etcd-proxy
 $ ./bin/etcd-proxy --help
 $ ./bin/etcd-proxy --from localhost:23790 --to localhost:2379 --http-port 2378 --verbose

-$ ./bin/etcdctl --endpoints localhost:2379 put foo bar
-$ ./bin/etcdctl --endpoints localhost:23790 put foo bar`)
+$ ETCDCTL_API=3 ./bin/etcdctl --endpoints localhost:2379 put foo bar
+$ ETCDCTL_API=3 ./bin/etcdctl --endpoints localhost:23790 put foo bar`)
 		flag.PrintDefaults()
 	}

@@ -193,9 +191,8 @@ $ ./bin/etcdctl --endpoints localhost:23790 put foo bar`)
 		}
 	})
 	srv := &http.Server{
-		Addr:     fmt.Sprintf(":%d", httpPort),
-		Handler:  mux,
-		ErrorLog: log.New(ioutil.Discard, "net/http", 0),
+		Addr:    fmt.Sprintf(":%d", httpPort),
+		Handler: mux,
 	}
 	defer srv.Close()
 

+ 0 - 0
functional/cmd/etcd-runner/main.go


+ 0 - 0
functional/cmd/etcd-tester/main.go


+ 11 - 3
functional/rpcpb/etcd_config.go

@@ -50,12 +50,15 @@ var etcdFields = []string{

 	"SnapshotCount",
 	"QuotaBackendBytes",
+
+	// "PreVote",
+	// "InitialCorruptCheck",
 }

 // Flags returns etcd flags in string slice.
-func (e *Etcd) Flags() (fs []string) {
-	tp := reflect.TypeOf(*e)
-	vo := reflect.ValueOf(*e)
+func (cfg *Etcd) Flags() (fs []string) {
+	tp := reflect.TypeOf(*cfg)
+	vo := reflect.ValueOf(*cfg)
 	for _, name := range etcdFields {
 		field, ok := tp.FieldByName(name)
 		if !ok {
@@ -83,6 +86,11 @@ func (e *Etcd) Flags() (fs []string) {

 		fname := field.Tag.Get("yaml")

+		// not supported in old etcd
+		if fname == "pre-vote" || fname == "initial-corrupt-check" {
+			continue
+		}
+
 		if sv != "" {
 			fs = append(fs, fmt.Sprintf("--%s=%s", fname, sv))
 		}

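`Flags()` builds etcd command-line flags by reflecting over struct fields and their yaml tags. A hedged sketch of the same technique on a toy struct (not the real `rpcpb.Etcd` message):

```go
package main

import (
	"fmt"
	"reflect"
)

type toyConfig struct {
	Name          string `yaml:"name"`
	SnapshotCount int64  `yaml:"snapshot-count"`
}

// flags walks the struct fields and turns yaml tags plus values
// into --flag=value strings, skipping zero values.
func flags(cfg toyConfig) (fs []string) {
	tp := reflect.TypeOf(cfg)
	vo := reflect.ValueOf(cfg)
	for i := 0; i < tp.NumField(); i++ {
		fname := tp.Field(i).Tag.Get("yaml")
		var sv string
		switch vo.Field(i).Kind() {
		case reflect.String:
			sv = vo.Field(i).String()
		case reflect.Int64:
			sv = fmt.Sprintf("%d", vo.Field(i).Int())
		}
		if sv != "" && sv != "0" {
			fs = append(fs, fmt.Sprintf("--%s=%s", fname, sv))
		}
	}
	return fs
}

func main() {
	fmt.Println(flags(toyConfig{Name: "s1", SnapshotCount: 10000}))
	// [--name=s1 --snapshot-count=10000]
}
```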
+ 10 - 17
functional/rpcpb/etcd_config_test.go

@@ -19,11 +19,11 @@ import (
 	"testing"
 )

-func TestEtcd(t *testing.T) {
-	e := &Etcd{
+func TestEtcdFlags(t *testing.T) {
+	cfg := &Etcd{
 		Name:    "s1",
-		DataDir: "/tmp/etcd-functionl-1/etcd.data",
-		WALDir:  "/tmp/etcd-functionl-1/etcd.data/member/wal",
+		DataDir: "/tmp/etcd-agent-data-1/etcd.data",
+		WALDir:  "/tmp/etcd-agent-data-1/etcd.data/member/wal",

 		HeartbeatIntervalMs: 100,
 		ElectionTimeoutMs:   1000,
@@ -53,16 +53,12 @@ func TestEtcd(t *testing.T) {

 		PreVote:             true,
 		InitialCorruptCheck: true,
-
-		Logger:     "zap",
-		LogOutputs: []string{"/tmp/etcd-functional-1/etcd.log"},
-		LogLevel:   "info",
 	}

-	exps := []string{
+	exp := []string{
 		"--name=s1",
-		"--data-dir=/tmp/etcd-functionl-1/etcd.data",
-		"--wal-dir=/tmp/etcd-functionl-1/etcd.data/member/wal",
+		"--data-dir=/tmp/etcd-agent-data-1/etcd.data",
+		"--wal-dir=/tmp/etcd-agent-data-1/etcd.data/member/wal",
 		"--heartbeat-interval=100",
 		"--election-timeout=1000",
 		"--listen-client-urls=https://127.0.0.1:1379",
@@ -80,12 +76,9 @@ func TestEtcd(t *testing.T) {
 		"--quota-backend-bytes=10740000000",
 		"--pre-vote=true",
 		"--experimental-initial-corrupt-check=true",
-		"--logger=zap",
-		"--log-outputs=/tmp/etcd-functional-1/etcd.log",
-		"--log-level=info",
 	}
-	fs := e.Flags()
-	if !reflect.DeepEqual(exps, fs) {
-		t.Fatalf("expected %q, got %q", exps, fs)
+	fs := cfg.Flags()
+	if !reflect.DeepEqual(exp, fs) {
+		t.Fatalf("expected %q, got %q", exp, fs)
 	}
 }

+ 1 - 11
functional/rpcpb/member.go

@@ -23,10 +23,9 @@ import (
 	"time"

 	"github.com/coreos/etcd/clientv3"
-	"github.com/coreos/etcd/clientv3/snapshot"
 	pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
-	"github.com/coreos/etcd/pkg/logutil"
 	"github.com/coreos/etcd/pkg/transport"
+	"github.com/coreos/etcd/snapshot"

 	"github.com/dustin/go-humanize"
 	"go.uber.org/zap"
@@ -95,19 +94,10 @@ func (m *Member) CreateEtcdClientConfig(opts ...grpc.DialOption) (cfg *clientv3.
 		}
 	}

-	// TODO: make this configurable
-	level := "error"
-	if os.Getenv("ETCD_CLIENT_DEBUG") != "" {
-		level = "debug"
-	}
-	lcfg := logutil.DefaultZapLoggerConfig
-	lcfg.Level = zap.NewAtomicLevelAt(logutil.ConvertToZapLevel(level))
-
 	cfg = &clientv3.Config{
 		Endpoints:   []string{m.EtcdClientEndpoint},
 		DialTimeout: 10 * time.Second,
 		DialOptions: opts,
-		LogConfig:   &lcfg,
 	}
 	if secure {
 		// assume save TLS assets are already stord on disk

File diff suppressed because it is too large
+ 168 - 636
functional/rpcpb/rpc.pb.go


+ 22 - 32
functional/rpcpb/rpc.proto

@@ -45,8 +45,9 @@ service Transport {
 }

 message Member {
-  // EtcdExec is the executable etcd binary path in agent server.
-  string EtcdExec = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec\""];
+  // EtcdExecPath is the executable etcd binary path in agent server.
+  string EtcdExecPath = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec-path\""];
+  // TODO: support embedded etcd
 
   // AgentAddr is the agent HTTP server address.
   string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
@@ -55,6 +56,8 @@ message Member {
 
   // BaseDir is the base directory where all logs and etcd data are stored.
   string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
+  // EtcdLogPath is the log file to store current etcd server logs.
+  string EtcdLogPath = 102 [(gogoproto.moretags) = "yaml:\"etcd-log-path\""];
 
   // EtcdClientProxy is true when client traffic needs to be proxied.
   // If true, listen client URL port must be different than advertise client URL port.
@@ -138,7 +141,7 @@ message Tester {
 
   // Stressers is the list of stresser types:
   // KV, LEASE, ELECTION_RUNNER, WATCH_RUNNER, LOCK_RACER_RUNNER, LEASE_RUNNER.
-  repeated Stresser Stressers = 101 [(gogoproto.moretags) = "yaml:\"stressers\""];
+  repeated string Stressers = 101 [(gogoproto.moretags) = "yaml:\"stressers\""];
   // Checkers is the list of consistency checker types:
   // KV_HASH, LEASE_EXPIRE, NO_CHECK, RUNNER.
   // Leave empty to skip consistency checks.
@@ -164,35 +167,6 @@ message Tester {
   int32 StressQPS = 302 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
 }
 
-enum StresserType {
-  KV_WRITE_SMALL = 0;
-  KV_WRITE_LARGE = 1;
-  KV_READ_ONE_KEY = 2;
-  KV_READ_RANGE = 3;
-  KV_DELETE_ONE_KEY = 4;
-  KV_DELETE_RANGE = 5;
-  KV_TXN_WRITE_DELETE = 6;
-
-  LEASE = 10;
-
-  ELECTION_RUNNER = 20;
-  WATCH_RUNNER = 31;
-  LOCK_RACER_RUNNER = 41;
-  LEASE_RUNNER = 51;
-}
-
-message Stresser {
-  string Type = 1 [(gogoproto.moretags) = "yaml:\"type\""];
-  double Weight = 2 [(gogoproto.moretags) = "yaml:\"weight\""];
-}
-
-enum Checker {
-  KV_HASH = 0;
-  LEASE_EXPIRE = 1;
-  RUNNER = 2;
-  NO_CHECK = 3;
-}
-
 message Etcd {
   string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
   string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
@@ -620,3 +594,19 @@ enum Case {
   // EXTERNAL runs external failure injection scripts.
   EXTERNAL = 500;
 }
+
+enum Stresser {
+  KV = 0;
+  LEASE = 1;
+  ELECTION_RUNNER = 2;
+  WATCH_RUNNER = 3;
+  LOCK_RACER_RUNNER = 4;
+  LEASE_RUNNER = 5;
+}
+
+enum Checker {
+  KV_HASH = 0;
+  LEASE_EXPIRE = 1;
+  RUNNER = 2;
+  NO_CHECK = 3;
+}

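With `Stressers` now a plain string list, the tester validates each name against the protoc-generated `Stresser_value` map (as in cluster_read_config.go below). A sketch of that check, with a map literal standing in for the generated map:

```go
package main

import "fmt"

// stresserValue stands in for the protoc-generated rpcpb.Stresser_value map.
var stresserValue = map[string]int32{
	"KV": 0, "LEASE": 1, "ELECTION_RUNNER": 2,
	"WATCH_RUNNER": 3, "LOCK_RACER_RUNNER": 4, "LEASE_RUNNER": 5,
}

func validateStressers(names []string) error {
	for _, v := range names {
		if _, ok := stresserValue[v]; !ok {
			return fmt.Errorf("Stresser is unknown; got %q", v)
		}
	}
	return nil
}

func main() {
	fmt.Println(validateStressers([]string{"KV", "LEASE"}))    // <nil>
	fmt.Println(validateStressers([]string{"KV_WRITE_SMALL"})) // error
}
```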
+ 0 - 0
functional/runner/election_command.go


+ 0 - 0
functional/runner/error.go


+ 1 - 1
functional/runner/global.go

@@ -47,7 +47,7 @@ type roundClient struct {
 func newClient(eps []string, timeout time.Duration) *clientv3.Client {
 	c, err := clientv3.New(clientv3.Config{
 		Endpoints:   eps,
-		DialTimeout: timeout * time.Second,
+		DialTimeout: time.Duration(timeout) * time.Second,
 	})
 	if err != nil {
 		log.Fatal(err)

+ 0 - 0
functional/runner/help.go


+ 0 - 0
functional/runner/lease_renewer_command.go


+ 0 - 0
functional/runner/lock_racer_command.go


+ 0 - 0
functional/runner/root.go


+ 0 - 0
functional/runner/watch_command.go


+ 2 - 2
functional/scripts/docker-local-agent.sh

@@ -13,7 +13,7 @@ if ! [[ "${0}" =~ "scripts/docker-local-agent.sh" ]]; then
 fi

 if [[ -z "${GO_VERSION}" ]]; then
-  GO_VERSION=1.12.8
+  GO_VERSION=1.10.1
 fi
 echo "Running with GO_VERSION:" ${GO_VERSION}

@@ -38,5 +38,5 @@ docker run \
   --rm \
   --net=host \
   --name ${AGENT_NAME} \
-  gcr.io/etcd-development/etcd-functional:go${GO_VERSION} \
+  gcr.io/etcd-development/etcd-functional-tester:go${GO_VERSION} \
   /bin/bash -c "./bin/etcd-agent ${AGENT_ADDR_FLAG}"

+ 2 - 2
functional/scripts/docker-local-tester.sh

@@ -6,7 +6,7 @@ if ! [[ "${0}" =~ "scripts/docker-local-tester.sh" ]]; then
 fi

 if [[ -z "${GO_VERSION}" ]]; then
-  GO_VERSION=1.12.8
+  GO_VERSION=1.10.1
 fi
 echo "Running with GO_VERSION:" ${GO_VERSION}

@@ -14,5 +14,5 @@ docker run \
   --rm \
   --net=host \
   --name tester \
-  gcr.io/etcd-development/etcd-functional:go${GO_VERSION} \
+  gcr.io/etcd-development/etcd-functional-tester:go${GO_VERSION} \
   /bin/bash -c "./bin/etcd-tester --config ./functional.yaml"

+ 2 - 2
functional/scripts/genproto.sh

@@ -7,8 +7,8 @@ if ! [[ "$0" =~ "scripts/genproto.sh" ]]; then
 fi

 # for now, be conservative about what version of protoc we expect
-if ! [[ $(protoc --version) =~ "3.7.1" ]]; then
-  echo "could not find protoc 3.7.1, is it installed + in PATH?"
+if ! [[ $(protoc --version) =~ "3.5.1" ]]; then
+  echo "could not find protoc 3.5.1, is it installed + in PATH?"
   exit 255
 fi
 

+ 6 - 17
functional/tester/case.go

@@ -275,18 +275,6 @@ func (c *caseUntilSnapshot) Inject(clus *Cluster) error {

 	for i := 0; i < retries; i++ {
 		lastRev, err = clus.maxRev()
-		if lastRev == 0 {
-			clus.lg.Info(
-				"trigger snapshot RETRY",
-				zap.Int("retries", i),
-				zap.Int64("etcd-snapshot-count", snapshotCount),
-				zap.Int64("start-revision", startRev),
-				zap.Error(err),
-			)
-			time.Sleep(3 * time.Second)
-			continue
-		}
-
 		// If the number of proposals committed is bigger than snapshot count,
 		// a new snapshot should have been created.
 		diff := lastRev - startRev
@@ -304,8 +292,12 @@ func (c *caseUntilSnapshot) Inject(clus *Cluster) error {
 			return nil
 		}

+		dur := time.Second
+		if diff < 0 || err != nil {
+			dur = 3 * time.Second
+		}
 		clus.lg.Info(
-			"trigger snapshot RETRY",
+			"trigger snapshot PROGRESS",
 			zap.Int("retries", i),
 			zap.Int64("committed-entries", diff),
 			zap.Int64("etcd-snapshot-count", snapshotCount),
@@ -314,10 +306,7 @@ func (c *caseUntilSnapshot) Inject(clus *Cluster) error {
 			zap.Duration("took", time.Since(now)),
 			zap.Error(err),
 		)
-		time.Sleep(time.Second)
-		if err != nil {
-			time.Sleep(2 * time.Second)
-		}
+		time.Sleep(dur)
 	}

 	return fmt.Errorf("cluster too slow: only %d commits in %d retries", lastRev-startRev, retries)

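The reshaped retry loop in `caseUntilSnapshot.Inject` polls the cluster's maximum revision until enough entries have been committed to trigger a snapshot, sleeping longer when progress stalls or the read fails. A self-contained sketch of that loop (toy revision source, assumed names):

```go
package main

import (
	"fmt"
	"time"
)

func waitForSnapshot(maxRev func() (int64, error), startRev, snapshotCount int64, retries int) error {
	var lastRev int64
	for i := 0; i < retries; i++ {
		var err error
		lastRev, err = maxRev()
		diff := lastRev - startRev
		if diff > snapshotCount {
			return nil // enough proposals committed; a snapshot should exist
		}
		dur := time.Second
		if diff < 0 || err != nil {
			dur = 3 * time.Second // back off harder when stalled or erroring
		}
		time.Sleep(dur)
	}
	return fmt.Errorf("cluster too slow: only %d commits in %d retries", lastRev-startRev, retries)
}

func main() {
	rev := int64(0)
	// toy revision source that advances by 3000 per poll
	err := waitForSnapshot(func() (int64, error) { rev += 3000; return rev, nil }, 0, 10000, 10)
	fmt.Println(err)
}
```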
+ 0 - 0
functional/tester/case_delay.go


+ 0 - 0
functional/tester/case_external.go


+ 0 - 0
functional/tester/case_failpoints.go


+ 0 - 0
functional/tester/case_network_blackhole.go


+ 1 - 1
functional/tester/case_network_delay.go

@@ -26,7 +26,7 @@ const (
 	// Wait more when it recovers from slow network, because network layer
 	// needs extra time to propagate traffic control (tc command) change.
 	// Otherwise, we get different hash values from the previous revision.
-	// For more detail, please see https://github.com/etcd-io/etcd/issues/5121.
+	// For more detail, please see https://github.com/coreos/etcd/issues/5121.
 	waitRecover = 5 * time.Second
 )
 

+ 0 - 0
functional/tester/case_no_fail.go


+ 0 - 0
functional/tester/case_sigquit_remove.go


+ 0 - 0
functional/tester/case_sigquit_remove_quorum.go


+ 0 - 0
functional/tester/case_sigterm.go


+ 0 - 0
functional/tester/checker.go


+ 0 - 0
functional/tester/checker_kv_hash.go


+ 0 - 0
functional/tester/checker_lease_expire.go


+ 0 - 0
functional/tester/checker_no_check.go


+ 0 - 0
functional/tester/checker_runner.go


+ 6 - 8
functional/tester/cluster.go

@@ -20,7 +20,6 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
-	"log"
 	"math/rand"
 	"net/http"
 	"net/url"
@@ -107,9 +106,8 @@ func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
 		}
 	}
 	clus.testerHTTPServer = &http.Server{
-		Addr:     clus.Tester.Addr,
-		Handler:  mux,
-		ErrorLog: log.New(ioutil.Discard, "net/http", 0),
+		Addr:    clus.Tester.Addr,
+		Handler: mux,
 	}
 	go clus.serveTesterServer()

@@ -493,9 +491,9 @@ func (clus *Cluster) sendOpWithResp(idx int, op rpcpb.Operation) (*rpcpb.Respons

 	m, secure := clus.Members[idx], false
 	for _, cu := range m.Etcd.AdvertiseClientURLs {
-		u, perr := url.Parse(cu)
-		if perr != nil {
-			return nil, perr
+		u, err := url.Parse(cu)
+		if err != nil {
+			return nil, err
 		}
 		if u.Scheme == "https" { // TODO: handle unix
 			secure = true
@@ -593,7 +591,7 @@ func (clus *Cluster) WaitHealth() error {
 	// wait 60s to check cluster health.
 	// TODO: set it to a reasonable value. It is set that high because
 	// follower may use long time to catch up the leader when reboot under
-	// reasonable workload (https://github.com/etcd-io/etcd/issues/2698)
+	// reasonable workload (https://github.com/coreos/etcd/issues/2698)
 	for i := 0; i < 60; i++ {
 		for _, m := range clus.Members {
 			if err = m.WriteHealthKey(); err != nil {

+ 42 - 60
functional/tester/cluster_read_config.go

@@ -44,56 +44,14 @@ func read(lg *zap.Logger, fpath string) (*Cluster, error) {
 		return nil, fmt.Errorf("len(clus.Members) expects at least 3, got %d", len(clus.Members))
 	}
 
-	failpointsEnabled := false
-	for _, c := range clus.Tester.Cases {
-		if c == rpcpb.Case_FAILPOINTS.String() {
-			failpointsEnabled = true
-			break
-		}
-	}
-
-	if len(clus.Tester.Cases) == 0 {
-		return nil, errors.New("cases not found")
-	}
-	if clus.Tester.DelayLatencyMs <= clus.Tester.DelayLatencyMsRv*5 {
-		return nil, fmt.Errorf("delay latency %d ms must be greater than 5x of delay latency random variable %d ms", clus.Tester.DelayLatencyMs, clus.Tester.DelayLatencyMsRv)
-	}
-	if clus.Tester.UpdatedDelayLatencyMs == 0 {
-		clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
-	}
-
-	for _, v := range clus.Tester.Cases {
-		if _, ok := rpcpb.Case_value[v]; !ok {
-			return nil, fmt.Errorf("%q is not defined in 'rpcpb.Case_value'", v)
-		}
-	}
-
-	for _, s := range clus.Tester.Stressers {
-		if _, ok := rpcpb.StresserType_value[s.Type]; !ok {
-			return nil, fmt.Errorf("unknown 'StresserType' %+v", s)
-		}
-	}
-
-	for _, v := range clus.Tester.Checkers {
-		if _, ok := rpcpb.Checker_value[v]; !ok {
-			return nil, fmt.Errorf("Checker is unknown; got %q", v)
-		}
-	}
-
-	if clus.Tester.StressKeySuffixRangeTxn > 100 {
-		return nil, fmt.Errorf("StressKeySuffixRangeTxn maximum value is 100, got %v", clus.Tester.StressKeySuffixRangeTxn)
-	}
-	if clus.Tester.StressKeyTxnOps > 64 {
-		return nil, fmt.Errorf("StressKeyTxnOps maximum value is 64, got %v", clus.Tester.StressKeyTxnOps)
-	}
-
 	for i, mem := range clus.Members {
-		if mem.EtcdExec == "embed" && failpointsEnabled {
-			return nil, errors.New("EtcdExec 'embed' cannot be run with failpoints enabled")
-		}
 		if mem.BaseDir == "" {
 			return nil, fmt.Errorf("BaseDir cannot be empty (got %q)", mem.BaseDir)
 		}
+		if mem.EtcdLogPath == "" {
+			return nil, fmt.Errorf("EtcdLogPath cannot be empty (got %q)", mem.EtcdLogPath)
+		}
+
 		if mem.Etcd.Name == "" {
 			return nil, fmt.Errorf("'--name' cannot be empty (got %+v)", mem)
 		}
@@ -174,6 +132,9 @@ func read(lg *zap.Logger, fpath string) (*Cluster, error) {
 			}
 		}

+		if !strings.HasPrefix(mem.EtcdLogPath, mem.BaseDir) {
+			return nil, fmt.Errorf("EtcdLogPath must be prefixed with BaseDir (got %q)", mem.EtcdLogPath)
+		}
 		if !strings.HasPrefix(mem.Etcd.DataDir, mem.BaseDir) {
 			return nil, fmt.Errorf("Etcd.DataDir must be prefixed with BaseDir (got %q)", mem.Etcd.DataDir)
 		}
@@ -227,7 +188,7 @@ func read(lg *zap.Logger, fpath string) (*Cluster, error) {
 			return nil, fmt.Errorf("Etcd.PeerClientCertAuth and Etcd.PeerAutoTLS cannot be both 'true'")
 		}
 		if (mem.Etcd.PeerCertFile == "") != (mem.Etcd.PeerKeyFile == "") {
-			return nil, fmt.Errorf("both Etcd.PeerCertFile %q and Etcd.PeerKeyFile %q must be either empty or non-empty", mem.Etcd.PeerCertFile, mem.Etcd.PeerKeyFile)
+			return nil, fmt.Errorf("Both Etcd.PeerCertFile %q and Etcd.PeerKeyFile %q must be either empty or non-empty", mem.Etcd.PeerCertFile, mem.Etcd.PeerKeyFile)
 		}
 		if mem.Etcd.ClientCertAuth && mem.Etcd.ClientAutoTLS {
 			return nil, fmt.Errorf("Etcd.ClientCertAuth and Etcd.ClientAutoTLS cannot be both 'true'")
@@ -251,7 +212,7 @@ func read(lg *zap.Logger, fpath string) (*Cluster, error) {
 			return nil, fmt.Errorf("Etcd.ClientCertAuth 'false', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.PeerCertFile)
 		}
 		if (mem.Etcd.ClientCertFile == "") != (mem.Etcd.ClientKeyFile == "") {
-			return nil, fmt.Errorf("both Etcd.ClientCertFile %q and Etcd.ClientKeyFile %q must be either empty or non-empty", mem.Etcd.ClientCertFile, mem.Etcd.ClientKeyFile)
+			return nil, fmt.Errorf("Both Etcd.ClientCertFile %q and Etcd.ClientKeyFile %q must be either empty or non-empty", mem.Etcd.ClientCertFile, mem.Etcd.ClientKeyFile)
 		}

 		peerTLS := mem.Etcd.PeerAutoTLS ||
@@ -356,21 +317,42 @@ func read(lg *zap.Logger, fpath string) (*Cluster, error) {
 				}
 				clus.Members[i].ClientCertData = string(data)
 			}
+		}
+	}

-			if len(mem.Etcd.LogOutputs) == 0 {
-				return nil, fmt.Errorf("mem.Etcd.LogOutputs cannot be empty")
-			}
-			for _, v := range mem.Etcd.LogOutputs {
-				switch v {
-				case "stderr", "stdout", "/dev/null", "default":
-				default:
-					if !strings.HasPrefix(v, mem.BaseDir) {
-						return nil, fmt.Errorf("LogOutput %q must be prefixed with BaseDir %q", v, mem.BaseDir)
-					}
-				}
-			}
+	if len(clus.Tester.Cases) == 0 {
+		return nil, errors.New("Cases not found")
+	}
+	if clus.Tester.DelayLatencyMs <= clus.Tester.DelayLatencyMsRv*5 {
+		return nil, fmt.Errorf("delay latency %d ms must be greater than 5x of delay latency random variable %d ms", clus.Tester.DelayLatencyMs, clus.Tester.DelayLatencyMsRv)
+	}
+	if clus.Tester.UpdatedDelayLatencyMs == 0 {
+		clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
+	}
+
+	for _, v := range clus.Tester.Cases {
+		if _, ok := rpcpb.Case_value[v]; !ok {
+			return nil, fmt.Errorf("%q is not defined in 'rpcpb.Case_value'", v)
+		}
+	}
+
+	for _, v := range clus.Tester.Stressers {
+		if _, ok := rpcpb.Stresser_value[v]; !ok {
+			return nil, fmt.Errorf("Stresser is unknown; got %q", v)
 		}
 	}
+	for _, v := range clus.Tester.Checkers {
+		if _, ok := rpcpb.Checker_value[v]; !ok {
+			return nil, fmt.Errorf("Checker is unknown; got %q", v)
+		}
+	}
+
+	if clus.Tester.StressKeySuffixRangeTxn > 100 {
+		return nil, fmt.Errorf("StressKeySuffixRangeTxn maximum value is 100, got %v", clus.Tester.StressKeySuffixRangeTxn)
+	}
+	if clus.Tester.StressKeyTxnOps > 64 {
+		return nil, fmt.Errorf("StressKeyTxnOps maximum value is 64, got %v", clus.Tester.StressKeyTxnOps)
+	}

 	return clus, err
 }
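
Note on the checks above: the BaseDir constraints in read() reduce to plain strings.HasPrefix guards over each member's paths. A minimal, self-contained sketch of the same idea (the member struct below is an illustrative stand-in, not the real rpcpb.Member):

package main

import (
	"fmt"
	"strings"
)

// member is a stand-in with only the fields the prefix check needs.
type member struct {
	BaseDir     string
	EtcdLogPath string
	DataDir     string
}

// validatePaths mirrors the idea of the checks in read(): every per-member
// path must live under that member's BaseDir.
func validatePaths(m member) error {
	for name, p := range map[string]string{
		"EtcdLogPath":  m.EtcdLogPath,
		"Etcd.DataDir": m.DataDir,
	} {
		if !strings.HasPrefix(p, m.BaseDir) {
			return fmt.Errorf("%s %q must be prefixed with BaseDir %q", name, p, m.BaseDir)
		}
	}
	return nil
}

func main() {
	m := member{
		BaseDir:     "/tmp/etcd-functional-1",
		EtcdLogPath: "/tmp/etcd-functional-1/etcd.log",
		DataDir:     "/tmp/etcd-functional-1/etcd.data",
	}
	fmt.Println(validatePaths(m)) // <nil>
}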

+ 2 - 2
functional/tester/cluster_run.go

@@ -212,8 +212,8 @@ func (clus *Cluster) doRound() error {
 				)

 				// with network delay, some ongoing requests may fail
-				// only return error, if more than 30% of QPS requests fail
-				if cnt > int(float64(clus.Tester.StressQPS)*0.3) {
+				// only return error, if more than 10% of QPS requests fail
+				if cnt > int(clus.Tester.StressQPS)/10 {
 					return fmt.Errorf("expected no error in %q, got %q", fcase.String(), ess)
 				}
 			}
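
The new failure condition uses integer division, so with a StressQPS of 2000 (the value used in the functional config) a round only aborts once more than 200 stress errors accumulate, and any StressQPS below 10 yields a threshold of 0. A small hedged sketch of that check, with an illustrative helper name:

package main

import "fmt"

// tooManyStressErrors mirrors the 10%-of-QPS threshold used in doRound.
// Integer division means stressQPS below 10 gives a threshold of 0,
// so any error would then fail the round.
func tooManyStressErrors(errCount int, stressQPS int32) bool {
	return errCount > int(stressQPS)/10
}

func main() {
	fmt.Println(tooManyStressErrors(150, 2000)) // false: 150 <= 200
	fmt.Println(tooManyStressErrors(201, 2000)) // true:  201 >  200
}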

+ 0 - 0
functional/tester/cluster_shuffle.go


+ 0 - 304
functional/tester/cluster_test.go

@@ -1,304 +0,0 @@
-// Copyright 2018 The etcd Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tester
-
-import (
-	"reflect"
-	"sort"
-	"testing"
-
-	"github.com/coreos/etcd/functional/rpcpb"
-
-	"go.uber.org/zap"
-)
-
-func Test_read(t *testing.T) {
-	exp := &Cluster{
-		Members: []*rpcpb.Member{
-			{
-				EtcdExec:           "./bin/etcd",
-				AgentAddr:          "127.0.0.1:19027",
-				FailpointHTTPAddr:  "http://127.0.0.1:7381",
-				BaseDir:            "/tmp/etcd-functional-1",
-				EtcdClientProxy:    false,
-				EtcdPeerProxy:      true,
-				EtcdClientEndpoint: "127.0.0.1:1379",
-				Etcd: &rpcpb.Etcd{
-					Name:                "s1",
-					DataDir:             "/tmp/etcd-functional-1/etcd.data",
-					WALDir:              "/tmp/etcd-functional-1/etcd.data/member/wal",
-					HeartbeatIntervalMs: 100,
-					ElectionTimeoutMs:   1000,
-					ListenClientURLs:    []string{"https://127.0.0.1:1379"},
-					AdvertiseClientURLs: []string{"https://127.0.0.1:1379"},
-					ClientAutoTLS:       true,
-					ClientCertAuth:      false,
-					ClientCertFile:      "",
-					ClientKeyFile:       "",
-					ClientTrustedCAFile: "",
-					ListenPeerURLs:      []string{"https://127.0.0.1:1380"},
-					AdvertisePeerURLs:   []string{"https://127.0.0.1:1381"},
-					PeerAutoTLS:         true,
-					PeerClientCertAuth:  false,
-					PeerCertFile:        "",
-					PeerKeyFile:         "",
-					PeerTrustedCAFile:   "",
-					InitialCluster:      "s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381",
-					InitialClusterState: "new",
-					InitialClusterToken: "tkn",
-					SnapshotCount:       10000,
-					QuotaBackendBytes:   10740000000,
-					PreVote:             true,
-					InitialCorruptCheck: true,
-					Logger:              "zap",
-					LogOutputs:          []string{"/tmp/etcd-functional-1/etcd.log"},
-					Debug:               true,
-				},
-				ClientCertData:      "",
-				ClientCertPath:      "",
-				ClientKeyData:       "",
-				ClientKeyPath:       "",
-				ClientTrustedCAData: "",
-				ClientTrustedCAPath: "",
-				PeerCertData:        "",
-				PeerCertPath:        "",
-				PeerKeyData:         "",
-				PeerKeyPath:         "",
-				PeerTrustedCAData:   "",
-				PeerTrustedCAPath:   "",
-				SnapshotPath:        "/tmp/etcd-functional-1.snapshot.db",
-			},
-			{
-				EtcdExec:           "./bin/etcd",
-				AgentAddr:          "127.0.0.1:29027",
-				FailpointHTTPAddr:  "http://127.0.0.1:7382",
-				BaseDir:            "/tmp/etcd-functional-2",
-				EtcdClientProxy:    false,
-				EtcdPeerProxy:      true,
-				EtcdClientEndpoint: "127.0.0.1:2379",
-				Etcd: &rpcpb.Etcd{
-					Name:                "s2",
-					DataDir:             "/tmp/etcd-functional-2/etcd.data",
-					WALDir:              "/tmp/etcd-functional-2/etcd.data/member/wal",
-					HeartbeatIntervalMs: 100,
-					ElectionTimeoutMs:   1000,
-					ListenClientURLs:    []string{"https://127.0.0.1:2379"},
-					AdvertiseClientURLs: []string{"https://127.0.0.1:2379"},
-					ClientAutoTLS:       true,
-					ClientCertAuth:      false,
-					ClientCertFile:      "",
-					ClientKeyFile:       "",
-					ClientTrustedCAFile: "",
-					ListenPeerURLs:      []string{"https://127.0.0.1:2380"},
-					AdvertisePeerURLs:   []string{"https://127.0.0.1:2381"},
-					PeerAutoTLS:         true,
-					PeerClientCertAuth:  false,
-					PeerCertFile:        "",
-					PeerKeyFile:         "",
-					PeerTrustedCAFile:   "",
-					InitialCluster:      "s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381",
-					InitialClusterState: "new",
-					InitialClusterToken: "tkn",
-					SnapshotCount:       10000,
-					QuotaBackendBytes:   10740000000,
-					PreVote:             true,
-					InitialCorruptCheck: true,
-					Logger:              "zap",
-					LogOutputs:          []string{"/tmp/etcd-functional-2/etcd.log"},
-					Debug:               true,
-				},
-				ClientCertData:      "",
-				ClientCertPath:      "",
-				ClientKeyData:       "",
-				ClientKeyPath:       "",
-				ClientTrustedCAData: "",
-				ClientTrustedCAPath: "",
-				PeerCertData:        "",
-				PeerCertPath:        "",
-				PeerKeyData:         "",
-				PeerKeyPath:         "",
-				PeerTrustedCAData:   "",
-				PeerTrustedCAPath:   "",
-				SnapshotPath:        "/tmp/etcd-functional-2.snapshot.db",
-			},
-			{
-				EtcdExec:           "./bin/etcd",
-				AgentAddr:          "127.0.0.1:39027",
-				FailpointHTTPAddr:  "http://127.0.0.1:7383",
-				BaseDir:            "/tmp/etcd-functional-3",
-				EtcdClientProxy:    false,
-				EtcdPeerProxy:      true,
-				EtcdClientEndpoint: "127.0.0.1:3379",
-				Etcd: &rpcpb.Etcd{
-					Name:                "s3",
-					DataDir:             "/tmp/etcd-functional-3/etcd.data",
-					WALDir:              "/tmp/etcd-functional-3/etcd.data/member/wal",
-					HeartbeatIntervalMs: 100,
-					ElectionTimeoutMs:   1000,
-					ListenClientURLs:    []string{"https://127.0.0.1:3379"},
-					AdvertiseClientURLs: []string{"https://127.0.0.1:3379"},
-					ClientAutoTLS:       true,
-					ClientCertAuth:      false,
-					ClientCertFile:      "",
-					ClientKeyFile:       "",
-					ClientTrustedCAFile: "",
-					ListenPeerURLs:      []string{"https://127.0.0.1:3380"},
-					AdvertisePeerURLs:   []string{"https://127.0.0.1:3381"},
-					PeerAutoTLS:         true,
-					PeerClientCertAuth:  false,
-					PeerCertFile:        "",
-					PeerKeyFile:         "",
-					PeerTrustedCAFile:   "",
-					InitialCluster:      "s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381",
-					InitialClusterState: "new",
-					InitialClusterToken: "tkn",
-					SnapshotCount:       10000,
-					QuotaBackendBytes:   10740000000,
-					PreVote:             true,
-					InitialCorruptCheck: true,
-					Logger:              "zap",
-					LogOutputs:          []string{"/tmp/etcd-functional-3/etcd.log"},
-					Debug:               true,
-				},
-				ClientCertData:      "",
-				ClientCertPath:      "",
-				ClientKeyData:       "",
-				ClientKeyPath:       "",
-				ClientTrustedCAData: "",
-				ClientTrustedCAPath: "",
-				PeerCertData:        "",
-				PeerCertPath:        "",
-				PeerKeyData:         "",
-				PeerKeyPath:         "",
-				PeerTrustedCAData:   "",
-				PeerTrustedCAPath:   "",
-				SnapshotPath:        "/tmp/etcd-functional-3.snapshot.db",
-			},
-		},
-		Tester: &rpcpb.Tester{
-			DataDir:               "/tmp/etcd-tester-data",
-			Network:               "tcp",
-			Addr:                  "127.0.0.1:9028",
-			DelayLatencyMs:        5000,
-			DelayLatencyMsRv:      500,
-			UpdatedDelayLatencyMs: 5000,
-			RoundLimit:            1,
-			ExitOnCaseFail:        true,
-			EnablePprof:           true,
-			CaseDelayMs:           7000,
-			CaseShuffle:           true,
-			Cases: []string{
-				"SIGTERM_ONE_FOLLOWER",
-				"SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
-				"SIGTERM_LEADER",
-				"SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT",
-				"SIGTERM_QUORUM",
-				"SIGTERM_ALL",
-				"SIGQUIT_AND_REMOVE_ONE_FOLLOWER",
-				"SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
-				// "SIGQUIT_AND_REMOVE_LEADER",
-				// "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT",
-				// "SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH",
-				// "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER",
-				// "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
-				"BLACKHOLE_PEER_PORT_TX_RX_LEADER",
-				"BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT",
-				"BLACKHOLE_PEER_PORT_TX_RX_QUORUM",
-				"BLACKHOLE_PEER_PORT_TX_RX_ALL",
-				// "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER",
-				// "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER",
-				// "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
-				// "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
-				"DELAY_PEER_PORT_TX_RX_LEADER",
-				"RANDOM_DELAY_PEER_PORT_TX_RX_LEADER",
-				"DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT",
-				"RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT",
-				"DELAY_PEER_PORT_TX_RX_QUORUM",
-				"RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM",
-				"DELAY_PEER_PORT_TX_RX_ALL",
-				"RANDOM_DELAY_PEER_PORT_TX_RX_ALL",
-				"NO_FAIL_WITH_STRESS",
-				"NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS",
-			},
-			FailpointCommands: []string{`panic("etcd-tester")`},
-			RunnerExecPath:    "./bin/etcd-runner",
-			ExternalExecPath:  "",
-			Stressers: []*rpcpb.Stresser{
-				{Type: "KV_WRITE_SMALL", Weight: 0.35},
-				{Type: "KV_WRITE_LARGE", Weight: 0.002},
-				{Type: "KV_READ_ONE_KEY", Weight: 0.07},
-				{Type: "KV_READ_RANGE", Weight: 0.07},
-				{Type: "KV_DELETE_ONE_KEY", Weight: 0.07},
-				{Type: "KV_DELETE_RANGE", Weight: 0.07},
-				{Type: "KV_TXN_WRITE_DELETE", Weight: 0.35},
-				{Type: "LEASE", Weight: 0.0},
-			},
-			Checkers:                []string{"KV_HASH", "LEASE_EXPIRE"},
-			StressKeySize:           100,
-			StressKeySizeLarge:      32769,
-			StressKeySuffixRange:    250000,
-			StressKeySuffixRangeTxn: 100,
-			StressKeyTxnOps:         10,
-			StressClients:           100,
-			StressQPS:               2000,
-		},
-	}
-
-	logger, err := zap.NewProduction()
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer logger.Sync()
-
-	cfg, err := read(logger, "../../functional.yaml")
-	if err != nil {
-		t.Fatal(err)
-	}
-	cfg.lg = nil
-
-	if !reflect.DeepEqual(exp, cfg) {
-		t.Fatalf("expected %+v, got %+v", exp, cfg)
-	}
-
-	cfg.lg = logger
-
-	cfg.updateCases()
-	fs1 := cfg.listCases()
-
-	cfg.shuffleCases()
-	fs2 := cfg.listCases()
-	if reflect.DeepEqual(fs1, fs2) {
-		t.Fatalf("expected shuffled failure cases, got %q", fs2)
-	}
-
-	cfg.shuffleCases()
-	fs3 := cfg.listCases()
-	if reflect.DeepEqual(fs2, fs3) {
-		t.Fatalf("expected reshuffled failure cases from %q, got %q", fs2, fs3)
-	}
-
-	// shuffle ensures visit all exactly once
-	// so when sorted, failure cases must be equal
-	sort.Strings(fs1)
-	sort.Strings(fs2)
-	sort.Strings(fs3)
-
-	if !reflect.DeepEqual(fs1, fs2) {
-		t.Fatalf("expected %q, got %q", fs1, fs2)
-	}
-	if !reflect.DeepEqual(fs2, fs3) {
-		t.Fatalf("expected %q, got %q", fs2, fs3)
-	}
-}

+ 0 - 0
functional/tester/doc.go


+ 0 - 0
functional/tester/metrics_report.go


+ 35 - 59
functional/tester/stresser.go

@@ -37,60 +37,40 @@ type Stresser interface {

 // newStresser creates stresser from a comma separated list of stresser types.
 func newStresser(clus *Cluster, m *rpcpb.Member) (stressers []Stresser) {
-	// TODO: Too intensive stressing clients can panic etcd member with
-	// 'out of memory' error. Put rate limits in server side.
-	ks := &keyStresser{
-		lg:                clus.lg,
-		m:                 m,
-		keySize:           int(clus.Tester.StressKeySize),
-		keyLargeSize:      int(clus.Tester.StressKeySizeLarge),
-		keySuffixRange:    int(clus.Tester.StressKeySuffixRange),
-		keyTxnSuffixRange: int(clus.Tester.StressKeySuffixRangeTxn),
-		keyTxnOps:         int(clus.Tester.StressKeyTxnOps),
-		clientsN:          int(clus.Tester.StressClients),
-		rateLimiter:       clus.rateLimiter,
-	}
-	ksExist := false
-
-	for _, s := range clus.Tester.Stressers {
+	stressers = make([]Stresser, len(clus.Tester.Stressers))
+	for i, stype := range clus.Tester.Stressers {
 		clus.lg.Info(
 			"creating stresser",
-			zap.String("type", s.Type),
-			zap.Float64("weight", s.Weight),
+			zap.String("type", stype),
 			zap.String("endpoint", m.EtcdClientEndpoint),
 		)
-		switch s.Type {
-		case "KV_WRITE_SMALL":
-			ksExist = true
-			ks.weightKVWriteSmall = s.Weight
-		case "KV_WRITE_LARGE":
-			ksExist = true
-			ks.weightKVWriteLarge = s.Weight
-		case "KV_READ_ONE_KEY":
-			ksExist = true
-			ks.weightKVReadOneKey = s.Weight
-		case "KV_READ_RANGE":
-			ksExist = true
-			ks.weightKVReadRange = s.Weight
-		case "KV_DELETE_ONE_KEY":
-			ksExist = true
-			ks.weightKVDeleteOneKey = s.Weight
-		case "KV_DELETE_RANGE":
-			ksExist = true
-			ks.weightKVDeleteRange = s.Weight
-		case "KV_TXN_WRITE_DELETE":
-			ksExist = true
-			ks.weightKVTxnWriteDelete = s.Weight
+
+		switch stype {
+		case "KV":
+			// TODO: Too intensive stressing clients can panic etcd member with
+			// 'out of memory' error. Put rate limits in server side.
+			stressers[i] = &keyStresser{
+				stype:             rpcpb.Stresser_KV,
+				lg:                clus.lg,
+				m:                 m,
+				keySize:           int(clus.Tester.StressKeySize),
+				keyLargeSize:      int(clus.Tester.StressKeySizeLarge),
+				keySuffixRange:    int(clus.Tester.StressKeySuffixRange),
+				keyTxnSuffixRange: int(clus.Tester.StressKeySuffixRangeTxn),
+				keyTxnOps:         int(clus.Tester.StressKeyTxnOps),
+				clientsN:          int(clus.Tester.StressClients),
+				rateLimiter:       clus.rateLimiter,
+			}

 		case "LEASE":
-			stressers = append(stressers, &leaseStresser{
-				stype:        rpcpb.StresserType_LEASE,
+			stressers[i] = &leaseStresser{
+				stype:        rpcpb.Stresser_LEASE,
 				lg:           clus.lg,
 				m:            m,
 				numLeases:    10, // TODO: configurable
 				keysPerLease: 10, // TODO: configurable
 				rateLimiter:  clus.rateLimiter,
-			})
+			}

 		case "ELECTION_RUNNER":
 			reqRate := 100
@@ -103,15 +83,15 @@ func newStresser(clus *Cluster, m *rpcpb.Member) (stressers []Stresser) {
 				"--rounds=0", // runs forever
 				"--req-rate", fmt.Sprintf("%v", reqRate),
 			}
-			stressers = append(stressers, newRunnerStresser(
-				rpcpb.StresserType_ELECTION_RUNNER,
+			stressers[i] = newRunnerStresser(
+				rpcpb.Stresser_ELECTION_RUNNER,
 				m.EtcdClientEndpoint,
 				clus.lg,
 				clus.Tester.RunnerExecPath,
 				args,
 				clus.rateLimiter,
 				reqRate,
-			))
+			)

 		case "WATCH_RUNNER":
 			reqRate := 100
@@ -125,15 +105,15 @@ func newStresser(clus *Cluster, m *rpcpb.Member) (stressers []Stresser) {
 				"--rounds=0", // runs forever
 				"--req-rate", fmt.Sprintf("%v", reqRate),
 			}
-			stressers = append(stressers, newRunnerStresser(
-				rpcpb.StresserType_WATCH_RUNNER,
+			stressers[i] = newRunnerStresser(
+				rpcpb.Stresser_WATCH_RUNNER,
 				m.EtcdClientEndpoint,
 				clus.lg,
 				clus.Tester.RunnerExecPath,
 				args,
 				clus.rateLimiter,
 				reqRate,
-			))
+			)

 		case "LOCK_RACER_RUNNER":
 			reqRate := 100
@@ -145,15 +125,15 @@ func newStresser(clus *Cluster, m *rpcpb.Member) (stressers []Stresser) {
 				"--rounds=0", // runs forever
 				"--req-rate", fmt.Sprintf("%v", reqRate),
 			}
-			stressers = append(stressers, newRunnerStresser(
-				rpcpb.StresserType_LOCK_RACER_RUNNER,
+			stressers[i] = newRunnerStresser(
+				rpcpb.Stresser_LOCK_RACER_RUNNER,
 				m.EtcdClientEndpoint,
 				clus.lg,
 				clus.Tester.RunnerExecPath,
 				args,
 				clus.rateLimiter,
 				reqRate,
-			))
+			)

 		case "LEASE_RUNNER":
 			args := []string{
@@ -161,20 +141,16 @@ func newStresser(clus *Cluster, m *rpcpb.Member) (stressers []Stresser) {
 				"--ttl=30",
 				"--endpoints", m.EtcdClientEndpoint,
 			}
-			stressers = append(stressers, newRunnerStresser(
-				rpcpb.StresserType_LEASE_RUNNER,
+			stressers[i] = newRunnerStresser(
+				rpcpb.Stresser_LEASE_RUNNER,
 				m.EtcdClientEndpoint,
 				clus.lg,
 				clus.Tester.RunnerExecPath,
 				args,
 				clus.rateLimiter,
 				0,
-			))
+			)
 		}
 	}
-
-	if ksExist {
-		return append(stressers, ks)
-	}
 	return stressers
 }
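
For context, the stressers built above are driven through the Stresser interface elsewhere in the tester. A minimal, self-contained sketch of the start/stop pattern (the interface here is trimmed to the methods visible in this diff, and noopStresser is a made-up stand-in, not part of the package):

package main

import "fmt"

// Stresser is trimmed to the methods used in this sketch; the real interface
// lives in functional/tester/stresser.go.
type Stresser interface {
	Stress() error
	Close() map[string]int
}

// noopStresser is a hypothetical stand-in used only for illustration.
type noopStresser struct{ name string }

func (s *noopStresser) Stress() error { fmt.Println("start", s.name); return nil }
func (s *noopStresser) Close() map[string]int {
	fmt.Println("stop", s.name)
	return map[string]int{}
}

func main() {
	// In the tester, newStresser builds one entry per configured type
	// ("KV", "LEASE", "ELECTION_RUNNER", ...); here we fake two of them.
	stressers := []Stresser{&noopStresser{"KV"}, &noopStresser{"LEASE"}}

	for _, s := range stressers {
		if err := s.Stress(); err != nil {
			fmt.Println("stress failed:", err)
		}
	}
	for _, s := range stressers {
		_ = s.Close() // returns per-error counts in the real implementation
	}
}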

+ 0 - 0
functional/tester/stresser_composite.go


+ 63 - 80
functional/tester/stresser_key.go

@@ -31,23 +31,14 @@ import (
 	"go.uber.org/zap"
 	"golang.org/x/time/rate"
 	"google.golang.org/grpc"
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
 )

 type keyStresser struct {
-	lg *zap.Logger
+	stype rpcpb.Stresser
+	lg    *zap.Logger

 	m *rpcpb.Member

-	weightKVWriteSmall     float64
-	weightKVWriteLarge     float64
-	weightKVReadOneKey     float64
-	weightKVReadRange      float64
-	weightKVDeleteOneKey   float64
-	weightKVDeleteRange    float64
-	weightKVTxnWriteDelete float64
-
 	keySize           int
 	keyLargeSize      int
 	keySuffixRange    int
@@ -82,16 +73,26 @@ func (s *keyStresser) Stress() error {
 	s.ctx, s.cancel = context.WithCancel(context.Background())

 	s.wg.Add(s.clientsN)
-
-	s.stressTable = createStressTable([]stressEntry{
-		{weight: s.weightKVWriteSmall, f: newStressPut(s.cli, s.keySuffixRange, s.keySize)},
-		{weight: s.weightKVWriteLarge, f: newStressPut(s.cli, s.keySuffixRange, s.keyLargeSize)},
-		{weight: s.weightKVReadOneKey, f: newStressRange(s.cli, s.keySuffixRange)},
-		{weight: s.weightKVReadRange, f: newStressRangeInterval(s.cli, s.keySuffixRange)},
-		{weight: s.weightKVDeleteOneKey, f: newStressDelete(s.cli, s.keySuffixRange)},
-		{weight: s.weightKVDeleteRange, f: newStressDeleteInterval(s.cli, s.keySuffixRange)},
-		{weight: s.weightKVTxnWriteDelete, f: newStressTxn(s.cli, s.keyTxnSuffixRange, s.keyTxnOps)},
-	})
+	var stressEntries = []stressEntry{
+		{weight: 0.7, f: newStressPut(s.cli, s.keySuffixRange, s.keySize)},
+		{
+			weight: 0.7 * float32(s.keySize) / float32(s.keyLargeSize),
+			f:      newStressPut(s.cli, s.keySuffixRange, s.keyLargeSize),
+		},
+		{weight: 0.07, f: newStressRange(s.cli, s.keySuffixRange)},
+		{weight: 0.07, f: newStressRangeInterval(s.cli, s.keySuffixRange)},
+		{weight: 0.07, f: newStressDelete(s.cli, s.keySuffixRange)},
+		{weight: 0.07, f: newStressDeleteInterval(s.cli, s.keySuffixRange)},
+	}
+	if s.keyTxnSuffixRange > 0 {
+		// adjust to make up ±70% of workloads with writes
+		stressEntries[0].weight = 0.35
+		stressEntries = append(stressEntries, stressEntry{
+			weight: 0.35,
+			f:      newStressTxn(s.cli, s.keyTxnSuffixRange, s.keyTxnOps),
+		})
+	}
+	s.stressTable = createStressTable(stressEntries)

 	s.emu.Lock()
 	s.paused = false
@@ -103,7 +104,7 @@ func (s *keyStresser) Stress() error {

 	s.lg.Info(
 		"stress START",
-		zap.String("stress-type", "KV"),
+		zap.String("stress-type", s.stype.String()),
 		zap.String("endpoint", s.m.EtcdClientEndpoint),
 	)
 	return nil
@@ -128,7 +129,41 @@ func (s *keyStresser) run() {
 			continue
 		}

-		if !s.isRetryableError(err) {
+		switch rpctypes.ErrorDesc(err) {
+		case context.DeadlineExceeded.Error():
+			// This retries when request is triggered at the same time as
+			// leader failure. When we terminate the leader, the request to
+			// that leader cannot be processed, and times out. Also requests
+			// to followers cannot be forwarded to the old leader, so timing out
+			// as well. We want to keep stressing until the cluster elects a
+			// new leader and start processing requests again.
+		case etcdserver.ErrTimeoutDueToLeaderFail.Error(), etcdserver.ErrTimeout.Error():
+			// This retries when request is triggered at the same time as
+			// leader failure and follower nodes receive time out errors
+			// from losing their leader. Followers should retry to connect
+			// to the new leader.
+		case etcdserver.ErrStopped.Error():
+			// one of the etcd nodes stopped from failure injection
+		// case transport.ErrConnClosing.Desc:
+		// 	// server closed the transport (failure injected node)
+		case rpctypes.ErrNotCapable.Error():
+			// capability check has not been done (in the beginning)
+		case rpctypes.ErrTooManyRequests.Error():
+			// hitting the recovering member.
+		case context.Canceled.Error():
+			// from stresser.Cancel method:
+			return
+		case grpc.ErrClientConnClosing.Error():
+			// from stresser.Cancel method:
+			return
+		default:
+			s.lg.Warn(
+				"stress run exiting",
+				zap.String("stress-type", s.stype.String()),
+				zap.String("endpoint", s.m.EtcdClientEndpoint),
+				zap.String("error-type", reflect.TypeOf(err).String()),
+				zap.Error(err),
+			)
 			return
 		}

@@ -141,58 +176,6 @@ func (s *keyStresser) run() {
 	}
 }

-func (s *keyStresser) isRetryableError(err error) bool {
-	switch rpctypes.ErrorDesc(err) {
-	// retryable
-	case context.DeadlineExceeded.Error():
-		// This retries when request is triggered at the same time as
-		// leader failure. When we terminate the leader, the request to
-		// that leader cannot be processed, and times out. Also requests
-		// to followers cannot be forwarded to the old leader, so timing out
-		// as well. We want to keep stressing until the cluster elects a
-		// new leader and start processing requests again.
-		return true
-	case etcdserver.ErrTimeoutDueToLeaderFail.Error(), etcdserver.ErrTimeout.Error():
-		// This retries when request is triggered at the same time as
-		// leader failure and follower nodes receive time out errors
-		// from losing their leader. Followers should retry to connect
-		// to the new leader.
-		return true
-	case etcdserver.ErrStopped.Error():
-		// one of the etcd nodes stopped from failure injection
-		return true
-	case rpctypes.ErrNotCapable.Error():
-		// capability check has not been done (in the beginning)
-		return true
-	case rpctypes.ErrTooManyRequests.Error():
-		// hitting the recovering member.
-		return true
-	// case raft.ErrProposalDropped.Error():
-	// 	// removed member, or leadership has changed (old leader got raftpb.MsgProp)
-	// 	return true
-
-	// not retryable.
-	case context.Canceled.Error():
-		// from stresser.Cancel method:
-		return false
-	}
-
-	if status.Convert(err).Code() == codes.Unavailable {
-		// gRPC connection errors are translated to status.Unavailable
-		return true
-	}
-
-	s.lg.Warn(
-		"stress run exiting",
-		zap.String("stress-type", "KV"),
-		zap.String("endpoint", s.m.EtcdClientEndpoint),
-		zap.String("error-type", reflect.TypeOf(err).String()),
-		zap.String("error-desc", rpctypes.ErrorDesc(err)),
-		zap.Error(err),
-	)
-	return false
-}
-
 func (s *keyStresser) Pause() map[string]int {
 	return s.Close()
 }
@@ -210,7 +193,7 @@ func (s *keyStresser) Close() map[string]int {

 	s.lg.Info(
 		"stress STOP",
-		zap.String("stress-type", "KV"),
+		zap.String("stress-type", s.stype.String()),
 		zap.String("endpoint", s.m.EtcdClientEndpoint),
 	)
 	return ess
@@ -223,13 +206,13 @@ func (s *keyStresser) ModifiedKeys() int64 {
 type stressFunc func(ctx context.Context) (err error, modifiedKeys int64)

 type stressEntry struct {
-	weight float64
+	weight float32
 	f      stressFunc
 }

 type stressTable struct {
 	entries    []stressEntry
-	sumWeights float64
+	sumWeights float32
 }

 func createStressTable(entries []stressEntry) *stressTable {
@@ -241,8 +224,8 @@ func createStressTable(entries []stressEntry) *stressTable {
 }

 func (st *stressTable) choose() stressFunc {
-	v := rand.Float64() * st.sumWeights
-	var sum float64
+	v := rand.Float32() * st.sumWeights
+	var sum float32
 	var idx int
 	for i := range st.entries {
 		sum += st.entries[i].weight
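
The table above implements a plain cumulative-weight draw: choose picks a random point in [0, sumWeights) and returns the first entry whose running sum passes it, so each entry is selected with probability weight/sumWeights. A standalone sketch of the same idea with stand-in entries rather than the real stressFuncs:

package main

import (
	"fmt"
	"math/rand"
)

type entry struct {
	weight float32
	name   string
}

// choose walks the cumulative sum of weights and returns the entry whose
// bucket contains the random draw.
func choose(entries []entry) string {
	var sum float32
	for _, e := range entries {
		sum += e.weight
	}
	v := rand.Float32() * sum
	var acc float32
	for _, e := range entries {
		acc += e.weight
		if v < acc {
			return e.name
		}
	}
	return entries[len(entries)-1].name
}

func main() {
	entries := []entry{
		{0.35, "put-small"},
		{0.07, "range"},
		{0.07, "delete"},
		{0.35, "txn"},
	}
	counts := map[string]int{}
	for i := 0; i < 10000; i++ {
		counts[choose(entries)]++
	}
	fmt.Println(counts) // roughly proportional to the weights
}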

+ 1 - 1
functional/tester/stresser_lease.go

@@ -38,7 +38,7 @@ const (
 )

 type leaseStresser struct {
-	stype rpcpb.StresserType
+	stype rpcpb.Stresser
 	lg    *zap.Logger

 	m      *rpcpb.Member

+ 2 - 3
functional/tester/stresser_runner.go

@@ -27,7 +27,7 @@ import (
 )

 type runnerStresser struct {
-	stype              rpcpb.StresserType
+	stype              rpcpb.Stresser
 	etcdClientEndpoint string
 	lg                 *zap.Logger

@@ -42,7 +42,7 @@ type runnerStresser struct {
 }

 func newRunnerStresser(
-	stype rpcpb.StresserType,
+	stype rpcpb.Stresser,
 	ep string,
 	lg *zap.Logger,
 	cmdStr string,
@@ -54,7 +54,6 @@ func newRunnerStresser(
 	return &runnerStresser{
 		stype:              stype,
 		etcdClientEndpoint: ep,
-		lg:                 lg,
 		cmdStr:             cmdStr,
 		cmdStr:             cmdStr,
 		args:               args,
 		args:               args,
 		rl:                 rl,
 		rl:                 rl,

+ 0 - 0
functional/tester/utils.go


Some files were not shown because too many files changed in this diff