main.go 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package main
import (
	"flag"
	"fmt"
	"net/http"
	"net/http/pprof"
	"os"
	"strconv"
	"strings"

	"github.com/coreos/pkg/capnslog"
	"github.com/prometheus/client_golang/prometheus"
	"golang.org/x/time/rate"
)
  26. var plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "etcd-tester")
  27. const (
  28. defaultClientPort = 2379
  29. defaultPeerPort = 2380
  30. defaultFailpointPort = 2381
  31. )
  32. const pprofPrefix = "/debug/pprof-tester"
  33. func main() {
  34. endpointStr := flag.String("agent-endpoints", "localhost:9027", "HTTP RPC endpoints of agents. Do not specify the schema.")
  35. clientPorts := flag.String("client-ports", "", "etcd client port for each agent endpoint")
  36. peerPorts := flag.String("peer-ports", "", "etcd peer port for each agent endpoint")
  37. failpointPorts := flag.String("failpoint-ports", "", "etcd failpoint port for each agent endpoint")
  38. datadir := flag.String("data-dir", "agent.etcd", "etcd data directory location on agent machine.")
  39. stressKeyLargeSize := flag.Uint("stress-key-large-size", 32*1024+1, "the size of each large key written into etcd.")
  40. stressKeySize := flag.Uint("stress-key-size", 100, "the size of each small key written into etcd.")
  41. stressKeySuffixRange := flag.Uint("stress-key-count", 250000, "the count of key range written into etcd.")
  42. limit := flag.Int("limit", -1, "the limit of rounds to run failure set (-1 to run without limits).")
  43. stressQPS := flag.Int("stress-qps", 10000, "maximum number of stresser requests per second.")
  44. schedCases := flag.String("schedule-cases", "", "test case schedule")
  45. consistencyCheck := flag.Bool("consistency-check", true, "true to check consistency (revision, hash)")
  46. stresserType := flag.String("stresser", "keys,lease", "comma separated list of stressers (keys, lease, v2keys, nop).")
  47. failureTypes := flag.String("failures", "default,failpoints", "specify failures (concat of \"default\" and \"failpoints\").")
  48. externalFailures := flag.String("external-failures", "", "specify a path of script for enabling/disabling an external fault injector")
  49. enablePprof := flag.Bool("enable-pprof", false, "true to enable pprof")
  50. flag.Parse()
  51. eps := strings.Split(*endpointStr, ",")
  52. cports := portsFromArg(*clientPorts, len(eps), defaultClientPort)
  53. pports := portsFromArg(*peerPorts, len(eps), defaultPeerPort)
  54. fports := portsFromArg(*failpointPorts, len(eps), defaultFailpointPort)
  55. agents := make([]agentConfig, len(eps))
  56. for i := range eps {
  57. agents[i].endpoint = eps[i]
  58. agents[i].clientPort = cports[i]
  59. agents[i].peerPort = pports[i]
  60. agents[i].failpointPort = fports[i]
  61. agents[i].datadir = *datadir
  62. }
  63. c := &cluster{agents: agents}
  64. if err := c.bootstrap(); err != nil {
  65. plog.Fatal(err)
  66. }
  67. defer c.Terminate()
  68. // ensure cluster is fully booted to know failpoints are available
  69. c.WaitHealth()
  70. var failures []failure
  71. if failureTypes != nil && *failureTypes != "" {
  72. failures = makeFailures(*failureTypes, c)
  73. }
  74. if externalFailures != nil && *externalFailures != "" {
  75. if len(failures) != 0 {
  76. plog.Errorf("specify only one of -failures or -external-failures")
  77. os.Exit(1)
  78. }
  79. failures = append(failures, newFailureExternal(*externalFailures))
  80. }
  81. if len(failures) == 0 {
  82. plog.Infof("no failures\n")
  83. failures = append(failures, newFailureNop())
  84. }
  85. schedule := failures
  86. if schedCases != nil && *schedCases != "" {
  87. cases := strings.Split(*schedCases, " ")
  88. schedule = make([]failure, len(cases))
  89. for i := range cases {
  90. caseNum := 0
  91. n, err := fmt.Sscanf(cases[i], "%d", &caseNum)
  92. if n == 0 || err != nil {
  93. plog.Fatalf(`couldn't parse case "%s" (%v)`, cases[i], err)
  94. }
  95. schedule[i] = failures[caseNum]
  96. }
  97. }
  98. scfg := stressConfig{
  99. rateLimiter: rate.NewLimiter(rate.Limit(*stressQPS), *stressQPS),
  100. keyLargeSize: int(*stressKeyLargeSize),
  101. keySize: int(*stressKeySize),
  102. keySuffixRange: int(*stressKeySuffixRange),
  103. numLeases: 10,
  104. keysPerLease: 10,
  105. }
  106. t := &tester{
  107. failures: schedule,
  108. cluster: c,
  109. limit: *limit,
  110. scfg: scfg,
  111. stresserType: *stresserType,
  112. doChecks: *consistencyCheck,
  113. }
  114. sh := statusHandler{status: &t.status}
  115. http.Handle("/status", sh)
  116. http.Handle("/metrics", prometheus.Handler())
  117. if *enablePprof {
  118. http.Handle(pprofPrefix+"/", http.HandlerFunc(pprof.Index))
  119. http.Handle(pprofPrefix+"/profile", http.HandlerFunc(pprof.Profile))
  120. http.Handle(pprofPrefix+"/symbol", http.HandlerFunc(pprof.Symbol))
  121. http.Handle(pprofPrefix+"/cmdline", http.HandlerFunc(pprof.Cmdline))
  122. http.Handle(pprofPrefix+"/trace", http.HandlerFunc(pprof.Trace))
  123. http.Handle(pprofPrefix+"/heap", pprof.Handler("heap"))
  124. http.Handle(pprofPrefix+"/goroutine", pprof.Handler("goroutine"))
  125. http.Handle(pprofPrefix+"/threadcreate", pprof.Handler("threadcreate"))
  126. http.Handle(pprofPrefix+"/block", pprof.Handler("block"))
  127. }
  128. go func() { plog.Fatal(http.ListenAndServe(":9028", nil)) }()
  129. t.runLoop()
  130. }
  131. // portsFromArg converts a comma separated list into a slice of ints
  132. func portsFromArg(arg string, n, defaultPort int) []int {
  133. ret := make([]int, n)
  134. if len(arg) == 0 {
  135. for i := range ret {
  136. ret[i] = defaultPort
  137. }
  138. return ret
  139. }
  140. s := strings.Split(arg, ",")
  141. if len(s) != n {
  142. fmt.Printf("expected %d ports, got %d (%s)\n", n, len(s), arg)
  143. os.Exit(1)
  144. }
  145. for i := range s {
  146. if _, err := fmt.Sscanf(s[i], "%d", &ret[i]); err != nil {
  147. fmt.Println(err)
  148. os.Exit(1)
  149. }
  150. }
  151. return ret
  152. }
  153. func makeFailures(types string, c *cluster) []failure {
  154. var failures []failure
  155. fails := strings.Split(types, ",")
  156. for i := range fails {
  157. switch fails[i] {
  158. case "default":
  159. defaultFailures := []failure{
  160. newFailureKillAll(),
  161. newFailureKillMajority(),
  162. newFailureKillOne(),
  163. newFailureKillLeader(),
  164. newFailureKillOneForLongTime(),
  165. newFailureKillLeaderForLongTime(),
  166. newFailureIsolate(),
  167. newFailureIsolateAll(),
  168. newFailureSlowNetworkOneMember(),
  169. newFailureSlowNetworkLeader(),
  170. newFailureSlowNetworkAll(),
  171. }
  172. failures = append(failures, defaultFailures...)
  173. case "failpoints":
  174. fpFailures, fperr := failpointFailures(c)
  175. if len(fpFailures) == 0 {
  176. plog.Infof("no failpoints found (%v)", fperr)
  177. }
  178. failures = append(failures, fpFailures...)
  179. default:
  180. plog.Errorf("unknown failure: %s\n", fails[i])
  181. os.Exit(1)
  182. }
  183. }
  184. return failures
  185. }