stresser_key.go 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "context"
  17. "fmt"
  18. "math/rand"
  19. "reflect"
  20. "sync"
  21. "sync/atomic"
  22. "time"
  23. "github.com/coreos/etcd/clientv3"
  24. "github.com/coreos/etcd/etcdserver"
  25. "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
  26. "github.com/coreos/etcd/functional/rpcpb"
  27. "github.com/coreos/etcd/raft"
  28. "go.uber.org/zap"
  29. "golang.org/x/time/rate"
  30. "google.golang.org/grpc"
  31. "google.golang.org/grpc/transport"
  32. )
  33. type keyStresser struct {
  34. stype rpcpb.Stresser
  35. lg *zap.Logger
  36. m *rpcpb.Member
  37. keySize int
  38. keyLargeSize int
  39. keySuffixRange int
  40. keyTxnSuffixRange int
  41. keyTxnOps int
  42. rateLimiter *rate.Limiter
  43. wg sync.WaitGroup
  44. clientsN int
  45. ctx context.Context
  46. cancel func()
  47. cli *clientv3.Client
  48. emu sync.RWMutex
  49. ems map[string]int
  50. paused bool
  51. // atomicModifiedKeys records the number of keys created and deleted by the stresser.
  52. atomicModifiedKeys int64
  53. stressTable *stressTable
  54. }
  55. func (s *keyStresser) Stress() error {
  56. var err error
  57. s.cli, err = s.m.CreateEtcdClient(grpc.WithBackoffMaxDelay(1 * time.Second))
  58. if err != nil {
  59. return fmt.Errorf("%v (%q)", err, s.m.EtcdClientEndpoint)
  60. }
  61. s.ctx, s.cancel = context.WithCancel(context.Background())
  62. s.wg.Add(s.clientsN)
  63. var stressEntries = []stressEntry{
  64. {weight: 0.7, f: newStressPut(s.cli, s.keySuffixRange, s.keySize)},
  65. {
  66. weight: 0.7 * float32(s.keySize) / float32(s.keyLargeSize),
  67. f: newStressPut(s.cli, s.keySuffixRange, s.keyLargeSize),
  68. },
  69. {weight: 0.07, f: newStressRange(s.cli, s.keySuffixRange)},
  70. {weight: 0.07, f: newStressRangeInterval(s.cli, s.keySuffixRange)},
  71. {weight: 0.07, f: newStressDelete(s.cli, s.keySuffixRange)},
  72. {weight: 0.07, f: newStressDeleteInterval(s.cli, s.keySuffixRange)},
  73. }
  74. if s.keyTxnSuffixRange > 0 {
  75. // adjust to make up ±70% of workloads with writes
  76. stressEntries[0].weight = 0.35
  77. stressEntries = append(stressEntries, stressEntry{
  78. weight: 0.35,
  79. f: newStressTxn(s.cli, s.keyTxnSuffixRange, s.keyTxnOps),
  80. })
  81. }
  82. s.stressTable = createStressTable(stressEntries)
  83. s.emu.Lock()
  84. s.paused = false
  85. s.ems = make(map[string]int, 100)
  86. s.emu.Unlock()
  87. for i := 0; i < s.clientsN; i++ {
  88. go s.run()
  89. }
  90. s.lg.Info(
  91. "stress START",
  92. zap.String("stress-type", s.stype.String()),
  93. zap.String("endpoint", s.m.EtcdClientEndpoint),
  94. )
  95. return nil
  96. }
  97. func (s *keyStresser) run() {
  98. defer s.wg.Done()
  99. for {
  100. if err := s.rateLimiter.Wait(s.ctx); err == context.Canceled {
  101. return
  102. }
  103. // TODO: 10-second is enough timeout to cover leader failure
  104. // and immediate leader election. Find out what other cases this
  105. // could be timed out.
  106. sctx, scancel := context.WithTimeout(s.ctx, 10*time.Second)
  107. err, modifiedKeys := s.stressTable.choose()(sctx)
  108. scancel()
  109. if err == nil {
  110. atomic.AddInt64(&s.atomicModifiedKeys, modifiedKeys)
  111. continue
  112. }
  113. switch rpctypes.ErrorDesc(err) {
  114. case context.DeadlineExceeded.Error():
  115. // This retries when request is triggered at the same time as
  116. // leader failure. When we terminate the leader, the request to
  117. // that leader cannot be processed, and times out. Also requests
  118. // to followers cannot be forwarded to the old leader, so timing out
  119. // as well. We want to keep stressing until the cluster elects a
  120. // new leader and start processing requests again.
  121. case etcdserver.ErrTimeoutDueToLeaderFail.Error(), etcdserver.ErrTimeout.Error():
  122. // This retries when request is triggered at the same time as
  123. // leader failure and follower nodes receive time out errors
  124. // from losing their leader. Followers should retry to connect
  125. // to the new leader.
  126. case etcdserver.ErrStopped.Error():
  127. // one of the etcd nodes stopped from failure injection
  128. case transport.ErrConnClosing.Desc:
  129. // server closed the transport (failure injected node)
  130. case rpctypes.ErrNotCapable.Error():
  131. // capability check has not been done (in the beginning)
  132. case rpctypes.ErrTooManyRequests.Error():
  133. // hitting the recovering member.
  134. case raft.ErrProposalDropped.Error():
  135. // removed member, or leadership has changed (old leader got raftpb.MsgProp)
  136. case context.Canceled.Error():
  137. // from stresser.Cancel method:
  138. return
  139. case grpc.ErrClientConnClosing.Error():
  140. // from stresser.Cancel method:
  141. return
  142. default:
  143. s.lg.Warn(
  144. "stress run exiting",
  145. zap.String("stress-type", s.stype.String()),
  146. zap.String("endpoint", s.m.EtcdClientEndpoint),
  147. zap.String("error-type", reflect.TypeOf(err).String()),
  148. zap.String("error-desc", rpctypes.ErrorDesc(err)),
  149. zap.Error(err),
  150. )
  151. return
  152. }
  153. // only record errors before pausing stressers
  154. s.emu.Lock()
  155. if !s.paused {
  156. s.ems[err.Error()]++
  157. }
  158. s.emu.Unlock()
  159. }
  160. }
  161. func (s *keyStresser) Pause() map[string]int {
  162. return s.Close()
  163. }
  164. func (s *keyStresser) Close() map[string]int {
  165. s.cancel()
  166. s.cli.Close()
  167. s.wg.Wait()
  168. s.emu.Lock()
  169. s.paused = true
  170. ess := s.ems
  171. s.ems = make(map[string]int, 100)
  172. s.emu.Unlock()
  173. s.lg.Info(
  174. "stress STOP",
  175. zap.String("stress-type", s.stype.String()),
  176. zap.String("endpoint", s.m.EtcdClientEndpoint),
  177. )
  178. return ess
  179. }
  180. func (s *keyStresser) ModifiedKeys() int64 {
  181. return atomic.LoadInt64(&s.atomicModifiedKeys)
  182. }
  183. type stressFunc func(ctx context.Context) (err error, modifiedKeys int64)
  184. type stressEntry struct {
  185. weight float32
  186. f stressFunc
  187. }
  188. type stressTable struct {
  189. entries []stressEntry
  190. sumWeights float32
  191. }
  192. func createStressTable(entries []stressEntry) *stressTable {
  193. st := stressTable{entries: entries}
  194. for _, entry := range st.entries {
  195. st.sumWeights += entry.weight
  196. }
  197. return &st
  198. }
  199. func (st *stressTable) choose() stressFunc {
  200. v := rand.Float32() * st.sumWeights
  201. var sum float32
  202. var idx int
  203. for i := range st.entries {
  204. sum += st.entries[i].weight
  205. if sum >= v {
  206. idx = i
  207. break
  208. }
  209. }
  210. return st.entries[idx].f
  211. }
  212. func newStressPut(cli *clientv3.Client, keySuffixRange, keySize int) stressFunc {
  213. return func(ctx context.Context) (error, int64) {
  214. _, err := cli.Put(
  215. ctx,
  216. fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)),
  217. string(randBytes(keySize)),
  218. )
  219. return err, 1
  220. }
  221. }
  222. func newStressTxn(cli *clientv3.Client, keyTxnSuffixRange, txnOps int) stressFunc {
  223. keys := make([]string, keyTxnSuffixRange)
  224. for i := range keys {
  225. keys[i] = fmt.Sprintf("/k%03d", i)
  226. }
  227. return writeTxn(cli, keys, txnOps)
  228. }
  229. func writeTxn(cli *clientv3.Client, keys []string, txnOps int) stressFunc {
  230. return func(ctx context.Context) (error, int64) {
  231. ks := make(map[string]struct{}, txnOps)
  232. for len(ks) != txnOps {
  233. ks[keys[rand.Intn(len(keys))]] = struct{}{}
  234. }
  235. selected := make([]string, 0, txnOps)
  236. for k := range ks {
  237. selected = append(selected, k)
  238. }
  239. com, delOp, putOp := getTxnOps(selected[0], "bar00")
  240. thenOps := []clientv3.Op{delOp}
  241. elseOps := []clientv3.Op{putOp}
  242. for i := 1; i < txnOps; i++ { // nested txns
  243. k, v := selected[i], fmt.Sprintf("bar%02d", i)
  244. com, delOp, putOp = getTxnOps(k, v)
  245. txnOp := clientv3.OpTxn(
  246. []clientv3.Cmp{com},
  247. []clientv3.Op{delOp},
  248. []clientv3.Op{putOp},
  249. )
  250. thenOps = append(thenOps, txnOp)
  251. elseOps = append(elseOps, txnOp)
  252. }
  253. _, err := cli.Txn(ctx).
  254. If(com).
  255. Then(thenOps...).
  256. Else(elseOps...).
  257. Commit()
  258. return err, int64(txnOps)
  259. }
  260. }
  261. func getTxnOps(k, v string) (
  262. cmp clientv3.Cmp,
  263. dop clientv3.Op,
  264. pop clientv3.Op) {
  265. // if key exists (version > 0)
  266. cmp = clientv3.Compare(clientv3.Version(k), ">", 0)
  267. dop = clientv3.OpDelete(k)
  268. pop = clientv3.OpPut(k, v)
  269. return cmp, dop, pop
  270. }
  271. func newStressRange(cli *clientv3.Client, keySuffixRange int) stressFunc {
  272. return func(ctx context.Context) (error, int64) {
  273. _, err := cli.Get(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
  274. return err, 0
  275. }
  276. }
  277. func newStressRangeInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
  278. return func(ctx context.Context) (error, int64) {
  279. start := rand.Intn(keySuffixRange)
  280. end := start + 500
  281. _, err := cli.Get(
  282. ctx,
  283. fmt.Sprintf("foo%016x", start),
  284. clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
  285. )
  286. return err, 0
  287. }
  288. }
  289. func newStressDelete(cli *clientv3.Client, keySuffixRange int) stressFunc {
  290. return func(ctx context.Context) (error, int64) {
  291. _, err := cli.Delete(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
  292. return err, 1
  293. }
  294. }
  295. func newStressDeleteInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
  296. return func(ctx context.Context) (error, int64) {
  297. start := rand.Intn(keySuffixRange)
  298. end := start + 500
  299. resp, err := cli.Delete(ctx,
  300. fmt.Sprintf("foo%016x", start),
  301. clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
  302. )
  303. if err == nil {
  304. return nil, resp.Deleted
  305. }
  306. return err, 0
  307. }
  308. }