stress_key.go 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "context"
  17. "fmt"
  18. "math/rand"
  19. "sync"
  20. "sync/atomic"
  21. "time"
  22. "github.com/coreos/etcd/clientv3"
  23. "github.com/coreos/etcd/etcdserver"
  24. "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
  25. "github.com/coreos/etcd/tools/functional-tester/rpcpb"
  26. "go.uber.org/zap"
  27. "golang.org/x/time/rate"
  28. "google.golang.org/grpc"
  29. "google.golang.org/grpc/transport"
  30. )
  31. type keyStresser struct {
  32. lg *zap.Logger
  33. m *rpcpb.Member
  34. keySize int
  35. keyLargeSize int
  36. keySuffixRange int
  37. keyTxnSuffixRange int
  38. keyTxnOps int
  39. N int
  40. rateLimiter *rate.Limiter
  41. wg sync.WaitGroup
  42. cancel func()
  43. cli *clientv3.Client
  44. // atomicModifiedKeys records the number of keys created and deleted by the stresser.
  45. atomicModifiedKeys int64
  46. stressTable *stressTable
  47. }
  48. func (s *keyStresser) Stress() error {
  49. // TODO: add backoff option
  50. cli, err := s.m.CreateEtcdClient()
  51. if err != nil {
  52. return fmt.Errorf("%v (%q)", err, s.m.EtcdClientEndpoint)
  53. }
  54. ctx, cancel := context.WithCancel(context.Background())
  55. s.wg.Add(s.N)
  56. s.cli = cli
  57. s.cancel = cancel
  58. var stressEntries = []stressEntry{
  59. {weight: 0.7, f: newStressPut(cli, s.keySuffixRange, s.keySize)},
  60. {
  61. weight: 0.7 * float32(s.keySize) / float32(s.keyLargeSize),
  62. f: newStressPut(cli, s.keySuffixRange, s.keyLargeSize),
  63. },
  64. {weight: 0.07, f: newStressRange(cli, s.keySuffixRange)},
  65. {weight: 0.07, f: newStressRangeInterval(cli, s.keySuffixRange)},
  66. {weight: 0.07, f: newStressDelete(cli, s.keySuffixRange)},
  67. {weight: 0.07, f: newStressDeleteInterval(cli, s.keySuffixRange)},
  68. }
  69. if s.keyTxnSuffixRange > 0 {
  70. // adjust to make up ±70% of workloads with writes
  71. stressEntries[0].weight = 0.35
  72. stressEntries = append(stressEntries, stressEntry{
  73. weight: 0.35,
  74. f: newStressTxn(cli, s.keyTxnSuffixRange, s.keyTxnOps),
  75. })
  76. }
  77. s.stressTable = createStressTable(stressEntries)
  78. for i := 0; i < s.N; i++ {
  79. go s.run(ctx)
  80. }
  81. s.lg.Info(
  82. "key stresser started in background",
  83. zap.String("endpoint", s.m.EtcdClientEndpoint),
  84. )
  85. return nil
  86. }
  87. func (s *keyStresser) run(ctx context.Context) {
  88. defer s.wg.Done()
  89. for {
  90. if err := s.rateLimiter.Wait(ctx); err == context.Canceled {
  91. return
  92. }
  93. // TODO: 10-second is enough timeout to cover leader failure
  94. // and immediate leader election. Find out what other cases this
  95. // could be timed out.
  96. sctx, scancel := context.WithTimeout(ctx, 10*time.Second)
  97. err, modifiedKeys := s.stressTable.choose()(sctx)
  98. scancel()
  99. if err == nil {
  100. atomic.AddInt64(&s.atomicModifiedKeys, modifiedKeys)
  101. continue
  102. }
  103. switch rpctypes.ErrorDesc(err) {
  104. case context.DeadlineExceeded.Error():
  105. // This retries when request is triggered at the same time as
  106. // leader failure. When we terminate the leader, the request to
  107. // that leader cannot be processed, and times out. Also requests
  108. // to followers cannot be forwarded to the old leader, so timing out
  109. // as well. We want to keep stressing until the cluster elects a
  110. // new leader and start processing requests again.
  111. case etcdserver.ErrTimeoutDueToLeaderFail.Error(), etcdserver.ErrTimeout.Error():
  112. // This retries when request is triggered at the same time as
  113. // leader failure and follower nodes receive time out errors
  114. // from losing their leader. Followers should retry to connect
  115. // to the new leader.
  116. case etcdserver.ErrStopped.Error():
  117. // one of the etcd nodes stopped from failure injection
  118. case transport.ErrConnClosing.Desc:
  119. // server closed the transport (failure injected node)
  120. case rpctypes.ErrNotCapable.Error():
  121. // capability check has not been done (in the beginning)
  122. case rpctypes.ErrTooManyRequests.Error():
  123. // hitting the recovering member.
  124. case context.Canceled.Error():
  125. // from stresser.Cancel method:
  126. return
  127. case grpc.ErrClientConnClosing.Error():
  128. // from stresser.Cancel method:
  129. return
  130. default:
  131. s.lg.Warn(
  132. "key stresser exited with error",
  133. zap.String("endpoint", s.m.EtcdClientEndpoint),
  134. zap.Error(err),
  135. )
  136. return
  137. }
  138. }
  139. }
  140. func (s *keyStresser) Pause() {
  141. s.Close()
  142. }
  143. func (s *keyStresser) Close() {
  144. s.cancel()
  145. s.cli.Close()
  146. s.wg.Wait()
  147. s.lg.Info(
  148. "key stresser is closed",
  149. zap.String("endpoint", s.m.EtcdClientEndpoint),
  150. )
  151. }
  152. func (s *keyStresser) ModifiedKeys() int64 {
  153. return atomic.LoadInt64(&s.atomicModifiedKeys)
  154. }
  155. func (s *keyStresser) Checker() Checker { return nil }
  156. type stressFunc func(ctx context.Context) (err error, modifiedKeys int64)
  157. type stressEntry struct {
  158. weight float32
  159. f stressFunc
  160. }
  161. type stressTable struct {
  162. entries []stressEntry
  163. sumWeights float32
  164. }
  165. func createStressTable(entries []stressEntry) *stressTable {
  166. st := stressTable{entries: entries}
  167. for _, entry := range st.entries {
  168. st.sumWeights += entry.weight
  169. }
  170. return &st
  171. }
  172. func (st *stressTable) choose() stressFunc {
  173. v := rand.Float32() * st.sumWeights
  174. var sum float32
  175. var idx int
  176. for i := range st.entries {
  177. sum += st.entries[i].weight
  178. if sum >= v {
  179. idx = i
  180. break
  181. }
  182. }
  183. return st.entries[idx].f
  184. }
  185. func newStressPut(cli *clientv3.Client, keySuffixRange, keySize int) stressFunc {
  186. return func(ctx context.Context) (error, int64) {
  187. _, err := cli.Put(
  188. ctx,
  189. fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)),
  190. string(randBytes(keySize)),
  191. )
  192. return err, 1
  193. }
  194. }
  195. func newStressTxn(cli *clientv3.Client, keyTxnSuffixRange, txnOps int) stressFunc {
  196. keys := make([]string, keyTxnSuffixRange)
  197. for i := range keys {
  198. keys[i] = fmt.Sprintf("/k%03d", i)
  199. }
  200. return writeTxn(cli, keys, txnOps)
  201. }
  202. func writeTxn(cli *clientv3.Client, keys []string, txnOps int) stressFunc {
  203. return func(ctx context.Context) (error, int64) {
  204. ks := make(map[string]struct{}, txnOps)
  205. for len(ks) != txnOps {
  206. ks[keys[rand.Intn(len(keys))]] = struct{}{}
  207. }
  208. selected := make([]string, 0, txnOps)
  209. for k := range ks {
  210. selected = append(selected, k)
  211. }
  212. com, delOp, putOp := getTxnOps(selected[0], "bar00")
  213. thenOps := []clientv3.Op{delOp}
  214. elseOps := []clientv3.Op{putOp}
  215. for i := 1; i < txnOps; i++ { // nested txns
  216. k, v := selected[i], fmt.Sprintf("bar%02d", i)
  217. com, delOp, putOp = getTxnOps(k, v)
  218. txnOp := clientv3.OpTxn(
  219. []clientv3.Cmp{com},
  220. []clientv3.Op{delOp},
  221. []clientv3.Op{putOp},
  222. )
  223. thenOps = append(thenOps, txnOp)
  224. elseOps = append(elseOps, txnOp)
  225. }
  226. _, err := cli.Txn(ctx).
  227. If(com).
  228. Else(elseOps...).
  229. Then(thenOps...).
  230. Commit()
  231. return err, int64(txnOps)
  232. }
  233. }
  234. func getTxnOps(k, v string) (
  235. cmp clientv3.Cmp,
  236. dop clientv3.Op,
  237. pop clientv3.Op) {
  238. // if key exists (version > 0)
  239. cmp = clientv3.Compare(clientv3.Version(k), ">", 0)
  240. dop = clientv3.OpDelete(k)
  241. pop = clientv3.OpPut(k, v)
  242. return cmp, dop, pop
  243. }
  244. func newStressRange(cli *clientv3.Client, keySuffixRange int) stressFunc {
  245. return func(ctx context.Context) (error, int64) {
  246. _, err := cli.Get(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
  247. return err, 0
  248. }
  249. }
  250. func newStressRangeInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
  251. return func(ctx context.Context) (error, int64) {
  252. start := rand.Intn(keySuffixRange)
  253. end := start + 500
  254. _, err := cli.Get(
  255. ctx,
  256. fmt.Sprintf("foo%016x", start),
  257. clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
  258. )
  259. return err, 0
  260. }
  261. }
  262. func newStressDelete(cli *clientv3.Client, keySuffixRange int) stressFunc {
  263. return func(ctx context.Context) (error, int64) {
  264. _, err := cli.Delete(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
  265. return err, 1
  266. }
  267. }
  268. func newStressDeleteInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
  269. return func(ctx context.Context) (error, int64) {
  270. start := rand.Intn(keySuffixRange)
  271. end := start + 500
  272. resp, err := cli.Delete(ctx,
  273. fmt.Sprintf("foo%016x", start),
  274. clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
  275. )
  276. if err == nil {
  277. return nil, resp.Deleted
  278. }
  279. return err, 0
  280. }
  281. }