// stress_key.go
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
  14. package tester
  15. import (
  16. "context"
  17. "fmt"
  18. "math/rand"
  19. "sync"
  20. "sync/atomic"
  21. "time"
  22. "github.com/coreos/etcd/clientv3"
  23. "github.com/coreos/etcd/etcdserver"
  24. "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
  25. "github.com/coreos/etcd/tools/functional-tester/rpcpb"
  26. "go.uber.org/zap"
  27. "golang.org/x/time/rate"
  28. "google.golang.org/grpc"
  29. "google.golang.org/grpc/transport"
  30. )
  31. type keyStresser struct {
  32. lg *zap.Logger
  33. m *rpcpb.Member
  34. keySize int
  35. keyLargeSize int
  36. keySuffixRange int
  37. keyTxnSuffixRange int
  38. keyTxnOps int
  39. rateLimiter *rate.Limiter
  40. wg sync.WaitGroup
  41. clientsN int
  42. ctx context.Context
  43. cancel func()
  44. cli *clientv3.Client
  45. // atomicModifiedKeys records the number of keys created and deleted by the stresser.
  46. atomicModifiedKeys int64
  47. stressTable *stressTable
  48. }
  49. func (s *keyStresser) Stress() error {
  50. var err error
  51. s.cli, err = s.m.CreateEtcdClient(grpc.WithBackoffMaxDelay(1 * time.Second))
  52. if err != nil {
  53. return fmt.Errorf("%v (%q)", err, s.m.EtcdClientEndpoint)
  54. }
  55. s.ctx, s.cancel = context.WithCancel(context.Background())
  56. s.wg.Add(s.clientsN)
  57. var stressEntries = []stressEntry{
  58. {weight: 0.7, f: newStressPut(s.cli, s.keySuffixRange, s.keySize)},
  59. {
  60. weight: 0.7 * float32(s.keySize) / float32(s.keyLargeSize),
  61. f: newStressPut(s.cli, s.keySuffixRange, s.keyLargeSize),
  62. },
  63. {weight: 0.07, f: newStressRange(s.cli, s.keySuffixRange)},
  64. {weight: 0.07, f: newStressRangeInterval(s.cli, s.keySuffixRange)},
  65. {weight: 0.07, f: newStressDelete(s.cli, s.keySuffixRange)},
  66. {weight: 0.07, f: newStressDeleteInterval(s.cli, s.keySuffixRange)},
  67. }
  68. if s.keyTxnSuffixRange > 0 {
  69. // adjust to make up ±70% of workloads with writes
  70. stressEntries[0].weight = 0.35
  71. stressEntries = append(stressEntries, stressEntry{
  72. weight: 0.35,
  73. f: newStressTxn(s.cli, s.keyTxnSuffixRange, s.keyTxnOps),
  74. })
  75. }
  76. s.stressTable = createStressTable(stressEntries)
  77. for i := 0; i < s.clientsN; i++ {
  78. go s.run()
  79. }
  80. s.lg.Info(
  81. "key stresser started in background",
  82. zap.String("endpoint", s.m.EtcdClientEndpoint),
  83. )
  84. return nil
  85. }
  86. func (s *keyStresser) run() {
  87. defer s.wg.Done()
  88. for {
  89. if err := s.rateLimiter.Wait(s.ctx); err == context.Canceled {
  90. return
  91. }
  92. // TODO: 10-second is enough timeout to cover leader failure
  93. // and immediate leader election. Find out what other cases this
  94. // could be timed out.
  95. sctx, scancel := context.WithTimeout(s.ctx, 10*time.Second)
  96. err, modifiedKeys := s.stressTable.choose()(sctx)
  97. scancel()
  98. if err == nil {
  99. atomic.AddInt64(&s.atomicModifiedKeys, modifiedKeys)
  100. continue
  101. }
  102. switch rpctypes.ErrorDesc(err) {
  103. case context.DeadlineExceeded.Error():
  104. // This retries when request is triggered at the same time as
  105. // leader failure. When we terminate the leader, the request to
  106. // that leader cannot be processed, and times out. Also requests
  107. // to followers cannot be forwarded to the old leader, so timing out
  108. // as well. We want to keep stressing until the cluster elects a
  109. // new leader and start processing requests again.
  110. case etcdserver.ErrTimeoutDueToLeaderFail.Error(), etcdserver.ErrTimeout.Error():
  111. // This retries when request is triggered at the same time as
  112. // leader failure and follower nodes receive time out errors
  113. // from losing their leader. Followers should retry to connect
  114. // to the new leader.
  115. case etcdserver.ErrStopped.Error():
  116. // one of the etcd nodes stopped from failure injection
  117. case transport.ErrConnClosing.Desc:
  118. // server closed the transport (failure injected node)
  119. case rpctypes.ErrNotCapable.Error():
  120. // capability check has not been done (in the beginning)
  121. case rpctypes.ErrTooManyRequests.Error():
  122. // hitting the recovering member.
  123. case context.Canceled.Error():
  124. // from stresser.Cancel method:
  125. return
  126. case grpc.ErrClientConnClosing.Error():
  127. // from stresser.Cancel method:
  128. return
  129. default:
  130. s.lg.Warn(
  131. "key stresser exited with error",
  132. zap.String("endpoint", s.m.EtcdClientEndpoint),
  133. zap.Error(err),
  134. )
  135. return
  136. }
  137. }
  138. }
  139. func (s *keyStresser) Pause() {
  140. s.Close()
  141. }
  142. func (s *keyStresser) Close() {
  143. s.cancel()
  144. s.cli.Close()
  145. s.wg.Wait()
  146. s.lg.Info(
  147. "key stresser is closed",
  148. zap.String("endpoint", s.m.EtcdClientEndpoint),
  149. )
  150. }
  151. func (s *keyStresser) ModifiedKeys() int64 {
  152. return atomic.LoadInt64(&s.atomicModifiedKeys)
  153. }
  154. func (s *keyStresser) Checker() Checker { return nil }
  155. type stressFunc func(ctx context.Context) (err error, modifiedKeys int64)
  156. type stressEntry struct {
  157. weight float32
  158. f stressFunc
  159. }
  160. type stressTable struct {
  161. entries []stressEntry
  162. sumWeights float32
  163. }
  164. func createStressTable(entries []stressEntry) *stressTable {
  165. st := stressTable{entries: entries}
  166. for _, entry := range st.entries {
  167. st.sumWeights += entry.weight
  168. }
  169. return &st
  170. }
  171. func (st *stressTable) choose() stressFunc {
  172. v := rand.Float32() * st.sumWeights
  173. var sum float32
  174. var idx int
  175. for i := range st.entries {
  176. sum += st.entries[i].weight
  177. if sum >= v {
  178. idx = i
  179. break
  180. }
  181. }
  182. return st.entries[idx].f
  183. }
  184. func newStressPut(cli *clientv3.Client, keySuffixRange, keySize int) stressFunc {
  185. return func(ctx context.Context) (error, int64) {
  186. _, err := cli.Put(
  187. ctx,
  188. fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)),
  189. string(randBytes(keySize)),
  190. )
  191. return err, 1
  192. }
  193. }
  194. func newStressTxn(cli *clientv3.Client, keyTxnSuffixRange, txnOps int) stressFunc {
  195. keys := make([]string, keyTxnSuffixRange)
  196. for i := range keys {
  197. keys[i] = fmt.Sprintf("/k%03d", i)
  198. }
  199. return writeTxn(cli, keys, txnOps)
  200. }
  201. func writeTxn(cli *clientv3.Client, keys []string, txnOps int) stressFunc {
  202. return func(ctx context.Context) (error, int64) {
  203. ks := make(map[string]struct{}, txnOps)
  204. for len(ks) != txnOps {
  205. ks[keys[rand.Intn(len(keys))]] = struct{}{}
  206. }
  207. selected := make([]string, 0, txnOps)
  208. for k := range ks {
  209. selected = append(selected, k)
  210. }
  211. com, delOp, putOp := getTxnOps(selected[0], "bar00")
  212. thenOps := []clientv3.Op{delOp}
  213. elseOps := []clientv3.Op{putOp}
  214. for i := 1; i < txnOps; i++ { // nested txns
  215. k, v := selected[i], fmt.Sprintf("bar%02d", i)
  216. com, delOp, putOp = getTxnOps(k, v)
  217. txnOp := clientv3.OpTxn(
  218. []clientv3.Cmp{com},
  219. []clientv3.Op{delOp},
  220. []clientv3.Op{putOp},
  221. )
  222. thenOps = append(thenOps, txnOp)
  223. elseOps = append(elseOps, txnOp)
  224. }
  225. _, err := cli.Txn(ctx).
  226. If(com).
  227. Then(thenOps...).
  228. Else(elseOps...).
  229. Commit()
  230. return err, int64(txnOps)
  231. }
  232. }
  233. func getTxnOps(k, v string) (
  234. cmp clientv3.Cmp,
  235. dop clientv3.Op,
  236. pop clientv3.Op) {
  237. // if key exists (version > 0)
  238. cmp = clientv3.Compare(clientv3.Version(k), ">", 0)
  239. dop = clientv3.OpDelete(k)
  240. pop = clientv3.OpPut(k, v)
  241. return cmp, dop, pop
  242. }
  243. func newStressRange(cli *clientv3.Client, keySuffixRange int) stressFunc {
  244. return func(ctx context.Context) (error, int64) {
  245. _, err := cli.Get(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
  246. return err, 0
  247. }
  248. }
  249. func newStressRangeInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
  250. return func(ctx context.Context) (error, int64) {
  251. start := rand.Intn(keySuffixRange)
  252. end := start + 500
  253. _, err := cli.Get(
  254. ctx,
  255. fmt.Sprintf("foo%016x", start),
  256. clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
  257. )
  258. return err, 0
  259. }
  260. }
  261. func newStressDelete(cli *clientv3.Client, keySuffixRange int) stressFunc {
  262. return func(ctx context.Context) (error, int64) {
  263. _, err := cli.Delete(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
  264. return err, 1
  265. }
  266. }
  267. func newStressDeleteInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
  268. return func(ctx context.Context) (error, int64) {
  269. start := rand.Intn(keySuffixRange)
  270. end := start + 500
  271. resp, err := cli.Delete(ctx,
  272. fmt.Sprintf("foo%016x", start),
  273. clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
  274. )
  275. if err == nil {
  276. return nil, resp.Deleted
  277. }
  278. return err, 0
  279. }
  280. }