stress_key.go 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package tester
  15. import (
  16. "context"
  17. "fmt"
  18. "math/rand"
  19. "sync"
  20. "sync/atomic"
  21. "time"
  22. "github.com/coreos/etcd/clientv3"
  23. "github.com/coreos/etcd/etcdserver"
  24. "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
  25. "github.com/coreos/etcd/tools/functional-tester/rpcpb"
  26. "go.uber.org/zap"
  27. "golang.org/x/time/rate"
  28. "google.golang.org/grpc"
  29. "google.golang.org/grpc/transport"
  30. )
  31. type keyStresser struct {
  32. lg *zap.Logger
  33. m *rpcpb.Member
  34. keySize int
  35. keyLargeSize int
  36. keySuffixRange int
  37. keyTxnSuffixRange int
  38. keyTxnOps int
  39. rateLimiter *rate.Limiter
  40. wg sync.WaitGroup
  41. clientsN int
  42. ctx context.Context
  43. cancel func()
  44. cli *clientv3.Client
  45. emu sync.RWMutex
  46. ems map[string]int
  47. paused bool
  48. // atomicModifiedKeys records the number of keys created and deleted by the stresser.
  49. atomicModifiedKeys int64
  50. stressTable *stressTable
  51. }
  52. func (s *keyStresser) Stress() error {
  53. var err error
  54. s.cli, err = s.m.CreateEtcdClient(grpc.WithBackoffMaxDelay(1 * time.Second))
  55. if err != nil {
  56. return fmt.Errorf("%v (%q)", err, s.m.EtcdClientEndpoint)
  57. }
  58. s.ctx, s.cancel = context.WithCancel(context.Background())
  59. s.wg.Add(s.clientsN)
  60. var stressEntries = []stressEntry{
  61. {weight: 0.7, f: newStressPut(s.cli, s.keySuffixRange, s.keySize)},
  62. {
  63. weight: 0.7 * float32(s.keySize) / float32(s.keyLargeSize),
  64. f: newStressPut(s.cli, s.keySuffixRange, s.keyLargeSize),
  65. },
  66. {weight: 0.07, f: newStressRange(s.cli, s.keySuffixRange)},
  67. {weight: 0.07, f: newStressRangeInterval(s.cli, s.keySuffixRange)},
  68. {weight: 0.07, f: newStressDelete(s.cli, s.keySuffixRange)},
  69. {weight: 0.07, f: newStressDeleteInterval(s.cli, s.keySuffixRange)},
  70. }
  71. if s.keyTxnSuffixRange > 0 {
  72. // adjust to make up ±70% of workloads with writes
  73. stressEntries[0].weight = 0.35
  74. stressEntries = append(stressEntries, stressEntry{
  75. weight: 0.35,
  76. f: newStressTxn(s.cli, s.keyTxnSuffixRange, s.keyTxnOps),
  77. })
  78. }
  79. s.stressTable = createStressTable(stressEntries)
  80. s.emu.Lock()
  81. s.paused = false
  82. s.ems = make(map[string]int, 100)
  83. s.emu.Unlock()
  84. for i := 0; i < s.clientsN; i++ {
  85. go s.run()
  86. }
  87. s.lg.Info(
  88. "key stresser START",
  89. zap.String("endpoint", s.m.EtcdClientEndpoint),
  90. )
  91. return nil
  92. }
  93. func (s *keyStresser) run() {
  94. defer s.wg.Done()
  95. for {
  96. if err := s.rateLimiter.Wait(s.ctx); err == context.Canceled {
  97. return
  98. }
  99. // TODO: 10-second is enough timeout to cover leader failure
  100. // and immediate leader election. Find out what other cases this
  101. // could be timed out.
  102. sctx, scancel := context.WithTimeout(s.ctx, 10*time.Second)
  103. err, modifiedKeys := s.stressTable.choose()(sctx)
  104. scancel()
  105. if err == nil {
  106. atomic.AddInt64(&s.atomicModifiedKeys, modifiedKeys)
  107. continue
  108. }
  109. switch rpctypes.ErrorDesc(err) {
  110. case context.DeadlineExceeded.Error():
  111. // This retries when request is triggered at the same time as
  112. // leader failure. When we terminate the leader, the request to
  113. // that leader cannot be processed, and times out. Also requests
  114. // to followers cannot be forwarded to the old leader, so timing out
  115. // as well. We want to keep stressing until the cluster elects a
  116. // new leader and start processing requests again.
  117. case etcdserver.ErrTimeoutDueToLeaderFail.Error(), etcdserver.ErrTimeout.Error():
  118. // This retries when request is triggered at the same time as
  119. // leader failure and follower nodes receive time out errors
  120. // from losing their leader. Followers should retry to connect
  121. // to the new leader.
  122. case etcdserver.ErrStopped.Error():
  123. // one of the etcd nodes stopped from failure injection
  124. case transport.ErrConnClosing.Desc:
  125. // server closed the transport (failure injected node)
  126. case rpctypes.ErrNotCapable.Error():
  127. // capability check has not been done (in the beginning)
  128. case rpctypes.ErrTooManyRequests.Error():
  129. // hitting the recovering member.
  130. case context.Canceled.Error():
  131. // from stresser.Cancel method:
  132. return
  133. case grpc.ErrClientConnClosing.Error():
  134. // from stresser.Cancel method:
  135. return
  136. default:
  137. s.lg.Warn(
  138. "key stresser exited with error",
  139. zap.String("endpoint", s.m.EtcdClientEndpoint),
  140. zap.Error(err),
  141. )
  142. return
  143. }
  144. // only record errors before pausing stressers
  145. s.emu.Lock()
  146. if !s.paused {
  147. s.ems[err.Error()]++
  148. }
  149. s.emu.Unlock()
  150. }
  151. }
  152. func (s *keyStresser) Pause() map[string]int {
  153. return s.Close()
  154. }
  155. func (s *keyStresser) Close() map[string]int {
  156. s.cancel()
  157. s.cli.Close()
  158. s.wg.Wait()
  159. s.emu.Lock()
  160. s.paused = true
  161. ess := s.ems
  162. s.ems = make(map[string]int, 100)
  163. s.emu.Unlock()
  164. s.lg.Info(
  165. "key stresser STOP",
  166. zap.String("endpoint", s.m.EtcdClientEndpoint),
  167. )
  168. return ess
  169. }
  170. func (s *keyStresser) ModifiedKeys() int64 {
  171. return atomic.LoadInt64(&s.atomicModifiedKeys)
  172. }
  173. func (s *keyStresser) Checker() Checker { return nil }
  174. type stressFunc func(ctx context.Context) (err error, modifiedKeys int64)
  175. type stressEntry struct {
  176. weight float32
  177. f stressFunc
  178. }
  179. type stressTable struct {
  180. entries []stressEntry
  181. sumWeights float32
  182. }
  183. func createStressTable(entries []stressEntry) *stressTable {
  184. st := stressTable{entries: entries}
  185. for _, entry := range st.entries {
  186. st.sumWeights += entry.weight
  187. }
  188. return &st
  189. }
  190. func (st *stressTable) choose() stressFunc {
  191. v := rand.Float32() * st.sumWeights
  192. var sum float32
  193. var idx int
  194. for i := range st.entries {
  195. sum += st.entries[i].weight
  196. if sum >= v {
  197. idx = i
  198. break
  199. }
  200. }
  201. return st.entries[idx].f
  202. }
  203. func newStressPut(cli *clientv3.Client, keySuffixRange, keySize int) stressFunc {
  204. return func(ctx context.Context) (error, int64) {
  205. _, err := cli.Put(
  206. ctx,
  207. fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)),
  208. string(randBytes(keySize)),
  209. )
  210. return err, 1
  211. }
  212. }
  213. func newStressTxn(cli *clientv3.Client, keyTxnSuffixRange, txnOps int) stressFunc {
  214. keys := make([]string, keyTxnSuffixRange)
  215. for i := range keys {
  216. keys[i] = fmt.Sprintf("/k%03d", i)
  217. }
  218. return writeTxn(cli, keys, txnOps)
  219. }
  220. func writeTxn(cli *clientv3.Client, keys []string, txnOps int) stressFunc {
  221. return func(ctx context.Context) (error, int64) {
  222. ks := make(map[string]struct{}, txnOps)
  223. for len(ks) != txnOps {
  224. ks[keys[rand.Intn(len(keys))]] = struct{}{}
  225. }
  226. selected := make([]string, 0, txnOps)
  227. for k := range ks {
  228. selected = append(selected, k)
  229. }
  230. com, delOp, putOp := getTxnOps(selected[0], "bar00")
  231. thenOps := []clientv3.Op{delOp}
  232. elseOps := []clientv3.Op{putOp}
  233. for i := 1; i < txnOps; i++ { // nested txns
  234. k, v := selected[i], fmt.Sprintf("bar%02d", i)
  235. com, delOp, putOp = getTxnOps(k, v)
  236. txnOp := clientv3.OpTxn(
  237. []clientv3.Cmp{com},
  238. []clientv3.Op{delOp},
  239. []clientv3.Op{putOp},
  240. )
  241. thenOps = append(thenOps, txnOp)
  242. elseOps = append(elseOps, txnOp)
  243. }
  244. _, err := cli.Txn(ctx).
  245. If(com).
  246. Then(thenOps...).
  247. Else(elseOps...).
  248. Commit()
  249. return err, int64(txnOps)
  250. }
  251. }
  252. func getTxnOps(k, v string) (
  253. cmp clientv3.Cmp,
  254. dop clientv3.Op,
  255. pop clientv3.Op) {
  256. // if key exists (version > 0)
  257. cmp = clientv3.Compare(clientv3.Version(k), ">", 0)
  258. dop = clientv3.OpDelete(k)
  259. pop = clientv3.OpPut(k, v)
  260. return cmp, dop, pop
  261. }
  262. func newStressRange(cli *clientv3.Client, keySuffixRange int) stressFunc {
  263. return func(ctx context.Context) (error, int64) {
  264. _, err := cli.Get(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
  265. return err, 0
  266. }
  267. }
  268. func newStressRangeInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
  269. return func(ctx context.Context) (error, int64) {
  270. start := rand.Intn(keySuffixRange)
  271. end := start + 500
  272. _, err := cli.Get(
  273. ctx,
  274. fmt.Sprintf("foo%016x", start),
  275. clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
  276. )
  277. return err, 0
  278. }
  279. }
  280. func newStressDelete(cli *clientv3.Client, keySuffixRange int) stressFunc {
  281. return func(ctx context.Context) (error, int64) {
  282. _, err := cli.Delete(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
  283. return err, 1
  284. }
  285. }
  286. func newStressDeleteInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
  287. return func(ctx context.Context) (error, int64) {
  288. start := rand.Intn(keySuffixRange)
  289. end := start + 500
  290. resp, err := cli.Delete(ctx,
  291. fmt.Sprintf("foo%016x", start),
  292. clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
  293. )
  294. if err == nil {
  295. return nil, resp.Deleted
  296. }
  297. return err, 0
  298. }
  299. }