stress_key.go 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
  14. package tester
  15. import (
  16. "context"
  17. "fmt"
  18. "math/rand"
  19. "reflect"
  20. "sync"
  21. "sync/atomic"
  22. "time"
  23. "github.com/coreos/etcd/clientv3"
  24. "github.com/coreos/etcd/etcdserver"
  25. "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
  26. "github.com/coreos/etcd/functional/rpcpb"
  27. "go.uber.org/zap"
  28. "golang.org/x/time/rate"
  29. "google.golang.org/grpc"
  30. "google.golang.org/grpc/transport"
  31. )
  32. type keyStresser struct {
  33. stype rpcpb.StressType
  34. lg *zap.Logger
  35. m *rpcpb.Member
  36. keySize int
  37. keyLargeSize int
  38. keySuffixRange int
  39. keyTxnSuffixRange int
  40. keyTxnOps int
  41. rateLimiter *rate.Limiter
  42. wg sync.WaitGroup
  43. clientsN int
  44. ctx context.Context
  45. cancel func()
  46. cli *clientv3.Client
  47. emu sync.RWMutex
  48. ems map[string]int
  49. paused bool
  50. // atomicModifiedKeys records the number of keys created and deleted by the stresser.
  51. atomicModifiedKeys int64
  52. stressTable *stressTable
  53. }
  54. func (s *keyStresser) Stress() error {
  55. var err error
  56. s.cli, err = s.m.CreateEtcdClient(grpc.WithBackoffMaxDelay(1 * time.Second))
  57. if err != nil {
  58. return fmt.Errorf("%v (%q)", err, s.m.EtcdClientEndpoint)
  59. }
  60. s.ctx, s.cancel = context.WithCancel(context.Background())
  61. s.wg.Add(s.clientsN)
  62. var stressEntries = []stressEntry{
  63. {weight: 0.7, f: newStressPut(s.cli, s.keySuffixRange, s.keySize)},
  64. {
  65. weight: 0.7 * float32(s.keySize) / float32(s.keyLargeSize),
  66. f: newStressPut(s.cli, s.keySuffixRange, s.keyLargeSize),
  67. },
  68. {weight: 0.07, f: newStressRange(s.cli, s.keySuffixRange)},
  69. {weight: 0.07, f: newStressRangeInterval(s.cli, s.keySuffixRange)},
  70. {weight: 0.07, f: newStressDelete(s.cli, s.keySuffixRange)},
  71. {weight: 0.07, f: newStressDeleteInterval(s.cli, s.keySuffixRange)},
  72. }
  73. if s.keyTxnSuffixRange > 0 {
  74. // adjust to make up ±70% of workloads with writes
  75. stressEntries[0].weight = 0.35
  76. stressEntries = append(stressEntries, stressEntry{
  77. weight: 0.35,
  78. f: newStressTxn(s.cli, s.keyTxnSuffixRange, s.keyTxnOps),
  79. })
  80. }
  81. s.stressTable = createStressTable(stressEntries)
  82. s.emu.Lock()
  83. s.paused = false
  84. s.ems = make(map[string]int, 100)
  85. s.emu.Unlock()
  86. for i := 0; i < s.clientsN; i++ {
  87. go s.run()
  88. }
  89. s.lg.Info(
  90. "stress START",
  91. zap.String("stress-type", s.stype.String()),
  92. zap.String("endpoint", s.m.EtcdClientEndpoint),
  93. )
  94. return nil
  95. }
  96. func (s *keyStresser) run() {
  97. defer s.wg.Done()
  98. for {
  99. if err := s.rateLimiter.Wait(s.ctx); err == context.Canceled {
  100. return
  101. }
  102. // TODO: 10-second is enough timeout to cover leader failure
  103. // and immediate leader election. Find out what other cases this
  104. // could be timed out.
  105. sctx, scancel := context.WithTimeout(s.ctx, 10*time.Second)
  106. err, modifiedKeys := s.stressTable.choose()(sctx)
  107. scancel()
  108. if err == nil {
  109. atomic.AddInt64(&s.atomicModifiedKeys, modifiedKeys)
  110. continue
  111. }
  112. switch rpctypes.ErrorDesc(err) {
  113. case context.DeadlineExceeded.Error():
  114. // This retries when request is triggered at the same time as
  115. // leader failure. When we terminate the leader, the request to
  116. // that leader cannot be processed, and times out. Also requests
  117. // to followers cannot be forwarded to the old leader, so timing out
  118. // as well. We want to keep stressing until the cluster elects a
  119. // new leader and start processing requests again.
  120. case etcdserver.ErrTimeoutDueToLeaderFail.Error(), etcdserver.ErrTimeout.Error():
  121. // This retries when request is triggered at the same time as
  122. // leader failure and follower nodes receive time out errors
  123. // from losing their leader. Followers should retry to connect
  124. // to the new leader.
  125. case etcdserver.ErrStopped.Error():
  126. // one of the etcd nodes stopped from failure injection
  127. case transport.ErrConnClosing.Desc:
  128. // server closed the transport (failure injected node)
  129. case rpctypes.ErrNotCapable.Error():
  130. // capability check has not been done (in the beginning)
  131. case rpctypes.ErrTooManyRequests.Error():
  132. // hitting the recovering member.
  133. case context.Canceled.Error():
  134. // from stresser.Cancel method:
  135. return
  136. case grpc.ErrClientConnClosing.Error():
  137. // from stresser.Cancel method:
  138. return
  139. default:
  140. s.lg.Warn(
  141. "stress run exiting",
  142. zap.String("stress-type", s.stype.String()),
  143. zap.String("endpoint", s.m.EtcdClientEndpoint),
  144. zap.String("error-type", reflect.TypeOf(err).String()),
  145. zap.Error(err),
  146. )
  147. return
  148. }
  149. // only record errors before pausing stressers
  150. s.emu.Lock()
  151. if !s.paused {
  152. s.ems[err.Error()]++
  153. }
  154. s.emu.Unlock()
  155. }
  156. }
  157. func (s *keyStresser) Pause() map[string]int {
  158. return s.Close()
  159. }
  160. func (s *keyStresser) Close() map[string]int {
  161. s.cancel()
  162. s.cli.Close()
  163. s.wg.Wait()
  164. s.emu.Lock()
  165. s.paused = true
  166. ess := s.ems
  167. s.ems = make(map[string]int, 100)
  168. s.emu.Unlock()
  169. s.lg.Info(
  170. "stress STOP",
  171. zap.String("stress-type", s.stype.String()),
  172. zap.String("endpoint", s.m.EtcdClientEndpoint),
  173. )
  174. return ess
  175. }
  176. func (s *keyStresser) ModifiedKeys() int64 {
  177. return atomic.LoadInt64(&s.atomicModifiedKeys)
  178. }
  179. func (s *keyStresser) Checker() Checker { return nil }
  180. type stressFunc func(ctx context.Context) (err error, modifiedKeys int64)
  181. type stressEntry struct {
  182. weight float32
  183. f stressFunc
  184. }
  185. type stressTable struct {
  186. entries []stressEntry
  187. sumWeights float32
  188. }
  189. func createStressTable(entries []stressEntry) *stressTable {
  190. st := stressTable{entries: entries}
  191. for _, entry := range st.entries {
  192. st.sumWeights += entry.weight
  193. }
  194. return &st
  195. }
  196. func (st *stressTable) choose() stressFunc {
  197. v := rand.Float32() * st.sumWeights
  198. var sum float32
  199. var idx int
  200. for i := range st.entries {
  201. sum += st.entries[i].weight
  202. if sum >= v {
  203. idx = i
  204. break
  205. }
  206. }
  207. return st.entries[idx].f
  208. }
  209. func newStressPut(cli *clientv3.Client, keySuffixRange, keySize int) stressFunc {
  210. return func(ctx context.Context) (error, int64) {
  211. _, err := cli.Put(
  212. ctx,
  213. fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)),
  214. string(randBytes(keySize)),
  215. )
  216. return err, 1
  217. }
  218. }
  219. func newStressTxn(cli *clientv3.Client, keyTxnSuffixRange, txnOps int) stressFunc {
  220. keys := make([]string, keyTxnSuffixRange)
  221. for i := range keys {
  222. keys[i] = fmt.Sprintf("/k%03d", i)
  223. }
  224. return writeTxn(cli, keys, txnOps)
  225. }
  226. func writeTxn(cli *clientv3.Client, keys []string, txnOps int) stressFunc {
  227. return func(ctx context.Context) (error, int64) {
  228. ks := make(map[string]struct{}, txnOps)
  229. for len(ks) != txnOps {
  230. ks[keys[rand.Intn(len(keys))]] = struct{}{}
  231. }
  232. selected := make([]string, 0, txnOps)
  233. for k := range ks {
  234. selected = append(selected, k)
  235. }
  236. com, delOp, putOp := getTxnOps(selected[0], "bar00")
  237. thenOps := []clientv3.Op{delOp}
  238. elseOps := []clientv3.Op{putOp}
  239. for i := 1; i < txnOps; i++ { // nested txns
  240. k, v := selected[i], fmt.Sprintf("bar%02d", i)
  241. com, delOp, putOp = getTxnOps(k, v)
  242. txnOp := clientv3.OpTxn(
  243. []clientv3.Cmp{com},
  244. []clientv3.Op{delOp},
  245. []clientv3.Op{putOp},
  246. )
  247. thenOps = append(thenOps, txnOp)
  248. elseOps = append(elseOps, txnOp)
  249. }
  250. _, err := cli.Txn(ctx).
  251. If(com).
  252. Then(thenOps...).
  253. Else(elseOps...).
  254. Commit()
  255. return err, int64(txnOps)
  256. }
  257. }
  258. func getTxnOps(k, v string) (
  259. cmp clientv3.Cmp,
  260. dop clientv3.Op,
  261. pop clientv3.Op) {
  262. // if key exists (version > 0)
  263. cmp = clientv3.Compare(clientv3.Version(k), ">", 0)
  264. dop = clientv3.OpDelete(k)
  265. pop = clientv3.OpPut(k, v)
  266. return cmp, dop, pop
  267. }
  268. func newStressRange(cli *clientv3.Client, keySuffixRange int) stressFunc {
  269. return func(ctx context.Context) (error, int64) {
  270. _, err := cli.Get(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
  271. return err, 0
  272. }
  273. }
  274. func newStressRangeInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
  275. return func(ctx context.Context) (error, int64) {
  276. start := rand.Intn(keySuffixRange)
  277. end := start + 500
  278. _, err := cli.Get(
  279. ctx,
  280. fmt.Sprintf("foo%016x", start),
  281. clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
  282. )
  283. return err, 0
  284. }
  285. }
  286. func newStressDelete(cli *clientv3.Client, keySuffixRange int) stressFunc {
  287. return func(ctx context.Context) (error, int64) {
  288. _, err := cli.Delete(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
  289. return err, 1
  290. }
  291. }
  292. func newStressDeleteInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
  293. return func(ctx context.Context) (error, int64) {
  294. start := rand.Intn(keySuffixRange)
  295. end := start + 500
  296. resp, err := cli.Delete(ctx,
  297. fmt.Sprintf("foo%016x", start),
  298. clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
  299. )
  300. if err == nil {
  301. return nil, resp.Deleted
  302. }
  303. return err, 0
  304. }
  305. }