stress_lease.go 13 KB


  // Copyright 2018 The etcd Authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //     http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.
  14. package tester
  15. import (
  16. "context"
  17. "fmt"
  18. "math/rand"
  19. "sync"
  20. "sync/atomic"
  21. "time"
  22. "github.com/coreos/etcd/clientv3"
  23. "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
  24. "github.com/coreos/etcd/tools/functional-tester/rpcpb"
  25. "go.uber.org/zap"
  26. "golang.org/x/time/rate"
  27. "google.golang.org/grpc"
  28. )
  // Lease TTLs, in seconds, handed to the lease grant call.
  const (
  	// defaultTTL is the TTL of the leases that keepLeaseAlive renews.
  	defaultTTL = 120
  	// defaultTTLShort is the TTL of the short-lived leases, which are
  	// created without a keepalive goroutine.
  	defaultTTLShort = 2
  )
  34. type leaseStresser struct {
  35. lg *zap.Logger
  36. m *rpcpb.Member
  37. cli *clientv3.Client
  38. ctx context.Context
  39. cancel func()
  40. rateLimiter *rate.Limiter
  41. // atomicModifiedKey records the number of keys created and deleted during a test case
  42. atomicModifiedKey int64
  43. numLeases int
  44. keysPerLease int
  45. aliveLeases *atomicLeases
  46. revokedLeases *atomicLeases
  47. shortLivedLeases *atomicLeases
  48. runWg sync.WaitGroup
  49. aliveWg sync.WaitGroup
  50. }
  // atomicLeases is a mutex-guarded map from lease ID to a timestamp. Callers
  // in this file store the creation, renewal or revocation time, depending on
  // which lease set the value backs.
  //
  // The zero value is ready to use: add allocates the map on first write.
  type atomicLeases struct {
  	// rwLock protects leases, which is read and modified by different
  	// goroutines.
  	rwLock sync.RWMutex
  	leases map[int64]time.Time
  }

  // add records leaseID with timestamp t, overwriting any existing entry.
  func (al *atomicLeases) add(leaseID int64, t time.Time) {
  	al.rwLock.Lock()
  	defer al.rwLock.Unlock()
  	// Writing to a nil map panics, so allocate lazily for zero-value receivers.
  	if al.leases == nil {
  		al.leases = make(map[int64]time.Time)
  	}
  	al.leases[leaseID] = t
  }

  // update replaces the timestamp of leaseID with t, but only if leaseID is
  // already present; an absent lease is never (re)inserted.
  func (al *atomicLeases) update(leaseID int64, t time.Time) {
  	al.rwLock.Lock()
  	defer al.rwLock.Unlock()
  	if _, ok := al.leases[leaseID]; ok {
  		al.leases[leaseID] = t
  	}
  }

  // read returns the timestamp stored for leaseID and whether it is present.
  func (al *atomicLeases) read(leaseID int64) (rv time.Time, ok bool) {
  	al.rwLock.RLock()
  	defer al.rwLock.RUnlock()
  	rv, ok = al.leases[leaseID]
  	return rv, ok
  }

  // remove deletes leaseID; removing an absent lease is a no-op.
  func (al *atomicLeases) remove(leaseID int64) {
  	al.rwLock.Lock()
  	defer al.rwLock.Unlock()
  	delete(al.leases, leaseID)
  }

  // getLeasesMap returns a snapshot copy of the lease map. The copy is never
  // nil and can be iterated or modified without holding the lock.
  func (al *atomicLeases) getLeasesMap() map[int64]time.Time {
  	al.rwLock.RLock()
  	defer al.rwLock.RUnlock()
  	leasesCopy := make(map[int64]time.Time, len(al.leases))
  	for k, v := range al.leases {
  		leasesCopy[k] = v
  	}
  	return leasesCopy
  }
  90. func (ls *leaseStresser) setupOnce() error {
  91. if ls.aliveLeases != nil {
  92. return nil
  93. }
  94. if ls.numLeases == 0 {
  95. panic("expect numLeases to be set")
  96. }
  97. if ls.keysPerLease == 0 {
  98. panic("expect keysPerLease to be set")
  99. }
  100. ls.aliveLeases = &atomicLeases{leases: make(map[int64]time.Time)}
  101. return nil
  102. }
  103. func (ls *leaseStresser) Stress() error {
  104. ls.lg.Info(
  105. "lease stresser is started",
  106. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  107. )
  108. if err := ls.setupOnce(); err != nil {
  109. return err
  110. }
  111. ctx, cancel := context.WithCancel(context.Background())
  112. ls.ctx = ctx
  113. ls.cancel = cancel
  114. cli, err := ls.m.CreateEtcdClient(grpc.WithBackoffMaxDelay(1 * time.Second))
  115. if err != nil {
  116. return fmt.Errorf("%v (%s)", err, ls.m.EtcdClientEndpoint)
  117. }
  118. ls.cli = cli
  119. ls.revokedLeases = &atomicLeases{leases: make(map[int64]time.Time)}
  120. ls.shortLivedLeases = &atomicLeases{leases: make(map[int64]time.Time)}
  121. ls.runWg.Add(1)
  122. go ls.run()
  123. return nil
  124. }
  125. func (ls *leaseStresser) run() {
  126. defer ls.runWg.Done()
  127. ls.restartKeepAlives()
  128. for {
  129. // the number of keys created and deleted is roughly 2x the number of created keys for an iteration.
  130. // the rateLimiter therefore consumes 2x ls.numLeases*ls.keysPerLease tokens where each token represents a create/delete operation for key.
  131. err := ls.rateLimiter.WaitN(ls.ctx, 2*ls.numLeases*ls.keysPerLease)
  132. if err == context.Canceled {
  133. return
  134. }
  135. ls.lg.Debug(
  136. "lease stresser is creating leases",
  137. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  138. )
  139. ls.createLeases()
  140. ls.lg.Debug(
  141. "lease stresser created leases",
  142. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  143. )
  144. ls.lg.Debug(
  145. "lease stresser is dropped leases",
  146. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  147. )
  148. ls.randomlyDropLeases()
  149. ls.lg.Debug(
  150. "lease stresser dropped leases",
  151. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  152. )
  153. }
  154. }
  155. func (ls *leaseStresser) restartKeepAlives() {
  156. for leaseID := range ls.aliveLeases.getLeasesMap() {
  157. ls.aliveWg.Add(1)
  158. go func(id int64) {
  159. ls.keepLeaseAlive(id)
  160. }(leaseID)
  161. }
  162. }
  163. func (ls *leaseStresser) createLeases() {
  164. ls.createAliveLeases()
  165. ls.createShortLivedLeases()
  166. }
  167. func (ls *leaseStresser) createAliveLeases() {
  168. neededLeases := ls.numLeases - len(ls.aliveLeases.getLeasesMap())
  169. var wg sync.WaitGroup
  170. for i := 0; i < neededLeases; i++ {
  171. wg.Add(1)
  172. go func() {
  173. defer wg.Done()
  174. leaseID, err := ls.createLeaseWithKeys(defaultTTL)
  175. if err != nil {
  176. ls.lg.Debug(
  177. "createLeaseWithKeys failed",
  178. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  179. zap.Error(err),
  180. )
  181. return
  182. }
  183. ls.aliveLeases.add(leaseID, time.Now())
  184. // keep track of all the keep lease alive go routines
  185. ls.aliveWg.Add(1)
  186. go ls.keepLeaseAlive(leaseID)
  187. }()
  188. }
  189. wg.Wait()
  190. }
  191. func (ls *leaseStresser) createShortLivedLeases() {
  192. // one round of createLeases() might not create all the short lived leases we want due to falures.
  193. // thus, we want to create remaining short lived leases in the future round.
  194. neededLeases := ls.numLeases - len(ls.shortLivedLeases.getLeasesMap())
  195. var wg sync.WaitGroup
  196. for i := 0; i < neededLeases; i++ {
  197. wg.Add(1)
  198. go func() {
  199. defer wg.Done()
  200. leaseID, err := ls.createLeaseWithKeys(defaultTTLShort)
  201. if err != nil {
  202. return
  203. }
  204. ls.shortLivedLeases.add(leaseID, time.Now())
  205. }()
  206. }
  207. wg.Wait()
  208. }
  209. func (ls *leaseStresser) createLeaseWithKeys(ttl int64) (int64, error) {
  210. leaseID, err := ls.createLease(ttl)
  211. if err != nil {
  212. ls.lg.Debug(
  213. "createLease failed",
  214. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  215. zap.Error(err),
  216. )
  217. return -1, err
  218. }
  219. ls.lg.Debug(
  220. "createLease created lease",
  221. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  222. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  223. )
  224. if err := ls.attachKeysWithLease(leaseID); err != nil {
  225. return -1, err
  226. }
  227. return leaseID, nil
  228. }
  229. func (ls *leaseStresser) randomlyDropLeases() {
  230. var wg sync.WaitGroup
  231. for l := range ls.aliveLeases.getLeasesMap() {
  232. wg.Add(1)
  233. go func(leaseID int64) {
  234. defer wg.Done()
  235. dropped, err := ls.randomlyDropLease(leaseID)
  236. // if randomlyDropLease encountered an error such as context is cancelled, remove the lease from aliveLeases
  237. // because we can't tell whether the lease is dropped or not.
  238. if err != nil {
  239. ls.lg.Debug(
  240. "randomlyDropLease failed",
  241. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  242. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  243. zap.Error(err),
  244. )
  245. ls.aliveLeases.remove(leaseID)
  246. return
  247. }
  248. if !dropped {
  249. return
  250. }
  251. ls.lg.Debug(
  252. "randomlyDropLease dropped a lease",
  253. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  254. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  255. )
  256. ls.revokedLeases.add(leaseID, time.Now())
  257. ls.aliveLeases.remove(leaseID)
  258. }(l)
  259. }
  260. wg.Wait()
  261. }
  262. func (ls *leaseStresser) createLease(ttl int64) (int64, error) {
  263. resp, err := ls.cli.Grant(ls.ctx, ttl)
  264. if err != nil {
  265. return -1, err
  266. }
  267. return int64(resp.ID), nil
  268. }
  269. func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
  270. defer ls.aliveWg.Done()
  271. ctx, cancel := context.WithCancel(ls.ctx)
  272. stream, err := ls.cli.KeepAlive(ctx, clientv3.LeaseID(leaseID))
  273. defer func() { cancel() }()
  274. for {
  275. select {
  276. case <-time.After(500 * time.Millisecond):
  277. case <-ls.ctx.Done():
  278. ls.lg.Debug(
  279. "keepLeaseAlive context canceled",
  280. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  281. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  282. zap.Error(ls.ctx.Err()),
  283. )
  284. // it is possible that lease expires at invariant checking phase but not at keepLeaseAlive() phase.
  285. // this scenerio is possible when alive lease is just about to expire when keepLeaseAlive() exists and expires at invariant checking phase.
  286. // to circumvent that scenerio, we check each lease before keepalive loop exist to see if it has been renewed in last TTL/2 duration.
  287. // if it is renewed, this means that invariant checking have at least ttl/2 time before lease exipres which is long enough for the checking to finish.
  288. // if it is not renewed, we remove the lease from the alive map so that the lease doesn't exipre during invariant checking
  289. renewTime, ok := ls.aliveLeases.read(leaseID)
  290. if ok && renewTime.Add(defaultTTL/2*time.Second).Before(time.Now()) {
  291. ls.aliveLeases.remove(leaseID)
  292. ls.lg.Debug(
  293. "keepLeaseAlive lease has not been renewed, dropped it",
  294. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  295. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  296. )
  297. }
  298. return
  299. }
  300. if err != nil {
  301. ls.lg.Debug(
  302. "keepLeaseAlive lease creates stream error",
  303. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  304. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  305. zap.Error(err),
  306. )
  307. cancel()
  308. ctx, cancel = context.WithCancel(ls.ctx)
  309. stream, err = ls.cli.KeepAlive(ctx, clientv3.LeaseID(leaseID))
  310. cancel()
  311. continue
  312. }
  313. if err != nil {
  314. ls.lg.Debug(
  315. "keepLeaseAlive failed to receive lease keepalive response",
  316. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  317. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  318. zap.Error(err),
  319. )
  320. continue
  321. }
  322. ls.lg.Debug(
  323. "keepLeaseAlive waiting on lease stream",
  324. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  325. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  326. )
  327. leaseRenewTime := time.Now()
  328. respRC := <-stream
  329. if respRC == nil {
  330. ls.lg.Debug(
  331. "keepLeaseAlive received nil lease keepalive response",
  332. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  333. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  334. )
  335. continue
  336. }
  337. // lease expires after TTL become 0
  338. // don't send keepalive if the lease has expired
  339. if respRC.TTL <= 0 {
  340. ls.lg.Debug(
  341. "keepLeaseAlive stream received lease keepalive response TTL <= 0",
  342. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  343. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  344. zap.Int64("ttl", respRC.TTL),
  345. )
  346. ls.aliveLeases.remove(leaseID)
  347. return
  348. }
  349. // renew lease timestamp only if lease is present
  350. ls.lg.Debug(
  351. "keepLeaseAlive renewed a lease",
  352. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  353. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  354. )
  355. ls.aliveLeases.update(leaseID, leaseRenewTime)
  356. }
  357. }
  358. // attachKeysWithLease function attaches keys to the lease.
  359. // the format of key is the concat of leaseID + '_' + '<order of key creation>'
  360. // e.g 5186835655248304152_0 for first created key and 5186835655248304152_1 for second created key
  361. func (ls *leaseStresser) attachKeysWithLease(leaseID int64) error {
  362. var txnPuts []clientv3.Op
  363. for j := 0; j < ls.keysPerLease; j++ {
  364. txnput := clientv3.OpPut(
  365. fmt.Sprintf("%d%s%d", leaseID, "_", j),
  366. fmt.Sprintf("bar"),
  367. clientv3.WithLease(clientv3.LeaseID(leaseID)),
  368. )
  369. txnPuts = append(txnPuts, txnput)
  370. }
  371. // keep retrying until lease is not found or ctx is being canceled
  372. for ls.ctx.Err() == nil {
  373. _, err := ls.cli.Txn(ls.ctx).Then(txnPuts...).Commit()
  374. if err == nil {
  375. // since all created keys will be deleted too, the number of operations on keys will be roughly 2x the number of created keys
  376. atomic.AddInt64(&ls.atomicModifiedKey, 2*int64(ls.keysPerLease))
  377. return nil
  378. }
  379. if rpctypes.Error(err) == rpctypes.ErrLeaseNotFound {
  380. return err
  381. }
  382. }
  383. return ls.ctx.Err()
  384. }
  385. // randomlyDropLease drops the lease only when the rand.Int(2) returns 1.
  386. // This creates a 50/50 percents chance of dropping a lease
  387. func (ls *leaseStresser) randomlyDropLease(leaseID int64) (bool, error) {
  388. if rand.Intn(2) != 0 {
  389. return false, nil
  390. }
  391. // keep retrying until a lease is dropped or ctx is being canceled
  392. for ls.ctx.Err() == nil {
  393. _, err := ls.cli.Revoke(ls.ctx, clientv3.LeaseID(leaseID))
  394. if err == nil || rpctypes.Error(err) == rpctypes.ErrLeaseNotFound {
  395. return true, nil
  396. }
  397. }
  398. ls.lg.Debug(
  399. "randomlyDropLease error",
  400. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  401. zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
  402. zap.Error(ls.ctx.Err()),
  403. )
  404. return false, ls.ctx.Err()
  405. }
  406. func (ls *leaseStresser) Pause() {
  407. ls.Close()
  408. }
  409. func (ls *leaseStresser) Close() {
  410. ls.lg.Info(
  411. "lease stresser is closing",
  412. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  413. )
  414. ls.cancel()
  415. ls.runWg.Wait()
  416. ls.aliveWg.Wait()
  417. ls.cli.Close()
  418. ls.lg.Info(
  419. "lease stresser is closed",
  420. zap.String("endpoint", ls.m.EtcdClientEndpoint),
  421. )
  422. }
  423. func (ls *leaseStresser) ModifiedKeys() int64 {
  424. return atomic.LoadInt64(&ls.atomicModifiedKey)
  425. }
  426. func (ls *leaseStresser) Checker() Checker {
  427. return &leaseChecker{lg: ls.lg, m: ls.m, ls: ls}
  428. }