kvstore_txn.go 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. // Copyright 2017 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package mvcc
  15. import (
  16. "go.etcd.io/etcd/lease"
  17. "go.etcd.io/etcd/mvcc/backend"
  18. "go.etcd.io/etcd/mvcc/mvccpb"
  19. "go.etcd.io/etcd/pkg/traceutil"
  20. "go.uber.org/zap"
  21. )
  22. type storeTxnRead struct {
  23. s *store
  24. tx backend.ReadTx
  25. firstRev int64
  26. rev int64
  27. trace *traceutil.Trace
  28. }
  29. func (s *store) Read(trace *traceutil.Trace) TxnRead {
  30. s.mu.RLock()
  31. s.revMu.RLock()
  32. // backend holds b.readTx.RLock() only when creating the concurrentReadTx. After
  33. // ConcurrentReadTx is created, it will not block write transaction.
  34. tx := s.b.ConcurrentReadTx()
  35. tx.RLock() // RLock is no-op. concurrentReadTx does not need to be locked after it is created.
  36. firstRev, rev := s.compactMainRev, s.currentRev
  37. s.revMu.RUnlock()
  38. return newMetricsTxnRead(&storeTxnRead{s, tx, firstRev, rev, trace})
  39. }
  40. func (tr *storeTxnRead) FirstRev() int64 { return tr.firstRev }
  41. func (tr *storeTxnRead) Rev() int64 { return tr.rev }
  42. func (tr *storeTxnRead) Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error) {
  43. return tr.rangeKeys(key, end, tr.Rev(), ro)
  44. }
  45. func (tr *storeTxnRead) End() {
  46. tr.tx.RUnlock() // RUnlock signals the end of concurrentReadTx.
  47. tr.s.mu.RUnlock()
  48. }
  49. type storeTxnWrite struct {
  50. storeTxnRead
  51. tx backend.BatchTx
  52. // beginRev is the revision where the txn begins; it will write to the next revision.
  53. beginRev int64
  54. changes []mvccpb.KeyValue
  55. }
  56. func (s *store) Write(trace *traceutil.Trace) TxnWrite {
  57. s.mu.RLock()
  58. tx := s.b.BatchTx()
  59. tx.Lock()
  60. tw := &storeTxnWrite{
  61. storeTxnRead: storeTxnRead{s, tx, 0, 0, trace},
  62. tx: tx,
  63. beginRev: s.currentRev,
  64. changes: make([]mvccpb.KeyValue, 0, 4),
  65. }
  66. return newMetricsTxnWrite(tw)
  67. }
  68. func (tw *storeTxnWrite) Rev() int64 { return tw.beginRev }
  69. func (tw *storeTxnWrite) Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error) {
  70. rev := tw.beginRev
  71. if len(tw.changes) > 0 {
  72. rev++
  73. }
  74. return tw.rangeKeys(key, end, rev, ro)
  75. }
  76. func (tw *storeTxnWrite) DeleteRange(key, end []byte) (int64, int64) {
  77. if n := tw.deleteRange(key, end); n != 0 || len(tw.changes) > 0 {
  78. return n, tw.beginRev + 1
  79. }
  80. return 0, tw.beginRev
  81. }
  82. func (tw *storeTxnWrite) Put(key, value []byte, lease lease.LeaseID) int64 {
  83. tw.put(key, value, lease)
  84. return tw.beginRev + 1
  85. }
  86. func (tw *storeTxnWrite) End() {
  87. // only update index if the txn modifies the mvcc state.
  88. if len(tw.changes) != 0 {
  89. tw.s.saveIndex(tw.tx)
  90. // hold revMu lock to prevent new read txns from opening until writeback.
  91. tw.s.revMu.Lock()
  92. tw.s.currentRev++
  93. }
  94. tw.tx.Unlock()
  95. if len(tw.changes) != 0 {
  96. tw.s.revMu.Unlock()
  97. }
  98. tw.s.mu.RUnlock()
  99. }
  100. func (tr *storeTxnRead) rangeKeys(key, end []byte, curRev int64, ro RangeOptions) (*RangeResult, error) {
  101. rev := ro.Rev
  102. if rev > curRev {
  103. return &RangeResult{KVs: nil, Count: -1, Rev: curRev}, ErrFutureRev
  104. }
  105. if rev <= 0 {
  106. rev = curRev
  107. }
  108. if rev < tr.s.compactMainRev {
  109. return &RangeResult{KVs: nil, Count: -1, Rev: 0}, ErrCompacted
  110. }
  111. revpairs := tr.s.kvindex.Revisions(key, end, rev)
  112. tr.trace.Step("range keys from in-memory index tree")
  113. if len(revpairs) == 0 {
  114. return &RangeResult{KVs: nil, Count: 0, Rev: curRev}, nil
  115. }
  116. if ro.Count {
  117. return &RangeResult{KVs: nil, Count: len(revpairs), Rev: curRev}, nil
  118. }
  119. limit := int(ro.Limit)
  120. if limit <= 0 || limit > len(revpairs) {
  121. limit = len(revpairs)
  122. }
  123. kvs := make([]mvccpb.KeyValue, limit)
  124. revBytes := newRevBytes()
  125. for i, revpair := range revpairs[:len(kvs)] {
  126. revToBytes(revpair, revBytes)
  127. _, vs := tr.tx.UnsafeRange(keyBucketName, revBytes, nil, 0)
  128. if len(vs) != 1 {
  129. if tr.s.lg != nil {
  130. tr.s.lg.Fatal(
  131. "range failed to find revision pair",
  132. zap.Int64("revision-main", revpair.main),
  133. zap.Int64("revision-sub", revpair.sub),
  134. )
  135. } else {
  136. plog.Fatalf("range cannot find rev (%d,%d)", revpair.main, revpair.sub)
  137. }
  138. }
  139. if err := kvs[i].Unmarshal(vs[0]); err != nil {
  140. if tr.s.lg != nil {
  141. tr.s.lg.Fatal(
  142. "failed to unmarshal mvccpb.KeyValue",
  143. zap.Error(err),
  144. )
  145. } else {
  146. plog.Fatalf("cannot unmarshal event: %v", err)
  147. }
  148. }
  149. }
  150. tr.trace.Step("range keys from bolt db")
  151. return &RangeResult{KVs: kvs, Count: len(revpairs), Rev: curRev}, nil
  152. }
  153. func (tw *storeTxnWrite) put(key, value []byte, leaseID lease.LeaseID) {
  154. rev := tw.beginRev + 1
  155. c := rev
  156. oldLease := lease.NoLease
  157. // if the key exists before, use its previous created and
  158. // get its previous leaseID
  159. _, created, ver, err := tw.s.kvindex.Get(key, rev)
  160. if err == nil {
  161. c = created.main
  162. oldLease = tw.s.le.GetLease(lease.LeaseItem{Key: string(key)})
  163. }
  164. tw.trace.Step("get key's previous created_revision and leaseID")
  165. ibytes := newRevBytes()
  166. idxRev := revision{main: rev, sub: int64(len(tw.changes))}
  167. revToBytes(idxRev, ibytes)
  168. ver = ver + 1
  169. kv := mvccpb.KeyValue{
  170. Key: key,
  171. Value: value,
  172. CreateRevision: c,
  173. ModRevision: rev,
  174. Version: ver,
  175. Lease: int64(leaseID),
  176. }
  177. d, err := kv.Marshal()
  178. if err != nil {
  179. if tw.storeTxnRead.s.lg != nil {
  180. tw.storeTxnRead.s.lg.Fatal(
  181. "failed to marshal mvccpb.KeyValue",
  182. zap.Error(err),
  183. )
  184. } else {
  185. plog.Fatalf("cannot marshal event: %v", err)
  186. }
  187. }
  188. tw.trace.Step("marshal mvccpb.KeyValue")
  189. tw.tx.UnsafeSeqPut(keyBucketName, ibytes, d)
  190. tw.s.kvindex.Put(key, idxRev)
  191. tw.changes = append(tw.changes, kv)
  192. tw.trace.Step("store kv pair into bolt db")
  193. if oldLease != lease.NoLease {
  194. if tw.s.le == nil {
  195. panic("no lessor to detach lease")
  196. }
  197. err = tw.s.le.Detach(oldLease, []lease.LeaseItem{{Key: string(key)}})
  198. if err != nil {
  199. if tw.storeTxnRead.s.lg != nil {
  200. tw.storeTxnRead.s.lg.Fatal(
  201. "failed to detach old lease from a key",
  202. zap.Error(err),
  203. )
  204. } else {
  205. plog.Errorf("unexpected error from lease detach: %v", err)
  206. }
  207. }
  208. }
  209. if leaseID != lease.NoLease {
  210. if tw.s.le == nil {
  211. panic("no lessor to attach lease")
  212. }
  213. err = tw.s.le.Attach(leaseID, []lease.LeaseItem{{Key: string(key)}})
  214. if err != nil {
  215. panic("unexpected error from lease Attach")
  216. }
  217. }
  218. tw.trace.Step("attach lease to kv pair")
  219. }
  220. func (tw *storeTxnWrite) deleteRange(key, end []byte) int64 {
  221. rrev := tw.beginRev
  222. if len(tw.changes) > 0 {
  223. rrev++
  224. }
  225. keys, _ := tw.s.kvindex.Range(key, end, rrev)
  226. if len(keys) == 0 {
  227. return 0
  228. }
  229. for _, key := range keys {
  230. tw.delete(key)
  231. }
  232. return int64(len(keys))
  233. }
  234. func (tw *storeTxnWrite) delete(key []byte) {
  235. ibytes := newRevBytes()
  236. idxRev := revision{main: tw.beginRev + 1, sub: int64(len(tw.changes))}
  237. revToBytes(idxRev, ibytes)
  238. if tw.storeTxnRead.s != nil && tw.storeTxnRead.s.lg != nil {
  239. ibytes = appendMarkTombstone(tw.storeTxnRead.s.lg, ibytes)
  240. } else {
  241. // TODO: remove this in v3.5
  242. ibytes = appendMarkTombstone(nil, ibytes)
  243. }
  244. kv := mvccpb.KeyValue{Key: key}
  245. d, err := kv.Marshal()
  246. if err != nil {
  247. if tw.storeTxnRead.s.lg != nil {
  248. tw.storeTxnRead.s.lg.Fatal(
  249. "failed to marshal mvccpb.KeyValue",
  250. zap.Error(err),
  251. )
  252. } else {
  253. plog.Fatalf("cannot marshal event: %v", err)
  254. }
  255. }
  256. tw.tx.UnsafeSeqPut(keyBucketName, ibytes, d)
  257. err = tw.s.kvindex.Tombstone(key, idxRev)
  258. if err != nil {
  259. if tw.storeTxnRead.s.lg != nil {
  260. tw.storeTxnRead.s.lg.Fatal(
  261. "failed to tombstone an existing key",
  262. zap.String("key", string(key)),
  263. zap.Error(err),
  264. )
  265. } else {
  266. plog.Fatalf("cannot tombstone an existing key (%s): %v", string(key), err)
  267. }
  268. }
  269. tw.changes = append(tw.changes, kv)
  270. item := lease.LeaseItem{Key: string(key)}
  271. leaseID := tw.s.le.GetLease(item)
  272. if leaseID != lease.NoLease {
  273. err = tw.s.le.Detach(leaseID, []lease.LeaseItem{item})
  274. if err != nil {
  275. if tw.storeTxnRead.s.lg != nil {
  276. tw.storeTxnRead.s.lg.Fatal(
  277. "failed to detach old lease from a key",
  278. zap.Error(err),
  279. )
  280. } else {
  281. plog.Errorf("cannot detach %v", err)
  282. }
  283. }
  284. }
  285. }
  286. func (tw *storeTxnWrite) Changes() []mvccpb.KeyValue { return tw.changes }