kvstore.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package mvcc
  15. import (
  16. "encoding/binary"
  17. "errors"
  18. "math"
  19. "sync"
  20. "time"
  21. "github.com/coreos/etcd/lease"
  22. "github.com/coreos/etcd/mvcc/backend"
  23. "github.com/coreos/etcd/mvcc/mvccpb"
  24. "github.com/coreos/etcd/pkg/schedule"
  25. "github.com/coreos/pkg/capnslog"
  26. "golang.org/x/net/context"
  27. )
  28. var (
  29. keyBucketName = []byte("key")
  30. metaBucketName = []byte("meta")
  31. // markedRevBytesLen is the byte length of marked revision.
  32. // The first `revBytesLen` bytes represents a normal revision. The last
  33. // one byte is the mark.
  34. markedRevBytesLen = revBytesLen + 1
  35. markBytePosition = markedRevBytesLen - 1
  36. markTombstone byte = 't'
  37. consistentIndexKeyName = []byte("consistent_index")
  38. scheduledCompactKeyName = []byte("scheduledCompactRev")
  39. finishedCompactKeyName = []byte("finishedCompactRev")
  40. ErrCompacted = errors.New("mvcc: required revision has been compacted")
  41. ErrFutureRev = errors.New("mvcc: required revision is a future revision")
  42. ErrCanceled = errors.New("mvcc: watcher is canceled")
  43. ErrClosed = errors.New("mvcc: closed")
  44. plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc")
  45. )
  46. // ConsistentIndexGetter is an interface that wraps the Get method.
  47. // Consistent index is the offset of an entry in a consistent replicated log.
  48. type ConsistentIndexGetter interface {
  49. // ConsistentIndex returns the consistent index of current executing entry.
  50. ConsistentIndex() uint64
  51. }
  52. type store struct {
  53. ReadView
  54. WriteView
  55. // mu read locks for txns and write locks for non-txn store changes.
  56. mu sync.RWMutex
  57. ig ConsistentIndexGetter
  58. b backend.Backend
  59. kvindex index
  60. le lease.Lessor
  61. // revMuLock protects currentRev and compactMainRev.
  62. // Locked at end of write txn and released after write txn unlock lock.
  63. // Locked before locking read txn and released after locking.
  64. revMu sync.RWMutex
  65. // currentRev is the revision of the last completed transaction.
  66. currentRev int64
  67. // compactMainRev is the main revision of the last compaction.
  68. compactMainRev int64
  69. // bytesBuf8 is a byte slice of length 8
  70. // to avoid a repetitive allocation in saveIndex.
  71. bytesBuf8 []byte
  72. fifoSched schedule.Scheduler
  73. stopc chan struct{}
  74. }
  75. // NewStore returns a new store. It is useful to create a store inside
  76. // mvcc pkg. It should only be used for testing externally.
  77. func NewStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *store {
  78. s := &store{
  79. b: b,
  80. ig: ig,
  81. kvindex: newTreeIndex(),
  82. le: le,
  83. currentRev: 1,
  84. compactMainRev: -1,
  85. bytesBuf8: make([]byte, 8),
  86. fifoSched: schedule.NewFIFOScheduler(),
  87. stopc: make(chan struct{}),
  88. }
  89. s.ReadView = &readView{s}
  90. s.WriteView = &writeView{s}
  91. if s.le != nil {
  92. s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write() })
  93. }
  94. tx := s.b.BatchTx()
  95. tx.Lock()
  96. tx.UnsafeCreateBucket(keyBucketName)
  97. tx.UnsafeCreateBucket(metaBucketName)
  98. tx.Unlock()
  99. s.b.ForceCommit()
  100. if err := s.restore(); err != nil {
  101. // TODO: return the error instead of panic here?
  102. panic("failed to recover store from backend")
  103. }
  104. return s
  105. }
  106. func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
  107. if ctx == nil || ctx.Err() != nil {
  108. s.mu.Lock()
  109. select {
  110. case <-s.stopc:
  111. default:
  112. f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
  113. s.fifoSched.Schedule(f)
  114. }
  115. s.mu.Unlock()
  116. return
  117. }
  118. close(ch)
  119. }
  120. func (s *store) Hash() (hash uint32, revision int64, err error) {
  121. // TODO: nothing should be able to call into backend when closed
  122. select {
  123. case <-s.stopc:
  124. return 0, 0, ErrClosed
  125. default:
  126. }
  127. s.b.ForceCommit()
  128. h, err := s.b.Hash(DefaultIgnores)
  129. return h, s.currentRev, err
  130. }
  131. func (s *store) Compact(rev int64) (<-chan struct{}, error) {
  132. s.mu.Lock()
  133. defer s.mu.Unlock()
  134. s.revMu.Lock()
  135. defer s.revMu.Unlock()
  136. if rev <= s.compactMainRev {
  137. ch := make(chan struct{})
  138. f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
  139. s.fifoSched.Schedule(f)
  140. return ch, ErrCompacted
  141. }
  142. if rev > s.currentRev {
  143. return nil, ErrFutureRev
  144. }
  145. start := time.Now()
  146. s.compactMainRev = rev
  147. rbytes := newRevBytes()
  148. revToBytes(revision{main: rev}, rbytes)
  149. tx := s.b.BatchTx()
  150. tx.Lock()
  151. tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
  152. tx.Unlock()
  153. // ensure that desired compaction is persisted
  154. s.b.ForceCommit()
  155. keep := s.kvindex.Compact(rev)
  156. ch := make(chan struct{})
  157. var j = func(ctx context.Context) {
  158. if ctx.Err() != nil {
  159. s.compactBarrier(ctx, ch)
  160. return
  161. }
  162. if !s.scheduleCompaction(rev, keep) {
  163. s.compactBarrier(nil, ch)
  164. return
  165. }
  166. close(ch)
  167. }
  168. s.fifoSched.Schedule(j)
  169. indexCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond))
  170. return ch, nil
  171. }
  172. // DefaultIgnores is a map of keys to ignore in hash checking.
  173. var DefaultIgnores map[backend.IgnoreKey]struct{}
  174. func init() {
  175. DefaultIgnores = map[backend.IgnoreKey]struct{}{
  176. // consistent index might be changed due to v2 internal sync, which
  177. // is not controllable by the user.
  178. {Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {},
  179. }
  180. }
  181. func (s *store) Commit() {
  182. s.mu.Lock()
  183. defer s.mu.Unlock()
  184. tx := s.b.BatchTx()
  185. tx.Lock()
  186. s.saveIndex(tx)
  187. tx.Unlock()
  188. s.b.ForceCommit()
  189. }
  190. func (s *store) Restore(b backend.Backend) error {
  191. s.mu.Lock()
  192. defer s.mu.Unlock()
  193. close(s.stopc)
  194. s.fifoSched.Stop()
  195. s.b = b
  196. s.kvindex = newTreeIndex()
  197. s.currentRev = 1
  198. s.compactMainRev = -1
  199. s.fifoSched = schedule.NewFIFOScheduler()
  200. s.stopc = make(chan struct{})
  201. return s.restore()
  202. }
  203. func (s *store) restore() error {
  204. min, max := newRevBytes(), newRevBytes()
  205. revToBytes(revision{main: 1}, min)
  206. revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)
  207. keyToLease := make(map[string]lease.LeaseID)
  208. // use an unordered map to hold the temp index data to speed up
  209. // the initial key index recovery.
  210. // we will convert this unordered map into the tree index later.
  211. unordered := make(map[string]*keyIndex, 100000)
  212. // restore index
  213. tx := s.b.BatchTx()
  214. tx.Lock()
  215. _, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
  216. if len(finishedCompactBytes) != 0 {
  217. s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
  218. plog.Printf("restore compact to %d", s.compactMainRev)
  219. }
  220. // TODO: limit N to reduce max memory usage
  221. keys, vals := tx.UnsafeRange(keyBucketName, min, max, 0)
  222. for i, key := range keys {
  223. var kv mvccpb.KeyValue
  224. if err := kv.Unmarshal(vals[i]); err != nil {
  225. plog.Fatalf("cannot unmarshal event: %v", err)
  226. }
  227. rev := bytesToRev(key[:revBytesLen])
  228. s.currentRev = rev.main
  229. // restore index
  230. switch {
  231. case isTombstone(key):
  232. if ki, ok := unordered[string(kv.Key)]; ok {
  233. ki.tombstone(rev.main, rev.sub)
  234. }
  235. delete(keyToLease, string(kv.Key))
  236. default:
  237. ki, ok := unordered[string(kv.Key)]
  238. if ok {
  239. ki.put(rev.main, rev.sub)
  240. } else {
  241. ki = &keyIndex{key: kv.Key}
  242. ki.restore(revision{kv.CreateRevision, 0}, rev, kv.Version)
  243. unordered[string(kv.Key)] = ki
  244. }
  245. if lid := lease.LeaseID(kv.Lease); lid != lease.NoLease {
  246. keyToLease[string(kv.Key)] = lid
  247. } else {
  248. delete(keyToLease, string(kv.Key))
  249. }
  250. }
  251. }
  252. // restore the tree index from the unordered index.
  253. for _, v := range unordered {
  254. s.kvindex.Insert(v)
  255. }
  256. // keys in the range [compacted revision -N, compaction] might all be deleted due to compaction.
  257. // the correct revision should be set to compaction revision in the case, not the largest revision
  258. // we have seen.
  259. if s.currentRev < s.compactMainRev {
  260. s.currentRev = s.compactMainRev
  261. }
  262. for key, lid := range keyToLease {
  263. if s.le == nil {
  264. panic("no lessor to attach lease")
  265. }
  266. err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}})
  267. if err != nil {
  268. plog.Errorf("unexpected Attach error: %v", err)
  269. }
  270. }
  271. _, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
  272. scheduledCompact := int64(0)
  273. if len(scheduledCompactBytes) != 0 {
  274. scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
  275. if scheduledCompact <= s.compactMainRev {
  276. scheduledCompact = 0
  277. }
  278. }
  279. tx.Unlock()
  280. if scheduledCompact != 0 {
  281. s.Compact(scheduledCompact)
  282. plog.Printf("resume scheduled compaction at %d", scheduledCompact)
  283. }
  284. return nil
  285. }
  286. func (s *store) Close() error {
  287. close(s.stopc)
  288. s.fifoSched.Stop()
  289. return nil
  290. }
  291. func (a *store) Equal(b *store) bool {
  292. if a.currentRev != b.currentRev {
  293. return false
  294. }
  295. if a.compactMainRev != b.compactMainRev {
  296. return false
  297. }
  298. return a.kvindex.Equal(b.kvindex)
  299. }
  300. func (s *store) saveIndex(tx backend.BatchTx) {
  301. if s.ig == nil {
  302. return
  303. }
  304. bs := s.bytesBuf8
  305. binary.BigEndian.PutUint64(bs, s.ig.ConsistentIndex())
  306. // put the index into the underlying backend
  307. // tx has been locked in TxnBegin, so there is no need to lock it again
  308. tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
  309. }
  310. func (s *store) ConsistentIndex() uint64 {
  311. // TODO: cache index in a uint64 field?
  312. tx := s.b.BatchTx()
  313. tx.Lock()
  314. defer tx.Unlock()
  315. _, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
  316. if len(vs) == 0 {
  317. return 0
  318. }
  319. return binary.BigEndian.Uint64(vs[0])
  320. }
  321. // appendMarkTombstone appends tombstone mark to normal revision bytes.
  322. func appendMarkTombstone(b []byte) []byte {
  323. if len(b) != revBytesLen {
  324. plog.Panicf("cannot append mark to non normal revision bytes")
  325. }
  326. return append(b, markTombstone)
  327. }
  328. // isTombstone checks whether the revision bytes is a tombstone.
  329. func isTombstone(b []byte) bool {
  330. return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
  331. }
  332. // revBytesRange returns the range of revision bytes at
  333. // the given revision.
  334. func revBytesRange(rev revision) (start, end []byte) {
  335. start = newRevBytes()
  336. revToBytes(rev, start)
  337. end = newRevBytes()
  338. endRev := revision{main: rev.main, sub: rev.sub + 1}
  339. revToBytes(endRev, end)
  340. return start, end
  341. }