kvstore.go

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mvcc

import (
	"encoding/binary"
	"errors"
	"math"
	"sync"
	"time"

	"github.com/coreos/etcd/lease"
	"github.com/coreos/etcd/mvcc/backend"
	"github.com/coreos/etcd/mvcc/mvccpb"
	"github.com/coreos/etcd/pkg/schedule"
	"github.com/coreos/pkg/capnslog"

	"golang.org/x/net/context"
)

var (
	keyBucketName  = []byte("key")
	metaBucketName = []byte("meta")

	consistentIndexKeyName  = []byte("consistent_index")
	scheduledCompactKeyName = []byte("scheduledCompactRev")
	finishedCompactKeyName  = []byte("finishedCompactRev")

	ErrCompacted = errors.New("mvcc: required revision has been compacted")
	ErrFutureRev = errors.New("mvcc: required revision is a future revision")
	ErrCanceled  = errors.New("mvcc: watcher is canceled")
	ErrClosed    = errors.New("mvcc: closed")

	plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc")
)

const (
	// markedRevBytesLen is the byte length of a marked revision.
	// The first `revBytesLen` bytes represent a normal revision. The last
	// byte is the mark.
	markedRevBytesLen      = revBytesLen + 1
	markBytePosition       = markedRevBytesLen - 1
	markTombstone     byte = 't'
)

var restoreChunkKeys = 10000 // non-const for testing

// ConsistentIndexGetter is an interface that wraps the ConsistentIndex method.
// A consistent index is the offset of an entry in a consistent replicated log.
type ConsistentIndexGetter interface {
	// ConsistentIndex returns the consistent index of the currently executing entry.
	ConsistentIndex() uint64
}
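
// A minimal sketch of an implementation, as a test might supply one; the
// name fakeConsistentIndex is hypothetical and not part of this package:
//
//	type fakeConsistentIndex uint64
//
//	func (i *fakeConsistentIndex) ConsistentIndex() uint64 {
//		return atomic.LoadUint64((*uint64)(i))
//	}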

type store struct {
	ReadView
	WriteView

	// mu is read-locked by txns and write-locked by non-txn store changes.
	mu sync.RWMutex

	ig ConsistentIndexGetter

	b       backend.Backend
	kvindex index

	le lease.Lessor

	// revMu protects currentRev and compactMainRev.
	// Locked at the end of a write txn and released after the write txn unlocks.
	// Locked before locking a read txn and released after locking.
	revMu sync.RWMutex
	// currentRev is the revision of the last completed transaction.
	currentRev int64
	// compactMainRev is the main revision of the last compaction.
	compactMainRev int64

	// bytesBuf8 is a byte slice of length 8
	// to avoid a repetitive allocation in saveIndex.
	bytesBuf8 []byte

	fifoSched schedule.Scheduler

	stopc chan struct{}
}

// NewStore returns a new store. It is useful for creating a store inside the
// mvcc package; externally it should only be used for testing.
func NewStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *store {
	s := &store{
		b:       b,
		ig:      ig,
		kvindex: newTreeIndex(),

		le: le,

		currentRev:     1,
		compactMainRev: -1,

		bytesBuf8: make([]byte, 8),
		fifoSched: schedule.NewFIFOScheduler(),

		stopc: make(chan struct{}),
	}
	s.ReadView = &readView{s}
	s.WriteView = &writeView{s}
	if s.le != nil {
		s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write() })
	}

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafeCreateBucket(keyBucketName)
	tx.UnsafeCreateBucket(metaBucketName)
	tx.Unlock()
	s.b.ForceCommit()

	if err := s.restore(); err != nil {
		// TODO: return the error instead of panic here?
		panic("failed to recover store from backend")
	}

	return s
}
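
// Usage sketch (the path, lessor, and igetter names below are hypothetical;
// ig may be nil, in which case saveIndex simply skips persisting the
// consistent index):
//
//	b := backend.NewDefaultBackend("/var/lib/etcd/db")
//	s := NewStore(b, lessor, igetter)
//	defer s.Close()

// compactBarrier reschedules itself on the FIFO scheduler for as long as its
// context is nil or canceled, then closes ch to release waiters; if the store
// is stopping, the barrier is dropped without closing ch.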
func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
	if ctx == nil || ctx.Err() != nil {
		s.mu.Lock()
		select {
		case <-s.stopc:
		default:
			f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
			s.fifoSched.Schedule(f)
		}
		s.mu.Unlock()
		return
	}
	close(ch)
}
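
// Hash force-commits pending writes and hashes the whole backend keyspace,
// excluding the keys in DefaultIgnores; it reports the hash together with
// the current revision.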
func (s *store) Hash() (hash uint32, revision int64, err error) {
	start := time.Now()

	s.b.ForceCommit()
	h, err := s.b.Hash(DefaultIgnores)

	hashDurations.Observe(time.Since(start).Seconds())
	return h, s.currentRev, err
}

func (s *store) Compact(rev int64) (<-chan struct{}, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.revMu.Lock()
	defer s.revMu.Unlock()

	if rev <= s.compactMainRev {
		ch := make(chan struct{})
		f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
		s.fifoSched.Schedule(f)
		return ch, ErrCompacted
	}
	if rev > s.currentRev {
		return nil, ErrFutureRev
	}

	start := time.Now()

	s.compactMainRev = rev

	rbytes := newRevBytes()
	revToBytes(revision{main: rev}, rbytes)

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
	tx.Unlock()
	// ensure that desired compaction is persisted
	s.b.ForceCommit()

	keep := s.kvindex.Compact(rev)
	ch := make(chan struct{})
	var j = func(ctx context.Context) {
		if ctx.Err() != nil {
			s.compactBarrier(ctx, ch)
			return
		}
		if !s.scheduleCompaction(rev, keep) {
			s.compactBarrier(nil, ch)
			return
		}
		close(ch)
	}

	s.fifoSched.Schedule(j)

	indexCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond))

	return ch, nil
}
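
// Usage sketch: Compact only schedules the work; a caller that must block
// until the compaction has been applied can wait on the returned channel:
//
//	ch, err := s.Compact(rev)
//	if err != nil {
//		return err // e.g. ErrCompacted or ErrFutureRev
//	}
//	<-ch // closed once the scheduled compaction finishes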

// DefaultIgnores is a map of keys to ignore in hash checking.
var DefaultIgnores map[backend.IgnoreKey]struct{}

func init() {
	DefaultIgnores = map[backend.IgnoreKey]struct{}{
		// consistent index might be changed due to v2 internal sync, which
		// is not controllable by the user.
		{Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {},
	}
}
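
// Commit persists the consistent index and flushes all pending backend
// writes to disk.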
func (s *store) Commit() {
	s.mu.Lock()
	defer s.mu.Unlock()

	tx := s.b.BatchTx()
	tx.Lock()
	s.saveIndex(tx)
	tx.Unlock()
	s.b.ForceCommit()
}
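
// Restore swaps in a new backend, typically after a snapshot has been
// applied, and rebuilds the in-memory state from it.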
func (s *store) Restore(b backend.Backend) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	close(s.stopc)
	s.fifoSched.Stop()

	s.b = b
	s.kvindex = newTreeIndex()
	s.currentRev = 1
	s.compactMainRev = -1
	s.fifoSched = schedule.NewFIFOScheduler()
	s.stopc = make(chan struct{})

	return s.restore()
}

func (s *store) restore() error {
	s.setupMetricsReporter()

	min, max := newRevBytes(), newRevBytes()
	revToBytes(revision{main: 1}, min)
	revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)

	keyToLease := make(map[string]lease.LeaseID)

	// restore index
	tx := s.b.BatchTx()
	tx.Lock()

	_, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
	if len(finishedCompactBytes) != 0 {
		s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
		plog.Printf("restore compact to %d", s.compactMainRev)
	}
	_, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
	scheduledCompact := int64(0)
	if len(scheduledCompactBytes) != 0 {
		scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
	}

	// index keys concurrently as they're loaded in from tx
	keysGauge.Set(0)
	rkvc, revc := restoreIntoIndex(s.kvindex)
	for {
		keys, vals := tx.UnsafeRange(keyBucketName, min, max, int64(restoreChunkKeys))
		if len(keys) == 0 {
			break
		}
		// rkvc blocks if the total number of pending keys exceeds the restore
		// chunk size, to keep keys from consuming too much memory.
		restoreChunk(rkvc, keys, vals, keyToLease)
		if len(keys) < restoreChunkKeys {
			// partial set implies final set
			break
		}
		// next set begins after where this one ended
		newMin := bytesToRev(keys[len(keys)-1][:revBytesLen])
		newMin.sub++
		revToBytes(newMin, min)
	}
	close(rkvc)
	s.currentRev = <-revc

	// keys in the range [compacted revision - N, compaction revision] might all
	// have been deleted due to compaction. In that case, the correct current
	// revision is the compaction revision, not the largest revision we have seen.
	if s.currentRev < s.compactMainRev {
		s.currentRev = s.compactMainRev
	}
	if scheduledCompact <= s.compactMainRev {
		scheduledCompact = 0
	}

	for key, lid := range keyToLease {
		if s.le == nil {
			panic("no lessor to attach lease")
		}
		err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}})
		if err != nil {
			plog.Errorf("unexpected Attach error: %v", err)
		}
	}

	tx.Unlock()

	if scheduledCompact != 0 {
		s.Compact(scheduledCompact)
		plog.Printf("resume scheduled compaction at %d", scheduledCompact)
	}

	return nil
}

type revKeyValue struct {
	key  []byte
	kv   mvccpb.KeyValue
	kstr string
}
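
// restoreIntoIndex rebuilds the tree index from an unordered stream of
// revKeyValues. It returns the channel to feed entries into and a buffered
// channel that reports the largest main revision seen once the input channel
// is closed.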
func restoreIntoIndex(idx index) (chan<- revKeyValue, <-chan int64) {
	rkvc, revc := make(chan revKeyValue, restoreChunkKeys), make(chan int64, 1)
	go func() {
		currentRev := int64(1)
		defer func() { revc <- currentRev }()
		// restore the tree index from the streamed, unordered key-value pairs
		kiCache := make(map[string]*keyIndex, restoreChunkKeys)
		for rkv := range rkvc {
			ki, ok := kiCache[rkv.kstr]
			// purge kiCache if it holds many keys but the lookup still misses
			if !ok && len(kiCache) >= restoreChunkKeys {
				i := 10
				for k := range kiCache {
					delete(kiCache, k)
					if i--; i == 0 {
						break
					}
				}
			}
			// cache miss; fetch from the tree index if it is there
			if !ok {
				ki = &keyIndex{key: rkv.kv.Key}
				if idxKey := idx.KeyIndex(ki); idxKey != nil {
					kiCache[rkv.kstr], ki = idxKey, idxKey
					ok = true
				}
			}
			rev := bytesToRev(rkv.key)
			currentRev = rev.main
			if ok {
				if isTombstone(rkv.key) {
					ki.tombstone(rev.main, rev.sub)
					continue
				}
				ki.put(rev.main, rev.sub)
			} else if !isTombstone(rkv.key) {
				ki.restore(revision{rkv.kv.CreateRevision, 0}, rev, rkv.kv.Version)
				idx.Insert(ki)
				kiCache[rkv.kstr] = ki
			}
		}
	}()
	return rkvc, revc
}
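
// restoreChunk unmarshals one chunk of backend key-value pairs, tracks the
// lease attached to each live key in keyToLease, and forwards every entry to
// the index-restore goroutine via kvc.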
func restoreChunk(kvc chan<- revKeyValue, keys, vals [][]byte, keyToLease map[string]lease.LeaseID) {
	for i, key := range keys {
		rkv := revKeyValue{key: key}
		if err := rkv.kv.Unmarshal(vals[i]); err != nil {
			plog.Fatalf("cannot unmarshal event: %v", err)
		}
		rkv.kstr = string(rkv.kv.Key)
		if isTombstone(key) {
			delete(keyToLease, rkv.kstr)
		} else if lid := lease.LeaseID(rkv.kv.Lease); lid != lease.NoLease {
			keyToLease[rkv.kstr] = lid
		} else {
			delete(keyToLease, rkv.kstr)
		}
		kvc <- rkv
	}
}
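
// Close stops background compaction scheduling. It does not close the
// underlying backend.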
func (s *store) Close() error {
	close(s.stopc)
	s.fifoSched.Stop()
	return nil
}

func (a *store) Equal(b *store) bool {
	if a.currentRev != b.currentRev {
		return false
	}
	if a.compactMainRev != b.compactMainRev {
		return false
	}
	return a.kvindex.Equal(b.kvindex)
}

func (s *store) saveIndex(tx backend.BatchTx) {
	if s.ig == nil {
		return
	}
	bs := s.bytesBuf8
	binary.BigEndian.PutUint64(bs, s.ig.ConsistentIndex())
	// put the index into the underlying backend
	// tx has been locked in TxnBegin, so there is no need to lock it again
	tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
}

func (s *store) ConsistentIndex() uint64 {
	// TODO: cache index in a uint64 field?
	tx := s.b.BatchTx()
	tx.Lock()
	defer tx.Unlock()
	_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
	if len(vs) == 0 {
		return 0
	}
	return binary.BigEndian.Uint64(vs[0])
}

func (s *store) setupMetricsReporter() {
	b := s.b
	reportDbTotalSizeInBytesMu.Lock()
	reportDbTotalSizeInBytes = func() float64 { return float64(b.Size()) }
	reportDbTotalSizeInBytesMu.Unlock()

	reportDbTotalSizeInUseInBytesMu.Lock()
	reportDbTotalSizeInUseInBytes = func() float64 { return float64(b.SizeInUse()) }
	reportDbTotalSizeInUseInBytesMu.Unlock()

	reportCurrentRevMu.Lock()
	reportCurrentRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.currentRev)
	}
	reportCurrentRevMu.Unlock()

	reportCompactRevMu.Lock()
	reportCompactRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.compactMainRev)
	}
	reportCompactRevMu.Unlock()
}

// appendMarkTombstone appends a tombstone mark to normal revision bytes.
func appendMarkTombstone(b []byte) []byte {
	if len(b) != revBytesLen {
		plog.Panicf("cannot append mark to non normal revision bytes")
	}
	return append(b, markTombstone)
}

// isTombstone checks whether the revision bytes are marked as a tombstone.
func isTombstone(b []byte) bool {
	return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
}
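
// A sketch of the round trip (newRevBytes, revToBytes, and revision are
// defined elsewhere in this package):
//
//	rb := newRevBytes()
//	revToBytes(revision{main: 5, sub: 0}, rb)
//	rb = appendMarkTombstone(rb) // len(rb) == markedRevBytesLen
//	_ = isTombstone(rb)          // true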

// revBytesRange returns the range of revision bytes at
// the given revision.
func revBytesRange(rev revision) (start, end []byte) {
	start = newRevBytes()
	revToBytes(rev, start)

	end = newRevBytes()
	endRev := revision{main: rev.main, sub: rev.sub + 1}
	revToBytes(endRev, end)

	return start, end
}
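
// Since end encodes {main, sub + 1}, the half-open range [start, end) selects
// exactly the one key at the given revision; for example (a sketch):
//
//	start, end := revBytesRange(revision{main: 5, sub: 3})
//	// tx.UnsafeRange(keyBucketName, start, end, 0) yields only revision {5, 3}.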