kvstore.go

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mvcc

import (
	"context"
	"encoding/binary"
	"errors"
	"hash/crc32"
	"math"
	"sync"
	"sync/atomic"
	"time"

	"github.com/coreos/etcd/lease"
	"github.com/coreos/etcd/mvcc/backend"
	"github.com/coreos/etcd/mvcc/mvccpb"
	"github.com/coreos/etcd/pkg/schedule"

	"github.com/coreos/pkg/capnslog"
)

var (
	keyBucketName  = []byte("key")
	metaBucketName = []byte("meta")

	consistentIndexKeyName  = []byte("consistent_index")
	scheduledCompactKeyName = []byte("scheduledCompactRev")
	finishedCompactKeyName  = []byte("finishedCompactRev")

	ErrCompacted = errors.New("mvcc: required revision has been compacted")
	ErrFutureRev = errors.New("mvcc: required revision is a future revision")
	ErrCanceled  = errors.New("mvcc: watcher is canceled")
	ErrClosed    = errors.New("mvcc: closed")

	plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc")
)

const (
	// markedRevBytesLen is the byte length of a marked revision.
	// The first `revBytesLen` bytes represent a normal revision. The last
	// byte is the mark.
	markedRevBytesLen      = revBytesLen + 1
	markBytePosition       = markedRevBytesLen - 1
	markTombstone     byte = 't'
)

var restoreChunkKeys = 10000 // non-const for testing

// ConsistentIndexGetter is an interface that wraps the ConsistentIndex method.
// Consistent index is the offset of an entry in a consistent replicated log.
type ConsistentIndexGetter interface {
	// ConsistentIndex returns the consistent index of current executing entry.
	ConsistentIndex() uint64
}

type store struct {
	ReadView
	WriteView

	// consistentIndex caches the "consistent_index" key's value. Accessed
	// through atomics so must be 64-bit aligned.
	consistentIndex uint64

	// mu is read-locked for txns and write-locked for non-txn store changes.
	mu sync.RWMutex

	ig ConsistentIndexGetter

	b       backend.Backend
	kvindex index

	le lease.Lessor

	// revMu protects currentRev and compactMainRev.
	// Locked at the end of a write txn and released after the write txn unlocks;
	// locked before locking a read txn and released after locking.
	revMu sync.RWMutex
	// currentRev is the revision of the last completed transaction.
	currentRev int64
	// compactMainRev is the main revision of the last compaction.
	compactMainRev int64

	// bytesBuf8 is a byte slice of length 8
	// to avoid a repetitive allocation in saveIndex.
	bytesBuf8 []byte

	fifoSched schedule.Scheduler

	stopc chan struct{}
}

// NewStore returns a new store. It is useful to create a store inside
// mvcc pkg. It should only be used for testing externally.
func NewStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *store {
	s := &store{
		b:       b,
		ig:      ig,
		kvindex: newTreeIndex(),
		le:      le,

		currentRev:     1,
		compactMainRev: -1,

		bytesBuf8: make([]byte, 8),
		fifoSched: schedule.NewFIFOScheduler(),

		stopc: make(chan struct{}),
	}
	s.ReadView = &readView{s}
	s.WriteView = &writeView{s}
	if s.le != nil {
		s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write() })
	}

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafeCreateBucket(keyBucketName)
	tx.UnsafeCreateBucket(metaBucketName)
	tx.Unlock()
	s.b.ForceCommit()

	if err := s.restore(); err != nil {
		// TODO: return the error instead of panic here?
		panic("failed to recover store from backend")
	}

	return s
}

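// compactBarrier closes ch once it runs with a live scheduler context. If the
// context is nil or already canceled, it re-queues itself on the FIFO scheduler
// (unless the store is closing) so ch is eventually closed by a later run.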
func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
	if ctx == nil || ctx.Err() != nil {
		s.mu.Lock()
		select {
		case <-s.stopc:
		default:
			f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
			s.fifoSched.Schedule(f)
		}
		s.mu.Unlock()
		return
	}
	close(ch)
}

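// Hash force-commits pending writes and returns the hash of the backend
// contents (excluding keys listed in DefaultIgnores) along with the store's
// current revision.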
func (s *store) Hash() (hash uint32, revision int64, err error) {
	start := time.Now()

	s.b.ForceCommit()
	h, err := s.b.Hash(DefaultIgnores)

	hashDurations.Observe(time.Since(start).Seconds())
	return h, s.currentRev, err
}

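// HashByRev hashes all key-value revisions up to the given revision, skipping
// revisions that a pending compaction will remove. A rev of 0 means the
// current revision. It returns ErrCompacted if rev is at or below the
// compacted revision and ErrFutureRev if rev is ahead of the current revision.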
func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev int64, err error) {
	start := time.Now()

	s.mu.RLock()
	s.revMu.RLock()
	compactRev, currentRev = s.compactMainRev, s.currentRev
	s.revMu.RUnlock()

	if rev > 0 && rev <= compactRev {
		s.mu.RUnlock()
		return 0, 0, compactRev, ErrCompacted
	} else if rev > 0 && rev > currentRev {
		s.mu.RUnlock()
		return 0, currentRev, 0, ErrFutureRev
	}

	if rev == 0 {
		rev = currentRev
	}
	keep := s.kvindex.Keep(rev)

	tx := s.b.ReadTx()
	tx.Lock()
	defer tx.Unlock()
	s.mu.RUnlock()

	upper := revision{main: rev + 1}
	lower := revision{main: compactRev + 1}
	h := crc32.New(crc32.MakeTable(crc32.Castagnoli))

	h.Write(keyBucketName)
	err = tx.UnsafeForEach(keyBucketName, func(k, v []byte) error {
		kr := bytesToRev(k)
		if !upper.GreaterThan(kr) {
			return nil
		}
		// skip revisions that are scheduled for deletion
		// due to compaction; don't skip when there is no keep set.
		if lower.GreaterThan(kr) && len(keep) > 0 {
			if _, ok := keep[kr]; !ok {
				return nil
			}
		}
		h.Write(k)
		h.Write(v)
		return nil
	})
	hash = h.Sum32()

	hashRevDurations.Observe(time.Since(start).Seconds())
	return hash, currentRev, compactRev, err
}

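// Compact schedules compaction of the key index and backend up to revision rev.
// The returned channel is closed when the scheduled compaction completes. It
// returns ErrCompacted if rev has already been compacted and ErrFutureRev if
// rev is larger than the current revision.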
func (s *store) Compact(rev int64) (<-chan struct{}, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.revMu.Lock()
	defer s.revMu.Unlock()

	if rev <= s.compactMainRev {
		ch := make(chan struct{})
		f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
		s.fifoSched.Schedule(f)
		return ch, ErrCompacted
	}
	if rev > s.currentRev {
		return nil, ErrFutureRev
	}

	start := time.Now()

	s.compactMainRev = rev

	rbytes := newRevBytes()
	revToBytes(revision{main: rev}, rbytes)

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
	tx.Unlock()
	// ensure that desired compaction is persisted
	s.b.ForceCommit()

	keep := s.kvindex.Compact(rev)

	ch := make(chan struct{})
	var j = func(ctx context.Context) {
		if ctx.Err() != nil {
			s.compactBarrier(ctx, ch)
			return
		}
		if !s.scheduleCompaction(rev, keep) {
			s.compactBarrier(nil, ch)
			return
		}
		close(ch)
	}

	s.fifoSched.Schedule(j)

	indexCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond))

	return ch, nil
}

// DefaultIgnores is a map of keys to ignore in hash checking.
var DefaultIgnores map[backend.IgnoreKey]struct{}

func init() {
	DefaultIgnores = map[backend.IgnoreKey]struct{}{
		// consistent index might be changed due to v2 internal sync, which
		// is not controllable by the user.
		{Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {},
	}
}

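// Commit persists the consistent index and force-commits any pending backend
// writes to disk.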
func (s *store) Commit() {
	s.mu.Lock()
	defer s.mu.Unlock()

	tx := s.b.BatchTx()
	tx.Lock()
	s.saveIndex(tx)
	tx.Unlock()
	s.b.ForceCommit()
}

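// Restore stops the running store, swaps in the given backend, and rebuilds
// the in-memory index and revision state from it.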
func (s *store) Restore(b backend.Backend) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	close(s.stopc)
	s.fifoSched.Stop()

	atomic.StoreUint64(&s.consistentIndex, 0)
	s.b = b
	s.kvindex = newTreeIndex()
	s.currentRev = 1
	s.compactMainRev = -1
	s.fifoSched = schedule.NewFIFOScheduler()
	s.stopc = make(chan struct{})

	return s.restore()
}

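// restore rebuilds the tree index, the current and compacted revisions, and
// lease attachments from the backend, then resumes any compaction that was
// scheduled but not yet finished.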
func (s *store) restore() error {
	s.setupMetricsReporter()

	min, max := newRevBytes(), newRevBytes()
	revToBytes(revision{main: 1}, min)
	revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)

	keyToLease := make(map[string]lease.LeaseID)

	// restore index
	tx := s.b.BatchTx()
	tx.Lock()

	_, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
	if len(finishedCompactBytes) != 0 {
		s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
		plog.Printf("restore compact to %d", s.compactMainRev)
	}
	_, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
	scheduledCompact := int64(0)
	if len(scheduledCompactBytes) != 0 {
		scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
	}

	// index keys concurrently as they're loaded in from tx
	keysGauge.Set(0)
	rkvc, revc := restoreIntoIndex(s.kvindex)
	for {
		keys, vals := tx.UnsafeRange(keyBucketName, min, max, int64(restoreChunkKeys))
		if len(keys) == 0 {
			break
		}
		// rkvc blocks if the total pending keys exceeds the restore
		// chunk size to keep keys from consuming too much memory.
		restoreChunk(rkvc, keys, vals, keyToLease)
		if len(keys) < restoreChunkKeys {
			// partial set implies final set
			break
		}
		// next set begins after where this one ended
		newMin := bytesToRev(keys[len(keys)-1][:revBytesLen])
		newMin.sub++
		revToBytes(newMin, min)
	}
	close(rkvc)
	s.currentRev = <-revc

	// Keys in the range [compacted revision - N, compacted revision] might all
	// have been deleted due to compaction. In that case the current revision
	// should be set to the compaction revision, not the largest revision seen.
	if s.currentRev < s.compactMainRev {
		s.currentRev = s.compactMainRev
	}

	if scheduledCompact <= s.compactMainRev {
		scheduledCompact = 0
	}

	for key, lid := range keyToLease {
		if s.le == nil {
			panic("no lessor to attach lease")
		}
		err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}})
		if err != nil {
			plog.Errorf("unexpected Attach error: %v", err)
		}
	}

	tx.Unlock()

	if scheduledCompact != 0 {
		s.Compact(scheduledCompact)
		plog.Printf("resume scheduled compaction at %d", scheduledCompact)
	}

	return nil
}

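// revKeyValue carries a decoded key-value entry together with its raw revision
// key and its key as a string, for consumption by the index-restore goroutine.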
type revKeyValue struct {
	key  []byte
	kv   mvccpb.KeyValue
	kstr string
}

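// restoreIntoIndex starts a goroutine that consumes revKeyValues from the
// returned channel and replays them into idx. Closing the channel makes the
// goroutine report the largest main revision it has seen on the second channel.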
func restoreIntoIndex(idx index) (chan<- revKeyValue, <-chan int64) {
	rkvc, revc := make(chan revKeyValue, restoreChunkKeys), make(chan int64, 1)
	go func() {
		currentRev := int64(1)
		defer func() { revc <- currentRev }()
		// restore the tree index from streaming the unordered index.
		kiCache := make(map[string]*keyIndex, restoreChunkKeys)
		for rkv := range rkvc {
			ki, ok := kiCache[rkv.kstr]
			// purge kiCache if many keys but still missing in the cache
			if !ok && len(kiCache) >= restoreChunkKeys {
				i := 10
				for k := range kiCache {
					delete(kiCache, k)
					if i--; i == 0 {
						break
					}
				}
			}
			// cache miss, fetch from tree index if there
			if !ok {
				ki = &keyIndex{key: rkv.kv.Key}
				if idxKey := idx.KeyIndex(ki); idxKey != nil {
					kiCache[rkv.kstr], ki = idxKey, idxKey
					ok = true
				}
			}
			rev := bytesToRev(rkv.key)
			currentRev = rev.main
			if ok {
				if isTombstone(rkv.key) {
					ki.tombstone(rev.main, rev.sub)
					continue
				}
				ki.put(rev.main, rev.sub)
			} else if !isTombstone(rkv.key) {
				ki.restore(revision{rkv.kv.CreateRevision, 0}, rev, rkv.kv.Version)
				idx.Insert(ki)
				kiCache[rkv.kstr] = ki
			}
		}
	}()
	return rkvc, revc
}

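// restoreChunk unmarshals a chunk of raw keys and values, records lease
// attachments in keyToLease, and feeds the decoded entries to kvc.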
func restoreChunk(kvc chan<- revKeyValue, keys, vals [][]byte, keyToLease map[string]lease.LeaseID) {
	for i, key := range keys {
		rkv := revKeyValue{key: key}
		if err := rkv.kv.Unmarshal(vals[i]); err != nil {
			plog.Fatalf("cannot unmarshal event: %v", err)
		}
		rkv.kstr = string(rkv.kv.Key)
		if isTombstone(key) {
			delete(keyToLease, rkv.kstr)
		} else if lid := lease.LeaseID(rkv.kv.Lease); lid != lease.NoLease {
			keyToLease[rkv.kstr] = lid
		} else {
			delete(keyToLease, rkv.kstr)
		}
		kvc <- rkv
	}
}

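// Close stops the store's background FIFO scheduler. It does not close the
// backend.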
func (s *store) Close() error {
	close(s.stopc)
	s.fifoSched.Stop()
	return nil
}

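// saveIndex writes the current consistent index into the meta bucket and
// caches it atomically. The caller must already hold the batch tx lock.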
func (s *store) saveIndex(tx backend.BatchTx) {
	if s.ig == nil {
		return
	}
	bs := s.bytesBuf8
	ci := s.ig.ConsistentIndex()
	binary.BigEndian.PutUint64(bs, ci)
	// put the index into the underlying backend
	// tx has been locked in TxnBegin, so there is no need to lock it again
	tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
	atomic.StoreUint64(&s.consistentIndex, ci)
}

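// ConsistentIndex returns the cached consistent index, falling back to the
// value stored in the backend's meta bucket when the cache is zero.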
func (s *store) ConsistentIndex() uint64 {
	if ci := atomic.LoadUint64(&s.consistentIndex); ci > 0 {
		return ci
	}
	tx := s.b.BatchTx()
	tx.Lock()
	defer tx.Unlock()
	_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
	if len(vs) == 0 {
		return 0
	}
	v := binary.BigEndian.Uint64(vs[0])
	atomic.StoreUint64(&s.consistentIndex, v)
	return v
}

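// setupMetricsReporter wires the store's backend size and revision values into
// the package-level metrics reporter functions.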
func (s *store) setupMetricsReporter() {
	b := s.b
	reportDbTotalSizeInBytesMu.Lock()
	reportDbTotalSizeInBytes = func() float64 { return float64(b.Size()) }
	reportDbTotalSizeInBytesMu.Unlock()
	reportDbTotalSizeInUseInBytesMu.Lock()
	reportDbTotalSizeInUseInBytes = func() float64 { return float64(b.SizeInUse()) }
	reportDbTotalSizeInUseInBytesMu.Unlock()
	reportCurrentRevMu.Lock()
	reportCurrentRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.currentRev)
	}
	reportCurrentRevMu.Unlock()
	reportCompactRevMu.Lock()
	reportCompactRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.compactMainRev)
	}
	reportCompactRevMu.Unlock()
}

// appendMarkTombstone appends the tombstone mark to normal revision bytes.
func appendMarkTombstone(b []byte) []byte {
	if len(b) != revBytesLen {
		plog.Panicf("cannot append mark to non normal revision bytes")
	}
	return append(b, markTombstone)
}

// isTombstone checks whether the given revision bytes represent a tombstone.
func isTombstone(b []byte) bool {
	return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
}