kvstore.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package mvcc
  15. import (
  16. "context"
  17. "encoding/binary"
  18. "errors"
  19. "fmt"
  20. "hash/crc32"
  21. "math"
  22. "sync"
  23. "sync/atomic"
  24. "time"
  25. "github.com/coreos/etcd/lease"
  26. "github.com/coreos/etcd/mvcc/backend"
  27. "github.com/coreos/etcd/mvcc/mvccpb"
  28. "github.com/coreos/etcd/pkg/schedule"
  29. "github.com/coreos/pkg/capnslog"
  30. "go.uber.org/zap"
  31. )
  32. var (
  33. keyBucketName = []byte("key")
  34. metaBucketName = []byte("meta")
  35. consistentIndexKeyName = []byte("consistent_index")
  36. scheduledCompactKeyName = []byte("scheduledCompactRev")
  37. finishedCompactKeyName = []byte("finishedCompactRev")
  38. ErrCompacted = errors.New("mvcc: required revision has been compacted")
  39. ErrFutureRev = errors.New("mvcc: required revision is a future revision")
  40. ErrCanceled = errors.New("mvcc: watcher is canceled")
  41. ErrClosed = errors.New("mvcc: closed")
  42. plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc")
  43. )
  44. const (
  45. // markedRevBytesLen is the byte length of marked revision.
  46. // The first `revBytesLen` bytes represents a normal revision. The last
  47. // one byte is the mark.
  48. markedRevBytesLen = revBytesLen + 1
  49. markBytePosition = markedRevBytesLen - 1
  50. markTombstone byte = 't'
  51. )
  52. var restoreChunkKeys = 10000 // non-const for testing
  53. // ConsistentIndexGetter is an interface that wraps the Get method.
  54. // Consistent index is the offset of an entry in a consistent replicated log.
  55. type ConsistentIndexGetter interface {
  56. // ConsistentIndex returns the consistent index of current executing entry.
  57. ConsistentIndex() uint64
  58. }
  59. type store struct {
  60. ReadView
  61. WriteView
  62. // consistentIndex caches the "consistent_index" key's value. Accessed
  63. // through atomics so must be 64-bit aligned.
  64. consistentIndex uint64
  65. // mu read locks for txns and write locks for non-txn store changes.
  66. mu sync.RWMutex
  67. ig ConsistentIndexGetter
  68. b backend.Backend
  69. kvindex index
  70. le lease.Lessor
  71. // revMuLock protects currentRev and compactMainRev.
  72. // Locked at end of write txn and released after write txn unlock lock.
  73. // Locked before locking read txn and released after locking.
  74. revMu sync.RWMutex
  75. // currentRev is the revision of the last completed transaction.
  76. currentRev int64
  77. // compactMainRev is the main revision of the last compaction.
  78. compactMainRev int64
  79. // bytesBuf8 is a byte slice of length 8
  80. // to avoid a repetitive allocation in saveIndex.
  81. bytesBuf8 []byte
  82. fifoSched schedule.Scheduler
  83. stopc chan struct{}
  84. lg *zap.Logger
  85. }
  86. // NewStore returns a new store. It is useful to create a store inside
  87. // mvcc pkg. It should only be used for testing externally.
  88. func NewStore(lg *zap.Logger, b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *store {
  89. s := &store{
  90. b: b,
  91. ig: ig,
  92. kvindex: newTreeIndex(lg),
  93. le: le,
  94. currentRev: 1,
  95. compactMainRev: -1,
  96. bytesBuf8: make([]byte, 8),
  97. fifoSched: schedule.NewFIFOScheduler(),
  98. stopc: make(chan struct{}),
  99. lg: lg,
  100. }
  101. s.ReadView = &readView{s}
  102. s.WriteView = &writeView{s}
  103. if s.le != nil {
  104. s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write() })
  105. }
  106. tx := s.b.BatchTx()
  107. tx.Lock()
  108. tx.UnsafeCreateBucket(keyBucketName)
  109. tx.UnsafeCreateBucket(metaBucketName)
  110. tx.Unlock()
  111. s.b.ForceCommit()
  112. if err := s.restore(); err != nil {
  113. // TODO: return the error instead of panic here?
  114. panic("failed to recover store from backend")
  115. }
  116. return s
  117. }
  118. func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
  119. if ctx == nil || ctx.Err() != nil {
  120. s.mu.Lock()
  121. select {
  122. case <-s.stopc:
  123. default:
  124. f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
  125. s.fifoSched.Schedule(f)
  126. }
  127. s.mu.Unlock()
  128. return
  129. }
  130. close(ch)
  131. }
  132. func (s *store) Hash() (hash uint32, revision int64, err error) {
  133. start := time.Now()
  134. s.b.ForceCommit()
  135. h, err := s.b.Hash(DefaultIgnores)
  136. hashSec.Observe(time.Since(start).Seconds())
  137. return h, s.currentRev, err
  138. }
  139. func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev int64, err error) {
  140. start := time.Now()
  141. s.mu.RLock()
  142. s.revMu.RLock()
  143. compactRev, currentRev = s.compactMainRev, s.currentRev
  144. s.revMu.RUnlock()
  145. if rev > 0 && rev <= compactRev {
  146. s.mu.RUnlock()
  147. return 0, 0, compactRev, ErrCompacted
  148. } else if rev > 0 && rev > currentRev {
  149. s.mu.RUnlock()
  150. return 0, currentRev, 0, ErrFutureRev
  151. }
  152. if rev == 0 {
  153. rev = currentRev
  154. }
  155. keep := s.kvindex.Keep(rev)
  156. tx := s.b.ReadTx()
  157. tx.Lock()
  158. defer tx.Unlock()
  159. s.mu.RUnlock()
  160. upper := revision{main: rev + 1}
  161. lower := revision{main: compactRev + 1}
  162. h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
  163. h.Write(keyBucketName)
  164. err = tx.UnsafeForEach(keyBucketName, func(k, v []byte) error {
  165. kr := bytesToRev(k)
  166. if !upper.GreaterThan(kr) {
  167. return nil
  168. }
  169. // skip revisions that are scheduled for deletion
  170. // due to compacting; don't skip if there isn't one.
  171. if lower.GreaterThan(kr) && len(keep) > 0 {
  172. if _, ok := keep[kr]; !ok {
  173. return nil
  174. }
  175. }
  176. h.Write(k)
  177. h.Write(v)
  178. return nil
  179. })
  180. hash = h.Sum32()
  181. hashRevSec.Observe(time.Since(start).Seconds())
  182. return hash, currentRev, compactRev, err
  183. }
  184. func (s *store) Compact(rev int64) (<-chan struct{}, error) {
  185. s.mu.Lock()
  186. s.revMu.Lock()
  187. if rev <= s.compactMainRev {
  188. ch := make(chan struct{})
  189. f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
  190. s.fifoSched.Schedule(f)
  191. s.mu.Unlock()
  192. s.revMu.Unlock()
  193. return ch, ErrCompacted
  194. }
  195. if rev > s.currentRev {
  196. s.mu.Unlock()
  197. s.revMu.Unlock()
  198. return nil, ErrFutureRev
  199. }
  200. start := time.Now()
  201. s.compactMainRev = rev
  202. rbytes := newRevBytes()
  203. revToBytes(revision{main: rev}, rbytes)
  204. tx := s.b.BatchTx()
  205. tx.Lock()
  206. tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
  207. tx.Unlock()
  208. // ensure that desired compaction is persisted
  209. s.b.ForceCommit()
  210. s.mu.Unlock()
  211. s.revMu.Unlock()
  212. keep := s.kvindex.Compact(rev)
  213. ch := make(chan struct{})
  214. var j = func(ctx context.Context) {
  215. if ctx.Err() != nil {
  216. s.compactBarrier(ctx, ch)
  217. return
  218. }
  219. if !s.scheduleCompaction(rev, keep) {
  220. s.compactBarrier(nil, ch)
  221. return
  222. }
  223. close(ch)
  224. }
  225. s.fifoSched.Schedule(j)
  226. indexCompactionPauseMs.Observe(float64(time.Since(start) / time.Millisecond))
  227. return ch, nil
  228. }
  229. // DefaultIgnores is a map of keys to ignore in hash checking.
  230. var DefaultIgnores map[backend.IgnoreKey]struct{}
  231. func init() {
  232. DefaultIgnores = map[backend.IgnoreKey]struct{}{
  233. // consistent index might be changed due to v2 internal sync, which
  234. // is not controllable by the user.
  235. {Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {},
  236. }
  237. }
  238. func (s *store) Commit() {
  239. s.mu.Lock()
  240. defer s.mu.Unlock()
  241. tx := s.b.BatchTx()
  242. tx.Lock()
  243. s.saveIndex(tx)
  244. tx.Unlock()
  245. s.b.ForceCommit()
  246. }
  247. func (s *store) Restore(b backend.Backend) error {
  248. s.mu.Lock()
  249. defer s.mu.Unlock()
  250. close(s.stopc)
  251. s.fifoSched.Stop()
  252. atomic.StoreUint64(&s.consistentIndex, 0)
  253. s.b = b
  254. s.kvindex = newTreeIndex(s.lg)
  255. s.currentRev = 1
  256. s.compactMainRev = -1
  257. s.fifoSched = schedule.NewFIFOScheduler()
  258. s.stopc = make(chan struct{})
  259. return s.restore()
  260. }
  261. func (s *store) restore() error {
  262. b := s.b
  263. reportDbTotalSizeInBytesMu.Lock()
  264. reportDbTotalSizeInBytes = func() float64 { return float64(b.Size()) }
  265. reportDbTotalSizeInBytesMu.Unlock()
  266. reportDbTotalSizeInBytesDebuggingMu.Lock()
  267. reportDbTotalSizeInBytesDebugging = func() float64 { return float64(b.Size()) }
  268. reportDbTotalSizeInBytesDebuggingMu.Unlock()
  269. reportDbTotalSizeInUseInBytesMu.Lock()
  270. reportDbTotalSizeInUseInBytes = func() float64 { return float64(b.SizeInUse()) }
  271. reportDbTotalSizeInUseInBytesMu.Unlock()
  272. min, max := newRevBytes(), newRevBytes()
  273. revToBytes(revision{main: 1}, min)
  274. revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)
  275. keyToLease := make(map[string]lease.LeaseID)
  276. // restore index
  277. tx := s.b.BatchTx()
  278. tx.Lock()
  279. _, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
  280. if len(finishedCompactBytes) != 0 {
  281. s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
  282. if s.lg != nil {
  283. s.lg.Info(
  284. "restored last compact revision",
  285. zap.String("meta-bucket-name", string(metaBucketName)),
  286. zap.String("meta-bucket-name-key", string(finishedCompactKeyName)),
  287. zap.Int64("restored-compact-revision", s.compactMainRev),
  288. )
  289. } else {
  290. plog.Printf("restore compact to %d", s.compactMainRev)
  291. }
  292. }
  293. _, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
  294. scheduledCompact := int64(0)
  295. if len(scheduledCompactBytes) != 0 {
  296. scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
  297. }
  298. // index keys concurrently as they're loaded in from tx
  299. keysGauge.Set(0)
  300. rkvc, revc := restoreIntoIndex(s.lg, s.kvindex)
  301. for {
  302. keys, vals := tx.UnsafeRange(keyBucketName, min, max, int64(restoreChunkKeys))
  303. if len(keys) == 0 {
  304. break
  305. }
  306. // rkvc blocks if the total pending keys exceeds the restore
  307. // chunk size to keep keys from consuming too much memory.
  308. restoreChunk(s.lg, rkvc, keys, vals, keyToLease)
  309. if len(keys) < restoreChunkKeys {
  310. // partial set implies final set
  311. break
  312. }
  313. // next set begins after where this one ended
  314. newMin := bytesToRev(keys[len(keys)-1][:revBytesLen])
  315. newMin.sub++
  316. revToBytes(newMin, min)
  317. }
  318. close(rkvc)
  319. s.currentRev = <-revc
  320. // keys in the range [compacted revision -N, compaction] might all be deleted due to compaction.
  321. // the correct revision should be set to compaction revision in the case, not the largest revision
  322. // we have seen.
  323. if s.currentRev < s.compactMainRev {
  324. s.currentRev = s.compactMainRev
  325. }
  326. if scheduledCompact <= s.compactMainRev {
  327. scheduledCompact = 0
  328. }
  329. for key, lid := range keyToLease {
  330. if s.le == nil {
  331. panic("no lessor to attach lease")
  332. }
  333. err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}})
  334. if err != nil {
  335. if s.lg != nil {
  336. s.lg.Warn(
  337. "failed to attach a lease",
  338. zap.String("lease-id", fmt.Sprintf("%016x", lid)),
  339. zap.Error(err),
  340. )
  341. } else {
  342. plog.Errorf("unexpected Attach error: %v", err)
  343. }
  344. }
  345. }
  346. tx.Unlock()
  347. if scheduledCompact != 0 {
  348. s.Compact(scheduledCompact)
  349. if s.lg != nil {
  350. s.lg.Info(
  351. "resume scheduled compaction",
  352. zap.String("meta-bucket-name", string(metaBucketName)),
  353. zap.String("meta-bucket-name-key", string(scheduledCompactKeyName)),
  354. zap.Int64("scheduled-compact-revision", scheduledCompact),
  355. )
  356. } else {
  357. plog.Printf("resume scheduled compaction at %d", scheduledCompact)
  358. }
  359. }
  360. return nil
  361. }
  362. type revKeyValue struct {
  363. key []byte
  364. kv mvccpb.KeyValue
  365. kstr string
  366. }
  367. func restoreIntoIndex(lg *zap.Logger, idx index) (chan<- revKeyValue, <-chan int64) {
  368. rkvc, revc := make(chan revKeyValue, restoreChunkKeys), make(chan int64, 1)
  369. go func() {
  370. currentRev := int64(1)
  371. defer func() { revc <- currentRev }()
  372. // restore the tree index from streaming the unordered index.
  373. kiCache := make(map[string]*keyIndex, restoreChunkKeys)
  374. for rkv := range rkvc {
  375. ki, ok := kiCache[rkv.kstr]
  376. // purge kiCache if many keys but still missing in the cache
  377. if !ok && len(kiCache) >= restoreChunkKeys {
  378. i := 10
  379. for k := range kiCache {
  380. delete(kiCache, k)
  381. if i--; i == 0 {
  382. break
  383. }
  384. }
  385. }
  386. // cache miss, fetch from tree index if there
  387. if !ok {
  388. ki = &keyIndex{key: rkv.kv.Key}
  389. if idxKey := idx.KeyIndex(ki); idxKey != nil {
  390. kiCache[rkv.kstr], ki = idxKey, idxKey
  391. ok = true
  392. }
  393. }
  394. rev := bytesToRev(rkv.key)
  395. currentRev = rev.main
  396. if ok {
  397. if isTombstone(rkv.key) {
  398. ki.tombstone(lg, rev.main, rev.sub)
  399. continue
  400. }
  401. ki.put(lg, rev.main, rev.sub)
  402. } else if !isTombstone(rkv.key) {
  403. ki.restore(lg, revision{rkv.kv.CreateRevision, 0}, rev, rkv.kv.Version)
  404. idx.Insert(ki)
  405. kiCache[rkv.kstr] = ki
  406. }
  407. }
  408. }()
  409. return rkvc, revc
  410. }
  411. func restoreChunk(lg *zap.Logger, kvc chan<- revKeyValue, keys, vals [][]byte, keyToLease map[string]lease.LeaseID) {
  412. for i, key := range keys {
  413. rkv := revKeyValue{key: key}
  414. if err := rkv.kv.Unmarshal(vals[i]); err != nil {
  415. if lg != nil {
  416. lg.Fatal("failed to unmarshal mvccpb.KeyValue", zap.Error(err))
  417. } else {
  418. plog.Fatalf("cannot unmarshal event: %v", err)
  419. }
  420. }
  421. rkv.kstr = string(rkv.kv.Key)
  422. if isTombstone(key) {
  423. delete(keyToLease, rkv.kstr)
  424. } else if lid := lease.LeaseID(rkv.kv.Lease); lid != lease.NoLease {
  425. keyToLease[rkv.kstr] = lid
  426. } else {
  427. delete(keyToLease, rkv.kstr)
  428. }
  429. kvc <- rkv
  430. }
  431. }
  432. func (s *store) Close() error {
  433. close(s.stopc)
  434. s.fifoSched.Stop()
  435. return nil
  436. }
  437. func (s *store) saveIndex(tx backend.BatchTx) {
  438. if s.ig == nil {
  439. return
  440. }
  441. bs := s.bytesBuf8
  442. ci := s.ig.ConsistentIndex()
  443. binary.BigEndian.PutUint64(bs, ci)
  444. // put the index into the underlying backend
  445. // tx has been locked in TxnBegin, so there is no need to lock it again
  446. tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
  447. atomic.StoreUint64(&s.consistentIndex, ci)
  448. }
  449. func (s *store) ConsistentIndex() uint64 {
  450. if ci := atomic.LoadUint64(&s.consistentIndex); ci > 0 {
  451. return ci
  452. }
  453. tx := s.b.BatchTx()
  454. tx.Lock()
  455. defer tx.Unlock()
  456. _, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
  457. if len(vs) == 0 {
  458. return 0
  459. }
  460. v := binary.BigEndian.Uint64(vs[0])
  461. atomic.StoreUint64(&s.consistentIndex, v)
  462. return v
  463. }
  464. // appendMarkTombstone appends tombstone mark to normal revision bytes.
  465. func appendMarkTombstone(lg *zap.Logger, b []byte) []byte {
  466. if len(b) != revBytesLen {
  467. if lg != nil {
  468. lg.Panic(
  469. "cannot append tombstone mark to non-normal revision bytes",
  470. zap.Int("expected-revision-bytes-size", revBytesLen),
  471. zap.Int("given-revision-bytes-size", len(b)),
  472. )
  473. } else {
  474. plog.Panicf("cannot append mark to non normal revision bytes")
  475. }
  476. }
  477. return append(b, markTombstone)
  478. }
  479. // isTombstone checks whether the revision bytes is a tombstone.
  480. func isTombstone(b []byte) bool {
  481. return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
  482. }