kvstore.go

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mvcc

import (
	"encoding/binary"
	"errors"
	"math"
	"sync"
	"sync/atomic"
	"time"

	"github.com/coreos/etcd/lease"
	"github.com/coreos/etcd/mvcc/backend"
	"github.com/coreos/etcd/mvcc/mvccpb"
	"github.com/coreos/etcd/pkg/schedule"

	"github.com/coreos/pkg/capnslog"
	"golang.org/x/net/context"
)

var (
	keyBucketName  = []byte("key")
	metaBucketName = []byte("meta")

	consistentIndexKeyName  = []byte("consistent_index")
	scheduledCompactKeyName = []byte("scheduledCompactRev")
	finishedCompactKeyName  = []byte("finishedCompactRev")

	ErrCompacted = errors.New("mvcc: required revision has been compacted")
	ErrFutureRev = errors.New("mvcc: required revision is a future revision")
	ErrCanceled  = errors.New("mvcc: watcher is canceled")
	ErrClosed    = errors.New("mvcc: closed")

	plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc")
)

const (
	// markedRevBytesLen is the byte length of marked revision.
	// The first `revBytesLen` bytes represents a normal revision. The last
	// one byte is the mark.
	markedRevBytesLen      = revBytesLen + 1
	markBytePosition       = markedRevBytesLen - 1
	markTombstone     byte = 't'

	restoreChunkKeys = 10000
)

// ConsistentIndexGetter is an interface that wraps the Get method.
// Consistent index is the offset of an entry in a consistent replicated log.
type ConsistentIndexGetter interface {
	// ConsistentIndex returns the consistent index of current executing entry.
	ConsistentIndex() uint64
}

type store struct {
	ReadView
	WriteView

	// consistentIndex caches the "consistent_index" key's value. Accessed
	// through atomics so must be 64-bit aligned.
	consistentIndex uint64

	// mu read locks for txns and write locks for non-txn store changes.
	mu sync.RWMutex

	ig ConsistentIndexGetter

	b       backend.Backend
	kvindex index

	le lease.Lessor

	// revMuLock protects currentRev and compactMainRev.
	// Locked at end of write txn and released after write txn unlock lock.
	// Locked before locking read txn and released after locking.
	revMu sync.RWMutex
	// currentRev is the revision of the last completed transaction.
	currentRev int64
	// compactMainRev is the main revision of the last compaction.
	compactMainRev int64

	// bytesBuf8 is a byte slice of length 8
	// to avoid a repetitive allocation in saveIndex.
	bytesBuf8 []byte

	fifoSched schedule.Scheduler

	stopc chan struct{}
}

// NewStore returns a new store. It is intended for creating a store within
// the mvcc package; outside the package it should only be used for testing.
func NewStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *store {
	s := &store{
		b:              b,
		ig:             ig,
		kvindex:        newTreeIndex(),
		le:             le,
		currentRev:     1,
		compactMainRev: -1,
		bytesBuf8:      make([]byte, 8),
		fifoSched:      schedule.NewFIFOScheduler(),
		stopc:          make(chan struct{}),
	}
	s.ReadView = &readView{s}
	s.WriteView = &writeView{s}
	if s.le != nil {
		s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write() })
	}

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafeCreateBucket(keyBucketName)
	tx.UnsafeCreateBucket(metaBucketName)
	tx.Unlock()
	s.b.ForceCommit()

	if err := s.restore(); err != nil {
		// TODO: return the error instead of panic here?
		panic("failed to recover store from backend")
	}

	return s
}
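
// compactBarrier closes ch once the pending compaction it guards has been
// handled; if ctx is nil or already canceled, it reschedules itself on the
// FIFO scheduler unless the store is stopping.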
func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
	if ctx == nil || ctx.Err() != nil {
		s.mu.Lock()
		select {
		case <-s.stopc:
		default:
			f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
			s.fifoSched.Schedule(f)
		}
		s.mu.Unlock()
		return
	}
	close(ch)
}
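
// Hash forces a backend commit and returns the backend hash (skipping the
// keys in DefaultIgnores) together with the store's current revision.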
func (s *store) Hash() (hash uint32, revision int64, err error) {
	s.b.ForceCommit()
	h, err := s.b.Hash(DefaultIgnores)
	return h, s.currentRev, err
}
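
// Compact persists the compaction revision, prunes the in-memory index up to
// rev, and schedules the backend compaction on the FIFO scheduler. The
// returned channel is closed once the scheduled compaction finishes. It
// returns ErrCompacted if rev was already compacted and ErrFutureRev if rev
// is larger than the current revision.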
func (s *store) Compact(rev int64) (<-chan struct{}, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.revMu.Lock()
	defer s.revMu.Unlock()

	if rev <= s.compactMainRev {
		ch := make(chan struct{})
		f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
		s.fifoSched.Schedule(f)
		return ch, ErrCompacted
	}
	if rev > s.currentRev {
		return nil, ErrFutureRev
	}

	start := time.Now()

	s.compactMainRev = rev

	rbytes := newRevBytes()
	revToBytes(revision{main: rev}, rbytes)

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
	tx.Unlock()
	// ensure that desired compaction is persisted
	s.b.ForceCommit()

	keep := s.kvindex.Compact(rev)

	ch := make(chan struct{})
	var j = func(ctx context.Context) {
		if ctx.Err() != nil {
			s.compactBarrier(ctx, ch)
			return
		}
		if !s.scheduleCompaction(rev, keep) {
			s.compactBarrier(nil, ch)
			return
		}
		close(ch)
	}

	s.fifoSched.Schedule(j)

	indexCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond))
	return ch, nil
}

// DefaultIgnores is a map of keys to ignore in hash checking.
var DefaultIgnores map[backend.IgnoreKey]struct{}

func init() {
	DefaultIgnores = map[backend.IgnoreKey]struct{}{
		// consistent index might be changed due to v2 internal sync, which
		// is not controllable by the user.
		{Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {},
	}
}
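
// Commit persists the consistent index and forces the backend to commit its
// pending writes.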
func (s *store) Commit() {
	s.mu.Lock()
	defer s.mu.Unlock()

	tx := s.b.BatchTx()
	tx.Lock()
	s.saveIndex(tx)
	tx.Unlock()
	s.b.ForceCommit()
}
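
// Restore stops the current scheduler, swaps in the backend b, resets the
// in-memory state, and rebuilds the key index from the new backend.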
func (s *store) Restore(b backend.Backend) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	close(s.stopc)
	s.fifoSched.Stop()

	atomic.StoreUint64(&s.consistentIndex, 0)
	s.b = b
	s.kvindex = newTreeIndex()
	s.currentRev = 1
	s.compactMainRev = -1
	s.fifoSched = schedule.NewFIFOScheduler()
	s.stopc = make(chan struct{})

	return s.restore()
}
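
// restore rebuilds the key index, revision counters, and lease attachments
// by scanning the key bucket in chunks, and resumes any compaction that was
// scheduled but not yet finished.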
func (s *store) restore() error {
	min, max := newRevBytes(), newRevBytes()
	revToBytes(revision{main: 1}, min)
	revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)

	keyToLease := make(map[string]lease.LeaseID)

	// restore index
	tx := s.b.BatchTx()
	tx.Lock()

	dbTotalSize.Set(float64(s.b.Size()))

	_, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
	if len(finishedCompactBytes) != 0 {
		s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
		plog.Printf("restore compact to %d", s.compactMainRev)
	}
	_, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
	scheduledCompact := int64(0)
	if len(scheduledCompactBytes) != 0 {
		scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
	}

	// index keys concurrently as they're loaded in from tx
	unorderedc, donec := make(chan map[string]*keyIndex), make(chan struct{})
	go func() {
		defer close(donec)
		for unordered := range unorderedc {
			// restore the tree index from the unordered index.
			for _, v := range unordered {
				s.kvindex.Insert(v)
			}
		}
	}()
	for {
		keys, vals := tx.UnsafeRange(keyBucketName, min, max, restoreChunkKeys)
		if len(keys) == 0 {
			break
		}
		// unbuffered so keys don't pile up in memory
		unorderedc <- s.restoreChunk(keys, vals, keyToLease)
		if len(keys) < restoreChunkKeys {
			// partial set implies final set
			break
		}
		// next set begins after where this one ended
		newMin := bytesToRev(keys[len(keys)-1][:revBytesLen])
		newMin.sub++
		revToBytes(newMin, min)
	}
	close(unorderedc)
	<-donec

	// keys in the range [compacted revision -N, compaction] might all be deleted due to compaction.
	// the correct revision should be set to compaction revision in the case, not the largest revision
	// we have seen.
	if s.currentRev < s.compactMainRev {
		s.currentRev = s.compactMainRev
	}

	if scheduledCompact <= s.compactMainRev {
		scheduledCompact = 0
	}

	for key, lid := range keyToLease {
		if s.le == nil {
			panic("no lessor to attach lease")
		}
		err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}})
		if err != nil {
			plog.Errorf("unexpected Attach error: %v", err)
		}
	}

	tx.Unlock()

	if scheduledCompact != 0 {
		s.Compact(scheduledCompact)
		plog.Printf("resume scheduled compaction at %d", scheduledCompact)
	}

	return nil
}
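
// restoreChunk builds keyIndex entries for one chunk of revision-keyed
// values, advancing currentRev and recording lease attachments as it goes;
// tombstoned keys drop any lease binding.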
func (s *store) restoreChunk(keys, vals [][]byte, keyToLease map[string]lease.LeaseID) map[string]*keyIndex {
	// assume half of keys are overwrites
	unordered := make(map[string]*keyIndex, len(keys)/2)
	for i, key := range keys {
		var kv mvccpb.KeyValue
		if err := kv.Unmarshal(vals[i]); err != nil {
			plog.Fatalf("cannot unmarshal event: %v", err)
		}
		rev := bytesToRev(key[:revBytesLen])
		s.currentRev = rev.main
		kstr := string(kv.Key)
		if isTombstone(key) {
			if ki, ok := unordered[kstr]; ok {
				ki.tombstone(rev.main, rev.sub)
			}
			delete(keyToLease, kstr)
			continue
		}
		if ki, ok := unordered[kstr]; ok {
			ki.put(rev.main, rev.sub)
		} else {
			ki = &keyIndex{key: kv.Key}
			ki.restore(revision{kv.CreateRevision, 0}, rev, kv.Version)
			unordered[kstr] = ki
		}
		if lid := lease.LeaseID(kv.Lease); lid != lease.NoLease {
			keyToLease[kstr] = lid
		} else {
			delete(keyToLease, kstr)
		}
	}
	return unordered
}
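
// Close signals background tasks to stop and shuts down the FIFO scheduler.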
func (s *store) Close() error {
	close(s.stopc)
	s.fifoSched.Stop()
	return nil
}
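
// saveIndex writes the consistent index reported by the ConsistentIndexGetter
// into the meta bucket and updates the cached value.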
func (s *store) saveIndex(tx backend.BatchTx) {
	if s.ig == nil {
		return
	}
	bs := s.bytesBuf8
	ci := s.ig.ConsistentIndex()
	binary.BigEndian.PutUint64(bs, ci)
	// put the index into the underlying backend
	// tx has been locked in TxnBegin, so there is no need to lock it again
	tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
	atomic.StoreUint64(&s.consistentIndex, ci)
}
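
// ConsistentIndex returns the cached consistent index; when the cache is
// empty it reads the value from the backend and caches it.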
func (s *store) ConsistentIndex() uint64 {
	if ci := atomic.LoadUint64(&s.consistentIndex); ci > 0 {
		return ci
	}
	tx := s.b.BatchTx()
	tx.Lock()
	defer tx.Unlock()
	_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
	if len(vs) == 0 {
		return 0
	}
	v := binary.BigEndian.Uint64(vs[0])
	atomic.StoreUint64(&s.consistentIndex, v)
	return v
}

// appendMarkTombstone appends tombstone mark to normal revision bytes.
func appendMarkTombstone(b []byte) []byte {
	if len(b) != revBytesLen {
		plog.Panicf("cannot append mark to non normal revision bytes")
	}
	return append(b, markTombstone)
}

// isTombstone checks whether the revision bytes is a tombstone.
func isTombstone(b []byte) bool {
	return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
}

// revBytesRange returns the range of revision bytes at
// the given revision.
func revBytesRange(rev revision) (start, end []byte) {
	start = newRevBytes()
	revToBytes(rev, start)

	end = newRevBytes()
	endRev := revision{main: rev.main, sub: rev.sub + 1}
	revToBytes(endRev, end)

	return start, end
}