key_index.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package mvcc
  15. import (
  16. "bytes"
  17. "errors"
  18. "fmt"
  19. "github.com/google/btree"
  20. "go.uber.org/zap"
  21. )
  // ErrRevisionNotFound is the sentinel error returned by keyIndex lookups
  // when no revision of the key matches the request.
  var ErrRevisionNotFound = errors.New("mvcc: revision not found")
  25. // keyIndex stores the revisions of a key in the backend.
  26. // Each keyIndex has at least one key generation.
  27. // Each generation might have several key versions.
  28. // Tombstone on a key appends an tombstone version at the end
  29. // of the current generation and creates a new empty generation.
  30. // Each version of a key has an index pointing to the backend.
  31. //
  32. // For example: put(1.0);put(2.0);tombstone(3.0);put(4.0);tombstone(5.0) on key "foo"
  33. // generate a keyIndex:
  34. // key: "foo"
  35. // rev: 5
  36. // generations:
  37. // {empty}
  38. // {4.0, 5.0(t)}
  39. // {1.0, 2.0, 3.0(t)}
  40. //
  41. // Compact a keyIndex removes the versions with smaller or equal to
  42. // rev except the largest one. If the generation becomes empty
  43. // during compaction, it will be removed. if all the generations get
  44. // removed, the keyIndex should be removed.
  45. //
  46. // For example:
  47. // compact(2) on the previous example
  48. // generations:
  49. // {empty}
  50. // {4.0, 5.0(t)}
  51. // {2.0, 3.0(t)}
  52. //
  53. // compact(4)
  54. // generations:
  55. // {empty}
  56. // {4.0, 5.0(t)}
  57. //
  58. // compact(5):
  59. // generations:
  60. // {empty} -> key SHOULD be removed.
  61. //
  62. // compact(6):
  63. // generations:
  64. // {empty} -> key SHOULD be removed.
  65. type keyIndex struct {
  66. key []byte
  67. modified revision // the main rev of the last modification
  68. generations []generation
  69. }
  70. // put puts a revision to the keyIndex.
  71. func (ki *keyIndex) put(lg *zap.Logger, main int64, sub int64) {
  72. rev := revision{main: main, sub: sub}
  73. if !rev.GreaterThan(ki.modified) {
  74. if lg != nil {
  75. lg.Panic(
  76. "'put' with an unexpected smaller revision",
  77. zap.Int64("given-revision-main", rev.main),
  78. zap.Int64("given-revision-sub", rev.sub),
  79. zap.Int64("modified-revision-main", ki.modified.main),
  80. zap.Int64("modified-revision-sub", ki.modified.sub),
  81. )
  82. } else {
  83. plog.Panicf("store.keyindex: put with unexpected smaller revision [%v / %v]", rev, ki.modified)
  84. }
  85. }
  86. if len(ki.generations) == 0 {
  87. ki.generations = append(ki.generations, generation{})
  88. }
  89. g := &ki.generations[len(ki.generations)-1]
  90. if len(g.revs) == 0 { // create a new key
  91. keysGauge.Inc()
  92. g.created = rev
  93. }
  94. g.revs = append(g.revs, rev)
  95. g.ver++
  96. ki.modified = rev
  97. }
  98. func (ki *keyIndex) restore(lg *zap.Logger, created, modified revision, ver int64) {
  99. if len(ki.generations) != 0 {
  100. if lg != nil {
  101. lg.Panic(
  102. "'restore' got an unexpected non-empty generations",
  103. zap.Int("generations-size", len(ki.generations)),
  104. )
  105. } else {
  106. plog.Panicf("store.keyindex: cannot restore non-empty keyIndex")
  107. }
  108. }
  109. ki.modified = modified
  110. g := generation{created: created, ver: ver, revs: []revision{modified}}
  111. ki.generations = append(ki.generations, g)
  112. keysGauge.Inc()
  113. }
  114. // tombstone puts a revision, pointing to a tombstone, to the keyIndex.
  115. // It also creates a new empty generation in the keyIndex.
  116. // It returns ErrRevisionNotFound when tombstone on an empty generation.
  117. func (ki *keyIndex) tombstone(lg *zap.Logger, main int64, sub int64) error {
  118. if ki.isEmpty() {
  119. if lg != nil {
  120. lg.Panic(
  121. "'tombstone' got an unexpected empty keyIndex",
  122. zap.String("key", string(ki.key)),
  123. )
  124. } else {
  125. plog.Panicf("store.keyindex: unexpected tombstone on empty keyIndex %s", string(ki.key))
  126. }
  127. }
  128. if ki.generations[len(ki.generations)-1].isEmpty() {
  129. return ErrRevisionNotFound
  130. }
  131. ki.put(lg, main, sub)
  132. ki.generations = append(ki.generations, generation{})
  133. keysGauge.Dec()
  134. return nil
  135. }
  136. // get gets the modified, created revision and version of the key that satisfies the given atRev.
  137. // Rev must be higher than or equal to the given atRev.
  138. func (ki *keyIndex) get(lg *zap.Logger, atRev int64) (modified, created revision, ver int64, err error) {
  139. if ki.isEmpty() {
  140. if lg != nil {
  141. lg.Panic(
  142. "'get' got an unexpected empty keyIndex",
  143. zap.String("key", string(ki.key)),
  144. )
  145. } else {
  146. plog.Panicf("store.keyindex: unexpected get on empty keyIndex %s", string(ki.key))
  147. }
  148. }
  149. g := ki.findGeneration(atRev)
  150. if g.isEmpty() {
  151. return revision{}, revision{}, 0, ErrRevisionNotFound
  152. }
  153. n := g.walk(func(rev revision) bool { return rev.main > atRev })
  154. if n != -1 {
  155. return g.revs[n], g.created, g.ver - int64(len(g.revs)-n-1), nil
  156. }
  157. return revision{}, revision{}, 0, ErrRevisionNotFound
  158. }
  159. // since returns revisions since the given rev. Only the revision with the
  160. // largest sub revision will be returned if multiple revisions have the same
  161. // main revision.
  162. func (ki *keyIndex) since(lg *zap.Logger, rev int64) []revision {
  163. if ki.isEmpty() {
  164. if lg != nil {
  165. lg.Panic(
  166. "'since' got an unexpected empty keyIndex",
  167. zap.String("key", string(ki.key)),
  168. )
  169. } else {
  170. plog.Panicf("store.keyindex: unexpected get on empty keyIndex %s", string(ki.key))
  171. }
  172. }
  173. since := revision{rev, 0}
  174. var gi int
  175. // find the generations to start checking
  176. for gi = len(ki.generations) - 1; gi > 0; gi-- {
  177. g := ki.generations[gi]
  178. if g.isEmpty() {
  179. continue
  180. }
  181. if since.GreaterThan(g.created) {
  182. break
  183. }
  184. }
  185. var revs []revision
  186. var last int64
  187. for ; gi < len(ki.generations); gi++ {
  188. for _, r := range ki.generations[gi].revs {
  189. if since.GreaterThan(r) {
  190. continue
  191. }
  192. if r.main == last {
  193. // replace the revision with a new one that has higher sub value,
  194. // because the original one should not be seen by external
  195. revs[len(revs)-1] = r
  196. continue
  197. }
  198. revs = append(revs, r)
  199. last = r.main
  200. }
  201. }
  202. return revs
  203. }
  204. // compact compacts a keyIndex by removing the versions with smaller or equal
  205. // revision than the given atRev except the largest one (If the largest one is
  206. // a tombstone, it will not be kept).
  207. // If a generation becomes empty during compaction, it will be removed.
  208. func (ki *keyIndex) compact(lg *zap.Logger, atRev int64, available map[revision]struct{}) {
  209. if ki.isEmpty() {
  210. if lg != nil {
  211. lg.Panic(
  212. "'compact' got an unexpected empty keyIndex",
  213. zap.String("key", string(ki.key)),
  214. )
  215. } else {
  216. plog.Panicf("store.keyindex: unexpected compact on empty keyIndex %s", string(ki.key))
  217. }
  218. }
  219. genIdx, revIndex := ki.doCompact(atRev, available)
  220. g := &ki.generations[genIdx]
  221. if !g.isEmpty() {
  222. // remove the previous contents.
  223. if revIndex != -1 {
  224. g.revs = g.revs[revIndex:]
  225. }
  226. // remove any tombstone
  227. if len(g.revs) == 1 && genIdx != len(ki.generations)-1 {
  228. delete(available, g.revs[0])
  229. genIdx++
  230. }
  231. }
  232. // remove the previous generations.
  233. ki.generations = ki.generations[genIdx:]
  234. }
  235. // keep finds the revision to be kept if compact is called at given atRev.
  236. func (ki *keyIndex) keep(atRev int64, available map[revision]struct{}) {
  237. if ki.isEmpty() {
  238. return
  239. }
  240. genIdx, revIndex := ki.doCompact(atRev, available)
  241. g := &ki.generations[genIdx]
  242. if !g.isEmpty() {
  243. // remove any tombstone
  244. if revIndex == len(g.revs)-1 && genIdx != len(ki.generations)-1 {
  245. delete(available, g.revs[revIndex])
  246. }
  247. }
  248. }
  249. func (ki *keyIndex) doCompact(atRev int64, available map[revision]struct{}) (genIdx int, revIndex int) {
  250. // walk until reaching the first revision smaller or equal to "atRev",
  251. // and add the revision to the available map
  252. f := func(rev revision) bool {
  253. if rev.main <= atRev {
  254. available[rev] = struct{}{}
  255. return false
  256. }
  257. return true
  258. }
  259. genIdx, g := 0, &ki.generations[0]
  260. // find first generation includes atRev or created after atRev
  261. for genIdx < len(ki.generations)-1 {
  262. if tomb := g.revs[len(g.revs)-1].main; tomb > atRev {
  263. break
  264. }
  265. genIdx++
  266. g = &ki.generations[genIdx]
  267. }
  268. revIndex = g.walk(f)
  269. return genIdx, revIndex
  270. }
  271. func (ki *keyIndex) isEmpty() bool {
  272. return len(ki.generations) == 1 && ki.generations[0].isEmpty()
  273. }
  274. // findGeneration finds out the generation of the keyIndex that the
  275. // given rev belongs to. If the given rev is at the gap of two generations,
  276. // which means that the key does not exist at the given rev, it returns nil.
  277. func (ki *keyIndex) findGeneration(rev int64) *generation {
  278. lastg := len(ki.generations) - 1
  279. cg := lastg
  280. for cg >= 0 {
  281. if len(ki.generations[cg].revs) == 0 {
  282. cg--
  283. continue
  284. }
  285. g := ki.generations[cg]
  286. if cg != lastg {
  287. if tomb := g.revs[len(g.revs)-1].main; tomb <= rev {
  288. return nil
  289. }
  290. }
  291. if g.revs[0].main <= rev {
  292. return &ki.generations[cg]
  293. }
  294. cg--
  295. }
  296. return nil
  297. }
  298. func (ki *keyIndex) Less(b btree.Item) bool {
  299. return bytes.Compare(ki.key, b.(*keyIndex).key) == -1
  300. }
  301. func (ki *keyIndex) equal(b *keyIndex) bool {
  302. if !bytes.Equal(ki.key, b.key) {
  303. return false
  304. }
  305. if ki.modified != b.modified {
  306. return false
  307. }
  308. if len(ki.generations) != len(b.generations) {
  309. return false
  310. }
  311. for i := range ki.generations {
  312. ag, bg := ki.generations[i], b.generations[i]
  313. if !ag.equal(bg) {
  314. return false
  315. }
  316. }
  317. return true
  318. }
  319. func (ki *keyIndex) String() string {
  320. var s string
  321. for _, g := range ki.generations {
  322. s += g.String()
  323. }
  324. return s
  325. }
  326. // generation contains multiple revisions of a key.
  327. type generation struct {
  328. ver int64
  329. created revision // when the generation is created (put in first revision).
  330. revs []revision
  331. }
  332. func (g *generation) isEmpty() bool { return g == nil || len(g.revs) == 0 }
  333. // walk walks through the revisions in the generation in descending order.
  334. // It passes the revision to the given function.
  335. // walk returns until: 1. it finishes walking all pairs 2. the function returns false.
  336. // walk returns the position at where it stopped. If it stopped after
  337. // finishing walking, -1 will be returned.
  338. func (g *generation) walk(f func(rev revision) bool) int {
  339. l := len(g.revs)
  340. for i := range g.revs {
  341. ok := f(g.revs[l-i-1])
  342. if !ok {
  343. return l - i - 1
  344. }
  345. }
  346. return -1
  347. }
  348. func (g *generation) String() string {
  349. return fmt.Sprintf("g: created[%d] ver[%d], revs %#v\n", g.created, g.ver, g.revs)
  350. }
  351. func (g generation) equal(b generation) bool {
  352. if g.ver != b.ver {
  353. return false
  354. }
  355. if len(g.revs) != len(b.revs) {
  356. return false
  357. }
  358. for i := range g.revs {
  359. ar, br := g.revs[i], b.revs[i]
  360. if ar != br {
  361. return false
  362. }
  363. }
  364. return true
  365. }