log.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package raft
  15. import (
  16. "fmt"
  17. "log"
  18. pb "go.etcd.io/etcd/raft/raftpb"
  19. )
  20. type raftLog struct {
  21. // storage contains all stable entries since the last snapshot.
  22. storage Storage
  23. // unstable contains all unstable entries and snapshot.
  24. // they will be saved into storage.
  25. unstable unstable
  26. // committed is the highest log position that is known to be in
  27. // stable storage on a quorum of nodes.
  28. committed uint64
  29. // applied is the highest log position that the application has
  30. // been instructed to apply to its state machine.
  31. // Invariant: applied <= committed
  32. applied uint64
  33. logger Logger
  34. // maxNextEntsSize is the maximum number aggregate byte size of the messages
  35. // returned from calls to nextEnts.
  36. maxNextEntsSize uint64
  37. }
  38. // newLog returns log using the given storage and default options. It
  39. // recovers the log to the state that it just commits and applies the
  40. // latest snapshot.
  41. func newLog(storage Storage, logger Logger) *raftLog {
  42. return newLogWithSize(storage, logger, noLimit)
  43. }
  44. // newLogWithSize returns a log using the given storage and max
  45. // message size.
  46. func newLogWithSize(storage Storage, logger Logger, maxNextEntsSize uint64) *raftLog {
  47. if storage == nil {
  48. log.Panic("storage must not be nil")
  49. }
  50. log := &raftLog{
  51. storage: storage,
  52. logger: logger,
  53. maxNextEntsSize: maxNextEntsSize,
  54. }
  55. firstIndex, err := storage.FirstIndex()
  56. if err != nil {
  57. panic(err) // TODO(bdarnell)
  58. }
  59. lastIndex, err := storage.LastIndex()
  60. if err != nil {
  61. panic(err) // TODO(bdarnell)
  62. }
  63. log.unstable.offset = lastIndex + 1
  64. log.unstable.logger = logger
  65. // Initialize our committed and applied pointers to the time of the last compaction.
  66. log.committed = firstIndex - 1
  67. log.applied = firstIndex - 1
  68. return log
  69. }
  70. func (l *raftLog) String() string {
  71. return fmt.Sprintf("committed=%d, applied=%d, unstable.offset=%d, len(unstable.Entries)=%d", l.committed, l.applied, l.unstable.offset, len(l.unstable.entries))
  72. }
  73. // maybeAppend returns (0, false) if the entries cannot be appended. Otherwise,
  74. // it returns (last index of new entries, true).
  75. func (l *raftLog) maybeAppend(index, logTerm, committed uint64, ents ...pb.Entry) (lastnewi uint64, ok bool) {
  76. if l.matchTerm(index, logTerm) {
  77. lastnewi = index + uint64(len(ents))
  78. ci := l.findConflict(ents)
  79. switch {
  80. case ci == 0:
  81. case ci <= l.committed:
  82. l.logger.Panicf("entry %d conflict with committed entry [committed(%d)]", ci, l.committed)
  83. default:
  84. offset := index + 1
  85. l.append(ents[ci-offset:]...)
  86. }
  87. l.commitTo(min(committed, lastnewi))
  88. return lastnewi, true
  89. }
  90. return 0, false
  91. }
  92. func (l *raftLog) append(ents ...pb.Entry) uint64 {
  93. if len(ents) == 0 {
  94. return l.lastIndex()
  95. }
  96. if after := ents[0].Index - 1; after < l.committed {
  97. l.logger.Panicf("after(%d) is out of range [committed(%d)]", after, l.committed)
  98. }
  99. l.unstable.truncateAndAppend(ents)
  100. return l.lastIndex()
  101. }
  102. // findConflict finds the index of the conflict.
  103. // It returns the first pair of conflicting entries between the existing
  104. // entries and the given entries, if there are any.
  105. // If there is no conflicting entries, and the existing entries contains
  106. // all the given entries, zero will be returned.
  107. // If there is no conflicting entries, but the given entries contains new
  108. // entries, the index of the first new entry will be returned.
  109. // An entry is considered to be conflicting if it has the same index but
  110. // a different term.
  111. // The first entry MUST have an index equal to the argument 'from'.
  112. // The index of the given entries MUST be continuously increasing.
  113. func (l *raftLog) findConflict(ents []pb.Entry) uint64 {
  114. for _, ne := range ents {
  115. if !l.matchTerm(ne.Index, ne.Term) {
  116. if ne.Index <= l.lastIndex() {
  117. l.logger.Infof("found conflict at index %d [existing term: %d, conflicting term: %d]",
  118. ne.Index, l.zeroTermOnErrCompacted(l.term(ne.Index)), ne.Term)
  119. }
  120. return ne.Index
  121. }
  122. }
  123. return 0
  124. }
  125. func (l *raftLog) unstableEntries() []pb.Entry {
  126. if len(l.unstable.entries) == 0 {
  127. return nil
  128. }
  129. return l.unstable.entries
  130. }
  131. // nextEnts returns all the available entries for execution.
  132. // If applied is smaller than the index of snapshot, it returns all committed
  133. // entries after the index of snapshot.
  134. func (l *raftLog) nextEnts() (ents []pb.Entry) {
  135. off := max(l.applied+1, l.firstIndex())
  136. if l.committed+1 > off {
  137. ents, err := l.slice(off, l.committed+1, l.maxNextEntsSize)
  138. if err != nil {
  139. l.logger.Panicf("unexpected error when getting unapplied entries (%v)", err)
  140. }
  141. return ents
  142. }
  143. return nil
  144. }
  145. // hasNextEnts returns if there is any available entries for execution. This
  146. // is a fast check without heavy raftLog.slice() in raftLog.nextEnts().
  147. func (l *raftLog) hasNextEnts() bool {
  148. off := max(l.applied+1, l.firstIndex())
  149. return l.committed+1 > off
  150. }
  151. func (l *raftLog) snapshot() (pb.Snapshot, error) {
  152. if l.unstable.snapshot != nil {
  153. return *l.unstable.snapshot, nil
  154. }
  155. return l.storage.Snapshot()
  156. }
  157. func (l *raftLog) firstIndex() uint64 {
  158. if i, ok := l.unstable.maybeFirstIndex(); ok {
  159. return i
  160. }
  161. index, err := l.storage.FirstIndex()
  162. if err != nil {
  163. panic(err) // TODO(bdarnell)
  164. }
  165. return index
  166. }
  167. func (l *raftLog) lastIndex() uint64 {
  168. if i, ok := l.unstable.maybeLastIndex(); ok {
  169. return i
  170. }
  171. i, err := l.storage.LastIndex()
  172. if err != nil {
  173. panic(err) // TODO(bdarnell)
  174. }
  175. return i
  176. }
  177. func (l *raftLog) commitTo(tocommit uint64) {
  178. // never decrease commit
  179. if l.committed < tocommit {
  180. if l.lastIndex() < tocommit {
  181. l.logger.Panicf("tocommit(%d) is out of range [lastIndex(%d)]. Was the raft log corrupted, truncated, or lost?", tocommit, l.lastIndex())
  182. }
  183. l.committed = tocommit
  184. }
  185. }
  186. func (l *raftLog) appliedTo(i uint64) {
  187. if i == 0 {
  188. return
  189. }
  190. if l.committed < i || i < l.applied {
  191. l.logger.Panicf("applied(%d) is out of range [prevApplied(%d), committed(%d)]", i, l.applied, l.committed)
  192. }
  193. l.applied = i
  194. }
  195. func (l *raftLog) stableTo(i, t uint64) { l.unstable.stableTo(i, t) }
  196. func (l *raftLog) stableSnapTo(i uint64) { l.unstable.stableSnapTo(i) }
  197. func (l *raftLog) lastTerm() uint64 {
  198. t, err := l.term(l.lastIndex())
  199. if err != nil {
  200. l.logger.Panicf("unexpected error when getting the last term (%v)", err)
  201. }
  202. return t
  203. }
  204. func (l *raftLog) term(i uint64) (uint64, error) {
  205. // the valid term range is [index of dummy entry, last index]
  206. dummyIndex := l.firstIndex() - 1
  207. if i < dummyIndex || i > l.lastIndex() {
  208. // TODO: return an error instead?
  209. return 0, nil
  210. }
  211. if t, ok := l.unstable.maybeTerm(i); ok {
  212. return t, nil
  213. }
  214. t, err := l.storage.Term(i)
  215. if err == nil {
  216. return t, nil
  217. }
  218. if err == ErrCompacted || err == ErrUnavailable {
  219. return 0, err
  220. }
  221. panic(err) // TODO(bdarnell)
  222. }
  223. func (l *raftLog) entries(i, maxsize uint64) ([]pb.Entry, error) {
  224. if i > l.lastIndex() {
  225. return nil, nil
  226. }
  227. return l.slice(i, l.lastIndex()+1, maxsize)
  228. }
  229. // allEntries returns all entries in the log.
  230. func (l *raftLog) allEntries() []pb.Entry {
  231. ents, err := l.entries(l.firstIndex(), noLimit)
  232. if err == nil {
  233. return ents
  234. }
  235. if err == ErrCompacted { // try again if there was a racing compaction
  236. return l.allEntries()
  237. }
  238. // TODO (xiangli): handle error?
  239. panic(err)
  240. }
  241. // isUpToDate determines if the given (lastIndex,term) log is more up-to-date
  242. // by comparing the index and term of the last entries in the existing logs.
  243. // If the logs have last entries with different terms, then the log with the
  244. // later term is more up-to-date. If the logs end with the same term, then
  245. // whichever log has the larger lastIndex is more up-to-date. If the logs are
  246. // the same, the given log is up-to-date.
  247. func (l *raftLog) isUpToDate(lasti, term uint64) bool {
  248. return term > l.lastTerm() || (term == l.lastTerm() && lasti >= l.lastIndex())
  249. }
  250. func (l *raftLog) matchTerm(i, term uint64) bool {
  251. t, err := l.term(i)
  252. if err != nil {
  253. return false
  254. }
  255. return t == term
  256. }
  257. func (l *raftLog) maybeCommit(maxIndex, term uint64) bool {
  258. if maxIndex > l.committed && l.zeroTermOnErrCompacted(l.term(maxIndex)) == term {
  259. l.commitTo(maxIndex)
  260. return true
  261. }
  262. return false
  263. }
  264. func (l *raftLog) restore(s pb.Snapshot) {
  265. l.logger.Infof("log [%s] starts to restore snapshot [index: %d, term: %d]", l, s.Metadata.Index, s.Metadata.Term)
  266. l.committed = s.Metadata.Index
  267. l.unstable.restore(s)
  268. }
  269. // slice returns a slice of log entries from lo through hi-1, inclusive.
  270. func (l *raftLog) slice(lo, hi, maxSize uint64) ([]pb.Entry, error) {
  271. err := l.mustCheckOutOfBounds(lo, hi)
  272. if err != nil {
  273. return nil, err
  274. }
  275. if lo == hi {
  276. return nil, nil
  277. }
  278. var ents []pb.Entry
  279. if lo < l.unstable.offset {
  280. storedEnts, err := l.storage.Entries(lo, min(hi, l.unstable.offset), maxSize)
  281. if err == ErrCompacted {
  282. return nil, err
  283. } else if err == ErrUnavailable {
  284. l.logger.Panicf("entries[%d:%d) is unavailable from storage", lo, min(hi, l.unstable.offset))
  285. } else if err != nil {
  286. panic(err) // TODO(bdarnell)
  287. }
  288. // check if ents has reached the size limitation
  289. if uint64(len(storedEnts)) < min(hi, l.unstable.offset)-lo {
  290. return storedEnts, nil
  291. }
  292. ents = storedEnts
  293. }
  294. if hi > l.unstable.offset {
  295. unstable := l.unstable.slice(max(lo, l.unstable.offset), hi)
  296. if len(ents) > 0 {
  297. combined := make([]pb.Entry, len(ents)+len(unstable))
  298. n := copy(combined, ents)
  299. copy(combined[n:], unstable)
  300. ents = combined
  301. } else {
  302. ents = unstable
  303. }
  304. }
  305. return limitSize(ents, maxSize), nil
  306. }
  307. // l.firstIndex <= lo <= hi <= l.firstIndex + len(l.entries)
  308. func (l *raftLog) mustCheckOutOfBounds(lo, hi uint64) error {
  309. if lo > hi {
  310. l.logger.Panicf("invalid slice %d > %d", lo, hi)
  311. }
  312. fi := l.firstIndex()
  313. if lo < fi {
  314. return ErrCompacted
  315. }
  316. length := l.lastIndex() + 1 - fi
  317. if lo < fi || hi > fi+length {
  318. l.logger.Panicf("slice[%d,%d) out of bound [%d,%d]", lo, hi, fi, l.lastIndex())
  319. }
  320. return nil
  321. }
  322. func (l *raftLog) zeroTermOnErrCompacted(t uint64, err error) uint64 {
  323. if err == nil {
  324. return t
  325. }
  326. if err == ErrCompacted {
  327. return 0
  328. }
  329. l.logger.Panicf("unexpected error (%v)", err)
  330. return 0
  331. }