log.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629
  1. package raft
  2. import (
  3. "bufio"
  4. "code.google.com/p/goprotobuf/proto"
  5. "errors"
  6. "fmt"
  7. "github.com/coreos/raft/protobuf"
  8. "io"
  9. "os"
  10. "sync"
  11. )
  12. //------------------------------------------------------------------------------
  13. //
  14. // Typedefs
  15. //
  16. //------------------------------------------------------------------------------
  17. // A log is a collection of log entries that are persisted to durable storage.
  18. type Log struct {
  19. ApplyFunc func(Command) (interface{}, error)
  20. file *os.File
  21. path string
  22. entries []*LogEntry
  23. results []*logResult
  24. commitIndex uint64
  25. mutex sync.RWMutex
  26. startIndex uint64 // the index before the first entry in the Log entries
  27. startTerm uint64
  28. pBuffer *proto.Buffer
  29. pLogEntry *protobuf.ProtoLogEntry
  30. }
  31. // The results of the applying a log entry.
  32. type logResult struct {
  33. returnValue interface{}
  34. err error
  35. }
  36. //------------------------------------------------------------------------------
  37. //
  38. // Constructor
  39. //
  40. //------------------------------------------------------------------------------
  41. // Creates a new log.
  42. func newLog() *Log {
  43. return &Log{
  44. entries: make([]*LogEntry, 0),
  45. pBuffer: proto.NewBuffer(nil),
  46. pLogEntry: &protobuf.ProtoLogEntry{},
  47. }
  48. }
  49. //------------------------------------------------------------------------------
  50. //
  51. // Accessors
  52. //
  53. //------------------------------------------------------------------------------
  54. //--------------------------------------
  55. // Log Indices
  56. //--------------------------------------
  57. // The last committed index in the log.
  58. func (l *Log) CommitIndex() uint64 {
  59. l.mutex.RLock()
  60. defer l.mutex.RUnlock()
  61. return l.commitIndex
  62. }
  63. // The current index in the log.
  64. func (l *Log) currentIndex() uint64 {
  65. l.mutex.RLock()
  66. defer l.mutex.RUnlock()
  67. if len(l.entries) == 0 {
  68. return l.startIndex
  69. }
  70. return l.entries[len(l.entries)-1].Index
  71. }
  72. // The current index in the log without locking
  73. func (l *Log) internalCurrentIndex() uint64 {
  74. if len(l.entries) == 0 {
  75. return l.startIndex
  76. }
  77. return l.entries[len(l.entries)-1].Index
  78. }
  79. // The next index in the log.
  80. func (l *Log) nextIndex() uint64 {
  81. return l.currentIndex() + 1
  82. }
  83. // Determines if the log contains zero entries.
  84. func (l *Log) isEmpty() bool {
  85. l.mutex.RLock()
  86. defer l.mutex.RUnlock()
  87. return (len(l.entries) == 0) && (l.startIndex == 0)
  88. }
  89. // The name of the last command in the log.
  90. func (l *Log) lastCommandName() string {
  91. l.mutex.RLock()
  92. defer l.mutex.RUnlock()
  93. if len(l.entries) > 0 {
  94. if entry := l.entries[len(l.entries)-1]; entry != nil {
  95. return entry.CommandName
  96. }
  97. }
  98. return ""
  99. }
  100. //--------------------------------------
  101. // Log Terms
  102. //--------------------------------------
  103. // The current term in the log.
  104. func (l *Log) currentTerm() uint64 {
  105. l.mutex.RLock()
  106. defer l.mutex.RUnlock()
  107. if len(l.entries) == 0 {
  108. return l.startTerm
  109. }
  110. return l.entries[len(l.entries)-1].Term
  111. }
  112. //------------------------------------------------------------------------------
  113. //
  114. // Methods
  115. //
  116. //------------------------------------------------------------------------------
  117. //--------------------------------------
  118. // State
  119. //--------------------------------------
  120. // Opens the log file and reads existing entries. The log can remain open and
  121. // continue to append entries to the end of the log.
  122. func (l *Log) open(path string) error {
  123. // Read all the entries from the log if one exists.
  124. var readBytes int64
  125. var err error
  126. debugln("log.open.open ", path)
  127. // open log file
  128. l.file, err = os.OpenFile(path, os.O_RDWR, 0600)
  129. l.path = path
  130. if err != nil {
  131. // if the log file does not exist before
  132. // we create the log file and set commitIndex to 0
  133. if os.IsNotExist(err) {
  134. l.file, err = os.OpenFile(path, os.O_WRONLY|os.O_CREATE, 0600)
  135. debugln("log.open.create ", path)
  136. return err
  137. }
  138. return err
  139. }
  140. debugln("log.open.exist ", path)
  141. // Read the file and decode entries.
  142. for {
  143. // Instantiate log entry and decode into it.
  144. entry, _ := newLogEntry(l, 0, 0, nil)
  145. entry.Position, _ = l.file.Seek(0, os.SEEK_CUR)
  146. n, err := entry.decode(l.file)
  147. if err != nil {
  148. if err == io.EOF {
  149. debugln("open.log.append: finish ")
  150. } else {
  151. if err = os.Truncate(path, readBytes); err != nil {
  152. return fmt.Errorf("raft.Log: Unable to recover: %v", err)
  153. }
  154. }
  155. break
  156. }
  157. if entry.Index > l.startIndex {
  158. // Append entry.
  159. l.entries = append(l.entries, entry)
  160. if entry.Index <= l.commitIndex {
  161. command, err := newCommand(entry.CommandName, entry.Command)
  162. if err != nil {
  163. continue
  164. }
  165. l.ApplyFunc(command)
  166. }
  167. debugln("open.log.append log index ", entry.Index)
  168. }
  169. readBytes += int64(n)
  170. }
  171. l.results = make([]*logResult, len(l.entries))
  172. debugln("open.log.recovery number of log ", len(l.entries))
  173. return nil
  174. }
  175. // Closes the log file.
  176. func (l *Log) close() {
  177. l.mutex.Lock()
  178. defer l.mutex.Unlock()
  179. if l.file != nil {
  180. l.file.Close()
  181. l.file = nil
  182. }
  183. l.entries = make([]*LogEntry, 0)
  184. l.results = make([]*logResult, 0)
  185. }
  186. //--------------------------------------
  187. // Entries
  188. //--------------------------------------
  189. // Creates a log entry associated with this log.
  190. func (l *Log) createEntry(term uint64, command Command) (*LogEntry, error) {
  191. return newLogEntry(l, l.nextIndex(), term, command)
  192. }
  193. // Retrieves an entry from the log. If the entry has been eliminated because
  194. // of a snapshot then nil is returned.
  195. func (l *Log) getEntry(index uint64) *LogEntry {
  196. l.mutex.RLock()
  197. defer l.mutex.RUnlock()
  198. if index <= l.startIndex || index > (l.startIndex+uint64(len(l.entries))) {
  199. return nil
  200. }
  201. return l.entries[index-l.startIndex-1]
  202. }
  203. // Checks if the log contains a given index/term combination.
  204. func (l *Log) containsEntry(index uint64, term uint64) bool {
  205. entry := l.getEntry(index)
  206. return (entry != nil && entry.Term == term)
  207. }
  208. // Retrieves a list of entries after a given index as well as the term of the
  209. // index provided. A nil list of entries is returned if the index no longer
  210. // exists because a snapshot was made.
  211. func (l *Log) getEntriesAfter(index uint64, maxLogEntriesPerRequest uint64) ([]*LogEntry, uint64) {
  212. l.mutex.Lock()
  213. defer l.mutex.Unlock()
  214. // Return nil if index is before the start of the log.
  215. if index < l.startIndex {
  216. traceln("log.entriesAfter.before: ", index, " ", l.startIndex)
  217. return nil, 0
  218. }
  219. // Return an error if the index doesn't exist.
  220. if index > (uint64(len(l.entries)) + l.startIndex) {
  221. panic(fmt.Sprintf("raft: Index is beyond end of log: %v %v", len(l.entries), index))
  222. }
  223. // If we're going from the beginning of the log then return the whole log.
  224. if index == l.startIndex {
  225. traceln("log.entriesAfter.beginning: ", index, " ", l.startIndex)
  226. return l.entries, l.startTerm
  227. }
  228. traceln("log.entriesAfter.partial: ", index, " ", l.entries[len(l.entries)-1].Index)
  229. entries := l.entries[index-l.startIndex:]
  230. length := len(entries)
  231. traceln("log.entriesAfter: startIndex:", l.startIndex, " lenght", len(l.entries))
  232. if uint64(length) < maxLogEntriesPerRequest {
  233. // Determine the term at the given entry and return a subslice.
  234. return entries, l.entries[index-1-l.startIndex].Term
  235. } else {
  236. return entries[:maxLogEntriesPerRequest], l.entries[index-1-l.startIndex].Term
  237. }
  238. }
  239. // Retrieves the return value and error for an entry. The result can only exist
  240. // after the entry has been committed.
  241. func (l *Log) getEntryResult(entry *LogEntry, clear bool) (interface{}, error) {
  242. l.mutex.RLock()
  243. defer l.mutex.RUnlock()
  244. if entry == nil {
  245. panic("raft: Log entry required for error retrieval")
  246. }
  247. debugln("getEntryResult.result index: ", entry.Index-l.startIndex-1)
  248. // If a result exists for the entry then return it with its error.
  249. if entry.Index > l.startIndex && entry.Index <= l.startIndex+uint64(len(l.results)) {
  250. if result := l.results[entry.Index-l.startIndex-1]; result != nil {
  251. // keep the records before remove it
  252. returnValue, err := result.returnValue, result.err
  253. // Remove reference to result if it's being cleared after retrieval.
  254. if clear {
  255. result.returnValue = nil
  256. }
  257. return returnValue, err
  258. }
  259. }
  260. return nil, nil
  261. }
  262. //--------------------------------------
  263. // Commit
  264. //--------------------------------------
  265. // Retrieves the last index and term that has been committed to the log.
  266. func (l *Log) commitInfo() (index uint64, term uint64) {
  267. l.mutex.RLock()
  268. defer l.mutex.RUnlock()
  269. // If we don't have any committed entries then just return zeros.
  270. if l.commitIndex == 0 {
  271. return 0, 0
  272. }
  273. // No new commit log after snapshot
  274. if l.commitIndex == l.startIndex {
  275. return l.startIndex, l.startTerm
  276. }
  277. // Return the last index & term from the last committed entry.
  278. debugln("commitInfo.get.[", l.commitIndex, "/", l.startIndex, "]")
  279. entry := l.entries[l.commitIndex-1-l.startIndex]
  280. return entry.Index, entry.Term
  281. }
  282. // Retrieves the last index and term that has been committed to the log.
  283. func (l *Log) lastInfo() (index uint64, term uint64) {
  284. l.mutex.RLock()
  285. defer l.mutex.RUnlock()
  286. // If we don't have any entries then just return zeros.
  287. if len(l.entries) == 0 {
  288. return l.startIndex, l.startTerm
  289. }
  290. // Return the last index & term
  291. entry := l.entries[len(l.entries)-1]
  292. return entry.Index, entry.Term
  293. }
  294. // Updates the commit index
  295. func (l *Log) updateCommitIndex(index uint64) {
  296. l.mutex.Lock()
  297. defer l.mutex.Unlock()
  298. if index > l.commitIndex {
  299. l.commitIndex = index
  300. }
  301. debugln("update.commit.index ", index)
  302. }
  303. // Updates the commit index and writes entries after that index to the stable storage.
  304. func (l *Log) setCommitIndex(index uint64) error {
  305. l.mutex.Lock()
  306. defer l.mutex.Unlock()
  307. // this is not error any more after limited the number of sending entries
  308. // commit up to what we already have
  309. if index > l.startIndex+uint64(len(l.entries)) {
  310. debugln("raft.StartIndex", l.startIndex)
  311. debugln("raft.Log: Commit index", index, "set back to ", l.startIndex+uint64(len(l.entries)))
  312. index = l.startIndex + uint64(len(l.entries))
  313. }
  314. // Do not allow previous indices to be committed again.
  315. // This could happens, since the guarantee is that the new leader has up-to-dated
  316. // log entires rather than has most up-to-dated committed index
  317. // For example, Leader 1 send log 80 to follower 2 and follower 3
  318. // follower 2 and follow 3 all got the new entries and reply
  319. // leader 1 committed entry 80 and send reply to follower 2 and follower3
  320. // follower 2 receive the new committed index and update committed index to 80
  321. // leader 1 fail to send the committed index to follower 3
  322. // follower 3 promote to leader (server 1 and server 2 will vote, since leader 3
  323. // has up-to-dated the entries)
  324. // when new leader 3 send heartbeat with committed index = 0 to follower 2,
  325. // follower 2 should reply success and let leader 3 update the committed index to 80
  326. if index < l.commitIndex {
  327. debugln("raft.Log: index", index, "committedIndex", l.commitIndex)
  328. return nil
  329. }
  330. // Find all entries whose index is between the previous index and the current index.
  331. for i := l.commitIndex + 1; i <= index; i++ {
  332. entryIndex := i - 1 - l.startIndex
  333. entry := l.entries[entryIndex]
  334. // Update commit index.
  335. l.commitIndex = entry.Index
  336. // Decode the command.
  337. command, err := newCommand(entry.CommandName, entry.Command)
  338. if err != nil {
  339. return err
  340. }
  341. // Apply the changes to the state machine and store the error code.
  342. returnValue, err := l.ApplyFunc(command)
  343. debugln("setCommitIndex.set.result index: ", entryIndex)
  344. l.results[entryIndex] = &logResult{returnValue: returnValue, err: err}
  345. }
  346. return nil
  347. }
  348. // Set the commitIndex at the head of the log file to the current
  349. // commit Index. This should be called after obtained a log lock
  350. func (l *Log) flushCommitIndex() {
  351. l.file.Seek(0, os.SEEK_SET)
  352. fmt.Fprintf(l.file, "%8x\n", l.commitIndex)
  353. l.file.Seek(0, os.SEEK_END)
  354. }
  355. //--------------------------------------
  356. // Truncation
  357. //--------------------------------------
  358. // Truncates the log to the given index and term. This only works if the log
  359. // at the index has not been committed.
  360. func (l *Log) truncate(index uint64, term uint64) error {
  361. l.mutex.Lock()
  362. defer l.mutex.Unlock()
  363. debugln("log.truncate: ", index)
  364. // Do not allow committed entries to be truncated.
  365. if index < l.commitIndex {
  366. debugln("log.truncate.before")
  367. return fmt.Errorf("raft.Log: Index is already committed (%v): (IDX=%v, TERM=%v)", l.commitIndex, index, term)
  368. }
  369. // Do not truncate past end of entries.
  370. if index > l.startIndex+uint64(len(l.entries)) {
  371. debugln("log.truncate.after")
  372. return fmt.Errorf("raft.Log: Entry index does not exist (MAX=%v): (IDX=%v, TERM=%v)", len(l.entries), index, term)
  373. }
  374. // If we're truncating everything then just clear the entries.
  375. if index == l.startIndex {
  376. debugln("log.truncate.clear")
  377. l.file.Truncate(0)
  378. l.file.Seek(0, os.SEEK_SET)
  379. l.entries = []*LogEntry{}
  380. } else {
  381. // Do not truncate if the entry at index does not have the matching term.
  382. entry := l.entries[index-l.startIndex-1]
  383. if len(l.entries) > 0 && entry.Term != term {
  384. debugln("log.truncate.termMismatch")
  385. return fmt.Errorf("raft.Log: Entry at index does not have matching term (%v): (IDX=%v, TERM=%v)", entry.Term, index, term)
  386. }
  387. // Otherwise truncate up to the desired entry.
  388. if index < l.startIndex+uint64(len(l.entries)) {
  389. debugln("log.truncate.finish")
  390. position := l.entries[index-l.startIndex].Position
  391. l.file.Truncate(position)
  392. l.file.Seek(position, os.SEEK_SET)
  393. l.entries = l.entries[0 : index-l.startIndex]
  394. }
  395. }
  396. return nil
  397. }
  398. //--------------------------------------
  399. // Append
  400. //--------------------------------------
  401. // Appends a series of entries to the log. These entries are not written to
  402. // disk until setCommitIndex() is called.
  403. func (l *Log) appendEntries(entries []*LogEntry) error {
  404. l.mutex.Lock()
  405. defer l.mutex.Unlock()
  406. startPosition, _ := l.file.Seek(0, os.SEEK_CUR)
  407. w := bufio.NewWriter(l.file)
  408. var size int64
  409. var err error
  410. // Append each entry but exit if we hit an error.
  411. for _, entry := range entries {
  412. entry.log = l
  413. if size, err = l.writeEntry(entry, w); err != nil {
  414. return err
  415. }
  416. entry.Position = startPosition
  417. startPosition += size
  418. }
  419. w.Flush()
  420. return nil
  421. }
  422. // Writes a single log entry to the end of the log. This function does not
  423. // obtain a lock and should only be used internally. Use AppendEntries() and
  424. // AppendEntry() to use it externally.
  425. func (l *Log) appendEntry(entry *LogEntry) error {
  426. if l.file == nil {
  427. return errors.New("raft.Log: Log is not open")
  428. }
  429. // Make sure the term and index are greater than the previous.
  430. if len(l.entries) > 0 {
  431. lastEntry := l.entries[len(l.entries)-1]
  432. if entry.Term < lastEntry.Term {
  433. return fmt.Errorf("raft.Log: Cannot append entry with earlier term (%x:%x <= %x:%x)", entry.Term, entry.Index, lastEntry.Term, lastEntry.Index)
  434. } else if entry.Term == lastEntry.Term && entry.Index <= lastEntry.Index {
  435. return fmt.Errorf("raft.Log: Cannot append entry with earlier index in the same term (%x:%x <= %x:%x)", entry.Term, entry.Index, lastEntry.Term, lastEntry.Index)
  436. }
  437. }
  438. position, _ := l.file.Seek(0, os.SEEK_CUR)
  439. entry.Position = position
  440. // Write to storage.
  441. if _, err := entry.encode(l.file); err != nil {
  442. return err
  443. }
  444. // Append to entries list if stored on disk.
  445. l.entries = append(l.entries, entry)
  446. l.results = append(l.results, nil)
  447. return nil
  448. }
  449. // appendEntry with Buffered io
  450. func (l *Log) writeEntry(entry *LogEntry, w io.Writer) (int64, error) {
  451. if l.file == nil {
  452. return -1, errors.New("raft.Log: Log is not open")
  453. }
  454. // Make sure the term and index are greater than the previous.
  455. if len(l.entries) > 0 {
  456. lastEntry := l.entries[len(l.entries)-1]
  457. if entry.Term < lastEntry.Term {
  458. return -1, fmt.Errorf("raft.Log: Cannot append entry with earlier term (%x:%x <= %x:%x)", entry.Term, entry.Index, lastEntry.Term, lastEntry.Index)
  459. } else if entry.Term == lastEntry.Term && entry.Index <= lastEntry.Index {
  460. return -1, fmt.Errorf("raft.Log: Cannot append entry with earlier index in the same term (%x:%x <= %x:%x)", entry.Term, entry.Index, lastEntry.Term, lastEntry.Index)
  461. }
  462. }
  463. // Write to storage.
  464. size, err := entry.encode(w)
  465. if err != nil {
  466. return -1, err
  467. }
  468. // Append to entries list if stored on disk.
  469. l.entries = append(l.entries, entry)
  470. l.results = append(l.results, nil)
  471. return int64(size), nil
  472. }
  473. //--------------------------------------
  474. // Log compaction
  475. //--------------------------------------
  476. // compact the log before index (including index)
  477. func (l *Log) compact(index uint64, term uint64) error {
  478. var entries []*LogEntry
  479. var results []*logResult
  480. l.mutex.Lock()
  481. defer l.mutex.Unlock()
  482. if index == 0 {
  483. return nil
  484. }
  485. // nothing to compaction
  486. // the index may be greater than the current index if
  487. // we just recovery from on snapshot
  488. if index >= l.internalCurrentIndex() {
  489. entries = make([]*LogEntry, 0)
  490. results = make([]*logResult, 0)
  491. } else {
  492. // get all log entries after index
  493. entries = l.entries[index-l.startIndex:]
  494. results = l.results[index-l.startIndex:]
  495. }
  496. // create a new log file and add all the entries
  497. file, err := os.OpenFile(l.path+".new", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
  498. if err != nil {
  499. return err
  500. }
  501. for _, entry := range entries {
  502. position, _ := l.file.Seek(0, os.SEEK_CUR)
  503. entry.Position = position
  504. if _, err = entry.encode(file); err != nil {
  505. return err
  506. }
  507. }
  508. // close the current log file
  509. l.file.Close()
  510. // remove the current log file to .bak
  511. err = os.Remove(l.path)
  512. if err != nil {
  513. return err
  514. }
  515. // rename the new log file
  516. err = os.Rename(l.path+".new", l.path)
  517. if err != nil {
  518. return err
  519. }
  520. l.file = file
  521. // compaction the in memory log
  522. l.entries = entries
  523. l.results = results
  524. l.startIndex = index
  525. l.startTerm = term
  526. return nil
  527. }