log.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609
  1. package raft
  2. import (
  3. "bufio"
  4. "code.google.com/p/goprotobuf/proto"
  5. "errors"
  6. "fmt"
  7. "github.com/coreos/raft/protobuf"
  8. "io"
  9. "os"
  10. "sync"
  11. )
  12. //------------------------------------------------------------------------------
  13. //
  14. // Typedefs
  15. //
  16. //------------------------------------------------------------------------------
  17. // A log is a collection of log entries that are persisted to durable storage.
  18. type Log struct {
  19. ApplyFunc func(Command) (interface{}, error)
  20. file *os.File
  21. path string
  22. entries []*LogEntry
  23. commitIndex uint64
  24. mutex sync.RWMutex
  25. startIndex uint64 // the index before the first entry in the Log entries
  26. startTerm uint64
  27. pBuffer *proto.Buffer
  28. pLogEntry *protobuf.ProtoLogEntry
  29. }
  30. // The results of the applying a log entry.
  31. type logResult struct {
  32. returnValue interface{}
  33. err error
  34. }
  35. //------------------------------------------------------------------------------
  36. //
  37. // Constructor
  38. //
  39. //------------------------------------------------------------------------------
  40. // Creates a new log.
  41. func newLog() *Log {
  42. return &Log{
  43. entries: make([]*LogEntry, 0),
  44. pBuffer: proto.NewBuffer(nil),
  45. pLogEntry: &protobuf.ProtoLogEntry{},
  46. }
  47. }
  48. //------------------------------------------------------------------------------
  49. //
  50. // Accessors
  51. //
  52. //------------------------------------------------------------------------------
  53. //--------------------------------------
  54. // Log Indices
  55. //--------------------------------------
  56. // The last committed index in the log.
  57. func (l *Log) CommitIndex() uint64 {
  58. l.mutex.RLock()
  59. defer l.mutex.RUnlock()
  60. return l.commitIndex
  61. }
  62. // The current index in the log.
  63. func (l *Log) currentIndex() uint64 {
  64. l.mutex.RLock()
  65. defer l.mutex.RUnlock()
  66. return l.internalCurrentIndex()
  67. }
  68. // The current index in the log without locking
  69. func (l *Log) internalCurrentIndex() uint64 {
  70. if len(l.entries) == 0 {
  71. return l.startIndex
  72. }
  73. return l.entries[len(l.entries)-1].Index
  74. }
  75. // The next index in the log.
  76. func (l *Log) nextIndex() uint64 {
  77. return l.currentIndex() + 1
  78. }
  79. // Determines if the log contains zero entries.
  80. func (l *Log) isEmpty() bool {
  81. l.mutex.RLock()
  82. defer l.mutex.RUnlock()
  83. return (len(l.entries) == 0) && (l.startIndex == 0)
  84. }
  85. // The name of the last command in the log.
  86. func (l *Log) lastCommandName() string {
  87. l.mutex.RLock()
  88. defer l.mutex.RUnlock()
  89. if len(l.entries) > 0 {
  90. if entry := l.entries[len(l.entries)-1]; entry != nil {
  91. return entry.CommandName
  92. }
  93. }
  94. return ""
  95. }
  96. //--------------------------------------
  97. // Log Terms
  98. //--------------------------------------
  99. // The current term in the log.
  100. func (l *Log) currentTerm() uint64 {
  101. l.mutex.RLock()
  102. defer l.mutex.RUnlock()
  103. if len(l.entries) == 0 {
  104. return l.startTerm
  105. }
  106. return l.entries[len(l.entries)-1].Term
  107. }
  108. //------------------------------------------------------------------------------
  109. //
  110. // Methods
  111. //
  112. //------------------------------------------------------------------------------
  113. //--------------------------------------
  114. // State
  115. //--------------------------------------
  116. // Opens the log file and reads existing entries. The log can remain open and
  117. // continue to append entries to the end of the log.
  118. func (l *Log) open(path string) error {
  119. // Read all the entries from the log if one exists.
  120. var readBytes int64
  121. var err error
  122. debugln("log.open.open ", path)
  123. // open log file
  124. l.file, err = os.OpenFile(path, os.O_RDWR, 0600)
  125. l.path = path
  126. if err != nil {
  127. // if the log file does not exist before
  128. // we create the log file and set commitIndex to 0
  129. if os.IsNotExist(err) {
  130. l.file, err = os.OpenFile(path, os.O_WRONLY|os.O_CREATE, 0600)
  131. debugln("log.open.create ", path)
  132. return err
  133. }
  134. return err
  135. }
  136. debugln("log.open.exist ", path)
  137. // Read the file and decode entries.
  138. for {
  139. // Instantiate log entry and decode into it.
  140. entry, _ := newLogEntry(l, nil, 0, 0, nil)
  141. entry.Position, _ = l.file.Seek(0, os.SEEK_CUR)
  142. n, err := entry.decode(l.file)
  143. if err != nil {
  144. if err == io.EOF {
  145. debugln("open.log.append: finish ")
  146. } else {
  147. if err = os.Truncate(path, readBytes); err != nil {
  148. return fmt.Errorf("raft.Log: Unable to recover: %v", err)
  149. }
  150. }
  151. break
  152. }
  153. if entry.Index > l.startIndex {
  154. // Append entry.
  155. l.entries = append(l.entries, entry)
  156. if entry.Index <= l.commitIndex {
  157. command, err := newCommand(entry.CommandName, entry.Command)
  158. if err != nil {
  159. continue
  160. }
  161. l.ApplyFunc(command)
  162. }
  163. debugln("open.log.append log index ", entry.Index)
  164. }
  165. readBytes += int64(n)
  166. }
  167. debugln("open.log.recovery number of log ", len(l.entries))
  168. return nil
  169. }
  170. // Closes the log file.
  171. func (l *Log) close() {
  172. l.mutex.Lock()
  173. defer l.mutex.Unlock()
  174. if l.file != nil {
  175. l.file.Close()
  176. l.file = nil
  177. }
  178. l.entries = make([]*LogEntry, 0)
  179. }
  180. //--------------------------------------
  181. // Entries
  182. //--------------------------------------
  183. // Creates a log entry associated with this log.
  184. func (l *Log) createEntry(term uint64, command Command, e *ev) (*LogEntry, error) {
  185. return newLogEntry(l, e, l.nextIndex(), term, command)
  186. }
  187. // Retrieves an entry from the log. If the entry has been eliminated because
  188. // of a snapshot then nil is returned.
  189. func (l *Log) getEntry(index uint64) *LogEntry {
  190. l.mutex.RLock()
  191. defer l.mutex.RUnlock()
  192. if index <= l.startIndex || index > (l.startIndex+uint64(len(l.entries))) {
  193. return nil
  194. }
  195. return l.entries[index-l.startIndex-1]
  196. }
  197. // Checks if the log contains a given index/term combination.
  198. func (l *Log) containsEntry(index uint64, term uint64) bool {
  199. entry := l.getEntry(index)
  200. return (entry != nil && entry.Term == term)
  201. }
  202. // Retrieves a list of entries after a given index as well as the term of the
  203. // index provided. A nil list of entries is returned if the index no longer
  204. // exists because a snapshot was made.
  205. func (l *Log) getEntriesAfter(index uint64, maxLogEntriesPerRequest uint64) ([]*LogEntry, uint64) {
  206. l.mutex.Lock()
  207. defer l.mutex.Unlock()
  208. // Return nil if index is before the start of the log.
  209. if index < l.startIndex {
  210. traceln("log.entriesAfter.before: ", index, " ", l.startIndex)
  211. return nil, 0
  212. }
  213. // Return an error if the index doesn't exist.
  214. if index > (uint64(len(l.entries)) + l.startIndex) {
  215. panic(fmt.Sprintf("raft: Index is beyond end of log: %v %v", len(l.entries), index))
  216. }
  217. // If we're going from the beginning of the log then return the whole log.
  218. if index == l.startIndex {
  219. traceln("log.entriesAfter.beginning: ", index, " ", l.startIndex)
  220. return l.entries, l.startTerm
  221. }
  222. traceln("log.entriesAfter.partial: ", index, " ", l.entries[len(l.entries)-1].Index)
  223. entries := l.entries[index-l.startIndex:]
  224. length := len(entries)
  225. traceln("log.entriesAfter: startIndex:", l.startIndex, " lenght", len(l.entries))
  226. if uint64(length) < maxLogEntriesPerRequest {
  227. // Determine the term at the given entry and return a subslice.
  228. return entries, l.entries[index-1-l.startIndex].Term
  229. } else {
  230. return entries[:maxLogEntriesPerRequest], l.entries[index-1-l.startIndex].Term
  231. }
  232. }
  233. //--------------------------------------
  234. // Commit
  235. //--------------------------------------
  236. // Retrieves the last index and term that has been committed to the log.
  237. func (l *Log) commitInfo() (index uint64, term uint64) {
  238. l.mutex.RLock()
  239. defer l.mutex.RUnlock()
  240. // If we don't have any committed entries then just return zeros.
  241. if l.commitIndex == 0 {
  242. return 0, 0
  243. }
  244. // No new commit log after snapshot
  245. if l.commitIndex == l.startIndex {
  246. return l.startIndex, l.startTerm
  247. }
  248. // Return the last index & term from the last committed entry.
  249. debugln("commitInfo.get.[", l.commitIndex, "/", l.startIndex, "]")
  250. entry := l.entries[l.commitIndex-1-l.startIndex]
  251. return entry.Index, entry.Term
  252. }
  253. // Retrieves the last index and term that has been appended to the log.
  254. func (l *Log) lastInfo() (index uint64, term uint64) {
  255. l.mutex.RLock()
  256. defer l.mutex.RUnlock()
  257. // If we don't have any entries then just return zeros.
  258. if len(l.entries) == 0 {
  259. return l.startIndex, l.startTerm
  260. }
  261. // Return the last index & term
  262. entry := l.entries[len(l.entries)-1]
  263. return entry.Index, entry.Term
  264. }
  265. // Updates the commit index
  266. func (l *Log) updateCommitIndex(index uint64) {
  267. l.mutex.Lock()
  268. defer l.mutex.Unlock()
  269. if index > l.commitIndex {
  270. l.commitIndex = index
  271. }
  272. debugln("update.commit.index ", index)
  273. }
  274. // Updates the commit index and writes entries after that index to the stable storage.
  275. func (l *Log) setCommitIndex(index uint64) error {
  276. l.mutex.Lock()
  277. defer l.mutex.Unlock()
  278. // this is not error any more after limited the number of sending entries
  279. // commit up to what we already have
  280. if index > l.startIndex+uint64(len(l.entries)) {
  281. debugln("raft.Log: Commit index", index, "set back to ", len(l.entries))
  282. index = l.startIndex + uint64(len(l.entries))
  283. }
  284. // Do not allow previous indices to be committed again.
  285. // This could happens, since the guarantee is that the new leader has up-to-dated
  286. // log entires rather than has most up-to-dated committed index
  287. // For example, Leader 1 send log 80 to follower 2 and follower 3
  288. // follower 2 and follow 3 all got the new entries and reply
  289. // leader 1 committed entry 80 and send reply to follower 2 and follower3
  290. // follower 2 receive the new committed index and update committed index to 80
  291. // leader 1 fail to send the committed index to follower 3
  292. // follower 3 promote to leader (server 1 and server 2 will vote, since leader 3
  293. // has up-to-dated the entries)
  294. // when new leader 3 send heartbeat with committed index = 0 to follower 2,
  295. // follower 2 should reply success and let leader 3 update the committed index to 80
  296. if index < l.commitIndex {
  297. return nil
  298. }
  299. // Find all entries whose index is between the previous index and the current index.
  300. for i := l.commitIndex + 1; i <= index; i++ {
  301. entryIndex := i - 1 - l.startIndex
  302. entry := l.entries[entryIndex]
  303. // Update commit index.
  304. l.commitIndex = entry.Index
  305. // Decode the command.
  306. command, err := newCommand(entry.CommandName, entry.Command)
  307. if err != nil {
  308. return err
  309. }
  310. // Apply the changes to the state machine and store the error code.
  311. returnValue, err := l.ApplyFunc(command)
  312. debugln("setCommitIndex.set.result index: ", entryIndex)
  313. if entry.event != nil {
  314. entry.event.returnValue = returnValue
  315. entry.event.c <- err
  316. }
  317. }
  318. return nil
  319. }
  320. // Set the commitIndex at the head of the log file to the current
  321. // commit Index. This should be called after obtained a log lock
  322. func (l *Log) flushCommitIndex() {
  323. l.file.Seek(0, os.SEEK_SET)
  324. fmt.Fprintf(l.file, "%8x\n", l.commitIndex)
  325. l.file.Seek(0, os.SEEK_END)
  326. }
  327. //--------------------------------------
  328. // Truncation
  329. //--------------------------------------
  330. // Truncates the log to the given index and term. This only works if the log
  331. // at the index has not been committed.
  332. func (l *Log) truncate(index uint64, term uint64) error {
  333. l.mutex.Lock()
  334. defer l.mutex.Unlock()
  335. debugln("log.truncate: ", index)
  336. // Do not allow committed entries to be truncated.
  337. if index < l.commitIndex {
  338. debugln("log.truncate.before")
  339. return fmt.Errorf("raft.Log: Index is already committed (%v): (IDX=%v, TERM=%v)", l.commitIndex, index, term)
  340. }
  341. // Do not truncate past end of entries.
  342. if index > l.startIndex+uint64(len(l.entries)) {
  343. debugln("log.truncate.after")
  344. return fmt.Errorf("raft.Log: Entry index does not exist (MAX=%v): (IDX=%v, TERM=%v)", len(l.entries), index, term)
  345. }
  346. // If we're truncating everything then just clear the entries.
  347. if index == l.startIndex {
  348. debugln("log.truncate.clear")
  349. l.file.Truncate(0)
  350. l.file.Seek(0, os.SEEK_SET)
  351. // notify clients if this node is the previous leader
  352. for _, entry := range l.entries {
  353. if entry.event != nil {
  354. entry.event.c <- errors.New("command failed to be committed due to node failure")
  355. }
  356. }
  357. l.entries = []*LogEntry{}
  358. } else {
  359. // Do not truncate if the entry at index does not have the matching term.
  360. entry := l.entries[index-l.startIndex-1]
  361. if len(l.entries) > 0 && entry.Term != term {
  362. debugln("log.truncate.termMismatch")
  363. return fmt.Errorf("raft.Log: Entry at index does not have matching term (%v): (IDX=%v, TERM=%v)", entry.Term, index, term)
  364. }
  365. // Otherwise truncate up to the desired entry.
  366. if index < l.startIndex+uint64(len(l.entries)) {
  367. debugln("log.truncate.finish")
  368. position := l.entries[index-l.startIndex].Position
  369. l.file.Truncate(position)
  370. l.file.Seek(position, os.SEEK_SET)
  371. // notify clients if this node is the previous leader
  372. for i := index - l.startIndex; i < uint64(len(l.entries)); i++ {
  373. entry := l.entries[i]
  374. if entry.event != nil {
  375. entry.event.c <- errors.New("command failed to be committed due to node failure")
  376. }
  377. }
  378. l.entries = l.entries[0 : index-l.startIndex]
  379. }
  380. }
  381. return nil
  382. }
  383. //--------------------------------------
  384. // Append
  385. //--------------------------------------
  386. // Appends a series of entries to the log.
  387. func (l *Log) appendEntries(entries []*LogEntry) error {
  388. l.mutex.Lock()
  389. defer l.mutex.Unlock()
  390. startPosition, _ := l.file.Seek(0, os.SEEK_CUR)
  391. w := bufio.NewWriter(l.file)
  392. var size int64
  393. var err error
  394. // Append each entry but exit if we hit an error.
  395. for _, entry := range entries {
  396. entry.log = l
  397. if size, err = l.writeEntry(entry, w); err != nil {
  398. return err
  399. }
  400. entry.Position = startPosition
  401. startPosition += size
  402. }
  403. w.Flush()
  404. err = l.file.Sync()
  405. if err != nil {
  406. panic(err)
  407. }
  408. return nil
  409. }
  410. // Writes a single log entry to the end of the log.
  411. func (l *Log) appendEntry(entry *LogEntry) error {
  412. l.mutex.Lock()
  413. defer l.mutex.Unlock()
  414. if l.file == nil {
  415. return errors.New("raft.Log: Log is not open")
  416. }
  417. // Make sure the term and index are greater than the previous.
  418. if len(l.entries) > 0 {
  419. lastEntry := l.entries[len(l.entries)-1]
  420. if entry.Term < lastEntry.Term {
  421. return fmt.Errorf("raft.Log: Cannot append entry with earlier term (%x:%x <= %x:%x)", entry.Term, entry.Index, lastEntry.Term, lastEntry.Index)
  422. } else if entry.Term == lastEntry.Term && entry.Index <= lastEntry.Index {
  423. return fmt.Errorf("raft.Log: Cannot append entry with earlier index in the same term (%x:%x <= %x:%x)", entry.Term, entry.Index, lastEntry.Term, lastEntry.Index)
  424. }
  425. }
  426. position, _ := l.file.Seek(0, os.SEEK_CUR)
  427. entry.Position = position
  428. // Write to storage.
  429. if _, err := entry.encode(l.file); err != nil {
  430. return err
  431. }
  432. // Append to entries list if stored on disk.
  433. l.entries = append(l.entries, entry)
  434. return nil
  435. }
  436. // appendEntry with Buffered io
  437. func (l *Log) writeEntry(entry *LogEntry, w io.Writer) (int64, error) {
  438. if l.file == nil {
  439. return -1, errors.New("raft.Log: Log is not open")
  440. }
  441. // Make sure the term and index are greater than the previous.
  442. if len(l.entries) > 0 {
  443. lastEntry := l.entries[len(l.entries)-1]
  444. if entry.Term < lastEntry.Term {
  445. return -1, fmt.Errorf("raft.Log: Cannot append entry with earlier term (%x:%x <= %x:%x)", entry.Term, entry.Index, lastEntry.Term, lastEntry.Index)
  446. } else if entry.Term == lastEntry.Term && entry.Index <= lastEntry.Index {
  447. return -1, fmt.Errorf("raft.Log: Cannot append entry with earlier index in the same term (%x:%x <= %x:%x)", entry.Term, entry.Index, lastEntry.Term, lastEntry.Index)
  448. }
  449. }
  450. // Write to storage.
  451. size, err := entry.encode(w)
  452. if err != nil {
  453. return -1, err
  454. }
  455. // Append to entries list if stored on disk.
  456. l.entries = append(l.entries, entry)
  457. return int64(size), nil
  458. }
  459. //--------------------------------------
  460. // Log compaction
  461. //--------------------------------------
  462. // compact the log before index (including index)
  463. func (l *Log) compact(index uint64, term uint64) error {
  464. var entries []*LogEntry
  465. l.mutex.Lock()
  466. defer l.mutex.Unlock()
  467. if index == 0 {
  468. return nil
  469. }
  470. // nothing to compaction
  471. // the index may be greater than the current index if
  472. // we just recovery from on snapshot
  473. if index >= l.internalCurrentIndex() {
  474. entries = make([]*LogEntry, 0)
  475. } else {
  476. // get all log entries after index
  477. entries = l.entries[index-l.startIndex:]
  478. }
  479. // create a new log file and add all the entries
  480. file, err := os.OpenFile(l.path+".new", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
  481. if err != nil {
  482. return err
  483. }
  484. for _, entry := range entries {
  485. position, _ := l.file.Seek(0, os.SEEK_CUR)
  486. entry.Position = position
  487. if _, err = entry.encode(file); err != nil {
  488. return err
  489. }
  490. }
  491. // close the current log file
  492. l.file.Close()
  493. // remove the current log file to .bak
  494. err = os.Remove(l.path)
  495. if err != nil {
  496. return err
  497. }
  498. // rename the new log file
  499. err = os.Rename(l.path+".new", l.path)
  500. if err != nil {
  501. return err
  502. }
  503. l.file = file
  504. // compaction the in memory log
  505. l.entries = entries
  506. l.startIndex = index
  507. l.startTerm = term
  508. return nil
  509. }