log.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624
  1. package raft
  2. import (
  3. "bufio"
  4. "errors"
  5. "fmt"
  6. "io"
  7. "os"
  8. "sync"
  9. "github.com/coreos/etcd/third_party/github.com/goraft/raft/protobuf"
  10. )
  11. //------------------------------------------------------------------------------
  12. //
  13. // Typedefs
  14. //
  15. //------------------------------------------------------------------------------
  16. // A log is a collection of log entries that are persisted to durable storage.
  17. type Log struct {
  18. ApplyFunc func(*LogEntry, Command) (interface{}, error)
  19. file *os.File
  20. path string
  21. entries []*LogEntry
  22. commitIndex uint64
  23. mutex sync.RWMutex
  24. startIndex uint64 // the index before the first entry in the Log entries
  25. startTerm uint64
  26. initialized bool
  27. }
  28. // The results of the applying a log entry.
  29. type logResult struct {
  30. returnValue interface{}
  31. err error
  32. }
  33. //------------------------------------------------------------------------------
  34. //
  35. // Constructor
  36. //
  37. //------------------------------------------------------------------------------
  38. // Creates a new log.
  39. func newLog() *Log {
  40. return &Log{
  41. entries: make([]*LogEntry, 0),
  42. }
  43. }
  44. //------------------------------------------------------------------------------
  45. //
  46. // Accessors
  47. //
  48. //------------------------------------------------------------------------------
  49. //--------------------------------------
  50. // Log Indices
  51. //--------------------------------------
  52. // The last committed index in the log.
  53. func (l *Log) CommitIndex() uint64 {
  54. l.mutex.RLock()
  55. defer l.mutex.RUnlock()
  56. return l.commitIndex
  57. }
  58. // The current index in the log.
  59. func (l *Log) currentIndex() uint64 {
  60. l.mutex.RLock()
  61. defer l.mutex.RUnlock()
  62. return l.internalCurrentIndex()
  63. }
  64. // The current index in the log without locking
  65. func (l *Log) internalCurrentIndex() uint64 {
  66. if len(l.entries) == 0 {
  67. return l.startIndex
  68. }
  69. return l.entries[len(l.entries)-1].Index()
  70. }
  71. // The next index in the log.
  72. func (l *Log) nextIndex() uint64 {
  73. return l.currentIndex() + 1
  74. }
  75. // Determines if the log contains zero entries.
  76. func (l *Log) isEmpty() bool {
  77. l.mutex.RLock()
  78. defer l.mutex.RUnlock()
  79. return (len(l.entries) == 0) && (l.startIndex == 0)
  80. }
  81. // The name of the last command in the log.
  82. func (l *Log) lastCommandName() string {
  83. l.mutex.RLock()
  84. defer l.mutex.RUnlock()
  85. if len(l.entries) > 0 {
  86. if entry := l.entries[len(l.entries)-1]; entry != nil {
  87. return entry.CommandName()
  88. }
  89. }
  90. return ""
  91. }
  92. //--------------------------------------
  93. // Log Terms
  94. //--------------------------------------
  95. // The current term in the log.
  96. func (l *Log) currentTerm() uint64 {
  97. l.mutex.RLock()
  98. defer l.mutex.RUnlock()
  99. if len(l.entries) == 0 {
  100. return l.startTerm
  101. }
  102. return l.entries[len(l.entries)-1].Term()
  103. }
  104. //------------------------------------------------------------------------------
  105. //
  106. // Methods
  107. //
  108. //------------------------------------------------------------------------------
  109. //--------------------------------------
  110. // State
  111. //--------------------------------------
  112. // Opens the log file and reads existing entries. The log can remain open and
  113. // continue to append entries to the end of the log.
  114. func (l *Log) open(path string) error {
  115. // Read all the entries from the log if one exists.
  116. var readBytes int64
  117. var err error
  118. debugln("log.open.open ", path)
  119. // open log file
  120. l.file, err = os.OpenFile(path, os.O_RDWR, 0600)
  121. l.path = path
  122. if err != nil {
  123. // if the log file does not exist before
  124. // we create the log file and set commitIndex to 0
  125. if os.IsNotExist(err) {
  126. l.file, err = os.OpenFile(path, os.O_WRONLY|os.O_CREATE, 0600)
  127. debugln("log.open.create ", path)
  128. if err == nil {
  129. l.initialized = true
  130. }
  131. return err
  132. }
  133. return err
  134. }
  135. debugln("log.open.exist ", path)
  136. // Read the file and decode entries.
  137. for {
  138. // Instantiate log entry and decode into it.
  139. entry, _ := newLogEntry(l, nil, 0, 0, nil)
  140. entry.Position, _ = l.file.Seek(0, os.SEEK_CUR)
  141. n, err := entry.Decode(l.file)
  142. if err != nil {
  143. if err == io.EOF {
  144. debugln("open.log.append: finish ")
  145. } else {
  146. if err = l.file.Truncate(readBytes); err != nil {
  147. return fmt.Errorf("raft.Log: Unable to recover: %v", err)
  148. }
  149. l.file.Seek(readBytes, os.SEEK_SET)
  150. }
  151. break
  152. }
  153. if entry.Index() > l.startIndex {
  154. // Append entry.
  155. l.entries = append(l.entries, entry)
  156. if entry.Index() <= l.commitIndex {
  157. command, err := newCommand(entry.CommandName(), entry.Command())
  158. if err != nil {
  159. continue
  160. }
  161. l.ApplyFunc(entry, command)
  162. }
  163. debugln("open.log.append log index ", entry.Index())
  164. }
  165. readBytes += int64(n)
  166. }
  167. debugln("open.log.recovery number of log ", len(l.entries))
  168. l.initialized = true
  169. return nil
  170. }
  171. // Closes the log file.
  172. func (l *Log) close() {
  173. l.mutex.Lock()
  174. defer l.mutex.Unlock()
  175. if l.file != nil {
  176. l.file.Close()
  177. l.file = nil
  178. }
  179. l.entries = make([]*LogEntry, 0)
  180. }
  181. // sync to disk
  182. func (l *Log) sync() error {
  183. return l.file.Sync()
  184. }
  185. //--------------------------------------
  186. // Entries
  187. //--------------------------------------
  188. // Creates a log entry associated with this log.
  189. func (l *Log) createEntry(term uint64, command Command, e *ev) (*LogEntry, error) {
  190. return newLogEntry(l, e, l.nextIndex(), term, command)
  191. }
  192. // Retrieves an entry from the log. If the entry has been eliminated because
  193. // of a snapshot then nil is returned.
  194. func (l *Log) getEntry(index uint64) *LogEntry {
  195. l.mutex.RLock()
  196. defer l.mutex.RUnlock()
  197. if index <= l.startIndex || index > (l.startIndex+uint64(len(l.entries))) {
  198. return nil
  199. }
  200. return l.entries[index-l.startIndex-1]
  201. }
  202. // Checks if the log contains a given index/term combination.
  203. func (l *Log) containsEntry(index uint64, term uint64) bool {
  204. entry := l.getEntry(index)
  205. return (entry != nil && entry.Term() == term)
  206. }
  207. // Retrieves a list of entries after a given index as well as the term of the
  208. // index provided. A nil list of entries is returned if the index no longer
  209. // exists because a snapshot was made.
  210. func (l *Log) getEntriesAfter(index uint64, maxLogEntriesPerRequest uint64) ([]*LogEntry, uint64) {
  211. l.mutex.RLock()
  212. defer l.mutex.RUnlock()
  213. // Return nil if index is before the start of the log.
  214. if index < l.startIndex {
  215. traceln("log.entriesAfter.before: ", index, " ", l.startIndex)
  216. return nil, 0
  217. }
  218. // Return an error if the index doesn't exist.
  219. if index > (uint64(len(l.entries)) + l.startIndex) {
  220. panic(fmt.Sprintf("raft: Index is beyond end of log: %v %v", len(l.entries), index))
  221. }
  222. // If we're going from the beginning of the log then return the whole log.
  223. if index == l.startIndex {
  224. traceln("log.entriesAfter.beginning: ", index, " ", l.startIndex)
  225. return l.entries, l.startTerm
  226. }
  227. traceln("log.entriesAfter.partial: ", index, " ", l.entries[len(l.entries)-1].Index)
  228. entries := l.entries[index-l.startIndex:]
  229. length := len(entries)
  230. traceln("log.entriesAfter: startIndex:", l.startIndex, " length", len(l.entries))
  231. if uint64(length) < maxLogEntriesPerRequest {
  232. // Determine the term at the given entry and return a subslice.
  233. return entries, l.entries[index-1-l.startIndex].Term()
  234. } else {
  235. return entries[:maxLogEntriesPerRequest], l.entries[index-1-l.startIndex].Term()
  236. }
  237. }
  238. //--------------------------------------
  239. // Commit
  240. //--------------------------------------
  241. // Retrieves the last index and term that has been committed to the log.
  242. func (l *Log) commitInfo() (index uint64, term uint64) {
  243. l.mutex.RLock()
  244. defer l.mutex.RUnlock()
  245. // If we don't have any committed entries then just return zeros.
  246. if l.commitIndex == 0 {
  247. return 0, 0
  248. }
  249. // No new commit log after snapshot
  250. if l.commitIndex == l.startIndex {
  251. return l.startIndex, l.startTerm
  252. }
  253. // Return the last index & term from the last committed entry.
  254. debugln("commitInfo.get.[", l.commitIndex, "/", l.startIndex, "]")
  255. entry := l.entries[l.commitIndex-1-l.startIndex]
  256. return entry.Index(), entry.Term()
  257. }
  258. // Retrieves the last index and term that has been appended to the log.
  259. func (l *Log) lastInfo() (index uint64, term uint64) {
  260. l.mutex.RLock()
  261. defer l.mutex.RUnlock()
  262. // If we don't have any entries then just return zeros.
  263. if len(l.entries) == 0 {
  264. return l.startIndex, l.startTerm
  265. }
  266. // Return the last index & term
  267. entry := l.entries[len(l.entries)-1]
  268. return entry.Index(), entry.Term()
  269. }
  270. // Updates the commit index
  271. func (l *Log) updateCommitIndex(index uint64) {
  272. l.mutex.Lock()
  273. defer l.mutex.Unlock()
  274. if index > l.commitIndex {
  275. l.commitIndex = index
  276. }
  277. debugln("update.commit.index ", index)
  278. }
  279. // Updates the commit index and writes entries after that index to the stable storage.
  280. func (l *Log) setCommitIndex(index uint64) error {
  281. l.mutex.Lock()
  282. defer l.mutex.Unlock()
  283. // this is not error any more after limited the number of sending entries
  284. // commit up to what we already have
  285. if index > l.startIndex+uint64(len(l.entries)) {
  286. debugln("raft.Log: Commit index", index, "set back to ", len(l.entries))
  287. index = l.startIndex + uint64(len(l.entries))
  288. }
  289. // Do not allow previous indices to be committed again.
  290. // This could happens, since the guarantee is that the new leader has up-to-dated
  291. // log entries rather than has most up-to-dated committed index
  292. // For example, Leader 1 send log 80 to follower 2 and follower 3
  293. // follower 2 and follow 3 all got the new entries and reply
  294. // leader 1 committed entry 80 and send reply to follower 2 and follower3
  295. // follower 2 receive the new committed index and update committed index to 80
  296. // leader 1 fail to send the committed index to follower 3
  297. // follower 3 promote to leader (server 1 and server 2 will vote, since leader 3
  298. // has up-to-dated the entries)
  299. // when new leader 3 send heartbeat with committed index = 0 to follower 2,
  300. // follower 2 should reply success and let leader 3 update the committed index to 80
  301. if index < l.commitIndex {
  302. return nil
  303. }
  304. // Find all entries whose index is between the previous index and the current index.
  305. for i := l.commitIndex + 1; i <= index; i++ {
  306. entryIndex := i - 1 - l.startIndex
  307. entry := l.entries[entryIndex]
  308. // Update commit index.
  309. l.commitIndex = entry.Index()
  310. // Decode the command.
  311. command, err := newCommand(entry.CommandName(), entry.Command())
  312. if err != nil {
  313. return err
  314. }
  315. // Apply the changes to the state machine and store the error code.
  316. returnValue, err := l.ApplyFunc(entry, command)
  317. debugf("setCommitIndex.set.result index: %v, entries index: %v", i, entryIndex)
  318. if entry.event != nil {
  319. entry.event.returnValue = returnValue
  320. entry.event.c <- err
  321. }
  322. }
  323. return nil
  324. }
  325. // Set the commitIndex at the head of the log file to the current
  326. // commit Index. This should be called after obtained a log lock
  327. func (l *Log) flushCommitIndex() {
  328. l.file.Seek(0, os.SEEK_SET)
  329. fmt.Fprintf(l.file, "%8x\n", l.commitIndex)
  330. l.file.Seek(0, os.SEEK_END)
  331. }
  332. //--------------------------------------
  333. // Truncation
  334. //--------------------------------------
  335. // Truncates the log to the given index and term. This only works if the log
  336. // at the index has not been committed.
  337. func (l *Log) truncate(index uint64, term uint64) error {
  338. l.mutex.Lock()
  339. defer l.mutex.Unlock()
  340. debugln("log.truncate: ", index)
  341. // Do not allow committed entries to be truncated.
  342. if index < l.commitIndex {
  343. debugln("log.truncate.before")
  344. return fmt.Errorf("raft.Log: Index is already committed (%v): (IDX=%v, TERM=%v)", l.commitIndex, index, term)
  345. }
  346. // Do not truncate past end of entries.
  347. if index > l.startIndex+uint64(len(l.entries)) {
  348. debugln("log.truncate.after")
  349. return fmt.Errorf("raft.Log: Entry index does not exist (MAX=%v): (IDX=%v, TERM=%v)", len(l.entries), index, term)
  350. }
  351. // If we're truncating everything then just clear the entries.
  352. if index == l.startIndex {
  353. debugln("log.truncate.clear")
  354. l.file.Truncate(0)
  355. l.file.Seek(0, os.SEEK_SET)
  356. // notify clients if this node is the previous leader
  357. for _, entry := range l.entries {
  358. if entry.event != nil {
  359. entry.event.c <- errors.New("command failed to be committed due to node failure")
  360. }
  361. }
  362. l.entries = []*LogEntry{}
  363. } else {
  364. // Do not truncate if the entry at index does not have the matching term.
  365. entry := l.entries[index-l.startIndex-1]
  366. if len(l.entries) > 0 && entry.Term() != term {
  367. debugln("log.truncate.termMismatch")
  368. return fmt.Errorf("raft.Log: Entry at index does not have matching term (%v): (IDX=%v, TERM=%v)", entry.Term(), index, term)
  369. }
  370. // Otherwise truncate up to the desired entry.
  371. if index < l.startIndex+uint64(len(l.entries)) {
  372. debugln("log.truncate.finish")
  373. position := l.entries[index-l.startIndex].Position
  374. l.file.Truncate(position)
  375. l.file.Seek(position, os.SEEK_SET)
  376. // notify clients if this node is the previous leader
  377. for i := index - l.startIndex; i < uint64(len(l.entries)); i++ {
  378. entry := l.entries[i]
  379. if entry.event != nil {
  380. entry.event.c <- errors.New("command failed to be committed due to node failure")
  381. }
  382. }
  383. l.entries = l.entries[0 : index-l.startIndex]
  384. }
  385. }
  386. return nil
  387. }
  388. //--------------------------------------
  389. // Append
  390. //--------------------------------------
  391. // Appends a series of entries to the log.
  392. func (l *Log) appendEntries(entries []*protobuf.LogEntry) error {
  393. l.mutex.Lock()
  394. defer l.mutex.Unlock()
  395. startPosition, _ := l.file.Seek(0, os.SEEK_CUR)
  396. w := bufio.NewWriter(l.file)
  397. var size int64
  398. var err error
  399. // Append each entry but exit if we hit an error.
  400. for i := range entries {
  401. logEntry := &LogEntry{
  402. log: l,
  403. Position: startPosition,
  404. pb: entries[i],
  405. }
  406. if size, err = l.writeEntry(logEntry, w); err != nil {
  407. return err
  408. }
  409. startPosition += size
  410. }
  411. w.Flush()
  412. err = l.sync()
  413. if err != nil {
  414. panic(err)
  415. }
  416. return nil
  417. }
  418. // Writes a single log entry to the end of the log.
  419. func (l *Log) appendEntry(entry *LogEntry) error {
  420. l.mutex.Lock()
  421. defer l.mutex.Unlock()
  422. if l.file == nil {
  423. return errors.New("raft.Log: Log is not open")
  424. }
  425. // Make sure the term and index are greater than the previous.
  426. if len(l.entries) > 0 {
  427. lastEntry := l.entries[len(l.entries)-1]
  428. if entry.Term() < lastEntry.Term() {
  429. return fmt.Errorf("raft.Log: Cannot append entry with earlier term (%x:%x <= %x:%x)", entry.Term(), entry.Index(), lastEntry.Term(), lastEntry.Index())
  430. } else if entry.Term() == lastEntry.Term() && entry.Index() <= lastEntry.Index() {
  431. return fmt.Errorf("raft.Log: Cannot append entry with earlier index in the same term (%x:%x <= %x:%x)", entry.Term(), entry.Index(), lastEntry.Term(), lastEntry.Index())
  432. }
  433. }
  434. position, _ := l.file.Seek(0, os.SEEK_CUR)
  435. entry.Position = position
  436. // Write to storage.
  437. if _, err := entry.Encode(l.file); err != nil {
  438. return err
  439. }
  440. // Append to entries list if stored on disk.
  441. l.entries = append(l.entries, entry)
  442. return nil
  443. }
  444. // appendEntry with Buffered io
  445. func (l *Log) writeEntry(entry *LogEntry, w io.Writer) (int64, error) {
  446. if l.file == nil {
  447. return -1, errors.New("raft.Log: Log is not open")
  448. }
  449. // Make sure the term and index are greater than the previous.
  450. if len(l.entries) > 0 {
  451. lastEntry := l.entries[len(l.entries)-1]
  452. if entry.Term() < lastEntry.Term() {
  453. return -1, fmt.Errorf("raft.Log: Cannot append entry with earlier term (%x:%x <= %x:%x)", entry.Term(), entry.Index(), lastEntry.Term(), lastEntry.Index())
  454. } else if entry.Term() == lastEntry.Term() && entry.Index() <= lastEntry.Index() {
  455. return -1, fmt.Errorf("raft.Log: Cannot append entry with earlier index in the same term (%x:%x <= %x:%x)", entry.Term(), entry.Index(), lastEntry.Term(), lastEntry.Index())
  456. }
  457. }
  458. // Write to storage.
  459. size, err := entry.Encode(w)
  460. if err != nil {
  461. return -1, err
  462. }
  463. // Append to entries list if stored on disk.
  464. l.entries = append(l.entries, entry)
  465. return int64(size), nil
  466. }
  467. //--------------------------------------
  468. // Log compaction
  469. //--------------------------------------
  470. // compact the log before index (including index)
  471. func (l *Log) compact(index uint64, term uint64) error {
  472. var entries []*LogEntry
  473. l.mutex.Lock()
  474. defer l.mutex.Unlock()
  475. if index == 0 {
  476. return nil
  477. }
  478. // nothing to compaction
  479. // the index may be greater than the current index if
  480. // we just recovery from on snapshot
  481. if index >= l.internalCurrentIndex() {
  482. entries = make([]*LogEntry, 0)
  483. } else {
  484. // get all log entries after index
  485. entries = l.entries[index-l.startIndex:]
  486. }
  487. // create a new log file and add all the entries
  488. new_file_path := l.path + ".new"
  489. file, err := os.OpenFile(new_file_path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
  490. if err != nil {
  491. return err
  492. }
  493. for _, entry := range entries {
  494. position, _ := l.file.Seek(0, os.SEEK_CUR)
  495. entry.Position = position
  496. if _, err = entry.Encode(file); err != nil {
  497. file.Close()
  498. os.Remove(new_file_path)
  499. return err
  500. }
  501. }
  502. file.Sync()
  503. old_file := l.file
  504. // rename the new log file
  505. err = os.Rename(new_file_path, l.path)
  506. if err != nil {
  507. file.Close()
  508. os.Remove(new_file_path)
  509. return err
  510. }
  511. l.file = file
  512. // close the old log file
  513. old_file.Close()
  514. // compaction the in memory log
  515. l.entries = entries
  516. l.startIndex = index
  517. l.startTerm = term
  518. return nil
  519. }