raft.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587
  1. package raft
  2. import (
  3. "errors"
  4. "fmt"
  5. "math/rand"
  6. "sort"
  7. pb "github.com/coreos/etcd/raft/raftpb"
  8. )
  9. // None is a placeholder node ID used when there is no leader.
  10. const None uint64 = 0
  11. var errNoLeader = errors.New("no leader")
  12. // Possible values for StateType.
  13. const (
  14. StateFollower StateType = iota
  15. StateCandidate
  16. StateLeader
  17. )
  18. // StateType represents the role of a node in a cluster.
  19. type StateType uint64
  20. var stmap = [...]string{
  21. "StateFollower",
  22. "StateCandidate",
  23. "StateLeader",
  24. }
  25. func (st StateType) String() string {
  26. return stmap[uint64(st)]
  27. }
  28. type progress struct {
  29. match, next uint64
  30. }
  31. func (pr *progress) update(n uint64) {
  32. pr.match = n
  33. pr.next = n + 1
  34. }
  35. // maybeDecrTo returns false if the given to index comes from an out of order message.
  36. // Otherwise it decreases the progress next index and returns true.
  37. func (pr *progress) maybeDecrTo(to uint64) bool {
  38. // the rejection must be stale if the
  39. // progress has matched with follower
  40. // or "to" does not match next - 1
  41. if pr.match != 0 || pr.next-1 != to {
  42. return false
  43. }
  44. if pr.next--; pr.next < 1 {
  45. pr.next = 1
  46. }
  47. return true
  48. }
  49. func (pr *progress) String() string {
  50. return fmt.Sprintf("n=%d m=%d", pr.next, pr.match)
  51. }
  52. // uint64Slice implements sort interface
  53. type uint64Slice []uint64
  54. func (p uint64Slice) Len() int { return len(p) }
  55. func (p uint64Slice) Less(i, j int) bool { return p[i] < p[j] }
  56. func (p uint64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  57. type raft struct {
  58. pb.HardState
  59. id uint64
  60. // the log
  61. raftLog *raftLog
  62. prs map[uint64]*progress
  63. state StateType
  64. votes map[uint64]bool
  65. msgs []pb.Message
  66. // the leader id
  67. lead uint64
  68. // New configuration is ignored if there exists unapplied configuration.
  69. pendingConf bool
  70. // TODO: need GC and recovery from snapshot
  71. removed map[uint64]bool
  72. elapsed int // number of ticks since the last msg
  73. heartbeatTimeout int
  74. electionTimeout int
  75. tick func()
  76. step stepFunc
  77. }
  78. func newRaft(id uint64, peers []uint64, election, heartbeat int) *raft {
  79. if id == None {
  80. panic("cannot use none id")
  81. }
  82. rand.Seed(int64(id))
  83. r := &raft{
  84. id: id,
  85. lead: None,
  86. raftLog: newLog(),
  87. prs: make(map[uint64]*progress),
  88. removed: make(map[uint64]bool),
  89. electionTimeout: election,
  90. heartbeatTimeout: heartbeat,
  91. }
  92. for _, p := range peers {
  93. r.prs[p] = &progress{}
  94. }
  95. r.becomeFollower(0, None)
  96. return r
  97. }
  98. func (r *raft) hasLeader() bool { return r.lead != None }
  99. func (r *raft) shouldStop() bool { return r.removed[r.id] }
  100. func (r *raft) softState() *SoftState {
  101. return &SoftState{Lead: r.lead, RaftState: r.state, Nodes: r.nodes(), ShouldStop: r.shouldStop()}
  102. }
  103. func (r *raft) String() string {
  104. s := fmt.Sprintf(`state=%v term=%d`, r.state, r.Term)
  105. switch r.state {
  106. case StateFollower:
  107. s += fmt.Sprintf(" vote=%v lead=%v", r.Vote, r.lead)
  108. case StateCandidate:
  109. s += fmt.Sprintf(` votes="%v"`, r.votes)
  110. case StateLeader:
  111. s += fmt.Sprintf(` prs="%v"`, r.prs)
  112. }
  113. return s
  114. }
  115. func (r *raft) poll(id uint64, v bool) (granted int) {
  116. if _, ok := r.votes[id]; !ok {
  117. r.votes[id] = v
  118. }
  119. for _, vv := range r.votes {
  120. if vv {
  121. granted++
  122. }
  123. }
  124. return granted
  125. }
  126. // send persists state to stable storage and then sends to its mailbox.
  127. func (r *raft) send(m pb.Message) {
  128. m.From = r.id
  129. // do not attach term to msgProp
  130. // proposals are a way to forward to the leader and
  131. // should be treated as local message.
  132. if m.Type != pb.MsgProp {
  133. m.Term = r.Term
  134. }
  135. r.msgs = append(r.msgs, m)
  136. }
  137. // sendAppend sends RRPC, with entries to the given peer.
  138. func (r *raft) sendAppend(to uint64) {
  139. pr := r.prs[to]
  140. m := pb.Message{}
  141. m.To = to
  142. m.Index = pr.next - 1
  143. if r.needSnapshot(m.Index) {
  144. m.Type = pb.MsgSnap
  145. m.Snapshot = r.raftLog.snapshot
  146. } else {
  147. m.Type = pb.MsgApp
  148. m.LogTerm = r.raftLog.term(pr.next - 1)
  149. m.Entries = r.raftLog.entries(pr.next)
  150. m.Commit = r.raftLog.committed
  151. }
  152. r.send(m)
  153. }
  154. // sendHeartbeat sends an empty msgApp
  155. func (r *raft) sendHeartbeat(to uint64) {
  156. m := pb.Message{
  157. To: to,
  158. Type: pb.MsgApp,
  159. }
  160. r.send(m)
  161. }
  162. // bcastAppend sends RRPC, with entries to all peers that are not up-to-date according to r.mis.
  163. func (r *raft) bcastAppend() {
  164. for i := range r.prs {
  165. if i == r.id {
  166. continue
  167. }
  168. r.sendAppend(i)
  169. }
  170. }
  171. // bcastHeartbeat sends RRPC, without entries to all the peers.
  172. func (r *raft) bcastHeartbeat() {
  173. for i := range r.prs {
  174. if i == r.id {
  175. continue
  176. }
  177. r.sendHeartbeat(i)
  178. }
  179. }
  180. func (r *raft) maybeCommit() bool {
  181. // TODO(bmizerany): optimize.. Currently naive
  182. mis := make(uint64Slice, 0, len(r.prs))
  183. for i := range r.prs {
  184. mis = append(mis, r.prs[i].match)
  185. }
  186. sort.Sort(sort.Reverse(mis))
  187. mci := mis[r.q()-1]
  188. return r.raftLog.maybeCommit(mci, r.Term)
  189. }
  190. func (r *raft) reset(term uint64) {
  191. r.Term = term
  192. r.lead = None
  193. r.Vote = None
  194. r.elapsed = 0
  195. r.votes = make(map[uint64]bool)
  196. for i := range r.prs {
  197. r.prs[i] = &progress{next: r.raftLog.lastIndex() + 1}
  198. if i == r.id {
  199. r.prs[i].match = r.raftLog.lastIndex()
  200. }
  201. }
  202. r.pendingConf = false
  203. }
  204. func (r *raft) q() int {
  205. return len(r.prs)/2 + 1
  206. }
  207. func (r *raft) appendEntry(e pb.Entry) {
  208. e.Term = r.Term
  209. e.Index = r.raftLog.lastIndex() + 1
  210. r.raftLog.append(r.raftLog.lastIndex(), e)
  211. r.prs[r.id].update(r.raftLog.lastIndex())
  212. r.maybeCommit()
  213. }
  214. // tickElection is ran by followers and candidates after r.electionTimeout.
  215. func (r *raft) tickElection() {
  216. if !r.promotable() {
  217. r.elapsed = 0
  218. return
  219. }
  220. r.elapsed++
  221. if r.isElectionTimeout() {
  222. r.elapsed = 0
  223. r.Step(pb.Message{From: r.id, Type: pb.MsgHup})
  224. }
  225. }
  226. // tickHeartbeat is ran by leaders to send a msgBeat after r.heartbeatTimeout.
  227. func (r *raft) tickHeartbeat() {
  228. r.elapsed++
  229. if r.elapsed > r.heartbeatTimeout {
  230. r.elapsed = 0
  231. r.Step(pb.Message{From: r.id, Type: pb.MsgBeat})
  232. }
  233. }
  234. func (r *raft) becomeFollower(term uint64, lead uint64) {
  235. r.step = stepFollower
  236. r.reset(term)
  237. r.tick = r.tickElection
  238. r.lead = lead
  239. r.state = StateFollower
  240. }
  241. func (r *raft) becomeCandidate() {
  242. // TODO(xiangli) remove the panic when the raft implementation is stable
  243. if r.state == StateLeader {
  244. panic("invalid transition [leader -> candidate]")
  245. }
  246. r.step = stepCandidate
  247. r.reset(r.Term + 1)
  248. r.tick = r.tickElection
  249. r.Vote = r.id
  250. r.state = StateCandidate
  251. }
  252. func (r *raft) becomeLeader() {
  253. // TODO(xiangli) remove the panic when the raft implementation is stable
  254. if r.state == StateFollower {
  255. panic("invalid transition [follower -> leader]")
  256. }
  257. r.step = stepLeader
  258. r.reset(r.Term)
  259. r.tick = r.tickHeartbeat
  260. r.lead = r.id
  261. r.state = StateLeader
  262. for _, e := range r.raftLog.entries(r.raftLog.committed + 1) {
  263. if e.Type != pb.EntryConfChange {
  264. continue
  265. }
  266. if r.pendingConf {
  267. panic("unexpected double uncommitted config entry")
  268. }
  269. r.pendingConf = true
  270. }
  271. r.appendEntry(pb.Entry{Data: nil})
  272. }
  273. func (r *raft) ReadMessages() []pb.Message {
  274. msgs := r.msgs
  275. r.msgs = make([]pb.Message, 0)
  276. return msgs
  277. }
  278. func (r *raft) campaign() {
  279. r.becomeCandidate()
  280. if r.q() == r.poll(r.id, true) {
  281. r.becomeLeader()
  282. }
  283. for i := range r.prs {
  284. if i == r.id {
  285. continue
  286. }
  287. lasti := r.raftLog.lastIndex()
  288. r.send(pb.Message{To: i, Type: pb.MsgVote, Index: lasti, LogTerm: r.raftLog.term(lasti)})
  289. }
  290. }
  291. func (r *raft) Step(m pb.Message) error {
  292. // TODO(bmizerany): this likely allocs - prevent that.
  293. defer func() { r.Commit = r.raftLog.committed }()
  294. if r.removed[m.From] {
  295. if m.From != r.id {
  296. r.send(pb.Message{To: m.From, Type: pb.MsgDenied})
  297. }
  298. // TODO: return an error?
  299. return nil
  300. }
  301. if m.Type == pb.MsgDenied {
  302. r.removed[r.id] = true
  303. // TODO: return an error?
  304. return nil
  305. }
  306. if m.Type == pb.MsgHup {
  307. r.campaign()
  308. }
  309. switch {
  310. case m.Term == 0:
  311. // local message
  312. case m.Term > r.Term:
  313. lead := m.From
  314. if m.Type == pb.MsgVote {
  315. lead = None
  316. }
  317. r.becomeFollower(m.Term, lead)
  318. case m.Term < r.Term:
  319. // ignore
  320. return nil
  321. }
  322. r.step(r, m)
  323. return nil
  324. }
  325. func (r *raft) handleAppendEntries(m pb.Message) {
  326. if r.raftLog.maybeAppend(m.Index, m.LogTerm, m.Commit, m.Entries...) {
  327. r.send(pb.Message{To: m.From, Type: pb.MsgAppResp, Index: r.raftLog.lastIndex()})
  328. } else {
  329. r.send(pb.Message{To: m.From, Type: pb.MsgAppResp, Index: m.Index, Reject: true})
  330. }
  331. }
  332. func (r *raft) handleSnapshot(m pb.Message) {
  333. if r.restore(m.Snapshot) {
  334. r.send(pb.Message{To: m.From, Type: pb.MsgAppResp, Index: r.raftLog.lastIndex()})
  335. } else {
  336. r.send(pb.Message{To: m.From, Type: pb.MsgAppResp, Index: r.raftLog.committed})
  337. }
  338. }
  339. func (r *raft) addNode(id uint64) {
  340. r.setProgress(id, 0, r.raftLog.lastIndex()+1)
  341. r.pendingConf = false
  342. }
  343. func (r *raft) removeNode(id uint64) {
  344. r.delProgress(id)
  345. r.pendingConf = false
  346. r.removed[id] = true
  347. }
  348. type stepFunc func(r *raft, m pb.Message)
  349. func stepLeader(r *raft, m pb.Message) {
  350. switch m.Type {
  351. case pb.MsgBeat:
  352. r.bcastHeartbeat()
  353. case pb.MsgProp:
  354. if len(m.Entries) != 1 {
  355. panic("unexpected length(entries) of a msgProp")
  356. }
  357. e := m.Entries[0]
  358. if e.Type == pb.EntryConfChange {
  359. if r.pendingConf {
  360. return
  361. }
  362. r.pendingConf = true
  363. }
  364. r.appendEntry(e)
  365. r.bcastAppend()
  366. case pb.MsgAppResp:
  367. if m.Reject {
  368. if r.prs[m.From].maybeDecrTo(m.Index) {
  369. r.sendAppend(m.From)
  370. }
  371. } else {
  372. r.prs[m.From].update(m.Index)
  373. if r.maybeCommit() {
  374. r.bcastAppend()
  375. }
  376. }
  377. case pb.MsgVote:
  378. r.send(pb.Message{To: m.From, Type: pb.MsgVoteResp, Reject: true})
  379. }
  380. }
  381. func stepCandidate(r *raft, m pb.Message) {
  382. switch m.Type {
  383. case pb.MsgProp:
  384. panic("no leader")
  385. case pb.MsgApp:
  386. r.becomeFollower(r.Term, m.From)
  387. r.handleAppendEntries(m)
  388. case pb.MsgSnap:
  389. r.becomeFollower(m.Term, m.From)
  390. r.handleSnapshot(m)
  391. case pb.MsgVote:
  392. r.send(pb.Message{To: m.From, Type: pb.MsgVoteResp, Reject: true})
  393. case pb.MsgVoteResp:
  394. gr := r.poll(m.From, !m.Reject)
  395. switch r.q() {
  396. case gr:
  397. r.becomeLeader()
  398. r.bcastAppend()
  399. case len(r.votes) - gr:
  400. r.becomeFollower(r.Term, None)
  401. }
  402. }
  403. }
  404. func stepFollower(r *raft, m pb.Message) {
  405. switch m.Type {
  406. case pb.MsgProp:
  407. if r.lead == None {
  408. panic("no leader")
  409. }
  410. m.To = r.lead
  411. r.send(m)
  412. case pb.MsgApp:
  413. r.elapsed = 0
  414. r.lead = m.From
  415. r.handleAppendEntries(m)
  416. case pb.MsgSnap:
  417. r.elapsed = 0
  418. r.handleSnapshot(m)
  419. case pb.MsgVote:
  420. if (r.Vote == None || r.Vote == m.From) && r.raftLog.isUpToDate(m.Index, m.LogTerm) {
  421. r.elapsed = 0
  422. r.Vote = m.From
  423. r.send(pb.Message{To: m.From, Type: pb.MsgVoteResp})
  424. } else {
  425. r.send(pb.Message{To: m.From, Type: pb.MsgVoteResp, Reject: true})
  426. }
  427. }
  428. }
  429. func (r *raft) compact(index uint64, nodes []uint64, d []byte) {
  430. if index > r.raftLog.applied {
  431. panic(fmt.Sprintf("raft: compact index (%d) exceeds applied index (%d)", index, r.raftLog.applied))
  432. }
  433. // We do not get the removed nodes at the given index.
  434. // We get the removed nodes at current index. So a state machine might
  435. // have a newer verison of removed nodes after recovery. It is OK.
  436. r.raftLog.snap(d, index, r.raftLog.term(index), nodes, r.removedNodes())
  437. r.raftLog.compact(index)
  438. }
  439. // restore recovers the statemachine from a snapshot. It restores the log and the
  440. // configuration of statemachine.
  441. func (r *raft) restore(s pb.Snapshot) bool {
  442. if s.Index <= r.raftLog.committed {
  443. return false
  444. }
  445. r.raftLog.restore(s)
  446. r.prs = make(map[uint64]*progress)
  447. for _, n := range s.Nodes {
  448. if n == r.id {
  449. r.setProgress(n, r.raftLog.lastIndex(), r.raftLog.lastIndex()+1)
  450. } else {
  451. r.setProgress(n, 0, r.raftLog.lastIndex()+1)
  452. }
  453. }
  454. r.removed = make(map[uint64]bool)
  455. for _, n := range s.RemovedNodes {
  456. r.removed[n] = true
  457. }
  458. return true
  459. }
  460. func (r *raft) needSnapshot(i uint64) bool {
  461. if i < r.raftLog.offset {
  462. if r.raftLog.snapshot.Term == 0 {
  463. panic("need non-empty snapshot")
  464. }
  465. return true
  466. }
  467. return false
  468. }
  469. func (r *raft) nodes() []uint64 {
  470. nodes := make([]uint64, 0, len(r.prs))
  471. for k := range r.prs {
  472. nodes = append(nodes, k)
  473. }
  474. return nodes
  475. }
  476. func (r *raft) removedNodes() []uint64 {
  477. removed := make([]uint64, 0, len(r.removed))
  478. for k := range r.removed {
  479. removed = append(removed, k)
  480. }
  481. return removed
  482. }
  483. func (r *raft) setProgress(id, match, next uint64) {
  484. r.prs[id] = &progress{next: next, match: match}
  485. }
  486. func (r *raft) delProgress(id uint64) {
  487. delete(r.prs, id)
  488. }
  489. // promotable indicates whether state machine can be promoted to leader,
  490. // which is true when its own id is in progress list.
  491. func (r *raft) promotable() bool {
  492. _, ok := r.prs[r.id]
  493. return ok
  494. }
  495. func (r *raft) loadEnts(ents []pb.Entry) {
  496. r.raftLog.load(ents)
  497. }
  498. func (r *raft) loadState(state pb.HardState) {
  499. r.raftLog.committed = state.Commit
  500. r.Term = state.Term
  501. r.Vote = state.Vote
  502. r.Commit = state.Commit
  503. }
  504. // isElectionTimeout returns true if r.elapsed is greater than the
  505. // randomized election timeout in (electiontimeout, 2 * electiontimeout - 1).
  506. // Otherwise, it returns false.
  507. func (r *raft) isElectionTimeout() bool {
  508. d := r.elapsed - r.electionTimeout
  509. if d < 0 {
  510. return false
  511. }
  512. return d > rand.Int()%r.electionTimeout
  513. }