raft.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602
  1. package raft
  2. import (
  3. "errors"
  4. "fmt"
  5. "math/rand"
  6. "sort"
  7. pb "github.com/coreos/etcd/raft/raftpb"
  8. )
  9. // None is a placeholder node ID used when there is no leader.
  10. const None int64 = 0
  11. type messageType int64
  12. const (
  13. msgHup int64 = iota
  14. msgBeat
  15. msgProp
  16. msgApp
  17. msgAppResp
  18. msgVote
  19. msgVoteResp
  20. msgSnap
  21. msgDenied
  22. )
  23. var mtmap = [...]string{
  24. "msgHup",
  25. "msgBeat",
  26. "msgProp",
  27. "msgApp",
  28. "msgAppResp",
  29. "msgVote",
  30. "msgVoteResp",
  31. "msgSnap",
  32. "msgDenied",
  33. }
  34. func (mt messageType) String() string {
  35. return mtmap[int64(mt)]
  36. }
  37. var errNoLeader = errors.New("no leader")
  38. // Possible values for StateType.
  39. const (
  40. StateFollower StateType = iota
  41. StateCandidate
  42. StateLeader
  43. )
  44. // StateType represents the role of a node in a cluster.
  45. type StateType int64
  46. var stmap = [...]string{
  47. "StateFollower",
  48. "StateCandidate",
  49. "StateLeader",
  50. }
  51. func (st StateType) String() string {
  52. return stmap[int64(st)]
  53. }
  54. type progress struct {
  55. match, next int64
  56. }
  57. func (pr *progress) update(n int64) {
  58. pr.match = n
  59. pr.next = n + 1
  60. }
  61. // maybeDecrTo returns false if the given to index comes from an out of order message.
  62. // Otherwise it decreases the progress next index and returns true.
  63. func (pr *progress) maybeDecrTo(to int64) bool {
  64. // the rejection must be stale if the
  65. // progress has matched with follower
  66. // or "to" does not match next - 1
  67. if pr.match != 0 || pr.next-1 != to {
  68. return false
  69. }
  70. if pr.next--; pr.next < 1 {
  71. pr.next = 1
  72. }
  73. return true
  74. }
  75. func (pr *progress) String() string {
  76. return fmt.Sprintf("n=%d m=%d", pr.next, pr.match)
  77. }
  78. // int64Slice implements sort interface
  79. type int64Slice []int64
  80. func (p int64Slice) Len() int { return len(p) }
  81. func (p int64Slice) Less(i, j int) bool { return p[i] < p[j] }
  82. func (p int64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  83. type raft struct {
  84. pb.HardState
  85. id int64
  86. // the log
  87. raftLog *raftLog
  88. prs map[int64]*progress
  89. state StateType
  90. votes map[int64]bool
  91. msgs []pb.Message
  92. // the leader id
  93. lead int64
  94. // New configuration is ignored if there exists unapplied configuration.
  95. pendingConf bool
  96. // TODO: need GC and recovery from snapshot
  97. removed map[int64]bool
  98. elapsed int // number of ticks since the last msg
  99. heartbeatTimeout int
  100. electionTimeout int
  101. tick func()
  102. step stepFunc
  103. }
  104. func newRaft(id int64, peers []int64, election, heartbeat int) *raft {
  105. if id == None {
  106. panic("cannot use none id")
  107. }
  108. rand.Seed(id)
  109. r := &raft{
  110. id: id,
  111. lead: None,
  112. raftLog: newLog(),
  113. prs: make(map[int64]*progress),
  114. removed: make(map[int64]bool),
  115. electionTimeout: election,
  116. heartbeatTimeout: heartbeat,
  117. }
  118. for _, p := range peers {
  119. r.prs[p] = &progress{}
  120. }
  121. r.becomeFollower(0, None)
  122. return r
  123. }
  124. func (r *raft) hasLeader() bool { return r.lead != None }
  125. func (r *raft) shouldStop() bool { return r.removed[r.id] }
  126. func (r *raft) softState() *SoftState {
  127. return &SoftState{Lead: r.lead, RaftState: r.state, Nodes: r.nodes(), ShouldStop: r.shouldStop()}
  128. }
  129. func (r *raft) String() string {
  130. s := fmt.Sprintf(`state=%v term=%d`, r.state, r.Term)
  131. switch r.state {
  132. case StateFollower:
  133. s += fmt.Sprintf(" vote=%v lead=%v", r.Vote, r.lead)
  134. case StateCandidate:
  135. s += fmt.Sprintf(` votes="%v"`, r.votes)
  136. case StateLeader:
  137. s += fmt.Sprintf(` prs="%v"`, r.prs)
  138. }
  139. return s
  140. }
  141. func (r *raft) poll(id int64, v bool) (granted int) {
  142. if _, ok := r.votes[id]; !ok {
  143. r.votes[id] = v
  144. }
  145. for _, vv := range r.votes {
  146. if vv {
  147. granted++
  148. }
  149. }
  150. return granted
  151. }
  152. // send persists state to stable storage and then sends to its mailbox.
  153. func (r *raft) send(m pb.Message) {
  154. m.From = r.id
  155. // do not attach term to msgProp
  156. // proposals are a way to forward to the leader and
  157. // should be treated as local message.
  158. if m.Type != msgProp {
  159. m.Term = r.Term
  160. }
  161. r.msgs = append(r.msgs, m)
  162. }
  163. // sendAppend sends RRPC, with entries to the given peer.
  164. func (r *raft) sendAppend(to int64) {
  165. pr := r.prs[to]
  166. m := pb.Message{}
  167. m.To = to
  168. m.Index = pr.next - 1
  169. if r.needSnapshot(m.Index) {
  170. m.Type = msgSnap
  171. m.Snapshot = r.raftLog.snapshot
  172. } else {
  173. m.Type = msgApp
  174. m.LogTerm = r.raftLog.term(pr.next - 1)
  175. m.Entries = r.raftLog.entries(pr.next)
  176. m.Commit = r.raftLog.committed
  177. }
  178. r.send(m)
  179. }
  180. // sendHeartbeat sends an empty msgApp
  181. func (r *raft) sendHeartbeat(to int64) {
  182. m := pb.Message{
  183. To: to,
  184. Type: msgApp,
  185. }
  186. r.send(m)
  187. }
  188. // bcastAppend sends RRPC, with entries to all peers that are not up-to-date according to r.mis.
  189. func (r *raft) bcastAppend() {
  190. for i := range r.prs {
  191. if i == r.id {
  192. continue
  193. }
  194. r.sendAppend(i)
  195. }
  196. }
  197. // bcastHeartbeat sends RRPC, without entries to all the peers.
  198. func (r *raft) bcastHeartbeat() {
  199. for i := range r.prs {
  200. if i == r.id {
  201. continue
  202. }
  203. r.sendHeartbeat(i)
  204. }
  205. }
  206. func (r *raft) maybeCommit() bool {
  207. // TODO(bmizerany): optimize.. Currently naive
  208. mis := make(int64Slice, 0, len(r.prs))
  209. for i := range r.prs {
  210. mis = append(mis, r.prs[i].match)
  211. }
  212. sort.Sort(sort.Reverse(mis))
  213. mci := mis[r.q()-1]
  214. return r.raftLog.maybeCommit(mci, r.Term)
  215. }
  216. func (r *raft) reset(term int64) {
  217. r.Term = term
  218. r.lead = None
  219. r.Vote = None
  220. r.elapsed = 0
  221. r.votes = make(map[int64]bool)
  222. for i := range r.prs {
  223. r.prs[i] = &progress{next: r.raftLog.lastIndex() + 1}
  224. if i == r.id {
  225. r.prs[i].match = r.raftLog.lastIndex()
  226. }
  227. }
  228. r.pendingConf = false
  229. }
  230. func (r *raft) q() int {
  231. return len(r.prs)/2 + 1
  232. }
  233. func (r *raft) appendEntry(e pb.Entry) {
  234. e.Term = r.Term
  235. e.Index = r.raftLog.lastIndex() + 1
  236. r.raftLog.append(r.raftLog.lastIndex(), e)
  237. r.prs[r.id].update(r.raftLog.lastIndex())
  238. r.maybeCommit()
  239. }
  240. // tickElection is ran by followers and candidates after r.electionTimeout.
  241. func (r *raft) tickElection() {
  242. if !r.promotable() {
  243. r.elapsed = 0
  244. return
  245. }
  246. r.elapsed++
  247. if r.isElectionTimeout() {
  248. r.elapsed = 0
  249. r.Step(pb.Message{From: r.id, Type: msgHup})
  250. }
  251. }
  252. // tickHeartbeat is ran by leaders to send a msgBeat after r.heartbeatTimeout.
  253. func (r *raft) tickHeartbeat() {
  254. r.elapsed++
  255. if r.elapsed > r.heartbeatTimeout {
  256. r.elapsed = 0
  257. r.Step(pb.Message{From: r.id, Type: msgBeat})
  258. }
  259. }
  260. func (r *raft) becomeFollower(term int64, lead int64) {
  261. r.step = stepFollower
  262. r.reset(term)
  263. r.tick = r.tickElection
  264. r.lead = lead
  265. r.state = StateFollower
  266. }
  267. func (r *raft) becomeCandidate() {
  268. // TODO(xiangli) remove the panic when the raft implementation is stable
  269. if r.state == StateLeader {
  270. panic("invalid transition [leader -> candidate]")
  271. }
  272. r.step = stepCandidate
  273. r.reset(r.Term + 1)
  274. r.tick = r.tickElection
  275. r.Vote = r.id
  276. r.state = StateCandidate
  277. }
  278. func (r *raft) becomeLeader() {
  279. // TODO(xiangli) remove the panic when the raft implementation is stable
  280. if r.state == StateFollower {
  281. panic("invalid transition [follower -> leader]")
  282. }
  283. r.step = stepLeader
  284. r.reset(r.Term)
  285. r.tick = r.tickHeartbeat
  286. r.lead = r.id
  287. r.state = StateLeader
  288. for _, e := range r.raftLog.entries(r.raftLog.committed + 1) {
  289. if e.Type != pb.EntryConfChange {
  290. continue
  291. }
  292. if r.pendingConf {
  293. panic("unexpected double uncommitted config entry")
  294. }
  295. r.pendingConf = true
  296. }
  297. r.appendEntry(pb.Entry{Data: nil})
  298. }
  299. func (r *raft) ReadMessages() []pb.Message {
  300. msgs := r.msgs
  301. r.msgs = make([]pb.Message, 0)
  302. return msgs
  303. }
  304. func (r *raft) campaign() {
  305. r.becomeCandidate()
  306. if r.q() == r.poll(r.id, true) {
  307. r.becomeLeader()
  308. }
  309. for i := range r.prs {
  310. if i == r.id {
  311. continue
  312. }
  313. lasti := r.raftLog.lastIndex()
  314. r.send(pb.Message{To: i, Type: msgVote, Index: lasti, LogTerm: r.raftLog.term(lasti)})
  315. }
  316. }
  317. func (r *raft) Step(m pb.Message) error {
  318. // TODO(bmizerany): this likely allocs - prevent that.
  319. defer func() { r.Commit = r.raftLog.committed }()
  320. if r.removed[m.From] {
  321. if m.From != r.id {
  322. r.send(pb.Message{To: m.From, Type: msgDenied})
  323. }
  324. // TODO: return an error?
  325. return nil
  326. }
  327. if m.Type == msgDenied {
  328. r.removed[r.id] = true
  329. // TODO: return an error?
  330. return nil
  331. }
  332. if m.Type == msgHup {
  333. r.campaign()
  334. }
  335. switch {
  336. case m.Term == 0:
  337. // local message
  338. case m.Term > r.Term:
  339. lead := m.From
  340. if m.Type == msgVote {
  341. lead = None
  342. }
  343. r.becomeFollower(m.Term, lead)
  344. case m.Term < r.Term:
  345. // ignore
  346. return nil
  347. }
  348. r.step(r, m)
  349. return nil
  350. }
  351. func (r *raft) handleAppendEntries(m pb.Message) {
  352. if r.raftLog.maybeAppend(m.Index, m.LogTerm, m.Commit, m.Entries...) {
  353. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.lastIndex()})
  354. } else {
  355. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: m.Index, Reject: true})
  356. }
  357. }
  358. func (r *raft) handleSnapshot(m pb.Message) {
  359. if r.restore(m.Snapshot) {
  360. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.lastIndex()})
  361. } else {
  362. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.committed})
  363. }
  364. }
  365. func (r *raft) addNode(id int64) {
  366. r.setProgress(id, 0, r.raftLog.lastIndex()+1)
  367. r.pendingConf = false
  368. }
  369. func (r *raft) removeNode(id int64) {
  370. r.delProgress(id)
  371. r.pendingConf = false
  372. r.removed[id] = true
  373. }
  374. type stepFunc func(r *raft, m pb.Message)
  375. func stepLeader(r *raft, m pb.Message) {
  376. switch m.Type {
  377. case msgBeat:
  378. r.bcastHeartbeat()
  379. case msgProp:
  380. if len(m.Entries) != 1 {
  381. panic("unexpected length(entries) of a msgProp")
  382. }
  383. e := m.Entries[0]
  384. if e.Type == pb.EntryConfChange {
  385. if r.pendingConf {
  386. return
  387. }
  388. r.pendingConf = true
  389. }
  390. r.appendEntry(e)
  391. r.bcastAppend()
  392. case msgAppResp:
  393. if m.Reject {
  394. if r.prs[m.From].maybeDecrTo(m.Index) {
  395. r.sendAppend(m.From)
  396. }
  397. } else {
  398. r.prs[m.From].update(m.Index)
  399. if r.maybeCommit() {
  400. r.bcastAppend()
  401. }
  402. }
  403. case msgVote:
  404. r.send(pb.Message{To: m.From, Type: msgVoteResp, Reject: true})
  405. }
  406. }
  407. func stepCandidate(r *raft, m pb.Message) {
  408. switch m.Type {
  409. case msgProp:
  410. panic("no leader")
  411. case msgApp:
  412. r.becomeFollower(r.Term, m.From)
  413. r.handleAppendEntries(m)
  414. case msgSnap:
  415. r.becomeFollower(m.Term, m.From)
  416. r.handleSnapshot(m)
  417. case msgVote:
  418. r.send(pb.Message{To: m.From, Type: msgVoteResp, Reject: true})
  419. case msgVoteResp:
  420. gr := r.poll(m.From, !m.Reject)
  421. switch r.q() {
  422. case gr:
  423. r.becomeLeader()
  424. r.bcastAppend()
  425. case len(r.votes) - gr:
  426. r.becomeFollower(r.Term, None)
  427. }
  428. }
  429. }
  430. func stepFollower(r *raft, m pb.Message) {
  431. switch m.Type {
  432. case msgProp:
  433. if r.lead == None {
  434. panic("no leader")
  435. }
  436. m.To = r.lead
  437. r.send(m)
  438. case msgApp:
  439. r.elapsed = 0
  440. r.lead = m.From
  441. r.handleAppendEntries(m)
  442. case msgSnap:
  443. r.elapsed = 0
  444. r.handleSnapshot(m)
  445. case msgVote:
  446. if (r.Vote == None || r.Vote == m.From) && r.raftLog.isUpToDate(m.Index, m.LogTerm) {
  447. r.elapsed = 0
  448. r.Vote = m.From
  449. r.send(pb.Message{To: m.From, Type: msgVoteResp})
  450. } else {
  451. r.send(pb.Message{To: m.From, Type: msgVoteResp, Reject: true})
  452. }
  453. }
  454. }
  455. func (r *raft) compact(index int64, nodes []int64, d []byte) {
  456. if index > r.raftLog.applied {
  457. panic(fmt.Sprintf("raft: compact index (%d) exceeds applied index (%d)", index, r.raftLog.applied))
  458. }
  459. r.raftLog.snap(d, index, r.raftLog.term(index), nodes)
  460. r.raftLog.compact(index)
  461. }
  462. // restore recovers the statemachine from a snapshot. It restores the log and the
  463. // configuration of statemachine.
  464. func (r *raft) restore(s pb.Snapshot) bool {
  465. if s.Index <= r.raftLog.committed {
  466. return false
  467. }
  468. r.raftLog.restore(s)
  469. r.prs = make(map[int64]*progress)
  470. for _, n := range s.Nodes {
  471. if n == r.id {
  472. r.setProgress(n, r.raftLog.lastIndex(), r.raftLog.lastIndex()+1)
  473. } else {
  474. r.setProgress(n, 0, r.raftLog.lastIndex()+1)
  475. }
  476. }
  477. return true
  478. }
  479. func (r *raft) needSnapshot(i int64) bool {
  480. if i < r.raftLog.offset {
  481. if r.raftLog.snapshot.Term == 0 {
  482. panic("need non-empty snapshot")
  483. }
  484. return true
  485. }
  486. return false
  487. }
  488. func (r *raft) nodes() []int64 {
  489. nodes := make([]int64, 0, len(r.prs))
  490. for k := range r.prs {
  491. nodes = append(nodes, k)
  492. }
  493. return nodes
  494. }
  495. func (r *raft) setProgress(id, match, next int64) {
  496. r.prs[id] = &progress{next: next, match: match}
  497. }
  498. func (r *raft) delProgress(id int64) {
  499. delete(r.prs, id)
  500. }
  501. // promotable indicates whether state machine can be promoted to leader,
  502. // which is true when its own id is in progress list.
  503. func (r *raft) promotable() bool {
  504. _, ok := r.prs[r.id]
  505. return ok
  506. }
  507. func (r *raft) loadEnts(ents []pb.Entry) {
  508. r.raftLog.load(ents)
  509. }
  510. func (r *raft) loadState(state pb.HardState) {
  511. r.raftLog.committed = state.Commit
  512. r.Term = state.Term
  513. r.Vote = state.Vote
  514. r.Commit = state.Commit
  515. }
  516. // isElectionTimeout returns true if r.elapsed is greater than the
  517. // randomized election timeout in (electiontimeout, 2 * electiontimeout - 1).
  518. // Otherwise, it returns false.
  519. func (r *raft) isElectionTimeout() bool {
  520. d := r.elapsed - r.electionTimeout
  521. if d < 0 {
  522. return false
  523. }
  524. return d > rand.Int()%r.electionTimeout
  525. }