raft.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553
  1. package raft
  2. import (
  3. "errors"
  4. "fmt"
  5. "sort"
  6. pb "github.com/coreos/etcd/raft/raftpb"
  7. )
  8. const None int64 = 0
  9. type messageType int64
  10. const (
  11. msgHup int64 = iota
  12. msgBeat
  13. msgProp
  14. msgApp
  15. msgAppResp
  16. msgVote
  17. msgVoteResp
  18. msgSnap
  19. msgDenied
  20. )
  21. var mtmap = [...]string{
  22. msgHup: "msgHup",
  23. msgBeat: "msgBeat",
  24. msgProp: "msgProp",
  25. msgApp: "msgApp",
  26. msgAppResp: "msgAppResp",
  27. msgVote: "msgVote",
  28. msgVoteResp: "msgVoteResp",
  29. msgSnap: "msgSnap",
  30. msgDenied: "msgDenied",
  31. }
  32. func (mt messageType) String() string {
  33. return mtmap[int64(mt)]
  34. }
  35. var errNoLeader = errors.New("no leader")
  36. const (
  37. StateFollower StateType = iota
  38. StateCandidate
  39. StateLeader
  40. )
  41. type StateType int64
  42. var stmap = [...]string{
  43. StateFollower: "StateFollower",
  44. StateCandidate: "StateCandidate",
  45. StateLeader: "StateLeader",
  46. }
  47. func (st StateType) String() string {
  48. return stmap[int64(st)]
  49. }
  50. type progress struct {
  51. match, next int64
  52. }
  53. func (pr *progress) update(n int64) {
  54. pr.match = n
  55. pr.next = n + 1
  56. }
  57. func (pr *progress) decr() {
  58. if pr.next--; pr.next < 1 {
  59. pr.next = 1
  60. }
  61. }
  62. func (pr *progress) String() string {
  63. return fmt.Sprintf("n=%d m=%d", pr.next, pr.match)
  64. }
  65. // int64Slice implements sort interface
  66. type int64Slice []int64
  67. func (p int64Slice) Len() int { return len(p) }
  68. func (p int64Slice) Less(i, j int) bool { return p[i] < p[j] }
  69. func (p int64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  70. type raft struct {
  71. pb.HardState
  72. id int64
  73. // the log
  74. raftLog *raftLog
  75. prs map[int64]*progress
  76. state StateType
  77. votes map[int64]bool
  78. msgs []pb.Message
  79. // the leader id
  80. lead int64
  81. // New configuration is ignored if there exists unapplied configuration.
  82. pendingConf bool
  83. elapsed int // number of ticks since the last msg
  84. heartbeatTimeout int
  85. electionTimeout int
  86. tick func()
  87. step stepFunc
  88. }
  89. func newRaft(id int64, peers []int64, election, heartbeat int) *raft {
  90. if id == None {
  91. panic("cannot use none id")
  92. }
  93. r := &raft{
  94. id: id,
  95. lead: None,
  96. raftLog: newLog(),
  97. prs: make(map[int64]*progress),
  98. electionTimeout: election,
  99. heartbeatTimeout: heartbeat,
  100. }
  101. for _, p := range peers {
  102. r.prs[p] = &progress{}
  103. }
  104. r.becomeFollower(0, None)
  105. return r
  106. }
  107. func (r *raft) hasLeader() bool { return r.lead != None }
  108. func (r *raft) softState() *SoftState {
  109. return &SoftState{Lead: r.lead, RaftState: r.state}
  110. }
  111. func (r *raft) String() string {
  112. s := fmt.Sprintf(`state=%v term=%d`, r.state, r.Term)
  113. switch r.state {
  114. case StateFollower:
  115. s += fmt.Sprintf(" vote=%v lead=%v", r.Vote, r.lead)
  116. case StateCandidate:
  117. s += fmt.Sprintf(` votes="%v"`, r.votes)
  118. case StateLeader:
  119. s += fmt.Sprintf(` prs="%v"`, r.prs)
  120. }
  121. return s
  122. }
  123. func (r *raft) poll(id int64, v bool) (granted int) {
  124. if _, ok := r.votes[id]; !ok {
  125. r.votes[id] = v
  126. }
  127. for _, vv := range r.votes {
  128. if vv {
  129. granted++
  130. }
  131. }
  132. return granted
  133. }
  134. // send persists state to stable storage and then sends to its mailbox.
  135. func (r *raft) send(m pb.Message) {
  136. m.From = r.id
  137. m.Term = r.Term
  138. r.msgs = append(r.msgs, m)
  139. }
  140. // sendAppend sends RRPC, with entries to the given peer.
  141. func (r *raft) sendAppend(to int64) {
  142. pr := r.prs[to]
  143. m := pb.Message{}
  144. m.To = to
  145. m.Index = pr.next - 1
  146. if r.needSnapshot(m.Index) {
  147. m.Type = msgSnap
  148. m.Snapshot = r.raftLog.snapshot
  149. } else {
  150. m.Type = msgApp
  151. m.LogTerm = r.raftLog.term(pr.next - 1)
  152. m.Entries = r.raftLog.entries(pr.next)
  153. m.Commit = r.raftLog.committed
  154. }
  155. r.send(m)
  156. }
  157. // sendHeartbeat sends RRPC, without entries to the given peer.
  158. func (r *raft) sendHeartbeat(to int64) {
  159. pr := r.prs[to]
  160. index := max(pr.next-1, r.raftLog.offset)
  161. m := pb.Message{
  162. To: to,
  163. Type: msgApp,
  164. Index: index,
  165. LogTerm: r.raftLog.term(index),
  166. Commit: r.raftLog.committed,
  167. }
  168. r.send(m)
  169. }
  170. // bcastAppend sends RRPC, with entries to all peers that are not up-to-date according to r.mis.
  171. func (r *raft) bcastAppend() {
  172. for i := range r.prs {
  173. if i == r.id {
  174. continue
  175. }
  176. r.sendAppend(i)
  177. }
  178. }
  179. // bcastHeartbeat sends RRPC, without entries to all the peers.
  180. func (r *raft) bcastHeartbeat() {
  181. for i := range r.prs {
  182. if i == r.id {
  183. continue
  184. }
  185. r.sendHeartbeat(i)
  186. }
  187. }
  188. func (r *raft) maybeCommit() bool {
  189. // TODO(bmizerany): optimize.. Currently naive
  190. mis := make(int64Slice, 0, len(r.prs))
  191. for i := range r.prs {
  192. mis = append(mis, r.prs[i].match)
  193. }
  194. sort.Sort(sort.Reverse(mis))
  195. mci := mis[r.q()-1]
  196. return r.raftLog.maybeCommit(mci, r.Term)
  197. }
  198. func (r *raft) reset(term int64) {
  199. r.Term = term
  200. r.lead = None
  201. r.Vote = None
  202. r.elapsed = 0
  203. r.votes = make(map[int64]bool)
  204. for i := range r.prs {
  205. r.prs[i] = &progress{next: r.raftLog.lastIndex() + 1}
  206. if i == r.id {
  207. r.prs[i].match = r.raftLog.lastIndex()
  208. }
  209. }
  210. r.pendingConf = false
  211. }
  212. func (r *raft) q() int {
  213. return len(r.prs)/2 + 1
  214. }
  215. func (r *raft) appendEntry(e pb.Entry) {
  216. e.Term = r.Term
  217. e.Index = r.raftLog.lastIndex() + 1
  218. r.raftLog.append(r.raftLog.lastIndex(), e)
  219. r.prs[r.id].update(r.raftLog.lastIndex())
  220. r.maybeCommit()
  221. }
  222. // tickElection is ran by followers and candidates after r.electionTimeout.
  223. func (r *raft) tickElection() {
  224. if !r.promotable() {
  225. r.elapsed = 0
  226. return
  227. }
  228. r.elapsed++
  229. // TODO (xiangli): elctionTimeout should be randomized.
  230. if r.elapsed > r.electionTimeout {
  231. r.elapsed = 0
  232. r.Step(pb.Message{From: r.id, Type: msgHup})
  233. }
  234. }
  235. // tickHeartbeat is ran by leaders to send a msgBeat after r.heartbeatTimeout.
  236. func (r *raft) tickHeartbeat() {
  237. r.elapsed++
  238. if r.elapsed > r.heartbeatTimeout {
  239. r.elapsed = 0
  240. r.Step(pb.Message{From: r.id, Type: msgBeat})
  241. }
  242. }
  243. func (r *raft) becomeFollower(term int64, lead int64) {
  244. r.step = stepFollower
  245. r.reset(term)
  246. r.tick = r.tickElection
  247. r.lead = lead
  248. r.state = StateFollower
  249. }
  250. func (r *raft) becomeCandidate() {
  251. // TODO(xiangli) remove the panic when the raft implementation is stable
  252. if r.state == StateLeader {
  253. panic("invalid transition [leader -> candidate]")
  254. }
  255. r.step = stepCandidate
  256. r.reset(r.Term + 1)
  257. r.tick = r.tickElection
  258. r.Vote = r.id
  259. r.state = StateCandidate
  260. }
  261. func (r *raft) becomeLeader() {
  262. // TODO(xiangli) remove the panic when the raft implementation is stable
  263. if r.state == StateFollower {
  264. panic("invalid transition [follower -> leader]")
  265. }
  266. r.step = stepLeader
  267. r.reset(r.Term)
  268. r.tick = r.tickHeartbeat
  269. r.lead = r.id
  270. r.state = StateLeader
  271. for _, e := range r.raftLog.entries(r.raftLog.committed + 1) {
  272. if e.Type != pb.EntryConfChange {
  273. continue
  274. }
  275. if r.pendingConf {
  276. panic("unexpected double uncommitted config entry")
  277. }
  278. r.pendingConf = true
  279. }
  280. r.appendEntry(pb.Entry{Data: nil})
  281. }
  282. func (r *raft) ReadMessages() []pb.Message {
  283. msgs := r.msgs
  284. r.msgs = make([]pb.Message, 0)
  285. return msgs
  286. }
  287. func (r *raft) campaign() {
  288. r.becomeCandidate()
  289. if r.q() == r.poll(r.id, true) {
  290. r.becomeLeader()
  291. }
  292. for i := range r.prs {
  293. if i == r.id {
  294. continue
  295. }
  296. lasti := r.raftLog.lastIndex()
  297. r.send(pb.Message{To: i, Type: msgVote, Index: lasti, LogTerm: r.raftLog.term(lasti)})
  298. }
  299. }
  300. func (r *raft) Step(m pb.Message) error {
  301. // TODO(bmizerany): this likely allocs - prevent that.
  302. defer func() { r.Commit = r.raftLog.committed }()
  303. if m.Type == msgHup {
  304. r.campaign()
  305. }
  306. switch {
  307. case m.Term == 0:
  308. // local message
  309. case m.Term > r.Term:
  310. lead := m.From
  311. if m.Type == msgVote {
  312. lead = None
  313. }
  314. r.becomeFollower(m.Term, lead)
  315. case m.Term < r.Term:
  316. // ignore
  317. return nil
  318. }
  319. r.step(r, m)
  320. return nil
  321. }
  322. func (r *raft) handleAppendEntries(m pb.Message) {
  323. if r.raftLog.maybeAppend(m.Index, m.LogTerm, m.Commit, m.Entries...) {
  324. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.lastIndex()})
  325. } else {
  326. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: -1})
  327. }
  328. }
  329. func (r *raft) handleSnapshot(m pb.Message) {
  330. if r.restore(m.Snapshot) {
  331. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.lastIndex()})
  332. } else {
  333. r.send(pb.Message{To: m.From, Type: msgAppResp, Index: r.raftLog.committed})
  334. }
  335. }
  336. func (r *raft) addNode(id int64) {
  337. r.setProgress(id, 0, r.raftLog.lastIndex()+1)
  338. r.pendingConf = false
  339. }
  340. func (r *raft) removeNode(id int64) {
  341. r.delProgress(id)
  342. r.pendingConf = false
  343. }
  344. type stepFunc func(r *raft, m pb.Message)
  345. func stepLeader(r *raft, m pb.Message) {
  346. switch m.Type {
  347. case msgBeat:
  348. r.bcastHeartbeat()
  349. case msgProp:
  350. if len(m.Entries) != 1 {
  351. panic("unexpected length(entries) of a msgProp")
  352. }
  353. e := m.Entries[0]
  354. if e.Type == pb.EntryConfChange {
  355. if r.pendingConf {
  356. return
  357. }
  358. r.pendingConf = true
  359. }
  360. r.appendEntry(e)
  361. r.bcastAppend()
  362. case msgAppResp:
  363. if m.Index < 0 {
  364. r.prs[m.From].decr()
  365. r.sendAppend(m.From)
  366. } else {
  367. r.prs[m.From].update(m.Index)
  368. if r.maybeCommit() {
  369. r.bcastAppend()
  370. }
  371. }
  372. case msgVote:
  373. r.send(pb.Message{To: m.From, Type: msgVoteResp, Index: -1})
  374. }
  375. }
  376. func stepCandidate(r *raft, m pb.Message) {
  377. switch m.Type {
  378. case msgProp:
  379. panic("no leader")
  380. case msgApp:
  381. r.becomeFollower(r.Term, m.From)
  382. r.handleAppendEntries(m)
  383. case msgSnap:
  384. r.becomeFollower(m.Term, m.From)
  385. r.handleSnapshot(m)
  386. case msgVote:
  387. r.send(pb.Message{To: m.From, Type: msgVoteResp, Index: -1})
  388. case msgVoteResp:
  389. gr := r.poll(m.From, m.Index >= 0)
  390. switch r.q() {
  391. case gr:
  392. r.becomeLeader()
  393. r.bcastAppend()
  394. case len(r.votes) - gr:
  395. r.becomeFollower(r.Term, None)
  396. }
  397. }
  398. }
  399. func stepFollower(r *raft, m pb.Message) {
  400. switch m.Type {
  401. case msgProp:
  402. if r.lead == None {
  403. panic("no leader")
  404. }
  405. m.To = r.lead
  406. r.send(m)
  407. case msgApp:
  408. r.elapsed = 0
  409. r.lead = m.From
  410. r.handleAppendEntries(m)
  411. case msgSnap:
  412. r.elapsed = 0
  413. r.handleSnapshot(m)
  414. case msgVote:
  415. if (r.Vote == None || r.Vote == m.From) && r.raftLog.isUpToDate(m.Index, m.LogTerm) {
  416. r.elapsed = 0
  417. r.Vote = m.From
  418. r.send(pb.Message{To: m.From, Type: msgVoteResp, Index: r.raftLog.lastIndex()})
  419. } else {
  420. r.send(pb.Message{To: m.From, Type: msgVoteResp, Index: -1})
  421. }
  422. }
  423. }
  424. func (r *raft) compact(d []byte) {
  425. r.raftLog.snap(d, r.raftLog.applied, r.raftLog.term(r.raftLog.applied), r.nodes())
  426. r.raftLog.compact(r.raftLog.applied)
  427. }
  428. // restore recovers the statemachine from a snapshot. It restores the log and the
  429. // configuration of statemachine.
  430. func (r *raft) restore(s pb.Snapshot) bool {
  431. if s.Index <= r.raftLog.committed {
  432. return false
  433. }
  434. r.raftLog.restore(s)
  435. r.prs = make(map[int64]*progress)
  436. for _, n := range s.Nodes {
  437. if n == r.id {
  438. r.setProgress(n, r.raftLog.lastIndex(), r.raftLog.lastIndex()+1)
  439. } else {
  440. r.setProgress(n, 0, r.raftLog.lastIndex()+1)
  441. }
  442. }
  443. return true
  444. }
  445. func (r *raft) needSnapshot(i int64) bool {
  446. if i < r.raftLog.offset {
  447. if r.raftLog.snapshot.Term == 0 {
  448. panic("need non-empty snapshot")
  449. }
  450. return true
  451. }
  452. return false
  453. }
  454. func (r *raft) nodes() []int64 {
  455. nodes := make([]int64, 0, len(r.prs))
  456. for k := range r.prs {
  457. nodes = append(nodes, k)
  458. }
  459. return nodes
  460. }
  461. func (r *raft) setProgress(id, match, next int64) {
  462. r.prs[id] = &progress{next: next, match: match}
  463. }
  464. func (r *raft) delProgress(id int64) {
  465. delete(r.prs, id)
  466. }
  467. // promotable indicates whether state machine can be promoted to leader,
  468. // which is true when its own id is in progress list.
  469. func (r *raft) promotable() bool {
  470. _, ok := r.prs[r.id]
  471. return ok
  472. }
  473. func (r *raft) loadEnts(ents []pb.Entry) {
  474. r.raftLog.load(ents)
  475. }
  476. func (r *raft) loadState(state pb.HardState) {
  477. r.raftLog.committed = state.Commit
  478. r.Term = state.Term
  479. r.Vote = state.Vote
  480. r.Commit = state.Commit
  481. }