progress.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package raft
  15. import (
  16. "fmt"
  17. "sort"
  18. )
  19. const (
  20. ProgressStateProbe ProgressStateType = iota
  21. ProgressStateReplicate
  22. ProgressStateSnapshot
  23. )
  24. type ProgressStateType uint64
  25. var prstmap = [...]string{
  26. "ProgressStateProbe",
  27. "ProgressStateReplicate",
  28. "ProgressStateSnapshot",
  29. }
  30. func (st ProgressStateType) String() string { return prstmap[uint64(st)] }
  31. // Progress represents a follower’s progress in the view of the leader. Leader maintains
  32. // progresses of all followers, and sends entries to the follower based on its progress.
  33. type Progress struct {
  34. Match, Next uint64
  35. // State defines how the leader should interact with the follower.
  36. //
  37. // When in ProgressStateProbe, leader sends at most one replication message
  38. // per heartbeat interval. It also probes actual progress of the follower.
  39. //
  40. // When in ProgressStateReplicate, leader optimistically increases next
  41. // to the latest entry sent after sending replication message. This is
  42. // an optimized state for fast replicating log entries to the follower.
  43. //
  44. // When in ProgressStateSnapshot, leader should have sent out snapshot
  45. // before and stops sending any replication message.
  46. State ProgressStateType
  47. // Paused is used in ProgressStateProbe.
  48. // When Paused is true, raft should pause sending replication message to this peer.
  49. Paused bool
  50. // PendingSnapshot is used in ProgressStateSnapshot.
  51. // If there is a pending snapshot, the pendingSnapshot will be set to the
  52. // index of the snapshot. If pendingSnapshot is set, the replication process of
  53. // this Progress will be paused. raft will not resend snapshot until the pending one
  54. // is reported to be failed.
  55. PendingSnapshot uint64
  56. // RecentActive is true if the progress is recently active. Receiving any messages
  57. // from the corresponding follower indicates the progress is active.
  58. // RecentActive can be reset to false after an election timeout.
  59. RecentActive bool
  60. // inflights is a sliding window for the inflight messages.
  61. // Each inflight message contains one or more log entries.
  62. // The max number of entries per message is defined in raft config as MaxSizePerMsg.
  63. // Thus inflight effectively limits both the number of inflight messages
  64. // and the bandwidth each Progress can use.
  65. // When inflights is full, no more message should be sent.
  66. // When a leader sends out a message, the index of the last
  67. // entry should be added to inflights. The index MUST be added
  68. // into inflights in order.
  69. // When a leader receives a reply, the previous inflights should
  70. // be freed by calling inflights.freeTo with the index of the last
  71. // received entry.
  72. ins *inflights
  73. // IsLearner is true if this progress is tracked for a learner.
  74. IsLearner bool
  75. }
  76. func (pr *Progress) resetState(state ProgressStateType) {
  77. pr.Paused = false
  78. pr.PendingSnapshot = 0
  79. pr.State = state
  80. pr.ins.reset()
  81. }
  82. func (pr *Progress) becomeProbe() {
  83. // If the original state is ProgressStateSnapshot, progress knows that
  84. // the pending snapshot has been sent to this peer successfully, then
  85. // probes from pendingSnapshot + 1.
  86. if pr.State == ProgressStateSnapshot {
  87. pendingSnapshot := pr.PendingSnapshot
  88. pr.resetState(ProgressStateProbe)
  89. pr.Next = max(pr.Match+1, pendingSnapshot+1)
  90. } else {
  91. pr.resetState(ProgressStateProbe)
  92. pr.Next = pr.Match + 1
  93. }
  94. }
  95. func (pr *Progress) becomeReplicate() {
  96. pr.resetState(ProgressStateReplicate)
  97. pr.Next = pr.Match + 1
  98. }
  99. func (pr *Progress) becomeSnapshot(snapshoti uint64) {
  100. pr.resetState(ProgressStateSnapshot)
  101. pr.PendingSnapshot = snapshoti
  102. }
  103. // maybeUpdate returns false if the given n index comes from an outdated message.
  104. // Otherwise it updates the progress and returns true.
  105. func (pr *Progress) maybeUpdate(n uint64) bool {
  106. var updated bool
  107. if pr.Match < n {
  108. pr.Match = n
  109. updated = true
  110. pr.resume()
  111. }
  112. if pr.Next < n+1 {
  113. pr.Next = n + 1
  114. }
  115. return updated
  116. }
  117. func (pr *Progress) optimisticUpdate(n uint64) { pr.Next = n + 1 }
  118. // maybeDecrTo returns false if the given to index comes from an out of order message.
  119. // Otherwise it decreases the progress next index to min(rejected, last) and returns true.
  120. func (pr *Progress) maybeDecrTo(rejected, last uint64) bool {
  121. if pr.State == ProgressStateReplicate {
  122. // the rejection must be stale if the progress has matched and "rejected"
  123. // is smaller than "match".
  124. if rejected <= pr.Match {
  125. return false
  126. }
  127. // directly decrease next to match + 1
  128. pr.Next = pr.Match + 1
  129. return true
  130. }
  131. // the rejection must be stale if "rejected" does not match next - 1
  132. if pr.Next-1 != rejected {
  133. return false
  134. }
  135. if pr.Next = min(rejected, last+1); pr.Next < 1 {
  136. pr.Next = 1
  137. }
  138. pr.resume()
  139. return true
  140. }
  141. func (pr *Progress) pause() { pr.Paused = true }
  142. func (pr *Progress) resume() { pr.Paused = false }
  143. // IsPaused returns whether sending log entries to this node has been
  144. // paused. A node may be paused because it has rejected recent
  145. // MsgApps, is currently waiting for a snapshot, or has reached the
  146. // MaxInflightMsgs limit.
  147. func (pr *Progress) IsPaused() bool {
  148. switch pr.State {
  149. case ProgressStateProbe:
  150. return pr.Paused
  151. case ProgressStateReplicate:
  152. return pr.ins.full()
  153. case ProgressStateSnapshot:
  154. return true
  155. default:
  156. panic("unexpected state")
  157. }
  158. }
  159. func (pr *Progress) snapshotFailure() { pr.PendingSnapshot = 0 }
  160. // needSnapshotAbort returns true if snapshot progress's Match
  161. // is equal or higher than the pendingSnapshot.
  162. func (pr *Progress) needSnapshotAbort() bool {
  163. return pr.State == ProgressStateSnapshot && pr.Match >= pr.PendingSnapshot
  164. }
  165. func (pr *Progress) String() string {
  166. return fmt.Sprintf("next = %d, match = %d, state = %s, waiting = %v, pendingSnapshot = %d, recentActive = %v, isLearner = %v",
  167. pr.Next, pr.Match, pr.State, pr.IsPaused(), pr.PendingSnapshot, pr.RecentActive, pr.IsLearner)
  168. }
  169. type inflights struct {
  170. // the starting index in the buffer
  171. start int
  172. // number of inflights in the buffer
  173. count int
  174. // the size of the buffer
  175. size int
  176. // buffer contains the index of the last entry
  177. // inside one message.
  178. buffer []uint64
  179. }
  180. func newInflights(size int) *inflights {
  181. return &inflights{
  182. size: size,
  183. }
  184. }
  185. // add adds an inflight into inflights
  186. func (in *inflights) add(inflight uint64) {
  187. if in.full() {
  188. panic("cannot add into a full inflights")
  189. }
  190. next := in.start + in.count
  191. size := in.size
  192. if next >= size {
  193. next -= size
  194. }
  195. if next >= len(in.buffer) {
  196. in.growBuf()
  197. }
  198. in.buffer[next] = inflight
  199. in.count++
  200. }
  201. // grow the inflight buffer by doubling up to inflights.size. We grow on demand
  202. // instead of preallocating to inflights.size to handle systems which have
  203. // thousands of Raft groups per process.
  204. func (in *inflights) growBuf() {
  205. newSize := len(in.buffer) * 2
  206. if newSize == 0 {
  207. newSize = 1
  208. } else if newSize > in.size {
  209. newSize = in.size
  210. }
  211. newBuffer := make([]uint64, newSize)
  212. copy(newBuffer, in.buffer)
  213. in.buffer = newBuffer
  214. }
  215. // freeTo frees the inflights smaller or equal to the given `to` flight.
  216. func (in *inflights) freeTo(to uint64) {
  217. if in.count == 0 || to < in.buffer[in.start] {
  218. // out of the left side of the window
  219. return
  220. }
  221. idx := in.start
  222. var i int
  223. for i = 0; i < in.count; i++ {
  224. if to < in.buffer[idx] { // found the first large inflight
  225. break
  226. }
  227. // increase index and maybe rotate
  228. size := in.size
  229. if idx++; idx >= size {
  230. idx -= size
  231. }
  232. }
  233. // free i inflights and set new start index
  234. in.count -= i
  235. in.start = idx
  236. if in.count == 0 {
  237. // inflights is empty, reset the start index so that we don't grow the
  238. // buffer unnecessarily.
  239. in.start = 0
  240. }
  241. }
  242. func (in *inflights) freeFirstOne() { in.freeTo(in.buffer[in.start]) }
  243. // full returns true if the inflights is full.
  244. func (in *inflights) full() bool {
  245. return in.count == in.size
  246. }
  247. // resets frees all inflights.
  248. func (in *inflights) reset() {
  249. in.count = 0
  250. in.start = 0
  251. }
  252. // progressTracker tracks the currently active configuration and the information
  253. // known about the nodes and learners in it. In particular, it tracks the match
  254. // index for each peer which in turn allows reasoning about the committed index.
  255. type progressTracker struct {
  256. nodes map[uint64]*Progress
  257. learners map[uint64]*Progress
  258. votes map[uint64]bool
  259. maxInflight int
  260. matchBuf uint64Slice
  261. }
  262. func makePRS(maxInflight int) progressTracker {
  263. p := progressTracker{
  264. maxInflight: maxInflight,
  265. nodes: map[uint64]*Progress{},
  266. learners: map[uint64]*Progress{},
  267. votes: map[uint64]bool{},
  268. }
  269. return p
  270. }
  271. // isSingleton returns true if (and only if) there is only one voting member
  272. // (i.e. the leader) in the current configuration.
  273. func (p *progressTracker) isSingleton() bool {
  274. return len(p.nodes) == 1
  275. }
  276. func (p *progressTracker) quorum() int {
  277. return len(p.nodes)/2 + 1
  278. }
  279. func (p *progressTracker) hasQuorum(m map[uint64]struct{}) bool {
  280. return len(m) >= p.quorum()
  281. }
  282. // committed returns the largest log index known to be committed based on what
  283. // the voting members of the group have acknowledged.
  284. func (p *progressTracker) committed() uint64 {
  285. // Preserving matchBuf across calls is an optimization
  286. // used to avoid allocating a new slice on each call.
  287. if cap(p.matchBuf) < len(p.nodes) {
  288. p.matchBuf = make(uint64Slice, len(p.nodes))
  289. }
  290. p.matchBuf = p.matchBuf[:len(p.nodes)]
  291. idx := 0
  292. for _, pr := range p.nodes {
  293. p.matchBuf[idx] = pr.Match
  294. idx++
  295. }
  296. sort.Sort(&p.matchBuf)
  297. return p.matchBuf[len(p.matchBuf)-p.quorum()]
  298. }
  299. func (p *progressTracker) removeAny(id uint64) {
  300. pN := p.nodes[id]
  301. pL := p.learners[id]
  302. if pN == nil && pL == nil {
  303. panic("attempting to remove unknown peer %x")
  304. } else if pN != nil && pL != nil {
  305. panic(fmt.Sprintf("peer %x is both voter and learner", id))
  306. }
  307. delete(p.nodes, id)
  308. delete(p.learners, id)
  309. }
  310. // initProgress initializes a new progress for the given node or learner. The
  311. // node may not exist yet in either form or a panic will ensue.
  312. func (p *progressTracker) initProgress(id, match, next uint64, isLearner bool) {
  313. if pr := p.nodes[id]; pr != nil {
  314. panic(fmt.Sprintf("peer %x already tracked as node %v", id, pr))
  315. }
  316. if pr := p.learners[id]; pr != nil {
  317. panic(fmt.Sprintf("peer %x already tracked as learner %v", id, pr))
  318. }
  319. if !isLearner {
  320. p.nodes[id] = &Progress{Next: next, Match: match, ins: newInflights(p.maxInflight)}
  321. return
  322. }
  323. p.learners[id] = &Progress{Next: next, Match: match, ins: newInflights(p.maxInflight), IsLearner: true}
  324. }
  325. func (p *progressTracker) getProgress(id uint64) *Progress {
  326. if pr, ok := p.nodes[id]; ok {
  327. return pr
  328. }
  329. return p.learners[id]
  330. }
  331. // visit invokes the supplied closure for all tracked progresses.
  332. func (p *progressTracker) visit(f func(id uint64, pr *Progress)) {
  333. for id, pr := range p.nodes {
  334. f(id, pr)
  335. }
  336. for id, pr := range p.learners {
  337. f(id, pr)
  338. }
  339. }
  340. // checkQuorumActive returns true if the quorum is active from
  341. // the view of the local raft state machine. Otherwise, it returns
  342. // false.
  343. func (p *progressTracker) quorumActive() bool {
  344. var act int
  345. p.visit(func(id uint64, pr *Progress) {
  346. if pr.RecentActive && !pr.IsLearner {
  347. act++
  348. }
  349. })
  350. return act >= p.quorum()
  351. }
  352. func (p *progressTracker) voterNodes() []uint64 {
  353. nodes := make([]uint64, 0, len(p.nodes))
  354. for id := range p.nodes {
  355. nodes = append(nodes, id)
  356. }
  357. sort.Sort(uint64Slice(nodes))
  358. return nodes
  359. }
  360. func (p *progressTracker) learnerNodes() []uint64 {
  361. nodes := make([]uint64, 0, len(p.learners))
  362. for id := range p.learners {
  363. nodes = append(nodes, id)
  364. }
  365. sort.Sort(uint64Slice(nodes))
  366. return nodes
  367. }
  368. // resetVotes prepares for a new round of vote counting via recordVote.
  369. func (p *progressTracker) resetVotes() {
  370. p.votes = map[uint64]bool{}
  371. }
  372. // recordVote records that the node with the given id voted for this Raft
  373. // instance if v == true (and declined it otherwise).
  374. func (p *progressTracker) recordVote(id uint64, v bool) {
  375. _, ok := p.votes[id]
  376. if !ok {
  377. p.votes[id] = v
  378. }
  379. }
  380. // tallyVotes returns the number of granted and rejected votes, and whether the
  381. // election outcome is known.
  382. func (p *progressTracker) tallyVotes() (granted int, rejected int, result electionResult) {
  383. for _, v := range p.votes {
  384. if v {
  385. granted++
  386. } else {
  387. rejected++
  388. }
  389. }
  390. q := p.quorum()
  391. result = electionIndeterminate
  392. if granted >= q {
  393. result = electionWon
  394. } else if rejected >= q {
  395. result = electionLost
  396. }
  397. return granted, rejected, result
  398. }