progress.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package raft
  15. import (
  16. "fmt"
  17. "sort"
  18. "go.etcd.io/etcd/raft/quorum"
  19. )
  20. const (
  21. ProgressStateProbe ProgressStateType = iota
  22. ProgressStateReplicate
  23. ProgressStateSnapshot
  24. )
  25. type ProgressStateType uint64
  26. var prstmap = [...]string{
  27. "ProgressStateProbe",
  28. "ProgressStateReplicate",
  29. "ProgressStateSnapshot",
  30. }
  31. func (st ProgressStateType) String() string { return prstmap[uint64(st)] }
  32. // Progress represents a follower’s progress in the view of the leader. Leader maintains
  33. // progresses of all followers, and sends entries to the follower based on its progress.
  34. type Progress struct {
  35. Match, Next uint64
  36. // State defines how the leader should interact with the follower.
  37. //
  38. // When in ProgressStateProbe, leader sends at most one replication message
  39. // per heartbeat interval. It also probes actual progress of the follower.
  40. //
  41. // When in ProgressStateReplicate, leader optimistically increases next
  42. // to the latest entry sent after sending replication message. This is
  43. // an optimized state for fast replicating log entries to the follower.
  44. //
  45. // When in ProgressStateSnapshot, leader should have sent out snapshot
  46. // before and stops sending any replication message.
  47. State ProgressStateType
  48. // Paused is used in ProgressStateProbe.
  49. // When Paused is true, raft should pause sending replication message to this peer.
  50. Paused bool
  51. // PendingSnapshot is used in ProgressStateSnapshot.
  52. // If there is a pending snapshot, the pendingSnapshot will be set to the
  53. // index of the snapshot. If pendingSnapshot is set, the replication process of
  54. // this Progress will be paused. raft will not resend snapshot until the pending one
  55. // is reported to be failed.
  56. PendingSnapshot uint64
  57. // RecentActive is true if the progress is recently active. Receiving any messages
  58. // from the corresponding follower indicates the progress is active.
  59. // RecentActive can be reset to false after an election timeout.
  60. RecentActive bool
  61. // inflights is a sliding window for the inflight messages.
  62. // Each inflight message contains one or more log entries.
  63. // The max number of entries per message is defined in raft config as MaxSizePerMsg.
  64. // Thus inflight effectively limits both the number of inflight messages
  65. // and the bandwidth each Progress can use.
  66. // When inflights is full, no more message should be sent.
  67. // When a leader sends out a message, the index of the last
  68. // entry should be added to inflights. The index MUST be added
  69. // into inflights in order.
  70. // When a leader receives a reply, the previous inflights should
  71. // be freed by calling inflights.freeTo with the index of the last
  72. // received entry.
  73. ins *inflights
  74. // IsLearner is true if this progress is tracked for a learner.
  75. IsLearner bool
  76. }
  77. func (pr *Progress) resetState(state ProgressStateType) {
  78. pr.Paused = false
  79. pr.PendingSnapshot = 0
  80. pr.State = state
  81. pr.ins.reset()
  82. }
  83. func (pr *Progress) becomeProbe() {
  84. // If the original state is ProgressStateSnapshot, progress knows that
  85. // the pending snapshot has been sent to this peer successfully, then
  86. // probes from pendingSnapshot + 1.
  87. if pr.State == ProgressStateSnapshot {
  88. pendingSnapshot := pr.PendingSnapshot
  89. pr.resetState(ProgressStateProbe)
  90. pr.Next = max(pr.Match+1, pendingSnapshot+1)
  91. } else {
  92. pr.resetState(ProgressStateProbe)
  93. pr.Next = pr.Match + 1
  94. }
  95. }
  96. func (pr *Progress) becomeReplicate() {
  97. pr.resetState(ProgressStateReplicate)
  98. pr.Next = pr.Match + 1
  99. }
  100. func (pr *Progress) becomeSnapshot(snapshoti uint64) {
  101. pr.resetState(ProgressStateSnapshot)
  102. pr.PendingSnapshot = snapshoti
  103. }
  104. // maybeUpdate returns false if the given n index comes from an outdated message.
  105. // Otherwise it updates the progress and returns true.
  106. func (pr *Progress) maybeUpdate(n uint64) bool {
  107. var updated bool
  108. if pr.Match < n {
  109. pr.Match = n
  110. updated = true
  111. pr.resume()
  112. }
  113. if pr.Next < n+1 {
  114. pr.Next = n + 1
  115. }
  116. return updated
  117. }
  118. func (pr *Progress) optimisticUpdate(n uint64) { pr.Next = n + 1 }
  119. // maybeDecrTo returns false if the given to index comes from an out of order message.
  120. // Otherwise it decreases the progress next index to min(rejected, last) and returns true.
  121. func (pr *Progress) maybeDecrTo(rejected, last uint64) bool {
  122. if pr.State == ProgressStateReplicate {
  123. // the rejection must be stale if the progress has matched and "rejected"
  124. // is smaller than "match".
  125. if rejected <= pr.Match {
  126. return false
  127. }
  128. // directly decrease next to match + 1
  129. pr.Next = pr.Match + 1
  130. return true
  131. }
  132. // the rejection must be stale if "rejected" does not match next - 1
  133. if pr.Next-1 != rejected {
  134. return false
  135. }
  136. if pr.Next = min(rejected, last+1); pr.Next < 1 {
  137. pr.Next = 1
  138. }
  139. pr.resume()
  140. return true
  141. }
  142. func (pr *Progress) pause() { pr.Paused = true }
  143. func (pr *Progress) resume() { pr.Paused = false }
  144. // IsPaused returns whether sending log entries to this node has been
  145. // paused. A node may be paused because it has rejected recent
  146. // MsgApps, is currently waiting for a snapshot, or has reached the
  147. // MaxInflightMsgs limit.
  148. func (pr *Progress) IsPaused() bool {
  149. switch pr.State {
  150. case ProgressStateProbe:
  151. return pr.Paused
  152. case ProgressStateReplicate:
  153. return pr.ins.full()
  154. case ProgressStateSnapshot:
  155. return true
  156. default:
  157. panic("unexpected state")
  158. }
  159. }
  160. func (pr *Progress) snapshotFailure() { pr.PendingSnapshot = 0 }
  161. // needSnapshotAbort returns true if snapshot progress's Match
  162. // is equal or higher than the pendingSnapshot.
  163. func (pr *Progress) needSnapshotAbort() bool {
  164. return pr.State == ProgressStateSnapshot && pr.Match >= pr.PendingSnapshot
  165. }
  166. func (pr *Progress) String() string {
  167. return fmt.Sprintf("next = %d, match = %d, state = %s, waiting = %v, pendingSnapshot = %d, recentActive = %v, isLearner = %v",
  168. pr.Next, pr.Match, pr.State, pr.IsPaused(), pr.PendingSnapshot, pr.RecentActive, pr.IsLearner)
  169. }
  170. type inflights struct {
  171. // the starting index in the buffer
  172. start int
  173. // number of inflights in the buffer
  174. count int
  175. // the size of the buffer
  176. size int
  177. // buffer contains the index of the last entry
  178. // inside one message.
  179. buffer []uint64
  180. }
  181. func newInflights(size int) *inflights {
  182. return &inflights{
  183. size: size,
  184. }
  185. }
  186. // add adds an inflight into inflights
  187. func (in *inflights) add(inflight uint64) {
  188. if in.full() {
  189. panic("cannot add into a full inflights")
  190. }
  191. next := in.start + in.count
  192. size := in.size
  193. if next >= size {
  194. next -= size
  195. }
  196. if next >= len(in.buffer) {
  197. in.growBuf()
  198. }
  199. in.buffer[next] = inflight
  200. in.count++
  201. }
  202. // grow the inflight buffer by doubling up to inflights.size. We grow on demand
  203. // instead of preallocating to inflights.size to handle systems which have
  204. // thousands of Raft groups per process.
  205. func (in *inflights) growBuf() {
  206. newSize := len(in.buffer) * 2
  207. if newSize == 0 {
  208. newSize = 1
  209. } else if newSize > in.size {
  210. newSize = in.size
  211. }
  212. newBuffer := make([]uint64, newSize)
  213. copy(newBuffer, in.buffer)
  214. in.buffer = newBuffer
  215. }
  216. // freeTo frees the inflights smaller or equal to the given `to` flight.
  217. func (in *inflights) freeTo(to uint64) {
  218. if in.count == 0 || to < in.buffer[in.start] {
  219. // out of the left side of the window
  220. return
  221. }
  222. idx := in.start
  223. var i int
  224. for i = 0; i < in.count; i++ {
  225. if to < in.buffer[idx] { // found the first large inflight
  226. break
  227. }
  228. // increase index and maybe rotate
  229. size := in.size
  230. if idx++; idx >= size {
  231. idx -= size
  232. }
  233. }
  234. // free i inflights and set new start index
  235. in.count -= i
  236. in.start = idx
  237. if in.count == 0 {
  238. // inflights is empty, reset the start index so that we don't grow the
  239. // buffer unnecessarily.
  240. in.start = 0
  241. }
  242. }
  243. func (in *inflights) freeFirstOne() { in.freeTo(in.buffer[in.start]) }
  244. // full returns true if the inflights is full.
  245. func (in *inflights) full() bool {
  246. return in.count == in.size
  247. }
  248. // resets frees all inflights.
  249. func (in *inflights) reset() {
  250. in.count = 0
  251. in.start = 0
  252. }
// progressTracker tracks the currently active configuration and the information
// known about the nodes and learners in it. In particular, it tracks the match
// index for each peer which in turn allows reasoning about the committed index.
type progressTracker struct {
	// voters is the voter configuration. Its two elements are each a set of
	// peer IDs; initProgress adds new voters to the first one.
	voters quorum.JointConfig
	// learners is the set of peer IDs tracked as learners.
	learners map[uint64]struct{}
	// prs holds the Progress of every tracked peer, keyed by peer ID.
	prs map[uint64]*Progress
	// votes holds the votes recorded via recordVote, keyed by peer ID.
	votes map[uint64]bool
	// maxInflight is the inflight window size given to each new Progress.
	maxInflight int
}
  263. func makeProgressTracker(maxInflight int) progressTracker {
  264. p := progressTracker{
  265. maxInflight: maxInflight,
  266. voters: quorum.JointConfig{
  267. quorum.MajorityConfig{},
  268. quorum.MajorityConfig{},
  269. },
  270. learners: map[uint64]struct{}{},
  271. votes: map[uint64]bool{},
  272. prs: map[uint64]*Progress{},
  273. }
  274. return p
  275. }
  276. // isSingleton returns true if (and only if) there is only one voting member
  277. // (i.e. the leader) in the current configuration.
  278. func (p *progressTracker) isSingleton() bool {
  279. return len(p.voters[0]) == 1 && len(p.voters[1]) == 0
  280. }
  281. type progressAckIndexer map[uint64]*Progress
  282. var _ quorum.AckedIndexer = progressAckIndexer(nil)
  283. func (l progressAckIndexer) AckedIndex(id uint64) (quorum.Index, bool) {
  284. pr, ok := l[id]
  285. if !ok {
  286. return 0, false
  287. }
  288. return quorum.Index(pr.Match), true
  289. }
  290. // committed returns the largest log index known to be committed based on what
  291. // the voting members of the group have acknowledged.
  292. func (p *progressTracker) committed() uint64 {
  293. return uint64(p.voters.CommittedIndex(progressAckIndexer(p.prs)))
  294. }
  295. func (p *progressTracker) removeAny(id uint64) {
  296. _, okPR := p.prs[id]
  297. _, okV1 := p.voters[0][id]
  298. _, okV2 := p.voters[1][id]
  299. _, okL := p.learners[id]
  300. okV := okV1 || okV2
  301. if !okPR {
  302. panic("attempting to remove unknown peer %x")
  303. } else if !okV && !okL {
  304. panic("attempting to remove unknown peer %x")
  305. } else if okV && okL {
  306. panic(fmt.Sprintf("peer %x is both voter and learner", id))
  307. }
  308. delete(p.voters[0], id)
  309. delete(p.voters[1], id)
  310. delete(p.learners, id)
  311. delete(p.prs, id)
  312. }
  313. // initProgress initializes a new progress for the given node or learner. The
  314. // node may not exist yet in either form or a panic will ensue.
  315. func (p *progressTracker) initProgress(id, match, next uint64, isLearner bool) {
  316. if pr := p.prs[id]; pr != nil {
  317. panic(fmt.Sprintf("peer %x already tracked as node %v", id, pr))
  318. }
  319. if !isLearner {
  320. p.voters[0][id] = struct{}{}
  321. } else {
  322. p.learners[id] = struct{}{}
  323. }
  324. p.prs[id] = &Progress{Next: next, Match: match, ins: newInflights(p.maxInflight), IsLearner: isLearner}
  325. }
  326. func (p *progressTracker) getProgress(id uint64) *Progress {
  327. return p.prs[id]
  328. }
  329. // visit invokes the supplied closure for all tracked progresses.
  330. func (p *progressTracker) visit(f func(id uint64, pr *Progress)) {
  331. for id, pr := range p.prs {
  332. f(id, pr)
  333. }
  334. }
  335. // checkQuorumActive returns true if the quorum is active from
  336. // the view of the local raft state machine. Otherwise, it returns
  337. // false.
  338. func (p *progressTracker) quorumActive() bool {
  339. votes := map[uint64]bool{}
  340. p.visit(func(id uint64, pr *Progress) {
  341. if pr.IsLearner {
  342. return
  343. }
  344. votes[id] = pr.RecentActive
  345. })
  346. return p.voters.VoteResult(votes) == quorum.VoteWon
  347. }
  348. func (p *progressTracker) voterNodes() []uint64 {
  349. m := p.voters.IDs()
  350. nodes := make([]uint64, 0, len(m))
  351. for id := range m {
  352. nodes = append(nodes, id)
  353. }
  354. sort.Sort(uint64Slice(nodes))
  355. return nodes
  356. }
  357. func (p *progressTracker) learnerNodes() []uint64 {
  358. nodes := make([]uint64, 0, len(p.learners))
  359. for id := range p.learners {
  360. nodes = append(nodes, id)
  361. }
  362. sort.Sort(uint64Slice(nodes))
  363. return nodes
  364. }
  365. // resetVotes prepares for a new round of vote counting via recordVote.
  366. func (p *progressTracker) resetVotes() {
  367. p.votes = map[uint64]bool{}
  368. }
  369. // recordVote records that the node with the given id voted for this Raft
  370. // instance if v == true (and declined it otherwise).
  371. func (p *progressTracker) recordVote(id uint64, v bool) {
  372. _, ok := p.votes[id]
  373. if !ok {
  374. p.votes[id] = v
  375. }
  376. }
  377. // tallyVotes returns the number of granted and rejected votes, and whether the
  378. // election outcome is known.
  379. func (p *progressTracker) tallyVotes() (granted int, rejected int, _ quorum.VoteResult) {
  380. // Make sure to populate granted/rejected correctly even if the votes slice
  381. // contains members no longer part of the configuration. This doesn't really
  382. // matter in the way the numbers are used (they're informational), but might
  383. // as well get it right.
  384. for id, pr := range p.prs {
  385. if pr.IsLearner {
  386. continue
  387. }
  388. if p.votes[id] {
  389. granted++
  390. } else {
  391. rejected++
  392. }
  393. }
  394. result := p.voters.VoteResult(p.votes)
  395. return granted, rejected, result
  396. }