confchange.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. // Copyright 2019 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package confchange
  15. import (
  16. "errors"
  17. "fmt"
  18. "strings"
  19. "go.etcd.io/etcd/raft/quorum"
  20. pb "go.etcd.io/etcd/raft/raftpb"
  21. "go.etcd.io/etcd/raft/tracker"
  22. )
  23. // Changer facilitates configuration changes. It exposes methods to handle
  24. // simple and joint consensus while performing the proper validation that allows
  25. // refusing invalid configuration changes before they affect the active
  26. // configuration.
  27. type Changer struct {
  28. Tracker tracker.ProgressTracker
  29. LastIndex uint64
  30. }
  31. // EnterJoint verifies that the outgoing (=right) majority config of the joint
  32. // config is empty and initializes it with a copy of the incoming (=left)
  33. // majority config. That is, it transitions from
  34. //
  35. // (1 2 3)&&()
  36. // to
  37. // (1 2 3)&&(1 2 3).
  38. //
  39. // The supplied changes are then applied to the incoming majority config,
  40. // resulting in a joint configuration that in terms of the Raft thesis[1]
  41. // (Section 4.3) corresponds to `C_{new,old}`.
  42. //
  43. // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
  44. func (c Changer) EnterJoint(autoLeave bool, ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
  45. cfg, prs, err := c.checkAndCopy()
  46. if err != nil {
  47. return c.err(err)
  48. }
  49. if joint(cfg) {
  50. err := errors.New("config is already joint")
  51. return c.err(err)
  52. }
  53. if len(incoming(cfg.Voters)) == 0 {
  54. // We allow adding nodes to an empty config for convenience (testing and
  55. // bootstrap), but you can't enter a joint state.
  56. err := errors.New("can't make a zero-voter config joint")
  57. return c.err(err)
  58. }
  59. // Clear the outgoing config.
  60. *outgoingPtr(&cfg.Voters) = quorum.MajorityConfig{}
  61. // Copy incoming to outgoing.
  62. for id := range incoming(cfg.Voters) {
  63. outgoing(cfg.Voters)[id] = struct{}{}
  64. }
  65. if err := c.apply(&cfg, prs, ccs...); err != nil {
  66. return c.err(err)
  67. }
  68. cfg.AutoLeave = autoLeave
  69. return checkAndReturn(cfg, prs)
  70. }
  71. // LeaveJoint transitions out of a joint configuration. It is an error to call
  72. // this method if the configuration is not joint, i.e. if the outgoing majority
  73. // config Voters[1] is empty.
  74. //
  75. // The outgoing majority config of the joint configuration will be removed,
  76. // that is, the incoming config is promoted as the sole decision maker. In the
  77. // notation of the Raft thesis[1] (Section 4.3), this method transitions from
  78. // `C_{new,old}` into `C_new`.
  79. //
  80. // At the same time, any staged learners (LearnersNext) the addition of which
  81. // was held back by an overlapping voter in the former outgoing config will be
  82. // inserted into Learners.
  83. //
  84. // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
  85. func (c Changer) LeaveJoint() (tracker.Config, tracker.ProgressMap, error) {
  86. cfg, prs, err := c.checkAndCopy()
  87. if err != nil {
  88. return c.err(err)
  89. }
  90. if !joint(cfg) {
  91. err := errors.New("can't leave a non-joint config")
  92. return c.err(err)
  93. }
  94. if len(outgoing(cfg.Voters)) == 0 {
  95. err := fmt.Errorf("configuration is not joint: %v", cfg)
  96. return c.err(err)
  97. }
  98. for id := range cfg.LearnersNext {
  99. nilAwareAdd(&cfg.Learners, id)
  100. prs[id].IsLearner = true
  101. }
  102. cfg.LearnersNext = nil
  103. for id := range outgoing(cfg.Voters) {
  104. _, isVoter := incoming(cfg.Voters)[id]
  105. _, isLearner := cfg.Learners[id]
  106. if !isVoter && !isLearner {
  107. delete(prs, id)
  108. }
  109. }
  110. *outgoingPtr(&cfg.Voters) = nil
  111. cfg.AutoLeave = false
  112. return checkAndReturn(cfg, prs)
  113. }
  114. // Simple carries out a series of configuration changes that (in aggregate)
  115. // mutates the incoming majority config Voters[0] by at most one. This method
  116. // will return an error if that is not the case, if the resulting quorum is
  117. // zero, or if the configuration is in a joint state (i.e. if there is an
  118. // outgoing configuration).
  119. func (c Changer) Simple(ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
  120. cfg, prs, err := c.checkAndCopy()
  121. if err != nil {
  122. return c.err(err)
  123. }
  124. if joint(cfg) {
  125. err := errors.New("can't apply simple config change in joint config")
  126. return c.err(err)
  127. }
  128. if err := c.apply(&cfg, prs, ccs...); err != nil {
  129. return c.err(err)
  130. }
  131. if n := symdiff(incoming(c.Tracker.Voters), incoming(cfg.Voters)); n > 1 {
  132. return tracker.Config{}, nil, errors.New("more than one voter changed without entering joint config")
  133. }
  134. if err := checkInvariants(cfg, prs); err != nil {
  135. return tracker.Config{}, tracker.ProgressMap{}, nil
  136. }
  137. return checkAndReturn(cfg, prs)
  138. }
  139. // apply a change to the configuration. By convention, changes to voters are
  140. // always made to the incoming majority config Voters[0]. Voters[1] is either
  141. // empty or preserves the outgoing majority configuration while in a joint state.
  142. func (c Changer) apply(cfg *tracker.Config, prs tracker.ProgressMap, ccs ...pb.ConfChangeSingle) error {
  143. for _, cc := range ccs {
  144. if cc.NodeID == 0 {
  145. // etcd replaces the NodeID with zero if it decides (downstream of
  146. // raft) to not apply a change, so we have to have explicit code
  147. // here to ignore these.
  148. continue
  149. }
  150. switch cc.Type {
  151. case pb.ConfChangeAddNode:
  152. c.makeVoter(cfg, prs, cc.NodeID)
  153. case pb.ConfChangeAddLearnerNode:
  154. c.makeLearner(cfg, prs, cc.NodeID)
  155. case pb.ConfChangeRemoveNode:
  156. c.remove(cfg, prs, cc.NodeID)
  157. case pb.ConfChangeUpdateNode:
  158. default:
  159. return fmt.Errorf("unexpected conf type %d", cc.Type)
  160. }
  161. }
  162. if len(incoming(cfg.Voters)) == 0 {
  163. return errors.New("removed all voters")
  164. }
  165. return nil
  166. }
  167. // makeVoter adds or promotes the given ID to be a voter in the incoming
  168. // majority config.
  169. func (c Changer) makeVoter(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
  170. pr := prs[id]
  171. if pr == nil {
  172. c.initProgress(cfg, prs, id, false /* isLearner */)
  173. return
  174. }
  175. pr.IsLearner = false
  176. nilAwareDelete(&cfg.Learners, id)
  177. nilAwareDelete(&cfg.LearnersNext, id)
  178. incoming(cfg.Voters)[id] = struct{}{}
  179. return
  180. }
  181. // makeLearner makes the given ID a learner or stages it to be a learner once
  182. // an active joint configuration is exited.
  183. //
  184. // The former happens when the peer is not a part of the outgoing config, in
  185. // which case we either add a new learner or demote a voter in the incoming
  186. // config.
  187. //
  188. // The latter case occurs when the configuration is joint and the peer is a
  189. // voter in the outgoing config. In that case, we do not want to add the peer
  190. // as a learner because then we'd have to track a peer as a voter and learner
  191. // simultaneously. Instead, we add the learner to LearnersNext, so that it will
  192. // be added to Learners the moment the outgoing config is removed by
  193. // LeaveJoint().
  194. func (c Changer) makeLearner(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
  195. pr := prs[id]
  196. if pr == nil {
  197. c.initProgress(cfg, prs, id, true /* isLearner */)
  198. return
  199. }
  200. if pr.IsLearner {
  201. return
  202. }
  203. // Remove any existing voter in the incoming config...
  204. c.remove(cfg, prs, id)
  205. // ... but save the Progress.
  206. prs[id] = pr
  207. // Use LearnersNext if we can't add the learner to Learners directly, i.e.
  208. // if the peer is still tracked as a voter in the outgoing config. It will
  209. // be turned into a learner in LeaveJoint().
  210. //
  211. // Otherwise, add a regular learner right away.
  212. if _, onRight := outgoing(cfg.Voters)[id]; onRight {
  213. nilAwareAdd(&cfg.LearnersNext, id)
  214. } else {
  215. pr.IsLearner = true
  216. nilAwareAdd(&cfg.Learners, id)
  217. }
  218. }
  219. // remove this peer as a voter or learner from the incoming config.
  220. func (c Changer) remove(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
  221. if _, ok := prs[id]; !ok {
  222. return
  223. }
  224. delete(incoming(cfg.Voters), id)
  225. nilAwareDelete(&cfg.Learners, id)
  226. nilAwareDelete(&cfg.LearnersNext, id)
  227. // If the peer is still a voter in the outgoing config, keep the Progress.
  228. if _, onRight := outgoing(cfg.Voters)[id]; !onRight {
  229. delete(prs, id)
  230. }
  231. }
  232. // initProgress initializes a new progress for the given node or learner.
  233. func (c Changer) initProgress(cfg *tracker.Config, prs tracker.ProgressMap, id uint64, isLearner bool) {
  234. if !isLearner {
  235. incoming(cfg.Voters)[id] = struct{}{}
  236. } else {
  237. nilAwareAdd(&cfg.Learners, id)
  238. }
  239. prs[id] = &tracker.Progress{
  240. // Initializing the Progress with the last index means that the follower
  241. // can be probed (with the last index).
  242. //
  243. // TODO(tbg): seems awfully optimistic. Using the first index would be
  244. // better. The general expectation here is that the follower has no log
  245. // at all (and will thus likely need a snapshot), though the app may
  246. // have applied a snapshot out of band before adding the replica (thus
  247. // making the first index the better choice).
  248. Next: c.LastIndex,
  249. Match: 0,
  250. Inflights: tracker.NewInflights(c.Tracker.MaxInflight),
  251. IsLearner: isLearner,
  252. // When a node is first added, we should mark it as recently active.
  253. // Otherwise, CheckQuorum may cause us to step down if it is invoked
  254. // before the added node has had a chance to communicate with us.
  255. RecentActive: true,
  256. }
  257. }
  258. // checkInvariants makes sure that the config and progress are compatible with
  259. // each other. This is used to check both what the Changer is initialized with,
  260. // as well as what it returns.
  261. func checkInvariants(cfg tracker.Config, prs tracker.ProgressMap) error {
  262. // NB: intentionally allow the empty config. In production we'll never see a
  263. // non-empty config (we prevent it from being created) but we will need to
  264. // be able to *create* an initial config, for example during bootstrap (or
  265. // during tests). Instead of having to hand-code this, we allow
  266. // transitioning from an empty config into any other legal and non-empty
  267. // config.
  268. for _, ids := range []map[uint64]struct{}{
  269. cfg.Voters.IDs(),
  270. cfg.Learners,
  271. cfg.LearnersNext,
  272. } {
  273. for id := range ids {
  274. if _, ok := prs[id]; !ok {
  275. return fmt.Errorf("no progress for %d", id)
  276. }
  277. }
  278. }
  279. // Any staged learner was staged because it could not be directly added due
  280. // to a conflicting voter in the outgoing config.
  281. for id := range cfg.LearnersNext {
  282. if _, ok := outgoing(cfg.Voters)[id]; !ok {
  283. return fmt.Errorf("%d is in LearnersNext, but not Voters[1]", id)
  284. }
  285. if prs[id].IsLearner {
  286. return fmt.Errorf("%d is in LearnersNext, but is already marked as learner", id)
  287. }
  288. }
  289. // Conversely Learners and Voters doesn't intersect at all.
  290. for id := range cfg.Learners {
  291. if _, ok := outgoing(cfg.Voters)[id]; ok {
  292. return fmt.Errorf("%d is in Learners and Voters[1]", id)
  293. }
  294. if _, ok := incoming(cfg.Voters)[id]; ok {
  295. return fmt.Errorf("%d is in Learners and Voters[0]", id)
  296. }
  297. if !prs[id].IsLearner {
  298. return fmt.Errorf("%d is in Learners, but is not marked as learner", id)
  299. }
  300. }
  301. if !joint(cfg) {
  302. // We enforce that empty maps are nil instead of zero.
  303. if outgoing(cfg.Voters) != nil {
  304. return fmt.Errorf("Voters[1] must be nil when not joint")
  305. }
  306. if cfg.LearnersNext != nil {
  307. return fmt.Errorf("LearnersNext must be nil when not joint")
  308. }
  309. if cfg.AutoLeave {
  310. return fmt.Errorf("AutoLeave must be false when not joint")
  311. }
  312. }
  313. return nil
  314. }
  315. // checkAndCopy copies the tracker's config and progress map (deeply enough for
  316. // the purposes of the Changer) and returns those copies. It returns an error
  317. // if checkInvariants does.
  318. func (c Changer) checkAndCopy() (tracker.Config, tracker.ProgressMap, error) {
  319. cfg := c.Tracker.Config.Clone()
  320. prs := tracker.ProgressMap{}
  321. for id, pr := range c.Tracker.Progress {
  322. // A shallow copy is enough because we only mutate the Learner field.
  323. ppr := *pr
  324. prs[id] = &ppr
  325. }
  326. return checkAndReturn(cfg, prs)
  327. }
  328. // checkAndReturn calls checkInvariants on the input and returns either the
  329. // resulting error or the input.
  330. func checkAndReturn(cfg tracker.Config, prs tracker.ProgressMap) (tracker.Config, tracker.ProgressMap, error) {
  331. if err := checkInvariants(cfg, prs); err != nil {
  332. return tracker.Config{}, tracker.ProgressMap{}, err
  333. }
  334. return cfg, prs, nil
  335. }
  336. // err returns zero values and an error.
  337. func (c Changer) err(err error) (tracker.Config, tracker.ProgressMap, error) {
  338. return tracker.Config{}, nil, err
  339. }
  340. // nilAwareAdd populates a map entry, creating the map if necessary.
  341. func nilAwareAdd(m *map[uint64]struct{}, id uint64) {
  342. if *m == nil {
  343. *m = map[uint64]struct{}{}
  344. }
  345. (*m)[id] = struct{}{}
  346. }
  347. // nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after.
  348. func nilAwareDelete(m *map[uint64]struct{}, id uint64) {
  349. if *m == nil {
  350. return
  351. }
  352. delete(*m, id)
  353. if len(*m) == 0 {
  354. *m = nil
  355. }
  356. }
  357. // symdiff returns the count of the symmetric difference between the sets of
  358. // uint64s, i.e. len( (l - r) \union (r - l)).
  359. func symdiff(l, r map[uint64]struct{}) int {
  360. var n int
  361. pairs := [][2]quorum.MajorityConfig{
  362. {l, r}, // count elems in l but not in r
  363. {r, l}, // count elems in r but not in l
  364. }
  365. for _, p := range pairs {
  366. for id := range p[0] {
  367. if _, ok := p[1][id]; !ok {
  368. n++
  369. }
  370. }
  371. }
  372. return n
  373. }
  374. func joint(cfg tracker.Config) bool {
  375. return len(outgoing(cfg.Voters)) > 0
  376. }
  377. func incoming(voters quorum.JointConfig) quorum.MajorityConfig { return voters[0] }
  378. func outgoing(voters quorum.JointConfig) quorum.MajorityConfig { return voters[1] }
  379. func outgoingPtr(voters *quorum.JointConfig) *quorum.MajorityConfig { return &voters[1] }
  380. // Describe prints the type and NodeID of the configuration changes as a
  381. // space-delimited string.
  382. func Describe(ccs ...pb.ConfChangeSingle) string {
  383. var buf strings.Builder
  384. for _, cc := range ccs {
  385. if buf.Len() > 0 {
  386. buf.WriteByte(' ')
  387. }
  388. fmt.Fprintf(&buf, "%s(%d)", cc.Type, cc.NodeID)
  389. }
  390. return buf.String()
  391. }