multinode.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503
  1. // Copyright 2015 CoreOS, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package raft
  15. import (
  16. "github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
  17. pb "github.com/coreos/etcd/raft/raftpb"
  18. )
// MultiNode represents a node that is participating in multiple consensus groups.
// A MultiNode is more efficient than a collection of Nodes.
// The methods of this interface correspond to the methods of Node and are described
// more fully there.
type MultiNode interface {
	// CreateGroup adds a new group to the MultiNode. The application must call CreateGroup
	// on each participating node with the same group ID; it may create groups on demand as it
	// receives messages. If the given storage contains existing log entries the list of peers
	// may be empty. If Config.ID field is zero it will be replaced by the ID passed
	// to StartMultiNode.
	CreateGroup(group uint64, c *Config, peers []Peer) error
	// RemoveGroup removes a group from the MultiNode.
	RemoveGroup(group uint64) error
	// Tick advances the internal logical clock by a single tick.
	Tick()
	// Campaign causes this MultiNode to transition to candidate state in the given group.
	Campaign(ctx context.Context, group uint64) error
	// Propose proposes that data be appended to the given group's log.
	Propose(ctx context.Context, group uint64, data []byte) error
	// ProposeConfChange proposes a config change.
	ProposeConfChange(ctx context.Context, group uint64, cc pb.ConfChange) error
	// ApplyConfChange applies a config change to the local node.
	ApplyConfChange(group uint64, cc pb.ConfChange) *pb.ConfState
	// Step advances the state machine using the given message.
	Step(ctx context.Context, group uint64, msg pb.Message) error
	// Ready returns a channel that returns the current point-in-time state of any ready
	// groups. Only groups with something to report will appear in the map.
	Ready() <-chan map[uint64]Ready
	// Advance notifies the node that the application has applied and saved progress in the
	// last Ready results. It must be called with the last value returned from the Ready()
	// channel.
	Advance(map[uint64]Ready)
	// Status returns the current status of the given group. Returns nil if no such group
	// exists.
	Status(group uint64) *Status
	// ReportUnreachable reports that the given node was not reachable for the last send.
	ReportUnreachable(id, groupID uint64)
	// ReportSnapshot reports the status of the sent snapshot.
	ReportSnapshot(id, groupID uint64, status SnapshotStatus)
	// Stop performs any necessary termination of the MultiNode.
	Stop()
}
  61. // StartMultiNode creates a MultiNode and starts its background
  62. // goroutine. If id is non-zero it identifies this node and will be
  63. // used as its node ID in all groups. The election and heartbeat
  64. // timers are in units of ticks.
  65. func StartMultiNode(id uint64) MultiNode {
  66. mn := newMultiNode(id)
  67. go mn.run()
  68. return &mn
  69. }
// multiMessage pairs a raft message with the ID of the group it is
// addressed to, so that messages for many groups can share one channel.
// TODO(bdarnell): add group ID to the underlying protos?
type multiMessage struct {
	// group is the ID of the group the message is for.
	group uint64
	// msg is the message to step into that group's raft state machine.
	msg pb.Message
}
// multiConfChange is the request sent to the run loop by ApplyConfChange.
type multiConfChange struct {
	// group is the ID of the group whose configuration is being changed.
	group uint64
	// msg is the configuration change to apply.
	msg pb.ConfChange
	// ch receives the resulting ConfState from the run loop.
	ch chan pb.ConfState
}
// multiStatus is the request sent to the run loop by Status.
type multiStatus struct {
	// group is the ID of the group being queried.
	group uint64
	// ch receives the group's status, or nil if the group does not exist.
	ch chan *Status
}
// groupCreation is the request sent to the run loop by CreateGroup.
type groupCreation struct {
	// id is the ID of the group to create.
	id uint64
	// config is the raft configuration for the new group; the run loop
	// fills in config.ID with the multiNode's ID when it is zero.
	config *Config
	// peers are turned into initial ConfChangeAddNode entries when the
	// group's storage is empty.
	peers []Peer
	// TODO(bdarnell): do we really need the done channel here? It's
	// unlike the rest of this package, but we need the group creation
	// to be complete before any Propose or other calls.
	// done is closed by the run loop once the group has been created.
	done chan struct{}
}
// groupRemoval is the request sent to the run loop by RemoveGroup.
type groupRemoval struct {
	// id is the ID of the group to remove.
	id uint64
	// TODO(bdarnell): see comment on groupCreation.done
	// done is closed by the run loop once the group has been removed.
	done chan struct{}
}
// multiNode is the channel-based implementation of MultiNode. All group
// state lives inside the run goroutine; the public methods communicate
// with it exclusively through the channels below.
type multiNode struct {
	// id is the node ID given to groups whose Config.ID is zero.
	id uint64
	// groupc carries CreateGroup requests.
	groupc chan groupCreation
	// rmgroupc carries RemoveGroup requests.
	rmgroupc chan groupRemoval
	// propc carries proposals (messages of type MsgProp).
	propc chan multiMessage
	// recvc carries every other kind of message.
	recvc chan multiMessage
	// confc carries ApplyConfChange requests.
	confc chan multiConfChange
	// readyc is the channel on which pending Ready values are published.
	readyc chan map[uint64]Ready
	// advancec carries the application's acknowledgement of a Ready map.
	advancec chan map[uint64]Ready
	// tickc carries Tick signals.
	tickc chan struct{}
	// stop carries the request to shut the run loop down.
	stop chan struct{}
	// done is closed by the run loop when it exits.
	done chan struct{}
	// status carries Status requests.
	status chan multiStatus
}
  112. func newMultiNode(id uint64) multiNode {
  113. return multiNode{
  114. id: id,
  115. groupc: make(chan groupCreation),
  116. rmgroupc: make(chan groupRemoval),
  117. propc: make(chan multiMessage),
  118. recvc: make(chan multiMessage),
  119. confc: make(chan multiConfChange),
  120. readyc: make(chan map[uint64]Ready),
  121. advancec: make(chan map[uint64]Ready),
  122. tickc: make(chan struct{}),
  123. stop: make(chan struct{}),
  124. done: make(chan struct{}),
  125. status: make(chan multiStatus),
  126. }
  127. }
// groupState is the run loop's bookkeeping for a single group.
type groupState struct {
	// id is the group's ID.
	id uint64
	// raft is the group's raft state machine.
	raft *raft
	// prevSoftSt and prevHardSt are the most recently recorded soft and
	// hard states; they are passed to newReady when building a Ready.
	prevSoftSt *SoftState
	prevHardSt pb.HardState
	// prevSnapi is the index of the last snapshot recorded by commitReady.
	prevSnapi uint64
}
  135. func (g *groupState) newReady() Ready {
  136. return newReady(g.raft, g.prevSoftSt, g.prevHardSt)
  137. }
  138. func (g *groupState) commitReady(rd Ready) {
  139. if rd.SoftState != nil {
  140. g.prevSoftSt = rd.SoftState
  141. }
  142. if !IsEmptyHardState(rd.HardState) {
  143. g.prevHardSt = rd.HardState
  144. }
  145. if g.prevHardSt.Commit != 0 {
  146. // In most cases, prevHardSt and rd.HardState will be the same
  147. // because when there are new entries to apply we just sent a
  148. // HardState with an updated Commit value. However, on initial
  149. // startup the two are different because we don't send a HardState
  150. // until something changes, but we do send any un-applied but
  151. // committed entries (and previously-committed entries may be
  152. // incorporated into the snapshot, even if rd.CommittedEntries is
  153. // empty). Therefore we mark all committed entries as applied
  154. // whether they were included in rd.HardState or not.
  155. g.raft.raftLog.appliedTo(g.prevHardSt.Commit)
  156. }
  157. if len(rd.Entries) > 0 {
  158. e := rd.Entries[len(rd.Entries)-1]
  159. g.raft.raftLog.stableTo(e.Index, e.Term)
  160. }
  161. if !IsEmptySnap(rd.Snapshot) {
  162. g.prevSnapi = rd.Snapshot.Metadata.Index
  163. g.raft.raftLog.stableSnapTo(g.prevSnapi)
  164. }
  165. }
  166. func (mn *multiNode) run() {
  167. groups := map[uint64]*groupState{}
  168. rds := map[uint64]Ready{}
  169. var advancec chan map[uint64]Ready
  170. for {
  171. // Only select readyc if we have something to report and we are not
  172. // currently waiting for an advance.
  173. readyc := mn.readyc
  174. if len(rds) == 0 || advancec != nil {
  175. readyc = nil
  176. }
  177. // group points to the group that was touched on this iteration (if any)
  178. var group *groupState
  179. select {
  180. case gc := <-mn.groupc:
  181. if (gc.config.ID != mn.id) && (gc.config.ID != 0 && mn.id != 0) {
  182. panic("if gc.config.ID and mn.id differ, one of them must be zero")
  183. }
  184. if gc.config.ID == 0 {
  185. gc.config.ID = mn.id
  186. }
  187. r := newRaft(gc.config)
  188. group = &groupState{
  189. id: gc.id,
  190. raft: r,
  191. }
  192. groups[gc.id] = group
  193. lastIndex, err := gc.config.Storage.LastIndex()
  194. if err != nil {
  195. panic(err) // TODO(bdarnell)
  196. }
  197. // If the log is empty, this is a new group (like StartNode); otherwise it's
  198. // restoring an existing group (like RestartNode).
  199. // TODO(bdarnell): rethink group initialization and whether the application needs
  200. // to be able to tell us when it expects the group to exist.
  201. if lastIndex == 0 {
  202. r.becomeFollower(1, None)
  203. ents := make([]pb.Entry, len(gc.peers))
  204. for i, peer := range gc.peers {
  205. cc := pb.ConfChange{Type: pb.ConfChangeAddNode, NodeID: peer.ID, Context: peer.Context}
  206. data, err := cc.Marshal()
  207. if err != nil {
  208. panic("unexpected marshal error")
  209. }
  210. ents[i] = pb.Entry{Type: pb.EntryConfChange, Term: 1, Index: uint64(i + 1), Data: data}
  211. }
  212. r.raftLog.append(ents...)
  213. r.raftLog.committed = uint64(len(ents))
  214. for _, peer := range gc.peers {
  215. r.addNode(peer.ID)
  216. }
  217. }
  218. // Set the initial hard and soft states after performing all initialization.
  219. group.prevSoftSt = r.softState()
  220. group.prevHardSt = r.HardState
  221. close(gc.done)
  222. case gr := <-mn.rmgroupc:
  223. delete(groups, gr.id)
  224. delete(rds, gr.id)
  225. close(gr.done)
  226. case mm := <-mn.propc:
  227. // TODO(bdarnell): single-node impl doesn't read from propc unless the group
  228. // has a leader; we can't do that since we have one propc for many groups.
  229. // We'll have to buffer somewhere on a group-by-group basis, or just let
  230. // raft.Step drop any such proposals on the floor.
  231. var ok bool
  232. if group, ok = groups[mm.group]; ok {
  233. mm.msg.From = group.raft.id
  234. group.raft.Step(mm.msg)
  235. }
  236. case mm := <-mn.recvc:
  237. group = groups[mm.group]
  238. if _, ok := group.raft.prs[mm.msg.From]; ok || !IsResponseMsg(mm.msg) {
  239. group.raft.Step(mm.msg)
  240. }
  241. case mcc := <-mn.confc:
  242. group = groups[mcc.group]
  243. if mcc.msg.NodeID == None {
  244. group.raft.resetPendingConf()
  245. select {
  246. case mcc.ch <- pb.ConfState{Nodes: group.raft.nodes()}:
  247. case <-mn.done:
  248. }
  249. break
  250. }
  251. switch mcc.msg.Type {
  252. case pb.ConfChangeAddNode:
  253. group.raft.addNode(mcc.msg.NodeID)
  254. case pb.ConfChangeRemoveNode:
  255. group.raft.removeNode(mcc.msg.NodeID)
  256. case pb.ConfChangeUpdateNode:
  257. group.raft.resetPendingConf()
  258. default:
  259. panic("unexpected conf type")
  260. }
  261. select {
  262. case mcc.ch <- pb.ConfState{Nodes: group.raft.nodes()}:
  263. case <-mn.done:
  264. }
  265. case <-mn.tickc:
  266. // TODO(bdarnell): instead of calling every group on every tick,
  267. // we should have a priority queue of groups based on their next
  268. // time-based event.
  269. for _, g := range groups {
  270. g.raft.tick()
  271. rd := g.newReady()
  272. if rd.containsUpdates() {
  273. rds[g.id] = rd
  274. }
  275. }
  276. case readyc <- rds:
  277. // Clear outgoing messages as soon as we've passed them to the application.
  278. for g := range rds {
  279. groups[g].raft.msgs = nil
  280. }
  281. rds = map[uint64]Ready{}
  282. advancec = mn.advancec
  283. case advs := <-advancec:
  284. for groupID, rd := range advs {
  285. g, ok := groups[groupID]
  286. if !ok {
  287. continue
  288. }
  289. g.commitReady(rd)
  290. // We've been accumulating new entries in rds which may now be obsolete.
  291. // Drop the old Ready object and create a new one if needed.
  292. delete(rds, groupID)
  293. newRd := g.newReady()
  294. if newRd.containsUpdates() {
  295. rds[groupID] = newRd
  296. }
  297. }
  298. advancec = nil
  299. case ms := <-mn.status:
  300. if g, ok := groups[ms.group]; ok {
  301. s := getStatus(g.raft)
  302. ms.ch <- &s
  303. } else {
  304. ms.ch <- nil
  305. }
  306. case <-mn.stop:
  307. close(mn.done)
  308. return
  309. }
  310. if group != nil {
  311. rd := group.newReady()
  312. if rd.containsUpdates() {
  313. rds[group.id] = rd
  314. }
  315. }
  316. }
  317. }
  318. func (mn *multiNode) CreateGroup(id uint64, config *Config, peers []Peer) error {
  319. gc := groupCreation{
  320. id: id,
  321. config: config,
  322. peers: peers,
  323. done: make(chan struct{}),
  324. }
  325. mn.groupc <- gc
  326. select {
  327. case <-gc.done:
  328. return nil
  329. case <-mn.done:
  330. return ErrStopped
  331. }
  332. }
  333. func (mn *multiNode) RemoveGroup(id uint64) error {
  334. gr := groupRemoval{
  335. id: id,
  336. done: make(chan struct{}),
  337. }
  338. mn.rmgroupc <- gr
  339. select {
  340. case <-gr.done:
  341. return nil
  342. case <-mn.done:
  343. return ErrStopped
  344. }
  345. }
  346. func (mn *multiNode) Stop() {
  347. select {
  348. case mn.stop <- struct{}{}:
  349. case <-mn.done:
  350. }
  351. <-mn.done
  352. }
  353. func (mn *multiNode) Tick() {
  354. select {
  355. case mn.tickc <- struct{}{}:
  356. case <-mn.done:
  357. }
  358. }
  359. func (mn *multiNode) Campaign(ctx context.Context, group uint64) error {
  360. return mn.step(ctx, multiMessage{group,
  361. pb.Message{
  362. Type: pb.MsgHup,
  363. },
  364. })
  365. }
  366. func (mn *multiNode) Propose(ctx context.Context, group uint64, data []byte) error {
  367. return mn.step(ctx, multiMessage{group,
  368. pb.Message{
  369. Type: pb.MsgProp,
  370. Entries: []pb.Entry{
  371. {Data: data},
  372. },
  373. }})
  374. }
  375. func (mn *multiNode) ProposeConfChange(ctx context.Context, group uint64, cc pb.ConfChange) error {
  376. data, err := cc.Marshal()
  377. if err != nil {
  378. return err
  379. }
  380. return mn.Step(ctx, group,
  381. pb.Message{
  382. Type: pb.MsgProp,
  383. Entries: []pb.Entry{
  384. {Type: pb.EntryConfChange, Data: data},
  385. },
  386. })
  387. }
  388. func (mn *multiNode) step(ctx context.Context, m multiMessage) error {
  389. ch := mn.recvc
  390. if m.msg.Type == pb.MsgProp {
  391. ch = mn.propc
  392. }
  393. select {
  394. case ch <- m:
  395. return nil
  396. case <-ctx.Done():
  397. return ctx.Err()
  398. case <-mn.done:
  399. return ErrStopped
  400. }
  401. }
  402. func (mn *multiNode) ApplyConfChange(group uint64, cc pb.ConfChange) *pb.ConfState {
  403. mcc := multiConfChange{group, cc, make(chan pb.ConfState)}
  404. select {
  405. case mn.confc <- mcc:
  406. case <-mn.done:
  407. }
  408. select {
  409. case cs := <-mcc.ch:
  410. return &cs
  411. case <-mn.done:
  412. // Per comments on Node.ApplyConfChange, this method should never return nil.
  413. return &pb.ConfState{}
  414. }
  415. }
  416. func (mn *multiNode) Step(ctx context.Context, group uint64, m pb.Message) error {
  417. // ignore unexpected local messages receiving over network
  418. if IsLocalMsg(m) {
  419. // TODO: return an error?
  420. return nil
  421. }
  422. return mn.step(ctx, multiMessage{group, m})
  423. }
  424. func (mn *multiNode) Ready() <-chan map[uint64]Ready {
  425. return mn.readyc
  426. }
  427. func (mn *multiNode) Advance(rds map[uint64]Ready) {
  428. select {
  429. case mn.advancec <- rds:
  430. case <-mn.done:
  431. }
  432. }
  433. func (mn *multiNode) Status(group uint64) *Status {
  434. ms := multiStatus{
  435. group: group,
  436. ch: make(chan *Status),
  437. }
  438. mn.status <- ms
  439. return <-ms.ch
  440. }
  441. func (mn *multiNode) ReportUnreachable(id, groupID uint64) {
  442. select {
  443. case mn.recvc <- multiMessage{
  444. group: groupID,
  445. msg: pb.Message{Type: pb.MsgUnreachable, From: id},
  446. }:
  447. case <-mn.done:
  448. }
  449. }
  450. func (mn *multiNode) ReportSnapshot(id, groupID uint64, status SnapshotStatus) {
  451. rej := status == SnapshotFailure
  452. select {
  453. case mn.recvc <- multiMessage{
  454. group: groupID,
  455. msg: pb.Message{Type: pb.MsgSnapStatus, From: id, Reject: rej},
  456. }:
  457. case <-mn.done:
  458. }
  459. }