http.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package rafthttp
  15. import (
  16. "context"
  17. "errors"
  18. "fmt"
  19. "io/ioutil"
  20. "net/http"
  21. "path"
  22. "strings"
  23. pioutil "github.com/coreos/etcd/pkg/ioutil"
  24. "github.com/coreos/etcd/pkg/types"
  25. "github.com/coreos/etcd/raft/raftpb"
  26. "github.com/coreos/etcd/raftsnap"
  27. "github.com/coreos/etcd/version"
  28. "go.uber.org/zap"
  29. )
  30. const (
  31. // connReadLimitByte limits the number of bytes
  32. // a single read can read out.
  33. //
  34. // 64KB should be large enough for not causing
  35. // throughput bottleneck as well as small enough
  36. // for not causing a read timeout.
  37. connReadLimitByte = 64 * 1024
  38. )
  39. var (
  40. RaftPrefix = "/raft"
  41. ProbingPrefix = path.Join(RaftPrefix, "probing")
  42. RaftStreamPrefix = path.Join(RaftPrefix, "stream")
  43. RaftSnapshotPrefix = path.Join(RaftPrefix, "snapshot")
  44. errIncompatibleVersion = errors.New("incompatible version")
  45. errClusterIDMismatch = errors.New("cluster ID mismatch")
  46. )
// peerGetter looks up a Peer by member ID.
type peerGetter interface {
	// Get returns the Peer registered for id. streamHandler treats a nil
	// result as "sender not found".
	Get(id types.ID) Peer
}
// writerToResponse is implemented by errors that can render themselves as
// an HTTP response. When processing a raft message returns such an error,
// the pipeline and snapshot handlers call WriteTo instead of replying with
// a generic internal-server-error response.
type writerToResponse interface {
	// WriteTo writes the error to w.
	WriteTo(w http.ResponseWriter)
}
// pipelineHandler is the http.Handler that receives raft messages POSTed
// over the pipeline path and hands them to the raft state machine.
type pipelineHandler struct {
	// lg is the structured logger; when nil the handler logs through plog.
	lg *zap.Logger
	// localID is reported as "local-member-id" in log entries and is passed
	// to the cluster compatibility check.
	localID types.ID
	// tr is handed to addRemoteFromRequest for every accepted request.
	tr Transporter
	// r processes the decoded raft messages.
	r Raft
	// cid is the local cluster ID; it is sent back in the X-Etcd-Cluster-ID
	// response header and compared against the one in the request.
	cid types.ID
}
  60. // newPipelineHandler returns a handler for handling raft messages
  61. // from pipeline for RaftPrefix.
  62. //
  63. // The handler reads out the raft message from request body,
  64. // and forwards it to the given raft state machine for processing.
  65. func newPipelineHandler(t *Transport, r Raft, cid types.ID) http.Handler {
  66. return &pipelineHandler{
  67. lg: t.Logger,
  68. localID: t.ID,
  69. tr: t,
  70. r: r,
  71. cid: cid,
  72. }
  73. }
  74. func (h *pipelineHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  75. if r.Method != "POST" {
  76. w.Header().Set("Allow", "POST")
  77. http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
  78. return
  79. }
  80. w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
  81. if err := checkClusterCompatibilityFromHeader(h.lg, h.localID, r.Header, h.cid); err != nil {
  82. http.Error(w, err.Error(), http.StatusPreconditionFailed)
  83. return
  84. }
  85. addRemoteFromRequest(h.tr, r)
  86. // Limit the data size that could be read from the request body, which ensures that read from
  87. // connection will not time out accidentally due to possible blocking in underlying implementation.
  88. limitedr := pioutil.NewLimitedBufferReader(r.Body, connReadLimitByte)
  89. b, err := ioutil.ReadAll(limitedr)
  90. if err != nil {
  91. if h.lg != nil {
  92. h.lg.Warn(
  93. "failed to read Raft message",
  94. zap.String("local-member-id", h.localID.String()),
  95. zap.Error(err),
  96. )
  97. } else {
  98. plog.Errorf("failed to read raft message (%v)", err)
  99. }
  100. http.Error(w, "error reading raft message", http.StatusBadRequest)
  101. recvFailures.WithLabelValues(r.RemoteAddr).Inc()
  102. return
  103. }
  104. var m raftpb.Message
  105. if err := m.Unmarshal(b); err != nil {
  106. if h.lg != nil {
  107. h.lg.Warn(
  108. "failed to unmarshal Raft message",
  109. zap.String("local-member-id", h.localID.String()),
  110. zap.Error(err),
  111. )
  112. } else {
  113. plog.Errorf("failed to unmarshal raft message (%v)", err)
  114. }
  115. http.Error(w, "error unmarshaling raft message", http.StatusBadRequest)
  116. recvFailures.WithLabelValues(r.RemoteAddr).Inc()
  117. return
  118. }
  119. receivedBytes.WithLabelValues(types.ID(m.From).String()).Add(float64(len(b)))
  120. if err := h.r.Process(context.TODO(), m); err != nil {
  121. switch v := err.(type) {
  122. case writerToResponse:
  123. v.WriteTo(w)
  124. default:
  125. if h.lg != nil {
  126. h.lg.Warn(
  127. "failed to process Raft message",
  128. zap.String("local-member-id", h.localID.String()),
  129. zap.Error(err),
  130. )
  131. } else {
  132. plog.Warningf("failed to process raft message (%v)", err)
  133. }
  134. http.Error(w, "error processing raft message", http.StatusInternalServerError)
  135. w.(http.Flusher).Flush()
  136. // disconnect the http stream
  137. panic(err)
  138. }
  139. return
  140. }
  141. // Write StatusNoContent header after the message has been processed by
  142. // raft, which facilitates the client to report MsgSnap status.
  143. w.WriteHeader(http.StatusNoContent)
  144. }
// snapshotHandler is the http.Handler that receives a snapshot message
// followed by the database snapshot itself, saves the snapshot and then
// hands the message to the raft state machine.
type snapshotHandler struct {
	// lg is the structured logger; when nil the handler logs through plog.
	lg *zap.Logger
	// tr is handed to addRemoteFromRequest for every accepted request.
	tr Transporter
	// r processes the decoded snapshot message.
	r Raft
	// snapshotter saves the database snapshot read from the request body.
	snapshotter *raftsnap.Snapshotter
	// localID is reported as "local-member-id" in log entries and is passed
	// to the cluster compatibility check.
	localID types.ID
	// cid is the local cluster ID; it is sent back in the X-Etcd-Cluster-ID
	// response header and compared against the one in the request.
	cid types.ID
}
  153. func newSnapshotHandler(t *Transport, r Raft, snapshotter *raftsnap.Snapshotter, cid types.ID) http.Handler {
  154. return &snapshotHandler{
  155. lg: t.Logger,
  156. tr: t,
  157. r: r,
  158. snapshotter: snapshotter,
  159. localID: t.ID,
  160. cid: cid,
  161. }
  162. }
  163. // ServeHTTP serves HTTP request to receive and process snapshot message.
  164. //
  165. // If request sender dies without closing underlying TCP connection,
  166. // the handler will keep waiting for the request body until TCP keepalive
  167. // finds out that the connection is broken after several minutes.
  168. // This is acceptable because
  169. // 1. snapshot messages sent through other TCP connections could still be
  170. // received and processed.
  171. // 2. this case should happen rarely, so no further optimization is done.
  172. func (h *snapshotHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  173. if r.Method != "POST" {
  174. w.Header().Set("Allow", "POST")
  175. http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
  176. return
  177. }
  178. w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
  179. if err := checkClusterCompatibilityFromHeader(h.lg, h.localID, r.Header, h.cid); err != nil {
  180. http.Error(w, err.Error(), http.StatusPreconditionFailed)
  181. return
  182. }
  183. addRemoteFromRequest(h.tr, r)
  184. dec := &messageDecoder{r: r.Body}
  185. // let snapshots be very large since they can exceed 512MB for large installations
  186. m, err := dec.decodeLimit(uint64(1 << 63))
  187. if err != nil {
  188. msg := fmt.Sprintf("failed to decode raft message (%v)", err)
  189. if h.lg != nil {
  190. h.lg.Warn(
  191. "failed to decode Raft message",
  192. zap.String("local-member-id", h.localID.String()),
  193. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  194. zap.Error(err),
  195. )
  196. } else {
  197. plog.Error(msg)
  198. }
  199. http.Error(w, msg, http.StatusBadRequest)
  200. recvFailures.WithLabelValues(r.RemoteAddr).Inc()
  201. return
  202. }
  203. receivedBytes.WithLabelValues(types.ID(m.From).String()).Add(float64(m.Size()))
  204. if m.Type != raftpb.MsgSnap {
  205. if h.lg != nil {
  206. h.lg.Warn(
  207. "unexpected Raft message type",
  208. zap.String("local-member-id", h.localID.String()),
  209. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  210. zap.String("message-type", m.Type.String()),
  211. )
  212. } else {
  213. plog.Errorf("unexpected raft message type %s on snapshot path", m.Type)
  214. }
  215. http.Error(w, "wrong raft message type", http.StatusBadRequest)
  216. return
  217. }
  218. if h.lg != nil {
  219. h.lg.Info(
  220. "receiving database snapshot",
  221. zap.String("local-member-id", h.localID.String()),
  222. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  223. zap.Uint64("snapshot-index", m.Snapshot.Metadata.Index),
  224. )
  225. } else {
  226. plog.Infof("receiving database snapshot [index:%d, from %s] ...", m.Snapshot.Metadata.Index, types.ID(m.From))
  227. }
  228. // save incoming database snapshot.
  229. n, err := h.snapshotter.SaveDBFrom(r.Body, m.Snapshot.Metadata.Index)
  230. if err != nil {
  231. msg := fmt.Sprintf("failed to save KV snapshot (%v)", err)
  232. if h.lg != nil {
  233. h.lg.Warn(
  234. "failed to save KV snapshot",
  235. zap.String("local-member-id", h.localID.String()),
  236. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  237. zap.Error(err),
  238. )
  239. } else {
  240. plog.Error(msg)
  241. }
  242. http.Error(w, msg, http.StatusInternalServerError)
  243. return
  244. }
  245. receivedBytes.WithLabelValues(types.ID(m.From).String()).Add(float64(n))
  246. if h.lg != nil {
  247. h.lg.Info(
  248. "received and saved database snapshot",
  249. zap.String("local-member-id", h.localID.String()),
  250. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  251. zap.Uint64("snapshot-index", m.Snapshot.Metadata.Index),
  252. )
  253. } else {
  254. plog.Infof("received and saved database snapshot [index: %d, from: %s] successfully", m.Snapshot.Metadata.Index, types.ID(m.From))
  255. }
  256. if err := h.r.Process(context.TODO(), m); err != nil {
  257. switch v := err.(type) {
  258. // Process may return writerToResponse error when doing some
  259. // additional checks before calling raft.Node.Step.
  260. case writerToResponse:
  261. v.WriteTo(w)
  262. default:
  263. msg := fmt.Sprintf("failed to process raft message (%v)", err)
  264. if h.lg != nil {
  265. h.lg.Warn(
  266. "failed to process Raft message",
  267. zap.String("local-member-id", h.localID.String()),
  268. zap.String("remote-snapshot-sender-id", types.ID(m.From).String()),
  269. zap.Error(err),
  270. )
  271. } else {
  272. plog.Error(msg)
  273. }
  274. http.Error(w, msg, http.StatusInternalServerError)
  275. }
  276. return
  277. }
  278. // Write StatusNoContent header after the message has been processed by
  279. // raft, which facilitates the client to report MsgSnap status.
  280. w.WriteHeader(http.StatusNoContent)
  281. }
// streamHandler is the http.Handler that accepts long-lived GET requests
// from remote peers and attaches the resulting connection to the matching
// peer as an outgoing stream.
type streamHandler struct {
	// lg is the structured logger; when nil the handler logs through plog.
	lg *zap.Logger
	// tr supplies the local member ID (tr.ID) and is used to add unknown
	// senders as remotes.
	tr *Transport
	// peerGetter resolves the sender ID taken from the request path.
	peerGetter peerGetter
	// r is consulted to reject senders that have been removed.
	r Raft
	// id is the ID the request's X-Raft-To header must match.
	id types.ID
	// cid is the local cluster ID; it is sent back in the X-Etcd-Cluster-ID
	// response header and compared against the one in the request.
	cid types.ID
}
  290. func newStreamHandler(t *Transport, pg peerGetter, r Raft, id, cid types.ID) http.Handler {
  291. return &streamHandler{
  292. lg: t.Logger,
  293. tr: t,
  294. peerGetter: pg,
  295. r: r,
  296. id: id,
  297. cid: cid,
  298. }
  299. }
  300. func (h *streamHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  301. if r.Method != "GET" {
  302. w.Header().Set("Allow", "GET")
  303. http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
  304. return
  305. }
  306. w.Header().Set("X-Server-Version", version.Version)
  307. w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
  308. if err := checkClusterCompatibilityFromHeader(h.lg, h.tr.ID, r.Header, h.cid); err != nil {
  309. http.Error(w, err.Error(), http.StatusPreconditionFailed)
  310. return
  311. }
  312. var t streamType
  313. switch path.Dir(r.URL.Path) {
  314. case streamTypeMsgAppV2.endpoint():
  315. t = streamTypeMsgAppV2
  316. case streamTypeMessage.endpoint():
  317. t = streamTypeMessage
  318. default:
  319. if h.lg != nil {
  320. h.lg.Debug(
  321. "ignored unexpected streaming request path",
  322. zap.String("local-member-id", h.tr.ID.String()),
  323. zap.String("remote-peer-id-stream-handler", h.id.String()),
  324. zap.String("path", r.URL.Path),
  325. )
  326. } else {
  327. plog.Debugf("ignored unexpected streaming request path %s", r.URL.Path)
  328. }
  329. http.Error(w, "invalid path", http.StatusNotFound)
  330. return
  331. }
  332. fromStr := path.Base(r.URL.Path)
  333. from, err := types.IDFromString(fromStr)
  334. if err != nil {
  335. if h.lg != nil {
  336. h.lg.Warn(
  337. "failed to parse path into ID",
  338. zap.String("local-member-id", h.tr.ID.String()),
  339. zap.String("remote-peer-id-stream-handler", h.id.String()),
  340. zap.String("path", fromStr),
  341. zap.Error(err),
  342. )
  343. } else {
  344. plog.Errorf("failed to parse from %s into ID (%v)", fromStr, err)
  345. }
  346. http.Error(w, "invalid from", http.StatusNotFound)
  347. return
  348. }
  349. if h.r.IsIDRemoved(uint64(from)) {
  350. if h.lg != nil {
  351. h.lg.Warn(
  352. "rejected stream from remote peer because it was removed",
  353. zap.String("local-member-id", h.tr.ID.String()),
  354. zap.String("remote-peer-id-stream-handler", h.id.String()),
  355. zap.String("remote-peer-id-from", from.String()),
  356. )
  357. } else {
  358. plog.Warningf("rejected the stream from peer %s since it was removed", from)
  359. }
  360. http.Error(w, "removed member", http.StatusGone)
  361. return
  362. }
  363. p := h.peerGetter.Get(from)
  364. if p == nil {
  365. // This may happen in following cases:
  366. // 1. user starts a remote peer that belongs to a different cluster
  367. // with the same cluster ID.
  368. // 2. local etcd falls behind of the cluster, and cannot recognize
  369. // the members that joined after its current progress.
  370. if urls := r.Header.Get("X-PeerURLs"); urls != "" {
  371. h.tr.AddRemote(from, strings.Split(urls, ","))
  372. }
  373. if h.lg != nil {
  374. h.lg.Warn(
  375. "failed to find remote peer in cluster",
  376. zap.String("local-member-id", h.tr.ID.String()),
  377. zap.String("remote-peer-id-stream-handler", h.id.String()),
  378. zap.String("remote-peer-id-from", from.String()),
  379. zap.String("cluster-id", h.cid.String()),
  380. )
  381. } else {
  382. plog.Errorf("failed to find member %s in cluster %s", from, h.cid)
  383. }
  384. http.Error(w, "error sender not found", http.StatusNotFound)
  385. return
  386. }
  387. wto := h.id.String()
  388. if gto := r.Header.Get("X-Raft-To"); gto != wto {
  389. if h.lg != nil {
  390. h.lg.Warn(
  391. "ignored streaming request; ID mismatch",
  392. zap.String("local-member-id", h.tr.ID.String()),
  393. zap.String("remote-peer-id-stream-handler", h.id.String()),
  394. zap.String("remote-peer-id-header", gto),
  395. zap.String("remote-peer-id-from", from.String()),
  396. zap.String("cluster-id", h.cid.String()),
  397. )
  398. } else {
  399. plog.Errorf("streaming request ignored (ID mismatch got %s want %s)", gto, wto)
  400. }
  401. http.Error(w, "to field mismatch", http.StatusPreconditionFailed)
  402. return
  403. }
  404. w.WriteHeader(http.StatusOK)
  405. w.(http.Flusher).Flush()
  406. c := newCloseNotifier()
  407. conn := &outgoingConn{
  408. t: t,
  409. Writer: w,
  410. Flusher: w.(http.Flusher),
  411. Closer: c,
  412. localID: h.tr.ID,
  413. peerID: h.id,
  414. }
  415. p.attachOutgoingConn(conn)
  416. <-c.closeNotify()
  417. }
  418. // checkClusterCompatibilityFromHeader checks the cluster compatibility of
  419. // the local member from the given header.
  420. // It checks whether the version of local member is compatible with
  421. // the versions in the header, and whether the cluster ID of local member
  422. // matches the one in the header.
  423. func checkClusterCompatibilityFromHeader(lg *zap.Logger, localID types.ID, header http.Header, cid types.ID) error {
  424. remoteName := header.Get("X-Server-From")
  425. remoteServer := serverVersion(header)
  426. remoteVs := ""
  427. if remoteServer != nil {
  428. remoteVs = remoteServer.String()
  429. }
  430. remoteMinClusterVer := minClusterVersion(header)
  431. remoteMinClusterVs := ""
  432. if remoteMinClusterVer != nil {
  433. remoteMinClusterVs = remoteMinClusterVer.String()
  434. }
  435. localServer, localMinCluster, err := checkVersionCompatibility(remoteName, remoteServer, remoteMinClusterVer)
  436. localVs := ""
  437. if localServer != nil {
  438. localVs = localServer.String()
  439. }
  440. localMinClusterVs := ""
  441. if localMinCluster != nil {
  442. localMinClusterVs = localMinCluster.String()
  443. }
  444. if err != nil {
  445. if lg != nil {
  446. lg.Warn(
  447. "failed to check version compatibility",
  448. zap.String("local-member-id", localID.String()),
  449. zap.String("local-member-cluster-id", cid.String()),
  450. zap.String("local-member-server-version", localVs),
  451. zap.String("local-member-server-minimum-cluster-version", localMinClusterVs),
  452. zap.String("remote-peer-server-name", remoteName),
  453. zap.String("remote-peer-server-version", remoteVs),
  454. zap.String("remote-peer-server-minimum-cluster-version", remoteMinClusterVs),
  455. zap.Error(err),
  456. )
  457. } else {
  458. plog.Errorf("request version incompatibility (%v)", err)
  459. }
  460. return errIncompatibleVersion
  461. }
  462. if gcid := header.Get("X-Etcd-Cluster-ID"); gcid != cid.String() {
  463. if lg != nil {
  464. lg.Warn(
  465. "request cluster ID mismatch",
  466. zap.String("local-member-id", localID.String()),
  467. zap.String("local-member-cluster-id", cid.String()),
  468. zap.String("local-member-server-version", localVs),
  469. zap.String("local-member-server-minimum-cluster-version", localMinClusterVs),
  470. zap.String("remote-peer-server-name", remoteName),
  471. zap.String("remote-peer-server-version", remoteVs),
  472. zap.String("remote-peer-server-minimum-cluster-version", remoteMinClusterVs),
  473. zap.String("remote-peer-cluster-id", gcid),
  474. )
  475. } else {
  476. plog.Errorf("request cluster ID mismatch (got %s want %s)", gcid, cid)
  477. }
  478. return errClusterIDMismatch
  479. }
  480. return nil
  481. }
  482. type closeNotifier struct {
  483. done chan struct{}
  484. }
  485. func newCloseNotifier() *closeNotifier {
  486. return &closeNotifier{
  487. done: make(chan struct{}),
  488. }
  489. }
  490. func (n *closeNotifier) Close() error {
  491. close(n.done)
  492. return nil
  493. }
  494. func (n *closeNotifier) closeNotify() <-chan struct{} { return n.done }