v3_snapshot.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  1. // Copyright 2018 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package snapshot
  15. import (
  16. "context"
  17. "crypto/sha256"
  18. "encoding/json"
  19. "fmt"
  20. "hash/crc32"
  21. "io"
  22. "math"
  23. "os"
  24. "path/filepath"
  25. "reflect"
  26. "strings"
  27. "time"
  28. bolt "github.com/coreos/bbolt"
  29. "github.com/coreos/etcd/clientv3"
  30. "github.com/coreos/etcd/etcdserver"
  31. "github.com/coreos/etcd/etcdserver/etcdserverpb"
  32. "github.com/coreos/etcd/etcdserver/membership"
  33. "github.com/coreos/etcd/lease"
  34. "github.com/coreos/etcd/mvcc"
  35. "github.com/coreos/etcd/mvcc/backend"
  36. "github.com/coreos/etcd/pkg/fileutil"
  37. "github.com/coreos/etcd/pkg/types"
  38. "github.com/coreos/etcd/raft"
  39. "github.com/coreos/etcd/raft/raftpb"
  40. "github.com/coreos/etcd/snap"
  41. "github.com/coreos/etcd/store"
  42. "github.com/coreos/etcd/wal"
  43. "github.com/coreos/etcd/wal/walpb"
  44. "go.uber.org/zap"
  45. )
// Manager defines snapshot methods.
//
// It groups the three operations this file implements for v3 snapshot
// files: saving one from a remote server, inspecting an existing file,
// and restoring a data directory from one.
type Manager interface {
	// Save fetches snapshot from remote etcd server and saves data
	// to target path. If the context "ctx" is canceled or timed out,
	// snapshot save stream will error out (e.g. context.Canceled,
	// context.DeadlineExceeded). Make sure to specify only one endpoint
	// in client configuration. Snapshot API must be requested to a
	// selected node, and saved snapshot is the point-in-time state of
	// the selected node.
	Save(ctx context.Context, cfg clientv3.Config, dbPath string) error

	// Status returns the snapshot file information for the file at
	// "dbPath", or an error if the file cannot be inspected.
	Status(dbPath string) (Status, error)

	// Restore restores a new etcd data directory from given snapshot
	// file. It returns an error if specified data directory already
	// exists, to prevent unintended data directory overwrites.
	Restore(cfg RestoreConfig) error
}
  63. // NewV3 returns a new snapshot Manager for v3.x snapshot.
  64. func NewV3(lg *zap.Logger) Manager {
  65. if lg == nil {
  66. lg = zap.NewExample()
  67. }
  68. return &v3Manager{lg: lg}
  69. }
// v3Manager is the Manager implementation returned by NewV3.
// Apart from lg, its fields are filled in by Restore and then read by
// saveDB and saveWALAndSnap.
type v3Manager struct {
	// lg receives the progress messages emitted by Save and Restore.
	lg *zap.Logger

	// name is the member name taken from RestoreConfig.Name.
	name string
	// dbPath is the snapshot file to restore from (RestoreConfig.SnapshotPath).
	dbPath string
	// walDir is the directory in which the new WAL is created.
	walDir string
	// snapDir is "<data-dir>/member/snap"; it receives the copied db file
	// and the raft snapshot.
	snapDir string
	// cl is the cluster built from RestoreConfig.InitialCluster and
	// RestoreConfig.InitialClusterToken.
	cl *membership.RaftCluster

	// skipHashCheck disables the sha256 integrity verification in saveDB.
	skipHashCheck bool
}
  79. // Save fetches snapshot from remote etcd server and saves data to target path.
  80. func (s *v3Manager) Save(ctx context.Context, cfg clientv3.Config, dbPath string) error {
  81. if len(cfg.Endpoints) != 1 {
  82. return fmt.Errorf("snapshot must be requested to one selected node, not multiple %v", cfg.Endpoints)
  83. }
  84. cli, err := clientv3.New(cfg)
  85. if err != nil {
  86. return err
  87. }
  88. defer cli.Close()
  89. partpath := dbPath + ".part"
  90. defer os.RemoveAll(partpath)
  91. var f *os.File
  92. f, err = os.OpenFile(partpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, fileutil.PrivateFileMode)
  93. if err != nil {
  94. return fmt.Errorf("could not open %s (%v)", partpath, err)
  95. }
  96. s.lg.Info(
  97. "created temporary db file",
  98. zap.String("path", partpath),
  99. )
  100. now := time.Now()
  101. var rd io.ReadCloser
  102. rd, err = cli.Snapshot(ctx)
  103. if err != nil {
  104. return err
  105. }
  106. s.lg.Info(
  107. "fetching snapshot",
  108. zap.String("endpoint", cfg.Endpoints[0]),
  109. )
  110. if _, err = io.Copy(f, rd); err != nil {
  111. return err
  112. }
  113. if err = fileutil.Fsync(f); err != nil {
  114. return err
  115. }
  116. if err = f.Close(); err != nil {
  117. return err
  118. }
  119. s.lg.Info(
  120. "fetched snapshot",
  121. zap.String("endpoint", cfg.Endpoints[0]),
  122. zap.Duration("took", time.Since(now)),
  123. )
  124. if err = os.Rename(partpath, dbPath); err != nil {
  125. return fmt.Errorf("could not rename %s to %s (%v)", partpath, dbPath, err)
  126. }
  127. s.lg.Info("saved", zap.String("path", dbPath))
  128. return nil
  129. }
// Status is the snapshot file status, as computed by the Status method
// of the v3 Manager.
type Status struct {
	// Hash is the CRC-32 (Castagnoli table) checksum over every bucket
	// name, key and value read from the snapshot file.
	Hash uint32 `json:"hash"`

	// Revision is the main revision decoded from the last entry visited
	// in the "key" bucket.
	Revision int64 `json:"revision"`

	// TotalKey is the number of key/value entries visited across all
	// buckets, not only the "key" bucket.
	TotalKey int `json:"totalKey"`

	// TotalSize is the size reported by the read transaction on the
	// snapshot file.
	TotalSize int64 `json:"totalSize"`
}
  137. // Status returns the snapshot file information.
  138. func (s *v3Manager) Status(dbPath string) (ds Status, err error) {
  139. if _, err = os.Stat(dbPath); err != nil {
  140. return ds, err
  141. }
  142. db, err := bolt.Open(dbPath, 0400, &bolt.Options{ReadOnly: true})
  143. if err != nil {
  144. return ds, err
  145. }
  146. defer db.Close()
  147. h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
  148. if err = db.View(func(tx *bolt.Tx) error {
  149. // check snapshot file integrity first
  150. var dbErrStrings []string
  151. for dbErr := range tx.Check() {
  152. dbErrStrings = append(dbErrStrings, dbErr.Error())
  153. }
  154. if len(dbErrStrings) > 0 {
  155. return fmt.Errorf("snapshot file integrity check failed. %d errors found.\n"+strings.Join(dbErrStrings, "\n"), len(dbErrStrings))
  156. }
  157. ds.TotalSize = tx.Size()
  158. c := tx.Cursor()
  159. for next, _ := c.First(); next != nil; next, _ = c.Next() {
  160. b := tx.Bucket(next)
  161. if b == nil {
  162. return fmt.Errorf("cannot get hash of bucket %s", string(next))
  163. }
  164. h.Write(next)
  165. iskeyb := (string(next) == "key")
  166. b.ForEach(func(k, v []byte) error {
  167. h.Write(k)
  168. h.Write(v)
  169. if iskeyb {
  170. rev := bytesToRev(k)
  171. ds.Revision = rev.main
  172. }
  173. ds.TotalKey++
  174. return nil
  175. })
  176. }
  177. return nil
  178. }); err != nil {
  179. return ds, err
  180. }
  181. ds.Hash = h.Sum32()
  182. return ds, nil
  183. }
// RestoreConfig configures snapshot restore operation.
type RestoreConfig struct {
	// SnapshotPath is the path of snapshot file to restore from.
	SnapshotPath string

	// Name is the human-readable name of this member.
	Name string

	// OutputDataDir is the target data directory to save restored data.
	// OutputDataDir should not conflict with existing etcd data directory.
	// If OutputDataDir already exists, it will return an error to prevent
	// unintended data directory overwrites.
	// If empty, defaults to "[Name].etcd".
	OutputDataDir string

	// OutputWALDir is the target WAL data directory.
	// If empty, defaults to "[OutputDataDir]/member/wal".
	// If given and the directory already exists, restore returns an error.
	OutputWALDir string

	// PeerURLs is a list of member's peer URLs to advertise to the rest of the cluster.
	PeerURLs []string

	// InitialCluster is the initial cluster configuration for restore bootstrap.
	InitialCluster string

	// InitialClusterToken is the initial cluster token for etcd cluster during restore bootstrap.
	InitialClusterToken string

	// SkipHashCheck is "true" to ignore snapshot integrity hash value
	// (required if copied from data directory).
	SkipHashCheck bool
}
  209. // Restore restores a new etcd data directory from given snapshot file.
  210. func (s *v3Manager) Restore(cfg RestoreConfig) error {
  211. pURLs, err := types.NewURLs(cfg.PeerURLs)
  212. if err != nil {
  213. return err
  214. }
  215. var ics types.URLsMap
  216. ics, err = types.NewURLsMap(cfg.InitialCluster)
  217. if err != nil {
  218. return err
  219. }
  220. srv := etcdserver.ServerConfig{
  221. Name: cfg.Name,
  222. PeerURLs: pURLs,
  223. InitialPeerURLsMap: ics,
  224. InitialClusterToken: cfg.InitialClusterToken,
  225. }
  226. if err = srv.VerifyBootstrap(); err != nil {
  227. return err
  228. }
  229. s.cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, ics)
  230. if err != nil {
  231. return err
  232. }
  233. dataDir := cfg.OutputDataDir
  234. if dataDir == "" {
  235. dataDir = cfg.Name + ".etcd"
  236. }
  237. if fileutil.Exist(dataDir) {
  238. return fmt.Errorf("data-dir %q exists", dataDir)
  239. }
  240. walDir := cfg.OutputWALDir
  241. if walDir == "" {
  242. walDir = filepath.Join(dataDir, "member", "wal")
  243. } else if fileutil.Exist(walDir) {
  244. return fmt.Errorf("wal-dir %q exists", walDir)
  245. }
  246. s.name = cfg.Name
  247. s.dbPath = cfg.SnapshotPath
  248. s.walDir = walDir
  249. s.snapDir = filepath.Join(dataDir, "member", "snap")
  250. s.skipHashCheck = cfg.SkipHashCheck
  251. s.lg.Info(
  252. "restoring snapshot",
  253. zap.String("path", s.dbPath),
  254. zap.String("wal-dir", s.walDir),
  255. zap.String("data-dir", dataDir),
  256. zap.String("snap-dir", s.snapDir),
  257. )
  258. if err = s.saveDB(); err != nil {
  259. return err
  260. }
  261. if err = s.saveWALAndSnap(); err != nil {
  262. return err
  263. }
  264. s.lg.Info(
  265. "restored snapshot",
  266. zap.String("path", s.dbPath),
  267. zap.String("wal-dir", s.walDir),
  268. zap.String("data-dir", dataDir),
  269. zap.String("snap-dir", s.snapDir),
  270. )
  271. return nil
  272. }
  273. // saveDB copies the database snapshot to the snapshot directory
  274. func (s *v3Manager) saveDB() error {
  275. f, ferr := os.OpenFile(s.dbPath, os.O_RDONLY, 0600)
  276. if ferr != nil {
  277. return ferr
  278. }
  279. defer f.Close()
  280. // get snapshot integrity hash
  281. if _, err := f.Seek(-sha256.Size, io.SeekEnd); err != nil {
  282. return err
  283. }
  284. sha := make([]byte, sha256.Size)
  285. if _, err := f.Read(sha); err != nil {
  286. return err
  287. }
  288. if _, err := f.Seek(0, io.SeekStart); err != nil {
  289. return err
  290. }
  291. if err := fileutil.CreateDirAll(s.snapDir); err != nil {
  292. return err
  293. }
  294. dbpath := filepath.Join(s.snapDir, "db")
  295. db, dberr := os.OpenFile(dbpath, os.O_RDWR|os.O_CREATE, 0600)
  296. if dberr != nil {
  297. return dberr
  298. }
  299. if _, err := io.Copy(db, f); err != nil {
  300. return err
  301. }
  302. // truncate away integrity hash, if any.
  303. off, serr := db.Seek(0, io.SeekEnd)
  304. if serr != nil {
  305. return serr
  306. }
  307. hasHash := (off % 512) == sha256.Size
  308. if hasHash {
  309. if err := db.Truncate(off - sha256.Size); err != nil {
  310. return err
  311. }
  312. }
  313. if !hasHash && !s.skipHashCheck {
  314. return fmt.Errorf("snapshot missing hash but --skip-hash-check=false")
  315. }
  316. if hasHash && !s.skipHashCheck {
  317. // check for match
  318. if _, err := db.Seek(0, io.SeekStart); err != nil {
  319. return err
  320. }
  321. h := sha256.New()
  322. if _, err := io.Copy(h, db); err != nil {
  323. return err
  324. }
  325. dbsha := h.Sum(nil)
  326. if !reflect.DeepEqual(sha, dbsha) {
  327. return fmt.Errorf("expected sha256 %v, got %v", sha, dbsha)
  328. }
  329. }
  330. // db hash is OK, can now modify DB so it can be part of a new cluster
  331. db.Close()
  332. commit := len(s.cl.Members())
  333. // update consistentIndex so applies go through on etcdserver despite
  334. // having a new raft instance
  335. be := backend.NewDefaultBackend(dbpath)
  336. // a lessor never timeouts leases
  337. lessor := lease.NewLessor(be, math.MaxInt64)
  338. mvs := mvcc.NewStore(be, lessor, (*initIndex)(&commit))
  339. txn := mvs.Write()
  340. btx := be.BatchTx()
  341. del := func(k, v []byte) error {
  342. txn.DeleteRange(k, nil)
  343. return nil
  344. }
  345. // delete stored members from old cluster since using new members
  346. btx.UnsafeForEach([]byte("members"), del)
  347. // todo: add back new members when we start to deprecate old snap file.
  348. btx.UnsafeForEach([]byte("members_removed"), del)
  349. // trigger write-out of new consistent index
  350. txn.End()
  351. mvs.Commit()
  352. mvs.Close()
  353. be.Close()
  354. return nil
  355. }
  356. // saveWALAndSnap creates a WAL for the initial cluster
  357. func (s *v3Manager) saveWALAndSnap() error {
  358. if err := fileutil.CreateDirAll(s.walDir); err != nil {
  359. return err
  360. }
  361. // add members again to persist them to the store we create.
  362. st := store.New(etcdserver.StoreClusterPrefix, etcdserver.StoreKeysPrefix)
  363. s.cl.SetStore(st)
  364. for _, m := range s.cl.Members() {
  365. s.cl.AddMember(m)
  366. }
  367. m := s.cl.MemberByName(s.name)
  368. md := &etcdserverpb.Metadata{NodeID: uint64(m.ID), ClusterID: uint64(s.cl.ID())}
  369. metadata, merr := md.Marshal()
  370. if merr != nil {
  371. return merr
  372. }
  373. w, walerr := wal.Create(s.walDir, metadata)
  374. if walerr != nil {
  375. return walerr
  376. }
  377. defer w.Close()
  378. peers := make([]raft.Peer, len(s.cl.MemberIDs()))
  379. for i, id := range s.cl.MemberIDs() {
  380. ctx, err := json.Marshal((*s.cl).Member(id))
  381. if err != nil {
  382. return err
  383. }
  384. peers[i] = raft.Peer{ID: uint64(id), Context: ctx}
  385. }
  386. ents := make([]raftpb.Entry, len(peers))
  387. nodeIDs := make([]uint64, len(peers))
  388. for i, p := range peers {
  389. nodeIDs[i] = p.ID
  390. cc := raftpb.ConfChange{
  391. Type: raftpb.ConfChangeAddNode,
  392. NodeID: p.ID,
  393. Context: p.Context,
  394. }
  395. d, err := cc.Marshal()
  396. if err != nil {
  397. return err
  398. }
  399. ents[i] = raftpb.Entry{
  400. Type: raftpb.EntryConfChange,
  401. Term: 1,
  402. Index: uint64(i + 1),
  403. Data: d,
  404. }
  405. }
  406. commit, term := uint64(len(ents)), uint64(1)
  407. if err := w.Save(raftpb.HardState{
  408. Term: term,
  409. Vote: peers[0].ID,
  410. Commit: commit,
  411. }, ents); err != nil {
  412. return err
  413. }
  414. b, berr := st.Save()
  415. if berr != nil {
  416. return berr
  417. }
  418. raftSnap := raftpb.Snapshot{
  419. Data: b,
  420. Metadata: raftpb.SnapshotMetadata{
  421. Index: commit,
  422. Term: term,
  423. ConfState: raftpb.ConfState{
  424. Nodes: nodeIDs,
  425. },
  426. },
  427. }
  428. sn := snap.New(s.snapDir)
  429. if err := sn.SaveSnap(raftSnap); err != nil {
  430. return err
  431. }
  432. return w.SaveSnapshot(walpb.Snapshot{Index: commit, Term: term})
  433. }