
etcdctl: verify snapshot hash on restore

Fixes #4097
Anthony Romano, 9 years ago · commit 798718c49b

+ 2 - 0
Documentation/op-guide/recovery.md

@@ -18,6 +18,8 @@ $ etcdctl --endpoints $ENDPOINT snapshot save snapshot.db
 
 To restore a cluster, all that is needed is a single snapshot "db" file. A cluster restore with `etcdctl snapshot restore` creates new etcd data directories; all members should restore using the same snapshot. Restoring overwrites some snapshot metadata (specifically, the member ID and cluster ID); the member loses its former identity. This metadata overwrite prevents the new member from inadvertently joining an existing cluster. Therefore in order to start a cluster from a snapshot, the restore must start a new logical cluster.
 
+Snapshot integrity may optionally be verified at restore time. If the snapshot is taken with `etcdctl snapshot save`, it will have an integrity hash that is checked by `etcdctl snapshot restore`. If the snapshot is copied from the data directory, there is no integrity hash and it will only restore with `--skip-hash-check`.
+
 A restore initializes a new member of a new cluster, with a fresh cluster configuration using `etcd`'s cluster configuration flags, but preserves the contents of the etcd keyspace. Continuing from the previous example, the following creates new etcd data directories (`m1.etcd`, `m2.etcd`, `m3.etcd`) for a three member cluster:
 
 ```sh

+ 32 - 0
e2e/ctl_v3_snapshot_test.go

@@ -52,6 +52,38 @@ func snapshotTest(cx ctlCtx) {
 	}
 }
 
+func TestCtlV3SnapshotCorrupt(t *testing.T) { testCtl(t, snapshotCorruptTest) }
+
+func snapshotCorruptTest(cx ctlCtx) {
+	fpath := "test.snapshot"
+	defer os.RemoveAll(fpath)
+
+	if err := ctlV3SnapshotSave(cx, fpath); err != nil {
+		cx.t.Fatalf("snapshotCorruptTest ctlV3SnapshotSave error (%v)", err)
+	}
+
+	// corrupt file
+	f, oerr := os.OpenFile(fpath, os.O_WRONLY, 0)
+	if oerr != nil {
+		cx.t.Fatal(oerr)
+	}
+	if _, err := f.Write(make([]byte, 512)); err != nil {
+		cx.t.Fatal(err)
+	}
+	f.Close()
+
+	defer os.RemoveAll("snap.etcd")
+	serr := spawnWithExpect(
+		append(cx.PrefixArgs(), "snapshot", "restore",
+			"--data-dir", "snap.etcd",
+			fpath),
+		"expected sha256")
+
+	if serr != nil {
+		cx.t.Fatal(serr)
+	}
+}
+
 func ctlV3SnapshotSave(cx ctlCtx, fpath string) error {
 	cmdArgs := append(cx.PrefixArgs(), "snapshot", "save", fpath)
 	return spawnWithExpect(cmdArgs, fmt.Sprintf("Snapshot saved at %s", fpath))

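As the documentation change above notes, a snapshot written by `etcdctl snapshot save` carries a trailing integrity hash: the last `sha256.Size` (32) bytes of the file are a SHA-256 digest of everything that precedes them, which is exactly what the restore path in `makeDB` below recomputes. A minimal offline sketch of that check (the `verifySnapshot` helper and the `test.snapshot` path are illustrative, not part of the commit) could look like this:

```go
package main

import (
	"bytes"
	"crypto/sha256"
	"fmt"
	"os"
)

// verifySnapshot mirrors what `etcdctl snapshot restore` does: split off the
// trailing 32-byte SHA-256 and compare it against a digest of the remaining bytes.
func verifySnapshot(path string) error {
	data, err := os.ReadFile(path)
	if err != nil {
		return err
	}
	if len(data) < sha256.Size {
		return fmt.Errorf("file too small to carry an integrity hash")
	}
	body, stored := data[:len(data)-sha256.Size], data[len(data)-sha256.Size:]
	computed := sha256.Sum256(body)
	if !bytes.Equal(stored, computed[:]) {
		return fmt.Errorf("expected sha256 %x, got %x", stored, computed)
	}
	return nil
}

func main() {
	// "test.snapshot" matches the path used by the e2e test above.
	if err := verifySnapshot("test.snapshot"); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println("snapshot integrity hash OK")
}
```

Overwriting the first 512 bytes with zeros, as the e2e test does, changes the db contents without touching the stored trailer, so this comparison (and the restore command) is expected to fail with an "expected sha256" error.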
+ 54 - 2
etcdctl/ctlv3/command/snapshot_command.go

@@ -15,6 +15,7 @@
 package command
 
 import (
+	"crypto/sha256"
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
@@ -22,6 +23,7 @@ import (
 	"io"
 	"os"
 	"path"
+	"reflect"
 	"strings"
 
 	"github.com/boltdb/bolt"
@@ -50,6 +52,7 @@ var (
 	restoreDataDir      string
 	restorePeerURLs     string
 	restoreName         string
+	skipHashCheck       bool
 )
 
 // NewSnapshotCommand returns the cobra command for "snapshot".
@@ -94,6 +97,7 @@ func NewSnapshotRestoreCommand() *cobra.Command {
 	cmd.Flags().StringVar(&restoreClusterToken, "initial-cluster-token", "etcd-cluster", "Initial cluster token for the etcd cluster during restore bootstrap.")
 	cmd.Flags().StringVar(&restorePeerURLs, "initial-advertise-peer-urls", defaultInitialAdvertisePeerURLs, "List of this member's peer URLs to advertise to the rest of the cluster.")
 	cmd.Flags().StringVar(&restoreName, "name", defaultName, "Human-readable name for this member.")
+	cmd.Flags().BoolVar(&skipHashCheck, "skip-hash-check", false, "Ignore snapshot integrity hash value (required if copied from data directory).")
 
 	return cmd
 }
@@ -191,7 +195,7 @@ func initialClusterFromName(name string) string {
 	if name == "" {
 		n = defaultName
 	}
-	return fmt.Sprintf("%s=http://localhost:2380", n, n)
+	return fmt.Sprintf("%s=http://localhost:2380", n)
 }
 
 // makeWAL creates a WAL for the initial cluster
@@ -261,18 +265,65 @@ func makeDB(snapdir, dbfile string) {
 	}
 	defer f.Close()
 
+	// get snapshot integrity hash
+	if _, err := f.Seek(-sha256.Size, os.SEEK_END); err != nil {
+		ExitWithError(ExitIO, err)
+	}
+	sha := make([]byte, sha256.Size)
+	if _, err := f.Read(sha); err != nil {
+		ExitWithError(ExitIO, err)
+	}
+	if _, err := f.Seek(0, os.SEEK_SET); err != nil {
+		ExitWithError(ExitIO, err)
+	}
+
 	if err := os.MkdirAll(snapdir, 0755); err != nil {
 		ExitWithError(ExitIO, err)
 	}
 
 	dbpath := path.Join(snapdir, "db")
-	db, dberr := os.OpenFile(dbpath, os.O_WRONLY|os.O_CREATE, 0600)
+	db, dberr := os.OpenFile(dbpath, os.O_RDWR|os.O_CREATE, 0600)
 	if dberr != nil {
 		ExitWithError(ExitIO, dberr)
 	}
 	if _, err := io.Copy(db, f); err != nil {
 		ExitWithError(ExitIO, err)
 	}
+
+	// truncate away integrity hash, if any.
+	off, serr := db.Seek(0, os.SEEK_END)
+	if serr != nil {
+		ExitWithError(ExitIO, serr)
+	}
+	hasHash := (off % 512) == sha256.Size
+	if hasHash {
+		if err := db.Truncate(off - sha256.Size); err != nil {
+			ExitWithError(ExitIO, err)
+		}
+	}
+
+	if !hasHash && !skipHashCheck {
+		err := fmt.Errorf("snapshot missing hash but --skip-hash-check=false")
+		ExitWithError(ExitBadArgs, err)
+	}
+
+	if hasHash && !skipHashCheck {
+		// check for match
+		if _, err := db.Seek(0, os.SEEK_SET); err != nil {
+			ExitWithError(ExitIO, err)
+		}
+		h := sha256.New()
+		if _, err := io.Copy(h, db); err != nil {
+			ExitWithError(ExitIO, err)
+		}
+		dbsha := h.Sum(nil)
+		if !reflect.DeepEqual(sha, dbsha) {
+			err := fmt.Errorf("expected sha256 %v, got %v", sha, dbsha)
+			ExitWithError(ExitInvalidInput, err)
+		}
+	}
+
+	// db hash is OK, can now modify DB so it can be part of a new cluster
 	db.Close()
 
 	// update consistentIndex so applies go through on etcdserver despite
@@ -285,6 +336,7 @@ func makeDB(snapdir, dbfile string) {
 		_, _, err := s.TxnDeleteRange(id, k, nil)
 		return err
 	}
+
 	// delete stored members from old cluster since using new members
 	btx.UnsafeForEach([]byte("members"), del)
 	btx.UnsafeForEach([]byte("members_removed"), del)
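The `hasHash := (off % 512) == sha256.Size` test works because a bolt database file grows in whole pages, and the page size (taken from `os.Getpagesize()`, typically 4096) is a multiple of 512; only a file with a 32-byte SHA-256 appended ends up 32 bytes past a 512-byte boundary. A db copied straight out of a member's data directory has no such trailer, which is why it can only be restored with `--skip-hash-check`. The following sketch illustrates the trailer format the restore code expects; it is an assumption-laden illustration (the `appendIntegrityHash` helper and `snapshot.db` path are made up here), not the actual `etcdctl snapshot save` code:

```go
package main

import (
	"crypto/sha256"
	"fmt"
	"io"
	"os"
)

// appendIntegrityHash writes the SHA-256 of the current file contents after the
// final page, leaving the file size at (N*pageSize + 32) bytes, which is what
// the (off % 512) == sha256.Size check in makeDB detects.
func appendIntegrityHash(path string) error {
	f, err := os.OpenFile(path, os.O_RDWR, 0600)
	if err != nil {
		return err
	}
	defer f.Close()

	h := sha256.New()
	if _, err := io.Copy(h, f); err != nil { // hash everything currently in the file
		return err
	}
	if _, err := f.Seek(0, io.SeekEnd); err != nil {
		return err
	}
	_, err = f.Write(h.Sum(nil)) // append the 32-byte trailer
	return err
}

func main() {
	if err := appendIntegrityHash("snapshot.db"); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```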