|
|
@@ -17,47 +17,49 @@ limitations under the License.
|
|
|
package wal
|
|
|
|
|
|
import (
|
|
|
- "bufio"
|
|
|
- "bytes"
|
|
|
+ "errors"
|
|
|
"fmt"
|
|
|
+ "hash/crc32"
|
|
|
"io"
|
|
|
"log"
|
|
|
"os"
|
|
|
"path"
|
|
|
"sort"
|
|
|
|
|
|
- "github.com/coreos/etcd/raft"
|
|
|
+ "github.com/coreos/etcd/raft/raftpb"
|
|
|
)
|
|
|
|
|
|
const (
|
|
|
infoType int64 = iota + 1
|
|
|
entryType
|
|
|
stateType
|
|
|
+ crcType
|
|
|
)
|
|
|
|
|
|
var (
|
|
|
- ErrIdMismatch = fmt.Errorf("unmatch id")
|
|
|
- ErrNotFound = fmt.Errorf("wal file is not found")
|
|
|
+ ErrIdMismatch = fmt.Errorf("wal: unmatch id")
|
|
|
+ ErrNotFound = fmt.Errorf("wal: file is not found")
|
|
|
+ ErrCRCMismatch = errors.New("wal: crc mismatch")
|
|
|
+ crcTable = crc32.MakeTable(crc32.Castagnoli)
|
|
|
)
|
|
|
|
|
|
+// WAL is a logical repersentation of the stable storage.
|
|
|
+// WAL is either in read mode or append mode but not both.
|
|
|
+// A newly created WAL is in append mode, and ready for appending records.
|
|
|
+// A just opened WAL is in read mode, and ready for reading records.
|
|
|
+// The WAL will be ready for appending after reading out all the previous records.
|
|
|
type WAL struct {
|
|
|
- f *os.File
|
|
|
- bw *bufio.Writer
|
|
|
- buf *bytes.Buffer
|
|
|
-}
|
|
|
+ dir string // the living directory of the underlay files
|
|
|
|
|
|
-func newWAL(f *os.File) *WAL {
|
|
|
- return &WAL{f, bufio.NewWriter(f), new(bytes.Buffer)}
|
|
|
-}
|
|
|
+ ri int64 // index of entry to start reading
|
|
|
+ decoder *decoder // decoder to decode records
|
|
|
|
|
|
-func Exist(dirpath string) bool {
|
|
|
- names, err := readDir(dirpath)
|
|
|
- if err != nil {
|
|
|
- return false
|
|
|
- }
|
|
|
- return len(names) != 0
|
|
|
+ f *os.File // underlay file opened for appending, sync
|
|
|
+ seq int64 // current sequence of the wal file
|
|
|
+ encoder *encoder // encoder to encode records
|
|
|
}
|
|
|
|
|
|
+// Create creates a WAL ready for appending records.
|
|
|
func Create(dirpath string) (*WAL, error) {
|
|
|
log.Printf("path=%s wal.create", dirpath)
|
|
|
if Exist(dirpath) {
|
|
|
@@ -68,11 +70,24 @@ func Create(dirpath string) (*WAL, error) {
|
|
|
if err != nil {
|
|
|
return nil, err
|
|
|
}
|
|
|
- return newWAL(f), nil
|
|
|
+ w := &WAL{
|
|
|
+ dir: dirpath,
|
|
|
+ seq: 0,
|
|
|
+ f: f,
|
|
|
+ encoder: newEncoder(f, 0),
|
|
|
+ }
|
|
|
+ if err := w.saveCrc(0); err != nil {
|
|
|
+ return nil, err
|
|
|
+ }
|
|
|
+ return w, nil
|
|
|
}
|
|
|
|
|
|
-func Open(dirpath string) (*WAL, error) {
|
|
|
- log.Printf("path=%s wal.append", dirpath)
|
|
|
+// OpenFromIndex opens the WAL files containing all the entries after
|
|
|
+// the given index.
|
|
|
+// The returned WAL is ready to read. The WAL cannot be appended to before
|
|
|
+// reading out all of its previous records.
|
|
|
+func OpenFromIndex(dirpath string, index int64) (*WAL, error) {
|
|
|
+ log.Printf("path=%s wal.load index=%d", dirpath, index)
|
|
|
names, err := readDir(dirpath)
|
|
|
if err != nil {
|
|
|
return nil, err
|
|
|
@@ -82,298 +97,166 @@ func Open(dirpath string) (*WAL, error) {
|
|
|
return nil, ErrNotFound
|
|
|
}
|
|
|
|
|
|
- name := names[len(names)-1]
|
|
|
- p := path.Join(dirpath, name)
|
|
|
- f, err := os.OpenFile(p, os.O_WRONLY|os.O_APPEND, 0)
|
|
|
- if err != nil {
|
|
|
- return nil, err
|
|
|
- }
|
|
|
- return newWAL(f), nil
|
|
|
-}
|
|
|
-
|
|
|
-// index should be the index of last log entry currently.
|
|
|
-// Cut closes current file written and creates a new one to append.
|
|
|
-func (w *WAL) Cut(index int64) error {
|
|
|
- log.Printf("path=%s wal.cut index=%d", w.f.Name(), index)
|
|
|
- fpath := w.f.Name()
|
|
|
- seq, _, err := parseWalName(path.Base(fpath))
|
|
|
- if err != nil {
|
|
|
- panic("parse correct name error")
|
|
|
- }
|
|
|
- fpath = path.Join(path.Dir(fpath), fmt.Sprintf("%016x-%016x.wal", seq+1, index))
|
|
|
- f, err := os.OpenFile(fpath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0600)
|
|
|
- if err != nil {
|
|
|
- return err
|
|
|
- }
|
|
|
-
|
|
|
- w.Sync()
|
|
|
- w.f.Close()
|
|
|
- w.f = f
|
|
|
- w.bw = bufio.NewWriter(f)
|
|
|
- return nil
|
|
|
-}
|
|
|
-
|
|
|
-func (w *WAL) Sync() error {
|
|
|
- if err := w.bw.Flush(); err != nil {
|
|
|
- return err
|
|
|
- }
|
|
|
- return w.f.Sync()
|
|
|
-}
|
|
|
+ sort.Sort(sort.StringSlice(names))
|
|
|
|
|
|
-func (w *WAL) Close() {
|
|
|
- log.Printf("path=%s wal.close", w.f.Name())
|
|
|
- if w.f != nil {
|
|
|
- w.Sync()
|
|
|
- w.f.Close()
|
|
|
+ nameIndex, ok := searchIndex(names, index)
|
|
|
+ if !ok || !isValidSeq(names[nameIndex:]) {
|
|
|
+ return nil, ErrNotFound
|
|
|
}
|
|
|
-}
|
|
|
|
|
|
-func (w *WAL) SaveInfo(i *raft.Info) error {
|
|
|
- log.Printf("path=%s wal.saveInfo id=%d", w.f.Name(), i.Id)
|
|
|
- if err := w.checkAtHead(); err != nil {
|
|
|
- return err
|
|
|
- }
|
|
|
- b, err := i.Marshal()
|
|
|
- if err != nil {
|
|
|
- panic(err)
|
|
|
+ // open the wal files for reading
|
|
|
+ rcs := make([]io.ReadCloser, 0)
|
|
|
+ for _, name := range names[nameIndex:] {
|
|
|
+ f, err := os.Open(path.Join(dirpath, name))
|
|
|
+ if err != nil {
|
|
|
+ return nil, err
|
|
|
+ }
|
|
|
+ rcs = append(rcs, f)
|
|
|
}
|
|
|
- rec := &Record{Type: infoType, Data: b}
|
|
|
- return writeRecord(w.bw, rec)
|
|
|
-}
|
|
|
+ rc := MultiReadCloser(rcs...)
|
|
|
|
|
|
-func (w *WAL) SaveEntry(e *raft.Entry) error {
|
|
|
- b, err := e.Marshal()
|
|
|
+ // open the lastest wal file for appending
|
|
|
+ last := path.Join(dirpath, names[len(names)-1])
|
|
|
+ f, err := os.OpenFile(last, os.O_WRONLY|os.O_APPEND, 0)
|
|
|
if err != nil {
|
|
|
- panic(err)
|
|
|
+ rc.Close()
|
|
|
+ return nil, err
|
|
|
}
|
|
|
- rec := &Record{Type: entryType, Data: b}
|
|
|
- return writeRecord(w.bw, rec)
|
|
|
-}
|
|
|
|
|
|
-func (w *WAL) SaveState(s *raft.State) error {
|
|
|
- log.Printf("path=%s wal.saveState state=\"%+v\"", w.f.Name(), s)
|
|
|
- b, err := s.Marshal()
|
|
|
- if err != nil {
|
|
|
- panic(err)
|
|
|
- }
|
|
|
- rec := &Record{Type: stateType, Data: b}
|
|
|
- return writeRecord(w.bw, rec)
|
|
|
-}
|
|
|
+ // create a WAL ready for reading
|
|
|
+ w := &WAL{
|
|
|
+ ri: index,
|
|
|
+ decoder: newDecoder(rc),
|
|
|
|
|
|
-func (w *WAL) checkAtHead() error {
|
|
|
- o, err := w.f.Seek(0, os.SEEK_CUR)
|
|
|
- if err != nil {
|
|
|
- return err
|
|
|
+ f: f,
|
|
|
}
|
|
|
- if o != 0 || w.bw.Buffered() != 0 {
|
|
|
- return fmt.Errorf("cannot write info at %d, expect 0", max(o, int64(w.bw.Buffered())))
|
|
|
- }
|
|
|
- return nil
|
|
|
+ return w, nil
|
|
|
}
|
|
|
|
|
|
-type Node struct {
|
|
|
- Id int64
|
|
|
- Ents []raft.Entry
|
|
|
- State raft.State
|
|
|
-
|
|
|
- // index of the first entry
|
|
|
- index int64
|
|
|
-}
|
|
|
-
|
|
|
-func newNode(index int64) *Node {
|
|
|
- return &Node{Ents: make([]raft.Entry, 0), index: index + 1}
|
|
|
-}
|
|
|
+// ReadAll reads out all records of the current WAL.
|
|
|
+// After ReadAll, the WAL will be ready for appending new records.
|
|
|
+func (w *WAL) ReadAll() (int64, raftpb.State, []raftpb.Entry, error) {
|
|
|
+ var id int64
|
|
|
+ var state raftpb.State
|
|
|
+ var entries []raftpb.Entry
|
|
|
|
|
|
-func (n *Node) load(path string) error {
|
|
|
- f, err := os.Open(path)
|
|
|
- if err != nil {
|
|
|
- return err
|
|
|
- }
|
|
|
- defer f.Close()
|
|
|
- br := bufio.NewReader(f)
|
|
|
rec := &Record{}
|
|
|
-
|
|
|
- err = readRecord(br, rec)
|
|
|
- if err != nil {
|
|
|
- return err
|
|
|
- }
|
|
|
- if rec.Type != infoType {
|
|
|
- return fmt.Errorf("the first block of wal is not infoType but %d", rec.Type)
|
|
|
- }
|
|
|
- i, err := loadInfo(rec.Data)
|
|
|
- if err != nil {
|
|
|
- return err
|
|
|
- }
|
|
|
- if n.Id != 0 && n.Id != i.Id {
|
|
|
- return ErrIdMismatch
|
|
|
- }
|
|
|
- n.Id = i.Id
|
|
|
-
|
|
|
- for err = readRecord(br, rec); err == nil; err = readRecord(br, rec) {
|
|
|
+ decoder := w.decoder
|
|
|
+ var err error
|
|
|
+ for err = decoder.decode(rec); err == nil; err = decoder.decode(rec) {
|
|
|
switch rec.Type {
|
|
|
case entryType:
|
|
|
- e, err := loadEntry(rec.Data)
|
|
|
- if err != nil {
|
|
|
- return err
|
|
|
- }
|
|
|
- if e.Index >= n.index {
|
|
|
- n.Ents = append(n.Ents[:e.Index-n.index], e)
|
|
|
+ e := mustUnmarshalEntry(rec.Data)
|
|
|
+ if e.Index > w.ri {
|
|
|
+ entries = append(entries[:e.Index-w.ri-1], e)
|
|
|
}
|
|
|
case stateType:
|
|
|
- s, err := loadState(rec.Data)
|
|
|
- if err != nil {
|
|
|
- return err
|
|
|
+ state = mustUnmarshalState(rec.Data)
|
|
|
+ case infoType:
|
|
|
+ i := mustUnmarshalInfo(rec.Data)
|
|
|
+ if id != 0 && id != i.Id {
|
|
|
+ state.Reset()
|
|
|
+ return 0, state, nil, ErrIdMismatch
|
|
|
}
|
|
|
- n.State = s
|
|
|
+ id = i.Id
|
|
|
+ case crcType:
|
|
|
+ crc := decoder.crc.Sum32()
|
|
|
+ // current crc of decoder must match the crc of the record.
|
|
|
+ // do no need to match 0 crc, since the decoder is a new one at this case.
|
|
|
+ if crc != 0 && rec.validate(crc) != nil {
|
|
|
+ state.Reset()
|
|
|
+ return 0, state, nil, ErrCRCMismatch
|
|
|
+ }
|
|
|
+ decoder.updateCRC(rec.Crc)
|
|
|
default:
|
|
|
- return fmt.Errorf("unexpected block type %d", rec.Type)
|
|
|
+ state.Reset()
|
|
|
+ return 0, state, nil, fmt.Errorf("unexpected block type %d", rec.Type)
|
|
|
}
|
|
|
}
|
|
|
if err != io.EOF {
|
|
|
- return err
|
|
|
+ state.Reset()
|
|
|
+ return 0, state, nil, err
|
|
|
}
|
|
|
- return nil
|
|
|
-}
|
|
|
|
|
|
-func (n *Node) startFrom(index int64) error {
|
|
|
- diff := int(index - n.index)
|
|
|
- if diff > len(n.Ents) {
|
|
|
- return ErrNotFound
|
|
|
- }
|
|
|
- n.Ents = n.Ents[diff:]
|
|
|
- return nil
|
|
|
+ // close decoder, disable reading
|
|
|
+ w.decoder.close()
|
|
|
+ w.ri = 0
|
|
|
+
|
|
|
+ // create encoder (chain crc with the decoder), enable appending
|
|
|
+ w.encoder = newEncoder(w.f, w.decoder.lastCRC())
|
|
|
+ w.decoder = nil
|
|
|
+ return id, state, entries, nil
|
|
|
}
|
|
|
|
|
|
-// Read loads all entries after index (index is not included).
|
|
|
-func Read(dirpath string, index int64) (*Node, error) {
|
|
|
- log.Printf("path=%s wal.load index=%d", dirpath, index)
|
|
|
- names, err := readDir(dirpath)
|
|
|
+// index should be the index of last log entry.
|
|
|
+// Cut closes current file written and creates a new one ready to append.
|
|
|
+func (w *WAL) Cut(index int64) error {
|
|
|
+ log.Printf("wal.cut index=%d", index)
|
|
|
+
|
|
|
+ // create a new wal file with name sequence + 1
|
|
|
+ fpath := path.Join(w.dir, fmt.Sprintf("%016x-%016x.wal", w.seq+1, index))
|
|
|
+ f, err := os.OpenFile(fpath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0600)
|
|
|
if err != nil {
|
|
|
- return nil, err
|
|
|
- }
|
|
|
- names = checkWalNames(names)
|
|
|
- if len(names) == 0 {
|
|
|
- return nil, ErrNotFound
|
|
|
+ return err
|
|
|
}
|
|
|
|
|
|
- sort.Sort(sort.StringSlice(names))
|
|
|
- nameIndex, ok := searchIndex(names, index)
|
|
|
- if !ok || !isValidSeq(names[nameIndex:]) {
|
|
|
- return nil, ErrNotFound
|
|
|
- }
|
|
|
+ w.Sync()
|
|
|
+ w.f.Close()
|
|
|
|
|
|
- _, initIndex, err := parseWalName(names[nameIndex])
|
|
|
- if err != nil {
|
|
|
- panic("parse correct name error")
|
|
|
- }
|
|
|
- n := newNode(initIndex)
|
|
|
- for _, name := range names[nameIndex:] {
|
|
|
- if err := n.load(path.Join(dirpath, name)); err != nil {
|
|
|
- return nil, err
|
|
|
- }
|
|
|
- }
|
|
|
- if err := n.startFrom(index + 1); err != nil {
|
|
|
- return nil, ErrNotFound
|
|
|
- }
|
|
|
- return n, nil
|
|
|
+ // update writer and save the previous crc
|
|
|
+ w.f = f
|
|
|
+ w.seq++
|
|
|
+ prevCrc := w.encoder.crc.Sum32()
|
|
|
+ w.encoder = newEncoder(w.f, prevCrc)
|
|
|
+ return w.saveCrc(prevCrc)
|
|
|
}
|
|
|
|
|
|
-// The input names should be sorted.
|
|
|
-// serachIndex returns the array index of the last name that has
|
|
|
-// a smaller raft index section than the given raft index.
|
|
|
-func searchIndex(names []string, index int64) (int, bool) {
|
|
|
- for i := len(names) - 1; i >= 0; i-- {
|
|
|
- name := names[i]
|
|
|
- _, curIndex, err := parseWalName(name)
|
|
|
- if err != nil {
|
|
|
- panic("parse correct name error")
|
|
|
- }
|
|
|
- if index >= curIndex {
|
|
|
- return i, true
|
|
|
+func (w *WAL) Sync() error {
|
|
|
+ if w.encoder != nil {
|
|
|
+ if err := w.encoder.flush(); err != nil {
|
|
|
+ return err
|
|
|
}
|
|
|
}
|
|
|
- return -1, false
|
|
|
+ return w.f.Sync()
|
|
|
}
|
|
|
|
|
|
-// names should have been sorted based on sequence number.
|
|
|
-// isValidSeq checks whether seq increases continuously.
|
|
|
-func isValidSeq(names []string) bool {
|
|
|
- var lastSeq int64
|
|
|
- for _, name := range names {
|
|
|
- curSeq, _, err := parseWalName(name)
|
|
|
- if err != nil {
|
|
|
- panic("parse correct name error")
|
|
|
- }
|
|
|
- if lastSeq != 0 && lastSeq != curSeq-1 {
|
|
|
- return false
|
|
|
- }
|
|
|
- lastSeq = curSeq
|
|
|
+func (w *WAL) Close() {
|
|
|
+ log.Printf("path=%s wal.close", w.f.Name())
|
|
|
+ if w.f != nil {
|
|
|
+ w.Sync()
|
|
|
+ w.f.Close()
|
|
|
}
|
|
|
- return true
|
|
|
}
|
|
|
|
|
|
-func loadInfo(d []byte) (raft.Info, error) {
|
|
|
- var i raft.Info
|
|
|
- err := i.Unmarshal(d)
|
|
|
+func (w *WAL) SaveInfo(i *raftpb.Info) error {
|
|
|
+ log.Printf("path=%s wal.saveInfo id=%d", w.f.Name(), i.Id)
|
|
|
+ b, err := i.Marshal()
|
|
|
if err != nil {
|
|
|
panic(err)
|
|
|
}
|
|
|
- return i, err
|
|
|
+ rec := &Record{Type: infoType, Data: b}
|
|
|
+ return w.encoder.encode(rec)
|
|
|
}
|
|
|
|
|
|
-func loadEntry(d []byte) (raft.Entry, error) {
|
|
|
- var e raft.Entry
|
|
|
- err := e.Unmarshal(d)
|
|
|
+func (w *WAL) SaveEntry(e *raftpb.Entry) error {
|
|
|
+ b, err := e.Marshal()
|
|
|
if err != nil {
|
|
|
panic(err)
|
|
|
}
|
|
|
- return e, err
|
|
|
-}
|
|
|
-
|
|
|
-func loadState(d []byte) (raft.State, error) {
|
|
|
- var s raft.State
|
|
|
- err := s.Unmarshal(d)
|
|
|
- return s, err
|
|
|
+ rec := &Record{Type: entryType, Data: b}
|
|
|
+ return w.encoder.encode(rec)
|
|
|
}
|
|
|
|
|
|
-// readDir returns the filenames in wal directory.
|
|
|
-func readDir(dirpath string) ([]string, error) {
|
|
|
- dir, err := os.Open(dirpath)
|
|
|
- if err != nil {
|
|
|
- return nil, err
|
|
|
- }
|
|
|
- defer dir.Close()
|
|
|
- names, err := dir.Readdirnames(-1)
|
|
|
+func (w *WAL) SaveState(s *raftpb.State) error {
|
|
|
+ log.Printf("path=%s wal.saveState state=\"%+v\"", w.f.Name(), s)
|
|
|
+ b, err := s.Marshal()
|
|
|
if err != nil {
|
|
|
- return nil, err
|
|
|
- }
|
|
|
- return names, nil
|
|
|
-}
|
|
|
-
|
|
|
-func checkWalNames(names []string) []string {
|
|
|
- wnames := make([]string, 0)
|
|
|
- for _, name := range names {
|
|
|
- if _, _, err := parseWalName(name); err != nil {
|
|
|
- log.Printf("parse %s: %v", name, err)
|
|
|
- continue
|
|
|
- }
|
|
|
- wnames = append(wnames, name)
|
|
|
- }
|
|
|
- return wnames
|
|
|
-}
|
|
|
-
|
|
|
-func parseWalName(str string) (seq, index int64, err error) {
|
|
|
- var num int
|
|
|
- num, err = fmt.Sscanf(str, "%016x-%016x.wal", &seq, &index)
|
|
|
- if num != 2 && err == nil {
|
|
|
- err = fmt.Errorf("bad wal name: %s", str)
|
|
|
+ panic(err)
|
|
|
}
|
|
|
- return
|
|
|
+ rec := &Record{Type: stateType, Data: b}
|
|
|
+ return w.encoder.encode(rec)
|
|
|
}
|
|
|
|
|
|
-func max(a, b int64) int64 {
|
|
|
- if a > b {
|
|
|
- return a
|
|
|
- }
|
|
|
- return b
|
|
|
+func (w *WAL) saveCrc(prevCrc uint32) error {
|
|
|
+ return w.encoder.encode(&Record{Type: crcType, Crc: prevCrc})
|
|
|
}
|