wal.go 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
/*
Copyright 2014 CoreOS, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
  13. package wal
  14. import (
  15. "errors"
  16. "fmt"
  17. "hash/crc32"
  18. "io"
  19. "os"
  20. "path"
  21. "reflect"
  22. "sort"
  23. "github.com/coreos/etcd/pkg/pbutil"
  24. "github.com/coreos/etcd/raft"
  25. "github.com/coreos/etcd/raft/raftpb"
  26. "github.com/coreos/etcd/wal/walpb"
  27. )
  28. const (
  29. metadataType int64 = iota + 1
  30. entryType
  31. stateType
  32. crcType
  33. // the owner can make/remove files inside the directory
  34. privateDirMode = 0700
  35. )
  36. var (
  37. ErrMetadataConflict = errors.New("wal: conflicting metadata found")
  38. ErrFileNotFound = errors.New("wal: file not found")
  39. ErrIndexNotFound = errors.New("wal: index not found in file")
  40. ErrCRCMismatch = errors.New("wal: crc mismatch")
  41. crcTable = crc32.MakeTable(crc32.Castagnoli)
  42. )
  43. // WAL is a logical repersentation of the stable storage.
  44. // WAL is either in read mode or append mode but not both.
  45. // A newly created WAL is in append mode, and ready for appending records.
  46. // A just opened WAL is in read mode, and ready for reading records.
  47. // The WAL will be ready for appending after reading out all the previous records.
  48. type WAL struct {
  49. dir string // the living directory of the underlay files
  50. metadata []byte // metadata recorded at the head of each WAL
  51. ri uint64 // index of entry to start reading
  52. decoder *decoder // decoder to decode records
  53. f *os.File // underlay file opened for appending, sync
  54. seq uint64 // sequence of the wal file currently used for writes
  55. enti uint64 // index of the last entry saved to the wal
  56. encoder *encoder // encoder to encode records
  57. }
  58. // Create creates a WAL ready for appending records. The given metadata is
  59. // recorded at the head of each WAL file, and can be retrieved with ReadAll.
  60. func Create(dirpath string, metadata []byte) (*WAL, error) {
  61. if Exist(dirpath) {
  62. return nil, os.ErrExist
  63. }
  64. if err := os.MkdirAll(dirpath, privateDirMode); err != nil {
  65. return nil, err
  66. }
  67. p := path.Join(dirpath, walName(0, 0))
  68. f, err := os.OpenFile(p, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0600)
  69. if err != nil {
  70. return nil, err
  71. }
  72. w := &WAL{
  73. dir: dirpath,
  74. metadata: metadata,
  75. seq: 0,
  76. f: f,
  77. encoder: newEncoder(f, 0),
  78. }
  79. if err := w.saveCrc(0); err != nil {
  80. return nil, err
  81. }
  82. if err := w.encoder.encode(&walpb.Record{Type: metadataType, Data: metadata}); err != nil {
  83. return nil, err
  84. }
  85. if err = w.sync(); err != nil {
  86. return nil, err
  87. }
  88. return w, nil
  89. }
  90. // OpenAtIndex opens the WAL at the given index.
  91. // The index SHOULD have been previously committed to the WAL, or the following
  92. // ReadAll will fail.
  93. // The returned WAL is ready to read and the first record will be the given
  94. // index. The WAL cannot be appended to before reading out all of its
  95. // previous records.
  96. func OpenAtIndex(dirpath string, index uint64) (*WAL, error) {
  97. names, err := readDir(dirpath)
  98. if err != nil {
  99. return nil, err
  100. }
  101. names = checkWalNames(names)
  102. if len(names) == 0 {
  103. return nil, ErrFileNotFound
  104. }
  105. sort.Sort(sort.StringSlice(names))
  106. nameIndex, ok := searchIndex(names, index)
  107. if !ok || !isValidSeq(names[nameIndex:]) {
  108. return nil, ErrFileNotFound
  109. }
  110. // open the wal files for reading
  111. rcs := make([]io.ReadCloser, 0)
  112. for _, name := range names[nameIndex:] {
  113. f, err := os.Open(path.Join(dirpath, name))
  114. if err != nil {
  115. return nil, err
  116. }
  117. rcs = append(rcs, f)
  118. }
  119. rc := MultiReadCloser(rcs...)
  120. // open the lastest wal file for appending
  121. seq, _, err := parseWalName(names[len(names)-1])
  122. if err != nil {
  123. rc.Close()
  124. return nil, err
  125. }
  126. last := path.Join(dirpath, names[len(names)-1])
  127. f, err := os.OpenFile(last, os.O_WRONLY|os.O_APPEND, 0)
  128. if err != nil {
  129. rc.Close()
  130. return nil, err
  131. }
  132. // create a WAL ready for reading
  133. w := &WAL{
  134. dir: dirpath,
  135. ri: index,
  136. decoder: newDecoder(rc),
  137. f: f,
  138. seq: seq,
  139. }
  140. return w, nil
  141. }
  142. // ReadAll reads out all records of the current WAL.
  143. // If it cannot read out the expected entry, it will return ErrIndexNotFound.
  144. // After ReadAll, the WAL will be ready for appending new records.
  145. func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.Entry, err error) {
  146. rec := &walpb.Record{}
  147. decoder := w.decoder
  148. for err = decoder.decode(rec); err == nil; err = decoder.decode(rec) {
  149. switch rec.Type {
  150. case entryType:
  151. e := mustUnmarshalEntry(rec.Data)
  152. if e.Index >= w.ri {
  153. ents = append(ents[:e.Index-w.ri], e)
  154. }
  155. w.enti = e.Index
  156. case stateType:
  157. state = mustUnmarshalState(rec.Data)
  158. case metadataType:
  159. if metadata != nil && !reflect.DeepEqual(metadata, rec.Data) {
  160. state.Reset()
  161. return nil, state, nil, ErrMetadataConflict
  162. }
  163. metadata = rec.Data
  164. case crcType:
  165. crc := decoder.crc.Sum32()
  166. // current crc of decoder must match the crc of the record.
  167. // do no need to match 0 crc, since the decoder is a new one at this case.
  168. if crc != 0 && rec.Validate(crc) != nil {
  169. state.Reset()
  170. return nil, state, nil, ErrCRCMismatch
  171. }
  172. decoder.updateCRC(rec.Crc)
  173. default:
  174. state.Reset()
  175. return nil, state, nil, fmt.Errorf("unexpected block type %d", rec.Type)
  176. }
  177. }
  178. if err != io.EOF {
  179. state.Reset()
  180. return nil, state, nil, err
  181. }
  182. if w.enti < w.ri {
  183. state.Reset()
  184. return nil, state, nil, ErrIndexNotFound
  185. }
  186. // close decoder, disable reading
  187. w.decoder.close()
  188. w.ri = 0
  189. w.metadata = metadata
  190. // create encoder (chain crc with the decoder), enable appending
  191. w.encoder = newEncoder(w.f, w.decoder.lastCRC())
  192. w.decoder = nil
  193. return metadata, state, ents, nil
  194. }
  195. // Cut closes current file written and creates a new one ready to append.
  196. func (w *WAL) Cut() error {
  197. // create a new wal file with name sequence + 1
  198. fpath := path.Join(w.dir, walName(w.seq+1, w.enti+1))
  199. f, err := os.OpenFile(fpath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0600)
  200. if err != nil {
  201. return err
  202. }
  203. if err = w.sync(); err != nil {
  204. return err
  205. }
  206. w.f.Close()
  207. // update writer and save the previous crc
  208. w.f = f
  209. w.seq++
  210. prevCrc := w.encoder.crc.Sum32()
  211. w.encoder = newEncoder(w.f, prevCrc)
  212. if err := w.saveCrc(prevCrc); err != nil {
  213. return err
  214. }
  215. return w.encoder.encode(&walpb.Record{Type: metadataType, Data: w.metadata})
  216. }
  217. func (w *WAL) sync() error {
  218. if w.encoder != nil {
  219. if err := w.encoder.flush(); err != nil {
  220. return err
  221. }
  222. }
  223. return w.f.Sync()
  224. }
  225. func (w *WAL) Close() error {
  226. if w.f != nil {
  227. if err := w.sync(); err != nil {
  228. return err
  229. }
  230. if err := w.f.Close(); err != nil {
  231. return err
  232. }
  233. }
  234. return nil
  235. }
  236. func (w *WAL) SaveEntry(e *raftpb.Entry) error {
  237. b := pbutil.MustMarshal(e)
  238. rec := &walpb.Record{Type: entryType, Data: b}
  239. if err := w.encoder.encode(rec); err != nil {
  240. return err
  241. }
  242. w.enti = e.Index
  243. return nil
  244. }
  245. func (w *WAL) SaveState(s *raftpb.HardState) error {
  246. if raft.IsEmptyHardState(*s) {
  247. return nil
  248. }
  249. b := pbutil.MustMarshal(s)
  250. rec := &walpb.Record{Type: stateType, Data: b}
  251. return w.encoder.encode(rec)
  252. }
  253. func (w *WAL) Save(st raftpb.HardState, ents []raftpb.Entry) error {
  254. // TODO(xiangli): no more reference operator
  255. if err := w.SaveState(&st); err != nil {
  256. return err
  257. }
  258. for i := range ents {
  259. if err := w.SaveEntry(&ents[i]); err != nil {
  260. return err
  261. }
  262. }
  263. return w.sync()
  264. }
  265. func (w *WAL) saveCrc(prevCrc uint32) error {
  266. return w.encoder.encode(&walpb.Record{Type: crcType, Crc: prevCrc})
  267. }