wal.go 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. /*
  2. Copyright 2014 CoreOS, Inc.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package wal
  14. import (
  15. "errors"
  16. "fmt"
  17. "hash/crc32"
  18. "io"
  19. "log"
  20. "os"
  21. "path"
  22. "reflect"
  23. "github.com/coreos/etcd/pkg/fileutil"
  24. "github.com/coreos/etcd/pkg/pbutil"
  25. "github.com/coreos/etcd/raft"
  26. "github.com/coreos/etcd/raft/raftpb"
  27. "github.com/coreos/etcd/wal/walpb"
  28. )
  // Record types carried in the Type field of a walpb.Record.
  // Numbering starts at 1.
  const (
  	metadataType int64 = 1 + iota
  	entryType
  	stateType
  	crcType
  )

  // privateDirMode is the permission applied to the WAL directory:
  // the owner can make/remove files inside the directory.
  const privateDirMode = 0700
  // Sentinel errors reported by this package.
  var (
  	// ErrMetadataConflict is reported when two metadata records disagree.
  	ErrMetadataConflict = errors.New("wal: conflicting metadata found")
  	// ErrFileNotFound is reported when no suitable WAL file can be located.
  	ErrFileNotFound = errors.New("wal: file not found")
  	// ErrCRCMismatch is reported when a crc record fails validation.
  	ErrCRCMismatch = errors.New("wal: crc mismatch")
  )

  // crcTable is the CRC-32 table for the Castagnoli polynomial.
  var crcTable = crc32.MakeTable(crc32.Castagnoli)
  43. // WAL is a logical repersentation of the stable storage.
  44. // WAL is either in read mode or append mode but not both.
  45. // A newly created WAL is in append mode, and ready for appending records.
  46. // A just opened WAL is in read mode, and ready for reading records.
  47. // The WAL will be ready for appending after reading out all the previous records.
  48. type WAL struct {
  49. dir string // the living directory of the underlay files
  50. metadata []byte // metadata recorded at the head of each WAL
  51. state raftpb.HardState // hardstate recorded at the head of WAL
  52. ri uint64 // index of entry to start reading
  53. decoder *decoder // decoder to decode records
  54. f *os.File // underlay file opened for appending, sync
  55. seq uint64 // sequence of the wal file currently used for writes
  56. enti uint64 // index of the last entry saved to the wal
  57. encoder *encoder // encoder to encode records
  58. locks []fileutil.Lock // the file locks the WAL is holding (the name is increasing)
  59. }
  60. // Create creates a WAL ready for appending records. The given metadata is
  61. // recorded at the head of each WAL file, and can be retrieved with ReadAll.
  62. func Create(dirpath string, metadata []byte) (*WAL, error) {
  63. if Exist(dirpath) {
  64. return nil, os.ErrExist
  65. }
  66. if err := os.MkdirAll(dirpath, privateDirMode); err != nil {
  67. return nil, err
  68. }
  69. p := path.Join(dirpath, walName(0, 0))
  70. f, err := os.OpenFile(p, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0600)
  71. if err != nil {
  72. return nil, err
  73. }
  74. l, err := fileutil.NewLock(f.Name())
  75. if err != nil {
  76. return nil, err
  77. }
  78. err = l.Lock()
  79. if err != nil {
  80. return nil, err
  81. }
  82. w := &WAL{
  83. dir: dirpath,
  84. metadata: metadata,
  85. seq: 0,
  86. f: f,
  87. encoder: newEncoder(f, 0),
  88. }
  89. w.locks = append(w.locks, l)
  90. if err := w.saveCrc(0); err != nil {
  91. return nil, err
  92. }
  93. if err := w.encoder.encode(&walpb.Record{Type: metadataType, Data: metadata}); err != nil {
  94. return nil, err
  95. }
  96. if err = w.sync(); err != nil {
  97. return nil, err
  98. }
  99. return w, nil
  100. }
  101. // Open opens the WAL at the given index.
  102. // The index SHOULD have been previously committed to the WAL, or the following
  103. // ReadAll will fail.
  104. // The returned WAL is ready to read and the first record will be the given
  105. // index. The WAL cannot be appended to before reading out all of its
  106. // previous records.
  107. func Open(dirpath string, index uint64) (*WAL, error) {
  108. return openAtIndex(dirpath, index, true)
  109. }
  110. // OpenNotInUse only opens the wal files that are not in use.
  111. // Other than that, it is similar to Open.
  112. func OpenNotInUse(dirpath string, index uint64) (*WAL, error) {
  113. return openAtIndex(dirpath, index, false)
  114. }
  115. func openAtIndex(dirpath string, index uint64, all bool) (*WAL, error) {
  116. names, err := fileutil.ReadDir(dirpath)
  117. if err != nil {
  118. return nil, err
  119. }
  120. names = checkWalNames(names)
  121. if len(names) == 0 {
  122. return nil, ErrFileNotFound
  123. }
  124. nameIndex, ok := searchIndex(names, index)
  125. if !ok || !isValidSeq(names[nameIndex:]) {
  126. return nil, ErrFileNotFound
  127. }
  128. // open the wal files for reading
  129. rcs := make([]io.ReadCloser, 0)
  130. ls := make([]fileutil.Lock, 0)
  131. for _, name := range names[nameIndex:] {
  132. f, err := os.Open(path.Join(dirpath, name))
  133. if err != nil {
  134. return nil, err
  135. }
  136. l, err := fileutil.NewLock(f.Name())
  137. if err != nil {
  138. return nil, err
  139. }
  140. err = l.TryLock()
  141. if err != nil {
  142. if all {
  143. return nil, err
  144. } else {
  145. log.Printf("wal: opened all the files until %s, since it is still in use by an etcd server", name)
  146. break
  147. }
  148. }
  149. rcs = append(rcs, f)
  150. ls = append(ls, l)
  151. }
  152. rc := MultiReadCloser(rcs...)
  153. // open the lastest wal file for appending
  154. seq, _, err := parseWalName(names[len(names)-1])
  155. if err != nil {
  156. rc.Close()
  157. return nil, err
  158. }
  159. last := path.Join(dirpath, names[len(names)-1])
  160. f, err := os.OpenFile(last, os.O_WRONLY|os.O_APPEND, 0)
  161. if err != nil {
  162. rc.Close()
  163. return nil, err
  164. }
  165. // create a WAL ready for reading
  166. w := &WAL{
  167. dir: dirpath,
  168. ri: index,
  169. decoder: newDecoder(rc),
  170. f: f,
  171. seq: seq,
  172. locks: ls,
  173. }
  174. return w, nil
  175. }
  176. // ReadAll reads out all records of the current WAL.
  177. // If it cannot read out the expected entry, it will return ErrIndexNotFound.
  178. // After ReadAll, the WAL will be ready for appending new records.
  179. func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.Entry, err error) {
  180. rec := &walpb.Record{}
  181. decoder := w.decoder
  182. for err = decoder.decode(rec); err == nil; err = decoder.decode(rec) {
  183. switch rec.Type {
  184. case entryType:
  185. e := mustUnmarshalEntry(rec.Data)
  186. if e.Index >= w.ri {
  187. ents = append(ents[:e.Index-w.ri], e)
  188. }
  189. w.enti = e.Index
  190. case stateType:
  191. state = mustUnmarshalState(rec.Data)
  192. case metadataType:
  193. if metadata != nil && !reflect.DeepEqual(metadata, rec.Data) {
  194. state.Reset()
  195. return nil, state, nil, ErrMetadataConflict
  196. }
  197. metadata = rec.Data
  198. case crcType:
  199. crc := decoder.crc.Sum32()
  200. // current crc of decoder must match the crc of the record.
  201. // do no need to match 0 crc, since the decoder is a new one at this case.
  202. if crc != 0 && rec.Validate(crc) != nil {
  203. state.Reset()
  204. return nil, state, nil, ErrCRCMismatch
  205. }
  206. decoder.updateCRC(rec.Crc)
  207. default:
  208. state.Reset()
  209. return nil, state, nil, fmt.Errorf("unexpected block type %d", rec.Type)
  210. }
  211. }
  212. if err != io.EOF {
  213. state.Reset()
  214. return nil, state, nil, err
  215. }
  216. // close decoder, disable reading
  217. w.decoder.close()
  218. w.ri = 0
  219. w.metadata = metadata
  220. // create encoder (chain crc with the decoder), enable appending
  221. w.encoder = newEncoder(w.f, w.decoder.lastCRC())
  222. w.decoder = nil
  223. return metadata, state, ents, nil
  224. }
  225. // Cut closes current file written and creates a new one ready to append.
  226. func (w *WAL) Cut() error {
  227. // create a new wal file with name sequence + 1
  228. fpath := path.Join(w.dir, walName(w.seq+1, w.enti+1))
  229. f, err := os.OpenFile(fpath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0600)
  230. if err != nil {
  231. return err
  232. }
  233. l, err := fileutil.NewLock(f.Name())
  234. if err != nil {
  235. return err
  236. }
  237. err = l.Lock()
  238. if err != nil {
  239. return err
  240. }
  241. w.locks = append(w.locks, l)
  242. if err = w.sync(); err != nil {
  243. return err
  244. }
  245. w.f.Close()
  246. // update writer and save the previous crc
  247. w.f = f
  248. w.seq++
  249. prevCrc := w.encoder.crc.Sum32()
  250. w.encoder = newEncoder(w.f, prevCrc)
  251. if err := w.saveCrc(prevCrc); err != nil {
  252. return err
  253. }
  254. if err := w.encoder.encode(&walpb.Record{Type: metadataType, Data: w.metadata}); err != nil {
  255. return err
  256. }
  257. if err := w.SaveState(&w.state); err != nil {
  258. return err
  259. }
  260. return w.sync()
  261. }
  262. func (w *WAL) sync() error {
  263. if w.encoder != nil {
  264. if err := w.encoder.flush(); err != nil {
  265. return err
  266. }
  267. }
  268. return w.f.Sync()
  269. }
  270. // ReleaseLockTo releases the locks w is holding, which
  271. // have index smaller or equal to the given index.
  272. func (w *WAL) ReleaseLockTo(index uint64) error {
  273. for _, l := range w.locks {
  274. _, i, err := parseWalName(path.Base(l.Name()))
  275. if err != nil {
  276. return err
  277. }
  278. if i > index {
  279. return nil
  280. }
  281. err = l.Unlock()
  282. if err != nil {
  283. return err
  284. }
  285. err = l.Destroy()
  286. if err != nil {
  287. return err
  288. }
  289. w.locks = w.locks[1:]
  290. }
  291. return nil
  292. }
  293. func (w *WAL) Close() error {
  294. if w.f != nil {
  295. if err := w.sync(); err != nil {
  296. return err
  297. }
  298. if err := w.f.Close(); err != nil {
  299. return err
  300. }
  301. }
  302. for _, l := range w.locks {
  303. // TODO: log the error
  304. l.Unlock()
  305. l.Destroy()
  306. }
  307. return nil
  308. }
  309. func (w *WAL) SaveEntry(e *raftpb.Entry) error {
  310. b := pbutil.MustMarshal(e)
  311. rec := &walpb.Record{Type: entryType, Data: b}
  312. if err := w.encoder.encode(rec); err != nil {
  313. return err
  314. }
  315. w.enti = e.Index
  316. return nil
  317. }
  318. func (w *WAL) SaveState(s *raftpb.HardState) error {
  319. if raft.IsEmptyHardState(*s) {
  320. return nil
  321. }
  322. w.state = *s
  323. b := pbutil.MustMarshal(s)
  324. rec := &walpb.Record{Type: stateType, Data: b}
  325. return w.encoder.encode(rec)
  326. }
  327. func (w *WAL) Save(st raftpb.HardState, ents []raftpb.Entry) error {
  328. // TODO(xiangli): no more reference operator
  329. if err := w.SaveState(&st); err != nil {
  330. return err
  331. }
  332. for i := range ents {
  333. if err := w.SaveEntry(&ents[i]); err != nil {
  334. return err
  335. }
  336. }
  337. return w.sync()
  338. }
  339. func (w *WAL) saveCrc(prevCrc uint32) error {
  340. return w.encoder.encode(&walpb.Record{Type: crcType, Crc: prevCrc})
  341. }