mscfb.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. // Copyright 2013 Richard Lehane. All rights reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // Package mscfb implements a reader for Microsoft's Compound File Binary File Format (http://msdn.microsoft.com/en-us/library/dd942138.aspx).
  15. //
  16. // The Compound File Binary File Format is also known as the Object Linking and Embedding (OLE) or Component Object Model (COM) format and was used by many
  17. // early MS software such as MS Office.
  18. //
  19. // Example:
  20. // file, _ := os.Open("test/test.doc")
  21. // defer file.Close()
  22. // doc, err := mscfb.New(file)
  23. // if err != nil {
  24. // log.Fatal(err)
  25. // }
  26. // for entry, err := doc.Next(); err == nil; entry, err = doc.Next() {
  27. // buf := make([]byte, 512)
  28. // i, _ := entry.Read(buf)
  29. // if i > 0 {
  30. // fmt.Println(buf[:i])
  31. // }
  32. // fmt.Println(entry.Name)
  33. // }
  34. package mscfb
  35. import (
  36. "encoding/binary"
  37. "io"
  38. "strconv"
  39. "time"
  40. )
  41. func fileOffset(ss, sn uint32) int64 {
  42. return int64((sn + 1) * ss)
  43. }
  44. const (
  45. signature uint64 = 0xE11AB1A1E011CFD0
  46. miniStreamSectorSize uint32 = 64
  47. miniStreamCutoffSize int64 = 4096
  48. dirEntrySize uint32 = 128 //128 bytes
  49. )
  50. const (
  51. maxRegSect uint32 = 0xFFFFFFFA // Maximum regular sector number
  52. difatSect uint32 = 0xFFFFFFFC //Specifies a DIFAT sector in the FAT
  53. fatSect uint32 = 0xFFFFFFFD // Specifies a FAT sector in the FAT
  54. endOfChain uint32 = 0xFFFFFFFE // End of linked chain of sectors
  55. freeSect uint32 = 0xFFFFFFFF // Speficies unallocated sector in the FAT, Mini FAT or DIFAT
  56. maxRegStreamID uint32 = 0xFFFFFFFA // maximum regular stream ID
  57. noStream uint32 = 0xFFFFFFFF // empty pointer
  58. )
  59. const lenHeader int = 8 + 16 + 10 + 6 + 12 + 8 + 16 + 109*4
  60. type headerFields struct {
  61. signature uint64
  62. _ [16]byte //CLSID - ignore, must be null
  63. minorVersion uint16 //Version number for non-breaking changes. This field SHOULD be set to 0x003E if the major version field is either 0x0003 or 0x0004.
  64. majorVersion uint16 //Version number for breaking changes. This field MUST be set to either 0x0003 (version 3) or 0x0004 (version 4).
  65. _ [2]byte //byte order - ignore, must be little endian
  66. sectorSize uint16 //This field MUST be set to 0x0009, or 0x000c, depending on the Major Version field. This field specifies the sector size of the compound file as a power of 2. If Major Version is 3, then the Sector Shift MUST be 0x0009, specifying a sector size of 512 bytes. If Major Version is 4, then the Sector Shift MUST be 0x000C, specifying a sector size of 4096 bytes.
  67. _ [2]byte // ministream sector size - ignore, must be 64 bytes
  68. _ [6]byte // reserved - ignore, not used
  69. numDirectorySectors uint32 //This integer field contains the count of the number of directory sectors in the compound file. If Major Version is 3, then the Number of Directory Sectors MUST be zero. This field is not supported for version 3 compound files.
  70. numFatSectors uint32 //This integer field contains the count of the number of FAT sectors in the compound file.
  71. directorySectorLoc uint32 //This integer field contains the starting sector number for the directory stream.
  72. _ [4]byte // transaction - ignore, not used
  73. _ [4]byte // mini stream size cutooff - ignore, must be 4096 bytes
  74. miniFatSectorLoc uint32 //This integer field contains the starting sector number for the mini FAT.
  75. numMiniFatSectors uint32 //This integer field contains the count of the number of mini FAT sectors in the compound file.
  76. difatSectorLoc uint32 //This integer field contains the starting sector number for the DIFAT.
  77. numDifatSectors uint32 //This integer field contains the count of the number of DIFAT sectors in the compound file.
  78. initialDifats [109]uint32 //The first 109 difat sectors are included in the header
  79. }
  80. func makeHeader(b []byte) *headerFields {
  81. h := &headerFields{}
  82. h.signature = binary.LittleEndian.Uint64(b[:8])
  83. h.minorVersion = binary.LittleEndian.Uint16(b[24:26])
  84. h.majorVersion = binary.LittleEndian.Uint16(b[26:28])
  85. h.sectorSize = binary.LittleEndian.Uint16(b[30:32])
  86. h.numDirectorySectors = binary.LittleEndian.Uint32(b[40:44])
  87. h.numFatSectors = binary.LittleEndian.Uint32(b[44:48])
  88. h.directorySectorLoc = binary.LittleEndian.Uint32(b[48:52])
  89. h.miniFatSectorLoc = binary.LittleEndian.Uint32(b[60:64])
  90. h.numMiniFatSectors = binary.LittleEndian.Uint32(b[64:68])
  91. h.difatSectorLoc = binary.LittleEndian.Uint32(b[68:72])
  92. h.numDifatSectors = binary.LittleEndian.Uint32(b[72:76])
  93. var idx int
  94. for i := 76; i < 512; i = i + 4 {
  95. h.initialDifats[idx] = binary.LittleEndian.Uint32(b[i : i+4])
  96. idx++
  97. }
  98. return h
  99. }
  100. type header struct {
  101. *headerFields
  102. difats []uint32
  103. miniFatLocs []uint32
  104. miniStreamLocs []uint32 // chain of sectors containing the ministream
  105. }
  106. func (r *Reader) setHeader() error {
  107. buf, err := r.readAt(0, lenHeader)
  108. if err != nil {
  109. return err
  110. }
  111. r.header = &header{headerFields: makeHeader(buf)}
  112. // sanity check - check signature
  113. if r.header.signature != signature {
  114. return Error{ErrFormat, "bad signature", int64(r.header.signature)}
  115. }
  116. // check for legal sector size
  117. if r.header.sectorSize == 0x0009 || r.header.sectorSize == 0x000c {
  118. r.sectorSize = uint32(1 << r.header.sectorSize)
  119. } else {
  120. return Error{ErrFormat, "illegal sector size", int64(r.header.sectorSize)}
  121. }
  122. // check for DIFAT overflow
  123. if r.header.numDifatSectors > 0 {
  124. sz := (r.sectorSize / 4) - 1
  125. if int(r.header.numDifatSectors*sz+109) < 0 {
  126. return Error{ErrFormat, "DIFAT int overflow", int64(r.header.numDifatSectors)}
  127. }
  128. if r.header.numDifatSectors*sz+109 > r.header.numFatSectors+sz {
  129. return Error{ErrFormat, "num DIFATs exceeds FAT sectors", int64(r.header.numDifatSectors)}
  130. }
  131. }
  132. // check for mini FAT overflow
  133. if r.header.numMiniFatSectors > 0 {
  134. if int(r.sectorSize/4*r.header.numMiniFatSectors) < 0 {
  135. return Error{ErrFormat, "mini FAT int overflow", int64(r.header.numMiniFatSectors)}
  136. }
  137. if r.header.numMiniFatSectors > r.header.numFatSectors*(r.sectorSize/miniStreamSectorSize) {
  138. return Error{ErrFormat, "num mini FATs exceeds FAT sectors", int64(r.header.numFatSectors)}
  139. }
  140. }
  141. return nil
  142. }
  143. func (r *Reader) setDifats() error {
  144. r.header.difats = r.header.initialDifats[:]
  145. // return early if no extra DIFAT sectors
  146. if r.header.numDifatSectors == 0 {
  147. return nil
  148. }
  149. sz := (r.sectorSize / 4) - 1
  150. n := make([]uint32, 109, r.header.numDifatSectors*sz+109)
  151. copy(n, r.header.difats)
  152. r.header.difats = n
  153. off := r.header.difatSectorLoc
  154. for i := 0; i < int(r.header.numDifatSectors); i++ {
  155. buf, err := r.readAt(fileOffset(r.sectorSize, off), int(r.sectorSize))
  156. if err != nil {
  157. return Error{ErrFormat, "error setting DIFAT(" + err.Error() + ")", int64(off)}
  158. }
  159. for j := 0; j < int(sz); j++ {
  160. r.header.difats = append(r.header.difats, binary.LittleEndian.Uint32(buf[j*4:j*4+4]))
  161. }
  162. off = binary.LittleEndian.Uint32(buf[len(buf)-4:])
  163. }
  164. return nil
  165. }
  166. // set the ministream FAT and sector slices in the header
  167. func (r *Reader) setMiniStream() error {
  168. // do nothing if there is no ministream
  169. if r.direntries[0].startingSectorLoc == endOfChain || r.header.miniFatSectorLoc == endOfChain || r.header.numMiniFatSectors == 0 {
  170. return nil
  171. }
  172. // build a slice of minifat sectors (akin to the DIFAT slice)
  173. c := int(r.header.numMiniFatSectors)
  174. r.header.miniFatLocs = make([]uint32, c)
  175. r.header.miniFatLocs[0] = r.header.miniFatSectorLoc
  176. for i := 1; i < c; i++ {
  177. loc, err := r.findNext(r.header.miniFatLocs[i-1], false)
  178. if err != nil {
  179. return Error{ErrFormat, "setting mini stream (" + err.Error() + ")", int64(r.header.miniFatLocs[i-1])}
  180. }
  181. r.header.miniFatLocs[i] = loc
  182. }
  183. // build a slice of ministream sectors
  184. c = int(r.sectorSize / 4 * r.header.numMiniFatSectors)
  185. r.header.miniStreamLocs = make([]uint32, 0, c)
  186. sn := r.direntries[0].startingSectorLoc
  187. var err error
  188. for sn != endOfChain {
  189. r.header.miniStreamLocs = append(r.header.miniStreamLocs, sn)
  190. sn, err = r.findNext(sn, false)
  191. if err != nil {
  192. return Error{ErrFormat, "setting mini stream (" + err.Error() + ")", int64(sn)}
  193. }
  194. }
  195. return nil
  196. }
  197. func (r *Reader) readAt(offset int64, length int) ([]byte, error) {
  198. if r.slicer {
  199. b, err := r.ra.(slicer).Slice(offset, length)
  200. if err != nil {
  201. return nil, Error{ErrRead, "slicer read error (" + err.Error() + ")", offset}
  202. }
  203. return b, nil
  204. }
  205. if length > len(r.buf) {
  206. return nil, Error{ErrRead, "read length greater than read buffer", int64(length)}
  207. }
  208. if _, err := r.ra.ReadAt(r.buf[:length], offset); err != nil {
  209. return nil, Error{ErrRead, err.Error(), offset}
  210. }
  211. return r.buf[:length], nil
  212. }
  213. func (r *Reader) getOffset(sn uint32, mini bool) (int64, error) {
  214. if mini {
  215. num := r.sectorSize / 64
  216. sec := int(sn / num)
  217. if sec >= len(r.header.miniStreamLocs) {
  218. return 0, Error{ErrRead, "minisector number is outside minisector range", int64(sec)}
  219. }
  220. dif := sn % num
  221. return int64((r.header.miniStreamLocs[sec]+1)*r.sectorSize + dif*64), nil
  222. }
  223. return fileOffset(r.sectorSize, sn), nil
  224. }
  225. // check the FAT sector for the next sector in a chain
  226. func (r *Reader) findNext(sn uint32, mini bool) (uint32, error) {
  227. entries := r.sectorSize / 4
  228. index := int(sn / entries) // find position in DIFAT or minifat array
  229. var sect uint32
  230. if mini {
  231. if index < 0 || index >= len(r.header.miniFatLocs) {
  232. return 0, Error{ErrRead, "minisector index is outside miniFAT range", int64(index)}
  233. }
  234. sect = r.header.miniFatLocs[index]
  235. } else {
  236. if index < 0 || index >= len(r.header.difats) {
  237. return 0, Error{ErrRead, "FAT index is outside DIFAT range", int64(index)}
  238. }
  239. sect = r.header.difats[index]
  240. }
  241. fatIndex := sn % entries // find position within FAT or MiniFAT sector
  242. offset := fileOffset(r.sectorSize, sect) + int64(fatIndex*4)
  243. buf, err := r.readAt(offset, 4)
  244. if err != nil {
  245. return 0, Error{ErrRead, "bad read finding next sector (" + err.Error() + ")", offset}
  246. }
  247. return binary.LittleEndian.Uint32(buf), nil
  248. }
  249. // Reader provides sequential access to the contents of a MS compound file (MSCFB)
  250. type Reader struct {
  251. slicer bool
  252. sectorSize uint32
  253. buf []byte
  254. header *header
  255. File []*File // File is an ordered slice of final directory entries.
  256. direntries []*File // unordered raw directory entries
  257. entry int
  258. ra io.ReaderAt
  259. wa io.WriterAt
  260. }
  261. // New returns a MSCFB reader
  262. func New(ra io.ReaderAt) (*Reader, error) {
  263. r := &Reader{ra: ra}
  264. if _, ok := ra.(slicer); ok {
  265. r.slicer = true
  266. } else {
  267. r.buf = make([]byte, lenHeader)
  268. }
  269. if err := r.setHeader(); err != nil {
  270. return nil, err
  271. }
  272. // resize the buffer to 4096 if sector size isn't 512
  273. if !r.slicer && int(r.sectorSize) > len(r.buf) {
  274. r.buf = make([]byte, r.sectorSize)
  275. }
  276. if err := r.setDifats(); err != nil {
  277. return nil, err
  278. }
  279. if err := r.setDirEntries(); err != nil {
  280. return nil, err
  281. }
  282. if err := r.setMiniStream(); err != nil {
  283. return nil, err
  284. }
  285. if err := r.traverse(); err != nil {
  286. return nil, err
  287. }
  288. return r, nil
  289. }
  290. // ID returns the CLSID (class ID) field from the root directory entry
  291. func (r *Reader) ID() string {
  292. return r.File[0].ID()
  293. }
  294. // Created returns the created field from the root directory entry
  295. func (r *Reader) Created() time.Time {
  296. return r.File[0].Created()
  297. }
  298. // Modified returns the last modified field from the root directory entry
  299. func (r *Reader) Modified() time.Time {
  300. return r.File[0].Modified()
  301. }
  302. // Next iterates to the next directory entry.
  303. // This isn't necessarily an adjacent *File within the File slice, but is based on the Left Sibling, Right Sibling and Child information in directory entries.
  304. func (r *Reader) Next() (*File, error) {
  305. r.entry++
  306. if r.entry >= len(r.File) {
  307. return nil, io.EOF
  308. }
  309. return r.File[r.entry], nil
  310. }
  311. // Read the current directory entry
  312. func (r *Reader) Read(b []byte) (n int, err error) {
  313. if r.entry >= len(r.File) {
  314. return 0, io.EOF
  315. }
  316. return r.File[r.entry].Read(b)
  317. }
  318. // Debug provides granular information from an mscfb file to assist with debugging
  319. func (r *Reader) Debug() map[string][]uint32 {
  320. ret := map[string][]uint32{
  321. "sector size": []uint32{r.sectorSize},
  322. "mini fat locs": r.header.miniFatLocs,
  323. "mini stream locs": r.header.miniStreamLocs,
  324. "directory sector": []uint32{r.header.directorySectorLoc},
  325. "mini stream start/size": []uint32{r.File[0].startingSectorLoc, binary.LittleEndian.Uint32(r.File[0].streamSize[:])},
  326. }
  327. for f, err := r.Next(); err == nil; f, err = r.Next() {
  328. ret[f.Name+" start/size"] = []uint32{f.startingSectorLoc, binary.LittleEndian.Uint32(f.streamSize[:])}
  329. }
  330. return ret
  331. }
  332. const (
  333. // ErrFormat reports issues with the MSCFB's header structures
  334. ErrFormat = iota
  335. // ErrRead reports issues attempting to read MSCFB streams
  336. ErrRead
  337. // ErrSeek reports seek issues
  338. ErrSeek
  339. // ErrWrite reports write issues
  340. ErrWrite
  341. // ErrTraverse reports issues attempting to traverse the child-parent-sibling relations
  342. // between MSCFB storage objects
  343. ErrTraverse
  344. )
  345. type Error struct {
  346. typ int
  347. msg string
  348. val int64
  349. }
  350. func (e Error) Error() string {
  351. return "mscfb: " + e.msg + "; " + strconv.FormatInt(e.val, 10)
  352. }
  353. // Typ gives the type of MSCFB error
  354. func (e Error) Typ() int {
  355. return e.typ
  356. }
  357. // Slicer interface avoids a copy by obtaining a byte slice directly from the underlying reader
  358. type slicer interface {
  359. Slice(offset int64, length int) ([]byte, error)
  360. }