reader.go

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package zip

import (
	"bufio"
	"encoding/binary"
	"errors"
	"fmt"
	"hash"
	"hash/crc32"
	"io"
	"os"
	"time"
)

var (
	ErrFormat    = errors.New("zip: not a valid zip file")
	ErrAlgorithm = errors.New("zip: unsupported compression algorithm")
	ErrChecksum  = errors.New("zip: checksum error")
)

// A Reader serves content from a ZIP archive.
type Reader struct {
	r             io.ReaderAt
	File          []*File
	Comment       string
	decompressors map[uint16]Decompressor
}

// A ReadCloser is a Reader that must be closed when no longer needed.
type ReadCloser struct {
	f *os.File
	Reader
}

// A File is a single file in a ZIP archive.
// The file information is in the embedded FileHeader.
// The file content can be accessed by calling Open.
type File struct {
	FileHeader
	zip          *Reader
	zipr         io.ReaderAt
	zipsize      int64
	headerOffset int64
}

// hasDataDescriptor reports whether the file's flags indicate that a data
// descriptor record follows the file body.
func (f *File) hasDataDescriptor() bool {
	return f.Flags&0x8 != 0
}

// OpenReader will open the Zip file specified by name and return a ReadCloser.
func OpenReader(name string) (*ReadCloser, error) {
	f, err := os.Open(name)
	if err != nil {
		return nil, err
	}
	fi, err := f.Stat()
	if err != nil {
		f.Close()
		return nil, err
	}
	r := new(ReadCloser)
	if err := r.init(f, fi.Size()); err != nil {
		f.Close()
		return nil, err
	}
	r.f = f
	return r, nil
}
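
// Example (illustrative sketch, not part of the original file): a typical
// caller opens an archive by path, walks the central-directory entries in
// rc.File, and closes the ReadCloser when finished. The path "a.zip" and the
// use of log and fmt are placeholders.
//
//	rc, err := zip.OpenReader("a.zip")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer rc.Close()
//	for _, f := range rc.File {
//		fmt.Printf("%s (%d bytes)\n", f.Name, f.UncompressedSize64)
//	}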

// NewReader returns a new Reader reading from r, which is assumed to
// have the given size in bytes.
func NewReader(r io.ReaderAt, size int64) (*Reader, error) {
	if size < 0 {
		return nil, errors.New("zip: size cannot be negative")
	}
	zr := new(Reader)
	if err := zr.init(r, size); err != nil {
		return nil, err
	}
	return zr, nil
}
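
// Example (illustrative sketch, not part of the original file): NewReader is
// the entry point when the archive is already in memory; *bytes.Reader
// satisfies io.ReaderAt. zipData is a hypothetical []byte holding a complete
// archive.
//
//	zr, err := zip.NewReader(bytes.NewReader(zipData), int64(len(zipData)))
//	if err != nil {
//		log.Fatal(err)
//	}
//	// zr.File now holds the central-directory entries, as with OpenReader.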

func (z *Reader) init(r io.ReaderAt, size int64) error {
	end, err := readDirectoryEnd(r, size)
	if err != nil {
		return err
	}
	if end.directoryRecords > uint64(size)/fileHeaderLen {
		return fmt.Errorf("archive/zip: TOC declares impossible %d files in %d byte zip", end.directoryRecords, size)
	}
	z.r = r
	z.File = make([]*File, 0, end.directoryRecords)
	z.Comment = end.comment
	rs := io.NewSectionReader(r, 0, size)
	if _, err = rs.Seek(int64(end.directoryOffset), io.SeekStart); err != nil {
		return err
	}
	buf := bufio.NewReader(rs)

	// The count of files inside a zip is truncated to fit in a uint16.
	// Gloss over this by reading headers until we encounter
	// a bad one, and then only report an ErrFormat or UnexpectedEOF if
	// the file count modulo 65536 is incorrect.
	for {
		f := &File{zip: z, zipr: r, zipsize: size}
		err = readDirectoryHeader(f, buf)
		if err == ErrFormat || err == io.ErrUnexpectedEOF {
			break
		}
		if err != nil {
			return err
		}
		z.File = append(z.File, f)
	}
	if uint16(len(z.File)) != uint16(end.directoryRecords) { // only compare 16 bits here
		// Return the readDirectoryHeader error if we read
		// the wrong number of directory entries.
		return err
	}
	return nil
}

// RegisterDecompressor registers or overrides a custom decompressor for a
// specific method ID. If a decompressor for a given method is not found,
// Reader will default to looking up the decompressor at the package level.
func (z *Reader) RegisterDecompressor(method uint16, dcomp Decompressor) {
	if z.decompressors == nil {
		z.decompressors = make(map[uint16]Decompressor)
	}
	z.decompressors[method] = dcomp
}
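
// Example (illustrative sketch, not part of the original file): a
// Decompressor is a func(io.Reader) io.ReadCloser, so wiring up method 0
// (Store) explicitly only needs io.NopCloser. zr is assumed to come from
// NewReader or OpenReader.
//
//	zr.RegisterDecompressor(zip.Store, func(r io.Reader) io.ReadCloser {
//		return io.NopCloser(r)
//	})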

func (z *Reader) decompressor(method uint16) Decompressor {
	dcomp := z.decompressors[method]
	if dcomp == nil {
		dcomp = decompressor(method)
	}
	return dcomp
}

// Close closes the Zip file, rendering it unusable for I/O.
func (rc *ReadCloser) Close() error {
	return rc.f.Close()
}

// DataOffset returns the offset of the file's possibly-compressed
// data, relative to the beginning of the zip file.
//
// Most callers should instead use Open, which transparently
// decompresses data and verifies checksums.
func (f *File) DataOffset() (offset int64, err error) {
	bodyOffset, err := f.findBodyOffset()
	if err != nil {
		return
	}
	return f.headerOffset + bodyOffset, nil
}
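
// Example (illustrative sketch, not part of the original file): DataOffset
// plus CompressedSize64 bounds the stored bytes of an entry, which is enough
// to hand the raw region to another io.ReaderAt consumer. ra is a
// hypothetical io.ReaderAt over the same archive.
//
//	off, err := f.DataOffset()
//	if err != nil {
//		log.Fatal(err)
//	}
//	sr := io.NewSectionReader(ra, off, int64(f.CompressedSize64))
//	_ = sr // still compressed; no checksum verification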

// Open returns a ReadCloser that provides access to the File's contents.
// Multiple files may be read concurrently.
func (f *File) Open() (io.ReadCloser, error) {
	bodyOffset, err := f.findBodyOffset()
	if err != nil {
		return nil, err
	}
	size := int64(f.CompressedSize64)
	r := io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset, size)
	dcomp := f.zip.decompressor(f.Method)
	if dcomp == nil {
		return nil, ErrAlgorithm
	}
	var rc io.ReadCloser = dcomp(r)
	var desr io.Reader
	if f.hasDataDescriptor() {
		desr = io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset+size, dataDescriptorLen)
	}
	rc = &checksumReader{
		rc:   rc,
		hash: crc32.NewIEEE(),
		f:    f,
		desr: desr,
	}
	return rc, nil
}
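
// Example (illustrative sketch, not part of the original file): reading one
// entry's decompressed contents. Note that the CRC32 is only verified once
// the returned stream has been read to EOF.
//
//	r, err := f.Open()
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer r.Close()
//	if _, err := io.Copy(os.Stdout, r); err != nil {
//		log.Fatal(err) // includes ErrChecksum on a CRC mismatch
//	}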

// OpenRaw returns a Reader that provides access to the File's compressed
// contents as stored in the archive, without decompression.
func (f *File) OpenRaw() (io.Reader, error) {
	bodyOffset, err := f.findBodyOffset()
	if err != nil {
		return nil, err
	}
	size := int64(f.CompressedSize64)
	return io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset, size), nil
}
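
// Example (illustrative sketch, not part of the original file): OpenRaw pairs
// with (*Writer).CreateRaw to copy an entry between archives without a
// decompress/recompress round trip. dst is a hypothetical *zip.Writer.
//
//	raw, err := f.OpenRaw()
//	if err != nil {
//		log.Fatal(err)
//	}
//	w, err := dst.CreateRaw(&f.FileHeader)
//	if err != nil {
//		log.Fatal(err)
//	}
//	if _, err := io.Copy(w, raw); err != nil {
//		log.Fatal(err)
//	}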

// checksumReader wraps a decompressed stream and verifies the CRC-32 of the
// data as it is read.
type checksumReader struct {
	rc    io.ReadCloser
	hash  hash.Hash32
	nread uint64 // number of bytes read so far
	f     *File
	desr  io.Reader // if non-nil, where to read the data descriptor
	err   error     // sticky error
}

func (r *checksumReader) Read(b []byte) (n int, err error) {
	if r.err != nil {
		return 0, r.err
	}
	n, err = r.rc.Read(b)
	r.hash.Write(b[:n])
	r.nread += uint64(n)
	if err == nil {
		return
	}
	if err == io.EOF {
		if r.nread != r.f.UncompressedSize64 {
			return 0, io.ErrUnexpectedEOF
		}
		if r.desr != nil {
			if err1 := readDataDescriptor(r.desr, r.f); err1 != nil {
				if err1 == io.EOF {
					err = io.ErrUnexpectedEOF
				} else {
					err = err1
				}
			} else if r.hash.Sum32() != r.f.CRC32 {
				err = ErrChecksum
			}
		} else {
			// If there's not a data descriptor, we still compare
			// the CRC32 of what we've read against the file header
			// or TOC's CRC32, if it seems like it was set.
			if r.f.CRC32 != 0 && r.hash.Sum32() != r.f.CRC32 {
				err = ErrChecksum
			}
		}
	}
	r.err = err
	return
}

func (r *checksumReader) Close() error { return r.rc.Close() }

// findBodyOffset does the minimum work to verify the file has a header
// and returns the file body offset.
func (f *File) findBodyOffset() (int64, error) {
	var buf [fileHeaderLen]byte
	if _, err := f.zipr.ReadAt(buf[:], f.headerOffset); err != nil {
		return 0, err
	}
	b := readBuf(buf[:])
	if sig := b.uint32(); sig != fileHeaderSignature {
		return 0, ErrFormat
	}
	b = b[22:] // skip over most of the header
	filenameLen := int(b.uint16())
	extraLen := int(b.uint16())
	return int64(fileHeaderLen + filenameLen + extraLen), nil
}

// readDirectoryHeader attempts to read a directory header from r.
// It returns io.ErrUnexpectedEOF if it cannot read a complete header,
// and ErrFormat if it doesn't find a valid header signature.
func readDirectoryHeader(f *File, r io.Reader) error {
	var buf [directoryHeaderLen]byte
	if _, err := io.ReadFull(r, buf[:]); err != nil {
		return err
	}
	b := readBuf(buf[:])
	if sig := b.uint32(); sig != directoryHeaderSignature {
		return ErrFormat
	}
	f.CreatorVersion = b.uint16()
	f.ReaderVersion = b.uint16()
	f.Flags = b.uint16()
	f.Method = b.uint16()
	f.ModifiedTime = b.uint16()
	f.ModifiedDate = b.uint16()
	f.CRC32 = b.uint32()
	f.CompressedSize = b.uint32()
	f.UncompressedSize = b.uint32()
	f.CompressedSize64 = uint64(f.CompressedSize)
	f.UncompressedSize64 = uint64(f.UncompressedSize)
	filenameLen := int(b.uint16())
	extraLen := int(b.uint16())
	commentLen := int(b.uint16())
	b = b[4:] // skipped start disk number and internal attributes (2x uint16)
	f.ExternalAttrs = b.uint32()
	f.headerOffset = int64(b.uint32())
	d := make([]byte, filenameLen+extraLen+commentLen)
	if _, err := io.ReadFull(r, d); err != nil {
		return err
	}
	f.Name = string(d[:filenameLen])
	f.Extra = d[filenameLen : filenameLen+extraLen]
	f.Comment = string(d[filenameLen+extraLen:])

	// Determine the character encoding.
	utf8Valid1, utf8Require1 := detectUTF8(f.Name)
	utf8Valid2, utf8Require2 := detectUTF8(f.Comment)
	switch {
	case !utf8Valid1 || !utf8Valid2:
		// Name and Comment definitely not UTF-8.
		f.NonUTF8 = true
	case !utf8Require1 && !utf8Require2:
		// Name and Comment use only single-byte runes that overlap with UTF-8.
		f.NonUTF8 = false
	default:
		// Might be UTF-8, might be some other encoding; preserve existing flag.
		// Some ZIP writers use UTF-8 encoding without setting the UTF-8 flag.
		// Since it is impossible to always distinguish valid UTF-8 from some
		// other encoding (e.g., GBK or Shift-JIS), we trust the flag.
		f.NonUTF8 = f.Flags&0x800 == 0
	}

	needUSize := f.UncompressedSize == ^uint32(0)
	needCSize := f.CompressedSize == ^uint32(0)
	needHeaderOffset := f.headerOffset == int64(^uint32(0))

	// Best effort to find what we need.
	// Other zip authors might not even follow the basic format,
	// and we'll just ignore the Extra content in that case.
	var modified time.Time
parseExtras:
	for extra := readBuf(f.Extra); len(extra) >= 4; { // need at least tag and size
		fieldTag := extra.uint16()
		fieldSize := int(extra.uint16())
		if len(extra) < fieldSize {
			break
		}
		fieldBuf := extra.sub(fieldSize)

		switch fieldTag {
		case zip64ExtraID:
			// Update directory values from the zip64 extra block.
			// They should only be consulted if the sizes read earlier
			// are maxed out.
			// See golang.org/issue/13367.
			if needUSize {
				needUSize = false
				if len(fieldBuf) < 8 {
					return ErrFormat
				}
				f.UncompressedSize64 = fieldBuf.uint64()
			}
			if needCSize {
				needCSize = false
				if len(fieldBuf) < 8 {
					return ErrFormat
				}
				f.CompressedSize64 = fieldBuf.uint64()
			}
			if needHeaderOffset {
				needHeaderOffset = false
				if len(fieldBuf) < 8 {
					return ErrFormat
				}
				f.headerOffset = int64(fieldBuf.uint64())
			}
		case ntfsExtraID:
			if len(fieldBuf) < 4 {
				continue parseExtras
			}
			fieldBuf.uint32()        // reserved (ignored)
			for len(fieldBuf) >= 4 { // need at least tag and size
				attrTag := fieldBuf.uint16()
				attrSize := int(fieldBuf.uint16())
				if len(fieldBuf) < attrSize {
					continue parseExtras
				}
				attrBuf := fieldBuf.sub(attrSize)
				if attrTag != 1 || attrSize != 24 {
					continue // Ignore irrelevant attributes
				}

				const ticksPerSecond = 1e7    // Windows timestamp resolution
				ts := int64(attrBuf.uint64()) // ModTime since Windows epoch
				secs := int64(ts / ticksPerSecond)
				nsecs := (1e9 / ticksPerSecond) * int64(ts%ticksPerSecond)
				epoch := time.Date(1601, time.January, 1, 0, 0, 0, 0, time.UTC)
				modified = time.Unix(epoch.Unix()+secs, nsecs)
			}
		case unixExtraID, infoZipUnixExtraID:
			if len(fieldBuf) < 8 {
				continue parseExtras
			}
			fieldBuf.uint32()              // AcTime (ignored)
			ts := int64(fieldBuf.uint32()) // ModTime since Unix epoch
			modified = time.Unix(ts, 0)
		case extTimeExtraID:
			if len(fieldBuf) < 5 || fieldBuf.uint8()&1 == 0 {
				continue parseExtras
			}
			ts := int64(fieldBuf.uint32()) // ModTime since Unix epoch
			modified = time.Unix(ts, 0)
		}
	}

	msdosModified := msDosTimeToTime(f.ModifiedDate, f.ModifiedTime)
	f.Modified = msdosModified
	if !modified.IsZero() {
		f.Modified = modified.UTC()

		// If legacy MS-DOS timestamps are set, we can use the delta between
		// the legacy and extended versions to estimate timezone offset.
		//
		// A non-UTC timezone is always used (even if offset is zero).
		// Thus, FileHeader.Modified.Location() == time.UTC is useful for
		// determining whether extended timestamps are present.
		// This is necessary for users that need to do additional time
		// calculations when dealing with legacy ZIP formats.
		if f.ModifiedTime != 0 || f.ModifiedDate != 0 {
			f.Modified = modified.In(timeZone(msdosModified.Sub(modified)))
		}
	}

	// Assume that uncompressed size 2³²-1 could plausibly happen in
	// an old zip32 file that was sharding inputs into the largest chunks
	// possible (or is just malicious; search the web for 42.zip).
	// If needUSize is still true, it means we didn't see a zip64 extension.
	// As long as the compressed size is not also 2³²-1 (implausible)
	// and the header offset is not also 2³²-1 (equally implausible),
	// accept the uncompressed size 2³²-1 as valid.
	// If nothing else, this keeps archive/zip working with 42.zip.
	_ = needUSize

	if needCSize || needHeaderOffset {
		return ErrFormat
	}
	return nil
}

// readDataDescriptor reads the data descriptor that follows a file body and
// verifies the CRC32 against the value recorded in f.
func readDataDescriptor(r io.Reader, f *File) error {
	var buf [dataDescriptorLen]byte

	// The spec says: "Although not originally assigned a
	// signature, the value 0x08074b50 has commonly been adopted
	// as a signature value for the data descriptor record.
	// Implementers should be aware that ZIP files may be
	// encountered with or without this signature marking data
	// descriptors and should account for either case when reading
	// ZIP files to ensure compatibility."
	//
	// dataDescriptorLen includes the size of the signature, but
	// first read just those 4 bytes to see if the signature exists.
	if _, err := io.ReadFull(r, buf[:4]); err != nil {
		return err
	}
	off := 0
	maybeSig := readBuf(buf[:4])
	if maybeSig.uint32() != dataDescriptorSignature {
		// No data descriptor signature. Keep these four
		// bytes.
		off += 4
	}
	if _, err := io.ReadFull(r, buf[off:12]); err != nil {
		return err
	}
	b := readBuf(buf[:12])
	if b.uint32() != f.CRC32 {
		return ErrChecksum
	}

	// The two sizes that follow here can be either 32 bits or 64 bits,
	// but the spec is not very clear on this, and different
	// interpretations have been made, causing incompatibilities. We
	// already have the sizes from the central directory, so we can
	// just ignore these.
	return nil
}

// readDirectoryEnd locates and parses the end-of-central-directory record
// near the end of the archive.
func readDirectoryEnd(r io.ReaderAt, size int64) (dir *directoryEnd, err error) {
	// look for directoryEndSignature in the last 1k, then in the last 65k
	var buf []byte
	var directoryEndOffset int64
	for i, bLen := range []int64{1024, 65 * 1024} {
		if bLen > size {
			bLen = size
		}
		buf = make([]byte, int(bLen))
		if _, err := r.ReadAt(buf, size-bLen); err != nil && err != io.EOF {
			return nil, err
		}
		if p := findSignatureInBlock(buf); p >= 0 {
			buf = buf[p:]
			directoryEndOffset = size - bLen + int64(p)
			break
		}
		if i == 1 || bLen == size {
			return nil, ErrFormat
		}
	}

	// read header into struct
	b := readBuf(buf[4:]) // skip signature
	d := &directoryEnd{
		diskNbr:            uint32(b.uint16()),
		dirDiskNbr:         uint32(b.uint16()),
		dirRecordsThisDisk: uint64(b.uint16()),
		directoryRecords:   uint64(b.uint16()),
		directorySize:      uint64(b.uint32()),
		directoryOffset:    uint64(b.uint32()),
		commentLen:         b.uint16(),
	}
	l := int(d.commentLen)
	if l > len(b) {
		return nil, errors.New("zip: invalid comment length")
	}
	d.comment = string(b[:l])

	// These values mean that the file can be a zip64 file
	if d.directoryRecords == 0xffff || d.directorySize == 0xffff || d.directoryOffset == 0xffffffff {
		p, err := findDirectory64End(r, directoryEndOffset)
		if err == nil && p >= 0 {
			err = readDirectory64End(r, p, d)
		}
		if err != nil {
			return nil, err
		}
	}

	// Make sure directoryOffset points to somewhere in our file.
	if o := int64(d.directoryOffset); o < 0 || o >= size {
		return nil, ErrFormat
	}
	return d, nil
}

// findDirectory64End tries to read the zip64 locator just before the
// directory end and returns the offset of the zip64 directory end if
// found.
func findDirectory64End(r io.ReaderAt, directoryEndOffset int64) (int64, error) {
	locOffset := directoryEndOffset - directory64LocLen
	if locOffset < 0 {
		return -1, nil // no need to look for a header outside the file
	}
	buf := make([]byte, directory64LocLen)
	if _, err := r.ReadAt(buf, locOffset); err != nil {
		return -1, err
	}
	b := readBuf(buf)
	if sig := b.uint32(); sig != directory64LocSignature {
		return -1, nil
	}
	if b.uint32() != 0 { // number of the disk with the start of the zip64 end of central directory
		return -1, nil // the file is not a valid zip64-file
	}
	p := b.uint64()      // relative offset of the zip64 end of central directory record
	if b.uint32() != 1 { // total number of disks
		return -1, nil // the file is not a valid zip64-file
	}
	return int64(p), nil
}

// readDirectory64End reads the zip64 directory end and updates the
// directory end with the zip64 directory end values.
func readDirectory64End(r io.ReaderAt, offset int64, d *directoryEnd) (err error) {
	buf := make([]byte, directory64EndLen)
	if _, err := r.ReadAt(buf, offset); err != nil {
		return err
	}
	b := readBuf(buf)
	if sig := b.uint32(); sig != directory64EndSignature {
		return ErrFormat
	}
	b = b[12:]                        // skip dir size, version and version needed (uint64 + 2x uint16)
	d.diskNbr = b.uint32()            // number of this disk
	d.dirDiskNbr = b.uint32()         // number of the disk with the start of the central directory
	d.dirRecordsThisDisk = b.uint64() // total number of entries in the central directory on this disk
	d.directoryRecords = b.uint64()   // total number of entries in the central directory
	d.directorySize = b.uint64()      // size of the central directory
	d.directoryOffset = b.uint64()    // offset of start of central directory with respect to the starting disk number
	return nil
}

// findSignatureInBlock scans b backwards for the end-of-central-directory
// signature and returns its index, or -1 if it is not present.
func findSignatureInBlock(b []byte) int {
	for i := len(b) - directoryEndLen; i >= 0; i-- {
		// defined from directoryEndSignature in struct.go
		if b[i] == 'P' && b[i+1] == 'K' && b[i+2] == 0x05 && b[i+3] == 0x06 {
			// n is length of comment
			n := int(b[i+directoryEndLen-2]) | int(b[i+directoryEndLen-1])<<8
			if n+directoryEndLen+i <= len(b) {
				return i
			}
		}
	}
	return -1
}

// readBuf is a helper for consuming little-endian fields from the front of a
// byte slice; each call advances the slice past the bytes it read.
type readBuf []byte

func (b *readBuf) uint8() uint8 {
	v := (*b)[0]
	*b = (*b)[1:]
	return v
}

func (b *readBuf) uint16() uint16 {
	v := binary.LittleEndian.Uint16(*b)
	*b = (*b)[2:]
	return v
}

func (b *readBuf) uint32() uint32 {
	v := binary.LittleEndian.Uint32(*b)
	*b = (*b)[4:]
	return v
}

func (b *readBuf) uint64() uint64 {
	v := binary.LittleEndian.Uint64(*b)
	*b = (*b)[8:]
	return v
}

// sub returns the next n bytes as a readBuf and advances b past them.
func (b *readBuf) sub(n int) readBuf {
	b2 := (*b)[:n]
	*b = (*b)[n:]
	return b2
}
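
// Example (illustrative sketch, not part of the original file): readBuf
// consumes little-endian fields from the front of a slice, advancing as it
// goes.
//
//	b := readBuf([]byte{0x50, 0x4b, 0x05, 0x06, 0x01, 0x00})
//	sig := b.uint32() // 0x06054b50, the end-of-central-directory signature
//	n := b.uint16()   // 1; b is now empty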