snappy.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. // Copyright 2019+ Klaus Post. All rights reserved.
  2. // License information can be found in the LICENSE file.
  3. // Based on work by Yann Collet, released under BSD License.
  4. package zstd
  5. import (
  6. "encoding/binary"
  7. "errors"
  8. "hash/crc32"
  9. "io"
  10. "github.com/klauspost/compress/huff0"
  11. "github.com/klauspost/compress/snappy"
  12. )
  13. const (
  14. snappyTagLiteral = 0x00
  15. snappyTagCopy1 = 0x01
  16. snappyTagCopy2 = 0x02
  17. snappyTagCopy4 = 0x03
  18. )
  19. const (
  20. snappyChecksumSize = 4
  21. snappyMagicBody = "sNaPpY"
  22. // snappyMaxBlockSize is the maximum size of the input to encodeBlock. It is not
  23. // part of the wire format per se, but some parts of the encoder assume
  24. // that an offset fits into a uint16.
  25. //
  26. // Also, for the framing format (Writer type instead of Encode function),
  27. // https://github.com/google/snappy/blob/master/framing_format.txt says
  28. // that "the uncompressed data in a chunk must be no longer than 65536
  29. // bytes".
  30. snappyMaxBlockSize = 65536
  31. // snappyMaxEncodedLenOfMaxBlockSize equals MaxEncodedLen(snappyMaxBlockSize), but is
  32. // hard coded to be a const instead of a variable, so that obufLen can also
  33. // be a const. Their equivalence is confirmed by
  34. // TestMaxEncodedLenOfMaxBlockSize.
  35. snappyMaxEncodedLenOfMaxBlockSize = 76490
  36. )
  37. const (
  38. chunkTypeCompressedData = 0x00
  39. chunkTypeUncompressedData = 0x01
  40. chunkTypePadding = 0xfe
  41. chunkTypeStreamIdentifier = 0xff
  42. )
  43. var (
  44. // ErrSnappyCorrupt reports that the input is invalid.
  45. ErrSnappyCorrupt = errors.New("snappy: corrupt input")
  46. // ErrSnappyTooLarge reports that the uncompressed length is too large.
  47. ErrSnappyTooLarge = errors.New("snappy: decoded block is too large")
  48. // ErrSnappyUnsupported reports that the input isn't supported.
  49. ErrSnappyUnsupported = errors.New("snappy: unsupported input")
  50. errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length")
  51. )
  52. // SnappyConverter can read SnappyConverter-compressed streams and convert them to zstd.
  53. // Conversion is done by converting the stream directly from Snappy without intermediate
  54. // full decoding.
  55. // Therefore the compression ratio is much less than what can be done by a full decompression
  56. // and compression, and a faulty Snappy stream may lead to a faulty Zstandard stream without
  57. // any errors being generated.
  58. // No CRC value is being generated and not all CRC values of the Snappy stream are checked.
  59. // However, it provides really fast recompression of Snappy streams.
  60. // The converter can be reused to avoid allocations, even after errors.
  61. type SnappyConverter struct {
  62. r io.Reader
  63. err error
  64. buf []byte
  65. block *blockEnc
  66. }
  67. // Convert the Snappy stream supplied in 'in' and write the zStandard stream to 'w'.
  68. // If any error is detected on the Snappy stream it is returned.
  69. // The number of bytes written is returned.
  70. func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
  71. initPredefined()
  72. r.err = nil
  73. r.r = in
  74. if r.block == nil {
  75. r.block = &blockEnc{}
  76. r.block.init()
  77. }
  78. r.block.initNewEncode()
  79. if len(r.buf) != snappyMaxEncodedLenOfMaxBlockSize+snappyChecksumSize {
  80. r.buf = make([]byte, snappyMaxEncodedLenOfMaxBlockSize+snappyChecksumSize)
  81. }
  82. r.block.litEnc.Reuse = huff0.ReusePolicyNone
  83. var written int64
  84. var readHeader bool
  85. {
  86. var header []byte
  87. var n int
  88. header, r.err = frameHeader{WindowSize: snappyMaxBlockSize}.appendTo(r.buf[:0])
  89. n, r.err = w.Write(header)
  90. if r.err != nil {
  91. return written, r.err
  92. }
  93. written += int64(n)
  94. }
  95. for {
  96. if !r.readFull(r.buf[:4], true) {
  97. // Add empty last block
  98. r.block.reset(nil)
  99. r.block.last = true
  100. err := r.block.encodeLits(false)
  101. if err != nil {
  102. return written, err
  103. }
  104. n, err := w.Write(r.block.output)
  105. if err != nil {
  106. return written, err
  107. }
  108. written += int64(n)
  109. return written, r.err
  110. }
  111. chunkType := r.buf[0]
  112. if !readHeader {
  113. if chunkType != chunkTypeStreamIdentifier {
  114. println("chunkType != chunkTypeStreamIdentifier", chunkType)
  115. r.err = ErrSnappyCorrupt
  116. return written, r.err
  117. }
  118. readHeader = true
  119. }
  120. chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
  121. if chunkLen > len(r.buf) {
  122. println("chunkLen > len(r.buf)", chunkType)
  123. r.err = ErrSnappyUnsupported
  124. return written, r.err
  125. }
  126. // The chunk types are specified at
  127. // https://github.com/google/snappy/blob/master/framing_format.txt
  128. switch chunkType {
  129. case chunkTypeCompressedData:
  130. // Section 4.2. Compressed data (chunk type 0x00).
  131. if chunkLen < snappyChecksumSize {
  132. println("chunkLen < snappyChecksumSize", chunkLen, snappyChecksumSize)
  133. r.err = ErrSnappyCorrupt
  134. return written, r.err
  135. }
  136. buf := r.buf[:chunkLen]
  137. if !r.readFull(buf, false) {
  138. return written, r.err
  139. }
  140. //checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
  141. buf = buf[snappyChecksumSize:]
  142. n, hdr, err := snappyDecodedLen(buf)
  143. if err != nil {
  144. r.err = err
  145. return written, r.err
  146. }
  147. buf = buf[hdr:]
  148. if n > snappyMaxBlockSize {
  149. println("n > snappyMaxBlockSize", n, snappyMaxBlockSize)
  150. r.err = ErrSnappyCorrupt
  151. return written, r.err
  152. }
  153. r.block.reset(nil)
  154. r.block.pushOffsets()
  155. if err := decodeSnappy(r.block, buf); err != nil {
  156. r.err = err
  157. return written, r.err
  158. }
  159. if r.block.size+r.block.extraLits != n {
  160. printf("invalid size, want %d, got %d\n", n, r.block.size+r.block.extraLits)
  161. r.err = ErrSnappyCorrupt
  162. return written, r.err
  163. }
  164. err = r.block.encode(false)
  165. switch err {
  166. case errIncompressible:
  167. r.block.popOffsets()
  168. r.block.reset(nil)
  169. r.block.literals, err = snappy.Decode(r.block.literals[:n], r.buf[snappyChecksumSize:chunkLen])
  170. if err != nil {
  171. println("snappy.Decode:", err)
  172. return written, err
  173. }
  174. err = r.block.encodeLits(false)
  175. if err != nil {
  176. return written, err
  177. }
  178. case nil:
  179. default:
  180. return written, err
  181. }
  182. n, r.err = w.Write(r.block.output)
  183. if r.err != nil {
  184. return written, err
  185. }
  186. written += int64(n)
  187. continue
  188. case chunkTypeUncompressedData:
  189. if debug {
  190. println("Uncompressed, chunklen", chunkLen)
  191. }
  192. // Section 4.3. Uncompressed data (chunk type 0x01).
  193. if chunkLen < snappyChecksumSize {
  194. println("chunkLen < snappyChecksumSize", chunkLen, snappyChecksumSize)
  195. r.err = ErrSnappyCorrupt
  196. return written, r.err
  197. }
  198. r.block.reset(nil)
  199. buf := r.buf[:snappyChecksumSize]
  200. if !r.readFull(buf, false) {
  201. return written, r.err
  202. }
  203. checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
  204. // Read directly into r.decoded instead of via r.buf.
  205. n := chunkLen - snappyChecksumSize
  206. if n > snappyMaxBlockSize {
  207. println("n > snappyMaxBlockSize", n, snappyMaxBlockSize)
  208. r.err = ErrSnappyCorrupt
  209. return written, r.err
  210. }
  211. r.block.literals = r.block.literals[:n]
  212. if !r.readFull(r.block.literals, false) {
  213. return written, r.err
  214. }
  215. if snappyCRC(r.block.literals) != checksum {
  216. println("literals crc mismatch")
  217. r.err = ErrSnappyCorrupt
  218. return written, r.err
  219. }
  220. err := r.block.encodeLits(false)
  221. if err != nil {
  222. return written, err
  223. }
  224. n, r.err = w.Write(r.block.output)
  225. if r.err != nil {
  226. return written, err
  227. }
  228. written += int64(n)
  229. continue
  230. case chunkTypeStreamIdentifier:
  231. if debug {
  232. println("stream id", chunkLen, len(snappyMagicBody))
  233. }
  234. // Section 4.1. Stream identifier (chunk type 0xff).
  235. if chunkLen != len(snappyMagicBody) {
  236. println("chunkLen != len(snappyMagicBody)", chunkLen, len(snappyMagicBody))
  237. r.err = ErrSnappyCorrupt
  238. return written, r.err
  239. }
  240. if !r.readFull(r.buf[:len(snappyMagicBody)], false) {
  241. return written, r.err
  242. }
  243. for i := 0; i < len(snappyMagicBody); i++ {
  244. if r.buf[i] != snappyMagicBody[i] {
  245. println("r.buf[i] != snappyMagicBody[i]", r.buf[i], snappyMagicBody[i], i)
  246. r.err = ErrSnappyCorrupt
  247. return written, r.err
  248. }
  249. }
  250. continue
  251. }
  252. if chunkType <= 0x7f {
  253. // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
  254. println("chunkType <= 0x7f")
  255. r.err = ErrSnappyUnsupported
  256. return written, r.err
  257. }
  258. // Section 4.4 Padding (chunk type 0xfe).
  259. // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
  260. if !r.readFull(r.buf[:chunkLen], false) {
  261. return written, r.err
  262. }
  263. }
  264. }
  265. // decodeSnappy writes the decoding of src to dst. It assumes that the varint-encoded
  266. // length of the decompressed bytes has already been read.
  267. func decodeSnappy(blk *blockEnc, src []byte) error {
  268. //decodeRef(make([]byte, snappyMaxBlockSize), src)
  269. var s, length int
  270. lits := blk.extraLits
  271. var offset uint32
  272. for s < len(src) {
  273. switch src[s] & 0x03 {
  274. case snappyTagLiteral:
  275. x := uint32(src[s] >> 2)
  276. switch {
  277. case x < 60:
  278. s++
  279. case x == 60:
  280. s += 2
  281. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  282. println("uint(s) > uint(len(src)", s, src)
  283. return ErrSnappyCorrupt
  284. }
  285. x = uint32(src[s-1])
  286. case x == 61:
  287. s += 3
  288. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  289. println("uint(s) > uint(len(src)", s, src)
  290. return ErrSnappyCorrupt
  291. }
  292. x = uint32(src[s-2]) | uint32(src[s-1])<<8
  293. case x == 62:
  294. s += 4
  295. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  296. println("uint(s) > uint(len(src)", s, src)
  297. return ErrSnappyCorrupt
  298. }
  299. x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
  300. case x == 63:
  301. s += 5
  302. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  303. println("uint(s) > uint(len(src)", s, src)
  304. return ErrSnappyCorrupt
  305. }
  306. x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
  307. }
  308. if x > snappyMaxBlockSize {
  309. println("x > snappyMaxBlockSize", x, snappyMaxBlockSize)
  310. return ErrSnappyCorrupt
  311. }
  312. length = int(x) + 1
  313. if length <= 0 {
  314. println("length <= 0 ", length)
  315. return errUnsupportedLiteralLength
  316. }
  317. //if length > snappyMaxBlockSize-d || uint32(length) > len(src)-s {
  318. // return ErrSnappyCorrupt
  319. //}
  320. blk.literals = append(blk.literals, src[s:s+length]...)
  321. //println(length, "litLen")
  322. lits += length
  323. s += length
  324. continue
  325. case snappyTagCopy1:
  326. s += 2
  327. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  328. println("uint(s) > uint(len(src)", s, len(src))
  329. return ErrSnappyCorrupt
  330. }
  331. length = 4 + int(src[s-2])>>2&0x7
  332. offset = uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])
  333. case snappyTagCopy2:
  334. s += 3
  335. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  336. println("uint(s) > uint(len(src)", s, len(src))
  337. return ErrSnappyCorrupt
  338. }
  339. length = 1 + int(src[s-3])>>2
  340. offset = uint32(src[s-2]) | uint32(src[s-1])<<8
  341. case snappyTagCopy4:
  342. s += 5
  343. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  344. println("uint(s) > uint(len(src)", s, len(src))
  345. return ErrSnappyCorrupt
  346. }
  347. length = 1 + int(src[s-5])>>2
  348. offset = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
  349. }
  350. if offset <= 0 || blk.size+lits < int(offset) /*|| length > len(blk)-d */ {
  351. println("offset <= 0 || blk.size+lits < int(offset)", offset, blk.size+lits, int(offset), blk.size, lits)
  352. return ErrSnappyCorrupt
  353. }
  354. // Check if offset is one of the recent offsets.
  355. // Adjusts the output offset accordingly.
  356. // Gives a tiny bit of compression, typically around 1%.
  357. if false {
  358. offset = blk.matchOffset(offset, uint32(lits))
  359. } else {
  360. offset += 3
  361. }
  362. blk.sequences = append(blk.sequences, seq{
  363. litLen: uint32(lits),
  364. offset: offset,
  365. matchLen: uint32(length) - zstdMinMatch,
  366. })
  367. blk.size += length + lits
  368. lits = 0
  369. }
  370. blk.extraLits = lits
  371. return nil
  372. }
  373. func (r *SnappyConverter) readFull(p []byte, allowEOF bool) (ok bool) {
  374. if _, r.err = io.ReadFull(r.r, p); r.err != nil {
  375. if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
  376. r.err = ErrSnappyCorrupt
  377. }
  378. return false
  379. }
  380. return true
  381. }
  382. var crcTable = crc32.MakeTable(crc32.Castagnoli)
  383. // crc implements the checksum specified in section 3 of
  384. // https://github.com/google/snappy/blob/master/framing_format.txt
  385. func snappyCRC(b []byte) uint32 {
  386. c := crc32.Update(0, crcTable, b)
  387. return uint32(c>>15|c<<17) + 0xa282ead8
  388. }
  389. // snappyDecodedLen returns the length of the decoded block and the number of bytes
  390. // that the length header occupied.
  391. func snappyDecodedLen(src []byte) (blockLen, headerLen int, err error) {
  392. v, n := binary.Uvarint(src)
  393. if n <= 0 || v > 0xffffffff {
  394. return 0, 0, ErrSnappyCorrupt
  395. }
  396. const wordSize = 32 << (^uint(0) >> 32 & 1)
  397. if wordSize == 32 && v > 0x7fffffff {
  398. return 0, 0, ErrSnappyTooLarge
  399. }
  400. return int(v), n, nil
  401. }