Use 64K blocks when encoding long inputs.

This enables future optimizations, such as an encoder's hash table entry being
uint16 instead of int32.
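
As a rough illustration of that optimization (a hypothetical sketch, not code from this commit): once encodeBlock only ever sees at most 64K of input, every offset it stores is at most 65535 and fits in a uint16.

    const maxBlockSize = 65536

    // table holds candidate match positions within the current block.
    // Since encodeBlock's src is capped at maxBlockSize bytes, every
    // stored position is at most 65535, so uint16 entries suffice;
    // int32 entries would use twice the memory for the same information.
    var table [1 << 14]uint16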
Nigel Tao, 10 years ago
parent commit bf2ded9d81
4 changed files with 50 additions and 50 deletions
  1. decode.go (+2, -2)
  2. encode.go (+11, -19)
  3. snappy.go (+15, -8)
  4. snappy_test.go (+22, -21)

+ 2 - 2
decode.go

@@ -140,8 +140,8 @@ func Decode(dst, src []byte) ([]byte, error) {
 func NewReader(r io.Reader) *Reader {
 	return &Reader{
 		r:       r,
-		decoded: make([]byte, maxUncompressedChunkLen),
-		buf:     make([]byte, maxEncodedLenOfMaxUncompressedChunkLen+checksumSize),
+		decoded: make([]byte, maxBlockSize),
+		buf:     make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize),
 	}
 }
 

+ 11 - 19
encode.go

@@ -94,30 +94,22 @@ func Encode(dst, src []byte) []byte {
 	for len(src) > 0 {
 		p := src
 		src = nil
-		if len(p) > maxInternalEncodeSrcLen {
-			p, src = p[:maxInternalEncodeSrcLen], p[maxInternalEncodeSrcLen:]
+		if len(p) > maxBlockSize {
+			p, src = p[:maxBlockSize], p[maxBlockSize:]
 		}
-		d += encode(dst[d:], p)
+		d += encodeBlock(dst[d:], p)
 	}
 	return dst[:d]
 }
 
-// maxInternalEncodeSrcLen must be less than math.MaxInt32, so that in the
-// (internal) encode function, it is safe to have the s variable (which indexes
-// the src slice), and therefore the hash table entries, to have type int32
-// instead of int.
-const maxInternalEncodeSrcLen = 0x40000000
-
-// encode encodes a non-empty src to a guaranteed-large-enough dst. It assumes
-// that the varint-encoded length of the decompressed bytes has already been
-// written.
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
 //
 // It also assumes that:
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	0 < len(src) &&
-//	len(src) <= maxInternalEncodeSrcLen &&
-// 	maxInternalEncodeSrcLen < math.MaxInt32.
-func encode(dst, src []byte) (d int) {
+// 	0 < len(src) && len(src) <= maxBlockSize
+func encodeBlock(dst, src []byte) (d int) {
 	// Return early if src is short.
 	if len(src) <= 4 {
 		return emitLiteral(dst, src)
@@ -258,7 +250,7 @@ func NewWriter(w io.Writer) *Writer {
 func NewBufferedWriter(w io.Writer) *Writer {
 	return &Writer{
 		w:    w,
-		ibuf: make([]byte, 0, maxUncompressedChunkLen),
+		ibuf: make([]byte, 0, maxBlockSize),
 		obuf: make([]byte, obufLen),
 	}
 }
@@ -342,8 +334,8 @@ func (w *Writer) write(p []byte) (nRet int, errRet error) {
 		}
 
 		var uncompressed []byte
-		if len(p) > maxUncompressedChunkLen {
-			uncompressed, p = p[:maxUncompressedChunkLen], p[maxUncompressedChunkLen:]
+		if len(p) > maxBlockSize {
+			uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
 		} else {
 			uncompressed, p = p, nil
 		}

+ 15 - 8
snappy.go

@@ -46,18 +46,25 @@ const (
 	chunkHeaderSize = 4
 	magicChunk      = "\xff\x06\x00\x00" + magicBody
 	magicBody       = "sNaPpY"
+
+	// maxBlockSize is the maximum size of the input to encodeBlock. It is not
+	// part of the wire format per se, but some parts of the encoder assume
+	// that an offset fits into a uint16.
+	//
+	// Also, for the framing format (Writer type instead of Encode function),
 	// https://github.com/google/snappy/blob/master/framing_format.txt says
-	// that "the uncompressed data in a chunk must be no longer than 65536 bytes".
-	maxUncompressedChunkLen = 65536
+	// that "the uncompressed data in a chunk must be no longer than 65536
+	// bytes".
+	maxBlockSize = 65536
 
-	// maxEncodedLenOfMaxUncompressedChunkLen equals
-	// MaxEncodedLen(maxUncompressedChunkLen), but is hard coded to be a const
-	// instead of a variable, so that obufLen can also be a const. Their
-	// equivalence is confirmed by TestMaxEncodedLenOfMaxUncompressedChunkLen.
-	maxEncodedLenOfMaxUncompressedChunkLen = 76490
+	// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
+	// hard coded to be a const instead of a variable, so that obufLen can also
+	// be a const. Their equivalence is confirmed by
+	// TestMaxEncodedLenOfMaxBlockSize.
+	maxEncodedLenOfMaxBlockSize = 76490
 
 	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
-	obufLen       = obufHeaderLen + maxEncodedLenOfMaxUncompressedChunkLen
+	obufLen       = obufHeaderLen + maxEncodedLenOfMaxBlockSize
 )
 
 const (
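
A quick arithmetic check of the hard-coded 76490 (this is what TestMaxEncodedLenOfMaxBlockSize confirms at runtime; the sketch below assumes MaxEncodedLen uses Snappy's documented worst-case bound of 32 + n + n/6):

    // maxEncodedLenSketch mirrors Snappy's worst-case expansion bound.
    func maxEncodedLenSketch(n int) int {
    	return 32 + n + n/6
    }

    // maxEncodedLenSketch(65536) == 32 + 65536 + 10922 == 76490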

+ 22 - 21
snappy_test.go

@@ -23,9 +23,9 @@ var (
 	testdata = flag.String("testdata", "testdata", "Directory containing the test data")
 )
 
-func TestMaxEncodedLenOfMaxUncompressedChunkLen(t *testing.T) {
-	got := maxEncodedLenOfMaxUncompressedChunkLen
-	want := MaxEncodedLen(maxUncompressedChunkLen)
+func TestMaxEncodedLenOfMaxBlockSize(t *testing.T) {
+	got := maxEncodedLenOfMaxBlockSize
+	want := MaxEncodedLen(maxBlockSize)
 	if got != want {
 		t.Fatalf("got %d, want %d", got, want)
 	}
@@ -237,23 +237,24 @@ func TestDecode(t *testing.T) {
 	}
 }
 
-// TestEncodeNoiseThenRepeats encodes a 32K block for which the first half is
-// very incompressible and the second half is very compressible. The encoded
-// form's length should be closer to 50% of the original length than 100%.
+// TestEncodeNoiseThenRepeats encodes input for which the first half is very
+// incompressible and the second half is very compressible. The encoded form's
+// length should be closer to 50% of the original length than 100%.
 func TestEncodeNoiseThenRepeats(t *testing.T) {
-	const origLen = 32768
-	src := make([]byte, origLen)
-	rng := rand.New(rand.NewSource(1))
-	firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
-	for i := range firstHalf {
-		firstHalf[i] = uint8(rng.Intn(256))
-	}
-	for i := range secondHalf {
-		secondHalf[i] = uint8(i >> 8)
-	}
-	dst := Encode(nil, src)
-	if got, want := len(dst), origLen*3/4; got >= want {
-		t.Fatalf("got %d encoded bytes, want less than %d", got, want)
+	for _, origLen := range []int{32 * 1024, 256 * 1024, 2048 * 1024} {
+		src := make([]byte, origLen)
+		rng := rand.New(rand.NewSource(1))
+		firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
+		for i := range firstHalf {
+			firstHalf[i] = uint8(rng.Intn(256))
+		}
+		for i := range secondHalf {
+			secondHalf[i] = uint8(i >> 8)
+		}
+		dst := Encode(nil, src)
+		if got, want := len(dst), origLen*3/4; got >= want {
+			t.Errorf("origLen=%d: got %d encoded bytes, want less than %d", origLen, got, want)
+		}
 	}
 }
 
@@ -272,7 +273,7 @@ func cmp(a, b []byte) error {
 func TestFramingFormat(t *testing.T) {
 	// src is comprised of alternating 1e5-sized sequences of random
 	// (incompressible) bytes and repeated (compressible) bytes. 1e5 was chosen
-	// because it is larger than maxUncompressedChunkLen (64k).
+	// because it is larger than maxBlockSize (64k).
 	src := make([]byte, 1e6)
 	rng := rand.New(rand.NewSource(1))
 	for i := 0; i < 10; i++ {
@@ -330,7 +331,7 @@ func TestNewBufferedWriter(t *testing.T) {
 	// Test all 32 possible sub-sequences of these 5 input slices.
 	//
 	// Their lengths sum to 400,000, which is over 6 times the Writer ibuf
-	// capacity: 6 * maxUncompressedChunkLen is 393,216.
+	// capacity: 6 * maxBlockSize is 393,216.
 	inputs := [][]byte{
 		bytes.Repeat([]byte{'a'}, 40000),
 		bytes.Repeat([]byte{'b'}, 150000),