瀏覽代碼

Use 64K blocks when encoding long inputs.

This enables future optimizations, such as an encoder's hash table entry being
uint16 instead of int32.
Nigel Tao 10 年之前
父節點
當前提交
bf2ded9d81
共有 4 個文件被更改,包括 50 次插入和 50 次刪除
  1. 2 2
      decode.go
  2. 11 19
      encode.go
  3. 15 8
      snappy.go
  4. 22 21
      snappy_test.go

+ 2 - 2
decode.go

@@ -140,8 +140,8 @@ func Decode(dst, src []byte) ([]byte, error) {
 func NewReader(r io.Reader) *Reader {
 func NewReader(r io.Reader) *Reader {
 	return &Reader{
 	return &Reader{
 		r:       r,
 		r:       r,
-		decoded: make([]byte, maxUncompressedChunkLen),
-		buf:     make([]byte, maxEncodedLenOfMaxUncompressedChunkLen+checksumSize),
+		decoded: make([]byte, maxBlockSize),
+		buf:     make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize),
 	}
 	}
 }
 }
 
 

+ 11 - 19
encode.go

@@ -94,30 +94,22 @@ func Encode(dst, src []byte) []byte {
 	for len(src) > 0 {
 	for len(src) > 0 {
 		p := src
 		p := src
 		src = nil
 		src = nil
-		if len(p) > maxInternalEncodeSrcLen {
-			p, src = p[:maxInternalEncodeSrcLen], p[maxInternalEncodeSrcLen:]
+		if len(p) > maxBlockSize {
+			p, src = p[:maxBlockSize], p[maxBlockSize:]
 		}
 		}
-		d += encode(dst[d:], p)
+		d += encodeBlock(dst[d:], p)
 	}
 	}
 	return dst[:d]
 	return dst[:d]
 }
 }
 
 
-// maxInternalEncodeSrcLen must be less than math.MaxInt32, so that in the
-// (internal) encode function, it is safe to have the s variable (which indexes
-// the src slice), and therefore the hash table entries, to have type int32
-// instead of int.
-const maxInternalEncodeSrcLen = 0x40000000
-
-// encode encodes a non-empty src to a guaranteed-large-enough dst. It assumes
-// that the varint-encoded length of the decompressed bytes has already been
-// written.
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
 //
 //
 // It also assumes that:
 // It also assumes that:
 //	len(dst) >= MaxEncodedLen(len(src)) &&
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	0 < len(src) &&
-//	len(src) <= maxInternalEncodeSrcLen &&
-// 	maxInternalEncodeSrcLen < math.MaxInt32.
-func encode(dst, src []byte) (d int) {
+// 	0 < len(src) && len(src) <= maxBlockSize
+func encodeBlock(dst, src []byte) (d int) {
 	// Return early if src is short.
 	// Return early if src is short.
 	if len(src) <= 4 {
 	if len(src) <= 4 {
 		return emitLiteral(dst, src)
 		return emitLiteral(dst, src)
@@ -258,7 +250,7 @@ func NewWriter(w io.Writer) *Writer {
 func NewBufferedWriter(w io.Writer) *Writer {
 func NewBufferedWriter(w io.Writer) *Writer {
 	return &Writer{
 	return &Writer{
 		w:    w,
 		w:    w,
-		ibuf: make([]byte, 0, maxUncompressedChunkLen),
+		ibuf: make([]byte, 0, maxBlockSize),
 		obuf: make([]byte, obufLen),
 		obuf: make([]byte, obufLen),
 	}
 	}
 }
 }
@@ -342,8 +334,8 @@ func (w *Writer) write(p []byte) (nRet int, errRet error) {
 		}
 		}
 
 
 		var uncompressed []byte
 		var uncompressed []byte
-		if len(p) > maxUncompressedChunkLen {
-			uncompressed, p = p[:maxUncompressedChunkLen], p[maxUncompressedChunkLen:]
+		if len(p) > maxBlockSize {
+			uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
 		} else {
 		} else {
 			uncompressed, p = p, nil
 			uncompressed, p = p, nil
 		}
 		}

+ 15 - 8
snappy.go

@@ -46,18 +46,25 @@ const (
 	chunkHeaderSize = 4
 	chunkHeaderSize = 4
 	magicChunk      = "\xff\x06\x00\x00" + magicBody
 	magicChunk      = "\xff\x06\x00\x00" + magicBody
 	magicBody       = "sNaPpY"
 	magicBody       = "sNaPpY"
+
+	// maxBlockSize is the maximum size of the input to encodeBlock. It is not
+	// part of the wire format per se, but some parts of the encoder assume
+	// that an offset fits into a uint16.
+	//
+	// Also, for the framing format (Writer type instead of Encode function),
 	// https://github.com/google/snappy/blob/master/framing_format.txt says
 	// https://github.com/google/snappy/blob/master/framing_format.txt says
-	// that "the uncompressed data in a chunk must be no longer than 65536 bytes".
-	maxUncompressedChunkLen = 65536
+	// that "the uncompressed data in a chunk must be no longer than 65536
+	// bytes".
+	maxBlockSize = 65536
 
 
-	// maxEncodedLenOfMaxUncompressedChunkLen equals
-	// MaxEncodedLen(maxUncompressedChunkLen), but is hard coded to be a const
-	// instead of a variable, so that obufLen can also be a const. Their
-	// equivalence is confirmed by TestMaxEncodedLenOfMaxUncompressedChunkLen.
-	maxEncodedLenOfMaxUncompressedChunkLen = 76490
+	// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
+	// hard coded to be a const instead of a variable, so that obufLen can also
+	// be a const. Their equivalence is confirmed by
+	// TestMaxEncodedLenOfMaxBlockSize.
+	maxEncodedLenOfMaxBlockSize = 76490
 
 
 	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
 	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
-	obufLen       = obufHeaderLen + maxEncodedLenOfMaxUncompressedChunkLen
+	obufLen       = obufHeaderLen + maxEncodedLenOfMaxBlockSize
 )
 )
 
 
 const (
 const (

+ 22 - 21
snappy_test.go

@@ -23,9 +23,9 @@ var (
 	testdata = flag.String("testdata", "testdata", "Directory containing the test data")
 	testdata = flag.String("testdata", "testdata", "Directory containing the test data")
 )
 )
 
 
-func TestMaxEncodedLenOfMaxUncompressedChunkLen(t *testing.T) {
-	got := maxEncodedLenOfMaxUncompressedChunkLen
-	want := MaxEncodedLen(maxUncompressedChunkLen)
+func TestMaxEncodedLenOfMaxBlockSize(t *testing.T) {
+	got := maxEncodedLenOfMaxBlockSize
+	want := MaxEncodedLen(maxBlockSize)
 	if got != want {
 	if got != want {
 		t.Fatalf("got %d, want %d", got, want)
 		t.Fatalf("got %d, want %d", got, want)
 	}
 	}
@@ -237,23 +237,24 @@ func TestDecode(t *testing.T) {
 	}
 	}
 }
 }
 
 
-// TestEncodeNoiseThenRepeats encodes a 32K block for which the first half is
-// very incompressible and the second half is very compressible. The encoded
-// form's length should be closer to 50% of the original length than 100%.
+// TestEncodeNoiseThenRepeats encodes input for which the first half is very
+// incompressible and the second half is very compressible. The encoded form's
+// length should be closer to 50% of the original length than 100%.
 func TestEncodeNoiseThenRepeats(t *testing.T) {
 func TestEncodeNoiseThenRepeats(t *testing.T) {
-	const origLen = 32768
-	src := make([]byte, origLen)
-	rng := rand.New(rand.NewSource(1))
-	firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
-	for i := range firstHalf {
-		firstHalf[i] = uint8(rng.Intn(256))
-	}
-	for i := range secondHalf {
-		secondHalf[i] = uint8(i >> 8)
-	}
-	dst := Encode(nil, src)
-	if got, want := len(dst), origLen*3/4; got >= want {
-		t.Fatalf("got %d encoded bytes, want less than %d", got, want)
+	for _, origLen := range []int{32 * 1024, 256 * 1024, 2048 * 1024} {
+		src := make([]byte, origLen)
+		rng := rand.New(rand.NewSource(1))
+		firstHalf, secondHalf := src[:origLen/2], src[origLen/2:]
+		for i := range firstHalf {
+			firstHalf[i] = uint8(rng.Intn(256))
+		}
+		for i := range secondHalf {
+			secondHalf[i] = uint8(i >> 8)
+		}
+		dst := Encode(nil, src)
+		if got, want := len(dst), origLen*3/4; got >= want {
+			t.Errorf("origLen=%d: got %d encoded bytes, want less than %d", origLen, got, want)
+		}
 	}
 	}
 }
 }
 
 
@@ -272,7 +273,7 @@ func cmp(a, b []byte) error {
 func TestFramingFormat(t *testing.T) {
 func TestFramingFormat(t *testing.T) {
 	// src is comprised of alternating 1e5-sized sequences of random
 	// src is comprised of alternating 1e5-sized sequences of random
 	// (incompressible) bytes and repeated (compressible) bytes. 1e5 was chosen
 	// (incompressible) bytes and repeated (compressible) bytes. 1e5 was chosen
-	// because it is larger than maxUncompressedChunkLen (64k).
+	// because it is larger than maxBlockSize (64k).
 	src := make([]byte, 1e6)
 	src := make([]byte, 1e6)
 	rng := rand.New(rand.NewSource(1))
 	rng := rand.New(rand.NewSource(1))
 	for i := 0; i < 10; i++ {
 	for i := 0; i < 10; i++ {
@@ -330,7 +331,7 @@ func TestNewBufferedWriter(t *testing.T) {
 	// Test all 32 possible sub-sequences of these 5 input slices.
 	// Test all 32 possible sub-sequences of these 5 input slices.
 	//
 	//
 	// Their lengths sum to 400,000, which is over 6 times the Writer ibuf
 	// Their lengths sum to 400,000, which is over 6 times the Writer ibuf
-	// capacity: 6 * maxUncompressedChunkLen is 393,216.
+	// capacity: 6 * maxBlockSize is 393,216.
 	inputs := [][]byte{
 	inputs := [][]byte{
 		bytes.Repeat([]byte{'a'}, 40000),
 		bytes.Repeat([]byte{'a'}, 40000),
 		bytes.Repeat([]byte{'b'}, 150000),
 		bytes.Repeat([]byte{'b'}, 150000),