瀏覽代碼

Write the encoder's emitCopy in asm.

name              old speed      new speed      delta
WordsEncode1e1-8   690MB/s ± 0%   665MB/s ± 0%  -3.64%  (p=0.008 n=5+5)
WordsEncode1e2-8  83.7MB/s ± 1%  83.8MB/s ± 1%    ~     (p=0.421 n=5+5)
WordsEncode1e3-8   230MB/s ± 1%   231MB/s ± 1%    ~     (p=0.421 n=5+5)
WordsEncode1e4-8   233MB/s ± 1%   232MB/s ± 1%    ~     (p=0.151 n=5+5)
WordsEncode1e5-8   212MB/s ± 0%   212MB/s ± 1%    ~     (p=1.000 n=5+5)
WordsEncode1e6-8   255MB/s ± 0%   257MB/s ± 0%  +0.57%  (p=0.008 n=5+5)
RandomEncode-8    13.2GB/s ± 1%  13.2GB/s ± 1%    ~     (p=0.151 n=5+5)
_ZFlat0-8          623MB/s ± 0%   629MB/s ± 0%  +0.93%  (p=0.008 n=5+5)
_ZFlat1-8          319MB/s ± 1%   324MB/s ± 0%  +1.65%  (p=0.008 n=5+5)
_ZFlat2-8         13.9GB/s ± 1%  13.9GB/s ± 1%    ~     (p=0.548 n=5+5)
_ZFlat3-8          176MB/s ± 0%   176MB/s ± 1%    ~     (p=0.690 n=5+5)
_ZFlat4-8         6.05GB/s ± 0%  6.12GB/s ± 0%  +1.20%  (p=0.008 n=5+5)
_ZFlat5-8          603MB/s ± 0%   614MB/s ± 0%  +1.71%  (p=0.008 n=5+5)
_ZFlat6-8          228MB/s ± 0%   230MB/s ± 0%  +0.83%  (p=0.008 n=5+5)
_ZFlat7-8          212MB/s ± 0%   214MB/s ± 0%  +0.74%  (p=0.008 n=5+5)
_ZFlat8-8          242MB/s ± 0%   244MB/s ± 0%  +0.99%  (p=0.008 n=5+5)
_ZFlat9-8          199MB/s ± 1%   200MB/s ± 0%  +0.57%  (p=0.008 n=5+5)
_ZFlat10-8         796MB/s ± 1%   797MB/s ± 0%    ~     (p=1.000 n=5+5)
_ZFlat11-8         348MB/s ± 0%   351MB/s ± 1%    ~     (p=0.056 n=5+5)

I'm not overly worried about the WordsEncode1e1-8 change: the time/op is
around 15 nanoseconds, which is tiny. In comparison, _ZFlat0-8 takes
around 163 microseconds (note µs not ns).
Nigel Tao 9 年之前
父節點
當前提交
d8211ff0ee
共有 4 個文件被更改,包括 128 次插入46 次删除
  1. 0 46
      encode.go
  2. 5 0
      encode_amd64.go
  3. 77 0
      encode_amd64.s
  4. 46 0
      encode_other.go

+ 0 - 46
encode.go

@@ -45,52 +45,6 @@ func emitLiteral(dst, lit []byte) int {
 	return i + copy(dst[i:], lit)
 }
 
-// emitCopy writes a copy chunk and returns the number of bytes written.
-//
-// It assumes that:
-//	dst is long enough to hold the encoded bytes
-//	1 <= offset && offset <= 65535
-//	4 <= length && length <= 65535
-func emitCopy(dst []byte, offset, length int) int {
-	i := 0
-	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
-	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
-	// length emitted down below is is a little lower (at 60 = 64 - 4), because
-	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
-	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
-	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
-	// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
-	// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
-	// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
-	for length >= 68 {
-		// Emit a length 64 copy, encoded as 3 bytes.
-		dst[i+0] = 63<<2 | tagCopy2
-		dst[i+1] = uint8(offset)
-		dst[i+2] = uint8(offset >> 8)
-		i += 3
-		length -= 64
-	}
-	if length > 64 {
-		// Emit a length 60 copy, encoded as 3 bytes.
-		dst[i+0] = 59<<2 | tagCopy2
-		dst[i+1] = uint8(offset)
-		dst[i+2] = uint8(offset >> 8)
-		i += 3
-		length -= 60
-	}
-	if length >= 12 || offset >= 2048 {
-		// Emit the remaining copy, encoded as 3 bytes.
-		dst[i+0] = uint8(length-1)<<2 | tagCopy2
-		dst[i+1] = uint8(offset)
-		dst[i+2] = uint8(offset >> 8)
-		return i + 3
-	}
-	// Emit the remaining copy, encoded as 2 bytes.
-	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
-	dst[i+1] = uint8(offset)
-	return i + 2
-}
-
 // Encode returns the encoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire encoded block.
 // Otherwise, a newly allocated slice will be returned.

+ 5 - 0
encode_amd64.go

@@ -8,6 +8,11 @@
 
 package snappy
 
+// emitCopy is implemented in encode_amd64.s, with the same semantics as in encode_other.go.
+//
+//go:noescape
+func emitCopy(dst []byte, offset, length int) int
+
 // extendMatch has the same semantics as in encode_other.go.
 //
 //go:noescape

+ 77 - 0
encode_amd64.s

@@ -11,6 +11,83 @@
 // The asm code generally follows the pure Go code in encode_other.go, except
 // where marked with a "!!!".
 
+// ----------------------------------------------------------------------------
+
+// func emitCopy(dst []byte, offset, length int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- BX	offset
+//	- CX	length (reduced as chunks are emitted)
+//	- SI	&dst[0] (never advanced, so DI - SI is the byte count returned)
+//	- DI	&dst[i] (write cursor, advanced past each emitted chunk)
+TEXT ·emitCopy(SB), NOSPLIT, $0-48
+	MOVQ dst_base+0(FP), DI
+	MOVQ DI, SI
+	MOVQ offset+24(FP), BX
+	MOVQ length+32(FP), CX
+
+loop0:
+	// for length >= 68 { etc }
+	CMPL CX, $68
+	JLT  step1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVB $0xfe, 0(DI) // corresponds to 63<<2 | tagCopy2 in the Go version
+	MOVW BX, 1(DI) // 16-bit store of offset; the Go version writes the low byte, then the high byte
+	ADDQ $3, DI
+	SUBL $64, CX
+	JMP  loop0
+
+step1:
+	// if length > 64 { etc }
+	CMPL CX, $64
+	JLE  step2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVB $0xee, 0(DI) // corresponds to 59<<2 | tagCopy2 in the Go version
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, CX
+
+step2:
+	// if length >= 12 || offset >= 2048 { goto step3 }
+	CMPL CX, $12
+	JGE  step3
+	CMPL BX, $2048
+	JGE  step3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB BX, 1(DI) // dst[i+1] = uint8(offset); stored first because BX is shifted below
+	SHRL $8, BX // BX = offset >> 8
+	SHLB $5, BX // low byte of BX = uint8(offset>>8) << 5
+	SUBB $4, CX // low byte of CX = uint8(length-4)
+	SHLB $2, CX // low byte of CX = uint8(length-4) << 2
+	ORB  CX, BX
+	ORB  $1, BX // corresponds to | tagCopy1 in the Go version
+	MOVB BX, 0(DI)
+	ADDQ $2, DI
+
+	// Return the number of bytes written, i.e. &dst[i] - &dst[0].
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+step3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBL $1, CX // CX = length - 1
+	SHLB $2, CX // low byte of CX = uint8(length-1) << 2
+	ORB  $2, CX // corresponds to | tagCopy2 in the Go version
+	MOVB CX, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+
+	// Return the number of bytes written, i.e. &dst[i] - &dst[0].
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
 // func extendMatch(src []byte, i, j int) int
 //
 // All local variables fit into registers. The register allocation:

+ 46 - 0
encode_other.go

@@ -6,6 +6,52 @@
 
 package snappy
 
+// emitCopy writes a copy chunk and returns the number of bytes written.
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= offset && offset <= 65535
+//	4 <= length && length <= 65535
+func emitCopy(dst []byte, offset, length int) int {
+	i := 0
+	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
+	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
+	// length emitted down below is a little lower (at 60 = 64 - 4), because
+	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
+	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
+	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
+	// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
+	// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
+	// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
+	for length >= 68 {
+		// Emit a length 64 copy, encoded as 3 bytes (the tag byte stores length-1 = 63).
+		dst[i+0] = 63<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		i += 3
+		length -= 64
+	}
+	if length > 64 {
+		// Emit a length 60 copy, encoded as 3 bytes (the tag byte stores length-1 = 59).
+		dst[i+0] = 59<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		i += 3
+		length -= 60
+	}
+	if length >= 12 || offset >= 2048 {
+		// Emit the remaining copy, encoded as 3 bytes.
+		dst[i+0] = uint8(length-1)<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		return i + 3
+	}
+	// Emit the remaining copy, encoded as 2 bytes (length < 12 and offset < 2048 here).
+	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+	dst[i+1] = uint8(offset)
+	return i + 2
+}
+
 // extendMatch returns the largest k such that k <= len(src) and that
 // src[i:i+k-j] and src[j:k] have the same contents.
 //