
Merge pull request #90 from greatroar/arm-asm-xxh32

xxh32: ARM assembler implementation
Pierre Curto · 5 years ago · parent commit 4e086a0fe3

+ 30 - 27
internal/xxh32/xxh32zero.go

@@ -20,10 +20,7 @@ const (
 
 // XXHZero represents an xxhash32 object with seed 0.
 type XXHZero struct {
-	v1       uint32
-	v2       uint32
-	v3       uint32
-	v4       uint32
+	v        [4]uint32
 	totalLen uint64
 	buf      [16]byte
 	bufused  int
@@ -38,10 +35,10 @@ func (xxh XXHZero) Sum(b []byte) []byte {
 
 // Reset resets the Hash to its initial state.
 func (xxh *XXHZero) Reset() {
-	xxh.v1 = prime1plus2
-	xxh.v2 = prime2
-	xxh.v3 = 0
-	xxh.v4 = prime1minus
+	xxh.v[0] = prime1plus2
+	xxh.v[1] = prime2
+	xxh.v[2] = 0
+	xxh.v[3] = prime1minus
 	xxh.totalLen = 0
 	xxh.bufused = 0
 }
@@ -74,42 +71,48 @@ func (xxh *XXHZero) Write(input []byte) (int, error) {
 		return n, nil
 	}
 
-	p := 0
-	// Causes compiler to work directly from registers instead of stack:
-	v1, v2, v3, v4 := xxh.v1, xxh.v2, xxh.v3, xxh.v4
-	if m > 0 {
+	var buf *[16]byte
+	if m != 0 {
 		// some data left from previous update
-		copy(xxh.buf[m:], input)
+		buf = &xxh.buf
+		c := copy(buf[m:], input)
+		n -= c
+		input = input[c:]
+	}
+	update(&xxh.v, buf, input)
+	xxh.bufused = copy(xxh.buf[:], input[n-n%16:])
+
+	return n, nil
+}
+
+// Portable version of update. This updates v by processing all of buf
+// (if not nil) and all full 16-byte blocks of input.
+func updateGo(v *[4]uint32, buf *[16]byte, input []byte) {
+	// Causes the compiler to work directly from registers instead of the stack:
+	v1, v2, v3, v4 := v[0], v[1], v[2], v[3]
 
-		// fast rotl(13)
-		buf := xxh.buf[:16] // BCE hint.
+	if buf != nil {
 		v1 = rol13(v1+binary.LittleEndian.Uint32(buf[:])*prime2) * prime1
 		v2 = rol13(v2+binary.LittleEndian.Uint32(buf[4:])*prime2) * prime1
 		v3 = rol13(v3+binary.LittleEndian.Uint32(buf[8:])*prime2) * prime1
 		v4 = rol13(v4+binary.LittleEndian.Uint32(buf[12:])*prime2) * prime1
-		p = r
 	}
 
-	for n := n - 16; p <= n; p += 16 {
-		sub := input[p:][:16] //BCE hint for compiler
+	for ; len(input) >= 16; input = input[16:] {
+		sub := input[:16] //BCE hint for compiler
 		v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime2) * prime1
 		v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime2) * prime1
 		v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime2) * prime1
 		v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime2) * prime1
 	}
-	xxh.v1, xxh.v2, xxh.v3, xxh.v4 = v1, v2, v3, v4
-
-	copy(xxh.buf[:], input[p:])
-	xxh.bufused = len(input) - p
-
-	return n, nil
+	v[0], v[1], v[2], v[3] = v1, v2, v3, v4
 }
 
 // Sum32 returns the 32-bit hash value.
 func (xxh *XXHZero) Sum32() uint32 {
 	h32 := uint32(xxh.totalLen)
 	if h32 >= 16 {
-		h32 += rol1(xxh.v1) + rol7(xxh.v2) + rol12(xxh.v3) + rol18(xxh.v4)
+		h32 += rol1(xxh.v[0]) + rol7(xxh.v[1]) + rol12(xxh.v[2]) + rol18(xxh.v[3])
 	} else {
 		h32 += prime5
 	}
@@ -135,8 +138,8 @@ func (xxh *XXHZero) Sum32() uint32 {
 	return h32
 }
 
-// ChecksumZero returns the 32bits Hash value.
-func ChecksumZero(input []byte) uint32 {
+// Portable version of ChecksumZero.
+func checksumZeroGo(input []byte) uint32 {
 	n := len(input)
 	h32 := uint32(n)
 

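The refactor above routes both the carried-over block (buf) and the bulk of the input through a single update(v, buf, input) call, so the assembly side only has to replace one hot path while the tail buffering stays in Go. Each lane update in updateGo is the standard xxh32 round. A minimal sketch in Go, reusing the file's own rol13 helper and prime constants (round is a name used here for illustration; the real code writes the expression inline per lane):

	// One xxh32 round: fold a 4-byte little-endian lane into an accumulator.
	func round(acc, lane uint32) uint32 {
		acc += lane * prime2
		acc = rol13(acc) // rotate left by 13 bits
		return acc * prime1
	}

Because the four accumulators are independent, the rounds for v1..v4 carry no cross-lane data dependencies, which is what lets the assembly version keep all four in registers and overlap the multiplies.
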
+ 11 - 0
internal/xxh32/xxh32zero_arm.go

@@ -0,0 +1,11 @@
+// +build !noasm
+
+package xxh32
+
+// ChecksumZero returns the 32-bit hash of input.
+//
+//go:noescape
+func ChecksumZero(input []byte) uint32
+
+//go:noescape
+func update(v *[4]uint32, buf *[16]byte, input []byte)

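The //go:noescape directives promise that the assembly neither retains nor leaks the pointers it is handed, so callers' buffers can stay on the stack. Since update must behave exactly like updateGo, a differential test is the natural safety net. A hypothetical sketch, not part of this PR (TestUpdateMatchesGo is an invented name); it would live in package xxh32 itself so it can reach the unexported functions, and on platforms without assembly it passes trivially because update just calls updateGo:

	package xxh32

	import (
		"math/rand"
		"testing"
	)

	// Differential check: the assembly update and the portable updateGo
	// must produce identical accumulator state for arbitrary inputs.
	func TestUpdateMatchesGo(t *testing.T) {
		rng := rand.New(rand.NewSource(1))
		for i := 0; i < 1000; i++ {
			input := make([]byte, rng.Intn(256))
			rng.Read(input)
			var buf [16]byte
			rng.Read(buf[:])

			bufPtr := &buf
			if i%2 == 1 {
				bufPtr = nil // exercise the no-buffered-block path too
			}

			vAsm := [4]uint32{prime1plus2, prime2, 0, prime1minus}
			vGo := vAsm
			update(&vAsm, bufPtr, input)
			updateGo(&vGo, bufPtr, input)
			if vAsm != vGo {
				t.Fatalf("len=%d: asm %v != go %v", len(input), vAsm, vGo)
			}
		}
	}
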
+ 259 - 0
internal/xxh32/xxh32zero_arm.s

@@ -0,0 +1,259 @@
+// +build !noasm
+
+#include "textflag.h"
+
+#define prime1		$2654435761
+#define prime2		$2246822519
+#define prime3		$3266489917
+#define prime4		$668265263
+#define prime5		$374761393
+
+#define prime1plus2	$606290984
+#define prime1minus	$1640531535
+
+// Register allocation.
+#define p	R0
+#define n	R1
+#define h	R2
+#define v1	R2	// Alias for h.
+#define v2	R3
+#define v3	R4
+#define v4	R5
+#define x1	R6
+#define x2	R7
+#define x3	R8
+#define x4	R9
+
+// We need the primes in registers. The 16-byte loop only uses prime{1,2}.
+#define prime1r	R11
+#define prime2r	R12
+#define prime3r	R3	// The rest can alias v{2-4}.
+#define prime4r	R4
+#define prime5r	R5
+
+// Update round macros. These read from and increment p.
+
+#define round16aligned			\
+	MOVM.IA.W (p), [x1, x2, x3, x4]	\
+					\
+	MULA x1, prime2r, v1, v1	\
+	MULA x2, prime2r, v2, v2	\
+	MULA x3, prime2r, v3, v3	\
+	MULA x4, prime2r, v4, v4	\
+					\
+	MOVW v1 @> 19, v1		\
+	MOVW v2 @> 19, v2		\
+	MOVW v3 @> 19, v3		\
+	MOVW v4 @> 19, v4		\
+					\
+	MUL prime1r, v1			\
+	MUL prime1r, v2			\
+	MUL prime1r, v3			\
+	MUL prime1r, v4			\
+
+#define round16unaligned 		\
+	MOVBU.P  16(p), x1		\
+	MOVBU   -15(p), x2		\
+	ORR     x2 <<  8, x1		\
+	MOVBU   -14(p), x3		\
+	MOVBU   -13(p), x4		\
+	ORR     x4 <<  8, x3		\
+	ORR     x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v1, v1	\
+	MOVW v1 @> 19, v1		\
+	MUL prime1r, v1			\
+					\
+	MOVBU -12(p), x1		\
+	MOVBU -11(p), x2		\
+	ORR   x2 <<  8, x1		\
+	MOVBU -10(p), x3		\
+	MOVBU  -9(p), x4		\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v2, v2	\
+	MOVW v2 @> 19, v2		\
+	MUL prime1r, v2			\
+					\
+	MOVBU -8(p), x1			\
+	MOVBU -7(p), x2			\
+	ORR   x2 <<  8, x1		\
+	MOVBU -6(p), x3			\
+	MOVBU -5(p), x4			\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v3, v3	\
+	MOVW v3 @> 19, v3		\
+	MUL prime1r, v3			\
+					\
+	MOVBU -4(p), x1			\
+	MOVBU -3(p), x2			\
+	ORR   x2 <<  8, x1		\
+	MOVBU -2(p), x3			\
+	MOVBU -1(p), x4			\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v4, v4	\
+	MOVW v4 @> 19, v4		\
+	MUL prime1r, v4			\
+
+
+// func ChecksumZero(input []byte) uint32
+TEXT ·ChecksumZero(SB), NOFRAME|NOSPLIT, $-4-16
+	MOVW input_base+0(FP), p
+	MOVW input_len+4(FP),  n
+
+	MOVW prime1, prime1r
+	MOVW prime2, prime2r
+
+	// Set up h for n < 16. It's tempting to say {ADD prime5, n, h}
+	// here, but that's a pseudo-op that generates a load through R11.
+	MOVW prime5, prime5r
+	ADD  prime5r, n, h
+	CMP  $0, n
+	BEQ  end
+
+	// We let n go negative so we can do comparisons with SUB.S
+	// instead of separate CMP.
+	SUB.S $16, n
+	BMI   loop16done
+
+	MOVW prime1plus2, v1
+	MOVW prime2,      v2
+	MOVW $0,          v3
+	MOVW prime1minus, v4
+
+	TST $3, p
+	BNE loop16unaligned
+
+loop16aligned:
+	SUB.S $16, n
+	round16aligned
+	BPL loop16aligned
+	B   loop16finish
+
+loop16unaligned:
+	SUB.S $16, n
+	round16unaligned
+	BPL loop16unaligned
+
+loop16finish:
+	MOVW v1 @> 31, h
+	ADD  v2 @> 25, h
+	ADD  v3 @> 20, h
+	ADD  v4 @> 14, h
+
+	// h += len(input) with v2 as temporary.
+	MOVW input_len+4(FP), v2
+	ADD  v2, h
+
+loop16done:
+	ADD $16, n	// Restore number of bytes left.
+
+	SUB.S $4, n
+	MOVW  prime3, prime3r
+	BMI   loop4done
+	MOVW  prime4, prime4r
+
+	TST $3, p
+	BNE loop4unaligned
+
+loop4aligned:
+	SUB.S $4, n
+
+	MOVW.P 4(p), x1
+	MULA   prime3r, x1, h, h
+	MOVW   h @> 15, h
+	MUL    prime4r, h
+
+	BPL loop4aligned
+	B   loop4done
+
+loop4unaligned:
+	SUB.S $4, n
+
+	MOVBU.P  4(p), x1
+	MOVBU   -3(p), x2
+	ORR     x2 <<  8, x1
+	MOVBU   -2(p), x3
+	ORR     x3 << 16, x1
+	MOVBU   -1(p), x4
+	ORR     x4 << 24, x1
+
+	MULA prime3r, x1, h, h
+	MOVW h @> 15, h
+	MUL  prime4r, h
+
+	BPL loop4unaligned
+
+loop4done:
+	ADD.S $4, n	// Restore number of bytes left.
+	BEQ   end
+
+	MOVW prime5, prime5r
+
+loop1:
+	SUB.S $1, n
+
+	MOVBU.P 1(p), x1
+	MULA    prime5r, x1, h, h
+	MOVW    h @> 21, h
+	MUL     prime1r, h
+
+	BNE loop1
+
+end:
+	MOVW prime3, prime3r
+	EOR  h >> 15, h
+	MUL  prime2r, h
+	EOR  h >> 13, h
+	MUL  prime3r, h
+	EOR  h >> 16, h
+
+	MOVW h, ret+12(FP)
+	RET
+
+
+// func update(v *[4]uint32, buf *[16]byte, input []byte)
+TEXT ·update(SB), NOFRAME|NOSPLIT, $-4-20
+	MOVW    v_arg+0(FP), p
+	MOVM.IA (p), [v1, v2, v3, v4]
+
+	MOVW prime1, prime1r
+	MOVW prime2, prime2r
+
+	// Process buf, if not nil.
+	MOVW buf_arg+4(FP), p
+	CMP  $0, p
+	BEQ  noBuffered
+
+	round16aligned
+
+noBuffered:
+	MOVW input_ptr+ 8(FP), p
+	MOVW input_len+12(FP), n
+
+	SUB.S $16, n
+	BMI   end
+
+	TST $3, p
+	BNE loop16unaligned
+
+loop16aligned:
+	SUB.S $16, n
+	round16aligned
+	BPL loop16aligned
+	B   end
+
+loop16unaligned:
+	SUB.S $16, n
+	round16unaligned
+	BPL loop16unaligned
+
+end:
+	MOVW    v_arg+0(FP), p
+	MOVM.IA [v1, v2, v3, v4], (p)
+	RET

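A note on the rotations, for readers who don't write ARM assembly: the Go assembler's @> operator is rotate-right, and ARM has no rotate-left instruction, so every rol in the Go code appears here as a rotate right by its 32-bit complement. rol13 becomes @> 19, the Sum32 merge rotations rol1/rol7/rol12/rol18 become @> 31/25/20/14, and the rol17 and rol11 of the 4-byte and 1-byte loops become @> 15 and @> 21. In Go terms:

	import "math/bits"

	// What MOVW v @> 19, v computes: for 32-bit words, a rotate right
	// by 19 is the same operation as a rotate left by 13.
	func rol13(u uint32) uint32 {
		return bits.RotateLeft32(u, 13) // == u<<13 | u>>19
	}

The SUB.S/BMI and SUB.S/BPL pairs serve a similar economy: decrementing the byte count sets the condition flags, so each loop tests "bytes left" without a separate CMP, as the comment in ChecksumZero notes.
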
+ 10 - 0
internal/xxh32/xxh32zero_other.go

@@ -0,0 +1,10 @@
+// +build !arm noasm
+
+package xxh32
+
+// ChecksumZero returns the 32-bit hash of input.
+func ChecksumZero(input []byte) uint32 { return checksumZeroGo(input) }
+
+func update(v *[4]uint32, buf *[16]byte, input []byte) {
+	updateGo(v, buf, input)
+}

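The build constraints on the new files are complements: the _arm files build only when targeting ARM without the noasm tag, and this fallback builds everywhere else, so exactly one definition of ChecksumZero and update is compiled into any build. The portable path can still be exercised on ARM with go test -tags noasm. In the //go:build syntax Go 1.17 later introduced (this PR predates it), the two constraints would read:

	// xxh32zero_arm.go and xxh32zero_arm.s (the filename supplies the arm constraint):
	//go:build !noasm

	// xxh32zero_other.go:
	//go:build !arm || noasm
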
+ 56 - 3
internal/xxh32/xxh32zero_test.go

@@ -55,7 +55,7 @@ func TestZeroData(t *testing.T) {
 			t.Fatalf("got %x; want %x", got, want)
 		}
 		if got, want := xxh32.ChecksumZero(data), td.sum; got != want {
-			t.Fatalf("got %x; want %x", got, want)
+			t.Errorf("got %x; want %x", got, want)
 		}
 	}
 }
@@ -92,7 +92,7 @@ func TestZeroChecksum(t *testing.T) {
 		data := []byte(td.data)
 		h := xxh32.ChecksumZero(data)
 		if got, want := h, td.sum; got != want {
-			t.Fatalf("got %x; want %x", got, want)
+			t.Errorf("got %x; want %x", got, want)
 		}
 	}
 }
@@ -103,12 +103,49 @@ func TestZeroReset(t *testing.T) {
 		_, _ = xxh.Write([]byte(td.data))
 		h := xxh.Sum32()
 		if got, want := h, td.sum; got != want {
-			t.Fatalf("got %x; want %x", got, want)
+			t.Errorf("got %x; want %x", got, want)
 		}
 		xxh.Reset()
 	}
 }
 
+func TestNil(t *testing.T) {
+	want := xxh32.ChecksumZero([]byte(""))
+
+	var xxh xxh32.XXHZero
+	xxh.Write(nil)
+	got := xxh.Sum32()
+	if got != want {
+		t.Errorf("got %x; want %x", got, want)
+	}
+
+	got = xxh32.ChecksumZero(nil)
+	if got != want {
+		t.Errorf("got %x; want %x", got, want)
+	}
+}
+
+func TestUnaligned(t *testing.T) {
+	zeros := make([]byte, 100)
+	ha := xxh32.ChecksumZero(zeros[:len(zeros)-1])
+	hu := xxh32.ChecksumZero(zeros[1:])
+	if ha != hu {
+		t.Errorf("mismatch: %x != %x", ha, hu)
+	}
+
+	var xxh xxh32.XXHZero
+	xxh.Write(zeros[:len(zeros)-1])
+	ha = xxh.Sum32()
+
+	xxh.Reset()
+	xxh.Write(zeros[1:])
+	hu = xxh.Sum32()
+
+	if ha != hu {
+		t.Errorf("mismatch: %x != %x", ha, hu)
+	}
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Benchmarks
 //
@@ -129,6 +166,22 @@ func Benchmark_XXH32_Checksum(b *testing.B) {
 	}
 }
 
+// The following two benchmarks measure the case where 3 out of 4 calls are not 4-byte-aligned.
+func Benchmark_XXH32Unaligned(b *testing.B) {
+	var h xxh32.XXHZero
+	for n := 0; n < b.N; n++ {
+		_, _ = h.Write(testdata1[n%4:])
+		h.Sum32()
+		h.Reset()
+	}
+}
+
+func Benchmark_XXH32_ChecksumUnaligned(b *testing.B) {
+	for n := 0; n < b.N; n++ {
+		xxh32.ChecksumZero(testdata1[n%4:])
+	}
+}
+
 func Benchmark_CRC32(b *testing.B) {
 	t := crc32.MakeTable(0)
 	for i := 0; i < b.N; i++ {
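The new tests pin down exactly the cases an assembly port tends to break: nil input, and pointers that are not 4-byte aligned. One further check worth considering, hypothetical and not part of this PR (TestAllAlignments is an invented name), is a sweep over every (offset, length) pair for a few blocks' worth of data, comparing the streaming API against the one-shot checksum at each alignment; it would sit alongside the existing tests in xxh32zero_test.go:

	// Sweep all offsets 0..3 and all lengths up to 64 bytes, so every
	// combination of alignment path and tail length gets exercised.
	func TestAllAlignments(t *testing.T) {
		data := make([]byte, 64)
		for i := range data {
			data[i] = byte(i)
		}
		for off := 0; off < 4; off++ {
			for n := 0; off+n <= len(data); n++ {
				in := data[off : off+n]
				var h xxh32.XXHZero
				_, _ = h.Write(in)
				if got, want := h.Sum32(), xxh32.ChecksumZero(in); got != want {
					t.Errorf("off=%d len=%d: got %x; want %x", off, n, got, want)
				}
			}
		}
	}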