
Merge pull request #90 from greatroar/arm-asm-xxh32

xxh32: ARM assembler implementation
Pierre Curto · 5 years ago · parent commit 4e086a0fe3

+ 30 - 27
internal/xxh32/xxh32zero.go

@@ -20,10 +20,7 @@ const (
 
 // XXHZero represents an xxhash32 object with seed 0.
 type XXHZero struct {
-	v1       uint32
-	v2       uint32
-	v3       uint32
-	v4       uint32
+	v        [4]uint32
 	totalLen uint64
 	buf      [16]byte
 	bufused  int
@@ -38,10 +35,10 @@ func (xxh XXHZero) Sum(b []byte) []byte {
 
 // Reset resets the Hash to its initial state.
 func (xxh *XXHZero) Reset() {
-	xxh.v1 = prime1plus2
-	xxh.v2 = prime2
-	xxh.v3 = 0
-	xxh.v4 = prime1minus
+	xxh.v[0] = prime1plus2
+	xxh.v[1] = prime2
+	xxh.v[2] = 0
+	xxh.v[3] = prime1minus
 	xxh.totalLen = 0
 	xxh.bufused = 0
 }
@@ -74,42 +71,48 @@ func (xxh *XXHZero) Write(input []byte) (int, error) {
 		return n, nil
 	}
 
-	p := 0
-	// Causes compiler to work directly from registers instead of stack:
-	v1, v2, v3, v4 := xxh.v1, xxh.v2, xxh.v3, xxh.v4
-	if m > 0 {
+	var buf *[16]byte
+	if m != 0 {
 		// some data left from previous update
-		copy(xxh.buf[m:], input)
+		buf = &xxh.buf
+		c := copy(buf[m:], input)
+		n -= c
+		input = input[c:]
+	}
+	update(&xxh.v, buf, input)
+	xxh.bufused = copy(xxh.buf[:], input[n-n%16:])
+
+	return n, nil
+}
+
+// Portable version of update. This updates v by processing all of buf
+// (if not nil) and all full 16-byte blocks of input.
+func updateGo(v *[4]uint32, buf *[16]byte, input []byte) {
+	// Causes the compiler to work directly from registers instead of the stack:
+	v1, v2, v3, v4 := v[0], v[1], v[2], v[3]
 
-		// fast rotl(13)
-		buf := xxh.buf[:16] // BCE hint.
+	if buf != nil {
 		v1 = rol13(v1+binary.LittleEndian.Uint32(buf[:])*prime2) * prime1
 		v2 = rol13(v2+binary.LittleEndian.Uint32(buf[4:])*prime2) * prime1
 		v3 = rol13(v3+binary.LittleEndian.Uint32(buf[8:])*prime2) * prime1
 		v4 = rol13(v4+binary.LittleEndian.Uint32(buf[12:])*prime2) * prime1
-		p = r
 	}
 
-	for n := n - 16; p <= n; p += 16 {
-		sub := input[p:][:16] //BCE hint for compiler
+	for ; len(input) >= 16; input = input[16:] {
+		sub := input[:16] //BCE hint for compiler
 		v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime2) * prime1
 		v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime2) * prime1
 		v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime2) * prime1
 		v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime2) * prime1
 	}
-	xxh.v1, xxh.v2, xxh.v3, xxh.v4 = v1, v2, v3, v4
-
-	copy(xxh.buf[:], input[p:])
-	xxh.bufused = len(input) - p
-
-	return n, nil
+	v[0], v[1], v[2], v[3] = v1, v2, v3, v4
 }
 
 // Sum32 returns the 32-bit hash value.
 func (xxh *XXHZero) Sum32() uint32 {
 	h32 := uint32(xxh.totalLen)
 	if h32 >= 16 {
-		h32 += rol1(xxh.v1) + rol7(xxh.v2) + rol12(xxh.v3) + rol18(xxh.v4)
+		h32 += rol1(xxh.v[0]) + rol7(xxh.v[1]) + rol12(xxh.v[2]) + rol18(xxh.v[3])
 	} else {
 		h32 += prime5
 	}
@@ -135,8 +138,8 @@ func (xxh *XXHZero) Sum32() uint32 {
 	return h32
 }
 
-// ChecksumZero returns the 32bits Hash value.
-func ChecksumZero(input []byte) uint32 {
+// Portable version of ChecksumZero.
+func checksumZeroGo(input []byte) uint32 {
 	n := len(input)
 	h32 := uint32(n)
 

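The refactor above routes both the carried-over block (buf) and the bulk of the input through a single update(v, buf, input) call, so the assembly side only has to replace one hot path while the tail buffering stays in Go. Each lane update in updateGo is the standard xxh32 round. A minimal sketch in Go, reusing the file's own rol13 helper and prime constants (round is a name used here for illustration; the real code writes the expression inline per lane):

	// One xxh32 round: fold a 4-byte little-endian lane into an accumulator.
	func round(acc, lane uint32) uint32 {
		acc += lane * prime2
		acc = rol13(acc) // rotate left by 13 bits
		return acc * prime1
	}

Because the four accumulators are independent, the rounds for v1..v4 carry no cross-lane data dependencies, which is what lets the assembly version keep all four in registers and overlap the multiplies.
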
+ 11 - 0
internal/xxh32/xxh32zero_arm.go

@@ -0,0 +1,11 @@
+// +build !noasm
+
+package xxh32
+
+// ChecksumZero returns the 32-bit hash of input.
+//
+//go:noescape
+func ChecksumZero(input []byte) uint32
+
+//go:noescape
+func update(v *[4]uint32, buf *[16]byte, input []byte)

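The //go:noescape directives promise that the assembly neither retains nor leaks the pointers it is handed, so callers' buffers can stay on the stack. Since update must behave exactly like updateGo, a differential test is the natural safety net. A hypothetical sketch, not part of this PR (TestUpdateMatchesGo is an invented name); it would live in package xxh32 itself so it can reach the unexported functions, and on platforms without assembly it passes trivially because update just calls updateGo:

	package xxh32

	import (
		"math/rand"
		"testing"
	)

	// Differential check: the assembly update and the portable updateGo
	// must produce identical accumulator state for arbitrary inputs.
	func TestUpdateMatchesGo(t *testing.T) {
		rng := rand.New(rand.NewSource(1))
		for i := 0; i < 1000; i++ {
			input := make([]byte, rng.Intn(256))
			rng.Read(input)
			var buf [16]byte
			rng.Read(buf[:])

			bufPtr := &buf
			if i%2 == 1 {
				bufPtr = nil // exercise the no-buffered-block path too
			}

			vAsm := [4]uint32{prime1plus2, prime2, 0, prime1minus}
			vGo := vAsm
			update(&vAsm, bufPtr, input)
			updateGo(&vGo, bufPtr, input)
			if vAsm != vGo {
				t.Fatalf("len=%d: asm %v != go %v", len(input), vAsm, vGo)
			}
		}
	}
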
+ 259 - 0
internal/xxh32/xxh32zero_arm.s

@@ -0,0 +1,259 @@
+// +build !noasm
+
+#include "textflag.h"
+
+#define prime1		$2654435761
+#define prime2		$2246822519
+#define prime3		$3266489917
+#define prime4		$668265263
+#define prime5		$374761393
+
+#define prime1plus2	$606290984
+#define prime1minus	$1640531535
+
+// Register allocation.
+#define p	R0
+#define n	R1
+#define h	R2
+#define v1	R2	// Alias for h.
+#define v2	R3
+#define v3	R4
+#define v4	R5
+#define x1	R6
+#define x2	R7
+#define x3	R8
+#define x4	R9
+
+// We need the primes in registers. The 16-byte loop only uses prime{1,2}.
+#define prime1r	R11
+#define prime2r	R12
+#define prime3r	R3	// The rest can alias v{2-4}.
+#define prime4r	R4
+#define prime5r	R5
+
+// Update round macros. These read from and increment p.
+
+#define round16aligned			\
+	MOVM.IA.W (p), [x1, x2, x3, x4]	\
+					\
+	MULA x1, prime2r, v1, v1	\
+	MULA x2, prime2r, v2, v2	\
+	MULA x3, prime2r, v3, v3	\
+	MULA x4, prime2r, v4, v4	\
+					\
+	MOVW v1 @> 19, v1		\
+	MOVW v2 @> 19, v2		\
+	MOVW v3 @> 19, v3		\
+	MOVW v4 @> 19, v4		\
+					\
+	MUL prime1r, v1			\
+	MUL prime1r, v2			\
+	MUL prime1r, v3			\
+	MUL prime1r, v4			\
+
+#define round16unaligned 		\
+	MOVBU.P  16(p), x1		\
+	MOVBU   -15(p), x2		\
+	ORR     x2 <<  8, x1		\
+	MOVBU   -14(p), x3		\
+	MOVBU   -13(p), x4		\
+	ORR     x4 <<  8, x3		\
+	ORR     x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v1, v1	\
+	MOVW v1 @> 19, v1		\
+	MUL prime1r, v1			\
+					\
+	MOVBU -12(p), x1		\
+	MOVBU -11(p), x2		\
+	ORR   x2 <<  8, x1		\
+	MOVBU -10(p), x3		\
+	MOVBU  -9(p), x4		\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v2, v2	\
+	MOVW v2 @> 19, v2		\
+	MUL prime1r, v2			\
+					\
+	MOVBU -8(p), x1			\
+	MOVBU -7(p), x2			\
+	ORR   x2 <<  8, x1		\
+	MOVBU -6(p), x3			\
+	MOVBU -5(p), x4			\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v3, v3	\
+	MOVW v3 @> 19, v3		\
+	MUL prime1r, v3			\
+					\
+	MOVBU -4(p), x1			\
+	MOVBU -3(p), x2			\
+	ORR   x2 <<  8, x1		\
+	MOVBU -2(p), x3			\
+	MOVBU -1(p), x4			\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v4, v4	\
+	MOVW v4 @> 19, v4		\
+	MUL prime1r, v4			\
+
+
+// func ChecksumZero(input []byte) uint32
+TEXT ·ChecksumZero(SB), NOFRAME|NOSPLIT, $-4-16
+	MOVW input_base+0(FP), p
+	MOVW input_len+4(FP),  n
+
+	MOVW prime1, prime1r
+	MOVW prime2, prime2r
+
+	// Set up h for n < 16. It's tempting to say {ADD prime5, n, h}
+	// here, but that's a pseudo-op that generates a load through R11.
+	MOVW prime5, prime5r
+	ADD  prime5r, n, h
+	CMP  $0, n
+	BEQ  end
+
+	// We let n go negative so we can do comparisons with SUB.S
+	// instead of separate CMP.
+	SUB.S $16, n
+	BMI   loop16done
+
+	MOVW prime1plus2, v1
+	MOVW prime2,      v2
+	MOVW $0,          v3
+	MOVW prime1minus, v4
+
+	TST $3, p
+	BNE loop16unaligned
+
+loop16aligned:
+	SUB.S $16, n
+	round16aligned
+	BPL loop16aligned
+	B   loop16finish
+
+loop16unaligned:
+	SUB.S $16, n
+	round16unaligned
+	BPL loop16unaligned
+
+loop16finish:
+	MOVW v1 @> 31, h
+	ADD  v2 @> 25, h
+	ADD  v3 @> 20, h
+	ADD  v4 @> 14, h
+
+	// h += len(input) with v2 as temporary.
+	MOVW input_len+4(FP), v2
+	ADD  v2, h
+
+loop16done:
+	ADD $16, n	// Restore number of bytes left.
+
+	SUB.S $4, n
+	MOVW  prime3, prime3r
+	BMI   loop4done
+	MOVW  prime4, prime4r
+
+	TST $3, p
+	BNE loop4unaligned
+
+loop4aligned:
+	SUB.S $4, n
+
+	MOVW.P 4(p), x1
+	MULA   prime3r, x1, h, h
+	MOVW   h @> 15, h
+	MUL    prime4r, h
+
+	BPL loop4aligned
+	B   loop4done
+
+loop4unaligned:
+	SUB.S $4, n
+
+	MOVBU.P  4(p), x1
+	MOVBU   -3(p), x2
+	ORR     x2 <<  8, x1
+	MOVBU   -2(p), x3
+	ORR     x3 << 16, x1
+	MOVBU   -1(p), x4
+	ORR     x4 << 24, x1
+
+	MULA prime3r, x1, h, h
+	MOVW h @> 15, h
+	MUL  prime4r, h
+
+	BPL loop4unaligned
+
+loop4done:
+	ADD.S $4, n	// Restore number of bytes left.
+	BEQ   end
+
+	MOVW prime5, prime5r
+
+loop1:
+	SUB.S $1, n
+
+	MOVBU.P 1(p), x1
+	MULA    prime5r, x1, h, h
+	MOVW    h @> 21, h
+	MUL     prime1r, h
+
+	BNE loop1
+
+end:
+	MOVW prime3, prime3r
+	EOR  h >> 15, h
+	MUL  prime2r, h
+	EOR  h >> 13, h
+	MUL  prime3r, h
+	EOR  h >> 16, h
+
+	MOVW h, ret+12(FP)
+	RET
+
+
+// func update(v *[4]uint32, buf *[16]byte, input []byte)
+TEXT ·update(SB), NOFRAME|NOSPLIT, $-4-20
+	MOVW    v_arg+0(FP), p
+	MOVM.IA (p), [v1, v2, v3, v4]
+
+	MOVW prime1, prime1r
+	MOVW prime2, prime2r
+
+	// Process buf, if not nil.
+	MOVW buf_arg+4(FP), p
+	CMP  $0, p
+	BEQ  noBuffered
+
+	round16aligned
+
+noBuffered:
+	MOVW input_ptr+ 8(FP), p
+	MOVW input_len+12(FP), n
+
+	SUB.S $16, n
+	BMI   end
+
+	TST $3, p
+	BNE loop16unaligned
+
+loop16aligned:
+	SUB.S $16, n
+	round16aligned
+	BPL loop16aligned
+	B   end
+
+loop16unaligned:
+	SUB.S $16, n
+	round16unaligned
+	BPL loop16unaligned
+
+end:
+	MOVW    v_arg+0(FP), p
+	MOVM.IA [v1, v2, v3, v4], (p)
+	RET

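A note on the rotations, for readers who don't write ARM assembly: the Go assembler's @> operator is rotate-right, and ARM has no rotate-left instruction, so every rol in the Go code appears here as a rotate right by its 32-bit complement. rol13 becomes @> 19, the Sum32 merge rotations rol1/rol7/rol12/rol18 become @> 31/25/20/14, and the rol17 and rol11 of the 4-byte and 1-byte loops become @> 15 and @> 21. In Go terms:

	import "math/bits"

	// What MOVW v @> 19, v computes: for 32-bit words, a rotate right
	// by 19 is the same operation as a rotate left by 13.
	func rol13(u uint32) uint32 {
		return bits.RotateLeft32(u, 13) // == u<<13 | u>>19
	}

The SUB.S/BMI and SUB.S/BPL pairs serve a similar economy: decrementing the byte count sets the condition flags, so each loop tests "bytes left" without a separate CMP, as the comment in ChecksumZero notes.
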
+ 10 - 0
internal/xxh32/xxh32zero_other.go

@@ -0,0 +1,10 @@
+// +build !arm noasm
+
+package xxh32
+
+// ChecksumZero returns the 32-bit hash of input.
+func ChecksumZero(input []byte) uint32 { return checksumZeroGo(input) }
+
+func update(v *[4]uint32, buf *[16]byte, input []byte) {
+	updateGo(v, buf, input)
+}

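The build constraints on the new files are complements: the _arm files build only when targeting ARM without the noasm tag, and this fallback builds everywhere else, so exactly one definition of ChecksumZero and update is compiled into any build. The portable path can still be exercised on ARM with go test -tags noasm. In the //go:build syntax Go 1.17 later introduced (this PR predates it), the two constraints would read:

	// xxh32zero_arm.go and xxh32zero_arm.s (the filename supplies the arm constraint):
	//go:build !noasm

	// xxh32zero_other.go:
	//go:build !arm || noasm
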
+ 56 - 3
internal/xxh32/xxh32zero_test.go

@@ -55,7 +55,7 @@ func TestZeroData(t *testing.T) {
 			t.Fatalf("got %x; want %x", got, want)
 		}
 		if got, want := xxh32.ChecksumZero(data), td.sum; got != want {
-			t.Fatalf("got %x; want %x", got, want)
+			t.Errorf("got %x; want %x", got, want)
 		}
 	}
 }
@@ -92,7 +92,7 @@ func TestZeroChecksum(t *testing.T) {
 		data := []byte(td.data)
 		h := xxh32.ChecksumZero(data)
 		if got, want := h, td.sum; got != want {
-			t.Fatalf("got %x; want %x", got, want)
+			t.Errorf("got %x; want %x", got, want)
 		}
 	}
 }
@@ -103,12 +103,49 @@ func TestZeroReset(t *testing.T) {
 		_, _ = xxh.Write([]byte(td.data))
 		h := xxh.Sum32()
 		if got, want := h, td.sum; got != want {
-			t.Fatalf("got %x; want %x", got, want)
+			t.Errorf("got %x; want %x", got, want)
 		}
 		xxh.Reset()
 	}
 }
 
+func TestNil(t *testing.T) {
+	want := xxh32.ChecksumZero([]byte(""))
+
+	var xxh xxh32.XXHZero
+	xxh.Write(nil)
+	got := xxh.Sum32()
+	if got != want {
+		t.Errorf("got %x; want %x", got, want)
+	}
+
+	got = xxh32.ChecksumZero(nil)
+	if got != want {
+		t.Errorf("got %x; want %x", got, want)
+	}
+}
+
+func TestUnaligned(t *testing.T) {
+	zeros := make([]byte, 100)
+	ha := xxh32.ChecksumZero(zeros[:len(zeros)-1])
+	hu := xxh32.ChecksumZero(zeros[1:])
+	if ha != hu {
+		t.Errorf("mismatch: %x != %x", ha, hu)
+	}
+
+	var xxh xxh32.XXHZero
+	xxh.Write(zeros[:len(zeros)-1])
+	ha = xxh.Sum32()
+
+	xxh.Reset()
+	xxh.Write(zeros[1:])
+	hu = xxh.Sum32()
+
+	if ha != hu {
+		t.Errorf("mismatch: %x != %x", ha, hu)
+	}
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Benchmarks
 //
@@ -129,6 +166,22 @@ func Benchmark_XXH32_Checksum(b *testing.B) {
 	}
 }
 
+// The following two benchmarks measure the case where 3 out of 4 calls are not 4-byte-aligned.
+func Benchmark_XXH32Unaligned(b *testing.B) {
+	var h xxh32.XXHZero
+	for n := 0; n < b.N; n++ {
+		_, _ = h.Write(testdata1[n%4:])
+		h.Sum32()
+		h.Reset()
+	}
+}
+
+func Benchmark_XXH32_ChecksumUnaligned(b *testing.B) {
+	for n := 0; n < b.N; n++ {
+		xxh32.ChecksumZero(testdata1[n%4:])
+	}
+}
+
 func Benchmark_CRC32(b *testing.B) {
 	t := crc32.MakeTable(0)
 	for i := 0; i < b.N; i++ {
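The new tests pin down exactly the cases an assembly port tends to break: nil input, and pointers that are not 4-byte aligned. One further check worth considering, hypothetical and not part of this PR (TestAllAlignments is an invented name), is a sweep over every (offset, length) pair for a few blocks' worth of data, comparing the streaming API against the one-shot checksum at each alignment; it would sit alongside the existing tests in xxh32zero_test.go:

	// Sweep all offsets 0..3 and all lengths up to 64 bytes, so every
	// combination of alignment path and tail length gets exercised.
	func TestAllAlignments(t *testing.T) {
		data := make([]byte, 64)
		for i := range data {
			data[i] = byte(i)
		}
		for off := 0; off < 4; off++ {
			for n := 0; off+n <= len(data); n++ {
				in := data[off : off+n]
				var h xxh32.XXHZero
				_, _ = h.Write(in)
				if got, want := h.Sum32(), xxh32.ChecksumZero(in); got != want {
					t.Errorf("off=%d len=%d: got %x; want %x", off, n, got, want)
				}
			}
		}
	}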