123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259 |
- // +build !noasm
- #include "textflag.h"
- #define prime1 $2654435761 // 32-bit multiplier (value equals XXH32's PRIME32_1); the $ is part of the macro, so uses expand to an immediate
- #define prime2 $2246822519 // value equals XXH32's PRIME32_2
- #define prime3 $3266489917 // value equals XXH32's PRIME32_3
- #define prime4 $668265263 // value equals XXH32's PRIME32_4
- #define prime5 $374761393 // value equals XXH32's PRIME32_5
- #define prime1plus2 $606290984 // (prime1 + prime2) mod 2^32; initial value of v1
- #define prime1minus $1640531535 // 2^32 - prime1, i.e. -prime1 mod 2^32; initial value of v4
- // Register allocation.
- #define p R0 // input cursor, advanced by the post-increment/writeback loads
- #define n R1 // bytes remaining; biased negative while a loop is running
- #define h R2 // running hash value
- #define v1 R2 // Alias for h: accumulator 1 during the 16-byte rounds.
- #define v2 R3 // accumulator 2
- #define v3 R4 // accumulator 3
- #define v4 R5 // accumulator 4
- #define x1 R6 // input word, or the word being assembled from single bytes
- #define x2 R7 // x2..x4: further input words, or scratch while assembling bytes
- #define x3 R8
- #define x4 R9
- // We need the primes in registers. The 16-byte loop only uses prime{1,2}.
- #define prime1r R11 // R11 is also the temporary the Go toolchain uses to synthesize pseudo-ops, so none may be emitted while prime1 is live
- #define prime2r R12
- #define prime3r R3 // The rest can alias v{2-4}: they are loaded only after the accumulators have been folded into h.
- #define prime4r R4
- #define prime5r R5 // must be reloaded after the 16-byte rounds, which use R5 as v4
- // Update round macros. These read from and increment p.
- // round16aligned: one 16-byte update round for word-aligned input.
- // MOVM.IA.W loads four consecutive 32-bit words from p into x1..x4 and
- // writes the advanced address back to p. Each accumulator then becomes
- //	v = rotl32(v + x*prime2, 13) * prime1
- // (MULA computes x*prime2 + v; "@> 19" rotates right by 19, which is a
- // rotate left by 13). No instruction here carries .S, so condition flags
- // set by the caller before the macro are still valid after it.
- // The last line deliberately has no trailing backslash: the definition
- // must end there even when the next line of the file is not blank.
- #define round16aligned \
- MOVM.IA.W (p), [x1, x2, x3, x4] \
- \
- MULA x1, prime2r, v1, v1 \
- MULA x2, prime2r, v2, v2 \
- MULA x3, prime2r, v3, v3 \
- MULA x4, prime2r, v4, v4 \
- \
- MOVW v1 @> 19, v1 \
- MOVW v2 @> 19, v2 \
- MOVW v3 @> 19, v3 \
- MOVW v4 @> 19, v4 \
- \
- MUL prime1r, v1 \
- MUL prime1r, v2 \
- MUL prime1r, v3 \
- MUL prime1r, v4
- // round16unaligned: one 16-byte update round for input at any alignment.
- // The first byte load post-increments p by 16; every later byte is then
- // addressed at a negative offset (-15 .. -1) from the advanced p. Each
- // 32-bit word is assembled little-endian in x1 from four byte loads
- // (x2..x4 are scratch) and fed to one accumulator at a time:
- //	v = rotl32(v + x1*prime2, 13) * prime1
- // No instruction here carries .S, so condition flags set by the caller
- // before the macro are still valid after it.
- // The last line deliberately has no trailing backslash: the definition
- // must end there even when the next line of the file is not blank.
- #define round16unaligned \
- MOVBU.P 16(p), x1 \
- MOVBU -15(p), x2 \
- ORR x2 << 8, x1 \
- MOVBU -14(p), x3 \
- MOVBU -13(p), x4 \
- ORR x4 << 8, x3 \
- ORR x3 << 16, x1 \
- \
- MULA x1, prime2r, v1, v1 \
- MOVW v1 @> 19, v1 \
- MUL prime1r, v1 \
- \
- MOVBU -12(p), x1 \
- MOVBU -11(p), x2 \
- ORR x2 << 8, x1 \
- MOVBU -10(p), x3 \
- MOVBU -9(p), x4 \
- ORR x4 << 8, x3 \
- ORR x3 << 16, x1 \
- \
- MULA x1, prime2r, v2, v2 \
- MOVW v2 @> 19, v2 \
- MUL prime1r, v2 \
- \
- MOVBU -8(p), x1 \
- MOVBU -7(p), x2 \
- ORR x2 << 8, x1 \
- MOVBU -6(p), x3 \
- MOVBU -5(p), x4 \
- ORR x4 << 8, x3 \
- ORR x3 << 16, x1 \
- \
- MULA x1, prime2r, v3, v3 \
- MOVW v3 @> 19, v3 \
- MUL prime1r, v3 \
- \
- MOVBU -4(p), x1 \
- MOVBU -3(p), x2 \
- ORR x2 << 8, x1 \
- MOVBU -2(p), x3 \
- MOVBU -1(p), x4 \
- ORR x4 << 8, x3 \
- ORR x3 << 16, x1 \
- \
- MULA x1, prime2r, v4, v4 \
- MOVW v4 @> 19, v4 \
- MUL prime1r, v4
- // func ChecksumZero([]byte) uint32 -- hashes the input slice and returns a 32-bit digest; the constants, initial accumulators and round structure match XXH32 with the seed fixed at zero.
- TEXT ·ChecksumZero(SB), NOFRAME|NOSPLIT, $-4-16 // 16 bytes of args+result: 12-byte slice header, then the uint32 at ret+12
- MOVW input_base+0(FP), p // p = address of the first input byte
- MOVW input_len+4(FP), n // n = len(input)
- MOVW prime1, prime1r // prime1/prime2 stay resident in R11/R12 for the whole function
- MOVW prime2, prime2r
- // Set up h for n < 16. It's tempting to say {ADD prime5, n, h}
- // here, but that's a pseudo-op that generates a load through R11.
- MOVW prime5, prime5r // explicit load via R5 leaves R11 (prime1r) intact
- ADD prime5r, n, h // h = prime5 + len(input)
- CMP $0, n
- BEQ end // empty input: go straight to the final mixing
- // We let n go negative so we can do comparisons with SUB.S
- // instead of separate CMP.
- SUB.S $16, n // n = len - 16; N is set when fewer than 16 bytes exist
- BMI loop16done // short input: skip the accumulator phase, h is already seeded
- MOVW prime1plus2, v1 // initial accumulators; v1 overwrites h (same register R2)
- MOVW prime2, v2
- MOVW $0, v3
- MOVW prime1minus, v4 // overwrites prime5r (same register R5); reloaded before loop1
- TST $3, p // low two address bits clear means p is 4-byte aligned
- BNE loop16unaligned
- loop16aligned:
- SUB.S $16, n // sets the flags for BPL below; the round macro has no .S instructions
- round16aligned
- BPL loop16aligned // n >= 0: at least one more full 16-byte block remains
- B loop16finish
- loop16unaligned:
- SUB.S $16, n // same countdown as the aligned loop
- round16unaligned
- BPL loop16unaligned
- loop16finish:
- MOVW v1 @> 31, h // h = rotl(v1, 1); rotating right by 32-k equals rotating left by k
- ADD v2 @> 25, h // h += rotl(v2, 7)
- ADD v3 @> 20, h // h += rotl(v3, 12)
- ADD v4 @> 14, h // h += rotl(v4, 18)
- // h += len(input) with v2 as temporary.
- MOVW input_len+4(FP), v2 // n is biased here, so the length is re-read from the frame
- ADD v2, h
- loop16done:
- ADD $16, n // Restore number of bytes left.
- SUB.S $4, n // bias n again, now for the 4-byte loop
- MOVW prime3, prime3r // R3 is no longer needed as v2; this MOVW has no .S, so the SUB.S flags survive for BMI
- BMI loop4done // fewer than 4 bytes left
- MOVW prime4, prime4r // R4 is no longer needed as v3
- TST $3, p
- BNE loop4unaligned
- loop4aligned:
- SUB.S $4, n // flags are consumed by BPL at the bottom of the loop
- MOVW.P 4(p), x1 // x1 = 32-bit word at p; p += 4
- MULA prime3r, x1, h, h // h += x1 * prime3
- MOVW h @> 15, h // h = rotl(h, 17)
- MUL prime4r, h // h *= prime4
- BPL loop4aligned
- B loop4done
- loop4unaligned:
- SUB.S $4, n
- MOVBU.P 4(p), x1 // byte 0; p += 4, so the remaining bytes sit at negative offsets
- MOVBU -3(p), x2 // byte 1
- ORR x2 << 8, x1
- MOVBU -2(p), x3 // byte 2
- ORR x3 << 16, x1
- MOVBU -1(p), x4 // byte 3
- ORR x4 << 24, x1 // x1 = little-endian word assembled from the four byte loads
- MULA prime3r, x1, h, h // same mixing step as loop4aligned
- MOVW h @> 15, h
- MUL prime4r, h
- BPL loop4unaligned
- loop4done:
- ADD.S $4, n // Restore number of bytes left; Z is set when none remain.
- BEQ end
- MOVW prime5, prime5r // reload: R5 doubled as v4 during the 16-byte rounds
- loop1:
- SUB.S $1, n // Z is set after the last byte; tested by BNE below
- MOVBU.P 1(p), x1 // x1 = next byte; p += 1
- MULA prime5r, x1, h, h // h += x1 * prime5
- MOVW h @> 21, h // h = rotl(h, 11)
- MUL prime1r, h // h *= prime1
- BNE loop1
- end:
- MOVW prime3, prime3r // the empty-input path arrives here without prime3r loaded
- EOR h >> 15, h // final mixing: h ^= h >> 15
- MUL prime2r, h // h *= prime2
- EOR h >> 13, h // h ^= h >> 13
- MUL prime3r, h // h *= prime3
- EOR h >> 16, h // h ^= h >> 16
- MOVW h, ret+12(FP) // store the uint32 result
- RET
- // func update(v *[4]uint32, buf *[16]byte, p []byte) -- element type corrected from uint64 because the code loads and stores four 32-bit words; NOTE(review): confirm against the Go declaration.
- TEXT ·update(SB), NOFRAME|NOSPLIT, $-4-20 // 20 bytes of arguments (two pointers + 12-byte slice header), no result
- MOVW v_arg+0(FP), p // p temporarily holds the state pointer
- MOVM.IA (p), [v1, v2, v3, v4] // load the four 32-bit accumulators; no writeback to p
- MOVW prime1, prime1r // the round macros need prime1 and prime2 in registers
- MOVW prime2, prime2r
- // Process buf, if not nil. It is read with word loads and no alignment test; presumably the caller guarantees 4-byte alignment (confirm).
- MOVW buf_arg+4(FP), p
- CMP $0, p
- BEQ noBuffered
- round16aligned
- noBuffered:
- MOVW input_ptr+ 8(FP), p // p = start of the input slice (the round above, if run, advanced the old p)
- MOVW input_len+12(FP), n
- SUB.S $16, n // n = len - 16; negative means there is no full 16-byte block
- BMI end
- TST $3, p // word loads are chosen only for 4-byte-aligned input
- BNE loop16unaligned
- loop16aligned:
- SUB.S $16, n // flags survive the round macro (no .S inside) and drive BPL
- round16aligned
- BPL loop16aligned // n >= 0: another full block remains
- B end
- loop16unaligned:
- SUB.S $16, n
- round16unaligned
- BPL loop16unaligned
- end:
- MOVW v_arg+0(FP), p // re-fetch the state pointer; p was used as the input cursor
- MOVM.IA [v1, v2, v3, v4], (p) // store the updated accumulators; a trailing partial block (len % 16 bytes) is not consumed here
- RET
|