// xxh32zero_arm.s — XXH32 (zero seed) for 32-bit ARM, Go assembler syntax.
  1. // +build !noasm
  2. #include "textflag.h"
  3. #define prime1 $2654435761
  4. #define prime2 $2246822519
  5. #define prime3 $3266489917
  6. #define prime4 $668265263
  7. #define prime5 $374761393
  8. #define prime1plus2 $606290984
  9. #define prime1minus $1640531535
  10. // Register allocation.
  11. #define p R0
  12. #define n R1
  13. #define h R2
  14. #define v1 R2 // Alias for h.
  15. #define v2 R3
  16. #define v3 R4
  17. #define v4 R5
  18. #define x1 R6
  19. #define x2 R7
  20. #define x3 R8
  21. #define x4 R9
  22. // We need the primes in registers. The 16-byte loop only uses prime{1,2}.
  23. #define prime1r R11
  24. #define prime2r R12
  25. #define prime3r R3 // The rest can alias v{2-4}.
  26. #define prime4r R4
  27. #define prime5r R5
  28. // Update round macros. These read from and increment p.
  29. #define round16aligned \
  30. MOVM.IA.W (p), [x1, x2, x3, x4] \
  31. \
  32. MULA x1, prime2r, v1, v1 \
  33. MULA x2, prime2r, v2, v2 \
  34. MULA x3, prime2r, v3, v3 \
  35. MULA x4, prime2r, v4, v4 \
  36. \
  37. MOVW v1 @> 19, v1 \
  38. MOVW v2 @> 19, v2 \
  39. MOVW v3 @> 19, v3 \
  40. MOVW v4 @> 19, v4 \
  41. \
  42. MUL prime1r, v1 \
  43. MUL prime1r, v2 \
  44. MUL prime1r, v3 \
  45. MUL prime1r, v4 \
  46. #define round16unaligned \
  47. MOVBU.P 16(p), x1 \
  48. MOVBU -15(p), x2 \
  49. ORR x2 << 8, x1 \
  50. MOVBU -14(p), x3 \
  51. MOVBU -13(p), x4 \
  52. ORR x4 << 8, x3 \
  53. ORR x3 << 16, x1 \
  54. \
  55. MULA x1, prime2r, v1, v1 \
  56. MOVW v1 @> 19, v1 \
  57. MUL prime1r, v1 \
  58. \
  59. MOVBU -12(p), x1 \
  60. MOVBU -11(p), x2 \
  61. ORR x2 << 8, x1 \
  62. MOVBU -10(p), x3 \
  63. MOVBU -9(p), x4 \
  64. ORR x4 << 8, x3 \
  65. ORR x3 << 16, x1 \
  66. \
  67. MULA x1, prime2r, v2, v2 \
  68. MOVW v2 @> 19, v2 \
  69. MUL prime1r, v2 \
  70. \
  71. MOVBU -8(p), x1 \
  72. MOVBU -7(p), x2 \
  73. ORR x2 << 8, x1 \
  74. MOVBU -6(p), x3 \
  75. MOVBU -5(p), x4 \
  76. ORR x4 << 8, x3 \
  77. ORR x3 << 16, x1 \
  78. \
  79. MULA x1, prime2r, v3, v3 \
  80. MOVW v3 @> 19, v3 \
  81. MUL prime1r, v3 \
  82. \
  83. MOVBU -4(p), x1 \
  84. MOVBU -3(p), x2 \
  85. ORR x2 << 8, x1 \
  86. MOVBU -2(p), x3 \
  87. MOVBU -1(p), x4 \
  88. ORR x4 << 8, x3 \
  89. ORR x3 << 16, x1 \
  90. \
  91. MULA x1, prime2r, v4, v4 \
  92. MOVW v4 @> 19, v4 \
  93. MUL prime1r, v4 \
  94. // func ChecksumZero([]byte) uint32
  95. TEXT ·ChecksumZero(SB), NOFRAME|NOSPLIT, $-4-16
  96. MOVW input_base+0(FP), p
  97. MOVW input_len+4(FP), n
  98. MOVW prime1, prime1r
  99. MOVW prime2, prime2r
  100. // Set up h for n < 16. It's tempting to say {ADD prime5, n, h}
  101. // here, but that's a pseudo-op that generates a load through R11.
  102. MOVW prime5, prime5r
  103. ADD prime5r, n, h
  104. CMP $0, n
  105. BEQ end
  106. // We let n go negative so we can do comparisons with SUB.S
  107. // instead of separate CMP.
  108. SUB.S $16, n
  109. BMI loop16done
  110. MOVW prime1plus2, v1
  111. MOVW prime2, v2
  112. MOVW $0, v3
  113. MOVW prime1minus, v4
  114. TST $3, p
  115. BNE loop16unaligned
  116. loop16aligned:
  117. SUB.S $16, n
  118. round16aligned
  119. BPL loop16aligned
  120. B loop16finish
  121. loop16unaligned:
  122. SUB.S $16, n
  123. round16unaligned
  124. BPL loop16unaligned
  125. loop16finish:
  126. MOVW v1 @> 31, h
  127. ADD v2 @> 25, h
  128. ADD v3 @> 20, h
  129. ADD v4 @> 14, h
  130. // h += len(input) with v2 as temporary.
  131. MOVW input_len+4(FP), v2
  132. ADD v2, h
  133. loop16done:
  134. ADD $16, n // Restore number of bytes left.
  135. SUB.S $4, n
  136. MOVW prime3, prime3r
  137. BMI loop4done
  138. MOVW prime4, prime4r
  139. TST $3, p
  140. BNE loop4unaligned
  141. loop4aligned:
  142. SUB.S $4, n
  143. MOVW.P 4(p), x1
  144. MULA prime3r, x1, h, h
  145. MOVW h @> 15, h
  146. MUL prime4r, h
  147. BPL loop4aligned
  148. B loop4done
  149. loop4unaligned:
  150. SUB.S $4, n
  151. MOVBU.P 4(p), x1
  152. MOVBU -3(p), x2
  153. ORR x2 << 8, x1
  154. MOVBU -2(p), x3
  155. ORR x3 << 16, x1
  156. MOVBU -1(p), x4
  157. ORR x4 << 24, x1
  158. MULA prime3r, x1, h, h
  159. MOVW h @> 15, h
  160. MUL prime4r, h
  161. BPL loop4unaligned
  162. loop4done:
  163. ADD.S $4, n // Restore number of bytes left.
  164. BEQ end
  165. MOVW prime5, prime5r
  166. loop1:
  167. SUB.S $1, n
  168. MOVBU.P 1(p), x1
  169. MULA prime5r, x1, h, h
  170. MOVW h @> 21, h
  171. MUL prime1r, h
  172. BNE loop1
  173. end:
  174. MOVW prime3, prime3r
  175. EOR h >> 15, h
  176. MUL prime2r, h
  177. EOR h >> 13, h
  178. MUL prime3r, h
  179. EOR h >> 16, h
  180. MOVW h, ret+12(FP)
  181. RET
  182. // func update(v *[4]uint64, buf *[16]byte, p []byte)
  183. TEXT ·update(SB), NOFRAME|NOSPLIT, $-4-20
  184. MOVW v_arg+0(FP), p
  185. MOVM.IA (p), [v1, v2, v3, v4]
  186. MOVW prime1, prime1r
  187. MOVW prime2, prime2r
  188. // Process buf, if not nil.
  189. MOVW buf_arg+4(FP), p
  190. CMP $0, p
  191. BEQ noBuffered
  192. round16aligned
  193. noBuffered:
  194. MOVW input_ptr+ 8(FP), p
  195. MOVW input_len+12(FP), n
  196. SUB.S $16, n
  197. BMI end
  198. TST $3, p
  199. BNE loop16unaligned
  200. loop16aligned:
  201. SUB.S $16, n
  202. round16aligned
  203. BPL loop16aligned
  204. B end
  205. loop16unaligned:
  206. SUB.S $16, n
  207. round16unaligned
  208. BPL loop16unaligned
  209. end:
  210. MOVW v_arg+0(FP), p
  211. MOVM.IA [v1, v2, v3, v4], (p)
  212. RET