xxhash_amd64.s

// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

// Register allocation:
// AX h
// CX pointer to advance through b
// DX n
// BX loop end
// R8 v1, k1
// R9 v2
// R10 v3
// R11 v4
// R12 tmp
// R13 prime1v
// R14 prime2v
// R15 prime4v

// round reads from and advances the buffer pointer in CX.
// It assumes that R13 has prime1v and R14 has prime2v.
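// In xxHash64 terms, one round computes r = rol31(r + m*prime2) * prime1,
// where m is the next 8 bytes of input.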
#define round(r) \
	MOVQ  (CX), R12 \
	ADDQ  $8, CX    \
	IMULQ R14, R12  \
	ADDQ  R12, r    \
	ROLQ  $31, r    \
	IMULQ R13, r

// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and R15 has prime4v.
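// In xxHash64 terms: acc = (acc ^ rol31(val*prime2)*prime1) * prime1 + prime4.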
#define mergeRound(acc, val) \
	IMULQ R14, val \
	ROLQ  $31, val \
	IMULQ R13, val \
	XORQ  val, acc \
	IMULQ R13, acc \
	ADDQ  R15, acc

// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Load fixed primes.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), R15

	// Load slice.
	MOVQ b_base+0(FP), CX
	MOVQ b_len+8(FP), DX
	LEAQ (CX)(DX*1), BX

	// The first loop limit will be len(b)-32.
	SUBQ $32, BX

	// Check whether we have at least one block.
	CMPQ DX, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4).
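	// This gives v1 = prime1+prime2, v2 = prime2, v3 = 0, v4 = -prime1 (the seed-0 initial state).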
	MOVQ R13, R8
	ADDQ R14, R8
	MOVQ R14, R9
	XORQ R10, R10
	XORQ R11, R11
	SUBQ R13, R11

	// Loop until CX > BX.
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  blockLoop
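	// Merge the four lanes: h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4).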
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP afterBlocks

noBlocks:
	MOVQ ·prime5v(SB), AX

afterBlocks:
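	// Mix the input length (still in DX) into the hash.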
	ADDQ DX, AX

	// Right now BX has len(b)-32, and we want to loop until CX > len(b)-8.
	ADDQ $24, BX

	CMPQ CX, BX
	JG   fourByte

wordLoop:
	// Calculate k1.
	MOVQ  (CX), R8
	ADDQ  $8, CX
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8
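	// Fold k1 into the hash: h = rol27(h ^ k1) * prime1 + prime4.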
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  R15, AX

	CMPQ CX, BX
	JLE  wordLoop

fourByte:
	ADDQ $4, BX
	CMPQ CX, BX
	JG   singles
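	// Mix in one remaining 4-byte word: h = rol23(h ^ uint64(u32)*prime1) * prime2 + prime3.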
	MOVL  (CX), R8
	ADDQ  $4, CX
	IMULQ R13, R8
	XORQ  R8, AX
	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX

singles:
	ADDQ $4, BX
	CMPQ CX, BX
	JGE  finalize

singlesLoop:
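	// Mix in the remaining bytes one at a time: h = rol11(h ^ uint64(b)*prime5) * prime1.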
	MOVBQZX (CX), R12
	ADDQ    $1, CX
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX
	ROLQ    $11, AX
	IMULQ   R13, AX

	CMPQ CX, BX
	JL   singlesLoop

finalize:
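	// Avalanche: h ^= h>>33; h *= prime2; h ^= h>>29; h *= prime3; h ^= h>>32.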
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX
	MOVQ  AX, R12
	SHRQ  $32, R12
	XORQ  R12, AX

	MOVQ AX, ret+24(FP)
	RET

// writeBlocks uses the same registers as above except that it uses AX to store
// the d pointer.

// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
	// Load fixed primes needed for round.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14

	// Load slice.
	MOVQ arg1_base+8(FP), CX
	MOVQ arg1_len+16(FP), DX
	LEAQ (CX)(DX*1), BX
	SUBQ $32, BX
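	// As in Sum64, the block-loop limit is len(b)-32.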
	// Load vN from d.
	MOVQ arg+0(FP), AX
	MOVQ 0(AX), R8   // v1
	MOVQ 8(AX), R9   // v2
	MOVQ 16(AX), R10 // v3
	MOVQ 24(AX), R11 // v4

	// We don't need to check the loop condition here; this function is
	// always called with at least one block of data to process.
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  blockLoop

	// Copy vN back to d.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// The number of bytes written is CX minus the old base pointer.
	SUBQ arg1_base+8(FP), CX
	MOVQ CX, ret+32(FP)
	RET