encode_amd64.s 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build !appengine
  5. // +build gc
  6. // +build !noasm
  7. #include "textflag.h"
  8. // The asm code generally follows the pure Go code in encode_other.go, except
  9. // where marked with a "!!!".
  10. // ----------------------------------------------------------------------------
  // func emitLiteral(dst, lit []byte) int
  //
  // Writes a literal element to dst: a 1, 2 or 3 byte header that encodes
  // n = len(lit)-1, immediately followed by the bytes of lit. The result is the
  // total number of bytes written, i.e. the header size plus len(lit).
  //
  // Header layout, chosen from n (compared as a signed 32-bit value):
  //	- n < 60:   one byte, n<<2
  //	- n < 256:  0xf0 (60<<2), then n as one byte
  //	- otherwise: 0xf4 (61<<2), then the low 16 bits of n, little-endian
  //
  // Everything lives in registers:
  //	- AX	len(lit)
  //	- BX	n
  //	- DX	return value
  //	- DI	&dst[i]
  //	- R10	&lit[0]
  //
  // The 24 byte frame holds the three arguments of runtime·memmove.
  TEXT ·emitLiteral(SB), NOSPLIT, $24-56
      MOVQ dst_base+0(FP), DI
      MOVQ lit_base+24(FP), R10
      MOVQ lit_len+32(FP), AX

      // The source pointer and the byte count handed to memmove do not depend
      // on the header size, so place them in the outgoing argument slots now.
      MOVQ R10, 8(SP)
      MOVQ AX, 16(SP)

      // DX starts at len(lit) and grows by the header size below.
      MOVQ AX, DX
      MOVL AX, BX
      SUBL $1, BX

      CMPL BX, $60
      JGE  wideHeader

      // One byte header: n fits in the upper six bits of the tag byte.
      SHLB $2, BX
      MOVB BX, 0(DI)
      ADDQ $1, DI
      ADDQ $1, DX
      JMP  copyLiteral

  wideHeader:
      CMPL BX, $256
      JGE  header3

      // Two byte header: tag 0xf0, then n in a single byte.
      MOVB $0xf0, 0(DI)
      MOVB BX, 1(DI)
      ADDQ $2, DI
      ADDQ $2, DX
      JMP  copyLiteral

  header3:
      // Three byte header: tag 0xf4, then n in two bytes.
      MOVB $0xf4, 0(DI)
      MOVW BX, 1(DI)
      ADDQ $3, DI
      ADDQ $3, DX

  copyLiteral:
      MOVQ DX, ret+48(FP)

      // copy(dst[i:], lit)
      //
      // That is runtime·memmove(&dst[i], &lit[0], len(lit)). The last two
      // arguments were stored on entry; only the destination is still missing.
      MOVQ DI, 0(SP)
      CALL runtime·memmove(SB)
      RET
  60. // ----------------------------------------------------------------------------
  // func emitCopy(dst []byte, offset, length int) int
  //
  // Writes one or more copy elements describing (offset, length) to the start
  // of dst and returns how many bytes were written.
  //
  // Two encodings are produced:
  //	- 3 bytes: (length-1)<<2 | 2, then the low 16 bits of offset
  //	- 2 bytes: (offset>>8)<<5 | (length-4)<<2 | 1, then the low byte of offset
  // The 2 byte form is only chosen when length < 12 and offset < 2048.
  //
  // Everything lives in registers:
  //	- AX	length still to be encoded
  //	- R11	offset
  //	- SI	&dst[0]
  //	- DI	&dst[i]
  TEXT ·emitCopy(SB), NOSPLIT, $0-48
      MOVQ dst_base+0(FP), SI
      MOVQ SI, DI
      MOVQ offset+24(FP), R11
      MOVQ length+32(FP), AX
      JMP  checkLong

  emitLong:
      // A 64 byte copy in the 3 byte form: 0xfe is (64-1)<<2 | 2.
      MOVB $0xfe, 0(DI)
      MOVW R11, 1(DI)
      ADDQ $3, DI
      SUBL $64, AX

  checkLong:
      // for length >= 68 { emit 64 bytes }
      //
      // The bound is 68, not 64, so at least 4 bytes always remain afterwards.
      CMPL AX, $68
      JGE  emitLong

      // if length > 64 { emit 60 bytes }
      CMPL AX, $64
      JLE  pickTail

      // A 60 byte copy in the 3 byte form: 0xee is (60-1)<<2 | 2.
      MOVB $0xee, 0(DI)
      MOVW R11, 1(DI)
      ADDQ $3, DI
      SUBL $60, AX

  pickTail:
      // The short form needs length < 12 and offset < 2048; anything else
      // takes the 3 byte form.
      CMPL AX, $12
      JGE  tail3
      CMPL R11, $2048
      JGE  tail3

      // 2 byte form. The low offset byte is stored first because R11 is
      // reused afterwards to assemble the tag byte.
      MOVB R11, 1(DI)
      SHRL $8, R11
      SHLB $5, R11
      SUBB $4, AX
      SHLB $2, AX
      ORB  AX, R11
      ORB  $1, R11
      MOVB R11, 0(DI)
      ADDQ $2, DI
      JMP  copyDone

  tail3:
      // 3 byte form for whatever length is left.
      SUBL $1, AX
      SHLB $2, AX
      ORB  $2, AX
      MOVB AX, 0(DI)
      MOVW R11, 1(DI)
      ADDQ $3, DI

  copyDone:
      // Bytes written = &dst[i] - &dst[0].
      SUBQ SI, DI
      MOVQ DI, ret+40(FP)
      RET
  124. // ----------------------------------------------------------------------------
  // func extendMatch(src []byte, i, j int) int
  //
  // Advances i and j in lock step for as long as j < len(src) and
  // src[i] == src[j], and returns the final value of j.
  //
  // Everything lives in registers:
  //	- DX	&src[0]
  //	- R12	&src[len(src)]
  //	- R13	&src[len(src) - 8]
  //	- R10	&src[i]
  //	- SI	&src[j]
  //	- AX, BX	scratch for the bytes being compared
  TEXT ·extendMatch(SB), NOSPLIT, $0-48
      MOVQ src_base+0(FP), DX
      MOVQ src_len+8(FP), R12
      MOVQ i+24(FP), R10
      MOVQ j+32(FP), SI

      // Turn the length and both indexes into absolute addresses.
      ADDQ DX, R12
      ADDQ DX, R10
      ADDQ DX, SI
      LEAQ -8(R12), R13

  wide:
      // While &src[j] is at least 8 bytes short of the end, compare 8 bytes
      // per step. The XOR is zero exactly when the two words are equal, and
      // otherwise its set bits mark where they differ.
      CMPQ SI, R13
      JA   narrow
      MOVQ (R10), AX
      MOVQ (SI), BX
      XORQ AX, BX
      JNE  mismatch
      ADDQ $8, R10
      ADDQ $8, SI
      JMP  wide

  narrow:
      // Fewer than 8 bytes remain: compare one byte at a time.
      CMPQ SI, R12
      JAE  matchDone
      MOVB (R10), AX
      MOVB (SI), BX
      CMPB AX, BX
      JNE  matchDone
      ADDQ $1, R10
      ADDQ $1, SI
      JMP  narrow

  mismatch:
      // BX holds the XOR of the two words. BSF gives the index of its lowest
      // set bit; amd64 is little-endian, so that bit lies in the first
      // differing byte, and shifting right by 3 turns the bit index into a
      // byte index.
      BSFQ BX, BX
      SHRQ $3, BX
      ADDQ BX, SI

  matchDone:
      // Convert from &src[ret] to ret.
      SUBQ DX, SI
      MOVQ SI, ret+40(FP)
      RET
  183. RET