// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
  4. #include "textflag.h"
  // func decode(dst, src []byte) int
//
// decode walks the tagged elements of src, writes the expanded bytes to dst,
// and stores a status code in the result slot:
//	- 0 when src has been consumed and dst has been filled exactly,
//	- 1 (decodeErrCodeCorrupt) when any bounds check below fails,
//	- 3 (decodeErrCodeUnsupportedCopy4Tag) when a tag has both low bits set.
//
// The control flow mirrors the pure Go code in decode_other.go. Places where
// the asm deliberately departs from it are marked with "!!!".
//
// Every local variable lives in a register. The 48-byte frame exists only to
// pass arguments to, and to preserve registers across, the single CALL:
//	-  0(SP),  8(SP), 16(SP): outgoing memmove arguments (to, from, n)
//	- 24(SP), 32(SP), 40(SP): saved DI, SI and CX
//
// Register roles:
//	- AX	scratch
//	- BX	scratch
//	- CX	length or x
//	- DX	offset
//	- SI	&src[s]
//	- DI	&dst[d]
//	+ R8	dst_base
//	+ R9	dst_len
//	+ R10	dst_base + dst_len
//	+ R11	src_base
//	+ R12	src_len
//	+ R13	src_base + src_len
//	- R14	unused
//	- R15	&dst[d-offset], the read cursor while a copy tag is expanded
//
// Registers marked "+" (R8-R13) are loaded on entry and reloaded after the
// CALL returns; nothing else writes to them.
//
// Neither d nor s has a register of its own:
//	- d == DI - R8,  and len(dst)-d == R10 - DI
//	- s == SI - R11, and len(src)-s == R13 - SI
TEXT ·decode(SB), NOSPLIT, $48-56
	// Load both slice headers, then derive the two cursors (SI, DI) and the
	// two one-past-the-end pointers (R13, R10).
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R11, SI
	MOVQ R8, DI
	LEAQ (R11)(R12*1), R13
	LEAQ (R8)(R9*1), R10

decodeLoop:
	// for s < len(src)
	CMPQ SI, R13
	JEQ  done

	// CX = uint32(src[s])
	// BX = src[s] & 0x03, which selects the tag kind; zero means literal.
	MOVBLZX (SI), CX
	MOVL    CX, BX
	ANDL    $3, BX
	CMPL    BX, $1
	JAE     copyTag

	// ----------------------------------------
	// Literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	SHRL $2, CX
	CMPL CX, $60
	JAE  litLen60Plus

	// case x < 60:
	// s++
	INCQ SI

emitLiteral:
	// Every literal case of the inner "switch" lands here with CX == x, where
	// x is the variable of that name in decode_other.go and fits in a uint32.
	//
	// length = int(x) + 1
	//
	// The pure Go code also checks for length <= 0. That check is unnecessary
	// here: CX is 64 bits wide, so incrementing a 32-bit value cannot wrap.
	INCQ CX

	// if length > len(dst)-d { corrupt }
	MOVQ R10, AX
	SUBQ DI, AX
	CMPQ CX, AX
	JGT  corrupt

	// if length > len(src)-s { corrupt }
	MOVQ R13, BX
	SUBQ SI, BX
	CMPQ CX, BX
	JGT  corrupt

	// copy(dst[d:], src[s:s+length])
	//
	// That is runtime·memmove(&dst[d], &src[s], length). DI, SI and CX are
	// each stored twice: once as an outgoing argument and once as a spill, so
	// that their values can be reloaded after the CALL.
	MOVQ DI, 0(SP)
	MOVQ DI, 24(SP)
	MOVQ SI, 8(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 16(SP)
	MOVQ CX, 40(SP)
	CALL runtime·memmove(SB)

	// Reload the spilled locals and rebuild R8-R13 from the arguments.
	MOVQ 40(SP), CX
	MOVQ 32(SP), SI
	MOVQ 24(SP), DI
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	LEAQ (R11)(R12*1), R13
	LEAQ (R8)(R9*1), R10

	// s += length
	// d += length
	ADDQ CX, SI
	ADDQ CX, DI
	JMP  decodeLoop

litLen60Plus:
	// !!! The pure Go code repeats
	//
	//	s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// in each of the four cases below. The asm does it once, up front.
	LEAQ -58(SI)(CX*1), SI
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   corrupt

	// Dispatch on x: 61 and anything above 61 branch away, 60 falls through.
	CMPL CX, $61
	JEQ  litLen61
	JA   litLen62Plus

	// case x == 60:
	// x = uint32(src[s-1])
	MOVBLZX -1(SI), CX
	JMP     emitLiteral

litLen61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(SI), CX
	JMP     emitLiteral

litLen62Plus:
	CMPL CX, $62
	JA   litLen63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	//
	// Assembled from a 16-bit load of the low two bytes plus an 8-bit load of
	// the third byte.
	MOVWLZX -3(SI), CX
	MOVBLZX -1(SI), BX
	SHLL    $16, BX
	ORL     BX, CX
	JMP     emitLiteral

litLen63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(SI), CX
	JMP  emitLiteral

	// End of literal tags.
	// ----------------------------------------
	// Copy tags.

copyTag:
	// On entry:
	//	- BX == src[s] & 0x03
	//	- CX == src[s]
	//
	// BX is 1, 2 or 3 here: 2 is tagCopy2, 3 is rejected, 1 falls through.
	CMPQ BX, $2
	JEQ  copyTag2
	JA   unsupportedCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, SI

	// if uint(s) > uint(len(src)) { etc }
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   corrupt

	// offset = int(src[s-2])&0xe0<<3 | int(src[s-1])
	MOVQ    CX, DX
	ANDQ    $0xe0, DX
	SHLQ    $3, DX
	MOVBQZX -1(SI), BX
	ORQ     BX, DX

	// length = 4 + int(src[s-2])>>2&0x7
	SHRQ $2, CX
	ANDQ $7, CX
	ADDQ $4, CX

emitCopy:
	// Every copy case of the outer "switch" lands here with:
	//	- CX == length && CX > 0
	//	- DX == offset

	// if offset <= 0 { etc }
	CMPQ DX, $0
	JLE  corrupt

	// if d < offset { etc }
	MOVQ DI, BX
	SUBQ R8, BX
	CMPQ BX, DX
	JLT  corrupt

	// if length > len(dst)-d { etc }
	MOVQ R10, BX
	SUBQ DI, BX
	CMPQ CX, BX
	JGT  corrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// R15 = &dst[d-offset]
	MOVQ DI, R15
	SUBQ DX, R15

byteCopyLoop:
	// A one-byte-at-a-time forward copy. Because length > 0 is already known,
	// the loop tests its condition at the bottom (a do/while in C terms). In
	// Go syntax:
	//
	//	for {
	//		dst[d] = dst[d - offset]
	//		d++
	//		length--
	//		if length == 0 {
	//			break
	//		}
	//	}
	MOVB (R15), BX
	MOVB BX, (DI)
	INCQ R15
	INCQ DI
	DECQ CX
	JNZ  byteCopyLoop
	JMP  decodeLoop

copyTag2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, SI

	// if uint(s) > uint(len(src)) { etc }
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   corrupt

	// offset = int(src[s-2]) | int(src[s-1])<<8
	MOVWQZX -2(SI), DX

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, CX
	INCQ CX
	JMP  emitCopy

	// End of copy tags.
	// ----------------------------------------

done:
	// The "for s < len(src)" loop has finished.
	//
	// if d != len(dst) { etc }
	CMPQ DI, R10
	JNE  corrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

corrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET

unsupportedCopy4:
	// return decodeErrCodeUnsupportedCopy4Tag
	MOVQ $3, ret+48(FP)
	RET