// decode_amd64.s — LZ4 block decoder, amd64 (Go assembly).
// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// Register allocation (fixed for the whole function):
// AX scratch
// BX scratch
// CX scratch (literal length / match length)
// DX token (later: match offset)
//
// DI &dst (current write position)
// SI &src (current read position)
// R8 &dst + len(dst)
// R9 &src + len(src)
// R11 &dst (saved base, used to compute the return value)
// R12 short output end (R8 - 32; margin for 16/18-byte wide copies)
// R13 short input end  (R9 - 16; margin for 16-byte wide loads)

// func decodeBlock(dst, src []byte) int
//
// Decompresses one LZ4 block from src into dst.
// Returns the number of bytes written to dst, or a negative error:
//   -1 : corrupt input (empty src, or a zero match offset)
//   -2 : short buffer (a bounds check on src or dst failed)
//
// using 50 bytes of stack currently
TEXT ·decodeBlock(SB), NOSPLIT, $64-56
	MOVQ dst_base+0(FP), DI
	MOVQ DI, R11
	MOVQ dst_len+8(FP), R8
	ADDQ DI, R8

	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R9
	// empty src is corrupt input
	CMPQ R9, $0
	JE err_corrupt
	ADDQ SI, R9

	// shortcut ends
	// short output end
	MOVQ R8, R12
	SUBQ $32, R12
	// short input end
	MOVQ R9, R13
	SUBQ $16, R13

loop:
	// for si < len(src)
	CMPQ SI, R9
	JGE end

	// token := uint32(src[si])
	MOVBQZX (SI), DX
	INCQ SI

	// lit_len = token >> 4
	// if lit_len > 0
	// CX = lit_len
	MOVQ DX, CX
	SHRQ $4, CX

	// if lit_len != 0xF
	CMPQ CX, $0xF
	JEQ lit_len_loop_pre
	// take the slow path unless there is at least a 32-byte output
	// margin and a 16-byte input margin for the wide copies below
	CMPQ DI, R12
	JGE lit_len_loop_pre
	CMPQ SI, R13
	JGE lit_len_loop_pre

	// copy shortcut
	// A two-stage shortcut for the most common case:
	// 1) If the literal length is 0..14, and there is enough space,
	// enter the shortcut and copy 16 bytes on behalf of the literals
	// (in the fast mode, only 8 bytes can be safely copied this way).
	// 2) Further if the match length is 4..18, copy 18 bytes in a similar
	// manner; but we ensure that there's enough space in the output for
	// those 18 bytes earlier, upon entering the shortcut (in other words,
	// there is a combined check for both stages).

	// copy literal: 16-byte blind copy, then advance by the true lit_len
	MOVOU (SI), X0
	MOVOU X0, (DI)
	ADDQ CX, DI
	ADDQ CX, SI

	// CX = match length nibble of the token
	MOVQ DX, CX
	ANDQ $0xF, CX

	// The second stage: prepare for match copying, decode full info.
	// If it doesn't work out, the info won't be wasted.
	// offset := uint16(data[:2])
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// AX = match source = di - offset
	MOVQ DI, AX
	SUBQ DX, AX
	// NOTE(review): AX = DI - DX <= DI for any uint16 offset, so this
	// signed check looks unreachable — presumably defensive; confirm.
	CMPQ AX, DI
	JGT err_short_buf

	// if we can't do the second stage then jump straight to read the
	// match length, we already have the offset.
	CMPQ CX, $0xF
	JEQ match_len_loop_pre
	CMPQ DX, $8
	JLT match_len_loop_pre
	CMPQ AX, R11
	JLT err_short_buf

	// matches of length 4..18 with offset >= 8: copy 18 bytes
	// memcpy(op + 0, match + 0, 8);
	MOVQ (AX), BX
	MOVQ BX, (DI)
	// memcpy(op + 8, match + 8, 8);
	MOVQ 8(AX), BX
	MOVQ BX, 8(DI)
	// memcpy(op +16, match +16, 2);
	MOVW 16(AX), BX
	MOVW BX, 16(DI)
	// advance di by mLen nibble + 4 (minmatch)
	LEAQ 4(DI)(CX*1), DI // minmatch

	// shortcut complete, load next token
	JMP loop

lit_len_loop_pre:
	// if lit_len > 0
	CMPQ CX, $0
	JEQ offset
	CMPQ CX, $0xF
	JNE copy_literal

lit_len_loop:
	// for src[si] == 0xFF
	CMPB (SI), $0xFF
	JNE lit_len_finalise

	// bounds check src[si+1]
	LEAQ 1(SI), AX
	CMPQ AX, R9
	JGT err_short_buf

	// lit_len += 0xFF
	ADDQ $0xFF, CX
	INCQ SI
	JMP lit_len_loop

lit_len_finalise:
	// lit_len += int(src[si])
	// si++
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI

copy_literal:
	// bounds check src and dst
	LEAQ (SI)(CX*1), AX
	CMPQ AX, R9
	JGT err_short_buf

	LEAQ (DI)(CX*1), AX
	CMPQ AX, R8
	JGT err_short_buf

	// whats a good cut off to call memmove?
	CMPQ CX, $16
	JGT memmove_lit

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_lit

	// if len(src[si:]) < 16
	MOVQ R9, AX
	SUBQ SI, AX
	CMPQ AX, $16
	JLT memmove_lit

	// short literal: single 16-byte copy (over-copy is safe, both
	// margins were just checked)
	MOVOU (SI), X0
	MOVOU X0, (DI)
	JMP finish_lit_copy

memmove_lit:
	// memmove(to, from, len) — args at 0..16(SP)
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)

	// spill live registers to 24..48(SP); memmove may clobber them
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	// NOTE(review): only the token's low byte is spilled/restored here,
	// and the later `ANDB $0xF, CX` masks only CX's low byte — the upper
	// bits of DX after the restore come from whatever memmove left in it.
	// Verify the match-length path cannot see garbage in CX's high bits.
	MOVB DX, 48(SP)
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	MOVB 48(SP), DX

	// recalc initial values (R8/R9/R11/R12/R13 were not spilled)
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

finish_lit_copy:
	ADDQ CX, SI
	ADDQ CX, DI

	// a block may end right after its literals
	CMPQ SI, R9
	JGE end

offset:
	// CX := mLen
	// free up DX to use for offset
	MOVQ DX, CX

	// bounds check: need two offset bytes
	LEAQ 2(SI), AX
	CMPQ AX, R9
	JGT err_short_buf

	// offset
	// DX := int(src[si]) | int(src[si+1])<<8
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// 0 offset is invalid
	CMPQ DX, $0
	JEQ err_corrupt

	// keep only the match-length nibble of the token
	ANDB $0xF, CX

match_len_loop_pre:
	// if mlen != 0xF
	CMPB CX, $0xF
	JNE copy_match

match_len_loop:
	// for src[si] == 0xFF
	// match_len += 0xFF
	CMPB (SI), $0xFF
	JNE match_len_finalise

	// bounds check src[si+1]
	LEAQ 1(SI), AX
	CMPQ AX, R9
	JGT err_short_buf

	ADDQ $0xFF, CX
	INCQ SI
	JMP match_len_loop

match_len_finalise:
	// match_len += int(src[si])
	// si++
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI

copy_match:
	// mLen += minMatch
	ADDQ $4, CX

	// check we have match_len bytes left in dst
	// di+match_len < len(dst)
	LEAQ (DI)(CX*1), AX
	CMPQ AX, R8
	JGT err_short_buf

	// DX = offset
	// CX = match_len
	// BX = &dst + (di - offset)
	MOVQ DI, BX
	SUBQ DX, BX

	// check BX is within dst
	// if BX < &dst
	CMPQ BX, R11
	JLT err_short_buf

	// if offset + match_len < di — source and destination do not overlap,
	// so the bulk copy below is safe; otherwise fall through to the
	// byte-at-a-time overlapping copy
	LEAQ (BX)(CX*1), AX
	CMPQ DI, AX
	JGT copy_interior_match

	// AX := len(dst[:di])
	// MOVQ DI, AX
	// SUBQ R11, AX

	// copy 16 bytes at a time
	// if di-offset < 16 copy 16-(di-offset) bytes to di
	// then do the remaining

copy_match_loop:
	// overlapping match: must copy byte by byte so earlier output
	// bytes are visible to later reads
	// for match_len >= 0
	// dst[di] = dst[i]
	// di++
	// i++
	MOVB (BX), AX
	MOVB AX, (DI)
	INCQ DI
	INCQ BX
	DECQ CX
	CMPQ CX, $0
	JGT copy_match_loop

	JMP loop

copy_interior_match:
	CMPQ CX, $16
	JGT memmove_match

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_match

	// short non-overlapping match: one 16-byte copy
	MOVOU (BX), X0
	MOVOU X0, (DI)
	ADDQ CX, DI
	JMP loop

memmove_match:
	// memmove(to, from, len) — args at 0..16(SP)
	MOVQ DI, 0(SP)
	MOVQ BX, 8(SP)
	MOVQ CX, 16(SP)

	// spill (DX/token is dead here — a fresh token is read at loop:)
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11 // TODO: make these sensible numbers
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

	ADDQ CX, DI
	JMP loop

err_corrupt:
	MOVQ $-1, ret+48(FP)
	RET

err_short_buf:
	MOVQ $-2, ret+48(FP)
	RET

end:
	// return number of bytes written: di - &dst
	SUBQ R11, DI
	MOVQ DI, ret+48(FP)
  303. RET