// decode_amd64.s — LZ4 block decompression, amd64 (Go/Plan 9 assembler syntax).
// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// func decodeBlock(dst, src []byte) int
//
// Decodes one LZ4 block from src into dst.
// Returns the number of bytes written to dst on success,
// -1 when the input is corrupt (zero match offset), or
// -2 when a bounds check on src or dst fails.
//
// Register roles:
//   AX  scratch
//   BX  scratch
//   CX  scratch — current literal length, then match length
//   DX  token, later reused for the 16-bit match offset
//   DI  write cursor into dst
//   SI  read cursor into src
//   R8  &dst + len(dst)  (one past end of dst)
//   R9  &src + len(src)  (one past end of src)
//   R11 &dst             (start of dst; used to compute the return value)
//   R12 short output end = R8-32: while DI < R12 the shortcut's 16/18-byte
//       stores cannot overrun dst
//   R13 short input end  = R9-16: while SI < R13 the shortcut's 16-byte
//       load cannot overrun src
//
// using 50 bytes of stack currently
TEXT ·decodeBlock(SB), NOSPLIT, $64-56
	MOVQ dst_base+0(FP), DI
	MOVQ DI, R11
	MOVQ dst_len+8(FP), R8
	ADDQ DI, R8
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R9
	ADDQ SI, R9

	// Precompute the conservative bounds used by the copy shortcut.
	// short output end
	MOVQ R8, R12
	SUBQ $32, R12
	// short input end
	MOVQ R9, R13
	SUBQ $16, R13

loop:
	// for si < len(src)
	CMPQ SI, R9
	JGE end

	// token := uint32(src[si]); si++
	MOVBQZX (SI), DX
	INCQ SI

	// lit_len = token >> 4
	// CX = lit_len
	MOVQ DX, CX
	SHRQ $4, CX

	// Take the slow path when the literal length needs extension bytes
	// (nibble == 0xF) or when either cursor is too close to its buffer
	// end for the wide shortcut copies below.
	CMPQ CX, $0xF
	JEQ lit_len_loop_pre
	CMPQ DI, R12
	JGE lit_len_loop_pre
	CMPQ SI, R13
	JGE lit_len_loop_pre

	// copy shortcut
	// A two-stage shortcut for the most common case:
	// 1) If the literal length is 0..14, and there is enough space,
	// enter the shortcut and copy 16 bytes on behalf of the literals
	// (in the fast mode, only 8 bytes can be safely copied this way).
	// 2) Further if the match length is 4..18, copy 18 bytes in a similar
	// manner; but we ensure that there's enough space in the output for
	// those 18 bytes earlier, upon entering the shortcut (in other words,
	// there is a combined check for both stages).

	// copy literal: blindly store 16 bytes, then advance both cursors by
	// the true literal length; over-copied bytes are overwritten later.
	MOVOU (SI), X0
	MOVOU X0, (DI)
	ADDQ CX, DI
	ADDQ CX, SI

	// CX = match-length nibble = token & 0xF
	MOVQ DX, CX
	ANDQ $0xF, CX

	// The second stage: prepare for match copying, decode full info.
	// If it doesn't work out, the info won't be wasted.
	// offset := uint16(data[:2])
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// AX = candidate match position = di - offset.
	// AX > DI can only happen on wraparound, i.e. an invalid position.
	MOVQ DI, AX
	SUBQ DX, AX
	CMPQ AX, DI
	JGT err_short_buf

	// if we can't do the second stage then jump straight to read the
	// match length, we already have the offset.
	CMPQ CX, $0xF
	JEQ match_len_loop_pre
	CMPQ DX, $8
	JLT match_len_loop_pre
	CMPQ AX, R11
	JLT err_short_buf

	// Copy an 18-byte match (covers true lengths 4..18); trailing extra
	// bytes are overwritten by the next iteration.
	// memcpy(op + 0, match + 0, 8);
	MOVQ (AX), BX
	MOVQ BX, (DI)
	// memcpy(op + 8, match + 8, 8);
	MOVQ 8(AX), BX
	MOVQ BX, 8(DI)
	// memcpy(op +16, match +16, 2);
	MOVW 16(AX), BX
	MOVW BX, 16(DI)

	// Advance by the true match length: minMatch (4) plus the nibble.
	ADDQ $4, DI // minmatch
	ADDQ CX, DI

	// shortcut complete, load next token
	JMP loop

lit_len_loop_pre:
	// if lit_len > 0
	CMPQ CX, $0
	JEQ offset
	CMPQ CX, $0xF
	JNE copy_literal

lit_len_loop:
	// Accumulate literal-length extension bytes: each 0xFF adds 255.
	// for src[si] == 0xFF
	CMPB (SI), $0xFF
	JNE lit_len_finalise
	// bounds check src[si+1]
	MOVQ SI, AX
	ADDQ $1, AX
	CMPQ AX, R9
	JGT err_short_buf
	// lit_len += 0xFF
	ADDQ $0xFF, CX
	INCQ SI
	JMP lit_len_loop

lit_len_finalise:
	// Final (non-0xFF) extension byte terminates the sequence.
	// lit_len += int(src[si]); si++
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI

copy_literal:
	// bounds check: si + lit_len must not pass the end of src
	MOVQ SI, AX
	ADDQ CX, AX
	CMPQ AX, R9
	JGT err_short_buf
	// bounds check: di + lit_len must not pass the end of dst
	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// Short literals use a single 16-byte store when both buffers have at
	// least 16 bytes of headroom; otherwise fall back to runtime.memmove.
	// whats a good cut off to call memmove?
	CMPQ CX, $16
	JGT memmove_lit
	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_lit
	// if len(src[si:]) < 16
	MOVQ R9, AX
	SUBQ SI, AX
	CMPQ AX, $16
	JLT memmove_lit
	MOVOU (SI), X0
	MOVOU X0, (DI)
	JMP finish_lit_copy

memmove_lit:
	// memmove(to, from, len) — Go ABI0 arguments on the stack
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	// Spill live registers: the call may clobber all general registers.
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	MOVB DX, 48(SP)
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	MOVB 48(SP), DX

	// Recompute the derived pointers (R8/R9/R11/R12/R13) from the
	// function arguments instead of spilling them around the call.
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

finish_lit_copy:
	ADDQ CX, SI
	ADDQ CX, DI
	// A block may end right after its literals.
	CMPQ SI, R9
	JGE end

offset:
	// CX := mLen (token still in DX; move it so DX can hold the offset)
	// free up DX to use for offset
	MOVQ DX, CX

	// bounds check: need two bytes of src for the offset
	MOVQ SI, AX
	ADDQ $2, AX
	CMPQ AX, R9
	JGT err_short_buf

	// DX := int(src[si]) | int(src[si+1])<<8
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// 0 offset is invalid
	CMPQ DX, $0
	JEQ err_corrupt

	// CX = match-length nibble of the token
	ANDB $0xF, CX

match_len_loop_pre:
	// if mlen != 0xF
	CMPB CX, $0xF
	JNE copy_match

match_len_loop:
	// Accumulate match-length extension bytes: each 0xFF adds 255.
	// for src[si] == 0xFF
	CMPB (SI), $0xFF
	JNE match_len_finalise
	// bounds check src[si+1]
	MOVQ SI, AX
	ADDQ $1, AX
	CMPQ AX, R9
	JGT err_short_buf
	ADDQ $0xFF, CX
	INCQ SI
	JMP match_len_loop

match_len_finalise:
	// Final (non-0xFF) extension byte terminates the sequence.
	// match_len += int(src[si]); si++
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI

copy_match:
	// mLen += minMatch
	ADDQ $4, CX

	// check we have match_len bytes left in dst
	// di+match_len < len(dst)
	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// DX = offset
	// CX = match_len
	// BX = &dst + (di - offset) = start of the match in already-written output
	MOVQ DI, BX
	SUBQ DX, BX

	// check BX is within dst
	// if BX < &dst
	CMPQ BX, R11
	JLT err_short_buf

	// If the match source region [BX, BX+CX) reaches the write cursor,
	// the copy overlaps and must run byte-by-byte so earlier output
	// feeds later output; otherwise a bulk copy is safe.
	// if offset + match_len < di
	MOVQ BX, AX
	ADDQ CX, AX
	CMPQ DI, AX
	JGT copy_interior_match

copy_match_loop:
	// Overlapping copy, one byte per iteration:
	// for match_len > 0 { dst[di] = dst[i]; di++; i++ }
	MOVB (BX), AX
	MOVB AX, (DI)
	INCQ DI
	INCQ BX
	DECQ CX
	CMPQ CX, $0
	JGT copy_match_loop
	JMP loop

copy_interior_match:
	// Non-overlapping match: one 16-byte store when the length and dst
	// headroom allow it, otherwise runtime.memmove.
	CMPQ CX, $16
	JGT memmove_match
	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_match
	MOVOU (BX), X0
	MOVOU X0, (DI)
	ADDQ CX, DI
	JMP loop

memmove_match:
	// memmove(to, from, len) — Go ABI0 arguments on the stack
	MOVQ DI, 0(SP)
	MOVQ BX, 8(SP)
	MOVQ CX, 16(SP)
	// Spill live registers around the call.
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX

	// Recompute the derived pointers clobbered by the call.
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11 // TODO: make these sensible numbers
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13
	ADDQ CX, DI
	JMP loop

err_corrupt:
	// Corrupt input (zero offset): return -1.
	MOVQ $-1, ret+48(FP)
	RET

err_short_buf:
	// A bounds check failed: return -2.
	MOVQ $-2, ret+48(FP)
	RET

end:
	// Success: return bytes written = di - &dst[0].
	SUBQ R11, DI
	MOVQ DI, ret+48(FP)
	RET