// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func decode(dst, src []byte) int
//
// decode expands the block in src into dst. It stores 0 in the result when
// all of src has been consumed and exactly len(dst) bytes have been produced;
// every failed check jumps to errCorrupt, which stores 1
// (decodeErrCodeCorrupt).
//
// Frame layout ($48-56):
//	- The 56 bytes of arguments are dst (base, len, cap at 0, 8, 16(FP)), src
//	  (base, len, cap at 24, 32, 40(FP)) and the result at 48(FP). The two cap
//	  words are never read.
//	- The 48 bytes of locals are only used around the CALL to runtime·memmove:
//	  0(SP), 8(SP) and 16(SP) are the outgoing arguments, and 24(SP), 32(SP)
//	  and 40(SP) hold the spilled copies of DI, SI and CX.
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
//	- AX	scratch
//	- BX	scratch
//	- CX	length or x
//	- DX	offset
//	- SI	&src[s]
//	- DI	&dst[d]
//	+ R8	dst_base
//	+ R9	dst_len
//	+ R10	dst_base + dst_len
//	+ R11	src_base
//	+ R12	src_len
//	+ R13	src_base + src_len
//	- R14	used by doCopy
//	- R15	used by doCopy
//
// The registers R8-R13 (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
TEXT ·decode(SB), NOSPLIT, $48-56
	// Initialize SI, DI and R8-R13.
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, DI
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, SI
	MOVQ R11, R13
	ADDQ R12, R13

loop:
	// for s < len(src)
	CMPQ SI, R13
	JEQ  end

	// CX = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (SI), CX
	MOVL    CX, BX
	ANDL    $3, BX
	CMPL    BX, $1
	JAE     tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, CX
	CMPL CX, $60
	JAE  tagLit60Plus

	// case x < 60:
	// s++
	INCQ SI

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that CX == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// CX can hold 64 bits, so the increment cannot overflow.
	INCQ CX

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// AX = len(dst) - d
	// BX = len(src) - s
	MOVQ R10, AX
	SUBQ DI, AX
	MOVQ R13, BX
	SUBQ SI, BX

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ CX, $16
	JGT  callMemmove
	CMPQ AX, $16
	JLT  callMemmove
	CMPQ BX, $16
	JLT  callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(SI), X0
	MOVOU X0, 0(DI)

	// d += length
	// s += length
	ADDQ CX, DI
	ADDQ CX, SI
	JMP  loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ CX, AX
	JGT  errCorrupt
	CMPQ CX, BX
	JGT  errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
	// three registers to the stack, to save local variables across the CALL.
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R8-R13.
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, R13
	ADDQ R12, R13

	// d += length
	// s += length
	ADDQ CX, DI
	ADDQ CX, SI
	JMP  loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADDQ CX, SI
	SUBQ $58, SI
	CMPQ SI, R13
	JA   errCorrupt

	// case x == 60:
	CMPL CX, $61
	JEQ  tagLit61
	JA   tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(SI), CX
	JMP     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(SI), CX
	JMP     doLit

tagLit62Plus:
	CMPL CX, $62
	JA   tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	MOVWLZX -3(SI), CX
	MOVBLZX -1(SI), BX
	SHLL    $16, BX
	ORL     BX, CX
	JMP     doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(SI), CX
	JMP  doLit

// The code above handles literal tags.
// ----------------------------------------
// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, SI

	// if uint(s) > uint(len(src)) { etc }
	CMPQ SI, R13
	JA   errCorrupt

	// length = 1 + int(src[s-5])>>2
	SHRQ $2, CX
	INCQ CX

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVLQZX -4(SI), DX
	JMP     doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, SI

	// if uint(s) > uint(len(src)) { etc }
	CMPQ SI, R13
	JA   errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, CX
	INCQ CX

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVWQZX -2(SI), DX
	JMP     doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- BX == src[s] & 0x03
	//	- CX == src[s]
	CMPQ BX, $2
	JEQ  tagCopy2
	JA   tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, SI

	// if uint(s) > uint(len(src)) { etc }
	CMPQ SI, R13
	JA   errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	MOVQ    CX, DX
	ANDQ    $0xe0, DX
	SHLQ    $3, DX
	MOVBQZX -1(SI), BX
	ORQ     BX, DX

	// length = 4 + int(src[s-2])>>2&0x7
	SHRQ $2, CX
	ANDQ $7, CX
	ADDQ $4, CX

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- CX == length && CX > 0
	//	- DX == offset

	// if offset <= 0 { etc }
	CMPQ DX, $0
	JLE  errCorrupt

	// if d < offset { etc }
	MOVQ DI, BX
	SUBQ R8, BX
	CMPQ BX, DX
	JLT  errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R10, BX
	SUBQ DI, BX
	CMPQ CX, BX
	JGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R14 = len(dst)-d
	//	- R15 = &dst[d-offset]
	MOVQ R10, R14
	SUBQ DI, R14
	MOVQ DI, R15
	SUBQ DX, R15

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ CX, $16
	JGT  slowForwardCopy
	CMPQ DX, $8
	JLT  slowForwardCopy
	CMPQ R14, $16
	JLT  slowForwardCopy
	MOVQ 0(R15), AX
	MOVQ AX, 0(DI)
	MOVQ 8(R15), BX
	MOVQ BX, 8(DI)
	ADDQ CX, DI
	JMP  loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes.  However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	//    abxxxxxxxxxxxx
	//    [------]           d-offset
	//      [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	//    ababxxxxxxxxxx
	//    [------]           d-offset
	//        [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUBQ $10, R14
	CMPQ CX, R14
	JGT  verySlowForwardCopy

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together means that d-offset, and therefore
	//   // R15, is unchanged.
	// }
	CMPQ DX, $8
	JGE  fixUpSlowForwardCopy
	MOVQ (R15), BX
	MOVQ BX, (DI)
	SUBQ DX, CX
	ADDQ DX, DI
	ADDQ DX, DX
	JMP  makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by DI being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save DI to AX so that, if
	// length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVQ DI, AX
	ADDQ CX, DI

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	CMPQ CX, $0
	JLE  loop
	MOVQ (R15), BX
	MOVQ BX, (AX)
	ADDQ $8, R15
	ADDQ $8, AX
	SUBQ $8, CX
	JMP  finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R15), BX
	MOVB BX, (DI)
	INCQ R15
	INCQ DI
	DECQ CX
	JNZ  verySlowForwardCopy
	JMP  loop

// The code above handles copy tags.
// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ DI, R10
	JNE  errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET