decode_amd64.s

// Copyright 2016 The Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

#define R_TMP0 AX
#define R_TMP1 BX
#define R_LEN CX
#define R_OFF DX
#define R_SRC SI
#define R_DST DI
#define R_DBASE R8
#define R_DLEN R9
#define R_DEND R10
#define R_SBASE R11
#define R_SLEN R12
#define R_SEND R13
#define R_TMP2 R14
#define R_TMP3 R15

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func s2Decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
// - R_TMP0 scratch
// - R_TMP1 scratch
// - R_LEN length or x (shared)
// - R_OFF offset
// - R_SRC &src[s]
// - R_DST &dst[d]
// + R_DBASE dst_base
// + R_DLEN dst_len
// + R_DEND dst_base + dst_len
// + R_SBASE src_base
// + R_SLEN src_len
// + R_SEND src_base + src_len
// - R_TMP2 used by doCopy
// - R_TMP3 used by doCopy
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
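//
// For orientation, a rough Go-level sketch of the loop this file implements
// (paraphrasing decode_other.go, not quoting it; tag names are the usual
// Snappy/S2 ones):
//
//	for s < len(src) {
//		switch src[s] & 0x03 {
//		case tagLiteral:
//			// x := src[s]>>2 (possibly extended by 1-4 extra bytes),
//			// length = int(x) + 1, then copy length bytes of src to dst.
//		default: // tagCopy1, tagCopy2, tagCopy4
//			// Decode length and offset (a copy-1 tag with offset 0 is an S2
//			// "repeat" that reuses the previous offset), then forward-copy
//			// length bytes within dst from dst[d-offset:].
//		}
//	}
//	// Return 0 on success; any bounds violation returns decodeErrCodeCorrupt.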
TEXT ·s2Decode(SB), NOSPLIT, $48-56
	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DST
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SRC
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND
	XORQ R_OFF, R_OFF

loop:
	// for s < len(src)
	CMPQ R_SRC, R_SEND
	JEQ end

	// R_LEN = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (R_SRC), R_LEN
	MOVL R_LEN, R_TMP1
	ANDL $3, R_TMP1
	CMPL R_TMP1, $1
	JAE tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, R_LEN
	CMPL R_LEN, $60
	JAE tagLit60Plus

	// case x < 60:
	// s++
	INCQ R_SRC

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// R_LEN can hold 64 bits, so the increment cannot overflow.
	INCQ R_LEN

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// R_TMP0 = len(dst) - d
	// R_TMP1 = len(src) - s
	MOVQ R_DEND, R_TMP0
	SUBQ R_DST, R_TMP0
	MOVQ R_SEND, R_TMP1
	SUBQ R_SRC, R_TMP1

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ R_LEN, $16
	JGT callMemmove
	CMPQ R_TMP0, $16
	JLT callMemmove
	CMPQ R_TMP1, $16
	JLT callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R_SRC), X0
	MOVOU X0, 0(R_DST)

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP loop
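	// A hedged Go sketch of the fast path above (illustrative only, not the
	// decode_other.go code): copy a full 16 bytes but advance by length, so a
	// later iteration overwrites any overrun.
	//
	//	if length <= 16 && len(dst)-d >= 16 && len(src)-s >= 16 {
	//		copy(dst[d:d+16], src[s:s+16]) // may write past d+length; fixed up later
	//		d += length
	//		s += length
	//		continue
	//	}
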
callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ R_LEN, R_TMP0
	JGT errCorrupt
	CMPQ R_LEN, R_TMP1
	JGT errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill
	// those three registers to the stack, to save local variables across the CALL.
	MOVQ R_DST, 0(SP)
	MOVQ R_SRC, 8(SP)
	MOVQ R_LEN, 16(SP)
	MOVQ R_DST, 24(SP)
	MOVQ R_SRC, 32(SP)
	MOVQ R_LEN, 40(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R_DBASE-R_SEND.
	MOVQ 24(SP), R_DST
	MOVQ 32(SP), R_SRC
	MOVQ 40(SP), R_LEN
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADDQ R_LEN, R_SRC
	SUBQ $58, R_SRC
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// case x == 60:
	CMPL R_LEN, $61
	JEQ tagLit61
	JA tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(R_SRC), R_LEN
	JMP doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(R_SRC), R_LEN
	JMP doLit

tagLit62Plus:
	CMPL R_LEN, $62
	JA tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	MOVWLZX -3(R_SRC), R_LEN
	MOVBLZX -1(R_SRC), R_TMP1
	SHLL $16, R_TMP1
	ORL R_TMP1, R_LEN
	JMP doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(R_SRC), R_LEN
	JMP doLit
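	// The four x >= 60 cases above are one operation in spirit: read the
	// trailing x-59 bytes as a little-endian value. A hedged Go sketch
	// (illustrative, not how decode_other.go is written):
	//
	//	n := int(x) - 59 // 1, 2, 3 or 4 extra bytes
	//	x = 0
	//	for i := 0; i < n; i++ {
	//		x |= uint32(src[s-n+i]) << (8 * i)
	//	}
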
	// The code above handles literal tags.
	// ----------------------------------------
	// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = 1 + int(src[s-5])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVLQZX -4(R_SRC), R_OFF
	JMP doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVWQZX -2(R_SRC), R_OFF
	JMP doCopy

tagCopy:
	// We have a copy tag. We assume that:
	// - R_TMP1 == src[s] & 0x03
	// - R_LEN == src[s]
	CMPQ R_TMP1, $2
	JEQ tagCopy2
	JA tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	MOVQ R_LEN, R_TMP0
	ANDQ $0xe0, R_TMP0
	SHLQ $3, R_TMP0
	MOVBQZX -1(R_SRC), R_TMP1
	ORQ R_TMP1, R_TMP0

	// length = 4 + int(src[s-2])>>2&0x7
	SHRQ $2, R_LEN
	ANDQ $7, R_LEN
	ADDQ $4, R_LEN

	// Check if this is a repeat code (offset == 0 means "repeat the previous offset").
	CMPQ R_TMP0, $0
	JE repeatCode

	// This is a regular copy: transfer our temporary value to R_OFF (the offset).
	MOVQ R_TMP0, R_OFF
	JMP doCopy
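	// A hedged Go sketch of the tagCopy1 decoding above (illustrative; in S2 a
	// copy-1 tag whose 11-bit offset field is zero is a "repeat" code that
	// reuses the previous offset):
	//
	//	length = 4 + int(src[s-2])>>2&0x7
	//	if off := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])); off != 0 {
	//		offset = off // regular copy-1
	//	} else {
	//		// repeat code: keep the previous offset; length may be extended below
	//	}
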
	// This is a repeat code.
repeatCode:
	// If length < 9, reuse last offset, with the length already calculated.
	CMPQ R_LEN, $9
	JL doCopyRepeat

	// Read additional bytes for length.
	JE repeatLen1

	// Rare, so the extra branch shouldn't hurt too much.
	CMPQ R_LEN, $10
	JE repeatLen2
	JMP repeatLen3

	// Read repeat lengths.
repeatLen1:
	// s++
	ADDQ $1, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = int(src[s-1]) + 8
	MOVBQZX -1(R_SRC), R_LEN
	ADDL $8, R_LEN
	JMP doCopyRepeat

repeatLen2:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + 260
	MOVWQZX -2(R_SRC), R_LEN
	ADDL $260, R_LEN
	JMP doCopyRepeat

repeatLen3:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = int(uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16) + 65540
	MOVBLZX -1(R_SRC), R_TMP0
	MOVWLZX -3(R_SRC), R_LEN
	SHLL $16, R_TMP0
	ORL R_TMP0, R_LEN
	ADDL $65540, R_LEN
	JMP doCopyRepeat
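	// A hedged Go sketch of the three extensions above (illustrative): the 3-bit
	// length field of a repeat tag encodes lengths 4-8 directly, and the decoded
	// values 9, 10 and 11 signal 1, 2 or 3 extra length bytes.
	//
	//	switch {
	//	case length == 9:
	//		length = int(src[s]) + 8
	//		s += 1
	//	case length == 10:
	//		length = int(uint32(src[s])|uint32(src[s+1])<<8) + 260
	//		s += 2
	//	case length == 11:
	//		length = int(uint32(src[s])|uint32(src[s+1])<<8|uint32(src[s+2])<<16) + 65540
	//		s += 3
	//	}
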
doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	// - R_LEN == length && R_LEN > 0
	// - R_OFF == offset

	// if d < offset { etc }
	MOVQ R_DST, R_TMP1
	SUBQ R_DBASE, R_TMP1
	CMPQ R_TMP1, R_OFF
	JLT errCorrupt

	// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:
	// if offset <= 0 { etc }
	CMPQ R_OFF, $0
	JLE errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R_DEND, R_TMP1
	SUBQ R_DST, R_TMP1
	CMPQ R_LEN, R_TMP1
	JGT errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	// - R_TMP2 = len(dst)-d
	// - R_TMP3 = &dst[d-offset]
	MOVQ R_DEND, R_TMP2
	SUBQ R_DST, R_TMP2
	MOVQ R_DST, R_TMP3
	SUBQ R_OFF, R_TMP3

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ R_LEN, $16
	JGT slowForwardCopy
	CMPQ R_OFF, $8
	JLT slowForwardCopy
	CMPQ R_TMP2, $16
	JLT slowForwardCopy
	MOVQ 0(R_TMP3), R_TMP0
	MOVQ R_TMP0, 0(R_DST)
	MOVQ 8(R_TMP3), R_TMP1
	MOVQ R_TMP1, 8(R_DST)
	ADDQ R_LEN, R_DST
	JMP loop
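	// A hedged Go sketch of the fast forward copy above (illustrative): because
	// offset >= 8, the first 8 destination bytes never overlap their source, and
	// storing them before the second load keeps the copy correct when offset is
	// in [8, 16).
	//
	//	if length <= 16 && offset >= 8 && len(dst)-d >= 16 {
	//		copy(dst[d:d+8], dst[d-offset:d-offset+8])
	//		copy(dst[d+8:d+16], dst[d-offset+8:d-offset+16])
	//		d += length
	//		continue
	//	}
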
slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load/stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes. However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	// abxxxxxxxxxxxx
	// [------]           d-offset
	//   [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	// ababxxxxxxxxxx
	// [------]           d-offset
	//     [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUBQ $10, R_TMP2
	CMPQ R_LEN, R_TMP2
	JGT verySlowForwardCopy

	// We want to keep the offset, so we use R_TMP2 from here.
	MOVQ R_OFF, R_TMP2

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together mean that d-offset, and therefore
	//   // R_TMP3, is unchanged.
	// }
	CMPQ R_TMP2, $8
	JGE fixUpSlowForwardCopy
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_DST)
	SUBQ R_TMP2, R_LEN
	ADDQ R_TMP2, R_DST
	ADDQ R_TMP2, R_TMP2
	JMP makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by R_DST being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that,
	// if length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVQ R_DST, R_TMP0
	ADDQ R_LEN, R_DST

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	CMPQ R_LEN, $0
	JLE loop
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_TMP0)
	ADDQ $8, R_TMP3
	ADDQ $8, R_TMP0
	SUBQ $8, R_LEN
	JMP finishSlowForwardCopy
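	// A hedged Go sketch of makeOffsetAtLeast8 + fixUpSlowForwardCopy +
	// finishSlowForwardCopy (illustrative; it relies on being allowed to overrun
	// by up to 10 bytes, which the length <= len(dst)-d-10 check above
	// guarantees):
	//
	//	from := d - offset
	//	for offset < 8 {
	//		copy(dst[d:d+8], dst[from:from+8]) // doubles the repeating pattern
	//		length -= offset
	//		d += offset
	//		offset += offset // from == d-offset stays put
	//	}
	//	for i := 0; i < length; i += 8 {
	//		copy(dst[d+i:d+i+8], dst[from+i:from+i+8])
	//	}
	//	d += length // length may have gone negative; this rewinds d correctly
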
verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R_TMP3), R_TMP1
	MOVB R_TMP1, (R_DST)
	INCQ R_TMP3
	INCQ R_DST
	DECQ R_LEN
	JNZ verySlowForwardCopy
	JMP loop

	// The code above handles copy tags.
	// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ R_DST, R_DEND
	JNE errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET
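// The Go side needs a body-less declaration of this routine, built under the
// same constraints. A minimal sketch (the file name and surrounding build tags
// are assumptions, not taken from this file):
//
//	// decode_amd64.go (sketch)
//	// s2Decode returns 0 on success or decodeErrCodeCorrupt (1) on failure,
//	// matching the MOVQ $0 / MOVQ $1 stores to ret+48(FP) above.
//	func s2Decode(dst, src []byte) int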