// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".

// ----------------------------------------------------------------------------
// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- AX	len(lit)
//	- BX	n (len(lit) - 1, the value encoded in the literal tag)
//	- DX	return value (number of tag bytes, then tag+literal bytes)
//	- DI	&dst[i]
//	- R10	&lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX
	MOVQ AX, DX
	MOVL AX, BX
	SUBL $1, BX

	// Dispatch on n = len(lit)-1: n < 60 takes the 1-byte tag, n < 256 the
	// 2-byte tag, otherwise the 3-byte tag. (This asm assumes len(lit) fits
	// in two bytes; the Go fallback handles longer literals.)
	CMPL BX, $60
	JLT  oneByte
	CMPL BX, $256
	JLT  twoBytes

threeBytes:
	// Tag byte 0xf4 == 61<<2: "literal, length in the next 2 bytes".
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, DX
	JMP  emitLiteralEnd

twoBytes:
	// Tag byte 0xf0 == 60<<2: "literal, length in the next 1 byte".
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, DX
	JMP  emitLiteralEnd

oneByte:
	// 1-byte tag: (len(lit)-1)<<2, low two bits 00 meaning "literal".
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, DX

emitLiteralEnd:
	// The return value counts the tag bytes plus len(lit) (already in DX via
	// the initial MOVQ AX, DX and the ADDQs above).
	MOVQ DX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	CALL runtime·memmove(SB)
	RET
// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- AX	length (remaining length still to be emitted)
//	- SI	&dst[0]
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX

loop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  step1

	// Emit a length 64 copy, encoded as 3 bytes.
	// Tag 0xfe == (64-1)<<2 | tagCopy2; the 2-byte little-endian offset follows.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  loop0

step1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  step2

	// Emit a length 60 copy, encoded as 3 bytes, leaving a remainder of at
	// least 5 so the final copy below is never too short.
	// Tag 0xee == (60-1)<<2 | tagCopy2.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL AX, $12
	JGE  step3
	CMPL R11, $2048
	JGE  step3

	// Emit the remaining copy, encoded as 2 bytes:
	// byte 0 = (offset>>8)<<5 | (length-4)<<2 | tagCopy1, byte 1 = offset&0xff.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes:
	// byte 0 = (length-1)<<2 | tagCopy2, bytes 1-2 = little-endian offset.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET
// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// Returns the largest k such that src[i:i+k-j] and src[j:k] are byte-equal,
// i.e. how far the match starting at (i, j) can be extended.
//
// All local variables fit into registers. The register allocation:
//	- CX	&src[0]
//	- DX	&src[len(src)]
//	- SI	&src[i]
//	- DI	&src[j]
//	- R9	&src[len(src) - 8]
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), CX
	MOVQ src_len+8(FP), DX
	MOVQ i+24(FP), SI
	MOVQ j+32(FP), DI
	ADDQ CX, DX
	ADDQ CX, SI
	ADDQ CX, DI
	MOVQ DX, R9
	SUBQ $8, R9

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	// (JA: pointers are unsigned, so the comparison must be unsigned too.)
	CMPQ DI, R9
	JA   cmp1
	MOVQ (SI), AX
	MOVQ (DI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, SI
	ADDQ $8, DI
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, DI

	// Convert from &src[ret] to ret.
	SUBQ CX, DI
	MOVQ DI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ DI, DX
	JAE  extendMatchEnd
	MOVB (SI), AX
	MOVB (DI), BX
	CMPB AX, BX
	JNE  extendMatchEnd
	ADDQ $1, SI
	ADDQ $1, DI
	JMP  cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ CX, DI
	MOVQ DI, ret+40(FP)
	RET
// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS]
//	- R14	.	len(src), bytesBetweenHashLookups, x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
//
// NOTE(review): the hard-coded BYTE sequences below encode table accesses at
// rsp displacement 0x78; that displacement depends on this exact frame size
// and spill layout. Do not change the frame without re-deriving those bytes.
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVQ $24, CX
	MOVQ $256, DX

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	CMPQ DX, $16384
	JGE  varTable
	CMPQ DX, R14
	JGE  varTable
	SUBQ $1, CX
	SHLQ $1, DX
	JMP  calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
	SHRQ $3, DX
	LEAQ table-32768(SP), BX
	PXOR X0, X0

memclr:
	MOVOU X0, 0(BX)
	ADDQ  $16, BX
	SUBQ  $1, DX
	JNZ   memclr

	// !!! DX = &src[0]
	MOVQ SI, DX

	// sLimit := len(src) - inputMargin
	MOVQ R14, R9
	SUBQ $15, R9

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVQ CX, 56(SP)
	MOVQ DX, 64(SP)
	MOVQ R9, 88(SP)

	// nextEmit := 0
	MOVQ DX, R10

	// s := 1
	ADDQ $1, SI

	// nextHash := hash(load32(src, s), shift)
	MOVL  0(SI), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

outer:
	// for { etc }

	// skip := 32
	MOVQ $32, R12

	// nextS := s
	MOVQ SI, R13

	// candidate := 0
	MOVQ $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVQ R13, SI

	// bytesBetweenHashLookups := skip >> 5
	MOVQ R12, R14
	SHRQ $5, R14

	// nextS = s + bytesBetweenHashLookups
	ADDQ R14, R13

	// skip += bytesBetweenHashLookups
	ADDQ R14, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVQ R13, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JA   emitRemainder

	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[nextHash] = uint16(s)
	MOVQ SI, AX
	SUBQ DX, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// nextHash = hash(load32(src, nextS), shift)
	MOVL  0(R13), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVL 0(SI), AX
	MOVL (DX)(R15*1), BX
	CMPL AX, BX
	JNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVQ SI, AX
	SUBQ R10, AX
	CMPQ AX, $16
	JLE  emitLiteralFastPath

	// d += emitLiteral(dst[d:], src[nextEmit:s])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	MOVQ R15, 112(SP)
	CALL ·emitLiteral(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9
	MOVQ 112(SP), R15

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI
	JMP  inner1

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB AX, BX
	SUBB $1, BX
	SHLB $2, BX
	MOVB BX, (DI)
	ADDQ $1, DI

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R10), X0
	MOVOU X0, 0(DI)
	ADDQ  AX, DI

inner1:
	// for { etc }

	// base := s
	MOVQ SI, R12

	// !!! offset := base - candidate
	MOVQ R12, R11
	SUBQ R15, R11
	SUBQ DX, R11

	// s = extendMatch(src, candidate+4, s+4)
	//
	// Push args.
	MOVQ DX, 0(SP)
	MOVQ src_len+32(FP), R14
	MOVQ R14, 8(SP)
	MOVQ R14, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	ADDQ $4, R15
	MOVQ R15, 24(SP)
	ADDQ $4, SI
	SUBQ DX, SI
	MOVQ SI, 32(SP)

	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// We don't need to unspill CX or R9 as we are just about to call another
	// function.
	MOVQ DI, 80(SP)
	MOVQ R11, 96(SP)
	MOVQ R12, 104(SP)
	CALL ·extendMatch(SB)
	MOVQ 64(SP), DX
	MOVQ 80(SP), DI
	MOVQ 96(SP), R11
	MOVQ 104(SP), R12

	// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
	// register holds &src[s], not s.
	MOVQ 40(SP), SI
	ADDQ DX, SI

	// d += emitCopy(dst[d:], base-candidate, s-base)
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R11, 24(SP)
	MOVQ SI, AX
	SUBQ R12, AX
	MOVQ AX, 32(SP)

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	CALL ·emitCopy(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9

	// Finish the "d +=" part of "d += emitCopy(etc)".
	ADDQ 40(SP), DI

	// nextEmit = s
	MOVQ SI, R10

	// if s >= sLimit { goto emitRemainder }
	MOVQ SI, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JAE  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)
	MOVQ -1(SI), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// table[prevHash] = uint16(s-1)
	MOVQ SI, AX
	SUBQ DX, AX
	SUBQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// currHash := hash(uint32(x>>8), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[currHash] = uint16(s)
	ADDQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVL (DX)(R15*1), BX
	CMPL R14, BX
	JEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	// (R14 already holds x>>8, so one more SHRQ $8 yields x>>16.)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// s++
	ADDQ $1, SI

	// break out of the inner1 for loop, i.e. continue the outer loop.
	JMP outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX
	ADDQ DX, AX
	CMPQ R10, AX
	JEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	SUBQ R10, AX
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ DI, 80(SP)
	CALL ·emitLiteral(SB)
	MOVQ 80(SP), DI

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI

encodeBlockEnd:
	// d = &dst[d] - &dst[0]
	MOVQ dst_base+0(FP), AX
	SUBQ AX, DI
	MOVQ DI, d+48(FP)
	RET