// encode_amd64.s
  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build !appengine
  5. // +build gc
  6. // +build !noasm
  7. #include "textflag.h"
  8. // The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
  9. // Go toolchain regression. See https://github.com/golang/go/issues/15426 and
  10. // https://github.com/golang/snappy/issues/29
  11. //
  12. // As a workaround, the package was built with a known good assembler, and
  13. // those instructions were disassembled by "objdump -d" to yield the
  14. // 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
  15. // style comments, in AT&T asm syntax. Note that rsp here is a physical
  16. // register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
  17. // The instructions were then encoded as "BYTE $0x.." sequences, which assemble
  18. // fine on Go 1.6.
  19. // The asm code generally follows the pure Go code in encode_other.go, except
  20. // where marked with a "!!!".
  21. // ----------------------------------------------------------------------------
  22. // func emitLiteral(dst, lit []byte) int
  23. //
  24. // All local variables fit into registers. The register allocation:
  25. // - AX return value
  26. // - BX n
  27. // - CX len(lit)
  28. // - SI &lit[0]
  29. // - DI &dst[i]
  30. //
  31. // The 24 bytes of stack space is to call runtime·memmove.
  32. TEXT ·emitLiteral(SB), NOSPLIT, $24-56
  33. MOVQ dst_base+0(FP), DI
  34. MOVQ lit_base+24(FP), SI
  35. MOVQ lit_len+32(FP), CX
  36. MOVQ CX, AX
  37. MOVL CX, BX
  // n := len(lit) - 1. The literal tag encodes n, not len(lit), and the
  // 1-/2-/3-byte tag form is chosen by the size of n.
  38. SUBL $1, BX
  39. CMPL BX, $60
  40. JLT oneByte
  41. CMPL BX, $256
  42. JLT twoBytes
  43. threeBytes:
  // 0xf4 == 61<<2 (low two tag bits 0b00 = literal): the length n follows
  // the tag byte as a uint16.
  44. MOVB $0xf4, 0(DI)
  // MOVW stores n as a little-endian uint16 at dst[1:3].
  45. MOVW BX, 1(DI)
  46. ADDQ $3, DI
  47. ADDQ $3, AX
  48. JMP emitLiteralEnd
  49. twoBytes:
  // 0xf0 == 60<<2: the length n follows the tag byte as a single uint8.
  50. MOVB $0xf0, 0(DI)
  51. MOVB BX, 1(DI)
  52. ADDQ $2, DI
  53. ADDQ $2, AX
  54. JMP emitLiteralEnd
  55. oneByte:
  // For n < 60 the length fits in the tag byte itself: n<<2.
  56. SHLB $2, BX
  57. MOVB BX, 0(DI)
  58. ADDQ $1, DI
  59. ADDQ $1, AX
  60. emitLiteralEnd:
  // Return value = tag size + len(lit) (AX was initialized from CX above).
  // It is stored before the memmove call, which clobbers the scratch regs.
  61. MOVQ AX, ret+48(FP)
  62. // copy(dst[i:], lit)
  63. //
  64. // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
  65. // DI, SI and CX as arguments.
  66. MOVQ DI, 0(SP)
  67. MOVQ SI, 8(SP)
  68. MOVQ CX, 16(SP)
  69. CALL runtime·memmove(SB)
  70. RET
  71. // ----------------------------------------------------------------------------
  72. // func emitCopy(dst []byte, offset, length int) int
  73. //
  74. // All local variables fit into registers. The register allocation:
  75. // - AX length
  76. // - SI &dst[0]
  77. // - DI &dst[i]
  78. // - R11 offset
  79. //
  80. // The unusual register allocation of AX and R11 for local variables matches
  81. // the allocation used at the call site in encodeBlock, which makes it easier
  82. // to manually inline this function.
  83. TEXT ·emitCopy(SB), NOSPLIT, $0-48
  84. MOVQ dst_base+0(FP), DI
  // SI keeps &dst[0] so the byte count can be computed at the end.
  85. MOVQ DI, SI
  86. MOVQ offset+24(FP), R11
  87. MOVQ length+32(FP), AX
  88. loop0:
  89. // for length >= 68 { etc }
  90. CMPL AX, $68
  91. JLT step1
  92. // Emit a length 64 copy, encoded as 3 bytes.
  // 0xfe == 63<<2 | 0b10: a 3-byte copy tag whose length field 63 means
  // length 64; the offset follows as a little-endian uint16.
  93. MOVB $0xfe, 0(DI)
  94. MOVW R11, 1(DI)
  95. ADDQ $3, DI
  96. SUBL $64, AX
  97. JMP loop0
  98. step1:
  99. // if length > 64 { etc }
  100. CMPL AX, $64
  101. JLE step2
  102. // Emit a length 60 copy, encoded as 3 bytes.
  // 0xee == 59<<2 | 0b10: length field 59 means length 60. Emitting 60 (not
  // 64) here leaves at least 4 bytes of length for the final copy below.
  103. MOVB $0xee, 0(DI)
  104. MOVW R11, 1(DI)
  105. ADDQ $3, DI
  106. SUBL $60, AX
  107. step2:
  108. // if length >= 12 || offset >= 2048 { goto step3 }
  109. CMPL AX, $12
  110. JGE step3
  111. CMPL R11, $2048
  112. JGE step3
  113. // Emit the remaining copy, encoded as 2 bytes.
  // 2-byte copy tag layout: byte 0 is (offset>>8)<<5 | (length-4)<<2 | 0b01,
  // byte 1 is the low 8 bits of offset. The byte-1 store happens first,
  // while R11 still holds the full offset.
  114. MOVB R11, 1(DI)
  115. SHRL $8, R11
  116. SHLB $5, R11
  117. SUBB $4, AX
  118. SHLB $2, AX
  119. ORB AX, R11
  120. ORB $1, R11
  121. MOVB R11, 0(DI)
  122. ADDQ $2, DI
  123. // Return the number of bytes written.
  124. SUBQ SI, DI
  125. MOVQ DI, ret+40(FP)
  126. RET
  127. step3:
  128. // Emit the remaining copy, encoded as 3 bytes.
  // 3-byte copy tag: (length-1)<<2 | 0b10, then the little-endian uint16 offset.
  129. SUBL $1, AX
  130. SHLB $2, AX
  131. ORB $2, AX
  132. MOVB AX, 0(DI)
  133. MOVW R11, 1(DI)
  134. ADDQ $3, DI
  135. // Return the number of bytes written.
  136. SUBQ SI, DI
  137. MOVQ DI, ret+40(FP)
  138. RET
  139. // ----------------------------------------------------------------------------
  140. // func extendMatch(src []byte, i, j int) int
  141. //
  142. // All local variables fit into registers. The register allocation:
  143. // - CX &src[0]
  144. // - DX &src[len(src)]
  145. // - SI &src[i]
  146. // - DI &src[j]
  147. // - R9 &src[len(src) - 8]
  148. TEXT ·extendMatch(SB), NOSPLIT, $0-48
  149. MOVQ src_base+0(FP), CX
  150. MOVQ src_len+8(FP), DX
  151. MOVQ i+24(FP), SI
  152. MOVQ j+32(FP), DI
  // Convert len(src), i and j from indices into pointers into src by adding
  // the base pointer CX to each.
  153. ADDQ CX, DX
  154. ADDQ CX, SI
  155. ADDQ CX, DI
  // R9 = &src[len(src) - 8], the last position where an 8-byte load is safe.
  156. MOVQ DX, R9
  157. SUBQ $8, R9
  158. cmp8:
  159. // As long as we are 8 or more bytes before the end of src, we can load and
  160. // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
  // JA/JAE are unsigned comparisons, as DI, R9 and DX are pointers.
  161. CMPQ DI, R9
  162. JA cmp1
  163. MOVQ (SI), AX
  164. MOVQ (DI), BX
  165. CMPQ AX, BX
  166. JNE bsf
  167. ADDQ $8, SI
  168. ADDQ $8, DI
  169. JMP cmp8
  170. bsf:
  171. // If those 8 bytes were not equal, XOR the two 8 byte values, and return
  172. // the index of the first byte that differs. The BSF instruction finds the
  173. // least significant 1 bit, the amd64 architecture is little-endian, and
  174. // the shift by 3 converts a bit index to a byte index.
  175. XORQ AX, BX
  176. BSFQ BX, BX
  177. SHRQ $3, BX
  178. ADDQ BX, DI
  179. // Convert from &src[ret] to ret.
  180. SUBQ CX, DI
  181. MOVQ DI, ret+40(FP)
  182. RET
  183. cmp1:
  184. // In src's tail, compare 1 byte at a time.
  185. CMPQ DI, DX
  186. JAE extendMatchEnd
  187. MOVB (SI), AX
  188. MOVB (DI), BX
  189. CMPB AX, BX
  190. JNE extendMatchEnd
  191. ADDQ $1, SI
  192. ADDQ $1, DI
  193. JMP cmp1
  194. extendMatchEnd:
  195. // Convert from &src[ret] to ret.
  196. SUBQ CX, DI
  197. MOVQ DI, ret+40(FP)
  198. RET
  199. // ----------------------------------------------------------------------------
  200. // func encodeBlock(dst, src []byte) (d int)
  201. //
  202. // All local variables fit into registers, other than "var table". The register
  203. // allocation:
  204. // - AX . .
  205. // - BX . .
  206. // - CX 56 shift (note that amd64 shifts by non-immediates must use CX).
  207. // - DX 64 &src[0], tableSize
  208. // - SI 72 &src[s]
  209. // - DI 80 &dst[d]
  210. // - R9 88 sLimit
  211. // - R10 . &src[nextEmit]
  212. // - R11 96 prevHash, currHash, nextHash, offset
  213. // - R12 104 &src[base], skip
  214. // - R13 . &src[nextS]
  215. // - R14 . len(src), bytesBetweenHashLookups, x
  216. // - R15 112 candidate
  217. //
  218. // The second column (56, 64, etc) is the stack offset to spill the registers
  219. // when calling other functions. We could pack this slightly tighter, but it's
  220. // simpler to have a dedicated spill map independent of the function called.
  221. //
  222. // "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
  223. // extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
  224. // local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
  225. TEXT ·encodeBlock(SB), 0, $32888-56
  226. MOVQ dst_base+0(FP), DI
  227. MOVQ src_base+24(FP), SI
  228. MOVQ src_len+32(FP), R14
  229. // shift, tableSize := uint32(32-8), 1<<8
  230. MOVQ $24, CX
  231. MOVQ $256, DX
  232. calcShift:
  233. // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
  234. // shift--
  235. // }
  // maxTableSize is 16384 entries (of uint16, i.e. the 32768-byte table).
  236. CMPQ DX, $16384
  237. JGE varTable
  238. CMPQ DX, R14
  239. JGE varTable
  240. SUBQ $1, CX
  241. SHLQ $1, DX
  242. JMP calcShift
  243. varTable:
  244. // var table [maxTableSize]uint16
  245. //
  246. // In the asm code, unlike the Go code, we can zero-initialize only the
  247. // first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
  248. // writes 16 bytes, so we can do only tableSize/8 writes instead of the
  249. // 2048 writes that would zero-initialize all of table's 32768 bytes.
  250. SHRQ $3, DX
  251. LEAQ table-32768(SP), BX
  252. PXOR X0, X0
  253. memclr:
  254. MOVOU X0, 0(BX)
  255. ADDQ $16, BX
  256. SUBQ $1, DX
  257. JNZ memclr
  258. // !!! DX = &src[0]
  259. MOVQ SI, DX
  260. // sLimit := len(src) - inputMargin
  // (inputMargin == 15, per the SUBQ immediate below.)
  261. MOVQ R14, R9
  262. SUBQ $15, R9
  263. // !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
  264. // change for the rest of the function.
  265. MOVQ CX, 56(SP)
  266. MOVQ DX, 64(SP)
  267. MOVQ R9, 88(SP)
  268. // nextEmit := 0
  269. MOVQ DX, R10
  270. // s := 1
  271. ADDQ $1, SI
  272. // nextHash := hash(load32(src, s), shift)
  // The hash is (u * 0x1e35a7bd) >> shift, matching the pure Go code in
  // encode_other.go; the same multiply/shift pair recurs below.
  273. MOVL 0(SI), R11
  274. IMULL $0x1e35a7bd, R11
  275. SHRL CX, R11
  276. outer:
  277. // for { etc }
  278. // skip := 32
  279. MOVQ $32, R12
  280. // nextS := s
  281. MOVQ SI, R13
  282. // candidate := 0
  283. MOVQ $0, R15
  284. inner0:
  285. // for { etc }
  286. // s := nextS
  287. MOVQ R13, SI
  288. // bytesBetweenHashLookups := skip >> 5
  289. MOVQ R12, R14
  290. SHRQ $5, R14
  291. // nextS = s + bytesBetweenHashLookups
  292. ADDQ R14, R13
  293. // skip += bytesBetweenHashLookups
  294. ADDQ R14, R12
  295. // if nextS > sLimit { goto emitRemainder }
  // AX = nextS as an index (pointer minus base); JA is an unsigned compare.
  296. MOVQ R13, AX
  297. SUBQ DX, AX
  298. CMPQ AX, R9
  299. JA emitRemainder
  300. // candidate = int(table[nextHash])
  301. // XXX: MOVWQZX table-32768(SP)(R11*2), R15
  302. // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
  303. BYTE $0x4e
  304. BYTE $0x0f
  305. BYTE $0xb7
  306. BYTE $0x7c
  307. BYTE $0x5c
  308. BYTE $0x78
  309. // table[nextHash] = uint16(s)
  310. MOVQ SI, AX
  311. SUBQ DX, AX
  312. // XXX: MOVW AX, table-32768(SP)(R11*2)
  313. // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
  314. BYTE $0x66
  315. BYTE $0x42
  316. BYTE $0x89
  317. BYTE $0x44
  318. BYTE $0x5c
  319. BYTE $0x78
  320. // nextHash = hash(load32(src, nextS), shift)
  321. MOVL 0(R13), R11
  322. IMULL $0x1e35a7bd, R11
  323. SHRL CX, R11
  324. // if load32(src, s) != load32(src, candidate) { continue } break
  325. MOVL 0(SI), AX
  326. MOVL (DX)(R15*1), BX
  327. CMPL AX, BX
  328. JNE inner0
  329. fourByteMatch:
  330. // As per the encode_other.go code:
  331. //
  332. // A 4-byte match has been found. We'll later see etc.
  333. // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
  334. // on inputMargin in encode.go.
  335. MOVQ SI, AX
  336. SUBQ R10, AX
  337. CMPQ AX, $16
  338. JLE emitLiteralFastPath
  339. // d += emitLiteral(dst[d:], src[nextEmit:s])
  340. //
  341. // Push args.
  342. MOVQ DI, 0(SP)
  343. MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
  344. MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  345. MOVQ R10, 24(SP)
  346. MOVQ AX, 32(SP)
  347. MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
  348. // Spill local variables (registers) onto the stack; call; unspill.
  // The spill offsets (56, 64, 72, ...) are the per-register slots listed in
  // the function header comment.
  349. MOVQ SI, 72(SP)
  350. MOVQ DI, 80(SP)
  351. MOVQ R15, 112(SP)
  352. CALL ·emitLiteral(SB)
  353. MOVQ 56(SP), CX
  354. MOVQ 64(SP), DX
  355. MOVQ 72(SP), SI
  356. MOVQ 80(SP), DI
  357. MOVQ 88(SP), R9
  358. MOVQ 112(SP), R15
  359. // Finish the "d +=" part of "d += emitLiteral(etc)".
  // 48(SP) is the callee's return-value slot in the outgoing args area.
  360. ADDQ 48(SP), DI
  361. JMP inner1
  362. emitLiteralFastPath:
  363. // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
  364. MOVB AX, BX
  365. SUBB $1, BX
  366. SHLB $2, BX
  367. MOVB BX, (DI)
  368. ADDQ $1, DI
  369. // !!! Implement the copy from lit to dst as a 16-byte load and store.
  370. // (Encode's documentation says that dst and src must not overlap.)
  371. //
  372. // This always copies 16 bytes, instead of only len(lit) bytes, but that's
  373. // OK. Subsequent iterations will fix up the overrun.
  374. //
  375. // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
  376. // 16-byte loads and stores. This technique probably wouldn't be as
  377. // effective on architectures that are fussier about alignment.
  378. MOVOU 0(R10), X0
  379. MOVOU X0, 0(DI)
  380. ADDQ AX, DI
  381. inner1:
  382. // for { etc }
  383. // base := s
  384. MOVQ SI, R12
  385. // !!! offset := base - candidate
  // R11 = &src[base] - candidate - &src[0] == base - candidate.
  386. MOVQ R12, R11
  387. SUBQ R15, R11
  388. SUBQ DX, R11
  389. // s = extendMatch(src, candidate+4, s+4)
  390. //
  391. // Push args.
  392. MOVQ DX, 0(SP)
  393. MOVQ src_len+32(FP), R14
  394. MOVQ R14, 8(SP)
  395. MOVQ R14, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  396. ADDQ $4, R15
  397. MOVQ R15, 24(SP)
  398. ADDQ $4, SI
  399. SUBQ DX, SI
  400. MOVQ SI, 32(SP)
  401. // Spill local variables (registers) onto the stack; call; unspill.
  402. //
  403. // We don't need to unspill CX or R9 as we are just about to call another
  404. // function.
  405. MOVQ DI, 80(SP)
  406. MOVQ R11, 96(SP)
  407. MOVQ R12, 104(SP)
  408. CALL ·extendMatch(SB)
  409. MOVQ 64(SP), DX
  410. MOVQ 80(SP), DI
  411. MOVQ 96(SP), R11
  412. MOVQ 104(SP), R12
  413. // Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
  414. // register holds &src[s], not s.
  415. MOVQ 40(SP), SI
  416. ADDQ DX, SI
  417. // d += emitCopy(dst[d:], base-candidate, s-base)
  418. //
  419. // Push args.
  420. MOVQ DI, 0(SP)
  421. MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
  422. MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  423. MOVQ R11, 24(SP)
  424. MOVQ SI, AX
  425. SUBQ R12, AX
  426. MOVQ AX, 32(SP)
  427. // Spill local variables (registers) onto the stack; call; unspill.
  428. MOVQ SI, 72(SP)
  429. MOVQ DI, 80(SP)
  430. CALL ·emitCopy(SB)
  431. MOVQ 56(SP), CX
  432. MOVQ 64(SP), DX
  433. MOVQ 72(SP), SI
  434. MOVQ 80(SP), DI
  435. MOVQ 88(SP), R9
  436. // Finish the "d +=" part of "d += emitCopy(etc)".
  437. ADDQ 40(SP), DI
  438. // nextEmit = s
  439. MOVQ SI, R10
  440. // if s >= sLimit { goto emitRemainder }
  // Index-space unsigned compare, as at the top of inner0.
  441. MOVQ SI, AX
  442. SUBQ DX, AX
  443. CMPQ AX, R9
  444. JAE emitRemainder
  445. // As per the encode_other.go code:
  446. //
  447. // We could immediately etc.
  448. // x := load64(src, s-1)
  449. MOVQ -1(SI), R14
  450. // prevHash := hash(uint32(x>>0), shift)
  451. MOVL R14, R11
  452. IMULL $0x1e35a7bd, R11
  453. SHRL CX, R11
  454. // table[prevHash] = uint16(s-1)
  455. MOVQ SI, AX
  456. SUBQ DX, AX
  457. SUBQ $1, AX
  458. // XXX: MOVW AX, table-32768(SP)(R11*2)
  459. // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
  460. BYTE $0x66
  461. BYTE $0x42
  462. BYTE $0x89
  463. BYTE $0x44
  464. BYTE $0x5c
  465. BYTE $0x78
  466. // currHash := hash(uint32(x>>8), shift)
  467. SHRQ $8, R14
  468. MOVL R14, R11
  469. IMULL $0x1e35a7bd, R11
  470. SHRL CX, R11
  471. // candidate = int(table[currHash])
  472. // XXX: MOVWQZX table-32768(SP)(R11*2), R15
  473. // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
  474. BYTE $0x4e
  475. BYTE $0x0f
  476. BYTE $0xb7
  477. BYTE $0x7c
  478. BYTE $0x5c
  479. BYTE $0x78
  480. // table[currHash] = uint16(s)
  // AX still holds s-1 from the table[prevHash] store above.
  481. ADDQ $1, AX
  482. // XXX: MOVW AX, table-32768(SP)(R11*2)
  483. // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
  484. BYTE $0x66
  485. BYTE $0x42
  486. BYTE $0x89
  487. BYTE $0x44
  488. BYTE $0x5c
  489. BYTE $0x78
  490. // if uint32(x>>8) == load32(src, candidate) { continue }
  491. MOVL (DX)(R15*1), BX
  492. CMPL R14, BX
  493. JEQ inner1
  494. // nextHash = hash(uint32(x>>16), shift)
  // R14 was already shifted right by 8 above, so one more SHRQ $8 yields x>>16.
  495. SHRQ $8, R14
  496. MOVL R14, R11
  497. IMULL $0x1e35a7bd, R11
  498. SHRL CX, R11
  499. // s++
  500. ADDQ $1, SI
  501. // break out of the inner1 for loop, i.e. continue the outer loop.
  502. JMP outer
  503. emitRemainder:
  504. // if nextEmit < len(src) { etc }
  505. MOVQ src_len+32(FP), AX
  506. ADDQ DX, AX
  507. CMPQ R10, AX
  508. JEQ encodeBlockEnd
  509. // d += emitLiteral(dst[d:], src[nextEmit:])
  510. //
  511. // Push args.
  512. MOVQ DI, 0(SP)
  513. MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
  514. MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  515. MOVQ R10, 24(SP)
  516. SUBQ R10, AX
  517. MOVQ AX, 32(SP)
  518. MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
  519. // Spill local variables (registers) onto the stack; call; unspill.
  520. MOVQ DI, 80(SP)
  521. CALL ·emitLiteral(SB)
  522. MOVQ 80(SP), DI
  523. // Finish the "d +=" part of "d += emitLiteral(etc)".
  524. ADDQ 48(SP), DI
  525. encodeBlockEnd:
  // d = &dst[d] - &dst[0].
  526. MOVQ dst_base+0(FP), AX
  527. SUBQ AX, DI
  528. MOVQ DI, d+48(FP)
  529. RET