// encode_amd64.s
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.
//
// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".
// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// Writes the snappy literal tag byte(s) for len(lit), then copies lit into
// dst, and returns the total number of bytes written.
//
// All local variables fit into registers. The register allocation:
//	- AX	return value
//	- BX	n
//	- CX	len(lit)
//	- SI	&lit[0]
//	- DI	&dst[i]
//
// The 24 bytes of stack space is to call runtime·memmove.
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), SI
	MOVQ lit_len+32(FP), CX
	MOVQ CX, AX
	MOVL CX, BX
	SUBL $1, BX // n = len(lit) - 1, the value carried by the tag encoding.

	CMPL BX, $60
	JLT  oneByte
	CMPL BX, $256
	JLT  twoBytes

threeBytes:
	// n >= 256: tag byte 0xf4 == 61<<2 announces a 2-byte little-endian
	// length that follows.
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, AX
	JMP  emitLiteralEnd

twoBytes:
	// 60 <= n < 256: tag byte 0xf0 == 60<<2 announces a 1-byte length.
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, AX
	JMP  emitLiteralEnd

oneByte:
	// n < 60: the length fits in the tag byte itself, as n<<2.
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, AX

emitLiteralEnd:
	MOVQ AX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, SI and CX as arguments.
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	CALL runtime·memmove(SB)
	RET
// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// Writes one or more snappy copy elements for (offset, length) into dst and
// returns the number of bytes written.
//
// All local variables fit into registers. The register allocation:
//	- BX	offset
//	- CX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), BX
	MOVQ length+32(FP), CX

loop0:
	// for length >= 68 { etc }
	CMPL CX, $68
	JLT  step1

	// Emit a length 64 copy, encoded as 3 bytes.
	// 0xfe == 63<<2 | 2 (tagCopy2): encoded length 63+1 == 64, followed by a
	// 2-byte little-endian offset.
	MOVB $0xfe, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	SUBL $64, CX
	JMP  loop0

step1:
	// if length > 64 { etc }
	CMPL CX, $64
	JLE  step2

	// Emit a length 60 copy, encoded as 3 bytes.
	// 0xee == 59<<2 | 2 (tagCopy2): encoded length 59+1 == 60, leaving a
	// small remainder for the final copy below.
	MOVB $0xee, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	SUBL $60, CX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL CX, $12
	JGE  step3
	CMPL BX, $2048
	JGE  step3

	// Emit the remaining copy, encoded as 2 bytes (tagCopy1):
	// dst[1] = uint8(offset), dst[0] = (offset>>8)<<5 | (length-4)<<2 | 1.
	MOVB BX, 1(DI)
	SHRL $8, BX
	SHLB $5, BX
	SUBB $4, CX
	SHLB $2, CX
	ORB  CX, BX
	ORB  $1, BX
	MOVB BX, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes (tagCopy2):
	// dst[0] = (length-1)<<2 | 2, then a 2-byte little-endian offset.
	SUBL $1, CX
	SHLB $2, CX
	ORB  $2, CX
	MOVB CX, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET
// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// Advances i and j in lockstep while src[i] == src[j] and j < len(src),
// returning the final j. Compares 8 bytes at a time while safe, then falls
// back to byte-at-a-time in the tail.
//
// All local variables fit into registers. The register allocation:
//	- CX	&src[0]
//	- DX	&src[len(src)]
//	- SI	&src[i]
//	- DI	&src[j]
//	- R9	&src[len(src) - 8]
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), CX
	MOVQ src_len+8(FP), DX
	MOVQ i+24(FP), SI
	MOVQ j+32(FP), DI
	ADDQ CX, DX // DX = one past the end of src.
	ADDQ CX, SI // SI, DI: indices -> pointers.
	ADDQ CX, DI
	MOVQ DX, R9
	SUBQ $8, R9 // R9 = last position where an 8-byte load is in bounds.

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ DI, R9
	JA   cmp1
	MOVQ (SI), AX
	MOVQ (DI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, SI
	ADDQ $8, DI
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, DI

	// Convert from &src[ret] to ret.
	SUBQ CX, DI
	MOVQ DI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ DI, DX
	JAE  extendMatchEnd
	MOVB (SI), AX
	MOVB (DI), BX
	CMPB AX, BX
	JNE  extendMatchEnd
	ADDQ $1, SI
	ADDQ $1, DI
	JMP  cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ CX, DI
	MOVQ DI, ret+40(FP)
	RET
// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS]
//	- R14	.	len(src), bytesBetweenHashLookups, x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVQ $24, CX
	MOVQ $256, DX

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	// 16384 is maxTableSize in entries (16384 uint16s == 32768 bytes).
	CMPQ DX, $16384
	JGE  varTable
	CMPQ DX, R14
	JGE  varTable
	SUBQ $1, CX
	SHLQ $1, DX
	JMP  calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
	SHRQ $3, DX
	LEAQ table-32768(SP), BX
	PXOR X0, X0

memclr:
	MOVOU X0, 0(BX)
	ADDQ  $16, BX
	SUBQ  $1, DX
	JNZ   memclr

	// !!! DX = &src[0]
	MOVQ SI, DX

	// sLimit := len(src) - inputMargin (inputMargin == 15 here).
	MOVQ R14, R9
	SUBQ $15, R9

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVQ CX, 56(SP)
	MOVQ DX, 64(SP)
	MOVQ R9, 88(SP)

	// nextEmit := 0
	MOVQ DX, R10

	// s := 1
	ADDQ $1, SI

	// nextHash := hash(load32(src, s), shift)
	MOVL  0(SI), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

outer:
	// for { etc }

	// skip := 32
	MOVQ $32, R12

	// nextS := s
	MOVQ SI, R13

	// candidate := 0
	MOVQ $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVQ R13, SI

	// bytesBetweenHashLookups := skip >> 5
	MOVQ R12, R14
	SHRQ $5, R14

	// nextS = s + bytesBetweenHashLookups
	ADDQ R14, R13

	// skip += bytesBetweenHashLookups
	ADDQ R14, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVQ R13, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JA   emitRemainder

	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[nextHash] = uint16(s)
	MOVQ SI, AX
	SUBQ DX, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// nextHash = hash(load32(src, nextS), shift)
	MOVL  0(R13), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVL 0(SI), AX
	MOVL (DX)(R15*1), BX
	CMPL AX, BX
	JNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVQ SI, AX
	SUBQ R10, AX
	CMPQ AX, $16
	JLE  emitLiteralFastPath

	// d += emitLiteral(dst[d:], src[nextEmit:s])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	MOVQ R15, 112(SP)
	CALL ·emitLiteral(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9
	MOVQ 112(SP), R15

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI
	JMP  inner1

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB AX, BX
	SUBB $1, BX
	SHLB $2, BX
	MOVB BX, (DI)
	ADDQ $1, DI

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R10), X0
	MOVOU X0, 0(DI)
	ADDQ  AX, DI

inner1:
	// for { etc }

	// base := s
	MOVQ SI, R12

	// !!! offset := base - candidate
	MOVQ R12, R11
	SUBQ R15, R11
	SUBQ DX, R11

	// s = extendMatch(src, candidate+4, s+4)
	//
	// Push args.
	MOVQ DX, 0(SP)
	MOVQ src_len+32(FP), R14
	MOVQ R14, 8(SP)
	MOVQ R14, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	ADDQ $4, R15
	MOVQ R15, 24(SP)
	ADDQ $4, SI
	SUBQ DX, SI
	MOVQ SI, 32(SP)

	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// We don't need to unspill CX or R9 as we are just about to call another
	// function.
	MOVQ DI, 80(SP)
	MOVQ R11, 96(SP)
	MOVQ R12, 104(SP)
	CALL ·extendMatch(SB)
	MOVQ 64(SP), DX
	MOVQ 80(SP), DI
	MOVQ 96(SP), R11
	MOVQ 104(SP), R12

	// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
	// register holds &src[s], not s.
	MOVQ 40(SP), SI
	ADDQ DX, SI

	// d += emitCopy(dst[d:], base-candidate, s-base)
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R11, 24(SP)
	MOVQ SI, AX
	SUBQ R12, AX
	MOVQ AX, 32(SP)

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	CALL ·emitCopy(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9

	// Finish the "d +=" part of "d += emitCopy(etc)".
	ADDQ 40(SP), DI

	// nextEmit = s
	MOVQ SI, R10

	// if s >= sLimit { goto emitRemainder }
	MOVQ SI, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JAE  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)
	MOVQ -1(SI), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// table[prevHash] = uint16(s-1)
	MOVQ SI, AX
	SUBQ DX, AX
	SUBQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// currHash := hash(uint32(x>>8), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[currHash] = uint16(s)
	ADDQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVL (DX)(R15*1), BX
	CMPL R14, BX
	JEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// s++
	ADDQ $1, SI

	// break out of the inner1 for loop, i.e. continue the outer loop.
	JMP outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX
	ADDQ DX, AX
	CMPQ R10, AX
	JEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	SUBQ R10, AX
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ DI, 80(SP)
	CALL ·emitLiteral(SB)
	MOVQ 80(SP), DI

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI

encodeBlockEnd:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, DI
	MOVQ DI, d+48(FP)
	RET