// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- AX	len(lit)
//	- BX	n
//	- DX	return value
//	- DI	&dst[i]
//	- R10	&lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
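//
// As a reading aid, the pure Go version that this asm follows is, in essence
// (a sketch rather than a verbatim copy of encode_other.go; tagLiteral is the
// constant 0):
//
//	func emitLiteral(dst, lit []byte) int {
//		i, n := 0, uint(len(lit)-1)
//		switch {
//		case n < 60:
//			dst[0] = uint8(n)<<2 | tagLiteral // The 1-byte encoding.
//			i = 1
//		case n < 1<<8:
//			dst[0] = 60<<2 | tagLiteral // 0xf0, as in twoBytes below.
//			dst[1] = uint8(n)
//			i = 2
//		default:
//			dst[0] = 61<<2 | tagLiteral // 0xf4, as in threeBytes below.
//			dst[1] = uint8(n)
//			dst[2] = uint8(n >> 8)
//			i = 3
//		}
//		return i + copy(dst[i:], lit)
//	}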
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX
	MOVQ AX, DX
	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  oneByte
	CMPL BX, $256
	JLT  twoBytes

threeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, DX
	JMP  memmove

twoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, DX
	JMP  memmove

oneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, DX

memmove:
	MOVQ DX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	CALL runtime·memmove(SB)
	RET
// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- AX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
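//
// Again as a reading aid, the pure Go version is, in essence (a sketch;
// tagCopy1 is the constant 1 and tagCopy2 is 2):
//
//	func emitCopy(dst []byte, offset, length int) int {
//		i := 0
//		for length >= 68 {
//			dst[i+0] = 63<<2 | tagCopy2 // 0xfe: a length 64 copy.
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			i, length = i+3, length-64
//		}
//		if length > 64 {
//			dst[i+0] = 59<<2 | tagCopy2 // 0xee: a length 60 copy.
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			i, length = i+3, length-60
//		}
//		if length >= 12 || offset >= 2048 {
//			dst[i+0] = uint8(length-1)<<2 | tagCopy2
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			return i + 3
//		}
//		dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
//		dst[i+1] = uint8(offset)
//		return i + 2
//	}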
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX

loop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  step1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  loop0

step1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  step2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL AX, $12
	JGE  step3
	CMPL R11, $2048
	JGE  step3
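	// (A note on those bounds, which can be read off the bit manipulation
	// below: the 2-byte tagCopy1 form packs length-4 into 3 bits and offset
	// into 11 bits, hence the length < 12 and offset < 2048 requirements.)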
	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET
// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- DX	&src[0]
//	- SI	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
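//
// The pure Go version is simply (a sketch):
//
//	func extendMatch(src []byte, i, j int) int {
//		for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//		}
//		return j
//	}
//
// The asm below unrolls this to compare 8 bytes at a time while far enough
// from the end of src, falling back to a byte-at-a-time loop in src's tail.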
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), DX
	MOVQ src_len+8(FP), R14
	MOVQ i+24(FP), R15
	MOVQ j+32(FP), SI
	ADDQ DX, R14
	ADDQ DX, R15
	ADDQ DX, SI
	MOVQ R14, R13
	SUBQ $8, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   cmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
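	//
	// (A worked example: if the two loads differ only in their third byte,
	// the XOR below is zero in bits 0-15 and non-zero somewhere in bits
	// 16-23, so BSF yields a bit index in [16, 23] and the SHRQ $3 turns
	// that into byte index 2.)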
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI

	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  extendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  extendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET
// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// to when calling other functions. We could pack this slightly tighter, but
// it's simpler to have a dedicated spill map independent of the function
// called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls, gives 32768 + 56 + 64 = 32888.
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVQ $24, CX
	MOVQ $256, DX

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	CMPQ DX, $16384
	JGE  varTable
	CMPQ DX, R14
	JGE  varTable
	SUBQ $1, CX
	SHLQ $1, DX
	JMP  calcShift
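	// (A worked example: for len(src) == 4096, the loop doubles tableSize
	// from 256 up to 4096 and decrements shift from 24 down to 20, so each
	// hash below is a 32-20 = 12 bit index into a 4096-entry table.)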
varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
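	//
	// (Continuing the len(src) == 4096 example: tableSize/8 == 512 MOVOU
	// writes, zeroing only the first 512*16 = 8192 of those 32768 bytes.)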
	SHRQ $3, DX
	LEAQ table-32768(SP), BX
	PXOR X0, X0

memclr:
	MOVOU X0, 0(BX)
	ADDQ  $16, BX
	SUBQ  $1, DX
	JNZ   memclr

	// !!! DX = &src[0]
	MOVQ SI, DX

	// sLimit := len(src) - inputMargin
	MOVQ R14, R9
	SUBQ $15, R9
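	// (inputMargin is 15 bytes; see its definition in encode.go. That is
	// what the SUBQ $15 above hard-codes.)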
	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVQ CX, 56(SP)
	MOVQ DX, 64(SP)
	MOVQ R9, 88(SP)

	// nextEmit := 0
	MOVQ DX, R10

	// s := 1
	ADDQ $1, SI

	// nextHash := hash(load32(src, s), shift)
	MOVL  0(SI), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11
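	// (The IMULL/SHRL pair above, and the identical pairs further down,
	// implement the Go code's hash function, in essence:
	//
	//	func hash(u, shift uint32) uint32 {
	//		return (u * 0x1e35a7bd) >> shift
	//	}
	//
	// a multiplicative hash whose top 32-shift bits index table.)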
outer:
	// for { etc }

	// skip := 32
	MOVQ $32, R12

	// nextS := s
	MOVQ SI, R13

	// candidate := 0
	MOVQ $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVQ R13, SI

	// bytesBetweenHashLookups := skip >> 5
	MOVQ R12, R14
	SHRQ $5, R14

	// nextS = s + bytesBetweenHashLookups
	ADDQ R14, R13

	// skip += bytesBetweenHashLookups
	ADDQ R14, R12
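	// (A worked example of the back-off: skip starts at 32, so the lookups
	// step 1 byte at a time for the first 32 iterations, then 2 bytes at a
	// time for the next 16, and so on, scanning incompressible input ever
	// faster.)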
	// if nextS > sLimit { goto emitRemainder }
	MOVQ R13, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JA   emitRemainder

	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[nextHash] = uint16(s)
	MOVQ SI, AX
	SUBQ DX, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// nextHash = hash(load32(src, nextS), shift)
	MOVL  0(R13), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVL 0(SI), AX
	MOVL (DX)(R15*1), BX
	CMPL AX, BX
	JNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVQ SI, AX
	SUBQ R10, AX
	CMPQ AX, $16
	JLE  emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])
	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  inlineEmitLiteralOneByte
	CMPL BX, $256
	JLT  inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	MOVQ R15, 112(SP)
	CALL runtime·memmove(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9
	MOVQ 112(SP), R15
	JMP  inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB AX, BX
	SUBB $1, BX
	SHLB $2, BX
	MOVB BX, (DI)
	ADDQ $1, DI

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
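	//
	// (In Go terms, a sketch of what the next three instructions do: copy 16
	// bytes from lit to dst[d:] unconditionally, then advance d by only
	// len(lit), so that any over-written bytes are re-written later.)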
	MOVOU 0(R10), X0
	MOVOU X0, 0(DI)
	ADDQ  AX, DI

inner1:
	// for { etc }

	// base := s
	MOVQ SI, R12

	// !!! offset := base - candidate
	MOVQ R12, R11
	SUBQ R15, R11
	SUBQ DX, R11

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVQ src_len+32(FP), R14
	ADDQ DX, R14

	// !!! R13 = &src[len(src) - 8]
	MOVQ R14, R13
	SUBQ $8, R13

	// !!! R15 = &src[candidate + 4]
	ADDQ $4, R15
	ADDQ DX, R15

	// !!! s += 4
	ADDQ $4, SI

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   inlineExtendMatchCmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  inlineExtendMatchBSF
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI
	JMP  inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  inlineExtendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  inlineExtendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVQ SI, AX
	SUBQ R12, AX

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	CMPL AX, $12
	JGE  inlineEmitCopyStep3
	CMPL R11, $2048
	JGE  inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI
	JMP  inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVQ SI, R10

	// if s >= sLimit { goto emitRemainder }
	MOVQ SI, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JAE  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)
	MOVQ -1(SI), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// table[prevHash] = uint16(s-1)
	MOVQ SI, AX
	SUBQ DX, AX
	SUBQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// currHash := hash(uint32(x>>8), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[currHash] = uint16(s)
	ADDQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVL (DX)(R15*1), BX
	CMPL R14, BX
	JEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// s++
	ADDQ $1, SI

	// break out of the inner1 for loop, i.e. continue the outer loop.
	JMP outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX
	ADDQ DX, AX
	CMPQ R10, AX
	JEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	SUBQ R10, AX
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ DI, 80(SP)
	CALL ·emitLiteral(SB)
	MOVQ 80(SP), DI

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI

encodeBlockEnd:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, DI
	MOVQ DI, d+48(FP)
	RET