encode_amd64.s
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// TODO: figure out why the XXX lines compile with Go 1.4 and Go tip but not
// Go 1.6.
//
// This is https://github.com/golang/snappy/issues/29

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- AX	return value
//	- BX	n
//	- CX	len(lit)
//	- SI	&lit[0]
//	- DI	&dst[i]
//
// The 24 bytes of stack space is to call runtime·memmove.
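//
// For reference, the pure Go version in encode_other.go is roughly the
// following sketch (not the verbatim upstream source; tagLiteral is the
// snappy literal tag, 0x00):
//
//	func emitLiteral(dst, lit []byte) int {
//		i, n := 0, uint(len(lit)-1)
//		switch {
//		case n < 60:
//			dst[0] = uint8(n)<<2 | tagLiteral
//			i = 1
//		case n < 1<<8:
//			dst[0] = 60<<2 | tagLiteral
//			dst[1] = uint8(n)
//			i = 2
//		default:
//			dst[0] = 61<<2 | tagLiteral
//			dst[1] = uint8(n)
//			dst[2] = uint8(n >> 8)
//			i = 3
//		}
//		return i + copy(dst[i:], lit)
//	}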
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), SI
	MOVQ lit_len+32(FP), CX
	MOVQ CX, AX
	MOVL CX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT oneByte
	CMPL BX, $256
	JLT twoBytes

threeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, AX
	JMP emitLiteralEnd

twoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, AX
	JMP emitLiteralEnd

oneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, AX

emitLiteralEnd:
	MOVQ AX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, SI and CX as arguments.
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	CALL runtime·memmove(SB)
	RET

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- BX	offset
//	- CX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
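//
// For reference, the pure Go version in encode_other.go is roughly the
// following sketch (not the verbatim upstream source; tagCopy1 = 0x01 and
// tagCopy2 = 0x02 are the snappy copy tags):
//
//	func emitCopy(dst []byte, offset, length int) int {
//		i := 0
//		for length >= 68 {
//			dst[i+0] = 63<<2 | tagCopy2
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			i, length = i+3, length-64
//		}
//		if length > 64 {
//			dst[i+0] = 59<<2 | tagCopy2
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			i, length = i+3, length-60
//		}
//		if length >= 12 || offset >= 2048 {
//			dst[i+0] = uint8(length-1)<<2 | tagCopy2
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			return i + 3
//		}
//		dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
//		dst[i+1] = uint8(offset)
//		return i + 2
//	}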
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), BX
	MOVQ length+32(FP), CX

loop0:
	// for length >= 68 { etc }
	CMPL CX, $68
	JLT step1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	SUBL $64, CX
	JMP loop0

step1:
	// if length > 64 { etc }
	CMPL CX, $64
	JLE step2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	SUBL $60, CX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL CX, $12
	JGE step3
	CMPL BX, $2048
	JGE step3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB BX, 1(DI)
	SHRL $8, BX
	SHLB $5, BX
	SUBB $4, CX
	SHLB $2, CX
	ORB CX, BX
	ORB $1, BX
	MOVB BX, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, CX
	SHLB $2, CX
	ORB $2, CX
	MOVB CX, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- CX	&src[0]
//	- DX	&src[len(src)]
//	- SI	&src[i]
//	- DI	&src[j]
//	- R9	&src[len(src) - 8]
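//
// For reference, the pure Go version in encode_other.go is roughly:
//
//	func extendMatch(src []byte, i, j int) int {
//		for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//		}
//		return j
//	}
//
// The asm below unrolls that loop to compare 8 bytes at a time over most of
// src, falling back to byte-at-a-time comparison in src's tail.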
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), CX
	MOVQ src_len+8(FP), DX
	MOVQ i+24(FP), SI
	MOVQ j+32(FP), DI
	ADDQ CX, DX
	ADDQ CX, SI
	ADDQ CX, DI
	MOVQ DX, R9
	SUBQ $8, R9

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ DI, R9
	JA cmp1
	MOVQ (SI), AX
	MOVQ (DI), BX
	CMPQ AX, BX
	JNE bsf
	ADDQ $8, SI
	ADDQ $8, DI
	JMP cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
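	//
	// For example, if the eight bytes at SI are "abcdefgh" and the eight
	// bytes at DI are "abcXefgh", then byte 3 is the lowest differing byte.
	// In a little-endian load that byte occupies bits 24..31, so the XOR's
	// least significant set bit lies in that range, BSF returns a bit index
	// between 24 and 31, and the shift by 3 maps it back to byte index 3.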
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, DI

	// Convert from &src[ret] to ret.
	SUBQ CX, DI
	MOVQ DI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ DI, DX
	JAE extendMatchEnd
	MOVB (SI), AX
	MOVB (DI), BX
	CMPB AX, BX
	JNE extendMatchEnd
	ADDQ $1, SI
	ADDQ $1, DI
	JMP cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ CX, DI
	MOVQ DI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS]
//	- R14	.	len(src), bytesBetweenHashLookups, x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
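//
// Throughout this function, the hash is computed inline by the IMULL/SHRL
// pairs below. In Go (mirroring encode_other.go) it is roughly:
//
//	func hash(u, shift uint32) uint32 {
//		return (u * 0x1e35a7bd) >> shift
//	}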
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVQ $24, CX
	MOVQ $256, DX

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	CMPQ DX, $16384
	JGE varTable
	CMPQ DX, R14
	JGE varTable
	SUBQ $1, CX
	SHLQ $1, DX
	JMP calcShift
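	// For example, a 4096-byte src leaves this loop with tableSize = 4096
	// and shift = 20, so that a 32-bit hash shifted right by 20 yields an
	// index in [0, 4096). tableSize is capped at maxTableSize = 16384
	// entries (32768 bytes of uint16s).
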
varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
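	//
	// Concretely, for the minimum tableSize of 256, the SHRQ below turns DX
	// into 256/8 = 32 loop iterations, clearing 32*16 = 512 bytes: exactly
	// the 256 uint16 entries that a hash shifted by 24 can index.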
	SHRQ $3, DX
	LEAQ table-32768(SP), BX
	PXOR X0, X0

memclr:
	MOVOU X0, 0(BX)
	ADDQ $16, BX
	SUBQ $1, DX
	JNZ memclr

	// !!! DX = &src[0]
	MOVQ SI, DX

	// sLimit := len(src) - inputMargin
	MOVQ R14, R9
	SUBQ $15, R9

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVQ CX, 56(SP)
	MOVQ DX, 64(SP)
	MOVQ R9, 88(SP)

	// nextEmit := 0
	MOVQ DX, R10

	// s := 1
	ADDQ $1, SI

	// nextHash := hash(load32(src, s), shift)
	MOVL 0(SI), R11
	IMULL $0x1e35a7bd, R11
	SHRL CX, R11

outer:
	// for { etc }

	// skip := 32
	MOVQ $32, R12

	// nextS := s
	MOVQ SI, R13

	// candidate := 0
	MOVQ $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVQ R13, SI

	// bytesBetweenHashLookups := skip >> 5
	MOVQ R12, R14
	SHRQ $5, R14

	// nextS = s + bytesBetweenHashLookups
	ADDQ R14, R13

	// skip += bytesBetweenHashLookups
	ADDQ R14, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVQ R13, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JA emitRemainder

	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[nextHash] = uint16(s)
	MOVQ SI, AX
	SUBQ DX, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// nextHash = hash(load32(src, nextS), shift)
	MOVL 0(R13), R11
	IMULL $0x1e35a7bd, R11
	SHRL CX, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVL 0(SI), AX
	MOVL (DX)(R15*1), BX
	CMPL AX, BX
	JNE inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVQ SI, AX
	SUBQ R10, AX
	CMPQ AX, $16
	JLE emitLiteralFastPath

	// d += emitLiteral(dst[d:], src[nextEmit:s])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	MOVQ R15, 112(SP)
	CALL ·emitLiteral(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9
	MOVQ 112(SP), R15

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI
	JMP inner1

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB AX, BX
	SUBB $1, BX
	SHLB $2, BX
	MOVB BX, (DI)
	ADDQ $1, DI

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
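	//
	// For example, a 5-byte literal still stores all 16 bytes of X0 at DI,
	// but DI only advances by 5 (AX below), so the 11 bytes of overrun sit
	// past &dst[d] and are overwritten by the copy emitted at inner1.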
	MOVOU 0(R10), X0
	MOVOU X0, 0(DI)
	ADDQ AX, DI

inner1:
	// for { etc }

	// base := s
	MOVQ SI, R12

	// !!! offset := base - candidate
	MOVQ R12, R11
	SUBQ R15, R11
	SUBQ DX, R11

	// s = extendMatch(src, candidate+4, s+4)
	//
	// Push args.
	MOVQ DX, 0(SP)
	MOVQ src_len+32(FP), R14
	MOVQ R14, 8(SP)
	MOVQ R14, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
	ADDQ $4, R15
	MOVQ R15, 24(SP)
	ADDQ $4, SI
	SUBQ DX, SI
	MOVQ SI, 32(SP)

	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// We don't need to unspill CX or R9 as we are just about to call another
	// function.
	MOVQ DI, 80(SP)
	MOVQ R11, 96(SP)
	MOVQ R12, 104(SP)
	CALL ·extendMatch(SB)
	MOVQ 64(SP), DX
	MOVQ 80(SP), DI
	MOVQ 96(SP), R11
	MOVQ 104(SP), R12

	// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
	// register holds &src[s], not s.
	MOVQ 40(SP), SI
	ADDQ DX, SI

	// d += emitCopy(dst[d:], base-candidate, s-base)
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R11, 24(SP)
	MOVQ SI, AX
	SUBQ R12, AX
	MOVQ AX, 32(SP)

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	CALL ·emitCopy(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9

	// Finish the "d +=" part of "d += emitCopy(etc)".
	ADDQ 40(SP), DI

	// nextEmit = s
	MOVQ SI, R10

	// if s >= sLimit { goto emitRemainder }
	MOVQ SI, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JAE emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)
	MOVQ -1(SI), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVL R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL CX, R11

	// table[prevHash] = uint16(s-1)
	MOVQ SI, AX
	SUBQ DX, AX
	SUBQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// currHash := hash(uint32(x>>8), shift)
	SHRQ $8, R14
	MOVL R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL CX, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[currHash] = uint16(s)
	ADDQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVL (DX)(R15*1), BX
	CMPL R14, BX
	JEQ inner1

	// nextHash = hash(uint32(x>>16), shift)
	SHRQ $8, R14
	MOVL R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL CX, R11

	// s++
	ADDQ $1, SI

	// break out of the inner1 for loop, i.e. continue the outer loop.
	JMP outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX
	ADDQ DX, AX
	CMPQ R10, AX
	JEQ encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	SUBQ R10, AX
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ DI, 80(SP)
	CALL ·emitLiteral(SB)
	MOVQ 80(SP), DI

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI

encodeBlockEnd:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, DI
	MOVQ DI, d+48(FP)
	RET