// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".
// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- AX	return value
//	- BX	n
//	- CX	len(lit)
//	- SI	&lit[0]
//	- DI	&dst[i]
//
// The 24 bytes of stack space is to call runtime·memmove.
  21. TEXT ·emitLiteral(SB), NOSPLIT, $24-56
  22. MOVQ dst_base+0(FP), DI
  23. MOVQ lit_base+24(FP), SI
  24. MOVQ lit_len+32(FP), CX
  25. MOVQ CX, AX
  26. MOVL CX, BX
  27. SUBL $1, BX
  28. CMPL BX, $60
  29. JLT oneByte
  30. CMPL BX, $256
  31. JLT twoBytes
  32. threeBytes:
  33. MOVB $0xf4, 0(DI)
  34. MOVW BX, 1(DI)
  35. ADDQ $3, DI
  36. ADDQ $3, AX
  37. JMP end
  38. twoBytes:
  39. MOVB $0xf0, 0(DI)
  40. MOVB BX, 1(DI)
  41. ADDQ $2, DI
  42. ADDQ $2, AX
  43. JMP end
  44. oneByte:
  45. SHLB $2, BX
  46. MOVB BX, 0(DI)
  47. ADDQ $1, DI
  48. ADDQ $1, AX
  49. end:
  50. MOVQ AX, ret+48(FP)
  51. // copy(dst[i:], lit)
  52. //
  53. // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
  54. // DI, SI and CX as arguments.
  55. MOVQ DI, 0(SP)
  56. MOVQ SI, 8(SP)
  57. MOVQ CX, 16(SP)
  58. CALL runtime·memmove(SB)
  59. RET
// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- BX	offset
//	- CX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
  68. TEXT ·emitCopy(SB), NOSPLIT, $0-48
  69. MOVQ dst_base+0(FP), DI
  70. MOVQ DI, SI
  71. MOVQ offset+24(FP), BX
  72. MOVQ length+32(FP), CX
  73. loop0:
  74. // for length >= 68 { etc }
  75. CMPL CX, $68
  76. JLT step1
  77. // Emit a length 64 copy, encoded as 3 bytes.
  78. MOVB $0xfe, 0(DI)
  79. MOVW BX, 1(DI)
  80. ADDQ $3, DI
  81. SUBL $64, CX
  82. JMP loop0
  83. step1:
  84. // if length > 64 { etc }
  85. CMPL CX, $64
  86. JLE step2
  87. // Emit a length 60 copy, encoded as 3 bytes.
  88. MOVB $0xee, 0(DI)
  89. MOVW BX, 1(DI)
  90. ADDQ $3, DI
  91. SUBL $60, CX
  92. step2:
  93. // if length >= 12 || offset >= 2048 { goto step3 }
  94. CMPL CX, $12
  95. JGE step3
  96. CMPL BX, $2048
  97. JGE step3
  98. // Emit the remaining copy, encoded as 2 bytes.
  99. MOVB BX, 1(DI)
  100. SHRL $8, BX
  101. SHLB $5, BX
  102. SUBB $4, CX
  103. SHLB $2, CX
  104. ORB CX, BX
  105. ORB $1, BX
  106. MOVB BX, 0(DI)
  107. ADDQ $2, DI
  108. // Return the number of bytes written.
  109. SUBQ SI, DI
  110. MOVQ DI, ret+40(FP)
  111. RET
  112. step3:
  113. // Emit the remaining copy, encoded as 3 bytes.
  114. SUBL $1, CX
  115. SHLB $2, CX
  116. ORB $2, CX
  117. MOVB CX, 0(DI)
  118. MOVW BX, 1(DI)
  119. ADDQ $3, DI
  120. // Return the number of bytes written.
  121. SUBQ SI, DI
  122. MOVQ DI, ret+40(FP)
  123. RET
// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- CX	&src[0]
//	- DX	&src[len(src)]
//	- SI	&src[i]
//	- DI	&src[j]
//	- R9	&src[len(src) - 8]
  133. TEXT ·extendMatch(SB), NOSPLIT, $0-48
  134. MOVQ src_base+0(FP), CX
  135. MOVQ src_len+8(FP), DX
  136. MOVQ i+24(FP), SI
  137. MOVQ j+32(FP), DI
  138. ADDQ CX, DX
  139. ADDQ CX, SI
  140. ADDQ CX, DI
  141. MOVQ DX, R9
  142. SUBQ $8, R9
  143. cmp8:
  144. // As long as we are 8 or more bytes before the end of src, we can load and
  145. // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
  146. CMPQ DI, R9
  147. JA cmp1
  148. MOVQ (SI), AX
  149. MOVQ (DI), BX
  150. CMPQ AX, BX
  151. JNE bsf
  152. ADDQ $8, SI
  153. ADDQ $8, DI
  154. JMP cmp8
  155. bsf:
  156. // If those 8 bytes were not equal, XOR the two 8 byte values, and return
  157. // the index of the first byte that differs. The BSF instruction finds the
  158. // least significant 1 bit, the amd64 architecture is little-endian, and
  159. // the shift by 3 converts a bit index to a byte index.
  160. XORQ AX, BX
  161. BSFQ BX, BX
  162. SHRQ $3, BX
  163. ADDQ BX, DI
  164. // Convert from &src[ret] to ret.
  165. SUBQ CX, DI
  166. MOVQ DI, ret+40(FP)
  167. RET
  168. cmp1:
  169. // In src's tail, compare 1 byte at a time.
  170. CMPQ DI, DX
  171. JAE end
  172. MOVB (SI), AX
  173. MOVB (DI), BX
  174. CMPB AX, BX
  175. JNE end
  176. ADDQ $1, SI
  177. ADDQ $1, DI
  178. JMP cmp1
  179. end:
  180. // Convert from &src[ret] to ret.
  181. SUBQ CX, DI
  182. MOVQ DI, ret+40(FP)
  183. RET
// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS]
//	- R14	.	len(src), bytesBetweenHashLookups, x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
  210. TEXT ·encodeBlock(SB), 0, $32888-56
  211. MOVQ dst_base+0(FP), DI
  212. MOVQ src_base+24(FP), SI
  213. MOVQ src_len+32(FP), R14
  214. // shift, tableSize := uint32(32-8), 1<<8
  215. MOVQ $24, CX
  216. MOVQ $256, DX
  217. calcShift:
  218. // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
  219. // shift--
  220. // }
  221. CMPQ DX, $16384
  222. JGE varTable
  223. CMPQ DX, R14
  224. JGE varTable
  225. SUBQ $1, CX
  226. SHLQ $1, DX
  227. JMP calcShift
  228. varTable:
  229. // var table [maxTableSize]uint16
  230. //
  231. // sizeof(table) is 32768 bytes, which is 2048 16-byte writes.
  232. MOVQ $2048, DX
  233. LEAQ table-32768(SP), BX
  234. PXOR X0, X0
  235. memclr:
  236. MOVOU X0, 0(BX)
  237. ADDQ $16, BX
  238. SUBQ $1, DX
  239. JNZ memclr
  240. // !!! DX = &src[0]
  241. MOVQ SI, DX
  242. // sLimit := len(src) - inputMargin
  243. MOVQ R14, R9
  244. SUBQ $15, R9
  245. // !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
  246. // change for the rest of the function.
  247. MOVQ CX, 56(SP)
  248. MOVQ DX, 64(SP)
  249. MOVQ R9, 88(SP)
  250. // nextEmit := 0
  251. MOVQ DX, R10
  252. // s := 1
  253. ADDQ $1, SI
  254. // nextHash := hash(load32(src, s), shift)
  255. MOVL 0(SI), R11
  256. IMULL $0x1e35a7bd, R11
  257. SHRL CX, R11
  258. outer:
  259. // for { etc }
  260. // skip := 32
  261. MOVQ $32, R12
  262. // nextS := s
  263. MOVQ SI, R13
  264. // candidate := 0
  265. MOVQ $0, R15
  266. inner0:
  267. // for { etc }
  268. // s := nextS
  269. MOVQ R13, SI
  270. // bytesBetweenHashLookups := skip >> 5
  271. MOVQ R12, R14
  272. SHRQ $5, R14
  273. // nextS = s + bytesBetweenHashLookups
  274. ADDQ R14, R13
  275. // skip += bytesBetweenHashLookups
  276. ADDQ R14, R12
  277. // if nextS > sLimit { goto emitRemainder }
  278. MOVQ R13, AX
  279. SUBQ DX, AX
  280. CMPQ AX, R9
  281. JA emitRemainder
  282. // candidate = int(table[nextHash])
  283. MOVWQZX table-32768(SP)(R11*2), R15
  284. // table[nextHash] = uint16(s)
  285. MOVQ SI, AX
  286. SUBQ DX, AX
  287. MOVW AX, table-32768(SP)(R11*2)
  288. // nextHash = hash(load32(src, nextS), shift)
  289. MOVL 0(R13), R11
  290. IMULL $0x1e35a7bd, R11
  291. SHRL CX, R11
  292. // if load32(src, s) != load32(src, candidate) { continue } break
  293. MOVL 0(SI), AX
  294. MOVL (DX)(R15*1), BX
  295. CMPL AX, BX
  296. JNE inner0
  297. fourByteMatch:
  298. // As per the encode_other.go code:
  299. //
  300. // A 4-byte match has been found. We'll later see etc.
  301. // d += emitLiteral(dst[d:], src[nextEmit:s])
  302. //
  303. // Push args.
  304. MOVQ DI, 0(SP)
  305. MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
  306. MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  307. MOVQ R10, 24(SP)
  308. MOVQ SI, AX
  309. SUBQ R10, AX
  310. MOVQ AX, 32(SP)
  311. MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
  312. // Spill local variables (registers) onto the stack; call; unspill.
  313. MOVQ SI, 72(SP)
  314. MOVQ DI, 80(SP)
  315. MOVQ R15, 112(SP)
  316. CALL ·emitLiteral(SB)
  317. MOVQ 56(SP), CX
  318. MOVQ 64(SP), DX
  319. MOVQ 72(SP), SI
  320. MOVQ 80(SP), DI
  321. MOVQ 88(SP), R9
  322. MOVQ 112(SP), R15
  323. // Finish the "d +=" part of "d += emitLiteral(etc)".
  324. ADDQ 48(SP), DI
  325. inner1:
  326. // for { etc }
  327. // base := s
  328. MOVQ SI, R12
  329. // !!! offset := base - candidate
  330. MOVQ R12, R11
  331. SUBQ R15, R11
  332. SUBQ DX, R11
  333. // s = extendMatch(src, candidate+4, s+4)
  334. //
  335. // Push args.
  336. MOVQ DX, 0(SP)
  337. MOVQ src_len+32(FP), R14
  338. MOVQ R14, 8(SP)
  339. MOVQ R14, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  340. ADDQ $4, R15
  341. MOVQ R15, 24(SP)
  342. ADDQ $4, SI
  343. SUBQ DX, SI
  344. MOVQ SI, 32(SP)
  345. // Spill local variables (registers) onto the stack; call; unspill.
  346. //
  347. // We don't need to unspill CX or R9 as we are just about to call another
  348. // function.
  349. MOVQ DI, 80(SP)
  350. MOVQ R11, 96(SP)
  351. MOVQ R12, 104(SP)
  352. CALL ·extendMatch(SB)
  353. MOVQ 64(SP), DX
  354. MOVQ 80(SP), DI
  355. MOVQ 96(SP), R11
  356. MOVQ 104(SP), R12
  357. // Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
  358. // register holds &src[s], not s.
  359. MOVQ 40(SP), SI
  360. ADDQ DX, SI
  361. // d += emitCopy(dst[d:], base-candidate, s-base)
  362. //
  363. // Push args.
  364. MOVQ DI, 0(SP)
  365. MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
  366. MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  367. MOVQ R11, 24(SP)
  368. MOVQ SI, AX
  369. SUBQ R12, AX
  370. MOVQ AX, 32(SP)
  371. // Spill local variables (registers) onto the stack; call; unspill.
  372. MOVQ SI, 72(SP)
  373. MOVQ DI, 80(SP)
  374. CALL ·emitCopy(SB)
  375. MOVQ 56(SP), CX
  376. MOVQ 64(SP), DX
  377. MOVQ 72(SP), SI
  378. MOVQ 80(SP), DI
  379. MOVQ 88(SP), R9
  380. // Finish the "d +=" part of "d += emitCopy(etc)".
  381. ADDQ 40(SP), DI
  382. // nextEmit = s
  383. MOVQ SI, R10
  384. // if s >= sLimit { goto emitRemainder }
  385. MOVQ SI, AX
  386. SUBQ DX, AX
  387. CMPQ AX, R9
  388. JAE emitRemainder
  389. // As per the encode_other.go code:
  390. //
  391. // We could immediately etc.
  392. // x := load64(src, s-1)
  393. MOVQ -1(SI), R14
  394. // prevHash := hash(uint32(x>>0), shift)
  395. MOVL R14, R11
  396. IMULL $0x1e35a7bd, R11
  397. SHRL CX, R11
  398. // table[prevHash] = uint16(s-1)
  399. MOVQ SI, AX
  400. SUBQ DX, AX
  401. SUBQ $1, AX
  402. MOVW AX, table-32768(SP)(R11*2)
  403. // currHash := hash(uint32(x>>8), shift)
  404. SHRQ $8, R14
  405. MOVL R14, R11
  406. IMULL $0x1e35a7bd, R11
  407. SHRL CX, R11
  408. // candidate = int(table[currHash])
  409. MOVWQZX table-32768(SP)(R11*2), R15
  410. // table[currHash] = uint16(s)
  411. ADDQ $1, AX
  412. MOVW AX, table-32768(SP)(R11*2)
  413. // if uint32(x>>8) == load32(src, candidate) { continue }
  414. MOVL (DX)(R15*1), BX
  415. CMPL R14, BX
  416. JEQ inner1
  417. // nextHash = hash(uint32(x>>16), shift)
  418. SHRQ $8, R14
  419. MOVL R14, R11
  420. IMULL $0x1e35a7bd, R11
  421. SHRL CX, R11
  422. // s++
  423. ADDQ $1, SI
  424. // break out of the inner1 for loop, i.e. continue the outer loop.
  425. JMP outer
  426. emitRemainder:
  427. // if nextEmit < len(src) { etc }
  428. MOVQ src_len+32(FP), AX
  429. ADDQ DX, AX
  430. CMPQ R10, AX
  431. JEQ end
  432. // d += emitLiteral(dst[d:], src[nextEmit:])
  433. //
  434. // Push args.
  435. MOVQ DI, 0(SP)
  436. MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
  437. MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  438. MOVQ R10, 24(SP)
  439. SUBQ R10, AX
  440. MOVQ AX, 32(SP)
  441. MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
  442. // Spill local variables (registers) onto the stack; call; unspill.
  443. MOVQ DI, 80(SP)
  444. CALL ·emitLiteral(SB)
  445. MOVQ 80(SP), DI
  446. // Finish the "d +=" part of "d += emitLiteral(etc)".
  447. ADDQ 48(SP), DI
  448. end:
  449. MOVQ dst_base+0(FP), AX
  450. SUBQ AX, DI
  451. MOVQ DI, d+48(FP)
  452. RET