// encode_amd64.s
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"
// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- AX	return value
//	- BX	n
//	- CX	len(lit)
//	- SI	&lit[0]
//	- DI	&dst[i]
//
// The 24 bytes of stack space is to call runtime·memmove.
  21. TEXT ·emitLiteral(SB), NOSPLIT, $24-56
  22. MOVQ dst_base+0(FP), DI
  23. MOVQ lit_base+24(FP), SI
  24. MOVQ lit_len+32(FP), CX
  25. MOVQ CX, AX
  26. MOVL CX, BX
  27. SUBL $1, BX
  28. CMPL BX, $60
  29. JLT oneByte
  30. CMPL BX, $256
  31. JLT twoBytes
  32. threeBytes:
  33. MOVB $0xf4, 0(DI)
  34. MOVW BX, 1(DI)
  35. ADDQ $3, DI
  36. ADDQ $3, AX
  37. JMP end
  38. twoBytes:
  39. MOVB $0xf0, 0(DI)
  40. MOVB BX, 1(DI)
  41. ADDQ $2, DI
  42. ADDQ $2, AX
  43. JMP end
  44. oneByte:
  45. SHLB $2, BX
  46. MOVB BX, 0(DI)
  47. ADDQ $1, DI
  48. ADDQ $1, AX
  49. end:
  50. MOVQ AX, ret+48(FP)
  51. // copy(dst[i:], lit)
  52. //
  53. // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
  54. // DI, SI and CX as arguments.
  55. MOVQ DI, 0(SP)
  56. MOVQ SI, 8(SP)
  57. MOVQ CX, 16(SP)
  58. CALL runtime·memmove(SB)
  59. RET
// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- BX	offset
//	- CX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
  68. TEXT ·emitCopy(SB), NOSPLIT, $0-48
  69. MOVQ dst_base+0(FP), DI
  70. MOVQ DI, SI
  71. MOVQ offset+24(FP), BX
  72. MOVQ length+32(FP), CX
  73. loop0:
  74. // for length >= 68 { etc }
  75. CMPL CX, $68
  76. JLT step1
  77. // Emit a length 64 copy, encoded as 3 bytes.
  78. MOVB $0xfe, 0(DI)
  79. MOVW BX, 1(DI)
  80. ADDQ $3, DI
  81. SUBL $64, CX
  82. JMP loop0
  83. step1:
  84. // if length > 64 { etc }
  85. CMPL CX, $64
  86. JLE step2
  87. // Emit a length 60 copy, encoded as 3 bytes.
  88. MOVB $0xee, 0(DI)
  89. MOVW BX, 1(DI)
  90. ADDQ $3, DI
  91. SUBL $60, CX
  92. step2:
  93. // if length >= 12 || offset >= 2048 { goto step3 }
  94. CMPL CX, $12
  95. JGE step3
  96. CMPL BX, $2048
  97. JGE step3
  98. // Emit the remaining copy, encoded as 2 bytes.
  99. MOVB BX, 1(DI)
  100. SHRL $8, BX
  101. SHLB $5, BX
  102. SUBB $4, CX
  103. SHLB $2, CX
  104. ORB CX, BX
  105. ORB $1, BX
  106. MOVB BX, 0(DI)
  107. ADDQ $2, DI
  108. // Return the number of bytes written.
  109. SUBQ SI, DI
  110. MOVQ DI, ret+40(FP)
  111. RET
  112. step3:
  113. // Emit the remaining copy, encoded as 3 bytes.
  114. SUBL $1, CX
  115. SHLB $2, CX
  116. ORB $2, CX
  117. MOVB CX, 0(DI)
  118. MOVW BX, 1(DI)
  119. ADDQ $3, DI
  120. // Return the number of bytes written.
  121. SUBQ SI, DI
  122. MOVQ DI, ret+40(FP)
  123. RET
// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- CX	&src[0]
//	- DX	&src[len(src)]
//	- SI	&src[i]
//	- DI	&src[j]
//	- R9	&src[len(src) - 8]
  133. TEXT ·extendMatch(SB), NOSPLIT, $0-48
  134. MOVQ src_base+0(FP), CX
  135. MOVQ src_len+8(FP), DX
  136. MOVQ i+24(FP), SI
  137. MOVQ j+32(FP), DI
  138. ADDQ CX, DX
  139. ADDQ CX, SI
  140. ADDQ CX, DI
  141. MOVQ DX, R9
  142. SUBQ $8, R9
  143. cmp8:
  144. // As long as we are 8 or more bytes before the end of src, we can load and
  145. // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
  146. CMPQ DI, R9
  147. JA cmp1
  148. MOVQ (SI), AX
  149. MOVQ (DI), BX
  150. CMPQ AX, BX
  151. JNE bsf
  152. ADDQ $8, SI
  153. ADDQ $8, DI
  154. JMP cmp8
  155. bsf:
  156. // If those 8 bytes were not equal, XOR the two 8 byte values, and return
  157. // the index of the first byte that differs. The BSF instruction finds the
  158. // least significant 1 bit, the amd64 architecture is little-endian, and
  159. // the shift by 3 converts a bit index to a byte index.
  160. XORQ AX, BX
  161. BSFQ BX, BX
  162. SHRQ $3, BX
  163. ADDQ BX, DI
  164. // Convert from &src[ret] to ret.
  165. SUBQ CX, DI
  166. MOVQ DI, ret+40(FP)
  167. RET
  168. cmp1:
  169. // In src's tail, compare 1 byte at a time.
  170. CMPQ DI, DX
  171. JAE end
  172. MOVB (SI), AX
  173. MOVB (DI), BX
  174. CMPB AX, BX
  175. JNE end
  176. ADDQ $1, SI
  177. ADDQ $1, DI
  178. JMP cmp1
  179. end:
  180. // Convert from &src[ret] to ret.
  181. SUBQ CX, DI
  182. MOVQ DI, ret+40(FP)
  183. RET
// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS]
//	- R14	.	len(src), bytesBetweenHashLookups, x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
  210. TEXT ·encodeBlock(SB), 0, $32888-56
  211. MOVQ dst_base+0(FP), DI
  212. MOVQ src_base+24(FP), SI
  213. MOVQ src_len+32(FP), R14
  214. // shift, tableSize := uint32(32-8), 1<<8
  215. MOVQ $24, CX
  216. MOVQ $256, DX
  217. calcShift:
  218. // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
  219. // shift--
  220. // }
  221. CMPQ DX, $16384
  222. JGE varTable
  223. CMPQ DX, R14
  224. JGE varTable
  225. SUBQ $1, CX
  226. SHLQ $1, DX
  227. JMP calcShift
  228. varTable:
  229. // var table [maxTableSize]uint16
  230. //
  231. // In the asm code, unlike the Go code, we can zero-initialize only the
  232. // first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
  233. // writes 16 bytes, so we can do only tableSize/8 writes instead of the
  234. // 2048 writes that would zero-initialize all of table's 32768 bytes.
  235. SHRQ $3, DX
  236. LEAQ table-32768(SP), BX
  237. PXOR X0, X0
  238. memclr:
  239. MOVOU X0, 0(BX)
  240. ADDQ $16, BX
  241. SUBQ $1, DX
  242. JNZ memclr
  243. // !!! DX = &src[0]
  244. MOVQ SI, DX
  245. // sLimit := len(src) - inputMargin
  246. MOVQ R14, R9
  247. SUBQ $15, R9
  248. // !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
  249. // change for the rest of the function.
  250. MOVQ CX, 56(SP)
  251. MOVQ DX, 64(SP)
  252. MOVQ R9, 88(SP)
  253. // nextEmit := 0
  254. MOVQ DX, R10
  255. // s := 1
  256. ADDQ $1, SI
  257. // nextHash := hash(load32(src, s), shift)
  258. MOVL 0(SI), R11
  259. IMULL $0x1e35a7bd, R11
  260. SHRL CX, R11
  261. outer:
  262. // for { etc }
  263. // skip := 32
  264. MOVQ $32, R12
  265. // nextS := s
  266. MOVQ SI, R13
  267. // candidate := 0
  268. MOVQ $0, R15
  269. inner0:
  270. // for { etc }
  271. // s := nextS
  272. MOVQ R13, SI
  273. // bytesBetweenHashLookups := skip >> 5
  274. MOVQ R12, R14
  275. SHRQ $5, R14
  276. // nextS = s + bytesBetweenHashLookups
  277. ADDQ R14, R13
  278. // skip += bytesBetweenHashLookups
  279. ADDQ R14, R12
  280. // if nextS > sLimit { goto emitRemainder }
  281. MOVQ R13, AX
  282. SUBQ DX, AX
  283. CMPQ AX, R9
  284. JA emitRemainder
  285. // candidate = int(table[nextHash])
  286. MOVWQZX table-32768(SP)(R11*2), R15
  287. // table[nextHash] = uint16(s)
  288. MOVQ SI, AX
  289. SUBQ DX, AX
  290. MOVW AX, table-32768(SP)(R11*2)
  291. // nextHash = hash(load32(src, nextS), shift)
  292. MOVL 0(R13), R11
  293. IMULL $0x1e35a7bd, R11
  294. SHRL CX, R11
  295. // if load32(src, s) != load32(src, candidate) { continue } break
  296. MOVL 0(SI), AX
  297. MOVL (DX)(R15*1), BX
  298. CMPL AX, BX
  299. JNE inner0
  300. fourByteMatch:
  301. // As per the encode_other.go code:
  302. //
  303. // A 4-byte match has been found. We'll later see etc.
  304. // d += emitLiteral(dst[d:], src[nextEmit:s])
  305. //
  306. // Push args.
  307. MOVQ DI, 0(SP)
  308. MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
  309. MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  310. MOVQ R10, 24(SP)
  311. MOVQ SI, AX
  312. SUBQ R10, AX
  313. MOVQ AX, 32(SP)
  314. MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
  315. // Spill local variables (registers) onto the stack; call; unspill.
  316. MOVQ SI, 72(SP)
  317. MOVQ DI, 80(SP)
  318. MOVQ R15, 112(SP)
  319. CALL ·emitLiteral(SB)
  320. MOVQ 56(SP), CX
  321. MOVQ 64(SP), DX
  322. MOVQ 72(SP), SI
  323. MOVQ 80(SP), DI
  324. MOVQ 88(SP), R9
  325. MOVQ 112(SP), R15
  326. // Finish the "d +=" part of "d += emitLiteral(etc)".
  327. ADDQ 48(SP), DI
  328. inner1:
  329. // for { etc }
  330. // base := s
  331. MOVQ SI, R12
  332. // !!! offset := base - candidate
  333. MOVQ R12, R11
  334. SUBQ R15, R11
  335. SUBQ DX, R11
  336. // s = extendMatch(src, candidate+4, s+4)
  337. //
  338. // Push args.
  339. MOVQ DX, 0(SP)
  340. MOVQ src_len+32(FP), R14
  341. MOVQ R14, 8(SP)
  342. MOVQ R14, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  343. ADDQ $4, R15
  344. MOVQ R15, 24(SP)
  345. ADDQ $4, SI
  346. SUBQ DX, SI
  347. MOVQ SI, 32(SP)
  348. // Spill local variables (registers) onto the stack; call; unspill.
  349. //
  350. // We don't need to unspill CX or R9 as we are just about to call another
  351. // function.
  352. MOVQ DI, 80(SP)
  353. MOVQ R11, 96(SP)
  354. MOVQ R12, 104(SP)
  355. CALL ·extendMatch(SB)
  356. MOVQ 64(SP), DX
  357. MOVQ 80(SP), DI
  358. MOVQ 96(SP), R11
  359. MOVQ 104(SP), R12
  360. // Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
  361. // register holds &src[s], not s.
  362. MOVQ 40(SP), SI
  363. ADDQ DX, SI
  364. // d += emitCopy(dst[d:], base-candidate, s-base)
  365. //
  366. // Push args.
  367. MOVQ DI, 0(SP)
  368. MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
  369. MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  370. MOVQ R11, 24(SP)
  371. MOVQ SI, AX
  372. SUBQ R12, AX
  373. MOVQ AX, 32(SP)
  374. // Spill local variables (registers) onto the stack; call; unspill.
  375. MOVQ SI, 72(SP)
  376. MOVQ DI, 80(SP)
  377. CALL ·emitCopy(SB)
  378. MOVQ 56(SP), CX
  379. MOVQ 64(SP), DX
  380. MOVQ 72(SP), SI
  381. MOVQ 80(SP), DI
  382. MOVQ 88(SP), R9
  383. // Finish the "d +=" part of "d += emitCopy(etc)".
  384. ADDQ 40(SP), DI
  385. // nextEmit = s
  386. MOVQ SI, R10
  387. // if s >= sLimit { goto emitRemainder }
  388. MOVQ SI, AX
  389. SUBQ DX, AX
  390. CMPQ AX, R9
  391. JAE emitRemainder
  392. // As per the encode_other.go code:
  393. //
  394. // We could immediately etc.
  395. // x := load64(src, s-1)
  396. MOVQ -1(SI), R14
  397. // prevHash := hash(uint32(x>>0), shift)
  398. MOVL R14, R11
  399. IMULL $0x1e35a7bd, R11
  400. SHRL CX, R11
  401. // table[prevHash] = uint16(s-1)
  402. MOVQ SI, AX
  403. SUBQ DX, AX
  404. SUBQ $1, AX
  405. MOVW AX, table-32768(SP)(R11*2)
  406. // currHash := hash(uint32(x>>8), shift)
  407. SHRQ $8, R14
  408. MOVL R14, R11
  409. IMULL $0x1e35a7bd, R11
  410. SHRL CX, R11
  411. // candidate = int(table[currHash])
  412. MOVWQZX table-32768(SP)(R11*2), R15
  413. // table[currHash] = uint16(s)
  414. ADDQ $1, AX
  415. MOVW AX, table-32768(SP)(R11*2)
  416. // if uint32(x>>8) == load32(src, candidate) { continue }
  417. MOVL (DX)(R15*1), BX
  418. CMPL R14, BX
  419. JEQ inner1
  420. // nextHash = hash(uint32(x>>16), shift)
  421. SHRQ $8, R14
  422. MOVL R14, R11
  423. IMULL $0x1e35a7bd, R11
  424. SHRL CX, R11
  425. // s++
  426. ADDQ $1, SI
  427. // break out of the inner1 for loop, i.e. continue the outer loop.
  428. JMP outer
  429. emitRemainder:
  430. // if nextEmit < len(src) { etc }
  431. MOVQ src_len+32(FP), AX
  432. ADDQ DX, AX
  433. CMPQ R10, AX
  434. JEQ end
  435. // d += emitLiteral(dst[d:], src[nextEmit:])
  436. //
  437. // Push args.
  438. MOVQ DI, 0(SP)
  439. MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
  440. MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
  441. MOVQ R10, 24(SP)
  442. SUBQ R10, AX
  443. MOVQ AX, 32(SP)
  444. MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
  445. // Spill local variables (registers) onto the stack; call; unspill.
  446. MOVQ DI, 80(SP)
  447. CALL ·emitLiteral(SB)
  448. MOVQ 80(SP), DI
  449. // Finish the "d +=" part of "d += emitLiteral(etc)".
  450. ADDQ 48(SP), DI
  451. end:
  452. MOVQ dst_base+0(FP), AX
  453. SUBQ AX, DI
  454. MOVQ DI, d+48(FP)
  455. RET