gen.go

  1. //+build generate
  2. //go:generate go run gen.go -out encodeblock_amd64.s -stubs encodeblock_amd64.go
  3. package main
  4. import (
  5. "fmt"
  6. "runtime"
  7. . "github.com/mmcloughlin/avo/build"
  8. "github.com/mmcloughlin/avo/buildtags"
  9. "github.com/mmcloughlin/avo/operand"
  10. . "github.com/mmcloughlin/avo/operand"
  11. "github.com/mmcloughlin/avo/reg"
  12. )
  13. func main() {
  14. Constraint(buildtags.Not("appengine").ToConstraint())
  15. Constraint(buildtags.Not("noasm").ToConstraint())
  16. Constraint(buildtags.Term("gc").ToConstraint())
  17. genEncodeBlockAsm("encodeBlockAsm", 14, 6, 6, false, false)
  18. genEncodeBlockAsm("encodeBlockAsm12B", 12, 5, 5, false, false)
  19. genEncodeBlockAsm("encodeBlockAsm10B", 10, 5, 5, false, false)
  20. genEncodeBlockAsm("encodeBlockAsm8B", 8, 4, 4, false, false)
  21. genEncodeBlockAsm("encodeBlockAsmAvx", 14, 5, 6, true, false)
  22. genEncodeBlockAsm("encodeBlockAsm12BAvx", 12, 5, 5, true, false)
  23. genEncodeBlockAsm("encodeBlockAsm10BAvx", 10, 5, 4, true, false)
  24. genEncodeBlockAsm("encodeBlockAsm8BAvx", 8, 4, 4, true, false)
  25. // Snappy compatible
  26. genEncodeBlockAsm("encodeSnappyBlockAsm", 14, 6, 6, false, true)
  27. genEncodeBlockAsm("encodeSnappyBlockAsm12B", 12, 5, 5, false, true)
  28. genEncodeBlockAsm("encodeSnappyBlockAsm10B", 10, 5, 5, false, true)
  29. genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, false, true)
  30. genEncodeBlockAsm("encodeSnappyBlockAsmAvx", 14, 6, 6, true, true)
  31. genEncodeBlockAsm("encodeSnappyBlockAsm12BAvx", 12, 5, 5, true, true)
  32. genEncodeBlockAsm("encodeSnappyBlockAsm10BAvx", 10, 5, 4, true, true)
  33. genEncodeBlockAsm("encodeSnappyBlockAsm8BAvx", 8, 4, 4, true, true)
  34. genEmitLiteral()
  35. genEmitRepeat()
  36. genEmitCopy()
  37. genEmitCopyNoRepeat()
  38. genMatchLen()
  39. Generate()
  40. }
  41. func debugval(v operand.Op) {
  42. value := reg.R15
  43. MOVQ(v, value)
  44. INT(Imm(3))
  45. }
  46. func debugval32(v operand.Op) {
  47. value := reg.R15L
  48. MOVL(v, value)
  49. INT(Imm(3))
  50. }
  51. var assertCounter int
  52. // debug inserts extra checks here and there when enabled.
  53. const debug = false
  54. // assert will insert code if debug is enabled.
  55. // The code should jump to 'ok' if the assertion succeeds.
  56. func assert(fn func(ok LabelRef)) {
  57. if debug {
  58. caller := [100]uintptr{0}
  59. runtime.Callers(2, caller[:])
  60. frame, _ := runtime.CallersFrames(caller[:]).Next()
  61. ok := fmt.Sprintf("assert_check_%d_ok_srcline_%d", assertCounter, frame.Line)
  62. fn(LabelRef(ok))
  63. // Emit several since delve is imprecise.
  64. INT(Imm(3))
  65. INT(Imm(3))
  66. Label(ok)
  67. assertCounter++
  68. }
  69. }
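// genEncodeBlockAsm emits one block-encoder variant. tableBits sets the hash
// table to 1<<tableBits 32-bit entries, skipLog is the shift used by the
// match-skipping heuristic (nextS = s + (s-nextEmit)>>skipLog + 4), hashBytes
// is how many low bytes of the loaded value are hashed, avx selects the AVX
// copy path, and snappy emits Snappy-compatible output without repeat codes.
// The generated function returns the number of bytes written, or 0 if the
// output would exceed the size bound.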
  70. func genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes int, avx bool, snappy bool) {
  71. TEXT(name, 0, "func(dst, src []byte) int")
  72. Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
  73. "It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
  74. Pragma("noescape")
  75. const literalMaxOverhead = 4
  76. var tableSize = 4 * (1 << tableBits)
  77. // Memzero needs at least 128 bytes.
  78. if tableSize < 128 {
  79. panic("tableSize must be at least 128 bytes")
  80. }
  81. lenSrcBasic, err := Param("src").Len().Resolve()
  82. if err != nil {
  83. panic(err)
  84. }
  85. lenSrcQ := lenSrcBasic.Addr
  86. lenDstBasic, err := Param("dst").Len().Resolve()
  87. if err != nil {
  88. panic(err)
  89. }
  90. lenDstQ := lenDstBasic.Addr
  91. // Bail if we can't compress to at least this.
  92. dstLimitPtrQ := AllocLocal(8)
  93. // sLimitL is when to stop looking for offset/length copies.
  94. sLimitL := AllocLocal(4)
  95. // nextEmitL keeps track of the point we have emitted to.
  96. nextEmitL := AllocLocal(4)
  97. // Repeat stores the last match offset.
  98. repeatL := AllocLocal(4)
  99. // nextSTempL keeps nextS while other functions are being called.
  100. nextSTempL := AllocLocal(4)
  101. // Alloc table last
  102. table := AllocLocal(tableSize)
  103. dst := GP64()
  104. {
  105. dstBaseBasic, err := Param("dst").Base().Resolve()
  106. if err != nil {
  107. panic(err)
  108. }
  109. dstBaseQ := dstBaseBasic.Addr
  110. MOVQ(dstBaseQ, dst)
  111. }
  112. srcBaseBasic, err := Param("src").Base().Resolve()
  113. if err != nil {
  114. panic(err)
  115. }
  116. srcBaseQ := srcBaseBasic.Addr
  117. // Zero table
  118. {
  119. iReg := GP64()
  120. MOVQ(U32(tableSize/8/16), iReg)
  121. tablePtr := GP64()
  122. LEAQ(table, tablePtr)
  123. zeroXmm := XMM()
  124. PXOR(zeroXmm, zeroXmm)
  125. Label("zero_loop_" + name)
  126. for i := 0; i < 8; i++ {
  127. MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16})
  128. }
  129. ADDQ(U8(16*8), tablePtr)
  130. DECQ(iReg)
  131. JNZ(LabelRef("zero_loop_" + name))
  132. }
  133. {
  134. // nextEmit is the offset in src where the next emitLiteral should start from.
  135. MOVL(U32(0), nextEmitL)
  136. const inputMargin = 8
  137. tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
  138. MOVQ(lenSrcQ, tmp)
  139. LEAQ(Mem{Base: tmp, Disp: -5}, tmp2)
  140. // sLimitL := len(src) - inputMargin
  141. LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)
  142. assert(func(ok LabelRef) {
  143. CMPQ(tmp3, lenSrcQ)
  144. JL(ok)
  145. })
  146. MOVL(tmp3.As32(), sLimitL)
  147. // dstLimit := (len(src) - 5 ) - len(src)>>5
  148. SHRQ(U8(5), tmp)
  149. SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp
  150. assert(func(ok LabelRef) {
  151. // if len(src) > len(src) - len(src)>>5 - 5: ok
  152. CMPQ(lenSrcQ, tmp2)
  153. JGE(ok)
  154. })
  155. LEAQ(Mem{Base: dst, Index: tmp2, Scale: 1}, tmp2)
  156. MOVQ(tmp2, dstLimitPtrQ)
  157. // Store dst start address
  158. //MOVQ(dstAddr, dstStartPtrQ)
  159. }
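// In Go terms: sLimit = len(src) - inputMargin, and dstLimitPtrQ holds
// dst + (len(src)-5) - len(src)>>5. Once the output pointer reaches that
// limit the encoder gives up and returns 0, since the result would not be
// meaningfully smaller than the input.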
  160. // s = 1
  161. s := GP32()
  162. MOVL(U32(1), s)
  163. // repeatL = 1
  164. MOVL(s, repeatL)
  165. src := GP64()
  166. Load(Param("src").Base(), src)
  167. // Load cv
  168. Label("search_loop_" + name)
  169. candidate := GP32()
  170. {
  171. assert(func(ok LabelRef) {
  172. // Check if somebody changed src
  173. tmp := GP64()
  174. MOVQ(srcBaseQ, tmp)
  175. CMPQ(tmp, src)
  176. JEQ(ok)
  177. })
  178. assert(func(ok LabelRef) {
  179. // Check if s is valid
  180. tmp := GP64()
  181. MOVQ(lenSrcQ, tmp)
  182. CMPQ(tmp, s.As64())
  183. JG(ok)
  184. })
  185. cv := GP64()
  186. MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv)
  187. nextS := GP32()
  188. // nextS := s + (s-nextEmit)>>skipLog + 4
  189. {
  190. tmp := GP64()
  191. MOVL(s, tmp.As32()) // tmp = s
  192. SUBL(nextEmitL, tmp.As32()) // tmp = s - nextEmit
  193. SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog
  194. LEAL(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS)
  195. }
  196. // if nextS > sLimit {goto emitRemainder}
  197. {
  198. tmp := GP32()
  199. MOVL(sLimitL, tmp.As32())
  200. CMPL(nextS.As32(), tmp.As32())
  201. JGT(LabelRef("emit_remainder_" + name))
  202. }
  203. // move nextS to stack.
  204. MOVL(nextS.As32(), nextSTempL)
  205. candidate2 := GP32()
  206. hasher := hashN(hashBytes, tableBits)
  207. {
  208. hash0, hash1 := GP64(), GP64()
  209. MOVQ(cv, hash0)
  210. MOVQ(cv, hash1)
  211. SHRQ(U8(8), hash1)
  212. hasher.hash(hash0)
  213. hasher.hash(hash1)
  214. MOVL(table.Idx(hash0, 4), candidate)
  215. MOVL(table.Idx(hash1, 4), candidate2)
  216. assert(func(ok LabelRef) {
  217. CMPQ(hash0, U32(tableSize))
  218. JL(ok)
  219. })
  220. assert(func(ok LabelRef) {
  221. CMPQ(hash1, U32(tableSize))
  222. JL(ok)
  223. })
  224. MOVL(s, table.Idx(hash0, 4))
  225. tmp := GP32()
  226. LEAL(Mem{Base: s, Disp: 1}, tmp)
  227. MOVL(tmp, table.Idx(hash1, 4))
  228. }
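// The table now maps hash(src[s:]) -> s and hash(src[s+1:]) -> s+1, while
// candidate and candidate2 still hold the previous entries for those hashes.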
  229. // Can be moved up if registers are available.
  230. hash2 := GP64()
  231. {
  232. // hash2 := hash6(cv>>16, tableBits)
  233. // hasher = hash6(tableBits)
  234. MOVQ(cv, hash2)
  235. SHRQ(U8(16), hash2)
  236. hasher.hash(hash2)
  237. assert(func(ok LabelRef) {
  238. CMPQ(hash2, U32(tableSize))
  239. JL(ok)
  240. })
  241. }
  242. // En/disable repeat matching.
  243. if true {
  244. // Check repeat at offset checkRep
  245. const checkRep = 1
  246. {
  247. // rep = s - repeat
  248. rep := GP32()
  249. MOVL(s, rep)
  250. SUBL(repeatL, rep) // rep = s - repeat
  251. // if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
  252. left, right := GP64(), GP64()
  253. MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32())
  254. MOVQ(cv, left)
  255. SHRQ(U8(checkRep*8), left)
  256. CMPL(left.As32(), right.As32())
  257. // BAIL, no repeat.
  258. JNE(LabelRef("no_repeat_found_" + name))
  259. }
  260. // base = s + checkRep
  261. base := GP32()
  262. LEAL(Mem{Base: s, Disp: checkRep}, base)
  263. // nextEmit before repeat.
  264. nextEmit := GP32()
  265. MOVL(nextEmitL, nextEmit)
  266. // Extend back
  267. if true {
  268. i := GP32()
  269. MOVL(base, i)
  270. SUBL(repeatL, i)
  271. JZ(LabelRef("repeat_extend_back_end_" + name))
  272. Label("repeat_extend_back_loop_" + name)
  273. // if base <= nextEmit {exit}
  274. CMPL(base.As32(), nextEmit)
  275. JLE(LabelRef("repeat_extend_back_end_" + name))
  276. // if src[i-1] == src[base-1]
  277. tmp, tmp2 := GP64(), GP64()
  278. MOVB(Mem{Base: src, Index: i, Scale: 1, Disp: -1}, tmp.As8())
  279. MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8())
  280. CMPB(tmp.As8(), tmp2.As8())
  281. JNE(LabelRef("repeat_extend_back_end_" + name))
  282. LEAL(Mem{Base: base, Disp: -1}, base)
  283. DECL(i)
  284. JNZ(LabelRef("repeat_extend_back_loop_" + name))
  285. }
  286. Label("repeat_extend_back_end_" + name)
  287. // Base is now at start. Emit until base.
  288. // d += emitLiteral(dst[d:], src[nextEmit:base])
  289. if true {
  290. emitLiteralsDstP(nextEmitL, base, src, dst, "repeat_emit_"+name, avx)
  291. }
  292. // Extend forward
  293. {
  294. // s += 4 + checkRep
  295. ADDL(U8(4+checkRep), s)
  296. if true {
  297. // candidate := s - repeat + 4 + checkRep
  298. MOVL(s, candidate)
  299. SUBL(repeatL, candidate) // candidate = s - repeat
  300. // srcLeft = len(src) - s
  301. srcLeft := GP64()
  302. MOVQ(lenSrcQ, srcLeft)
  303. SUBL(s, srcLeft.As32())
  304. assert(func(ok LabelRef) {
  305. // if srcleft < maxint32: ok
  306. CMPQ(srcLeft, U32(0x7fffffff))
  307. JL(ok)
  308. })
  309. // Forward address
  310. forwardStart := GP64()
  311. LEAQ(Mem{Base: src, Index: s, Scale: 1}, forwardStart)
  312. // End address
  313. backStart := GP64()
  314. LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, backStart)
  315. length := matchLen("repeat_extend", forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name))
  316. Label("repeat_extend_forward_end_" + name)
  317. // s+= length
  318. ADDL(length.As32(), s)
  319. }
  320. }
  321. // Emit
  322. if true {
  323. // length = s-base
  324. length := GP32()
  325. MOVL(s, length)
  326. SUBL(base.As32(), length) // length = s - base
  327. offsetVal := GP32()
  328. MOVL(repeatL, offsetVal)
  329. if !snappy {
  330. // if nextEmit == 0 {do copy instead...}
  331. TESTL(nextEmit, nextEmit)
  332. JZ(LabelRef("repeat_as_copy_" + name))
  333. // Emit as repeat...
  334. emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
  335. // Emit as copy instead...
  336. Label("repeat_as_copy_" + name)
  337. }
  338. emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name), snappy)
  339. Label("repeat_end_emit_" + name)
  340. // Store new dst and nextEmit
  341. MOVL(s, nextEmitL)
  342. }
  343. // if s >= sLimit
  344. // can be omitted.
  345. if true {
  346. CMPL(s, sLimitL)
  347. JGE(LabelRef("emit_remainder_" + name))
  348. }
  349. JMP(LabelRef("search_loop_" + name))
  350. }
  351. Label("no_repeat_found_" + name)
  352. {
  353. // Check candidates are ok. All must be < s and < len(src)
  354. assert(func(ok LabelRef) {
  355. tmp := GP64()
  356. MOVQ(lenSrcQ, tmp)
  357. CMPL(tmp.As32(), candidate)
  358. JG(ok)
  359. })
  360. assert(func(ok LabelRef) {
  361. CMPL(s, candidate)
  362. JG(ok)
  363. })
  364. assert(func(ok LabelRef) {
  365. tmp := GP64()
  366. MOVQ(lenSrcQ, tmp)
  367. CMPL(tmp.As32(), candidate2)
  368. JG(ok)
  369. })
  370. assert(func(ok LabelRef) {
  371. CMPL(s, candidate2)
  372. JG(ok)
  373. })
  374. CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
  375. JEQ(LabelRef("candidate_match_" + name))
  376. tmp := GP32()
  377. // cv >>= 8
  378. SHRQ(U8(8), cv)
  379. // candidate = int(table[hash2]) - load early.
  380. MOVL(table.Idx(hash2, 4), candidate)
  381. assert(func(ok LabelRef) {
  382. tmp := GP64()
  383. MOVQ(lenSrcQ, tmp)
  384. CMPL(tmp.As32(), candidate)
  385. JG(ok)
  386. })
  387. assert(func(ok LabelRef) {
  388. // We may get s and s+1
  389. tmp := GP32()
  390. LEAL(Mem{Base: s, Disp: 2}, tmp)
  391. CMPL(tmp, candidate)
  392. JG(ok)
  393. })
  394. LEAL(Mem{Base: s, Disp: 2}, tmp)
  395. //if uint32(cv>>8) == load32(src, candidate2)
  396. CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32())
  397. JEQ(LabelRef("candidate2_match_" + name))
  398. // table[hash2] = uint32(s + 2)
  399. MOVL(tmp, table.Idx(hash2, 4))
  400. // cv >>= 8 (>> 16 total)
  401. SHRQ(U8(8), cv)
  402. // if uint32(cv>>16) == load32(src, candidate)
  403. CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
  404. JEQ(LabelRef("candidate3_match_" + name))
  405. // No match found, next loop
  406. // s = nextS
  407. MOVL(nextSTempL, s)
  408. JMP(LabelRef("search_loop_" + name))
  409. // Matches candidate at s + 2 (3rd check)
  410. Label("candidate3_match_" + name)
  411. ADDL(U8(2), s)
  412. JMP(LabelRef("candidate_match_" + name))
  413. // Match at s + 1 (we calculated the hash, let's store it)
  414. Label("candidate2_match_" + name)
  415. // table[hash2] = uint32(s + 2)
  416. MOVL(tmp, table.Idx(hash2, 4))
  417. // s++
  418. INCL(s)
  419. MOVL(candidate2, candidate)
  420. }
  421. }
  422. Label("candidate_match_" + name)
  423. // We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
  424. // Extend backwards
  425. if true {
  426. ne := GP32()
  427. MOVL(nextEmitL, ne)
  428. TESTL(candidate, candidate)
  429. JZ(LabelRef("match_extend_back_end_" + name))
  430. // candidate is tested when decremented, so we loop back here.
  431. Label("match_extend_back_loop_" + name)
  432. // if s <= nextEmit {exit}
  433. CMPL(s, ne)
  434. JLE(LabelRef("match_extend_back_end_" + name))
  435. // if src[candidate-1] == src[s-1]
  436. tmp, tmp2 := GP64(), GP64()
  437. MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8())
  438. MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8())
  439. CMPB(tmp.As8(), tmp2.As8())
  440. JNE(LabelRef("match_extend_back_end_" + name))
  441. LEAL(Mem{Base: s, Disp: -1}, s)
  442. DECL(candidate)
  443. JZ(LabelRef("match_extend_back_end_" + name))
  444. JMP(LabelRef("match_extend_back_loop_" + name))
  445. }
  446. Label("match_extend_back_end_" + name)
  447. // Bail if we exceed the maximum size.
  448. if true {
  449. // tmp = s-nextEmit
  450. tmp := GP64()
  451. MOVL(s, tmp.As32())
  452. SUBL(nextEmitL, tmp.As32())
  453. // tmp = &dst + s-nextEmit
  454. LEAQ(Mem{Base: dst, Index: tmp, Scale: 1, Disp: literalMaxOverhead}, tmp)
  455. CMPQ(tmp, dstLimitPtrQ)
  456. JL(LabelRef("match_dst_size_check_" + name))
  457. ri, err := ReturnIndex(0).Resolve()
  458. if err != nil {
  459. panic(err)
  460. }
  461. MOVQ(U32(0), ri.Addr)
  462. RET()
  463. }
  464. Label("match_dst_size_check_" + name)
  465. {
  466. base := GP32()
  467. MOVL(s, base.As32())
  468. emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name, avx)
  469. }
  470. Label("match_nolit_loop_" + name)
  471. {
  472. // Update repeat
  473. {
  474. // repeat = s - candidate
  475. repeatVal := GP64().As32()
  476. MOVL(s, repeatVal)
  477. SUBL(candidate, repeatVal)
  478. MOVL(repeatVal, repeatL)
  479. }
  480. // s+=4, candidate+=4
  481. ADDL(U8(4), s)
  482. ADDL(U8(4), candidate)
  483. // Extend the 4-byte match as long as possible and emit copy.
  484. {
  485. assert(func(ok LabelRef) {
  486. // s must be > candidate; they cannot be equal.
  487. CMPL(s, candidate)
  488. JG(ok)
  489. })
  490. // srcLeft = len(src) - s
  491. srcLeft := GP64()
  492. MOVQ(lenSrcQ, srcLeft)
  493. SUBL(s, srcLeft.As32())
  494. assert(func(ok LabelRef) {
  495. // if srcleft < maxint32: ok
  496. CMPQ(srcLeft, U32(0x7fffffff))
  497. JL(ok)
  498. })
  499. a, b := GP64(), GP64()
  500. LEAQ(Mem{Base: src, Index: s, Scale: 1}, a)
  501. LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b)
  502. length := matchLen("match_nolit_"+name,
  503. a, b,
  504. srcLeft,
  505. LabelRef("match_nolit_end_"+name),
  506. )
  507. Label("match_nolit_end_" + name)
  508. // s += length (length is destroyed, use it now)
  509. ADDL(length.As32(), s)
  510. // Load offset from repeat value.
  511. offset := GP64()
  512. MOVL(repeatL, offset.As32())
  513. // length += 4
  514. ADDL(U8(4), length)
  515. emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name), snappy)
  516. Label("match_nolit_emitcopy_end_" + name)
  517. MOVL(s, nextEmitL)
  518. // if s >= sLimit { end }
  519. CMPL(s, sLimitL)
  520. JGE(LabelRef("emit_remainder_" + name))
  521. // Bail if we exceed the maximum size.
  522. {
  523. CMPQ(dst, dstLimitPtrQ)
  524. JL(LabelRef("match_nolit_dst_ok_" + name))
  525. ri, err := ReturnIndex(0).Resolve()
  526. if err != nil {
  527. panic(err)
  528. }
  529. MOVQ(U32(0), ri.Addr)
  530. RET()
  531. Label("match_nolit_dst_ok_" + name)
  532. }
  533. }
  534. {
  535. // Check for an immediate match, otherwise start search at s+1
  536. x := GP64()
  537. // Index s-2
  538. MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, x)
  539. hasher := hashN(hashBytes, tableBits)
  540. hash0, hash1 := GP64(), GP64()
  541. MOVQ(x, hash0) // s-2
  542. SHRQ(U8(16), x)
  543. MOVQ(x, hash1) // s
  544. hasher.hash(hash0)
  545. hasher.hash(hash1)
  546. sm2 := GP32() // s - 2
  547. LEAL(Mem{Base: s, Disp: -2}, sm2)
  548. assert(func(ok LabelRef) {
  549. CMPQ(hash0, U32(tableSize))
  550. JL(ok)
  551. })
  552. assert(func(ok LabelRef) {
  553. CMPQ(hash1, U32(tableSize))
  554. JL(ok)
  555. })
  556. MOVL(table.Idx(hash1, 4), candidate)
  557. MOVL(sm2, table.Idx(hash0, 4))
  558. MOVL(s, table.Idx(hash1, 4))
  559. CMPL(Mem{Base: src, Index: candidate, Scale: 1}, x.As32())
  560. JEQ(LabelRef("match_nolit_loop_" + name))
  561. INCL(s)
  562. }
  563. JMP(LabelRef("search_loop_" + name))
  564. }
  565. Label("emit_remainder_" + name)
  566. // Bail if we exceed the maximum size.
  567. // if d+len(src)-nextEmitL > dstLimitPtrQ { return 0 }
  568. {
  569. // remain = len(src) - nextEmit
  570. remain := GP64()
  571. MOVQ(lenSrcQ, remain)
  572. SUBL(nextEmitL, remain.As32())
  573. dstExpect := GP64()
  574. // dst := dst + (len(src)-nextEmitL)
  575. LEAQ(Mem{Base: dst, Index: remain, Scale: 1, Disp: literalMaxOverhead}, dstExpect)
  576. CMPQ(dstExpect, dstLimitPtrQ)
  577. JL(LabelRef("emit_remainder_ok_" + name))
  578. ri, err := ReturnIndex(0).Resolve()
  579. if err != nil {
  580. panic(err)
  581. }
  582. MOVQ(U32(0), ri.Addr)
  583. RET()
  584. Label("emit_remainder_ok_" + name)
  585. }
  586. // emitLiteral(dst[d:], src[nextEmitL:])
  587. emitEnd := GP64()
  588. MOVQ(lenSrcQ, emitEnd)
  589. // Emit final literals.
  590. emitLiteralsDstP(nextEmitL, emitEnd, src, dst, "emit_remainder_"+name, avx)
  591. // Assert size is < limit
  592. assert(func(ok LabelRef) {
  593. // if dstBaseQ < dstLimitPtrQ: ok
  594. CMPQ(dst, dstLimitPtrQ)
  595. JL(ok)
  596. })
  597. // length := start - base (ptr arithmetic)
  598. length := GP64()
  599. base := Load(Param("dst").Base(), GP64())
  600. MOVQ(dst, length)
  601. SUBQ(base, length)
  602. // Assert size is < len(src)
  603. assert(func(ok LabelRef) {
  604. // if len(src) >= length: ok
  605. CMPQ(lenSrcQ, length)
  606. JGE(ok)
  607. })
  608. // Assert size is < len(dst)
  609. assert(func(ok LabelRef) {
  610. // if len(dst) >= length: ok
  611. CMPQ(lenDstQ, length)
  612. JGE(ok)
  613. })
  614. Store(length, ReturnIndex(0))
  615. RET()
  616. }
  617. // emitLiterals emits literals from nextEmit to base, updating nextEmit and dstBase.
  618. // Skips the emit if base == nextEmit.
  619. // src and base are untouched.
  620. func emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string, avx bool) {
  621. nextEmit, litLen, dstBaseTmp, litBase := GP32(), GP32(), GP64(), GP64()
  622. MOVL(nextEmitL, nextEmit)
  623. CMPL(nextEmit, base.As32())
  624. JEQ(LabelRef("emit_literal_skip_" + name))
  625. MOVL(base.As32(), litLen.As32())
  626. // Base is now next emit.
  627. MOVL(base.As32(), nextEmitL)
  628. // litBase = src[nextEmitL:]
  629. LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
  630. SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit
  631. // Load (and store when we return)
  632. MOVQ(dstBase, dstBaseTmp)
  633. emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), avx, true)
  634. Label("emit_literal_done_" + name)
  635. // Emitted length must be > litlen.
  636. // We have already checked for len(0) above.
  637. assert(func(ok LabelRef) {
  638. tmp := GP64()
  639. MOVQ(dstBaseTmp, tmp)
  640. SUBQ(dstBase, tmp) // tmp = dstBaseTmp - dstBase
  641. // if tmp > litLen: ok
  642. CMPQ(tmp, litLen.As64())
  643. JG(ok)
  644. })
  645. // Store updated dstBase
  646. MOVQ(dstBaseTmp, dstBase)
  647. Label("emit_literal_skip_" + name)
  648. }
  649. // emitLiteralsDstP emits literals from nextEmit to base, updating nextEmit.
  650. // Skips the emit if base == nextEmit.
  651. // src and base are untouched; dst is advanced past the emitted bytes.
  652. func emitLiteralsDstP(nextEmitL Mem, base reg.GPVirtual, src, dst reg.GPVirtual, name string, avx bool) {
  653. nextEmit, litLen, litBase := GP32(), GP32(), GP64()
  654. MOVL(nextEmitL, nextEmit)
  655. CMPL(nextEmit, base.As32())
  656. JEQ(LabelRef("emit_literal_done_" + name))
  657. MOVL(base.As32(), litLen.As32())
  658. // Base is now next emit.
  659. MOVL(base.As32(), nextEmitL)
  660. // litBase = src[nextEmitL:]
  661. LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
  662. SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit
  663. // Load (and store when we return)
  664. emitLiteral(name, litLen, nil, dst, litBase, LabelRef("emit_literal_done_"+name), avx, true)
  665. Label("emit_literal_done_" + name)
  666. }
  667. type hashGen struct {
  668. bytes int
  669. tablebits int
  670. mulreg reg.GPVirtual
  671. }
  672. // hashN returns a hash generator that multiplies to produce a 'tablebits'-bit hash of the lowest 'hashBytes' bytes of a value.
  673. func hashN(hashBytes, tablebits int) hashGen {
  674. h := hashGen{
  675. bytes: hashBytes,
  676. tablebits: tablebits,
  677. mulreg: GP64(),
  678. }
  679. primebytes := uint64(0)
  680. switch hashBytes {
  681. case 3:
  682. primebytes = 506832829
  683. case 4:
  684. primebytes = 2654435761
  685. case 5:
  686. primebytes = 889523592379
  687. case 6:
  688. primebytes = 227718039650203
  689. case 7:
  690. primebytes = 58295818150454627
  691. case 8:
  692. primebytes = 0xcf1bbcdcb7a56463
  693. default:
  694. panic("invalid hash length")
  695. }
  696. MOVQ(Imm(primebytes), h.mulreg)
  697. return h
  698. }
  699. // hash uses multiply to get hash of the value.
  700. func (h hashGen) hash(val reg.GPVirtual) {
  701. // Move value to top of register.
  702. SHLQ(U8(64-8*h.bytes), val)
  703. IMULQ(h.mulreg, val)
  704. // Move value to bottom
  705. SHRQ(U8(64-h.tablebits), val)
  706. }
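// hashRef is a pure-Go sketch of what the generated hash code computes: shift
// the low 'bytes' bytes to the top of the word, multiply by the prime, and
// keep the top 'tablebits' bits. Illustrative only; the generator never calls it.
func hashRef(u uint64, bytes, tablebits int, prime uint64) uint32 {
	return uint32(((u << (64 - 8*bytes)) * prime) >> (64 - tablebits))
}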
  707. func genEmitLiteral() {
  708. TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int")
  709. Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "",
  710. "It assumes that:",
  711. " dst is long enough to hold the encoded bytes",
  712. " 0 <= len(lit) && len(lit) <= math.MaxUint32", "")
  713. Pragma("noescape")
  714. dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64()
  715. Load(Param("dst").Base(), dstBase)
  716. Load(Param("lit").Base(), litBase)
  717. Load(Param("lit").Len(), litLen)
  718. emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_standalone", false, false)
  719. Label("emit_literal_end_standalone")
  720. Store(retval, ReturnIndex(0))
  721. RET()
  722. TEXT("emitLiteralAvx", NOSPLIT, "func(dst, lit []byte) int")
  723. Doc("emitLiteralAvx writes a literal chunk and returns the number of bytes written.", "",
  724. "It assumes that:",
  725. " dst is long enough to hold the encoded bytes",
  726. " 0 <= len(lit) && len(lit) <= math.MaxUint32", "")
  727. Pragma("noescape")
  728. dstBase, litBase, litLen, retval = GP64(), GP64(), GP64(), GP64()
  729. Load(Param("dst").Base(), dstBase)
  730. Load(Param("lit").Base(), litBase)
  731. Load(Param("lit").Len(), litLen)
  732. emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_avx_standalone", true, false)
  733. Label("emit_literal_end_avx_standalone")
  734. Store(retval, ReturnIndex(0))
  735. RET()
  736. }
  737. // emitLiteral can be used for inlining an emitLiteral call.
  738. // stack must have at least 32 bytes.
  739. // retval will contain emitted bytes, but can be nil if this is not interesting.
  740. // dstBase and litBase are updated.
  741. // Uses 2 GP registers. With AVX 4 registers.
  742. // If updateDst is true dstBase will have the updated end pointer and an additional register will be used.
  743. func emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, avx, updateDst bool) {
  744. n := GP32()
  745. n16 := GP32()
  746. // We always add litLen bytes
  747. if retval != nil {
  748. MOVL(litLen.As32(), retval.As32())
  749. }
  750. MOVL(litLen.As32(), n)
  751. SUBL(U8(1), n.As32())
  752. // Return if litLen was 0 (the subtraction above set the carry flag).
  753. JC(end)
  754. // Find number of bytes to emit for tag.
  755. CMPL(n.As32(), U8(60))
  756. JLT(LabelRef("one_byte_" + name))
  757. CMPL(n.As32(), U32(1<<8))
  758. JLT(LabelRef("two_bytes_" + name))
  759. CMPL(n.As32(), U32(1<<16))
  760. JLT(LabelRef("three_bytes_" + name))
  761. CMPL(n.As32(), U32(1<<24))
  762. JLT(LabelRef("four_bytes_" + name))
  763. Label("five_bytes_" + name)
  764. MOVB(U8(252), Mem{Base: dstBase})
  765. MOVL(n.As32(), Mem{Base: dstBase, Disp: 1})
  766. if retval != nil {
  767. ADDQ(U8(5), retval)
  768. }
  769. ADDQ(U8(5), dstBase)
  770. JMP(LabelRef("memmove_" + name))
  771. Label("four_bytes_" + name)
  772. MOVL(n, n16)
  773. SHRL(U8(16), n16.As32())
  774. MOVB(U8(248), Mem{Base: dstBase})
  775. MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
  776. MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3})
  777. if retval != nil {
  778. ADDQ(U8(4), retval)
  779. }
  780. ADDQ(U8(4), dstBase)
  781. JMP(LabelRef("memmove_" + name))
  782. Label("three_bytes_" + name)
  783. MOVB(U8(0xf4), Mem{Base: dstBase})
  784. MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
  785. if retval != nil {
  786. ADDQ(U8(3), retval)
  787. }
  788. ADDQ(U8(3), dstBase)
  789. JMP(LabelRef("memmove_" + name))
  790. Label("two_bytes_" + name)
  791. MOVB(U8(0xf0), Mem{Base: dstBase})
  792. MOVB(n.As8(), Mem{Base: dstBase, Disp: 1})
  793. if retval != nil {
  794. ADDQ(U8(2), retval)
  795. }
  796. ADDQ(U8(2), dstBase)
  797. JMP(LabelRef("memmove_" + name))
  798. Label("one_byte_" + name)
  799. SHLB(U8(2), n.As8())
  800. MOVB(n.As8(), Mem{Base: dstBase})
  801. if retval != nil {
  802. ADDQ(U8(1), retval)
  803. }
  804. ADDQ(U8(1), dstBase)
  805. // Fallthrough
  806. Label("memmove_" + name)
  807. // copy(dst[i:], lit)
  808. if true {
  809. dstEnd := GP64()
  810. copyEnd := end
  811. if updateDst {
  812. copyEnd = LabelRef("memmove_end_copy_" + name)
  813. LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd)
  814. }
  815. length := GP64()
  816. MOVL(litLen.As32(), length.As32())
  817. // updates litBase.
  818. genMemMove2("emit_lit_memmove_"+name, dstBase, litBase, length, copyEnd, avx)
  819. if updateDst {
  820. Label("memmove_end_copy_" + name)
  821. MOVQ(dstEnd, dstBase)
  822. JMP(end)
  823. }
  824. } else {
  825. // genMemMove always updates dstBase
  826. src := GP64()
  827. MOVQ(litBase, src)
  828. genMemMove("emit_lit_memmove_"+name, dstBase, litBase, litLen, end)
  829. }
  830. // Should be unreachable
  831. if debug {
  832. INT(Imm(3))
  833. }
  834. return
  835. }
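// emitLiteralRef is a pure-Go sketch of the tag encoding performed by
// emitLiteral above (illustrative only; the generator never calls it).
// The literal length is stored as length-1, either packed into the tag byte
// (lengths up to 60) or in 1-4 trailing little-endian bytes.
func emitLiteralRef(dst, lit []byte) int {
	if len(lit) == 0 {
		return 0
	}
	i, n := 0, len(lit)-1
	switch {
	case n < 60:
		dst[0] = uint8(n)<<2 | tagLiteral
		i = 1
	case n < 1<<8:
		dst[0], dst[1] = 60<<2|tagLiteral, uint8(n)
		i = 2
	case n < 1<<16:
		dst[0], dst[1], dst[2] = 61<<2|tagLiteral, uint8(n), uint8(n>>8)
		i = 3
	case n < 1<<24:
		dst[0], dst[1], dst[2], dst[3] = 62<<2|tagLiteral, uint8(n), uint8(n>>8), uint8(n>>16)
		i = 4
	default:
		dst[0], dst[1], dst[2], dst[3], dst[4] = 63<<2|tagLiteral, uint8(n), uint8(n>>8), uint8(n>>16), uint8(n>>24)
		i = 5
	}
	return i + copy(dst[i:], lit)
}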
  836. // genEmitRepeat generates a standalone emitRepeat.
  837. func genEmitRepeat() {
  838. TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
  839. Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.",
  840. "Length must be at least 4 and < 1<<32", "")
  841. Pragma("noescape")
  842. dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
  843. // retval = 0
  844. XORQ(retval, retval)
  845. Load(Param("dst").Base(), dstBase)
  846. Load(Param("offset"), offset)
  847. Load(Param("length"), length)
  848. emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end"))
  849. Label("gen_emit_repeat_end")
  850. Store(retval, ReturnIndex(0))
  851. RET()
  852. }
  853. // emitRepeat can be used for inlining an emitRepeat call.
  854. // length >= 4 and < 1<<32
  855. // length is modified. dstBase is updated. retval is added to input.
  856. // retval can be nil.
  857. // Will jump to end label when finished.
  858. // Uses 1 GP register.
  859. func emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
  860. Label("emit_repeat_again_" + name)
  861. tmp := GP32()
  862. MOVL(length.As32(), tmp) // Copy length
  863. // length -= 4
  864. LEAL(Mem{Base: length, Disp: -4}, length.As32())
  865. // if length <= 4 (use copied value)
  866. CMPL(tmp.As32(), U8(8))
  867. JLE(LabelRef("repeat_two_" + name))
  868. // length < 8 && offset < 2048
  869. CMPL(tmp.As32(), U8(12))
  870. JGE(LabelRef("cant_repeat_two_offset_" + name))
  871. CMPL(offset.As32(), U32(2048))
  872. JLT(LabelRef("repeat_two_offset_" + name))
  873. const maxRepeat = ((1 << 24) - 1) + 65536
  874. Label("cant_repeat_two_offset_" + name)
  875. CMPL(length.As32(), U32((1<<8)+4))
  876. JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4
  877. CMPL(length.As32(), U32((1<<16)+(1<<8)))
  878. JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
  879. CMPL(length.As32(), U32(maxRepeat))
  880. JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.
  881. // We have more than 24 bits
  882. // Emit so we have at least 4 bytes left.
  883. LEAL(Mem{Base: length, Disp: -(maxRepeat - 4)}, length.As32()) // length -= (maxRepeat - 4)
  884. MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
  885. MOVW(U16(65531), Mem{Base: dstBase, Disp: 2}) // 0xfffb
  886. MOVB(U8(255), Mem{Base: dstBase, Disp: 4})
  887. ADDQ(U8(5), dstBase)
  888. if retval != nil {
  889. ADDQ(U8(5), retval)
  890. }
  891. JMP(LabelRef("emit_repeat_again_" + name))
  892. // The remaining length must now be encodable within 5 bytes.
  893. Label("repeat_five_" + name)
  894. LEAL(Mem{Base: length, Disp: -65536}, length.As32()) // length -= 65536
  895. MOVL(length.As32(), offset.As32())
  896. MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
  897. MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
  898. SARL(U8(16), offset.As32()) // offset = length >> 16
  899. MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4}) // dst[4] = length >> 16
  900. if retval != nil {
  901. ADDQ(U8(5), retval) // i += 5
  902. }
  903. ADDQ(U8(5), dstBase) // dst += 5
  904. JMP(end)
  905. Label("repeat_four_" + name)
  906. LEAL(Mem{Base: length, Disp: -256}, length.As32()) // length -= 256
  907. MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 6<<2 | tagCopy1, dst[1] = 0
  908. MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
  909. if retval != nil {
  910. ADDQ(U8(4), retval) // i += 4
  911. }
  912. ADDQ(U8(4), dstBase) // dst += 4
  913. JMP(end)
  914. Label("repeat_three_" + name)
  915. LEAL(Mem{Base: length, Disp: -4}, length.As32()) // length -= 4
  916. MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 5<<2 | tagCopy1, dst[1] = 0
  917. MOVB(length.As8(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length)
  918. if retval != nil {
  919. ADDQ(U8(3), retval) // i += 3
  920. }
  921. ADDQ(U8(3), dstBase) // dst += 3
  922. JMP(end)
  923. Label("repeat_two_" + name)
  924. // dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0
  925. SHLL(U8(2), length.As32())
  926. ORL(U8(tagCopy1), length.As32())
  927. MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0
  928. if retval != nil {
  929. ADDQ(U8(2), retval) // i += 2
  930. }
  931. ADDQ(U8(2), dstBase) // dst += 2
  932. JMP(end)
  933. Label("repeat_two_offset_" + name)
  934. // Emit the remaining copy, encoded as 2 bytes.
  935. // dst[1] = uint8(offset)
  936. // dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
  937. tmp = GP64()
  938. XORQ(tmp, tmp)
  939. // Use scale and displacement to shift and subtract values from length.
  940. LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length.As32())
  941. MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
  942. SARL(U8(8), offset.As32()) // Remove lower
  943. SHLL(U8(5), offset.As32()) // Shift back up
  944. ORL(offset.As32(), length.As32()) // OR result
  945. MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
  946. if retval != nil {
  947. ADDQ(U8(2), retval) // i += 2
  948. }
  949. ADDQ(U8(2), dstBase) // dst += 2
  950. JMP(end)
  951. }
  952. // emitCopy writes a copy chunk and returns the number of bytes written.
  953. //
  954. // It assumes that:
  955. // dst is long enough to hold the encoded bytes
  956. // 1 <= offset && offset <= math.MaxUint32
  957. // 4 <= length && length <= 1 << 24
  958. // genEmitCopy generates a standalone emitCopy.
  959. func genEmitCopy() {
  960. TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int")
  961. Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "",
  962. "It assumes that:",
  963. " dst is long enough to hold the encoded bytes",
  964. " 1 <= offset && offset <= math.MaxUint32",
  965. " 4 <= length && length <= 1 << 24", "")
  966. Pragma("noescape")
  967. dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
  968. // i := 0
  969. XORQ(retval, retval)
  970. Load(Param("dst").Base(), dstBase)
  971. Load(Param("offset"), offset)
  972. Load(Param("length"), length)
  973. emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end"), false)
  974. Label("gen_emit_copy_end")
  975. Store(retval, ReturnIndex(0))
  976. RET()
  977. }
  978. // emitCopy writes a copy chunk and returns the number of bytes written.
  979. //
  980. // It assumes that:
  981. // dst is long enough to hold the encoded bytes
  982. // 1 <= offset && offset <= math.MaxUint32
  983. // 4 <= length && length <= 1 << 24
  984. // genEmitCopyNoRepeat generates a standalone emitCopyNoRepeat.
  985. func genEmitCopyNoRepeat() {
  986. TEXT("emitCopyNoRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
  987. Doc("emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.", "",
  988. "It assumes that:",
  989. " dst is long enough to hold the encoded bytes",
  990. " 1 <= offset && offset <= math.MaxUint32",
  991. " 4 <= length && length <= 1 << 24", "")
  992. Pragma("noescape")
  993. dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
  994. // i := 0
  995. XORQ(retval, retval)
  996. Load(Param("dst").Base(), dstBase)
  997. Load(Param("offset"), offset)
  998. Load(Param("length"), length)
  999. emitCopy("standalone_snappy", length, offset, retval, dstBase, "gen_emit_copy_end_snappy", true)
  1000. Label("gen_emit_copy_end_snappy")
  1001. Store(retval, ReturnIndex(0))
  1002. RET()
  1003. }
  1004. const (
  1005. tagLiteral = 0x00
  1006. tagCopy1 = 0x01
  1007. tagCopy2 = 0x02
  1008. tagCopy4 = 0x03
  1009. )
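// The tags above follow the Snappy chunk format: the two low bits of the
// first byte select a literal or a copy with a 1, 2 or 4 byte offset.
// As an illustrative sketch (not used by the generator), the 3-byte tagCopy2
// form emitted by emitCopy below is:
func emitCopy2Ref(dst []byte, offset, length int) int {
	dst[0] = uint8(length-1)<<2 | tagCopy2
	dst[1] = uint8(offset)
	dst[2] = uint8(offset >> 8)
	return 3
}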
  1010. // emitCopy can be used for inlining an emitCopy call.
  1011. // length is modified (and junk). dstBase is updated. retval is added to input.
  1012. // retval can be nil.
  1013. // Will jump to end label when finished.
  1014. // Uses 2 GP registers.
  1015. func emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef, snappy bool) {
  1016. //if offset >= 65536 {
  1017. CMPL(offset.As32(), U32(65536))
  1018. JL(LabelRef("two_byte_offset_" + name))
  1019. // offset is >= 65536
  1020. // if length <= 64 goto four_bytes_remain_
  1021. Label("four_bytes_loop_back_" + name)
  1022. CMPL(length.As32(), U8(64))
  1023. JLE(LabelRef("four_bytes_remain_" + name))
  1024. // Emit a length 64 copy, encoded as 5 bytes.
  1025. // dst[0] = 63<<2 | tagCopy4
  1026. MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase})
  1027. // dst[4] = uint8(offset >> 24)
  1028. // dst[3] = uint8(offset >> 16)
  1029. // dst[2] = uint8(offset >> 8)
  1030. // dst[1] = uint8(offset)
  1031. MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1})
  1032. // length -= 64
  1033. LEAL(Mem{Base: length, Disp: -64}, length.As32())
  1034. if retval != nil {
  1035. ADDQ(U8(5), retval) // i+=5
  1036. }
  1037. ADDQ(U8(5), dstBase) // dst+=5
  1038. // if length >= 4 {
  1039. CMPL(length.As32(), U8(4))
  1040. JL(LabelRef("four_bytes_remain_" + name))
  1041. // Emit remaining as repeats
  1042. // return 5 + emitRepeat(dst[5:], offset, length)
  1043. // Inline call to emitRepeat. Will jump to end
  1044. if !snappy {
  1045. emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end)
  1046. }
  1047. JMP(LabelRef("four_bytes_loop_back_" + name))
  1048. Label("four_bytes_remain_" + name)
  1049. // if length == 0 {
  1050. // return i
  1051. // }
  1052. TESTL(length.As32(), length.As32())
  1053. JZ(end)
  1054. // Emit a copy, offset encoded as 4 bytes.
  1055. // dst[i+0] = uint8(length-1)<<2 | tagCopy4
  1056. // dst[i+1] = uint8(offset)
  1057. // dst[i+2] = uint8(offset >> 8)
  1058. // dst[i+3] = uint8(offset >> 16)
  1059. // dst[i+4] = uint8(offset >> 24)
  1060. tmp := GP64()
  1061. MOVB(U8(tagCopy4), tmp.As8())
  1062. // Use displacement to subtract 1 from upshifted length.
  1063. LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32())
  1064. MOVB(length.As8(), Mem{Base: dstBase})
  1065. MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1})
  1066. // return i + 5
  1067. if retval != nil {
  1068. ADDQ(U8(5), retval)
  1069. }
  1070. ADDQ(U8(5), dstBase)
  1071. JMP(end)
  1072. Label("two_byte_offset_" + name)
  1073. // Offset no more than 2 bytes.
  1074. //if length > 64 {
  1075. CMPL(length.As32(), U8(64))
  1076. JLE(LabelRef("two_byte_offset_short_" + name))
  1077. // Emit a length 60 copy, encoded as 3 bytes.
  1078. // Emit remaining as repeat value (minimum 4 bytes).
  1079. // dst[2] = uint8(offset >> 8)
  1080. // dst[1] = uint8(offset)
  1081. // dst[0] = 59<<2 | tagCopy2
  1082. MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase})
  1083. MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
  1084. // length -= 60
  1085. LEAL(Mem{Base: length, Disp: -60}, length.As32())
  1086. // Emit remaining as repeats, at least 4 bytes remain.
  1087. // return 3 + emitRepeat(dst[3:], offset, length)
  1088. //}
  1089. ADDQ(U8(3), dstBase)
  1090. if retval != nil {
  1091. ADDQ(U8(3), retval)
  1092. }
  1093. // Inline call to emitRepeat. Will jump to end
  1094. if !snappy {
  1095. emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end)
  1096. }
  1097. JMP(LabelRef("two_byte_offset_" + name))
  1098. Label("two_byte_offset_short_" + name)
  1099. //if length >= 12 || offset >= 2048 {
  1100. CMPL(length.As32(), U8(12))
  1101. JGE(LabelRef("emit_copy_three_" + name))
  1102. CMPL(offset.As32(), U32(2048))
  1103. JGE(LabelRef("emit_copy_three_" + name))
  1104. // Emit the remaining copy, encoded as 2 bytes.
  1105. // dst[1] = uint8(offset)
  1106. // dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
  1107. tmp = GP64()
  1108. MOVB(U8(tagCopy1), tmp.As8())
  1109. // Use scale and displacement to shift and subtract values from length.
  1110. LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length.As32())
  1111. MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
  1112. SHRL(U8(8), offset.As32()) // Remove lower
  1113. SHLL(U8(5), offset.As32()) // Shift back up
  1114. ORL(offset.As32(), length.As32()) // OR result
  1115. MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
  1116. if retval != nil {
  1117. ADDQ(U8(2), retval) // i += 2
  1118. }
  1119. ADDQ(U8(2), dstBase) // dst += 2
  1120. // return 2
  1121. JMP(end)
  1122. Label("emit_copy_three_" + name)
  1123. // // Emit the remaining copy, encoded as 3 bytes.
  1124. // dst[2] = uint8(offset >> 8)
  1125. // dst[1] = uint8(offset)
  1126. // dst[0] = uint8(length-1)<<2 | tagCopy2
  1127. tmp = GP64()
  1128. MOVB(U8(tagCopy2), tmp.As8())
  1129. LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32())
  1130. MOVB(length.As8(), Mem{Base: dstBase})
  1131. MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
  1132. // return 3
  1133. if retval != nil {
  1134. ADDQ(U8(3), retval) // i += 3
  1135. }
  1136. ADDQ(U8(3), dstBase) // dst += 3
  1137. JMP(end)
  1138. }
  1139. // func memmove(to, from unsafe.Pointer, n uintptr)
  1140. // On return, to and from point past the copied data and n is 0.
  1141. // to and from may not overlap.
  1142. // Fairly simplistic for now; can of course be extended.
  1143. // Uses one GP register and 8 SSE registers.
  1144. func genMemMove(name string, to, from, n reg.GPVirtual, end LabelRef) {
  1145. tmp := GP32()
  1146. MOVL(n.As32(), tmp)
  1147. // tmp = n/128
  1148. SHRL(U8(7), tmp)
  1149. TESTL(tmp, tmp)
  1150. JZ(LabelRef("done_128_" + name))
  1151. Label("loop_128_" + name)
  1152. var xmmregs [8]reg.VecVirtual
  1153. // Prefetch destination for next loop.
  1154. // Prefetching source doesn't provide speedup.
  1155. // This seems to give a small boost.
  1156. const preOff = 128
  1157. PREFETCHT0(Mem{Base: to, Disp: preOff})
  1158. PREFETCHT0(Mem{Base: to, Disp: preOff + 64})
  1159. for i := 0; i < 8; i++ {
  1160. xmmregs[i] = XMM()
  1161. MOVOU(Mem{Base: from}.Offset(i*16), xmmregs[i])
  1162. }
  1163. for i := 0; i < 8; i++ {
  1164. MOVOU(xmmregs[i], Mem{Base: to}.Offset(i*16))
  1165. }
  1166. LEAL(Mem{Base: n, Disp: -128}, n.As32())
  1167. ADDQ(U8(8*16), from)
  1168. ADDQ(U8(8*16), to)
  1169. DECL(tmp)
  1170. JNZ(LabelRef("loop_128_" + name))
  1171. Label("done_128_" + name)
  1172. MOVL(n.As32(), tmp)
  1173. // tmp = n/16
  1174. SHRL(U8(4), tmp)
  1175. TESTL(tmp, tmp)
  1176. JZ(LabelRef("done_16_" + name))
  1177. Label("loop_16_" + name)
  1178. xmm := XMM()
  1179. MOVOU(Mem{Base: from}, xmm)
  1180. MOVOU(xmm, Mem{Base: to})
  1181. LEAL(Mem{Base: n, Disp: -16}, n.As32())
  1182. ADDQ(U8(16), from)
  1183. ADDQ(U8(16), to)
  1184. DECL(tmp)
  1185. JNZ(LabelRef("loop_16_" + name))
  1186. Label("done_16_" + name)
  1187. // TODO: Use REP; MOVSB somehow.
  1188. TESTL(n.As32(), n.As32())
  1189. JZ(end)
  1190. Label("loop_1_" + name)
  1191. MOVB(Mem{Base: from}, tmp.As8())
  1192. MOVB(tmp.As8(), Mem{Base: to})
  1193. INCQ(from)
  1194. INCQ(to)
  1195. DECL(n.As32())
  1196. JNZ(LabelRef("loop_1_" + name))
  1197. JMP(end)
  1198. }
  1199. // func memmove(to, from unsafe.Pointer, n uintptr)
  1200. // src and dst may not overlap.
  1201. // Non-AVX uses 2 GP registers and 16 SSE2 registers.
  1202. // AVX uses 4 GP registers and 16 AVX/SSE registers.
  1203. // All passed registers may be updated.
  1204. func genMemMove2(name string, dst, src, length reg.GPVirtual, end LabelRef, avx bool) {
  1205. AX, CX := GP64(), GP64()
  1206. NOP()
  1207. name += "_memmove_"
  1208. Label(name + "tail")
  1209. // move_129through256 or smaller work whether or not the source and the
  1210. // destination memory regions overlap because they load all data into
  1211. // registers before writing it back. move_256through2048 on the other
  1212. // hand can be used only when the memory regions don't overlap or the copy
  1213. // direction is forward.
  1214. //
  1215. // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
  1216. TESTQ(length, length)
  1217. JEQ(end)
  1218. CMPQ(length, U8(2))
  1219. JBE(LabelRef(name + "move_1or2"))
  1220. CMPQ(length, U8(4))
  1221. JB(LabelRef(name + "move_3"))
  1222. JBE(LabelRef(name + "move_4"))
  1223. CMPQ(length, U8(8))
  1224. JB(LabelRef(name + "move_5through7"))
  1225. JE(LabelRef(name + "move_8"))
  1226. CMPQ(length, U8(16))
  1227. JBE(LabelRef(name + "move_9through16"))
  1228. CMPQ(length, U8(32))
  1229. JBE(LabelRef(name + "move_17through32"))
  1230. CMPQ(length, U8(64))
  1231. JBE(LabelRef(name + "move_33through64"))
  1232. CMPQ(length, U8(128))
  1233. JBE(LabelRef(name + "move_65through128"))
  1234. CMPQ(length, U32(256))
  1235. JBE(LabelRef(name + "move_129through256"))
  1236. if avx {
  1237. JMP(LabelRef(name + "avxUnaligned"))
  1238. } else {
  1239. if false {
  1240. // Don't check length for now.
  1241. Label(name + "forward")
  1242. CMPQ(length, U32(2048))
  1243. JLS(LabelRef(name + "move_256through2048"))
  1244. genMemMove(name+"fallback", dst, src, length, end)
  1245. } else {
  1246. JMP(LabelRef(name + "move_256through2048"))
  1247. }
  1248. }
  1249. /*
  1250. // If REP MOVSB isn't fast, don't use it
  1251. // FIXME: internal∕cpu·X86+const_offsetX86HasERMS(SB)
  1252. // CMPB(U8(1), U8(1)) // enhanced REP MOVSB/STOSB
  1253. JMP(LabelRef(name + "fwdBy8"))
  1254. // Check alignment
  1255. MOVL(src.As32(), AX.As32())
  1256. ORL(dst.As32(), AX.As32())
  1257. TESTL(U32(7), AX.As32())
  1258. JEQ(LabelRef(name + "fwdBy8"))
  1259. // Do 1 byte at a time
  1260. // MOVQ(length, CX)
  1261. // FIXME:
  1262. // REP; MOVSB
  1263. JMP(end)
  1264. Label(name + "fwdBy8")
  1265. // Do 8 bytes at a time
  1266. MOVQ(length, CX)
  1267. SHRQ(U8(3), CX)
  1268. ANDQ(U8(7), length)
  1269. // FIXME:
  1270. //REP; MOVSQ
  1271. JMP(LabelRef(name + "tail"))
  1272. Label(name + "back")
  1273. //check overlap
  1274. MOVQ(src, CX)
  1275. ADDQ(length, CX)
  1276. CMPQ(CX, dst)
  1277. JLS(LabelRef(name + "forward"))
  1278. //whole thing backwards has
  1279. //adjusted addresses
  1280. ADDQ(length, dst)
  1281. ADDQ(length, src)
  1282. STD()
  1283. //
  1284. // copy
  1285. //
  1286. MOVQ(length, CX)
  1287. SHRQ(U8(3), CX)
  1288. ANDQ(U8(7), length)
  1289. SUBQ(U8(8), dst)
  1290. SUBQ(U8(8), src)
  1291. // FIXME:
  1292. //REP; MOVSQ
  1293. // FIXME:
  1294. //CLD()
  1295. ADDQ(U8(8), dst)
  1296. ADDQ(U8(8), src)
  1297. SUBQ(length, dst)
  1298. SUBQ(length, src)
  1299. JMP(LabelRef(name + "tail"))
  1300. */
	Label(name + "move_1or2")
	MOVB(Mem{Base: src}, AX.As8())
	MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
	MOVB(AX.As8(), Mem{Base: dst})
	MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_4")
	MOVL(Mem{Base: src}, AX.As32())
	MOVL(AX.As32(), Mem{Base: dst})
	JMP(end)

	Label(name + "move_3")
	MOVW(Mem{Base: src}, AX.As16())
	MOVB(Mem{Base: src, Disp: 2}, CX.As8())
	MOVW(AX.As16(), Mem{Base: dst})
	MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
	JMP(end)

	Label(name + "move_5through7")
	MOVL(Mem{Base: src}, AX.As32())
	MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
	MOVL(AX.As32(), Mem{Base: dst})
	MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_8")
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ(Mem{Base: src}, AX)
	MOVQ(AX, Mem{Base: dst})
	JMP(end)

	Label(name + "move_9through16")
	MOVQ(Mem{Base: src}, AX)
	MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
	MOVQ(AX, Mem{Base: dst})
	MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_17through32")
	X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
	X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_33through64")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
	MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_65through128")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
	MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
	MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
	MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
	JMP(end)
	Label(name + "move_129through256")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Disp: 64}, X4)
	MOVOU(Mem{Base: src, Disp: 80}, X5)
	MOVOU(Mem{Base: src, Disp: 96}, X6)
	MOVOU(Mem{Base: src, Disp: 112}, X7)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X4, Mem{Base: dst, Disp: 64})
	MOVOU(X5, Mem{Base: dst, Disp: 80})
	MOVOU(X6, Mem{Base: dst, Disp: 96})
	MOVOU(X7, Mem{Base: dst, Disp: 112})
	MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128})
	MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112})
	MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96})
	MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80})
	MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
	MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
	MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
	MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
	JMP(end)
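	// The move_256through2048 case below reduces length by 256 up front,
	// copies a 256-byte block, and repeats while at least 256 bytes remain;
	// the leftover (< 256 bytes) falls through to the tail handler.
	// For example (illustrative), length = 700 copies 2*256 = 512 bytes in
	// the loop and leaves 188 bytes for the tail.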
	Label(name + "move_256through2048")
	LEAQ(Mem{Base: length, Disp: -256}, length)
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Disp: 64}, X4)
	MOVOU(Mem{Base: src, Disp: 80}, X5)
	MOVOU(Mem{Base: src, Disp: 96}, X6)
	MOVOU(Mem{Base: src, Disp: 112}, X7)
	MOVOU(Mem{Base: src, Disp: 128}, X8)
	MOVOU(Mem{Base: src, Disp: 144}, X9)
	MOVOU(Mem{Base: src, Disp: 160}, X10)
	MOVOU(Mem{Base: src, Disp: 176}, X11)
	MOVOU(Mem{Base: src, Disp: 192}, X12)
	MOVOU(Mem{Base: src, Disp: 208}, X13)
	MOVOU(Mem{Base: src, Disp: 224}, X14)
	MOVOU(Mem{Base: src, Disp: 240}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X4, Mem{Base: dst, Disp: 64})
	MOVOU(X5, Mem{Base: dst, Disp: 80})
	MOVOU(X6, Mem{Base: dst, Disp: 96})
	MOVOU(X7, Mem{Base: dst, Disp: 112})
	MOVOU(X8, Mem{Base: dst, Disp: 128})
	MOVOU(X9, Mem{Base: dst, Disp: 144})
	MOVOU(X10, Mem{Base: dst, Disp: 160})
	MOVOU(X11, Mem{Base: dst, Disp: 176})
	MOVOU(X12, Mem{Base: dst, Disp: 192})
	MOVOU(X13, Mem{Base: dst, Disp: 208})
	MOVOU(X14, Mem{Base: dst, Disp: 224})
	MOVOU(X15, Mem{Base: dst, Disp: 240})
	CMPQ(length, U32(256))
	LEAQ(Mem{Base: src, Disp: 256}, src)
	LEAQ(Mem{Base: dst, Disp: 256}, dst)
	JGE(LabelRef(name + "move_256through2048"))
	JMP(LabelRef(name + "tail"))
	if avx {
		Label(name + "avxUnaligned")
		R8, R10 := GP64(), GP64()
		// There are two implementations of the move algorithm; only the first,
		// forward copying for non-overlapping memory regions, is used here,
		// since we do not support overlapping input.
		// A non-temporal copy would be better for big sizes, but it is
		// disabled since big copies are unlikely. If enabling it, test the
		// functionality.
		const enableBigData = false
		if enableBigData {
			CMPQ(length, U32(0x100000))
			JAE(LabelRef(name + "gobble_big_data_fwd"))
		}
		// Memory layout on the source side
		// src                                       CX
		// |<---------length before correction--------->|
		// |       |<--length corrected-->|             |
		// |       |                  |<--- AX  --->|
		// |<-R8-->|                  |<-128 bytes->|
		// +----------------------------------------+
		// | Head  | Body             | Tail        |
		// +-------+------------------+-------------+
		// ^       ^                  ^
		// |       |                  |
		// Save head into Y4          Save tail into X5..X12
		//         |
		//         src+R8, where R8 = ((dst & -32) + 32) - dst
		//
		// Algorithm:
		// 1. Unaligned save of the tail's 128 bytes
		// 2. Unaligned save of the head's 32 bytes
		// 3. Destination-aligned copying of the body (128 bytes per iteration)
		// 4. Put the head in its new place
		// 5. Put the tail in its new place
		// It can be important to satisfy the processor's pipeline requirements
		// for small sizes, as the cost of copying the unaligned parts is
		// comparable with the cost of the main loop, so the code is slightly
		// interleaved here.
		// There is a cleaner implementation of this algorithm for bigger
		// sizes, where the cost of copying the unaligned parts is negligible.
		// You can see it after the gobble_big_data_fwd label.
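		// Worked example (illustrative numbers, not from the source): if the
		// original dst is 0x1005, the aligned dst is (0x1005 &^ 31) + 32 =
		// 0x1020, so R8 = 0x1b (27) head bytes are covered by the unaligned
		// Y4 store before the 32-byte-aligned 128-byte loop takes over.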
		Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM()
		LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
		MOVQ(dst, R10)
		// CX points to the end of the buffer, so we need to go back slightly.
		// We will use negative offsets from it.
		MOVOU(Mem{Base: CX, Disp: -0x80}, X5)
		MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
		MOVQ(U32(0x80), AX)
		// Round the destination address up to the next 32-byte boundary.
		ANDQ(U32(0xffffffe0), dst)
		ADDQ(U8(32), dst)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
		MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
		// Make R8 the delta between the aligned and unaligned destination addresses.
		MOVQ(dst, R8)
		SUBQ(R10, R8)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
		MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
		// Adjust the bytes-to-copy value, since the unaligned head part has
		// been prepared for copying.
		SUBQ(R8, length)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
		MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
		// The tail will be put in place after the main body copy.
		// Now handle the unaligned head part.
		VMOVDQU(Mem{Base: src}, Y4)
		// Adjust the source address to point past the head.
		ADDQ(R8, src)
		SUBQ(AX, length)

		// Main loop: destination-aligned copying, 128 bytes per iteration.
		Label(name + "gobble_128_loop")
		VMOVDQU(Mem{Base: src}, Y0)
		VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
		VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
		VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
		ADDQ(AX, src)
		VMOVDQA(Y0, Mem{Base: dst})
		VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20})
		VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40})
		VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60})
		ADDQ(AX, dst)
		SUBQ(AX, length)
		JA(LabelRef(name + "gobble_128_loop"))
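		// At loop exit, dst has advanced past the copied body and length has
		// gone non-positive. Adding AX (0x80) and dst back turns length into
		// the address one past the end of the destination, so the saved tail
		// can be stored at negative offsets from it.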
		// Now we can store unaligned parts.
		ADDQ(AX, length)
		ADDQ(dst, length)
		VMOVDQU(Y4, Mem{Base: R10})
		VZEROUPPER()
		MOVOU(X5, Mem{Base: length, Disp: -0x80})
		MOVOU(X6, Mem{Base: length, Disp: -0x70})
		MOVOU(X7, Mem{Base: length, Disp: -0x60})
		MOVOU(X8, Mem{Base: length, Disp: -0x50})
		MOVOU(X9, Mem{Base: length, Disp: -0x40})
		MOVOU(X10, Mem{Base: length, Disp: -0x30})
		MOVOU(X11, Mem{Base: length, Disp: -0x20})
		MOVOU(X12, Mem{Base: length, Disp: -0x10})
		JMP(end)
		if enableBigData {
			Label(name + "gobble_big_data_fwd")
			// Forward copying for big regions.
			// It uses non-temporal mov instructions.
			// Details of this algorithm are described above for the small-size version.
			LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
			MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -0x80}, X5)
			MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
			MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
			MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
			MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
			MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
			MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
			MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
			VMOVDQU(Mem{Base: src}, Y4)
			MOVQ(dst, R8)
			ANDQ(U32(0xffffffe0), dst)
			ADDQ(U8(32), dst)
			MOVQ(dst, R10)
			SUBQ(R8, R10)
			SUBQ(R10, length)
			ADDQ(R10, src)
			LEAQ(Mem{Base: dst, Index: length, Scale: 1}, CX)
			SUBQ(U8(0x80), length)

			Label(name + "gobble_mem_fwd_loop")
			PREFETCHNTA(Mem{Base: src, Disp: 0x1c0})
			PREFETCHNTA(Mem{Base: src, Disp: 0x280})
			// The prefetch distances were chosen empirically, following the
			// approach to prefetch usage described in section 7.6.6 of [1].
			// [1] 64-ia-32-architectures-optimization-manual.pdf
			// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
			VMOVDQU(Mem{Base: src}, Y0)
			VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
			VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
			VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
			ADDQ(U8(0x80), src)
			VMOVNTDQ(Y0, Mem{Base: dst})
			VMOVNTDQ(Y1, Mem{Base: dst, Disp: 0x20})
			VMOVNTDQ(Y2, Mem{Base: dst, Disp: 0x40})
			VMOVNTDQ(Y3, Mem{Base: dst, Disp: 0x60})
			ADDQ(U8(0x80), dst)
			SUBQ(U8(0x80), length)
			JA(LabelRef(name + "gobble_mem_fwd_loop"))
			// Non-temporal stores don't follow the normal cache-coherency rules,
			// so an SFENCE is needed here to make the copied data visible to
			// subsequent loads in time.
			SFENCE()
			VMOVDQU(Y4, Mem{Base: R8})
			VZEROUPPER()
			MOVOU(X5, Mem{Base: CX, Disp: -0x80})
			MOVOU(X6, Mem{Base: CX, Disp: -0x70})
			MOVOU(X7, Mem{Base: CX, Disp: -0x60})
			MOVOU(X8, Mem{Base: CX, Disp: -0x50})
			MOVOU(X9, Mem{Base: CX, Disp: -0x40})
			MOVOU(X10, Mem{Base: CX, Disp: -0x30})
			MOVOU(X11, Mem{Base: CX, Disp: -0x20})
			MOVOU(X12, Mem{Base: CX, Disp: -0x10})
			JMP(end)
		}
	}
}

// genMatchLen generates standalone matchLen.
func genMatchLen() {
	TEXT("matchLen", NOSPLIT, "func(a, b []byte) int")
	Doc("matchLen returns how many bytes match in a and b", "",
		"It assumes that:",
		"  len(a) <= len(b)", "")
	Pragma("noescape")
	aBase, bBase, length := GP64(), GP64(), GP64()
	Load(Param("a").Base(), aBase)
	Load(Param("b").Base(), bBase)
	Load(Param("a").Len(), length)
	l := matchLen("standalone", aBase, bBase, length, LabelRef("gen_match_len_end"))
	Label("gen_match_len_end")
	Store(l.As64(), ReturnIndex(0))
	RET()
}
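
// matchLenRef is an illustrative, hedged pure-Go sketch (not part of the
// original generator and not used by it) of what the generated matchLen
// computes: the length of the common prefix of a and b, assuming
// len(a) <= len(b). The generated assembly below does the same work eight
// bytes at a time and uses BSF on the XOR of two 64-bit words to locate the
// first differing byte.
func matchLenRef(a, b []byte) int {
	for i := range a {
		if a[i] != b[i] {
			return i
		}
	}
	return len(a)
}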

// matchLen returns the number of matching bytes at the start of a and b.
// len is the maximum number of bytes to match.
// Jumps to end when done and returns the register holding the matched length.
// Uses 2 GP registers.
func matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
	tmp, matched := GP64(), GP32()
	XORL(matched, matched)

	CMPL(len.As32(), U8(8))
	JL(LabelRef("matchlen_single_" + name))

	Label("matchlen_loopback_" + name)
	MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp)
	XORQ(Mem{Base: b, Index: matched, Scale: 1}, tmp)
	TESTQ(tmp, tmp)
	JZ(LabelRef("matchlen_loop_" + name))
	// Not all bytes matched.
	BSFQ(tmp, tmp)
	SARQ(U8(3), tmp)
	LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
	JMP(end)

	// All 8 bytes matched; update counters and loop.
	Label("matchlen_loop_" + name)
	LEAL(Mem{Base: len, Disp: -8}, len.As32())
	LEAL(Mem{Base: matched, Disp: 8}, matched)
	CMPL(len.As32(), U8(8))
	JGE(LabelRef("matchlen_loopback_" + name))

	// Less than 8 bytes left.
	Label("matchlen_single_" + name)
	TESTL(len.As32(), len.As32())
	JZ(end)

	Label("matchlen_single_loopback_" + name)
	MOVB(Mem{Base: a, Index: matched, Scale: 1}, tmp.As8())
	CMPB(Mem{Base: b, Index: matched, Scale: 1}, tmp.As8())
	JNE(end)
	LEAL(Mem{Base: matched, Disp: 1}, matched)
	DECL(len.As32())
	JNZ(LabelRef("matchlen_single_loopback_" + name))
	JMP(end)

	return matched
}
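
// Note: every label emitted by matchLen is suffixed with name, so the helper
// can be emitted several times into the same generated file without label
// collisions, as long as each call site passes a unique name (as genMatchLen
// does with "standalone").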