- // +build generate
- //go:generate go run gen.go -out encodeblock_amd64.s -stubs encodeblock_amd64.go
- package main
- import (
- "fmt"
- "runtime"
- . "github.com/mmcloughlin/avo/build"
- "github.com/mmcloughlin/avo/buildtags"
- "github.com/mmcloughlin/avo/operand"
- . "github.com/mmcloughlin/avo/operand"
- "github.com/mmcloughlin/avo/reg"
- )
- func main() {
- Constraint(buildtags.Not("appengine").ToConstraint())
- Constraint(buildtags.Not("noasm").ToConstraint())
- Constraint(buildtags.Term("gc").ToConstraint())
- genEncodeBlockAsm("encodeBlockAsm", 14, 6, 6, false, false)
- genEncodeBlockAsm("encodeBlockAsm12B", 12, 5, 5, false, false)
- genEncodeBlockAsm("encodeBlockAsm10B", 10, 5, 5, false, false)
- genEncodeBlockAsm("encodeBlockAsm8B", 8, 4, 4, false, false)
- genEncodeBlockAsm("encodeBlockAsmAvx", 14, 5, 6, true, false)
- genEncodeBlockAsm("encodeBlockAsm12BAvx", 12, 5, 5, true, false)
- genEncodeBlockAsm("encodeBlockAsm10BAvx", 10, 5, 4, true, false)
- genEncodeBlockAsm("encodeBlockAsm8BAvx", 8, 4, 4, true, false)
- // Snappy compatible
- genEncodeBlockAsm("encodeSnappyBlockAsm", 14, 6, 6, false, true)
- genEncodeBlockAsm("encodeSnappyBlockAsm12B", 12, 5, 5, false, true)
- genEncodeBlockAsm("encodeSnappyBlockAsm10B", 10, 5, 5, false, true)
- genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, false, true)
- genEncodeBlockAsm("encodeSnappyBlockAsmAvx", 14, 6, 6, true, true)
- genEncodeBlockAsm("encodeSnappyBlockAsm12BAvx", 12, 5, 5, true, true)
- genEncodeBlockAsm("encodeSnappyBlockAsm10BAvx", 10, 5, 4, true, true)
- genEncodeBlockAsm("encodeSnappyBlockAsm8BAvx", 8, 4, 4, true, true)
- genEmitLiteral()
- genEmitRepeat()
- genEmitCopy()
- genEmitCopyNoRepeat()
- genMatchLen()
- Generate()
- }
- func debugval(v operand.Op) {
- value := reg.R15
- MOVQ(v, value)
- INT(Imm(3))
- }
- func debugval32(v operand.Op) {
- value := reg.R15L
- MOVL(v, value)
- INT(Imm(3))
- }
- var assertCounter int
- // insert extra checks here and there.
- const debug = false
- // assert will insert code if debug is enabled.
- // The code should jump to 'ok' if the assertion succeeds.
- func assert(fn func(ok LabelRef)) {
- if debug {
- caller := [100]uintptr{0}
- runtime.Callers(2, caller[:])
- frame, _ := runtime.CallersFrames(caller[:]).Next()
- ok := fmt.Sprintf("assert_check_%d_ok_srcline_%d", assertCounter, frame.Line)
- fn(LabelRef(ok))
- // Emit several since delve is imprecise.
- INT(Imm(3))
- INT(Imm(3))
- Label(ok)
- assertCounter++
- }
- }
- func genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes int, avx bool, snappy bool) {
- TEXT(name, 0, "func(dst, src []byte) int")
- Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
- "It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
- Pragma("noescape")
- const literalMaxOverhead = 4
- var tableSize = 4 * (1 << tableBits)
- // Memzero needs at least 128 bytes.
- if tableSize < 128 {
- panic("tableSize must be at least 128 bytes")
- }
- lenSrcBasic, err := Param("src").Len().Resolve()
- if err != nil {
- panic(err)
- }
- lenSrcQ := lenSrcBasic.Addr
- lenDstBasic, err := Param("dst").Len().Resolve()
- if err != nil {
- panic(err)
- }
- lenDstQ := lenDstBasic.Addr
- // Bail if we can't compress to at least this.
- dstLimitPtrQ := AllocLocal(8)
- // sLimitL is when to stop looking for offset/length copies.
- sLimitL := AllocLocal(4)
- // nextEmitL keeps track of the point we have emitted to.
- nextEmitL := AllocLocal(4)
- // Repeat stores the last match offset.
- repeatL := AllocLocal(4)
- // nextSTempL keeps nextS while other functions are being called.
- nextSTempL := AllocLocal(4)
- // Alloc table last
- table := AllocLocal(tableSize)
- dst := GP64()
- {
- dstBaseBasic, err := Param("dst").Base().Resolve()
- if err != nil {
- panic(err)
- }
- dstBaseQ := dstBaseBasic.Addr
- MOVQ(dstBaseQ, dst)
- }
- srcBaseBasic, err := Param("src").Base().Resolve()
- if err != nil {
- panic(err)
- }
- srcBaseQ := srcBaseBasic.Addr
- // Zero table
- {
- iReg := GP64()
- MOVQ(U32(tableSize/8/16), iReg)
- tablePtr := GP64()
- LEAQ(table, tablePtr)
- zeroXmm := XMM()
- PXOR(zeroXmm, zeroXmm)
- Label("zero_loop_" + name)
- for i := 0; i < 8; i++ {
- MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16})
- }
- ADDQ(U8(16*8), tablePtr)
- DECQ(iReg)
- JNZ(LabelRef("zero_loop_" + name))
- }
- {
- // nextEmit is the offset in src where the next emitLiteral should start from.
- MOVL(U32(0), nextEmitL)
- const inputMargin = 8
- tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
- MOVQ(lenSrcQ, tmp)
- LEAQ(Mem{Base: tmp, Disp: -5}, tmp2)
- // sLimitL := len(src) - inputMargin
- LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)
- assert(func(ok LabelRef) {
- CMPQ(tmp3, lenSrcQ)
- JL(ok)
- })
- MOVL(tmp3.As32(), sLimitL)
- // dstLimit := (len(src) - 5 ) - len(src)>>5
- SHRQ(U8(5), tmp)
- SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp
- assert(func(ok LabelRef) {
- // if len(src) > len(src) - len(src)>>5 - 5: ok
- CMPQ(lenSrcQ, tmp2)
- JGE(ok)
- })
- LEAQ(Mem{Base: dst, Index: tmp2, Scale: 1}, tmp2)
- MOVQ(tmp2, dstLimitPtrQ)
- // Store dst start address
- //MOVQ(dstAddr, dstStartPtrQ)
- }
- // s = 1
- s := GP32()
- MOVL(U32(1), s)
- // repeatL = 1
- MOVL(s, repeatL)
- src := GP64()
- Load(Param("src").Base(), src)
- // Load cv
- Label("search_loop_" + name)
- candidate := GP32()
- {
- assert(func(ok LabelRef) {
- // Check if somebody changed src
- tmp := GP64()
- MOVQ(srcBaseQ, tmp)
- CMPQ(tmp, src)
- JEQ(ok)
- })
- assert(func(ok LabelRef) {
- // Check if s is valid
- tmp := GP64()
- MOVQ(lenSrcQ, tmp)
- CMPQ(tmp, s.As64())
- JG(ok)
- })
- cv := GP64()
- MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv)
- nextS := GP32()
- // nextS := s + (s-nextEmit)>>skipLog + 4
- {
- tmp := GP64()
- MOVL(s, tmp.As32()) // tmp = s
- SUBL(nextEmitL, tmp.As32()) // tmp = s - nextEmit
- SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog
- LEAL(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS)
- }
- // if nextS > sLimit {goto emitRemainder}
- {
- tmp := GP32()
- MOVL(sLimitL, tmp.As32())
- CMPL(nextS.As32(), tmp.As32())
- JGT(LabelRef("emit_remainder_" + name))
- }
- // move nextS to stack.
- MOVL(nextS.As32(), nextSTempL)
- candidate2 := GP32()
- hasher := hashN(hashBytes, tableBits)
- {
- hash0, hash1 := GP64(), GP64()
- MOVQ(cv, hash0)
- MOVQ(cv, hash1)
- SHRQ(U8(8), hash1)
- hasher.hash(hash0)
- hasher.hash(hash1)
- MOVL(table.Idx(hash0, 4), candidate)
- MOVL(table.Idx(hash1, 4), candidate2)
- assert(func(ok LabelRef) {
- CMPQ(hash0, U32(tableSize))
- JL(ok)
- })
- assert(func(ok LabelRef) {
- CMPQ(hash1, U32(tableSize))
- JL(ok)
- })
- MOVL(s, table.Idx(hash0, 4))
- tmp := GP32()
- LEAL(Mem{Base: s, Disp: 1}, tmp)
- MOVL(tmp, table.Idx(hash1, 4))
- }
- // Can be moved up if registers are available.
- hash2 := GP64()
- {
- // hash2 := hash6(cv>>16, tableBits)
- // hasher = hash6(tableBits)
- MOVQ(cv, hash2)
- SHRQ(U8(16), hash2)
- hasher.hash(hash2)
- assert(func(ok LabelRef) {
- CMPQ(hash2, U32(tableSize))
- JL(ok)
- })
- }
- // En/disable repeat matching.
- if true {
- // Check repeat at offset checkRep
- const checkRep = 1
- {
- // rep = s - repeat
- rep := GP32()
- MOVL(s, rep)
- SUBL(repeatL, rep) // rep = s - repeat
- // if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
- left, right := GP64(), GP64()
- MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32())
- MOVQ(cv, left)
- SHRQ(U8(checkRep*8), left)
- CMPL(left.As32(), right.As32())
- // BAIL, no repeat.
- JNE(LabelRef("no_repeat_found_" + name))
- }
- // base = s + checkRep
- base := GP32()
- LEAL(Mem{Base: s, Disp: checkRep}, base)
- // nextEmit before repeat.
- nextEmit := GP32()
- MOVL(nextEmitL, nextEmit)
- // Extend back
- if true {
- i := GP32()
- MOVL(base, i)
- SUBL(repeatL, i)
- JZ(LabelRef("repeat_extend_back_end_" + name))
- Label("repeat_extend_back_loop_" + name)
- // if base <= nextEmit {exit}
- CMPL(base.As32(), nextEmit)
- JLE(LabelRef("repeat_extend_back_end_" + name))
- // if src[i-1] == src[base-1]
- tmp, tmp2 := GP64(), GP64()
- MOVB(Mem{Base: src, Index: i, Scale: 1, Disp: -1}, tmp.As8())
- MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8())
- CMPB(tmp.As8(), tmp2.As8())
- JNE(LabelRef("repeat_extend_back_end_" + name))
- LEAL(Mem{Base: base, Disp: -1}, base)
- DECL(i)
- JNZ(LabelRef("repeat_extend_back_loop_" + name))
- }
- Label("repeat_extend_back_end_" + name)
- // Base is now at start. Emit until base.
- // d += emitLiteral(dst[d:], src[nextEmit:base])
- if true {
- emitLiteralsDstP(nextEmitL, base, src, dst, "repeat_emit_"+name, avx)
- }
- // Extend forward
- {
- // s += 4 + checkRep
- ADDL(U8(4+checkRep), s)
- if true {
- // candidate := s - repeat + 4 + checkRep
- MOVL(s, candidate)
- SUBL(repeatL, candidate) // candidate = s - repeat
- // srcLeft = len(src) - s
- srcLeft := GP64()
- MOVQ(lenSrcQ, srcLeft)
- SUBL(s, srcLeft.As32())
- assert(func(ok LabelRef) {
- // if srcleft < maxint32: ok
- CMPQ(srcLeft, U32(0x7fffffff))
- JL(ok)
- })
- // Forward address
- forwardStart := GP64()
- LEAQ(Mem{Base: src, Index: s, Scale: 1}, forwardStart)
- // End address
- backStart := GP64()
- LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, backStart)
- length := matchLen("repeat_extend", forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name))
- Label("repeat_extend_forward_end_" + name)
- // s+= length
- ADDL(length.As32(), s)
- }
- }
- // Emit
- if true {
- // length = s-base
- length := GP32()
- MOVL(s, length)
- SUBL(base.As32(), length) // length = s - base
- offsetVal := GP32()
- MOVL(repeatL, offsetVal)
- if !snappy {
- // if nextEmit == 0 {do copy instead...}
- TESTL(nextEmit, nextEmit)
- JZ(LabelRef("repeat_as_copy_" + name))
- // Emit as repeat...
- emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
- // Emit as copy instead...
- Label("repeat_as_copy_" + name)
- }
- emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name), snappy)
- Label("repeat_end_emit_" + name)
- // Store new dst and nextEmit
- MOVL(s, nextEmitL)
- }
- // if s >= sLimit
- // can be omitted.
- if true {
- CMPL(s, sLimitL)
- JGE(LabelRef("emit_remainder_" + name))
- }
- JMP(LabelRef("search_loop_" + name))
- }
- Label("no_repeat_found_" + name)
- {
- // Check candidates are ok. All must be < s and < len(src)
- assert(func(ok LabelRef) {
- tmp := GP64()
- MOVQ(lenSrcQ, tmp)
- CMPL(tmp.As32(), candidate)
- JG(ok)
- })
- assert(func(ok LabelRef) {
- CMPL(s, candidate)
- JG(ok)
- })
- assert(func(ok LabelRef) {
- tmp := GP64()
- MOVQ(lenSrcQ, tmp)
- CMPL(tmp.As32(), candidate2)
- JG(ok)
- })
- assert(func(ok LabelRef) {
- CMPL(s, candidate2)
- JG(ok)
- })
- CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
- JEQ(LabelRef("candidate_match_" + name))
- tmp := GP32()
- // cv >>= 8
- SHRQ(U8(8), cv)
- // candidate = int(table[hash2]) - load early.
- MOVL(table.Idx(hash2, 4), candidate)
- assert(func(ok LabelRef) {
- tmp := GP64()
- MOVQ(lenSrcQ, tmp)
- CMPL(tmp.As32(), candidate)
- JG(ok)
- })
- assert(func(ok LabelRef) {
- // We may get s and s+1
- tmp := GP32()
- LEAL(Mem{Base: s, Disp: 2}, tmp)
- CMPL(tmp, candidate)
- JG(ok)
- })
- LEAL(Mem{Base: s, Disp: 2}, tmp)
- //if uint32(cv>>8) == load32(src, candidate2)
- CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32())
- JEQ(LabelRef("candidate2_match_" + name))
- // table[hash2] = uint32(s + 2)
- MOVL(tmp, table.Idx(hash2, 4))
- // cv >>= 8 (>> 16 total)
- SHRQ(U8(8), cv)
- // if uint32(cv>>16) == load32(src, candidate)
- CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
- JEQ(LabelRef("candidate3_match_" + name))
- // No match found, next loop
- // s = nextS
- MOVL(nextSTempL, s)
- JMP(LabelRef("search_loop_" + name))
- // Matches candidate at s + 2 (3rd check)
- Label("candidate3_match_" + name)
- ADDL(U8(2), s)
- JMP(LabelRef("candidate_match_" + name))
- // Match at s + 1 (we calculated the hash, let's store it)
- Label("candidate2_match_" + name)
- // table[hash2] = uint32(s + 2)
- MOVL(tmp, table.Idx(hash2, 4))
- // s++
- INCL(s)
- MOVL(candidate2, candidate)
- }
- }
- Label("candidate_match_" + name)
- // We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
- // Extend backwards
- if true {
- ne := GP32()
- MOVL(nextEmitL, ne)
- TESTL(candidate, candidate)
- JZ(LabelRef("match_extend_back_end_" + name))
- // candidate is tested when decremented, so we loop back here.
- Label("match_extend_back_loop_" + name)
- // if s <= nextEmit {exit}
- CMPL(s, ne)
- JLE(LabelRef("match_extend_back_end_" + name))
- // if src[candidate-1] == src[s-1]
- tmp, tmp2 := GP64(), GP64()
- MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8())
- MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8())
- CMPB(tmp.As8(), tmp2.As8())
- JNE(LabelRef("match_extend_back_end_" + name))
- LEAL(Mem{Base: s, Disp: -1}, s)
- DECL(candidate)
- JZ(LabelRef("match_extend_back_end_" + name))
- JMP(LabelRef("match_extend_back_loop_" + name))
- }
- Label("match_extend_back_end_" + name)
- // Bail if we exceed the maximum size.
- if true {
- // tmp = s-nextEmit
- tmp := GP64()
- MOVL(s, tmp.As32())
- SUBL(nextEmitL, tmp.As32())
- // tmp = &dst + s-nextEmit
- LEAQ(Mem{Base: dst, Index: tmp, Scale: 1, Disp: literalMaxOverhead}, tmp)
- CMPQ(tmp, dstLimitPtrQ)
- JL(LabelRef("match_dst_size_check_" + name))
- ri, err := ReturnIndex(0).Resolve()
- if err != nil {
- panic(err)
- }
- MOVQ(U32(0), ri.Addr)
- RET()
- }
- Label("match_dst_size_check_" + name)
- {
- base := GP32()
- MOVL(s, base.As32())
- emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name, avx)
- }
- Label("match_nolit_loop_" + name)
- {
- // Update repeat
- {
- // repeat = base - candidate
- repeatVal := GP64().As32()
- MOVL(s, repeatVal)
- SUBL(candidate, repeatVal)
- MOVL(repeatVal, repeatL)
- }
- // s+=4, candidate+=4
- ADDL(U8(4), s)
- ADDL(U8(4), candidate)
- // Extend the 4-byte match as long as possible and emit copy.
- {
- assert(func(ok LabelRef) {
- // s must be > candidate; they cannot be equal.
- CMPL(s, candidate)
- JG(ok)
- })
- // srcLeft = len(src) - s
- srcLeft := GP64()
- MOVQ(lenSrcQ, srcLeft)
- SUBL(s, srcLeft.As32())
- assert(func(ok LabelRef) {
- // if srcleft < maxint32: ok
- CMPQ(srcLeft, U32(0x7fffffff))
- JL(ok)
- })
- a, b := GP64(), GP64()
- LEAQ(Mem{Base: src, Index: s, Scale: 1}, a)
- LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b)
- length := matchLen("match_nolit_"+name,
- a, b,
- srcLeft,
- LabelRef("match_nolit_end_"+name),
- )
- Label("match_nolit_end_" + name)
- // s += length (length is destroyed, use it now)
- ADDL(length.As32(), s)
- // Load offset from repeat value.
- offset := GP64()
- MOVL(repeatL, offset.As32())
- // length += 4
- ADDL(U8(4), length)
- emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name), snappy)
- Label("match_nolit_emitcopy_end_" + name)
- MOVL(s, nextEmitL)
- // if s >= sLimit { end }
- CMPL(s, sLimitL)
- JGE(LabelRef("emit_remainder_" + name))
- // Bail if we exceed the maximum size.
- {
- CMPQ(dst, dstLimitPtrQ)
- JL(LabelRef("match_nolit_dst_ok_" + name))
- ri, err := ReturnIndex(0).Resolve()
- if err != nil {
- panic(err)
- }
- MOVQ(U32(0), ri.Addr)
- RET()
- Label("match_nolit_dst_ok_" + name)
- }
- }
- {
- // Check for an immediate match, otherwise start search at s+1
- x := GP64()
- // Index s-2
- MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, x)
- hasher := hashN(hashBytes, tableBits)
- hash0, hash1 := GP64(), GP64()
- MOVQ(x, hash0) // s-2
- SHRQ(U8(16), x)
- MOVQ(x, hash1) // s
- hasher.hash(hash0)
- hasher.hash(hash1)
- sm2 := GP32() // s - 2
- LEAL(Mem{Base: s, Disp: -2}, sm2)
- assert(func(ok LabelRef) {
- CMPQ(hash0, U32(tableSize))
- JL(ok)
- })
- assert(func(ok LabelRef) {
- CMPQ(hash1, U32(tableSize))
- JL(ok)
- })
- MOVL(table.Idx(hash1, 4), candidate)
- MOVL(sm2, table.Idx(hash0, 4))
- MOVL(s, table.Idx(hash1, 4))
- CMPL(Mem{Base: src, Index: candidate, Scale: 1}, x.As32())
- JEQ(LabelRef("match_nolit_loop_" + name))
- INCL(s)
- }
- JMP(LabelRef("search_loop_" + name))
- }
- Label("emit_remainder_" + name)
- // Bail if we exceed the maximum size.
- // if d + (len(src)-nextEmitL) > dstLimitPtrQ { return 0 }
- {
- // remain = len(src) - nextEmit
- remain := GP64()
- MOVQ(lenSrcQ, remain)
- SUBL(nextEmitL, remain.As32())
- dstExpect := GP64()
- // dst := dst + (len(src)-nextEmitL)
- LEAQ(Mem{Base: dst, Index: remain, Scale: 1, Disp: literalMaxOverhead}, dstExpect)
- CMPQ(dstExpect, dstLimitPtrQ)
- JL(LabelRef("emit_remainder_ok_" + name))
- ri, err := ReturnIndex(0).Resolve()
- if err != nil {
- panic(err)
- }
- MOVQ(U32(0), ri.Addr)
- RET()
- Label("emit_remainder_ok_" + name)
- }
- // emitLiteral(dst[d:], src[nextEmitL:])
- emitEnd := GP64()
- MOVQ(lenSrcQ, emitEnd)
- // Emit final literals.
- emitLiteralsDstP(nextEmitL, emitEnd, src, dst, "emit_remainder_"+name, avx)
- // Assert size is < limit
- assert(func(ok LabelRef) {
- // if dst < dstLimitPtrQ: ok
- CMPQ(dst, dstLimitPtrQ)
- JL(ok)
- })
- // length := start - base (ptr arithmetic)
- length := GP64()
- base := Load(Param("dst").Base(), GP64())
- MOVQ(dst, length)
- SUBQ(base, length)
- // Assert size is < len(src)
- assert(func(ok LabelRef) {
- // if len(src) >= length: ok
- CMPQ(lenSrcQ, length)
- JGE(ok)
- })
- // Assert size is < len(dst)
- assert(func(ok LabelRef) {
- // if len(dst) >= length: ok
- CMPQ(lenDstQ, length)
- JGE(ok)
- })
- Store(length, ReturnIndex(0))
- RET()
- }
- // emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase.
- // Checks if base == nextEmit.
- // src & base are untouched.
- func emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string, avx bool) {
- nextEmit, litLen, dstBaseTmp, litBase := GP32(), GP32(), GP64(), GP64()
- MOVL(nextEmitL, nextEmit)
- CMPL(nextEmit, base.As32())
- JEQ(LabelRef("emit_literal_skip_" + name))
- MOVL(base.As32(), litLen.As32())
- // Base is now next emit.
- MOVL(base.As32(), nextEmitL)
- // litBase = src[nextEmitL:]
- LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
- SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit
- // Load (and store when we return)
- MOVQ(dstBase, dstBaseTmp)
- emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), avx, true)
- Label("emit_literal_done_" + name)
- // Emitted length must be > litlen.
- // We have already checked for len(0) above.
- assert(func(ok LabelRef) {
- tmp := GP64()
- MOVQ(dstBaseTmp, tmp)
- SUBQ(dstBase, tmp) // tmp = dstBaseTmp - dstBase
- // if tmp > litLen: ok
- CMPQ(tmp, litLen.As64())
- JG(ok)
- })
- // Store updated dstBase
- MOVQ(dstBaseTmp, dstBase)
- Label("emit_literal_skip_" + name)
- }
- // emitLiteralsDstP emits literals from nextEmit to base, updates nextEmit and dst.
- // Checks if base == nextEmit.
- // src & base are untouched.
- func emitLiteralsDstP(nextEmitL Mem, base reg.GPVirtual, src, dst reg.GPVirtual, name string, avx bool) {
- nextEmit, litLen, litBase := GP32(), GP32(), GP64()
- MOVL(nextEmitL, nextEmit)
- CMPL(nextEmit, base.As32())
- JEQ(LabelRef("emit_literal_done_" + name))
- MOVL(base.As32(), litLen.As32())
- // Base is now next emit.
- MOVL(base.As32(), nextEmitL)
- // litBase = src[nextEmitL:]
- LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
- SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit
- // Load (and store when we return)
- emitLiteral(name, litLen, nil, dst, litBase, LabelRef("emit_literal_done_"+name), avx, true)
- Label("emit_literal_done_" + name)
- }
- type hashGen struct {
- bytes int
- tablebits int
- mulreg reg.GPVirtual
- }
- // hashN returns a hash generator that uses a multiply to hash the lowest 'hashBytes' bytes of a value into 'tablebits' bits.
- func hashN(hashBytes, tablebits int) hashGen {
- h := hashGen{
- bytes: hashBytes,
- tablebits: tablebits,
- mulreg: GP64(),
- }
- primebytes := uint64(0)
- switch hashBytes {
- case 3:
- primebytes = 506832829
- case 4:
- primebytes = 2654435761
- case 5:
- primebytes = 889523592379
- case 6:
- primebytes = 227718039650203
- case 7:
- primebytes = 58295818150454627
- case 8:
- primebytes = 0xcf1bbcdcb7a56463
- default:
- panic("invalid hash length")
- }
- MOVQ(Imm(primebytes), h.mulreg)
- return h
- }
- // hash uses a multiply to hash val, leaving a 'tablebits'-bit table index in val.
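- // As a rough Go sketch (an illustration, not generated code), with the
- // default encodeBlockAsm parameters hashBytes=6 and tableBits=14 this is
- // approximately:
- //
- //	func hash6(u uint64, tablebits uint8) uint32 {
- //		const prime6bytes = 227718039650203
- //		return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - tablebits))
- //	}
- //
- // The low hash bytes are shifted to the top of the register, multiplied by a
- // prime, and the top 'tablebits' bits are kept as the table index.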
- func (h hashGen) hash(val reg.GPVirtual) {
- // Move value to top of register.
- SHLQ(U8(64-8*h.bytes), val)
- IMULQ(h.mulreg, val)
- // Move value to bottom
- SHRQ(U8(64-h.tablebits), val)
- }
- func genEmitLiteral() {
- TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int")
- Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "",
- "It assumes that:",
- " dst is long enough to hold the encoded bytes",
- " 0 <= len(lit) && len(lit) <= math.MaxUint32", "")
- Pragma("noescape")
- dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64()
- Load(Param("dst").Base(), dstBase)
- Load(Param("lit").Base(), litBase)
- Load(Param("lit").Len(), litLen)
- emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_standalone", false, false)
- Label("emit_literal_end_standalone")
- Store(retval, ReturnIndex(0))
- RET()
- TEXT("emitLiteralAvx", NOSPLIT, "func(dst, lit []byte) int")
- Doc("emitLiteralAvx writes a literal chunk and returns the number of bytes written.", "",
- "It assumes that:",
- " dst is long enough to hold the encoded bytes",
- " 0 <= len(lit) && len(lit) <= math.MaxUint32", "")
- Pragma("noescape")
- dstBase, litBase, litLen, retval = GP64(), GP64(), GP64(), GP64()
- Load(Param("dst").Base(), dstBase)
- Load(Param("lit").Base(), litBase)
- Load(Param("lit").Len(), litLen)
- emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_avx_standalone", true, false)
- Label("emit_literal_end_avx_standalone")
- Store(retval, ReturnIndex(0))
- RET()
- }
- // emitLiteral can be used for inlining an emitLiteral call.
- // stack must have at least 32 bytes.
- // retval will contain emitted bytes, but can be nil if this is not interesting.
- // dstBase and litBase are updated.
- // Uses 2 GP registers. With AVX 4 registers.
- // If updateDst is true dstBase will have the updated end pointer and an additional register will be used.
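- // The literal header written below follows the Snappy literal tag layout:
- // for lengths up to 60 the value length-1 goes directly in the upper six bits
- // of the tag byte, while the values 60..63 in those bits announce 1..4 extra
- // little-endian length bytes. As a reference sketch (not emitted code), the
- // one-byte case is:
- //
- //	dst[0] = uint8(len(lit)-1)<<2 | tagLiteral
- //
- // after which the literal bytes themselves are copied with the inlined
- // memmove.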
- func emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, avx, updateDst bool) {
- n := GP32()
- n16 := GP32()
- // We always add litLen bytes
- if retval != nil {
- MOVL(litLen.As32(), retval.As32())
- }
- MOVL(litLen.As32(), n)
- SUBL(U8(1), n.As32())
- // Return if litLen was 0 (the subtraction above set the carry flag).
- JC(end)
- // Find number of bytes to emit for tag.
- CMPL(n.As32(), U8(60))
- JLT(LabelRef("one_byte_" + name))
- CMPL(n.As32(), U32(1<<8))
- JLT(LabelRef("two_bytes_" + name))
- CMPL(n.As32(), U32(1<<16))
- JLT(LabelRef("three_bytes_" + name))
- CMPL(n.As32(), U32(1<<24))
- JLT(LabelRef("four_bytes_" + name))
- Label("five_bytes_" + name)
- MOVB(U8(252), Mem{Base: dstBase})
- MOVL(n.As32(), Mem{Base: dstBase, Disp: 1})
- if retval != nil {
- ADDQ(U8(5), retval)
- }
- ADDQ(U8(5), dstBase)
- JMP(LabelRef("memmove_" + name))
- Label("four_bytes_" + name)
- MOVL(n, n16)
- SHRL(U8(16), n16.As32())
- MOVB(U8(248), Mem{Base: dstBase})
- MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
- MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3})
- if retval != nil {
- ADDQ(U8(4), retval)
- }
- ADDQ(U8(4), dstBase)
- JMP(LabelRef("memmove_" + name))
- Label("three_bytes_" + name)
- MOVB(U8(0xf4), Mem{Base: dstBase})
- MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
- if retval != nil {
- ADDQ(U8(3), retval)
- }
- ADDQ(U8(3), dstBase)
- JMP(LabelRef("memmove_" + name))
- Label("two_bytes_" + name)
- MOVB(U8(0xf0), Mem{Base: dstBase})
- MOVB(n.As8(), Mem{Base: dstBase, Disp: 1})
- if retval != nil {
- ADDQ(U8(2), retval)
- }
- ADDQ(U8(2), dstBase)
- JMP(LabelRef("memmove_" + name))
- Label("one_byte_" + name)
- SHLB(U8(2), n.As8())
- MOVB(n.As8(), Mem{Base: dstBase})
- if retval != nil {
- ADDQ(U8(1), retval)
- }
- ADDQ(U8(1), dstBase)
- // Fallthrough
- Label("memmove_" + name)
- // copy(dst[i:], lit)
- if true {
- dstEnd := GP64()
- copyEnd := end
- if updateDst {
- copyEnd = LabelRef("memmove_end_copy_" + name)
- LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd)
- }
- length := GP64()
- MOVL(litLen.As32(), length.As32())
- // updates litBase.
- genMemMove2("emit_lit_memmove_"+name, dstBase, litBase, length, copyEnd, avx)
- if updateDst {
- Label("memmove_end_copy_" + name)
- MOVQ(dstEnd, dstBase)
- JMP(end)
- }
- } else {
- // genMemMove always updates dstBase
- src := GP64()
- MOVQ(litBase, src)
- genMemMove("emit_lit_memmove_"+name, dstBase, litBase, litLen, end)
- }
- // Should be unreachable
- if debug {
- INT(Imm(3))
- }
- return
- }
- // genEmitRepeat generates a standalone emitRepeat.
- func genEmitRepeat() {
- TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
- Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.",
- "Length must be at least 4 and < 1<<32", "")
- Pragma("noescape")
- dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
- // retval = 0
- XORQ(retval, retval)
- Load(Param("dst").Base(), dstBase)
- Load(Param("offset"), offset)
- Load(Param("length"), length)
- emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end"))
- Label("gen_emit_repeat_end")
- Store(retval, ReturnIndex(0))
- RET()
- }
- // emitRepeat can be used for inlining an emitRepeat call.
- // length >= 4 and < 1<<32
- // length is modified. dstBase is updated. retval is added to input.
- // retval can be nil.
- // Will jump to end label when finished.
- // Uses 1 GP register.
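- // Encoding summary, read from the stores below rather than from a separate
- // specification: a repeat is a tagCopy1 element whose offset byte dst[1] is
- // zero, something a real copy never produces. Small adjusted lengths are
- // stored directly in the tag's length field, while the field values 5, 6 and
- // 7 announce 1, 2 or 3 trailing length bytes (the repeat_three/four/five
- // cases). When the adjusted length is below 8 and the offset is below 2048 a
- // plain two-byte copy is emitted instead, since it costs the same.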
- func emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
- Label("emit_repeat_again_" + name)
- tmp := GP32()
- MOVL(length.As32(), tmp) // Copy length
- // length -= 4
- LEAL(Mem{Base: length, Disp: -4}, length.As32())
- // if length <= 4 (use copied value)
- CMPL(tmp.As32(), U8(8))
- JLE(LabelRef("repeat_two_" + name))
- // length < 8 && offset < 2048
- CMPL(tmp.As32(), U8(12))
- JGE(LabelRef("cant_repeat_two_offset_" + name))
- CMPL(offset.As32(), U32(2048))
- JLT(LabelRef("repeat_two_offset_" + name))
- const maxRepeat = ((1 << 24) - 1) + 65536
- Label("cant_repeat_two_offset_" + name)
- CMPL(length.As32(), U32((1<<8)+4))
- JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4
- CMPL(length.As32(), U32((1<<16)+(1<<8)))
- JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
- CMPL(length.As32(), U32(maxRepeat))
- JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.
- // We have more than 24 bits
- // Emit so we have at least 4 bytes left.
- LEAL(Mem{Base: length, Disp: -(maxRepeat - 4)}, length.As32()) // length -= (maxRepeat - 4)
- MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
- MOVW(U16(65531), Mem{Base: dstBase, Disp: 2}) // 0xfffb
- MOVB(U8(255), Mem{Base: dstBase, Disp: 4})
- ADDQ(U8(5), dstBase)
- if retval != nil {
- ADDQ(U8(5), retval)
- }
- JMP(LabelRef("emit_repeat_again_" + name))
- // The remaining length must now fit within the 5-byte encoding.
- Label("repeat_five_" + name)
- LEAL(Mem{Base: length, Disp: -65536}, length.As32()) // length -= 65536
- MOVL(length.As32(), offset.As32())
- MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
- MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
- SARL(U8(16), offset.As32()) // offset = length >> 16
- MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4}) // dst[4] = length >> 16
- if retval != nil {
- ADDQ(U8(5), retval) // i += 5
- }
- ADDQ(U8(5), dstBase) // dst += 5
- JMP(end)
- Label("repeat_four_" + name)
- LEAL(Mem{Base: length, Disp: -256}, length.As32()) // length -= 256
- MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 6<<2 | tagCopy1, dst[1] = 0
- MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
- if retval != nil {
- ADDQ(U8(4), retval) // i += 4
- }
- ADDQ(U8(4), dstBase) // dst += 4
- JMP(end)
- Label("repeat_three_" + name)
- LEAL(Mem{Base: length, Disp: -4}, length.As32()) // length -= 4
- MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 5<<2 | tagCopy1, dst[1] = 0
- MOVB(length.As8(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length)
- if retval != nil {
- ADDQ(U8(3), retval) // i += 3
- }
- ADDQ(U8(3), dstBase) // dst += 3
- JMP(end)
- Label("repeat_two_" + name)
- // dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0
- SHLL(U8(2), length.As32())
- ORL(U8(tagCopy1), length.As32())
- MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0
- if retval != nil {
- ADDQ(U8(2), retval) // i += 2
- }
- ADDQ(U8(2), dstBase) // dst += 2
- JMP(end)
- Label("repeat_two_offset_" + name)
- // Emit the remaining copy, encoded as 2 bytes.
- // dst[1] = uint8(offset)
- // dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
- tmp = GP64()
- XORQ(tmp, tmp)
- // Use scale and displacement to shift and subtract values from length.
- LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length.As32())
- MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
- SARL(U8(8), offset.As32()) // Remove lower
- SHLL(U8(5), offset.As32()) // Shift back up
- ORL(offset.As32(), length.As32()) // OR result
- MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
- if retval != nil {
- ADDQ(U8(2), retval) // i += 2
- }
- ADDQ(U8(2), dstBase) // dst += 2
- JMP(end)
- }
- // emitCopy writes a copy chunk and returns the number of bytes written.
- //
- // It assumes that:
- // dst is long enough to hold the encoded bytes
- // 1 <= offset && offset <= math.MaxUint32
- // 4 <= length && length <= 1 << 24
- // genEmitCopy generates a standalone emitCopy
- func genEmitCopy() {
- TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int")
- Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "",
- "It assumes that:",
- " dst is long enough to hold the encoded bytes",
- " 1 <= offset && offset <= math.MaxUint32",
- " 4 <= length && length <= 1 << 24", "")
- Pragma("noescape")
- dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
- // i := 0
- XORQ(retval, retval)
- Load(Param("dst").Base(), dstBase)
- Load(Param("offset"), offset)
- Load(Param("length"), length)
- emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end"), false)
- Label("gen_emit_copy_end")
- Store(retval, ReturnIndex(0))
- RET()
- }
- // emitCopy writes a copy chunk and returns the number of bytes written.
- //
- // It assumes that:
- // dst is long enough to hold the encoded bytes
- // 1 <= offset && offset <= math.MaxUint32
- // 4 <= length && length <= 1 << 24
- // genEmitCopyNoRepeat generates a standalone emitCopyNoRepeat
- func genEmitCopyNoRepeat() {
- TEXT("emitCopyNoRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
- Doc("emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.", "",
- "It assumes that:",
- " dst is long enough to hold the encoded bytes",
- " 1 <= offset && offset <= math.MaxUint32",
- " 4 <= length && length <= 1 << 24", "")
- Pragma("noescape")
- dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
- // i := 0
- XORQ(retval, retval)
- Load(Param("dst").Base(), dstBase)
- Load(Param("offset"), offset)
- Load(Param("length"), length)
- emitCopy("standalone_snappy", length, offset, retval, dstBase, "gen_emit_copy_end_snappy", true)
- Label("gen_emit_copy_end_snappy")
- Store(retval, ReturnIndex(0))
- RET()
- }
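- // The tag values below follow the Snappy element layout: the low two bits of
- // the first byte of each element select a literal (00), a copy with a 1-byte
- // offset (01), a 2-byte offset (10) or a 4-byte offset (11); the remaining
- // six bits carry (part of) the length.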
- const (
- tagLiteral = 0x00
- tagCopy1 = 0x01
- tagCopy2 = 0x02
- tagCopy4 = 0x03
- )
- // emitCopy can be used for inlining an emitCopy call.
- // length is modified (and junk). dstBase is updated. retval is added to input.
- // retval can be nil.
- // Will jump to end label when finished.
- // Uses 2 GP registers.
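- // Encoding choices, as implemented below: offsets of 65536 and above use the
- // five-byte tagCopy4 form, with lengths above 64 split into 64-byte chunks;
- // smaller offsets use the three-byte tagCopy2 form, or the two-byte tagCopy1
- // form when length < 12 and offset < 2048. Unless 'snappy' is set, any length
- // remaining after the first chunk is handed off to the inlined emitRepeat.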
- func emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef, snappy bool) {
- //if offset >= 65536 {
- CMPL(offset.As32(), U32(65536))
- JL(LabelRef("two_byte_offset_" + name))
- // offset is >= 65536
- // if length <= 64 goto four_bytes_remain_
- Label("four_bytes_loop_back_" + name)
- CMPL(length.As32(), U8(64))
- JLE(LabelRef("four_bytes_remain_" + name))
- // Emit a length 64 copy, encoded as 5 bytes.
- // dst[0] = 63<<2 | tagCopy4
- MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase})
- // dst[4] = uint8(offset >> 24)
- // dst[3] = uint8(offset >> 16)
- // dst[2] = uint8(offset >> 8)
- // dst[1] = uint8(offset)
- MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1})
- // length -= 64
- LEAL(Mem{Base: length, Disp: -64}, length.As32())
- if retval != nil {
- ADDQ(U8(5), retval) // i+=5
- }
- ADDQ(U8(5), dstBase) // dst+=5
- // if length >= 4 {
- CMPL(length.As32(), U8(4))
- JL(LabelRef("four_bytes_remain_" + name))
- // Emit remaining as repeats
- // return 5 + emitRepeat(dst[5:], offset, length)
- // Inline call to emitRepeat. Will jump to end
- if !snappy {
- emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end)
- }
- JMP(LabelRef("four_bytes_loop_back_" + name))
- Label("four_bytes_remain_" + name)
- // if length == 0 {
- // return i
- // }
- TESTL(length.As32(), length.As32())
- JZ(end)
- // Emit a copy, offset encoded as 4 bytes.
- // dst[i+0] = uint8(length-1)<<2 | tagCopy4
- // dst[i+1] = uint8(offset)
- // dst[i+2] = uint8(offset >> 8)
- // dst[i+3] = uint8(offset >> 16)
- // dst[i+4] = uint8(offset >> 24)
- tmp := GP64()
- MOVB(U8(tagCopy4), tmp.As8())
- // Use displacement to subtract 1 from upshifted length.
- LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32())
- MOVB(length.As8(), Mem{Base: dstBase})
- MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1})
- // return i + 5
- if retval != nil {
- ADDQ(U8(5), retval)
- }
- ADDQ(U8(5), dstBase)
- JMP(end)
- Label("two_byte_offset_" + name)
- // Offset no more than 2 bytes.
- //if length > 64 {
- CMPL(length.As32(), U8(64))
- JLE(LabelRef("two_byte_offset_short_" + name))
- // Emit a length 60 copy, encoded as 3 bytes.
- // Emit remaining as repeat value (minimum 4 bytes).
- // dst[2] = uint8(offset >> 8)
- // dst[1] = uint8(offset)
- // dst[0] = 59<<2 | tagCopy2
- MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase})
- MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
- // length -= 60
- LEAL(Mem{Base: length, Disp: -60}, length.As32())
- // Emit remaining as repeats, at least 4 bytes remain.
- // return 3 + emitRepeat(dst[3:], offset, length)
- //}
- ADDQ(U8(3), dstBase)
- if retval != nil {
- ADDQ(U8(3), retval)
- }
- // Inline call to emitRepeat. Will jump to end
- if !snappy {
- emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end)
- }
- JMP(LabelRef("two_byte_offset_" + name))
- Label("two_byte_offset_short_" + name)
- //if length >= 12 || offset >= 2048 {
- CMPL(length.As32(), U8(12))
- JGE(LabelRef("emit_copy_three_" + name))
- CMPL(offset.As32(), U32(2048))
- JGE(LabelRef("emit_copy_three_" + name))
- // Emit the remaining copy, encoded as 2 bytes.
- // dst[1] = uint8(offset)
- // dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
- tmp = GP64()
- MOVB(U8(tagCopy1), tmp.As8())
- // Use scale and displacement to shift and subtract values from length.
- LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length.As32())
- MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
- SHRL(U8(8), offset.As32()) // Remove lower
- SHLL(U8(5), offset.As32()) // Shift back up
- ORL(offset.As32(), length.As32()) // OR result
- MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
- if retval != nil {
- ADDQ(U8(2), retval) // i += 2
- }
- ADDQ(U8(2), dstBase) // dst += 2
- // return 2
- JMP(end)
- Label("emit_copy_three_" + name)
- // Emit the remaining copy, encoded as 3 bytes.
- // dst[2] = uint8(offset >> 8)
- // dst[1] = uint8(offset)
- // dst[0] = uint8(length-1)<<2 | tagCopy2
- tmp = GP64()
- MOVB(U8(tagCopy2), tmp.As8())
- LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32())
- MOVB(length.As8(), Mem{Base: dstBase})
- MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
- // return 3
- if retval != nil {
- ADDQ(U8(3), retval) // i += 3
- }
- ADDQ(U8(3), dstBase) // dst += 3
- JMP(end)
- }
- // func memmove(to, from unsafe.Pointer, n uintptr)
- // to and from will be at the end, n will be 0.
- // to and from may not overlap.
- // Fairly simplistic for now; can of course be extended.
- // Uses one GP register and 8 SSE registers.
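- // The copy runs in three phases: 128 bytes per iteration using eight SSE
- // registers (prefetching the destination), then 16 bytes per iteration, then
- // the last 0-15 bytes one byte at a time. A hedged Go sketch of the same
- // control flow, with 'to' and 'from' treated as byte slices:
- //
- //	for n >= 128 { copy(to[:128], from[:128]); to, from, n = to[128:], from[128:], n-128 }
- //	for n >= 16  { copy(to[:16], from[:16]);   to, from, n = to[16:],  from[16:],  n-16 }
- //	for n > 0    { to[0] = from[0];            to, from, n = to[1:],   from[1:],   n-1 }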
- func genMemMove(name string, to, from, n reg.GPVirtual, end LabelRef) {
- tmp := GP32()
- MOVL(n.As32(), tmp)
- // tmp = n/128
- SHRL(U8(7), tmp)
- TESTL(tmp, tmp)
- JZ(LabelRef("done_128_" + name))
- Label("loop_128_" + name)
- var xmmregs [8]reg.VecVirtual
- // Prefetch destination for next loop.
- // Prefetching source doesn't provide speedup.
- // This seems to give a small boost.
- const preOff = 128
- PREFETCHT0(Mem{Base: to, Disp: preOff})
- PREFETCHT0(Mem{Base: to, Disp: preOff + 64})
- for i := 0; i < 8; i++ {
- xmmregs[i] = XMM()
- MOVOU(Mem{Base: from}.Offset(i*16), xmmregs[i])
- }
- for i := 0; i < 8; i++ {
- MOVOU(xmmregs[i], Mem{Base: to}.Offset(i*16))
- }
- LEAL(Mem{Base: n, Disp: -128}, n.As32())
- ADDQ(U8(8*16), from)
- ADDQ(U8(8*16), to)
- DECL(tmp)
- JNZ(LabelRef("loop_128_" + name))
- Label("done_128_" + name)
- MOVL(n.As32(), tmp)
- // tmp = n/16
- SHRL(U8(4), tmp)
- TESTL(tmp, tmp)
- JZ(LabelRef("done_16_" + name))
- Label("loop_16_" + name)
- xmm := XMM()
- MOVOU(Mem{Base: from}, xmm)
- MOVOU(xmm, Mem{Base: to})
- LEAL(Mem{Base: n, Disp: -16}, n.As32())
- ADDQ(U8(16), from)
- ADDQ(U8(16), to)
- DECL(tmp)
- JNZ(LabelRef("loop_16_" + name))
- Label("done_16_" + name)
- // TODO: Use REP; MOVSB somehow.
- TESTL(n.As32(), n.As32())
- JZ(end)
- Label("loop_1_" + name)
- MOVB(Mem{Base: from}, tmp.As8())
- MOVB(tmp.As8(), Mem{Base: to})
- INCQ(from)
- INCQ(to)
- DECL(n.As32())
- JNZ(LabelRef("loop_1_" + name))
- JMP(end)
- }
- // func memmove(to, from unsafe.Pointer, n uintptr)
- // src and dst may not overlap.
- // Non AVX uses 2 GP register, 16 SSE2 registers.
- // AVX uses 4 GP registers 16 AVX/SSE registers.
- // All passed registers may be updated.
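- // The size dispatch below (move_1or2 through move_129through256, the
- // 256-through-2048 loop and the AVX "gobble" path) appears to mirror the Go
- // runtime's amd64 memmove, minus the backward/overlap handling, since callers
- // here guarantee non-overlapping regions. That is an observation about the
- // structure, not a claim that the code is generated from the runtime sources.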
- func genMemMove2(name string, dst, src, length reg.GPVirtual, end LabelRef, avx bool) {
- AX, CX := GP64(), GP64()
- NOP()
- name += "_memmove_"
- Label(name + "tail")
- // move_129through256 or smaller work whether or not the source and the
- // destination memory regions overlap because they load all data into
- // registers before writing it back. move_256through2048 on the other
- // hand can be used only when the memory regions don't overlap or the copy
- // direction is forward.
- //
- // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
- TESTQ(length, length)
- JEQ(end)
- CMPQ(length, U8(2))
- JBE(LabelRef(name + "move_1or2"))
- CMPQ(length, U8(4))
- JB(LabelRef(name + "move_3"))
- JBE(LabelRef(name + "move_4"))
- CMPQ(length, U8(8))
- JB(LabelRef(name + "move_5through7"))
- JE(LabelRef(name + "move_8"))
- CMPQ(length, U8(16))
- JBE(LabelRef(name + "move_9through16"))
- CMPQ(length, U8(32))
- JBE(LabelRef(name + "move_17through32"))
- CMPQ(length, U8(64))
- JBE(LabelRef(name + "move_33through64"))
- CMPQ(length, U8(128))
- JBE(LabelRef(name + "move_65through128"))
- CMPQ(length, U32(256))
- JBE(LabelRef(name + "move_129through256"))
- if avx {
- JMP(LabelRef(name + "avxUnaligned"))
- } else {
- if false {
- // Don't check length for now.
- Label(name + "forward")
- CMPQ(length, U32(2048))
- JLS(LabelRef(name + "move_256through2048"))
- genMemMove(name+"fallback", dst, src, length, end)
- } else {
- JMP(LabelRef(name + "move_256through2048"))
- }
- }
- /*
- // If REP MOVSB isn't fast, don't use it
- // FIXME: internal∕cpu·X86+const_offsetX86HasERMS(SB)
- // CMPB(U8(1), U8(1)) // enhanced REP MOVSB/STOSB
- JMP(LabelRef(name + "fwdBy8"))
- // Check alignment
- MOVL(src.As32(), AX.As32())
- ORL(dst.As32(), AX.As32())
- TESTL(U32(7), AX.As32())
- JEQ(LabelRef(name + "fwdBy8"))
- // Do 1 byte at a time
- // MOVQ(length, CX)
- // FIXME:
- // REP; MOVSB
- JMP(end)
- Label(name + "fwdBy8")
- // Do 8 bytes at a time
- MOVQ(length, CX)
- SHRQ(U8(3), CX)
- ANDQ(U8(7), length)
- // FIXME:
- //REP; MOVSQ
- JMP(LabelRef(name + "tail"))
- Label(name + "back")
- //check overlap
- MOVQ(src, CX)
- ADDQ(length, CX)
- CMPQ(CX, dst)
- JLS(LabelRef(name + "forward"))
- //whole thing backwards has
- //adjusted addresses
- ADDQ(length, dst)
- ADDQ(length, src)
- STD()
- //
- // copy
- //
- MOVQ(length, CX)
- SHRQ(U8(3), CX)
- ANDQ(U8(7), length)
- SUBQ(U8(8), dst)
- SUBQ(U8(8), src)
- // FIXME:
- //REP; MOVSQ
- // FIXME:
- //CLD()
- ADDQ(U8(8), dst)
- ADDQ(U8(8), src)
- SUBQ(length, dst)
- SUBQ(length, src)
- JMP(LabelRef(name + "tail"))
- */
- Label(name + "move_1or2")
- MOVB(Mem{Base: src}, AX.As8())
- MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
- MOVB(AX.As8(), Mem{Base: dst})
- MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
- JMP(end)
- Label(name + "move_4")
- MOVL(Mem{Base: src}, AX.As32())
- MOVL(AX.As32(), Mem{Base: dst})
- JMP(end)
- Label(name + "move_3")
- MOVW(Mem{Base: src}, AX.As16())
- MOVB(Mem{Base: src, Disp: 2}, CX.As8())
- MOVW(AX.As16(), Mem{Base: dst})
- MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
- JMP(end)
- Label(name + "move_5through7")
- MOVL(Mem{Base: src}, AX.As32())
- MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
- MOVL(AX.As32(), Mem{Base: dst})
- MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
- JMP(end)
- Label(name + "move_8")
- // We need a separate case for 8 to make sure we write pointers atomically.
- MOVQ(Mem{Base: src}, AX)
- MOVQ(AX, Mem{Base: dst})
- JMP(end)
- Label(name + "move_9through16")
- MOVQ(Mem{Base: src}, AX)
- MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
- MOVQ(AX, Mem{Base: dst})
- MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
- JMP(end)
- Label(name + "move_17through32")
- X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
- X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
- MOVOU(Mem{Base: src}, X0)
- MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1)
- MOVOU(X0, Mem{Base: dst})
- MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
- JMP(end)
- Label(name + "move_33through64")
- MOVOU(Mem{Base: src}, X0)
- MOVOU(Mem{Base: src, Disp: 16}, X1)
- MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
- MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
- MOVOU(X0, Mem{Base: dst})
- MOVOU(X1, Mem{Base: dst, Disp: 16})
- MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
- MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
- JMP(end)
- Label(name + "move_65through128")
- MOVOU(Mem{Base: src}, X0)
- MOVOU(Mem{Base: src, Disp: 16}, X1)
- MOVOU(Mem{Base: src, Disp: 32}, X2)
- MOVOU(Mem{Base: src, Disp: 48}, X3)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
- MOVOU(X0, Mem{Base: dst})
- MOVOU(X1, Mem{Base: dst, Disp: 16})
- MOVOU(X2, Mem{Base: dst, Disp: 32})
- MOVOU(X3, Mem{Base: dst, Disp: 48})
- MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
- MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
- MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
- MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
- JMP(end)
- Label(name + "move_129through256")
- MOVOU(Mem{Base: src}, X0)
- MOVOU(Mem{Base: src, Disp: 16}, X1)
- MOVOU(Mem{Base: src, Disp: 32}, X2)
- MOVOU(Mem{Base: src, Disp: 48}, X3)
- MOVOU(Mem{Base: src, Disp: 64}, X4)
- MOVOU(Mem{Base: src, Disp: 80}, X5)
- MOVOU(Mem{Base: src, Disp: 96}, X6)
- MOVOU(Mem{Base: src, Disp: 112}, X7)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
- MOVOU(X0, Mem{Base: dst})
- MOVOU(X1, Mem{Base: dst, Disp: 16})
- MOVOU(X2, Mem{Base: dst, Disp: 32})
- MOVOU(X3, Mem{Base: dst, Disp: 48})
- MOVOU(X4, Mem{Base: dst, Disp: 64})
- MOVOU(X5, Mem{Base: dst, Disp: 80})
- MOVOU(X6, Mem{Base: dst, Disp: 96})
- MOVOU(X7, Mem{Base: dst, Disp: 112})
- MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128})
- MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112})
- MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96})
- MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80})
- MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
- MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
- MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
- MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
- JMP(end)
- Label(name + "move_256through2048")
- LEAQ(Mem{Base: length, Disp: -256}, length)
- MOVOU(Mem{Base: src}, X0)
- MOVOU(Mem{Base: src, Disp: 16}, X1)
- MOVOU(Mem{Base: src, Disp: 32}, X2)
- MOVOU(Mem{Base: src, Disp: 48}, X3)
- MOVOU(Mem{Base: src, Disp: 64}, X4)
- MOVOU(Mem{Base: src, Disp: 80}, X5)
- MOVOU(Mem{Base: src, Disp: 96}, X6)
- MOVOU(Mem{Base: src, Disp: 112}, X7)
- MOVOU(Mem{Base: src, Disp: 128}, X8)
- MOVOU(Mem{Base: src, Disp: 144}, X9)
- MOVOU(Mem{Base: src, Disp: 160}, X10)
- MOVOU(Mem{Base: src, Disp: 176}, X11)
- MOVOU(Mem{Base: src, Disp: 192}, X12)
- MOVOU(Mem{Base: src, Disp: 208}, X13)
- MOVOU(Mem{Base: src, Disp: 224}, X14)
- MOVOU(Mem{Base: src, Disp: 240}, X15)
- MOVOU(X0, Mem{Base: dst})
- MOVOU(X1, Mem{Base: dst, Disp: 16})
- MOVOU(X2, Mem{Base: dst, Disp: 32})
- MOVOU(X3, Mem{Base: dst, Disp: 48})
- MOVOU(X4, Mem{Base: dst, Disp: 64})
- MOVOU(X5, Mem{Base: dst, Disp: 80})
- MOVOU(X6, Mem{Base: dst, Disp: 96})
- MOVOU(X7, Mem{Base: dst, Disp: 112})
- MOVOU(X8, Mem{Base: dst, Disp: 128})
- MOVOU(X9, Mem{Base: dst, Disp: 144})
- MOVOU(X10, Mem{Base: dst, Disp: 160})
- MOVOU(X11, Mem{Base: dst, Disp: 176})
- MOVOU(X12, Mem{Base: dst, Disp: 192})
- MOVOU(X13, Mem{Base: dst, Disp: 208})
- MOVOU(X14, Mem{Base: dst, Disp: 224})
- MOVOU(X15, Mem{Base: dst, Disp: 240})
- CMPQ(length, U32(256))
- LEAQ(Mem{Base: src, Disp: 256}, src)
- LEAQ(Mem{Base: dst, Disp: 256}, dst)
- JGE(LabelRef(name + "move_256through2048"))
- JMP(LabelRef(name + "tail"))
- if avx {
- Label(name + "avxUnaligned")
- R8, R10 := GP64(), GP64()
- // There are two implementations of the move algorithm.
- // The first one is for non-overlapping memory regions; it uses forward copying.
- // We do not support overlapping input.
- // Non-temporal copy would be better for big sizes.
- // Disabled since big copies are unlikely.
- // If enabling, test functionality.
- const enableBigData = false
- if enableBigData {
- CMPQ(length, U32(0x100000))
- JAE(LabelRef(name + "gobble_big_data_fwd"))
- }
- // Memory layout on the source side
- // src CX
- // |<---------length before correction--------->|
- // | |<--length corrected-->| |
- // | | |<--- AX --->|
- // |<-R11->| |<-128 bytes->|
- // +----------------------------------------+
- // | Head | Body | Tail |
- // +-------+------------------+-------------+
- // ^ ^ ^
- // | | |
- // Save head into Y4 Save tail into X5..X12
- // |
- // src+R11, where R11 = ((dst & -32) + 32) - dst
- // Algorithm:
- // 1. Unaligned save of the tail's 128 bytes
- // 2. Unaligned save of the head's 32 bytes
- // 3. Destination-aligned copying of body (128 bytes per iteration)
- // 4. Put head on the new place
- // 5. Put the tail on the new place
- // It can be important to satisfy processor's pipeline requirements for
- // small sizes as the cost of unaligned memory region copying is
- // comparable with the cost of main loop. So code is slightly messed there.
- // There is more clean implementation of that algorithm for bigger sizes
- // where the cost of unaligned part copying is negligible.
- // You can see it after gobble_big_data_fwd label.
- Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM()
- LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
- MOVQ(dst, R10)
- // CX points to the end of buffer so we need go back slightly. We will use negative offsets there.
- MOVOU(Mem{Base: CX, Disp: -0x80}, X5)
- MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
- MOVQ(U32(0x80), AX)
- // Align destination address
- ANDQ(U32(0xffffffe0), dst)
- ADDQ(U8(32), dst)
- // Continue tail saving.
- MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
- MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
- // Make R8 delta between aligned and unaligned destination addresses.
- MOVQ(dst, R8)
- SUBQ(R10, R8)
- // Continue tail saving.
- MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
- MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
- // Adjust the bytes-to-copy value, since the unaligned part has been prepared for copying.
- SUBQ(R8, length)
- // Continue tail saving.
- MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
- MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
- // The tail will be put on its place after main body copying.
- // It's time for the unaligned heading part.
- VMOVDQU(Mem{Base: src}, Y4)
- // Adjust source address to point past head.
- ADDQ(R8, src)
- SUBQ(AX, length)
- // Aligned memory copying starts here.
- Label(name + "gobble_128_loop")
- VMOVDQU(Mem{Base: src}, Y0)
- VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
- VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
- VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
- ADDQ(AX, src)
- VMOVDQA(Y0, Mem{Base: dst})
- VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20})
- VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40})
- VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60})
- ADDQ(AX, dst)
- SUBQ(AX, length)
- JA(LabelRef(name + "gobble_128_loop"))
- // Now we can store unaligned parts.
- ADDQ(AX, length)
- ADDQ(dst, length)
- VMOVDQU(Y4, Mem{Base: R10})
- VZEROUPPER()
- MOVOU(X5, Mem{Base: length, Disp: -0x80})
- MOVOU(X6, Mem{Base: length, Disp: -0x70})
- MOVOU(X7, Mem{Base: length, Disp: -0x60})
- MOVOU(X8, Mem{Base: length, Disp: -0x50})
- MOVOU(X9, Mem{Base: length, Disp: -0x40})
- MOVOU(X10, Mem{Base: length, Disp: -0x30})
- MOVOU(X11, Mem{Base: length, Disp: -0x20})
- MOVOU(X12, Mem{Base: length, Disp: -0x10})
- JMP(end)
- if enableBigData {
- Label(name + "gobble_big_data_fwd")
- // There is forward copying for big regions.
- // It uses non-temporal mov instructions.
- // Details of this algorithm are commented previously for small sizes.
- LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
- MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -0x80}, X5)
- MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
- MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
- MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
- MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
- MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
- MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
- MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
- VMOVDQU(Mem{Base: src}, Y4)
- MOVQ(dst, R8)
- ANDQ(U32(0xffffffe0), dst)
- ADDQ(U8(32), dst)
- MOVQ(dst, R10)
- SUBQ(R8, R10)
- SUBQ(R10, length)
- ADDQ(R10, src)
- LEAQ(Mem{Base: dst, Index: length, Scale: 1}, CX)
- SUBQ(U8(0x80), length)
- Label(name + "gobble_mem_fwd_loop")
- PREFETCHNTA(Mem{Base: src, Disp: 0x1c0})
- PREFETCHNTA(Mem{Base: src, Disp: 0x280})
- // Prefetch values were chosen empirically.
- // Approach for prefetch usage as in 7.6.6 of [1]
- // [1] 64-ia-32-architectures-optimization-manual.pdf
- // https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
- VMOVDQU(Mem{Base: src}, Y0)
- VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
- VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
- VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
- ADDQ(U8(0x80), src)
- VMOVNTDQ(Y0, Mem{Base: dst})
- VMOVNTDQ(Y1, Mem{Base: dst, Disp: 0x20})
- VMOVNTDQ(Y2, Mem{Base: dst, Disp: 0x40})
- VMOVNTDQ(Y3, Mem{Base: dst, Disp: 0x60})
- ADDQ(U8(0x80), dst)
- SUBQ(U8(0x80), length)
- JA(LabelRef(name + "gobble_mem_fwd_loop"))
- // NT instructions don't follow the normal cache-coherency rules.
- // We need SFENCE there to make copied data available timely.
- SFENCE()
- VMOVDQU(Y4, Mem{Base: R8})
- VZEROUPPER()
- MOVOU(X5, Mem{Base: CX, Disp: -0x80})
- MOVOU(X6, Mem{Base: CX, Disp: -0x70})
- MOVOU(X7, Mem{Base: CX, Disp: -0x60})
- MOVOU(X8, Mem{Base: CX, Disp: -0x50})
- MOVOU(X9, Mem{Base: CX, Disp: -0x40})
- MOVOU(X10, Mem{Base: CX, Disp: -0x30})
- MOVOU(X11, Mem{Base: CX, Disp: -0x20})
- MOVOU(X12, Mem{Base: CX, Disp: -0x10})
- JMP(end)
- }
- }
- }
- // genMatchLen generates standalone matchLen.
- func genMatchLen() {
- TEXT("matchLen", NOSPLIT, "func(a, b []byte) int")
- Doc("matchLen returns how many bytes match in a and b", "",
- "It assumes that:",
- " len(a) <= len(b)", "")
- Pragma("noescape")
- aBase, bBase, length := GP64(), GP64(), GP64()
- Load(Param("a").Base(), aBase)
- Load(Param("b").Base(), bBase)
- Load(Param("a").Len(), length)
- l := matchLen("standalone", aBase, bBase, length, LabelRef("gen_match_len_end"))
- Label("gen_match_len_end")
- Store(l.As64(), ReturnIndex(0))
- RET()
- }
- // matchLen returns the number of matching bytes of a and b.
- // len is the maximum number of bytes to match.
- // Will jump to end when done and returns the length.
- // Uses 2 GP registers.
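- // A hedged Go sketch of the value computed (assuming little-endian 64-bit
- // loads as on amd64, len(a) <= len(b), and using encoding/binary and math/bits):
- //
- //	func matchLen(a, b []byte) (n int) {
- //		for len(a)-n >= 8 {
- //			if diff := binary.LittleEndian.Uint64(a[n:]) ^ binary.LittleEndian.Uint64(b[n:]); diff != 0 {
- //				return n + bits.TrailingZeros64(diff)>>3
- //			}
- //			n += 8
- //		}
- //		for n < len(a) && a[n] == b[n] {
- //			n++
- //		}
- //		return n
- //	}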
- func matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
- tmp, matched := GP64(), GP32()
- XORL(matched, matched)
- CMPL(len.As32(), U8(8))
- JL(LabelRef("matchlen_single_" + name))
- Label("matchlen_loopback_" + name)
- MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp)
- XORQ(Mem{Base: b, Index: matched, Scale: 1}, tmp)
- TESTQ(tmp, tmp)
- JZ(LabelRef("matchlen_loop_" + name))
- // Not all match.
- BSFQ(tmp, tmp)
- SARQ(U8(3), tmp)
- LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
- JMP(end)
- // All 8 byte matched, update and loop.
- Label("matchlen_loop_" + name)
- LEAL(Mem{Base: len, Disp: -8}, len.As32())
- LEAL(Mem{Base: matched, Disp: 8}, matched)
- CMPL(len.As32(), U8(8))
- JGE(LabelRef("matchlen_loopback_" + name))
- // Less than 8 bytes left.
- Label("matchlen_single_" + name)
- TESTL(len.As32(), len.As32())
- JZ(end)
- Label("matchlen_single_loopback_" + name)
- MOVB(Mem{Base: a, Index: matched, Scale: 1}, tmp.As8())
- CMPB(Mem{Base: b, Index: matched, Scale: 1}, tmp.As8())
- JNE(end)
- LEAL(Mem{Base: matched, Disp: 1}, matched)
- DECL(len.As32())
- JNZ(LabelRef("matchlen_single_loopback_" + name))
- JMP(end)
- return matched
- }