123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375 |
- // +build !appengine
- // +build gc
- // +build !noasm
- #include "textflag.h"
// func decodeBlock(dst, src []byte) int
//
// Decodes the compressed block in src into dst. The token / literal-run /
// 2-byte offset / match-length layout handled here matches the LZ4 block
// format (NOTE(review): confirm against the Go-side caller).
//
// Result (ret+48(FP)):
//   >= 0  number of bytes written to dst
//   -1    corrupt input: a match offset of zero
//   -2    src or dst is too short, or a match starts before &dst
//
// Register roles:
//   AX   scratch
//   BX   scratch / match source pointer
//   CX   scratch: literal length, then match length
//   DX   token, later the match offset
//   DI   &dst[di] (write cursor)
//   SI   &src[si] (read cursor)
//   R8   &dst + len(dst)
//   R9   &src + len(src)
//   R11  &dst
//   R12  short output end (R8 - 32)
//   R13  short input end  (R9 - 16)
//   X0   16-byte copy scratch
//
// Stack: 64-byte frame. 0..23(SP) hold the memmove arguments and 24..48(SP)
// hold spilled registers, i.e. 49 bytes are in use.
//
// Pointer comparisons use signed condition codes. This is relied upon when
// dst is nil/empty: R12 then becomes negative and the shortcut stays off.
TEXT ·decodeBlock(SB), NOSPLIT, $64-56
	MOVQ dst_base+0(FP), DI
	MOVQ DI, R11
	MOVQ dst_len+8(FP), R8
	ADDQ DI, R8

	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R9
	ADDQ SI, R9

	// Shortcut limits: the fast path below writes up to 14+18 bytes and reads
	// 16 literal bytes plus the 2-byte offset without further bounds checks.
	// short output end
	MOVQ R8, R12
	SUBQ $32, R12
	// short input end
	MOVQ R9, R13
	SUBQ $16, R13

loop:
	// for si < len(src)
	CMPQ SI, R9
	JGE end

	// token := uint32(src[si]); si++
	MOVBQZX (SI), DX
	INCQ SI

	// CX = lit_len = token >> 4
	MOVQ DX, CX
	SHRQ $4, CX

	// Take the shortcut only if lit_len != 0xF and both buffers have slack.
	CMPQ CX, $0xF
	JEQ lit_len_loop_pre
	CMPQ DI, R12
	JGE lit_len_loop_pre
	CMPQ SI, R13
	JGE lit_len_loop_pre

	// copy shortcut
	// A two-stage shortcut for the most common case:
	// 1) If the literal length is 0..14, and there is enough space,
	//    copy 16 bytes on behalf of the literals.
	// 2) Further, if the match length is 4..18, copy 18 bytes in a similar
	//    manner; the space for those 18 bytes was already verified on
	//    entering the shortcut (one combined check for both stages).

	// Stage 1: copy literals (always 16 bytes, only CX of them count).
	MOVOU (SI), X0
	MOVOU X0, (DI)
	ADDQ CX, DI
	ADDQ CX, SI

	// CX = token & 0xF (match length nibble)
	MOVQ DX, CX
	ANDQ $0xF, CX

	// Stage 2: prepare for match copying, decode full info.
	// If it doesn't work out, the info won't be wasted.
	// offset := uint16(src[si:si+2]); si += 2
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// 0 offset is invalid; reject it here exactly like the slow path does,
	// otherwise the match would copy dst onto itself and report success.
	CMPQ DX, $0
	JEQ err_corrupt

	// AX = match source = di - offset
	MOVQ DI, AX
	SUBQ DX, AX
	// sanity check: the match source must not lie after the write cursor
	CMPQ AX, DI
	JGT err_short_buf

	// If we can't do the second stage then jump straight to reading the
	// match length; we already have the offset.
	CMPQ CX, $0xF
	JEQ match_len_loop_pre
	// offsets below 8 would make the 8-byte copies overlap incorrectly
	CMPQ DX, $8
	JLT match_len_loop_pre
	// match source must not precede &dst
	CMPQ AX, R11
	JLT err_short_buf

	// memcpy(op + 0, match + 0, 8);
	MOVQ (AX), BX
	MOVQ BX, (DI)
	// memcpy(op + 8, match + 8, 8);
	MOVQ 8(AX), BX
	MOVQ BX, 8(DI)
	// memcpy(op + 16, match + 16, 2);
	MOVW 16(AX), BX
	MOVW BX, 16(DI)

	ADDQ $4, DI // minmatch
	ADDQ CX, DI

	// shortcut complete, load next token
	JMP loop

lit_len_loop_pre:
	// if lit_len == 0 there are no literals: go straight to the offset
	CMPQ CX, $0
	JEQ offset
	// lit_len < 0xF is already final
	CMPQ CX, $0xF
	JNE copy_literal

lit_len_loop:
	// bounds check src[si] before it is read
	CMPQ SI, R9
	JGE err_short_buf

	// for src[si] == 0xFF
	CMPB (SI), $0xFF
	JNE lit_len_finalise

	// lit_len += 0xFF; si++
	ADDQ $0xFF, CX
	INCQ SI
	JMP lit_len_loop

lit_len_finalise:
	// lit_len += int(src[si]); si++
	// (src[si] was bounds-checked at the top of this iteration)
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI

copy_literal:
	// bounds check src and dst: si+lit_len <= len(src), di+lit_len <= len(dst)
	MOVQ SI, AX
	ADDQ CX, AX
	CMPQ AX, R9
	JGT err_short_buf

	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// Runs longer than 16 bytes go through memmove.
	CMPQ CX, $16
	JGT memmove_lit

	// The single 16-byte copy needs 16 bytes of room on both sides.
	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_lit

	// if len(src[si:]) < 16
	MOVQ R9, AX
	SUBQ SI, AX
	CMPQ AX, $16
	JLT memmove_lit

	MOVOU (SI), X0
	MOVOU X0, (DI)

	JMP finish_lit_copy

memmove_lit:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	// spill
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	MOVB DX, 48(SP) // token (one byte)
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	// Zero-extend: the callee may have left anything in the upper bits of
	// DX, and DX is copied whole into CX to derive the match length.
	MOVBQZX 48(SP), DX

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

finish_lit_copy:
	ADDQ CX, SI
	ADDQ CX, DI

	// input exhausted after the literals: done
	CMPQ SI, R9
	JGE end

offset:
	// CX := token (match length nibble is extracted below)
	// free up DX to use for offset
	MOVQ DX, CX

	// bounds check src[si:si+2]
	MOVQ SI, AX
	ADDQ $2, AX
	CMPQ AX, R9
	JGT err_short_buf

	// offset
	// DX := int(src[si]) | int(src[si+1])<<8
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// 0 offset is invalid
	CMPQ DX, $0
	JEQ err_corrupt

	// CX = token & 0xF, masked at full width
	ANDQ $0xF, CX

match_len_loop_pre:
	// if mlen != 0xF the length is already final
	CMPQ CX, $0xF
	JNE copy_match

match_len_loop:
	// bounds check src[si] before it is read
	CMPQ SI, R9
	JGE err_short_buf

	// for src[si] == 0xFF
	CMPB (SI), $0xFF
	JNE match_len_finalise

	// mlen += 0xFF; si++
	ADDQ $0xFF, CX
	INCQ SI
	JMP match_len_loop

match_len_finalise:
	// mlen += int(src[si]); si++
	// (src[si] was bounds-checked at the top of this iteration)
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI

copy_match:
	// mLen += minMatch
	ADDQ $4, CX

	// check we have match_len bytes left in dst
	// di+match_len <= len(dst)
	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// DX = offset
	// CX = match_len
	// BX = &dst + (di - offset)
	MOVQ DI, BX
	SUBQ DX, BX

	// check BX is within dst
	// if BX < &dst
	CMPQ BX, R11
	JLT err_short_buf

	// if match source + match_len < di the ranges do not overlap
	MOVQ BX, AX
	ADDQ CX, AX
	CMPQ DI, AX
	JGT copy_interior_match

	// Overlapping match: must be copied front to back, byte by byte, so that
	// bytes written earlier in this match are read again later.
copy_match_loop:
	// for match_len > 0
	//   dst[di] = dst[i]
	//   di++
	//   i++
	MOVB (BX), AX
	MOVB AX, (DI)
	INCQ DI
	INCQ BX
	DECQ CX

	CMPQ CX, $0
	JGT copy_match_loop

	JMP loop

copy_interior_match:
	// Non-overlapping match: longer than 16 bytes goes through memmove.
	CMPQ CX, $16
	JGT memmove_match

	// if len(dst[di:]) < 16 the 16-byte store would overrun dst
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_match

	MOVOU (BX), X0
	MOVOU X0, (DI)

	ADDQ CX, DI
	JMP loop

memmove_match:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ BX, 8(SP)
	MOVQ CX, 16(SP)
	// spill (DX is not needed: the next token is reloaded at loop)
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11 // TODO: make these sensible numbers
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

	ADDQ CX, DI
	JMP loop

err_corrupt:
	MOVQ $-1, ret+48(FP)
	RET

err_short_buf:
	MOVQ $-2, ret+48(FP)
	RET

end:
	// return di (bytes written) = write cursor - &dst
	SUBQ R11, DI
	MOVQ DI, ret+48(FP)
	RET
|