github.com2
/
golang_snappy


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
							// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// func decode(dst, src []byte) int
//
// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
//	- AX	scratch
//	- BX	scratch
//	- CX	length or x
//	- DX	offset
//	- SI	&src[s]
//	- DI	&dst[d]
//	+ R8	dst_base
//	+ R9	dst_len
//	+ R10	dst_base + dst_len
//	+ R11	src_base
//	+ R12	src_len
//	+ R13	src_base + src_len
//	- R14	unused
//	- R15	used by doCopy
//
// The registers R8-R13 (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
TEXT ·decode(SB), NOSPLIT, $48-56
	// Initialize SI, DI and R8-R13.
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, DI
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, SI
	MOVQ R11, R13
	ADDQ R12, R13

loop:
	// for s < len(src)
	CMPQ SI, R13
	JEQ  end

	// CX = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (SI), CX
	MOVL    CX, BX
	ANDL    $3, BX
	CMPL    BX, $1
	JAE     tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, CX
	CMPL CX, $60
	JAE  tagLit60Plus

	// case x < 60:
	// s++
	INCQ SI

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that CX == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// CX can hold 64 bits, so the increment cannot overflow.
	INCQ CX

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// AX = len(dst) - d
	// BX = len(src) - s
	MOVQ R10, AX
	SUBQ DI, AX
	MOVQ R13, BX
	SUBQ SI, BX

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ CX, $16
	JGT  callMemmove
	CMPQ AX, $16
	JLT  callMemmove
	CMPQ BX, $16
	JLT  callMemmove

	// !!! Implement the copy from src to dst as two 8-byte loads and stores.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte
	// loads and stores. This technique probably wouldn't be as effective on
	// architectures that are fussier about alignment.
	MOVQ 0(SI), AX
	MOVQ AX, 0(DI)
	MOVQ 8(SI), BX
	MOVQ BX, 8(DI)

	// d += length
	// s += length
	ADDQ CX, DI
	ADDQ CX, SI
	JMP  loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ CX, AX
	JGT  errCorrupt
	CMPQ CX, BX
	JGT  errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
	// three registers to the stack, to save local variables across the CALL.
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R8-R13.
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, R13
	ADDQ R12, R13

	// d += length
	// s += length
	ADDQ CX, DI
	ADDQ CX, SI
	JMP  loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADDQ CX, SI
	SUBQ $58, SI
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   errCorrupt

	// case x == 60:
	CMPL CX, $61
	JEQ  tagLit61
	JA   tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(SI), CX
	JMP     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(SI), CX
	JMP     doLit

tagLit62Plus:
	CMPL CX, $62
	JA   tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	MOVWLZX -3(SI), CX
	MOVBLZX -1(SI), BX
	SHLL    $16, BX
	ORL     BX, CX
	JMP     doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(SI), CX
	JMP  doLit

// The code above handles literal tags.
// ----------------------------------------
// The code below handles copy tags.

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, SI

	// if uint(s) > uint(len(src)) { etc }
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, CX
	INCQ CX

	// offset = int(src[s-2]) | int(src[s-1])<<8
	MOVWQZX -2(SI), DX
	JMP     doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- BX == src[s] & 0x03
	//	- CX == src[s]
	CMPQ BX, $2
	JEQ  tagCopy2
	JA   errUC4T

	// case tagCopy1:
	// s += 2
	ADDQ $2, SI

	// if uint(s) > uint(len(src)) { etc }
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   errCorrupt

	// offset = int(src[s-2])&0xe0<<3 | int(src[s-1])
	MOVQ    CX, DX
	ANDQ    $0xe0, DX
	SHLQ    $3, DX
	MOVBQZX -1(SI), BX
	ORQ     BX, DX

	// length = 4 + int(src[s-2])>>2&0x7
	SHRQ $2, CX
	ANDQ $7, CX
	ADDQ $4, CX

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- CX == length && CX > 0
	//	- DX == offset

	// if offset <= 0 { etc }
	CMPQ DX, $0
	JLE  errCorrupt

	// if d < offset { etc }
	MOVQ DI, BX
	SUBQ R8, BX
	CMPQ BX, DX
	JLT  errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R10, BX
	SUBQ DI, BX
	CMPQ CX, BX
	JGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R15 = &dst[d-offset]
	MOVQ DI, R15
	SUBQ DX, R15

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R15), BX
	MOVB BX, (DI)
	INCQ R15
	INCQ DI
	DECQ CX
	JNZ  verySlowForwardCopy
	JMP  loop

// The code above handles copy tags.
// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ DI, R10
	JNE  errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET

errUC4T:
	// return decodeErrCodeUnsupportedCopy4Tag
	MOVQ $3, ret+48(FP)
	RET