// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// func decode(dst, src []byte) int
//
// decode walks the tagged elements in src and writes the decoded bytes to
// dst. It returns 0 when src is fully consumed and dst is exactly filled,
// 1 (decodeErrCodeCorrupt) when any bounds check fails, and 3
// (decodeErrCodeUnsupportedCopy4Tag) when the low two bits of a tag byte
// are both set.
//
// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".
//
// Frame layout ($48-56):
//	- 0(SP), 8(SP), 16(SP)	the three arguments pushed for runtime·memmove
//	- 24(SP), 32(SP), 40(SP)	spill slots for DI, SI and CX across that CALL
//	- 0(FP)..47(FP)		the dst and src slice headers (only the base and
//				len words are read)
//	- 48(FP)		the int result
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
//	- AX	scratch
//	- BX	scratch
//	- CX	length or x
//	- DX	offset
//	- SI	&src[s]
//	- DI	&dst[d]
//	+ R8	dst_base
//	+ R9	dst_len
//	+ R10	dst_base + dst_len
//	+ R11	src_base
//	+ R12	src_len
//	+ R13	src_base + src_len
//	- R14	unused
//	- R15	used by doCopy
//
// The registers R8-R13 (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
TEXT ·decode(SB), NOSPLIT, $48-56
	// Initialize SI, DI and R8-R13.
	//
	// DI and SI start at the slice bases, i.e. d == 0 and s == 0.
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, DI
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, SI
	MOVQ R11, R13
	ADDQ R12, R13

loop:
	// for s < len(src)
	//
	// Every path that advances SI first checks that s <= len(src), so
	// comparing for equality with the end pointer is sufficient here.
	CMPQ SI, R13
	JEQ  end

	// CX = uint32(src[s])
	//
	// switch src[s] & 0x03
	//
	// BX holds the two low tag bits. Any non-zero value is a copy tag; only
	// BX == 0 falls through to the literal handling below.
	MOVBLZX (SI), CX
	MOVL    CX, BX
	ANDL    $3, BX
	CMPL    BX, $1
	JAE     tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	//
	// After the shift, x is in the range [0, 63]. Values below 60 encode
	// length-1 directly; 60..63 mean that 1..4 extra length bytes follow.
	SHRL $2, CX
	CMPL CX, $60
	JAE  tagLit60Plus

	// case x < 60:
	// s++
	INCQ SI

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that CX == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.
	//
	// SI already points at the first literal byte, past the tag and any
	// extra length bytes.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// CX can hold 64 bits, so the increment cannot overflow.
	INCQ CX

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// AX = len(dst) - d
	// BX = len(src) - s
	MOVQ R10, AX
	SUBQ DI, AX
	MOVQ R13, BX
	SUBQ SI, BX

	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ CX, AX
	JGT  errCorrupt
	CMPQ CX, BX
	JGT  errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
	// three registers to the stack, to save local variables across the CALL.
	//
	// 0..16(SP) are the outgoing arguments; 24..40(SP) are the spill copies
	// that are read back after the CALL.
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R8-R13.
	//
	// No register is assumed to survive the CALL: DI, SI and CX come back
	// from the spill slots and R8-R13 are rebuilt from the FP arguments.
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, R13
	ADDQ R12, R13

	// d += length
	// s += length
	ADDQ CX, DI
	ADDQ CX, SI
	JMP  loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	//
	// x - 58 is 2..5: one tag byte plus the 1..4 extra length bytes. The
	// bounds check is done before any of those length bytes is loaded, so the
	// negative-offset loads below stay inside src.
	ADDQ CX, SI
	SUBQ $58, SI
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   errCorrupt

	// case x == 60:
	//
	// One compare against 61 feeds both jumps: equal goes to tagLit61, above
	// (62 or 63) goes to tagLit62Plus, and 60 falls through.
	CMPL CX, $61
	JEQ  tagLit61
	JA   tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(SI), CX
	JMP     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	//
	// A single zero-extended 16-bit load yields the same little-endian value.
	MOVWLZX -2(SI), CX
	JMP     doLit

tagLit62Plus:
	// x is 62 or 63 here; above 62 means 63.
	CMPL CX, $62
	JA   tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	//
	// The 24-bit value is assembled from a 16-bit load (low two bytes) and a
	// byte load shifted into bits 16..23.
	MOVWLZX -3(SI), CX
	MOVBLZX -1(SI), BX
	SHLL    $16, BX
	ORL     BX, CX
	JMP     doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(SI), CX
	JMP  doLit

// The code above handles literal tags.
// ----------------------------------------
// The code below handles copy tags.

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, SI

	// if uint(s) > uint(len(src)) { etc }
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   errCorrupt

	// length = 1 + int(src[s-3])>>2
	//
	// CX still holds the whole tag byte src[s-3], so length is in [1, 64].
	SHRQ $2, CX
	INCQ CX

	// offset = int(src[s-2]) | int(src[s-1])<<8
	MOVWQZX -2(SI), DX
	JMP     doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- BX == src[s] & 0x03
	//	- CX == src[s]
	//
	// BX is 1, 2 or 3 here: 2 goes to tagCopy2, 3 is rejected via errUC4T,
	// and 1 falls through to the tagCopy1 case.
	CMPQ BX, $2
	JEQ  tagCopy2
	JA   errUC4T

	// case tagCopy1:
	// s += 2
	ADDQ $2, SI

	// if uint(s) > uint(len(src)) { etc }
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   errCorrupt

	// offset = int(src[s-2])&0xe0<<3 | int(src[s-1])
	//
	// The top three bits of the tag byte become bits 8..10 of the offset; the
	// following byte supplies bits 0..7.
	MOVQ    CX, DX
	ANDQ    $0xe0, DX
	SHLQ    $3, DX
	MOVBQZX -1(SI), BX
	ORQ     BX, DX

	// length = 4 + int(src[s-2])>>2&0x7
	//
	// length is therefore in [4, 11].
	SHRQ $2, CX
	ANDQ $7, CX
	ADDQ $4, CX

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- CX == length && CX > 0
	//	- DX == offset
	//
	// Nothing below issues a CALL, so R8-R13 remain valid without reloading.

	// if offset <= 0 { etc }
	CMPQ DX, $0
	JLE  errCorrupt

	// if d < offset { etc }
	MOVQ DI, BX
	SUBQ R8, BX
	CMPQ BX, DX
	JLT  errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R10, BX
	SUBQ DI, BX
	CMPQ CX, BX
	JGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R15 = &dst[d-offset]
	MOVQ DI, R15
	SUBQ DX, R15

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//	dst[d] = dst[d - offset]
	//	d++
	//	length--
	//	if length == 0 {
	//		break
	//	}
	// }
	//
	// Copying one byte at a time in increasing address order means that, when
	// offset < length, later iterations read bytes written by earlier ones.
	// DECQ sets the zero flag that JNZ tests.
	MOVB (R15), BX
	MOVB BX, (DI)
	INCQ R15
	INCQ DI
	DECQ CX
	JNZ  verySlowForwardCopy
	JMP  loop

// The code above handles copy tags.
// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ DI, R10
	JNE  errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET

errUC4T:
	// return decodeErrCodeUnsupportedCopy4Tag
	MOVQ $3, ret+48(FP)
	RET