Просмотр исходного кода

x/crypto/poly1305: optimize amd64 assembly performance

Improve performance on amd64 through faster assembly.

name 		old time/op 	new time/op 	delta
64-8 		101ns ± 4% 	42ns ± 3% 	-58.31% (p=0.002 n=6+6)
1K-8 		887ns ± 1% 	456ns ± 1% 	-48.53% (p=0.002 n=6+6)
64Unaligned-8 	98.1ns ± 1% 	41.1ns ± 1% 	-58.06% (p=0.002 n=6+6)
1KUnaligned-8 	885ns ± 2% 	460ns ± 3% 	-48.04% (p=0.002 n=6+6)

name 		old speed 	new speed 	delta
64-8 		635MB/s ± 4% 	1525MB/s ± 3% 	+140.15% (p=0.002 n=6+6)
1K-8 		1.15GB/s ± 1% 	2.24GB/s ± 1% 	+94.22%  (p=0.002 n=6+6)
64Unaligned-8 	653MB/s ± 1% 	1557MB/s ± 1% 	+138.58% (p=0.002 n=6+6)
1KUnaligned-8  	1.16GB/s ± 2%  	2.23GB/s ± 3%	+92.46%  (p=0.002 n=6+6)

Change-Id: Ia3be8e7ff012f8a9b451d728a646f29f809ba665
Reviewed-on: https://go-review.googlesource.com/29993
Reviewed-by: Adam Langley <agl@golang.org>
Andreas Auernhammer 9 лет назад
Родитель
Сommit
568507f56e
2 измененных файлов с 122 добавлено и 532 удалено
  1. 0 45
      poly1305/const_amd64.s
  2. 122 487
      poly1305/poly1305_amd64.s

+ 0 - 45
poly1305/const_amd64.s

@@ -1,45 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This code was translated into a form compatible with 6a from the public
-// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
-
-// +build amd64,!gccgo,!appengine
-
-DATA ·SCALE(SB)/8, $0x37F4000000000000
-GLOBL ·SCALE(SB), 8, $8
-DATA ·TWO32(SB)/8, $0x41F0000000000000
-GLOBL ·TWO32(SB), 8, $8
-DATA ·TWO64(SB)/8, $0x43F0000000000000
-GLOBL ·TWO64(SB), 8, $8
-DATA ·TWO96(SB)/8, $0x45F0000000000000
-GLOBL ·TWO96(SB), 8, $8
-DATA ·ALPHA32(SB)/8, $0x45E8000000000000
-GLOBL ·ALPHA32(SB), 8, $8
-DATA ·ALPHA64(SB)/8, $0x47E8000000000000
-GLOBL ·ALPHA64(SB), 8, $8
-DATA ·ALPHA96(SB)/8, $0x49E8000000000000
-GLOBL ·ALPHA96(SB), 8, $8
-DATA ·ALPHA130(SB)/8, $0x4C08000000000000
-GLOBL ·ALPHA130(SB), 8, $8
-DATA ·DOFFSET0(SB)/8, $0x4330000000000000
-GLOBL ·DOFFSET0(SB), 8, $8
-DATA ·DOFFSET1(SB)/8, $0x4530000000000000
-GLOBL ·DOFFSET1(SB), 8, $8
-DATA ·DOFFSET2(SB)/8, $0x4730000000000000
-GLOBL ·DOFFSET2(SB), 8, $8
-DATA ·DOFFSET3(SB)/8, $0x4930000000000000
-GLOBL ·DOFFSET3(SB), 8, $8
-DATA ·DOFFSET3MINUSTWO128(SB)/8, $0x492FFFFE00000000
-GLOBL ·DOFFSET3MINUSTWO128(SB), 8, $8
-DATA ·HOFFSET0(SB)/8, $0x43300001FFFFFFFB
-GLOBL ·HOFFSET0(SB), 8, $8
-DATA ·HOFFSET1(SB)/8, $0x45300001FFFFFFFE
-GLOBL ·HOFFSET1(SB), 8, $8
-DATA ·HOFFSET2(SB)/8, $0x47300001FFFFFFFE
-GLOBL ·HOFFSET2(SB), 8, $8
-DATA ·HOFFSET3(SB)/8, $0x49300003FFFFFFFE
-GLOBL ·HOFFSET3(SB), 8, $8
-DATA ·ROUNDING(SB)/2, $0x137f
-GLOBL ·ROUNDING(SB), 8, $2

+ 122 - 487
poly1305/poly1305_amd64.s

@@ -2,496 +2,131 @@
 // Use of this source code is governed by a BSD-style
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // license that can be found in the LICENSE file.
 
 
-// This code was translated into a form compatible with 6a from the public
-// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
-
 // +build amd64,!gccgo,!appengine
 // +build amd64,!gccgo,!appengine
 
 
+#include "textflag.h"
+
+#define POLY1305_ADD(msg, h0, h1, h2) \
+	ADDQ 0(msg), h0;  \
+	ADCQ 8(msg), h1;  \
+	ADCQ $1, h2;      \
+	LEAQ 16(msg), msg
+
+#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
+	MOVQ  r0, AX;                  \
+	MULQ  h0;                      \
+	MOVQ  AX, t0;                  \
+	MOVQ  DX, t1;                  \
+	MOVQ  r0, AX;                  \
+	MULQ  h1;                      \
+	ADDQ  AX, t1;                  \
+	ADCQ  $0, DX;                  \
+	MOVQ  r0, t2;                  \
+	IMULQ h2, t2;                  \
+	ADDQ  DX, t2;                  \
+	                               \
+	MOVQ  r1, AX;                  \
+	MULQ  h0;                      \
+	ADDQ  AX, t1;                  \
+	ADCQ  $0, DX;                  \
+	MOVQ  DX, h0;                  \
+	MOVQ  r1, t3;                  \
+	IMULQ h2, t3;                  \
+	MOVQ  r1, AX;                  \
+	MULQ  h1;                      \
+	ADDQ  AX, t2;                  \
+	ADCQ  DX, t3;                  \
+	ADDQ  h0, t2;                  \
+	ADCQ  $0, t3;                  \
+	                               \
+	MOVQ  t0, h0;                  \
+	MOVQ  t1, h1;                  \
+	MOVQ  t2, h2;                  \
+	ANDQ  $3, h2;                  \
+	MOVQ  t2, t0;                  \
+	ANDQ  $0xFFFFFFFFFFFFFFFC, t0; \
+	ADDQ  t0, h0;                  \
+	ADCQ  t3, h1;                  \
+	ADCQ  $0, h2;                  \
+	SHRQ  $2, t3, t2;              \
+	SHRQ  $2, t3;                  \
+	ADDQ  t2, h0;                  \
+	ADCQ  t3, h1;                  \
+	ADCQ  $0, h2
+
+DATA poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
+DATA poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
+GLOBL poly1305Mask<>(SB), RODATA, $16
+
 // func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]key)
 // func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]key)
-TEXT ·poly1305(SB),0,$224-32
-	MOVQ out+0(FP),DI
-	MOVQ m+8(FP),SI
-	MOVQ mlen+16(FP),DX
-	MOVQ key+24(FP),CX
+TEXT ·poly1305(SB), $0-32
+	MOVQ out+0(FP), DI
+	MOVQ m+8(FP), SI
+	MOVQ mlen+16(FP), R15
+	MOVQ key+24(FP), AX
+
+	MOVQ SP, BP
+	ANDQ $0xFFFFFFFFFFFFFFF0, SP
+	SUBQ $32, SP
+
+	MOVOU 0(AX), X0
+	MOVOU 16(AX), X1
+	MOVOU poly1305Mask<>(SB), X2
+	PAND  X2, X0
+	MOVO  X0, 0(SP)
+	MOVO  X1, 16(SP)
+
+	XORQ R8, R8     // h0
+	XORQ R9, R9     // h1
+	XORQ R10, R10   // h2
+	MOVQ 0(SP), R11 // r0
+	MOVQ 8(SP), R12 // r1
+
+	CMPQ R15, $16
+	JB   bytes_between_0_and_15
+
+loop:
+	POLY1305_ADD(SI, R8, R9, R10)
+multiply:
+	POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
+	SUBQ $16, R15
+	CMPQ R15, $16
+	JAE  loop
+
+bytes_between_0_and_15:
+	TESTQ R15, R15
+	JZ    done
+	MOVQ  $1, BX
+	XORQ  CX, CX
+	XORQ  R13, R13
+	ADDQ  R15, SI
+
+flush_buffer:
+	SHLQ $8, BX, CX
+	SHLQ $8, BX
+	MOVB -1(SI), R13
+	XORQ R13, BX
+	DECQ SI
+	DECQ R15
+	JNZ  flush_buffer
+
+	ADDQ BX, R8
+	ADCQ CX, R9
+	ADCQ $0, R10
+	MOVQ $16, R15
+	JMP  multiply
 
 
-	MOVQ SP,R11
-	MOVQ $31,R9
-	NOTQ R9
-	ANDQ R9,SP
-	ADDQ $32,SP
+done:
+	MOVQ    R8, AX
+	MOVQ    R9, BX
+	SUBQ    $0xFFFFFFFFFFFFFFFB, AX
+	SBBQ    $0xFFFFFFFFFFFFFFFF, BX
+	CMOVQCS R8, AX
+	CMOVQCS R9, BX
+	ADDQ    16(SP), AX
+	ADCQ    24(SP), BX
 
 
-	MOVQ R11,32(SP)
-	MOVQ R12,40(SP)
-	MOVQ R13,48(SP)
-	MOVQ R14,56(SP)
-	MOVQ R15,64(SP)
-	MOVQ BX,72(SP)
-	MOVQ BP,80(SP)
-	FLDCW ·ROUNDING(SB)
-	MOVL 0(CX),R8
-	MOVL 4(CX),R9
-	MOVL 8(CX),AX
-	MOVL 12(CX),R10
-	MOVQ DI,88(SP)
-	MOVQ CX,96(SP)
-	MOVL $0X43300000,108(SP)
-	MOVL $0X45300000,116(SP)
-	MOVL $0X47300000,124(SP)
-	MOVL $0X49300000,132(SP)
-	ANDL $0X0FFFFFFF,R8
-	ANDL $0X0FFFFFFC,R9
-	ANDL $0X0FFFFFFC,AX
-	ANDL $0X0FFFFFFC,R10
-	MOVL R8,104(SP)
-	MOVL R9,112(SP)
-	MOVL AX,120(SP)
-	MOVL R10,128(SP)
-	FMOVD 104(SP), F0
-	FSUBD ·DOFFSET0(SB), F0
-	FMOVD 112(SP), F0
-	FSUBD ·DOFFSET1(SB), F0
-	FMOVD 120(SP), F0
-	FSUBD ·DOFFSET2(SB), F0
-	FMOVD 128(SP), F0
-	FSUBD ·DOFFSET3(SB), F0
-	FXCHD F0, F3
-	FMOVDP F0, 136(SP)
-	FXCHD F0, F1
-	FMOVD F0, 144(SP)
-	FMULD ·SCALE(SB), F0
-	FMOVDP F0, 152(SP)
-	FMOVD F0, 160(SP)
-	FMULD ·SCALE(SB), F0
-	FMOVDP F0, 168(SP)
-	FMOVD F0, 176(SP)
-	FMULD ·SCALE(SB), F0
-	FMOVDP F0, 184(SP)
-	FLDZ
-	FLDZ
-	FLDZ
-	FLDZ
-	CMPQ DX,$16
-	JB ADDATMOST15BYTES
-	INITIALATLEAST16BYTES:
-	MOVL 12(SI),DI
-	MOVL 8(SI),CX
-	MOVL 4(SI),R8
-	MOVL 0(SI),R9
-	MOVL DI,128(SP)
-	MOVL CX,120(SP)
-	MOVL R8,112(SP)
-	MOVL R9,104(SP)
-	ADDQ $16,SI
-	SUBQ $16,DX
-	FXCHD F0, F3
-	FADDD 128(SP), F0
-	FSUBD ·DOFFSET3MINUSTWO128(SB), F0
-	FXCHD F0, F1
-	FADDD 112(SP), F0
-	FSUBD ·DOFFSET1(SB), F0
-	FXCHD F0, F2
-	FADDD 120(SP), F0
-	FSUBD ·DOFFSET2(SB), F0
-	FXCHD F0, F3
-	FADDD 104(SP), F0
-	FSUBD ·DOFFSET0(SB), F0
-	CMPQ DX,$16
-	JB MULTIPLYADDATMOST15BYTES
-	MULTIPLYADDATLEAST16BYTES:
-	MOVL 12(SI),DI
-	MOVL 8(SI),CX
-	MOVL 4(SI),R8
-	MOVL 0(SI),R9
-	MOVL DI,128(SP)
-	MOVL CX,120(SP)
-	MOVL R8,112(SP)
-	MOVL R9,104(SP)
-	ADDQ $16,SI
-	SUBQ $16,DX
-	FMOVD ·ALPHA130(SB), F0
-	FADDD F2,F0
-	FSUBD ·ALPHA130(SB), F0
-	FSUBD F0,F2
-	FMULD ·SCALE(SB), F0
-	FMOVD ·ALPHA32(SB), F0
-	FADDD F2,F0
-	FSUBD ·ALPHA32(SB), F0
-	FSUBD F0,F2
-	FXCHD F0, F2
-	FADDDP F0,F1
-	FMOVD ·ALPHA64(SB), F0
-	FADDD F4,F0
-	FSUBD ·ALPHA64(SB), F0
-	FSUBD F0,F4
-	FMOVD ·ALPHA96(SB), F0
-	FADDD F6,F0
-	FSUBD ·ALPHA96(SB), F0
-	FSUBD F0,F6
-	FXCHD F0, F6
-	FADDDP F0,F1
-	FXCHD F0, F3
-	FADDDP F0,F5
-	FXCHD F0, F3
-	FADDDP F0,F1
-	FMOVD 176(SP), F0
-	FMULD F3,F0
-	FMOVD 160(SP), F0
-	FMULD F4,F0
-	FMOVD 144(SP), F0
-	FMULD F5,F0
-	FMOVD 136(SP), F0
-	FMULDP F0,F6
-	FMOVD 160(SP), F0
-	FMULD F4,F0
-	FADDDP F0,F3
-	FMOVD 144(SP), F0
-	FMULD F4,F0
-	FADDDP F0,F2
-	FMOVD 136(SP), F0
-	FMULD F4,F0
-	FADDDP F0,F1
-	FMOVD 184(SP), F0
-	FMULDP F0,F4
-	FXCHD F0, F3
-	FADDDP F0,F5
-	FMOVD 144(SP), F0
-	FMULD F4,F0
-	FADDDP F0,F2
-	FMOVD 136(SP), F0
-	FMULD F4,F0
-	FADDDP F0,F1
-	FMOVD 184(SP), F0
-	FMULD F4,F0
-	FADDDP F0,F3
-	FMOVD 168(SP), F0
-	FMULDP F0,F4
-	FXCHD F0, F3
-	FADDDP F0,F4
-	FMOVD 136(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F1
-	FXCHD F0, F3
-	FMOVD 184(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F3
-	FXCHD F0, F1
-	FMOVD 168(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F1
-	FMOVD 152(SP), F0
-	FMULDP F0,F5
-	FXCHD F0, F4
-	FADDDP F0,F1
-	CMPQ DX,$16
-	FXCHD F0, F2
-	FMOVD 128(SP), F0
-	FSUBD ·DOFFSET3MINUSTWO128(SB), F0
-	FADDDP F0,F1
-	FXCHD F0, F1
-	FMOVD 120(SP), F0
-	FSUBD ·DOFFSET2(SB), F0
-	FADDDP F0,F1
-	FXCHD F0, F3
-	FMOVD 112(SP), F0
-	FSUBD ·DOFFSET1(SB), F0
-	FADDDP F0,F1
-	FXCHD F0, F2
-	FMOVD 104(SP), F0
-	FSUBD ·DOFFSET0(SB), F0
-	FADDDP F0,F1
-	JAE MULTIPLYADDATLEAST16BYTES
-	MULTIPLYADDATMOST15BYTES:
-	FMOVD ·ALPHA130(SB), F0
-	FADDD F2,F0
-	FSUBD ·ALPHA130(SB), F0
-	FSUBD F0,F2
-	FMULD ·SCALE(SB), F0
-	FMOVD ·ALPHA32(SB), F0
-	FADDD F2,F0
-	FSUBD ·ALPHA32(SB), F0
-	FSUBD F0,F2
-	FMOVD ·ALPHA64(SB), F0
-	FADDD F5,F0
-	FSUBD ·ALPHA64(SB), F0
-	FSUBD F0,F5
-	FMOVD ·ALPHA96(SB), F0
-	FADDD F7,F0
-	FSUBD ·ALPHA96(SB), F0
-	FSUBD F0,F7
-	FXCHD F0, F7
-	FADDDP F0,F1
-	FXCHD F0, F5
-	FADDDP F0,F1
-	FXCHD F0, F3
-	FADDDP F0,F5
-	FADDDP F0,F1
-	FMOVD 176(SP), F0
-	FMULD F1,F0
-	FMOVD 160(SP), F0
-	FMULD F2,F0
-	FMOVD 144(SP), F0
-	FMULD F3,F0
-	FMOVD 136(SP), F0
-	FMULDP F0,F4
-	FMOVD 160(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F3
-	FMOVD 144(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F2
-	FMOVD 136(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F1
-	FMOVD 184(SP), F0
-	FMULDP F0,F5
-	FXCHD F0, F4
-	FADDDP F0,F3
-	FMOVD 144(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F2
-	FMOVD 136(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F1
-	FMOVD 184(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F4
-	FMOVD 168(SP), F0
-	FMULDP F0,F5
-	FXCHD F0, F4
-	FADDDP F0,F2
-	FMOVD 136(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F1
-	FMOVD 184(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F4
-	FMOVD 168(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F3
-	FMOVD 152(SP), F0
-	FMULDP F0,F5
-	FXCHD F0, F4
-	FADDDP F0,F1
-	ADDATMOST15BYTES:
-	CMPQ DX,$0
-	JE NOMOREBYTES
-	MOVL $0,0(SP)
-	MOVL $0, 4 (SP)
-	MOVL $0, 8 (SP)
-	MOVL $0, 12 (SP)
-	LEAQ 0(SP),DI
-	MOVQ DX,CX
-	REP; MOVSB
-	MOVB $1,0(DI)
-	MOVL  12 (SP),DI
-	MOVL  8 (SP),SI
-	MOVL  4 (SP),DX
-	MOVL 0(SP),CX
-	MOVL DI,128(SP)
-	MOVL SI,120(SP)
-	MOVL DX,112(SP)
-	MOVL CX,104(SP)
-	FXCHD F0, F3
-	FADDD 128(SP), F0
-	FSUBD ·DOFFSET3(SB), F0
-	FXCHD F0, F2
-	FADDD 120(SP), F0
-	FSUBD ·DOFFSET2(SB), F0
-	FXCHD F0, F1
-	FADDD 112(SP), F0
-	FSUBD ·DOFFSET1(SB), F0
-	FXCHD F0, F3
-	FADDD 104(SP), F0
-	FSUBD ·DOFFSET0(SB), F0
-	FMOVD ·ALPHA130(SB), F0
-	FADDD F3,F0
-	FSUBD ·ALPHA130(SB), F0
-	FSUBD F0,F3
-	FMULD ·SCALE(SB), F0
-	FMOVD ·ALPHA32(SB), F0
-	FADDD F2,F0
-	FSUBD ·ALPHA32(SB), F0
-	FSUBD F0,F2
-	FMOVD ·ALPHA64(SB), F0
-	FADDD F6,F0
-	FSUBD ·ALPHA64(SB), F0
-	FSUBD F0,F6
-	FMOVD ·ALPHA96(SB), F0
-	FADDD F5,F0
-	FSUBD ·ALPHA96(SB), F0
-	FSUBD F0,F5
-	FXCHD F0, F4
-	FADDDP F0,F3
-	FXCHD F0, F6
-	FADDDP F0,F1
-	FXCHD F0, F3
-	FADDDP F0,F5
-	FXCHD F0, F3
-	FADDDP F0,F1
-	FMOVD 176(SP), F0
-	FMULD F3,F0
-	FMOVD 160(SP), F0
-	FMULD F4,F0
-	FMOVD 144(SP), F0
-	FMULD F5,F0
-	FMOVD 136(SP), F0
-	FMULDP F0,F6
-	FMOVD 160(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F3
-	FMOVD 144(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F2
-	FMOVD 136(SP), F0
-	FMULD F5,F0
-	FADDDP F0,F1
-	FMOVD 184(SP), F0
-	FMULDP F0,F5
-	FXCHD F0, F4
-	FADDDP F0,F5
-	FMOVD 144(SP), F0
-	FMULD F6,F0
-	FADDDP F0,F2
-	FMOVD 136(SP), F0
-	FMULD F6,F0
-	FADDDP F0,F1
-	FMOVD 184(SP), F0
-	FMULD F6,F0
-	FADDDP F0,F4
-	FMOVD 168(SP), F0
-	FMULDP F0,F6
-	FXCHD F0, F5
-	FADDDP F0,F4
-	FMOVD 136(SP), F0
-	FMULD F2,F0
-	FADDDP F0,F1
-	FMOVD 184(SP), F0
-	FMULD F2,F0
-	FADDDP F0,F5
-	FMOVD 168(SP), F0
-	FMULD F2,F0
-	FADDDP F0,F3
-	FMOVD 152(SP), F0
-	FMULDP F0,F2
-	FXCHD F0, F1
-	FADDDP F0,F3
-	FXCHD F0, F3
-	FXCHD F0, F2
-	NOMOREBYTES:
-	MOVL $0,R10
-	FMOVD ·ALPHA130(SB), F0
-	FADDD F4,F0
-	FSUBD ·ALPHA130(SB), F0
-	FSUBD F0,F4
-	FMULD ·SCALE(SB), F0
-	FMOVD ·ALPHA32(SB), F0
-	FADDD F2,F0
-	FSUBD ·ALPHA32(SB), F0
-	FSUBD F0,F2
-	FMOVD ·ALPHA64(SB), F0
-	FADDD F4,F0
-	FSUBD ·ALPHA64(SB), F0
-	FSUBD F0,F4
-	FMOVD ·ALPHA96(SB), F0
-	FADDD F6,F0
-	FSUBD ·ALPHA96(SB), F0
-	FXCHD F0, F6
-	FSUBD F6,F0
-	FXCHD F0, F4
-	FADDDP F0,F3
-	FXCHD F0, F4
-	FADDDP F0,F1
-	FXCHD F0, F2
-	FADDDP F0,F3
-	FXCHD F0, F4
-	FADDDP F0,F3
-	FXCHD F0, F3
-	FADDD ·HOFFSET0(SB), F0
-	FXCHD F0, F3
-	FADDD ·HOFFSET1(SB), F0
-	FXCHD F0, F1
-	FADDD ·HOFFSET2(SB), F0
-	FXCHD F0, F2
-	FADDD ·HOFFSET3(SB), F0
-	FXCHD F0, F3
-	FMOVDP F0, 104(SP)
-	FMOVDP F0, 112(SP)
-	FMOVDP F0, 120(SP)
-	FMOVDP F0, 128(SP)
-	MOVL 108(SP),DI
-	ANDL $63,DI
-	MOVL 116(SP),SI
-	ANDL $63,SI
-	MOVL 124(SP),DX
-	ANDL $63,DX
-	MOVL 132(SP),CX
-	ANDL $63,CX
-	MOVL 112(SP),R8
-	ADDL DI,R8
-	MOVQ R8,112(SP)
-	MOVL 120(SP),DI
-	ADCL SI,DI
-	MOVQ DI,120(SP)
-	MOVL 128(SP),DI
-	ADCL DX,DI
-	MOVQ DI,128(SP)
-	MOVL R10,DI
-	ADCL CX,DI
-	MOVQ DI,136(SP)
-	MOVQ $5,DI
-	MOVL 104(SP),SI
-	ADDL SI,DI
-	MOVQ DI,104(SP)
-	MOVL R10,DI
-	MOVQ 112(SP),DX
-	ADCL DX,DI
-	MOVQ DI,112(SP)
-	MOVL R10,DI
-	MOVQ 120(SP),CX
-	ADCL CX,DI
-	MOVQ DI,120(SP)
-	MOVL R10,DI
-	MOVQ 128(SP),R8
-	ADCL R8,DI
-	MOVQ DI,128(SP)
-	MOVQ $0XFFFFFFFC,DI
-	MOVQ 136(SP),R9
-	ADCL R9,DI
-	SARL $16,DI
-	MOVQ DI,R9
-	XORL $0XFFFFFFFF,R9
-	ANDQ DI,SI
-	MOVQ 104(SP),AX
-	ANDQ R9,AX
-	ORQ AX,SI
-	ANDQ DI,DX
-	MOVQ 112(SP),AX
-	ANDQ R9,AX
-	ORQ AX,DX
-	ANDQ DI,CX
-	MOVQ 120(SP),AX
-	ANDQ R9,AX
-	ORQ AX,CX
-	ANDQ DI,R8
-	MOVQ 128(SP),DI
-	ANDQ R9,DI
-	ORQ DI,R8
-	MOVQ 88(SP),DI
-	MOVQ 96(SP),R9
-	ADDL 16(R9),SI
-	ADCL 20(R9),DX
-	ADCL 24(R9),CX
-	ADCL 28(R9),R8
-	MOVL SI,0(DI)
-	MOVL DX,4(DI)
-	MOVL CX,8(DI)
-	MOVL R8,12(DI)
-	MOVQ 32(SP),R11
-	MOVQ 40(SP),R12
-	MOVQ 48(SP),R13
-	MOVQ 56(SP),R14
-	MOVQ 64(SP),R15
-	MOVQ 72(SP),BX
-	MOVQ 80(SP),BP
-	MOVQ R11,SP
+	MOVQ BP, SP
+	MOVQ AX, 0(DI)
+	MOVQ BX, 8(DI)
 	RET
 	RET