Bläddra i källkod

go.crypto/curve25519: add package.

This consists of ~2000 lines of amd64 assembly and a, much slower,
generic Go version in curve25519.go. The assembly has been ported from
djb's public domain sources and the only semantic alterations are to
deal with Go's split stacks.

R=rsc
CC=golang-dev
https://golang.org/cl/5786045
Adam Langley 13 år sedan
förälder
incheckning
124e52db8d

+ 18 - 0
curve25519/const_amd64.s

@@ -0,0 +1,18 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This code was translated into a form compatible with 6a from the public
+// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
+
+DATA ·REDMASK51(SB)/8, $0x0007FFFFFFFFFFFF // 2^51 - 1: mask keeping the low 51 bits of a limb
+GLOBL ·REDMASK51(SB), $8
+
+DATA ·_121666_213(SB)/8, $996687872 // 121666 * 2^13
+GLOBL ·_121666_213(SB), $8
+
+DATA ·_2P0(SB)/8, $0xFFFFFFFFFFFDA // 2 * (2^51 - 19): lowest limb of 2p in 51-bit limbs
+GLOBL ·_2P0(SB), $8
+
+DATA ·_2P1234(SB)/8, $0xFFFFFFFFFFFFE // 2 * (2^51 - 1): each of the four upper limbs of 2p
+GLOBL ·_2P1234(SB), $8

+ 86 - 0
curve25519/cswap_amd64.s

@@ -0,0 +1,86 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This code was translated into a form compatible with 6a from the public
+// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
+
+// func cswap(inout *[5]uint64, v uint64) -- if v == 1, exchanges the ten 64-bit words at byte offsets 0..72 of inout with the ten at offsets 80..152; for any other v every word is written back unchanged. No branches are used.
+TEXT ·cswap(SB),7,$0
+	MOVQ inout+0(FP),DI
+	MOVQ v+8(FP),SI
+
+	CMPQ SI,$1 // sets the EQ condition iff v == 1; the MOVQs below do not touch the flags, so every CMOVQEQ sees this result
+	MOVQ 0(DI),SI // words 0,1 and their partners 10,11
+	MOVQ 80(DI),DX
+	MOVQ 8(DI),CX
+	MOVQ 88(DI),R8
+	MOVQ SI,R9 // R9 is the temporary for the conditional exchange
+	CMOVQEQ DX,SI
+	CMOVQEQ R9,DX
+	MOVQ CX,R9
+	CMOVQEQ R8,CX
+	CMOVQEQ R9,R8
+	MOVQ SI,0(DI) // stores happen unconditionally, swapped or not
+	MOVQ DX,80(DI)
+	MOVQ CX,8(DI)
+	MOVQ R8,88(DI)
+	MOVQ 16(DI),SI // words 2,3 and their partners 12,13
+	MOVQ 96(DI),DX
+	MOVQ 24(DI),CX
+	MOVQ 104(DI),R8
+	MOVQ SI,R9
+	CMOVQEQ DX,SI
+	CMOVQEQ R9,DX
+	MOVQ CX,R9
+	CMOVQEQ R8,CX
+	CMOVQEQ R9,R8
+	MOVQ SI,16(DI)
+	MOVQ DX,96(DI)
+	MOVQ CX,24(DI)
+	MOVQ R8,104(DI)
+	MOVQ 32(DI),SI // words 4,5 and their partners 14,15
+	MOVQ 112(DI),DX
+	MOVQ 40(DI),CX
+	MOVQ 120(DI),R8
+	MOVQ SI,R9
+	CMOVQEQ DX,SI
+	CMOVQEQ R9,DX
+	MOVQ CX,R9
+	CMOVQEQ R8,CX
+	CMOVQEQ R9,R8
+	MOVQ SI,32(DI)
+	MOVQ DX,112(DI)
+	MOVQ CX,40(DI)
+	MOVQ R8,120(DI)
+	MOVQ 48(DI),SI // words 6,7 and their partners 16,17
+	MOVQ 128(DI),DX
+	MOVQ 56(DI),CX
+	MOVQ 136(DI),R8
+	MOVQ SI,R9
+	CMOVQEQ DX,SI
+	CMOVQEQ R9,DX
+	MOVQ CX,R9
+	CMOVQEQ R8,CX
+	CMOVQEQ R9,R8
+	MOVQ SI,48(DI)
+	MOVQ DX,128(DI)
+	MOVQ CX,56(DI)
+	MOVQ R8,136(DI)
+	MOVQ 64(DI),SI // words 8,9 and their partners 18,19
+	MOVQ 144(DI),DX
+	MOVQ 72(DI),CX
+	MOVQ 152(DI),R8
+	MOVQ SI,R9
+	CMOVQEQ DX,SI
+	CMOVQEQ R9,DX
+	MOVQ CX,R9
+	CMOVQEQ R8,CX
+	CMOVQEQ R9,R8
+	MOVQ SI,64(DI)
+	MOVQ DX,144(DI)
+	MOVQ CX,72(DI)
+	MOVQ R8,152(DI)
+	MOVQ DI,AX // NOTE(review): AX/DX look like return-register setup carried over from the translated source; the Go declaration returns nothing -- confirm
+	MOVQ SI,DX
+	RET

+ 136 - 0
curve25519/curve25519.go

@@ -0,0 +1,136 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// We have an implementation in amd64 assembly so this code is only run on
+// non-amd64 platforms.
+// +build !amd64
+
+package curve25519
+
+import (
+	"math/big"
+)
+
+var (
+	// p is the prime order of the underlying field: 2^255-19
+	p *big.Int
+
+	// pMinus2 is p-2. It is the exponent scalarMult uses to invert the
+	// final z coordinate modulo p.
+	pMinus2 *big.Int
+
+	// a is a parameter of the elliptic curve: 486662
+	a *big.Int
+)
+
+// init fills in the package-level constants. The values are derived
+// arithmetically, so there is no parse result to check.
+func init() {
+	p = new(big.Int).Lsh(big.NewInt(1), 255)
+	p.Sub(p, big.NewInt(19))
+	pMinus2 = new(big.Int).Sub(p, big.NewInt(2))
+	a = big.NewInt(486662)
+}
+
+// context carries the state for one scalar multiplication: scratch values
+// that are reused to save on allocation, plus the x coordinate of the point
+// being multiplied. The field order is relied upon by positional literals.
+type context struct {
+	// tmp1 through tmp4 are scratch variables.
+	tmp1 *big.Int
+	tmp2 *big.Int
+	tmp3 *big.Int
+	tmp4 *big.Int
+	// x1 is the x coordinate of the input point.
+	x1 *big.Int
+}
+
+// add sets (outx, outz) to the sum of the points (xn, zn) and (xm, zm) in the
+// elliptic curve group, using the differential-addition formula from
+// http://hyperelliptic.org/EFD/g1p/auto-montgom-xz.html#diffadd-dadd-1987-m
+// The x1 term of that formula is taken from c.x1 and its z1 term is taken to
+// be 1. Both results are reduced modulo p.
+// outx and outz should not alias any of the other inputs. c.tmp1, c.tmp2 and
+// c.tmp3 are overwritten.
+func (c *context) add(outx, outz, xn, zn, xm, zm *big.Int) {
+	// x₃ = 4(x·x′ - z·z′)² · z1
+	// (z1 == 1 here, so that factor is omitted)
+	c.tmp1.Mul(xn, xm)
+	c.tmp2.Mul(zn, zm)
+	c.tmp3.Sub(c.tmp1, c.tmp2)
+	outx.Mul(c.tmp3, c.tmp3)
+	outx.Lsh(outx, 2)
+	outx.Mod(outx, p)
+
+	// z₃ = 4(x·z′ - z·x′)² · x1
+	// (x1 is c.x1; unlike z1 it is not 1 in general, so it is multiplied in)
+	c.tmp1.Mul(xm, zn)
+	c.tmp2.Mul(zm, xn)
+	c.tmp3.Sub(c.tmp1, c.tmp2)
+	outz.Mul(c.tmp3, c.tmp3)
+	outz.Mul(outz, c.x1)
+	outz.Lsh(outz, 2)
+	outz.Mod(outz, p)
+}
+
+// double sets (outx, outz) to 2*(x,z) in the elliptic curve group, with both
+// results reduced modulo p. The formula is from
+// http://hyperelliptic.org/EFD/g1p/auto-montgom-xz.html#doubling-dbl-1987-m
+// outx and outz should not alias any of the other inputs. c.tmp1, c.tmp2 and
+// c.tmp3 are overwritten.
+func (c *context) double(outx, outz, x, z *big.Int) {
+	// Local aliases for the scratch values; no new big.Ints are allocated.
+	t1, t2, t3 := c.tmp1, c.tmp2, c.tmp3
+
+	// x₂ = (x² - z²)²
+	t1.Mul(x, x)
+	t2.Mul(z, z)
+	t3.Sub(t1, t2)
+	outx.Mul(t3, t3).Mod(outx, p)
+
+	// z₂ = 4xz·(x² + Axz + z²)
+	// t3 takes x² + z² before t1 and t2 are reused for xz and A·xz.
+	t3.Add(t1, t2)
+	t1.Mul(x, z)
+	t2.Mul(t1, a)
+	outz.Add(t3, t2)
+	// t2 is reused once more, now for 4xz.
+	t2.Lsh(t1, 2)
+	outz.Mul(outz, t2).Mod(outz, p)
+}
+
+// scalarMult sets out to the x coordinate of in*base. in, base and out are
+// little-endian; in is clamped on a private copy before use.
+// NOTE(review): the ladder below branches on scalar bits and relies on
+// math/big, so it does not look constant-time -- confirm whether that matters
+// for callers.
+func scalarMult(out, in, base *[32]byte) {
+	// Clamp a copy of the scalar so the caller's array is left untouched.
+	scalar := *in
+	scalar[31] &= 127
+	scalar[31] |= 64
+	scalar[0] &= 248
+
+	// big.Int.SetBytes expects big-endian input.
+	var baseBE [32]byte
+	for i, v := range base {
+		baseBE[31-i] = v
+	}
+
+	c := &context{
+		tmp1: new(big.Int),
+		tmp2: new(big.Int),
+		tmp3: new(big.Int),
+		tmp4: new(big.Int),
+		x1:   new(big.Int).SetBytes(baseBE[:]),
+	}
+
+	// (x1, z1) starts as (1, 0) and (x2, z2) as (base, 1). (sx, sz) is a
+	// spare pair that is rotated in as the output of each add and double.
+	x1, z1 := new(big.Int).SetInt64(1), new(big.Int)
+	x2, z2 := new(big.Int).Set(c.x1), new(big.Int).SetInt64(1)
+	sx, sz := new(big.Int), new(big.Int)
+
+	// Process the scalar from its most significant bit downwards.
+	for i := 31; i >= 0; i-- {
+		for shift := 7; shift >= 0; shift-- {
+			c.add(sx, sz, x1, z1, x2, z2)
+			if (scalar[i]>>uint(shift))&1 != 0 {
+				// Pair 1 takes the sum, pair 2 is doubled.
+				x1, z1, sx, sz = sx, sz, x1, z1
+				c.double(sx, sz, x2, z2)
+				x2, z2, sx, sz = sx, sz, x2, z2
+			} else {
+				// Pair 2 takes the sum, pair 1 is doubled.
+				x2, z2, sx, sz = sx, sz, x2, z2
+				c.double(sx, sz, x1, z1)
+				x1, z1, sx, sz = sx, sz, x1, z1
+			}
+		}
+	}
+
+	// Affine x = x1 * z1^(p-2) mod p.
+	c.tmp1.Exp(z1, pMinus2, p)
+	c.tmp2.Mul(x1, c.tmp1)
+	c.tmp3.Mod(c.tmp2, p)
+
+	// Bytes is big-endian without leading zeros: reverse it into out and
+	// zero whatever remains.
+	be := c.tmp3.Bytes()
+	for i := range out {
+		if i < len(be) {
+			out[i] = be[len(be)-1-i]
+		} else {
+			out[i] = 0
+		}
+	}
+}

+ 29 - 0
curve25519/curve25519_test.go

@@ -0,0 +1,29 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package curve25519
+
+import (
+	"fmt"
+	"testing"
+)
+
+const expectedHex = "89161fde887b2b53de549af483940106ecc114d6982daa98256de23bdf77661a"
+
+// TestBaseScalarMult starts from the scalar 1, feeds each ScalarBaseMult
+// output back in as the next scalar 200 times, and compares the final value
+// with expectedHex.
+func TestBaseScalarMult(t *testing.T) {
+	var first, second [32]byte
+	first[0] = 1
+
+	in, out := &first, &second
+	for round := 0; round < 200; round++ {
+		ScalarBaseMult(out, in)
+		in, out = out, in
+	}
+
+	// After the final swap the most recent output is in "in".
+	if result := fmt.Sprintf("%x", in[:]); result != expectedHex {
+		t.Errorf("incorrect result: got %s, want %s", result, expectedHex)
+	}
+}

+ 23 - 0
curve25519/doc.go

@@ -0,0 +1,23 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package curve25519 provides an implementation of scalar multiplication on
+// the elliptic curve known as curve25519. See http://cr.yp.to/ecdh.html
+package curve25519
+
+// basePoint is the x coordinate of the generator of the curve: the value 9
+// in little-endian form (the remaining 31 bytes are zero).
+var basePoint = [32]byte{9}
+
+// ScalarMult sets dst to the product in*base, where dst and base are the x coordinates of group
+// points and all values are in little-endian form. It forwards directly to scalarMult.
+func ScalarMult(dst, in, base *[32]byte) {
+	scalarMult(dst, in, base)
+}
+
+// ScalarBaseMult sets dst to the product in*base, where base is the standard
+// generator (basePoint), dst is the x coordinate of the resulting group point
+// and all values are in little-endian form.
+func ScalarBaseMult(dst, in *[32]byte) {
+	ScalarMult(dst, in, &basePoint)
+}

+ 92 - 0
curve25519/freeze_amd64.s

@@ -0,0 +1,92 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This code was translated into a form compatible with 6a from the public
+// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
+
+// func freeze(inout *[5]uint64)
+//
+// freeze reduces the five limbs of inout in place: three carry passes bring
+// every limb back to 51 bits (the carry out of the top limb is multiplied by
+// 19 and added to limb 0), then p = 2^255-19 is subtracted once, without
+// branching, if the value is still >= p.
+TEXT ·freeze(SB),7,$96-8
+	MOVQ inout+0(FP), DI
+
+	// Remember the incoming SP and move SP to a 32-byte boundary inside the
+	// frame.
+	MOVQ SP,R11
+	MOVQ $31,CX
+	NOTQ CX
+	ANDQ CX,SP
+	ADDQ $32,SP
+
+	// Save the original SP and R12-R15, BX, BP.
+	MOVQ R11,0(SP)
+	MOVQ R12,8(SP)
+	MOVQ R13,16(SP)
+	MOVQ R14,24(SP)
+	MOVQ R15,32(SP)
+	MOVQ BX,40(SP)
+	MOVQ BP,48(SP)
+	// Limbs 0..4 live in SI, DX, CX, R8, R9.
+	MOVQ 0(DI),SI
+	MOVQ 8(DI),DX
+	MOVQ 16(DI),CX
+	MOVQ 24(DI),R8
+	MOVQ 32(DI),R9
+	// AX = 2^51-1 (upper limbs of p), R10 = 2^51-19 (limb 0 of p).
+	MOVQ ·REDMASK51(SB),AX
+	MOVQ AX,R10
+	SUBQ $18,R10
+	// R11 counts the three carry passes and is 0 once the loop exits.
+	MOVQ $3,R11
+REDUCELOOP:
+	MOVQ SI,R12
+	SHRQ $51,R12
+	ANDQ AX,SI
+	ADDQ R12,DX
+	MOVQ DX,R12
+	SHRQ $51,R12
+	ANDQ AX,DX
+	ADDQ R12,CX
+	MOVQ CX,R12
+	SHRQ $51,R12
+	ANDQ AX,CX
+	ADDQ R12,R8
+	MOVQ R8,R12
+	SHRQ $51,R12
+	ANDQ AX,R8
+	ADDQ R12,R9
+	MOVQ R9,R12
+	SHRQ $51,R12
+	ANDQ AX,R9
+	IMUL3Q $19,R12,R12
+	ADDQ R12,SI
+	SUBQ $1,R11
+	JA REDUCELOOP
+	// R12 = 1 iff the value is >= p, i.e. limb 0 >= 2^51-19 and limbs 1..4
+	// all equal 2^51-1. It is cleared (R11 is 0 here) as soon as one of those
+	// conditions fails. The first comparison must test limb0 < 2^51-19, so
+	// the limb is the first CMPQ operand: CMPQ a,b followed by an LT condition
+	// tests a < b.
+	MOVQ $1,R12
+	CMPQ SI,R10
+	CMOVQLT R11,R12
+	CMPQ AX,DX
+	CMOVQNE R11,R12
+	CMPQ AX,CX
+	CMOVQNE R11,R12
+	CMPQ AX,R8
+	CMOVQNE R11,R12
+	CMPQ AX,R9
+	CMOVQNE R11,R12
+	// Turn the flag into an all-ones or all-zeros mask and subtract either p
+	// or 0 limb by limb.
+	NEGQ R12
+	ANDQ R12,AX
+	ANDQ R12,R10
+	SUBQ R10,SI
+	SUBQ AX,DX
+	SUBQ AX,CX
+	SUBQ AX,R8
+	SUBQ AX,R9
+	MOVQ SI,0(DI)
+	MOVQ DX,8(DI)
+	MOVQ CX,16(DI)
+	MOVQ R8,24(DI)
+	MOVQ R9,32(DI)
+	// Restore the saved registers and the original SP.
+	MOVQ 0(SP),R11
+	MOVQ 8(SP),R12
+	MOVQ 16(SP),R13
+	MOVQ 24(SP),R14
+	MOVQ 32(SP),R15
+	MOVQ 40(SP),BX
+	MOVQ 48(SP),BP
+	MOVQ R11,SP
+	// NOTE(review): AX/DX look like return-register setup carried over from
+	// the translated source; the Go declaration returns nothing -- confirm.
+	MOVQ DI,AX
+	MOVQ SI,DX
+	RET

+ 1396 - 0
curve25519/ladderstep_amd64.s

@@ -0,0 +1,1396 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This code was translated into a form compatible with 6a from the public
+// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
+
+// func ladderstep(inout *[5][5]uint64) -- performs one ladder step in place: inout[0] (byte offsets 0..32) is only read, inout[1]..inout[4] (offsets 40..192) are read and overwritten. Field elements are five limbs, reduced with the 51-bit mask REDMASK51.
+TEXT ·ladderstep(SB),0,$384-8
+	MOVQ inout+0(FP),DI
+
+	MOVQ SP,R11 // remember the incoming SP
+	MOVQ $31,CX // next four instructions move SP to a 32-byte boundary inside the frame
+	NOTQ CX
+	ANDQ CX,SP
+	ADDQ $32,SP
+
+	MOVQ R11,0(SP) // save the original SP and R12-R15, BX, BP
+	MOVQ R12,8(SP)
+	MOVQ R13,16(SP)
+	MOVQ R14,24(SP)
+	MOVQ R15,32(SP)
+	MOVQ BX,40(SP)
+	MOVQ BP,48(SP)
+	MOVQ 40(DI),SI // load inout[1]
+	MOVQ 48(DI),DX
+	MOVQ 56(DI),CX
+	MOVQ 64(DI),R8
+	MOVQ 72(DI),R9
+	MOVQ SI,AX // second copy of inout[1], used for the difference
+	MOVQ DX,R10
+	MOVQ CX,R11
+	MOVQ R8,R12
+	MOVQ R9,R13
+	ADDQ ·_2P0(SB),AX // add 2p limb-wise, presumably so the subtraction below cannot underflow
+	ADDQ ·_2P1234(SB),R10
+	ADDQ ·_2P1234(SB),R11
+	ADDQ ·_2P1234(SB),R12
+	ADDQ ·_2P1234(SB),R13
+	ADDQ 80(DI),SI // SI,DX,CX,R8,R9 = inout[1] + inout[2]
+	ADDQ 88(DI),DX
+	ADDQ 96(DI),CX
+	ADDQ 104(DI),R8
+	ADDQ 112(DI),R9
+	SUBQ 80(DI),AX // AX,R10..R13 = inout[1] + 2p - inout[2]
+	SUBQ 88(DI),R10
+	SUBQ 96(DI),R11
+	SUBQ 104(DI),R12
+	SUBQ 112(DI),R13
+	MOVQ SI,56(SP) // 56..88(SP) = the sum
+	MOVQ DX,64(SP)
+	MOVQ CX,72(SP)
+	MOVQ R8,80(SP)
+	MOVQ R9,88(SP)
+	MOVQ AX,96(SP) // 96..128(SP) = the difference
+	MOVQ R10,104(SP)
+	MOVQ R11,112(SP)
+	MOVQ R12,120(SP)
+	MOVQ R13,128(SP)
+	MOVQ 96(SP),AX // square the difference at 96..128(SP); 128-bit column sums build up in CX:SI, R9:R8, R11:R10, R13:R12, R15:R14
+	MULQ 96(SP)
+	MOVQ AX,SI
+	MOVQ DX,CX
+	MOVQ 96(SP),AX
+	SHLQ $1,AX
+	MULQ 104(SP)
+	MOVQ AX,R8
+	MOVQ DX,R9
+	MOVQ 96(SP),AX
+	SHLQ $1,AX
+	MULQ 112(SP)
+	MOVQ AX,R10
+	MOVQ DX,R11
+	MOVQ 96(SP),AX
+	SHLQ $1,AX
+	MULQ 120(SP)
+	MOVQ AX,R12
+	MOVQ DX,R13
+	MOVQ 96(SP),AX
+	SHLQ $1,AX
+	MULQ 128(SP)
+	MOVQ AX,R14
+	MOVQ DX,R15
+	MOVQ 104(SP),AX
+	MULQ 104(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 104(SP),AX
+	SHLQ $1,AX
+	MULQ 112(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 104(SP),AX
+	SHLQ $1,AX
+	MULQ 120(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 104(SP),DX
+	IMUL3Q $38,DX,AX
+	MULQ 128(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 112(SP),AX
+	MULQ 112(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 112(SP),DX
+	IMUL3Q $38,DX,AX
+	MULQ 120(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 112(SP),DX
+	IMUL3Q $38,DX,AX
+	MULQ 128(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 120(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 120(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 120(SP),DX
+	IMUL3Q $38,DX,AX
+	MULQ 128(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 128(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 128(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ ·REDMASK51(SB),DX // reduce: split every 128-bit column into a 51-bit limb and a carry into the next column
+	SHLQ $13,CX:SI
+	ANDQ DX,SI
+	SHLQ $13,R9:R8
+	ANDQ DX,R8
+	ADDQ CX,R8
+	SHLQ $13,R11:R10
+	ANDQ DX,R10
+	ADDQ R9,R10
+	SHLQ $13,R13:R12
+	ANDQ DX,R12
+	ADDQ R11,R12
+	SHLQ $13,R15:R14
+	ANDQ DX,R14
+	ADDQ R13,R14
+	IMUL3Q $19,R15,CX // the carry out of the top column is multiplied by 19 and added to limb 0
+	ADDQ CX,SI
+	MOVQ SI,CX
+	SHRQ $51,CX
+	ADDQ R8,CX
+	ANDQ DX,SI
+	MOVQ CX,R8
+	SHRQ $51,CX
+	ADDQ R10,CX
+	ANDQ DX,R8
+	MOVQ CX,R9
+	SHRQ $51,CX
+	ADDQ R12,CX
+	ANDQ DX,R9
+	MOVQ CX,AX
+	SHRQ $51,CX
+	ADDQ R14,CX
+	ANDQ DX,AX
+	MOVQ CX,R10
+	SHRQ $51,CX
+	IMUL3Q $19,CX,CX
+	ADDQ CX,SI
+	ANDQ DX,R10
+	MOVQ SI,136(SP) // 136..168(SP) = (difference)^2
+	MOVQ R8,144(SP)
+	MOVQ R9,152(SP)
+	MOVQ AX,160(SP)
+	MOVQ R10,168(SP)
+	MOVQ 56(SP),AX // square the sum at 56..88(SP), same schedule as above
+	MULQ 56(SP)
+	MOVQ AX,SI
+	MOVQ DX,CX
+	MOVQ 56(SP),AX
+	SHLQ $1,AX
+	MULQ 64(SP)
+	MOVQ AX,R8
+	MOVQ DX,R9
+	MOVQ 56(SP),AX
+	SHLQ $1,AX
+	MULQ 72(SP)
+	MOVQ AX,R10
+	MOVQ DX,R11
+	MOVQ 56(SP),AX
+	SHLQ $1,AX
+	MULQ 80(SP)
+	MOVQ AX,R12
+	MOVQ DX,R13
+	MOVQ 56(SP),AX
+	SHLQ $1,AX
+	MULQ 88(SP)
+	MOVQ AX,R14
+	MOVQ DX,R15
+	MOVQ 64(SP),AX
+	MULQ 64(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 64(SP),AX
+	SHLQ $1,AX
+	MULQ 72(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 64(SP),AX
+	SHLQ $1,AX
+	MULQ 80(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 64(SP),DX
+	IMUL3Q $38,DX,AX
+	MULQ 88(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 72(SP),AX
+	MULQ 72(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 72(SP),DX
+	IMUL3Q $38,DX,AX
+	MULQ 80(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 72(SP),DX
+	IMUL3Q $38,DX,AX
+	MULQ 88(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 80(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 80(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 80(SP),DX
+	IMUL3Q $38,DX,AX
+	MULQ 88(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 88(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 88(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ ·REDMASK51(SB),DX // reduce the column sums to five limbs
+	SHLQ $13,CX:SI
+	ANDQ DX,SI
+	SHLQ $13,R9:R8
+	ANDQ DX,R8
+	ADDQ CX,R8
+	SHLQ $13,R11:R10
+	ANDQ DX,R10
+	ADDQ R9,R10
+	SHLQ $13,R13:R12
+	ANDQ DX,R12
+	ADDQ R11,R12
+	SHLQ $13,R15:R14
+	ANDQ DX,R14
+	ADDQ R13,R14
+	IMUL3Q $19,R15,CX
+	ADDQ CX,SI
+	MOVQ SI,CX
+	SHRQ $51,CX
+	ADDQ R8,CX
+	ANDQ DX,SI
+	MOVQ CX,R8
+	SHRQ $51,CX
+	ADDQ R10,CX
+	ANDQ DX,R8
+	MOVQ CX,R9
+	SHRQ $51,CX
+	ADDQ R12,CX
+	ANDQ DX,R9
+	MOVQ CX,AX
+	SHRQ $51,CX
+	ADDQ R14,CX
+	ANDQ DX,AX
+	MOVQ CX,R10
+	SHRQ $51,CX
+	IMUL3Q $19,CX,CX
+	ADDQ CX,SI
+	ANDQ DX,R10
+	MOVQ SI,176(SP) // 176..208(SP) = (sum)^2
+	MOVQ R8,184(SP)
+	MOVQ R9,192(SP)
+	MOVQ AX,200(SP)
+	MOVQ R10,208(SP)
+	MOVQ SI,SI // register moved onto itself (no effect on the value)
+	MOVQ R8,DX
+	MOVQ R9,CX
+	MOVQ AX,R8
+	MOVQ R10,R9
+	ADDQ ·_2P0(SB),SI // (sum)^2 + 2p ...
+	ADDQ ·_2P1234(SB),DX
+	ADDQ ·_2P1234(SB),CX
+	ADDQ ·_2P1234(SB),R8
+	ADDQ ·_2P1234(SB),R9
+	SUBQ 136(SP),SI // ... minus (difference)^2
+	SUBQ 144(SP),DX
+	SUBQ 152(SP),CX
+	SUBQ 160(SP),R8
+	SUBQ 168(SP),R9
+	MOVQ SI,216(SP) // 216..248(SP) = (sum)^2 + 2p - (difference)^2
+	MOVQ DX,224(SP)
+	MOVQ CX,232(SP)
+	MOVQ R8,240(SP)
+	MOVQ R9,248(SP)
+	MOVQ 120(DI),SI // load inout[3]
+	MOVQ 128(DI),DX
+	MOVQ 136(DI),CX
+	MOVQ 144(DI),R8
+	MOVQ 152(DI),R9
+	MOVQ SI,AX
+	MOVQ DX,R10
+	MOVQ CX,R11
+	MOVQ R8,R12
+	MOVQ R9,R13
+	ADDQ ·_2P0(SB),AX
+	ADDQ ·_2P1234(SB),R10
+	ADDQ ·_2P1234(SB),R11
+	ADDQ ·_2P1234(SB),R12
+	ADDQ ·_2P1234(SB),R13
+	ADDQ 160(DI),SI // SI,DX,CX,R8,R9 = inout[3] + inout[4]
+	ADDQ 168(DI),DX
+	ADDQ 176(DI),CX
+	ADDQ 184(DI),R8
+	ADDQ 192(DI),R9
+	SUBQ 160(DI),AX // AX,R10..R13 = inout[3] + 2p - inout[4]
+	SUBQ 168(DI),R10
+	SUBQ 176(DI),R11
+	SUBQ 184(DI),R12
+	SUBQ 192(DI),R13
+	MOVQ SI,256(SP) // 256..288(SP) = inout[3] + inout[4]
+	MOVQ DX,264(SP)
+	MOVQ CX,272(SP)
+	MOVQ R8,280(SP)
+	MOVQ R9,288(SP)
+	MOVQ AX,296(SP) // 296..328(SP) = inout[3] + 2p - inout[4]
+	MOVQ R10,304(SP)
+	MOVQ R11,312(SP)
+	MOVQ R12,320(SP)
+	MOVQ R13,328(SP)
+	MOVQ 280(SP),SI // multiply 256..288(SP) by 96..128(SP); 336(SP) and 344(SP) cache 19*limb3 and 19*limb4 of the first operand
+	IMUL3Q $19,SI,AX
+	MOVQ AX,336(SP)
+	MULQ 112(SP)
+	MOVQ AX,SI
+	MOVQ DX,CX
+	MOVQ 288(SP),DX
+	IMUL3Q $19,DX,AX
+	MOVQ AX,344(SP)
+	MULQ 104(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 256(SP),AX
+	MULQ 96(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 256(SP),AX
+	MULQ 104(SP)
+	MOVQ AX,R8
+	MOVQ DX,R9
+	MOVQ 256(SP),AX
+	MULQ 112(SP)
+	MOVQ AX,R10
+	MOVQ DX,R11
+	MOVQ 256(SP),AX
+	MULQ 120(SP)
+	MOVQ AX,R12
+	MOVQ DX,R13
+	MOVQ 256(SP),AX
+	MULQ 128(SP)
+	MOVQ AX,R14
+	MOVQ DX,R15
+	MOVQ 264(SP),AX
+	MULQ 96(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 264(SP),AX
+	MULQ 104(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 264(SP),AX
+	MULQ 112(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 264(SP),AX
+	MULQ 120(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 264(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 128(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 272(SP),AX
+	MULQ 96(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 272(SP),AX
+	MULQ 104(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 272(SP),AX
+	MULQ 112(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 272(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 120(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 272(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 128(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 280(SP),AX
+	MULQ 96(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 280(SP),AX
+	MULQ 104(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 336(SP),AX
+	MULQ 120(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 336(SP),AX
+	MULQ 128(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 288(SP),AX
+	MULQ 96(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 344(SP),AX
+	MULQ 112(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 344(SP),AX
+	MULQ 120(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 344(SP),AX
+	MULQ 128(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ ·REDMASK51(SB),DX // reduce the column sums to five limbs
+	SHLQ $13,CX:SI
+	ANDQ DX,SI
+	SHLQ $13,R9:R8
+	ANDQ DX,R8
+	ADDQ CX,R8
+	SHLQ $13,R11:R10
+	ANDQ DX,R10
+	ADDQ R9,R10
+	SHLQ $13,R13:R12
+	ANDQ DX,R12
+	ADDQ R11,R12
+	SHLQ $13,R15:R14
+	ANDQ DX,R14
+	ADDQ R13,R14
+	IMUL3Q $19,R15,CX
+	ADDQ CX,SI
+	MOVQ SI,CX
+	SHRQ $51,CX
+	ADDQ R8,CX
+	MOVQ CX,R8
+	SHRQ $51,CX
+	ANDQ DX,SI
+	ADDQ R10,CX
+	MOVQ CX,R9
+	SHRQ $51,CX
+	ANDQ DX,R8
+	ADDQ R12,CX
+	MOVQ CX,AX
+	SHRQ $51,CX
+	ANDQ DX,R9
+	ADDQ R14,CX
+	MOVQ CX,R10
+	SHRQ $51,CX
+	ANDQ DX,AX
+	IMUL3Q $19,CX,CX
+	ADDQ CX,SI
+	ANDQ DX,R10
+	MOVQ SI,96(SP) // 96..128(SP) = first product (overwrites the earlier difference)
+	MOVQ R8,104(SP)
+	MOVQ R9,112(SP)
+	MOVQ AX,120(SP)
+	MOVQ R10,128(SP)
+	MOVQ 320(SP),SI // multiply 296..328(SP) by 56..88(SP); 256(SP) and 264(SP) are reused to cache 19*limb3 and 19*limb4
+	IMUL3Q $19,SI,AX
+	MOVQ AX,256(SP)
+	MULQ 72(SP)
+	MOVQ AX,SI
+	MOVQ DX,CX
+	MOVQ 328(SP),DX
+	IMUL3Q $19,DX,AX
+	MOVQ AX,264(SP)
+	MULQ 64(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 296(SP),AX
+	MULQ 56(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 296(SP),AX
+	MULQ 64(SP)
+	MOVQ AX,R8
+	MOVQ DX,R9
+	MOVQ 296(SP),AX
+	MULQ 72(SP)
+	MOVQ AX,R10
+	MOVQ DX,R11
+	MOVQ 296(SP),AX
+	MULQ 80(SP)
+	MOVQ AX,R12
+	MOVQ DX,R13
+	MOVQ 296(SP),AX
+	MULQ 88(SP)
+	MOVQ AX,R14
+	MOVQ DX,R15
+	MOVQ 304(SP),AX
+	MULQ 56(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 304(SP),AX
+	MULQ 64(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 304(SP),AX
+	MULQ 72(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 304(SP),AX
+	MULQ 80(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 304(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 88(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 312(SP),AX
+	MULQ 56(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 312(SP),AX
+	MULQ 64(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 312(SP),AX
+	MULQ 72(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 312(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 80(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 312(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 88(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 320(SP),AX
+	MULQ 56(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 320(SP),AX
+	MULQ 64(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 256(SP),AX
+	MULQ 80(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 256(SP),AX
+	MULQ 88(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 328(SP),AX
+	MULQ 56(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 264(SP),AX
+	MULQ 72(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 264(SP),AX
+	MULQ 80(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 264(SP),AX
+	MULQ 88(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ ·REDMASK51(SB),DX // reduce the column sums to five limbs
+	SHLQ $13,CX:SI
+	ANDQ DX,SI
+	SHLQ $13,R9:R8
+	ANDQ DX,R8
+	ADDQ CX,R8
+	SHLQ $13,R11:R10
+	ANDQ DX,R10
+	ADDQ R9,R10
+	SHLQ $13,R13:R12
+	ANDQ DX,R12
+	ADDQ R11,R12
+	SHLQ $13,R15:R14
+	ANDQ DX,R14
+	ADDQ R13,R14
+	IMUL3Q $19,R15,CX
+	ADDQ CX,SI
+	MOVQ SI,CX
+	SHRQ $51,CX
+	ADDQ R8,CX
+	MOVQ CX,R8
+	SHRQ $51,CX
+	ANDQ DX,SI
+	ADDQ R10,CX
+	MOVQ CX,R9
+	SHRQ $51,CX
+	ANDQ DX,R8
+	ADDQ R12,CX
+	MOVQ CX,AX
+	SHRQ $51,CX
+	ANDQ DX,R9
+	ADDQ R14,CX
+	MOVQ CX,R10
+	SHRQ $51,CX
+	ANDQ DX,AX
+	IMUL3Q $19,CX,CX
+	ADDQ CX,SI
+	ANDQ DX,R10
+	MOVQ SI,DX // the second product stays in SI,R8,R9,AX,R10; copy it for the subtraction
+	MOVQ R8,CX
+	MOVQ R9,R11
+	MOVQ AX,R12
+	MOVQ R10,R13
+	ADDQ ·_2P0(SB),DX
+	ADDQ ·_2P1234(SB),CX
+	ADDQ ·_2P1234(SB),R11
+	ADDQ ·_2P1234(SB),R12
+	ADDQ ·_2P1234(SB),R13
+	ADDQ 96(SP),SI // second product + first product
+	ADDQ 104(SP),R8
+	ADDQ 112(SP),R9
+	ADDQ 120(SP),AX
+	ADDQ 128(SP),R10
+	SUBQ 96(SP),DX // second product + 2p - first product
+	SUBQ 104(SP),CX
+	SUBQ 112(SP),R11
+	SUBQ 120(SP),R12
+	SUBQ 128(SP),R13
+	MOVQ SI,120(DI) // inout[3] = sum of the two products
+	MOVQ R8,128(DI)
+	MOVQ R9,136(DI)
+	MOVQ AX,144(DI)
+	MOVQ R10,152(DI)
+	MOVQ DX,160(DI) // inout[4] = difference of the two products
+	MOVQ CX,168(DI)
+	MOVQ R11,176(DI)
+	MOVQ R12,184(DI)
+	MOVQ R13,192(DI)
+	MOVQ 120(DI),AX // square inout[3] in place
+	MULQ 120(DI)
+	MOVQ AX,SI
+	MOVQ DX,CX
+	MOVQ 120(DI),AX
+	SHLQ $1,AX
+	MULQ 128(DI)
+	MOVQ AX,R8
+	MOVQ DX,R9
+	MOVQ 120(DI),AX
+	SHLQ $1,AX
+	MULQ 136(DI)
+	MOVQ AX,R10
+	MOVQ DX,R11
+	MOVQ 120(DI),AX
+	SHLQ $1,AX
+	MULQ 144(DI)
+	MOVQ AX,R12
+	MOVQ DX,R13
+	MOVQ 120(DI),AX
+	SHLQ $1,AX
+	MULQ 152(DI)
+	MOVQ AX,R14
+	MOVQ DX,R15
+	MOVQ 128(DI),AX
+	MULQ 128(DI)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 128(DI),AX
+	SHLQ $1,AX
+	MULQ 136(DI)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 128(DI),AX
+	SHLQ $1,AX
+	MULQ 144(DI)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 128(DI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 152(DI)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 136(DI),AX
+	MULQ 136(DI)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 136(DI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 144(DI)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 136(DI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 152(DI)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 144(DI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 144(DI)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 144(DI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 152(DI)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 152(DI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 152(DI)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ ·REDMASK51(SB),DX // reduce the column sums to five limbs
+	SHLQ $13,CX:SI
+	ANDQ DX,SI
+	SHLQ $13,R9:R8
+	ANDQ DX,R8
+	ADDQ CX,R8
+	SHLQ $13,R11:R10
+	ANDQ DX,R10
+	ADDQ R9,R10
+	SHLQ $13,R13:R12
+	ANDQ DX,R12
+	ADDQ R11,R12
+	SHLQ $13,R15:R14
+	ANDQ DX,R14
+	ADDQ R13,R14
+	IMUL3Q $19,R15,CX
+	ADDQ CX,SI
+	MOVQ SI,CX
+	SHRQ $51,CX
+	ADDQ R8,CX
+	ANDQ DX,SI
+	MOVQ CX,R8
+	SHRQ $51,CX
+	ADDQ R10,CX
+	ANDQ DX,R8
+	MOVQ CX,R9
+	SHRQ $51,CX
+	ADDQ R12,CX
+	ANDQ DX,R9
+	MOVQ CX,AX
+	SHRQ $51,CX
+	ADDQ R14,CX
+	ANDQ DX,AX
+	MOVQ CX,R10
+	SHRQ $51,CX
+	IMUL3Q $19,CX,CX
+	ADDQ CX,SI
+	ANDQ DX,R10
+	MOVQ SI,120(DI) // inout[3] = its square
+	MOVQ R8,128(DI)
+	MOVQ R9,136(DI)
+	MOVQ AX,144(DI)
+	MOVQ R10,152(DI)
+	MOVQ 160(DI),AX // square inout[4] in place
+	MULQ 160(DI)
+	MOVQ AX,SI
+	MOVQ DX,CX
+	MOVQ 160(DI),AX
+	SHLQ $1,AX
+	MULQ 168(DI)
+	MOVQ AX,R8
+	MOVQ DX,R9
+	MOVQ 160(DI),AX
+	SHLQ $1,AX
+	MULQ 176(DI)
+	MOVQ AX,R10
+	MOVQ DX,R11
+	MOVQ 160(DI),AX
+	SHLQ $1,AX
+	MULQ 184(DI)
+	MOVQ AX,R12
+	MOVQ DX,R13
+	MOVQ 160(DI),AX
+	SHLQ $1,AX
+	MULQ 192(DI)
+	MOVQ AX,R14
+	MOVQ DX,R15
+	MOVQ 168(DI),AX
+	MULQ 168(DI)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 168(DI),AX
+	SHLQ $1,AX
+	MULQ 176(DI)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 168(DI),AX
+	SHLQ $1,AX
+	MULQ 184(DI)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 168(DI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 192(DI)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 176(DI),AX
+	MULQ 176(DI)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 176(DI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 184(DI)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 176(DI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 192(DI)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 184(DI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 184(DI)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 184(DI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 192(DI)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 192(DI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 192(DI)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ ·REDMASK51(SB),DX // reduce the column sums to five limbs
+	SHLQ $13,CX:SI
+	ANDQ DX,SI
+	SHLQ $13,R9:R8
+	ANDQ DX,R8
+	ADDQ CX,R8
+	SHLQ $13,R11:R10
+	ANDQ DX,R10
+	ADDQ R9,R10
+	SHLQ $13,R13:R12
+	ANDQ DX,R12
+	ADDQ R11,R12
+	SHLQ $13,R15:R14
+	ANDQ DX,R14
+	ADDQ R13,R14
+	IMUL3Q $19,R15,CX
+	ADDQ CX,SI
+	MOVQ SI,CX
+	SHRQ $51,CX
+	ADDQ R8,CX
+	ANDQ DX,SI
+	MOVQ CX,R8
+	SHRQ $51,CX
+	ADDQ R10,CX
+	ANDQ DX,R8
+	MOVQ CX,R9
+	SHRQ $51,CX
+	ADDQ R12,CX
+	ANDQ DX,R9
+	MOVQ CX,AX
+	SHRQ $51,CX
+	ADDQ R14,CX
+	ANDQ DX,AX
+	MOVQ CX,R10
+	SHRQ $51,CX
+	IMUL3Q $19,CX,CX
+	ADDQ CX,SI
+	ANDQ DX,R10
+	MOVQ SI,160(DI) // inout[4] = its square
+	MOVQ R8,168(DI)
+	MOVQ R9,176(DI)
+	MOVQ AX,184(DI)
+	MOVQ R10,192(DI)
+	MOVQ 184(DI),SI // multiply inout[4] by inout[0]; 56(SP) and 64(SP) are reused to cache 19*limb3 and 19*limb4 of inout[4]
+	IMUL3Q $19,SI,AX
+	MOVQ AX,56(SP)
+	MULQ 16(DI)
+	MOVQ AX,SI
+	MOVQ DX,CX
+	MOVQ 192(DI),DX
+	IMUL3Q $19,DX,AX
+	MOVQ AX,64(SP)
+	MULQ 8(DI)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 160(DI),AX
+	MULQ 0(DI)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 160(DI),AX
+	MULQ 8(DI)
+	MOVQ AX,R8
+	MOVQ DX,R9
+	MOVQ 160(DI),AX
+	MULQ 16(DI)
+	MOVQ AX,R10
+	MOVQ DX,R11
+	MOVQ 160(DI),AX
+	MULQ 24(DI)
+	MOVQ AX,R12
+	MOVQ DX,R13
+	MOVQ 160(DI),AX
+	MULQ 32(DI)
+	MOVQ AX,R14
+	MOVQ DX,R15
+	MOVQ 168(DI),AX
+	MULQ 0(DI)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 168(DI),AX
+	MULQ 8(DI)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 168(DI),AX
+	MULQ 16(DI)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 168(DI),AX
+	MULQ 24(DI)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 168(DI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 32(DI)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 176(DI),AX
+	MULQ 0(DI)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 176(DI),AX
+	MULQ 8(DI)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 176(DI),AX
+	MULQ 16(DI)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 176(DI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 24(DI)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 176(DI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 32(DI)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 184(DI),AX
+	MULQ 0(DI)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 184(DI),AX
+	MULQ 8(DI)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 56(SP),AX
+	MULQ 24(DI)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 56(SP),AX
+	MULQ 32(DI)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 192(DI),AX
+	MULQ 0(DI)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 64(SP),AX
+	MULQ 16(DI)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 64(SP),AX
+	MULQ 24(DI)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 64(SP),AX
+	MULQ 32(DI)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ ·REDMASK51(SB),DX // reduce the column sums to five limbs
+	SHLQ $13,CX:SI
+	ANDQ DX,SI
+	SHLQ $13,R9:R8
+	ANDQ DX,R8
+	ADDQ CX,R8
+	SHLQ $13,R11:R10
+	ANDQ DX,R10
+	ADDQ R9,R10
+	SHLQ $13,R13:R12
+	ANDQ DX,R12
+	ADDQ R11,R12
+	SHLQ $13,R15:R14
+	ANDQ DX,R14
+	ADDQ R13,R14
+	IMUL3Q $19,R15,CX
+	ADDQ CX,SI
+	MOVQ SI,CX
+	SHRQ $51,CX
+	ADDQ R8,CX
+	MOVQ CX,R8
+	SHRQ $51,CX
+	ANDQ DX,SI
+	ADDQ R10,CX
+	MOVQ CX,R9
+	SHRQ $51,CX
+	ANDQ DX,R8
+	ADDQ R12,CX
+	MOVQ CX,AX
+	SHRQ $51,CX
+	ANDQ DX,R9
+	ADDQ R14,CX
+	MOVQ CX,R10
+	SHRQ $51,CX
+	ANDQ DX,AX
+	IMUL3Q $19,CX,CX
+	ADDQ CX,SI
+	ANDQ DX,R10
+	MOVQ SI,160(DI) // inout[4] = inout[4] * inout[0]
+	MOVQ R8,168(DI)
+	MOVQ R9,176(DI)
+	MOVQ AX,184(DI)
+	MOVQ R10,192(DI)
+	MOVQ 200(SP),SI // multiply 176..208(SP) by 136..168(SP), i.e. (sum)^2 * (difference)^2
+	IMUL3Q $19,SI,AX
+	MOVQ AX,56(SP)
+	MULQ 152(SP)
+	MOVQ AX,SI
+	MOVQ DX,CX
+	MOVQ 208(SP),DX
+	IMUL3Q $19,DX,AX
+	MOVQ AX,64(SP)
+	MULQ 144(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 176(SP),AX
+	MULQ 136(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 176(SP),AX
+	MULQ 144(SP)
+	MOVQ AX,R8
+	MOVQ DX,R9
+	MOVQ 176(SP),AX
+	MULQ 152(SP)
+	MOVQ AX,R10
+	MOVQ DX,R11
+	MOVQ 176(SP),AX
+	MULQ 160(SP)
+	MOVQ AX,R12
+	MOVQ DX,R13
+	MOVQ 176(SP),AX
+	MULQ 168(SP)
+	MOVQ AX,R14
+	MOVQ DX,R15
+	MOVQ 184(SP),AX
+	MULQ 136(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 184(SP),AX
+	MULQ 144(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 184(SP),AX
+	MULQ 152(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 184(SP),AX
+	MULQ 160(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 184(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 168(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 192(SP),AX
+	MULQ 136(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 192(SP),AX
+	MULQ 144(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 192(SP),AX
+	MULQ 152(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 192(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 160(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 192(SP),DX
+	IMUL3Q $19,DX,AX
+	MULQ 168(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 200(SP),AX
+	MULQ 136(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 200(SP),AX
+	MULQ 144(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 56(SP),AX
+	MULQ 160(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 56(SP),AX
+	MULQ 168(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 208(SP),AX
+	MULQ 136(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 64(SP),AX
+	MULQ 152(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 64(SP),AX
+	MULQ 160(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 64(SP),AX
+	MULQ 168(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ ·REDMASK51(SB),DX // reduce the column sums to five limbs
+	SHLQ $13,CX:SI
+	ANDQ DX,SI
+	SHLQ $13,R9:R8
+	ANDQ DX,R8
+	ADDQ CX,R8
+	SHLQ $13,R11:R10
+	ANDQ DX,R10
+	ADDQ R9,R10
+	SHLQ $13,R13:R12
+	ANDQ DX,R12
+	ADDQ R11,R12
+	SHLQ $13,R15:R14
+	ANDQ DX,R14
+	ADDQ R13,R14
+	IMUL3Q $19,R15,CX
+	ADDQ CX,SI
+	MOVQ SI,CX
+	SHRQ $51,CX
+	ADDQ R8,CX
+	MOVQ CX,R8
+	SHRQ $51,CX
+	ANDQ DX,SI
+	ADDQ R10,CX
+	MOVQ CX,R9
+	SHRQ $51,CX
+	ANDQ DX,R8
+	ADDQ R12,CX
+	MOVQ CX,AX
+	SHRQ $51,CX
+	ANDQ DX,R9
+	ADDQ R14,CX
+	MOVQ CX,R10
+	SHRQ $51,CX
+	ANDQ DX,AX
+	IMUL3Q $19,CX,CX
+	ADDQ CX,SI
+	ANDQ DX,R10
+	MOVQ SI,40(DI) // inout[1] = (sum)^2 * (difference)^2
+	MOVQ R8,48(DI)
+	MOVQ R9,56(DI)
+	MOVQ AX,64(DI)
+	MOVQ R10,72(DI)
+	MOVQ 216(SP),AX // multiply 216..248(SP) by 121666: each limb times 121666*2^13 gives the low 51 bits in AX>>13 and the carry in DX
+	MULQ ·_121666_213(SB)
+	SHRQ $13,AX
+	MOVQ AX,SI
+	MOVQ DX,CX
+	MOVQ 224(SP),AX
+	MULQ ·_121666_213(SB)
+	SHRQ $13,AX
+	ADDQ AX,CX
+	MOVQ DX,R8
+	MOVQ 232(SP),AX
+	MULQ ·_121666_213(SB)
+	SHRQ $13,AX
+	ADDQ AX,R8
+	MOVQ DX,R9
+	MOVQ 240(SP),AX
+	MULQ ·_121666_213(SB)
+	SHRQ $13,AX
+	ADDQ AX,R9
+	MOVQ DX,R10
+	MOVQ 248(SP),AX
+	MULQ ·_121666_213(SB)
+	SHRQ $13,AX
+	ADDQ AX,R10
+	IMUL3Q $19,DX,DX // the carry out of the top limb is multiplied by 19 and added to limb 0
+	ADDQ DX,SI
+	ADDQ 136(SP),SI // add (difference)^2
+	ADDQ 144(SP),CX
+	ADDQ 152(SP),R8
+	ADDQ 160(SP),R9
+	ADDQ 168(SP),R10
+	MOVQ SI,80(DI) // inout[2] = 121666 * (216..248(SP)) + (difference)^2
+	MOVQ CX,88(DI)
+	MOVQ R8,96(DI)
+	MOVQ R9,104(DI)
+	MOVQ R10,112(DI)
+	MOVQ 104(DI),SI // multiply inout[2] by 216..248(SP); 56(SP) and 64(SP) cache 19*limb3 and 19*limb4 of inout[2]
+	IMUL3Q $19,SI,AX
+	MOVQ AX,56(SP)
+	MULQ 232(SP)
+	MOVQ AX,SI
+	MOVQ DX,CX
+	MOVQ 112(DI),DX
+	IMUL3Q $19,DX,AX
+	MOVQ AX,64(SP)
+	MULQ 224(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 80(DI),AX
+	MULQ 216(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 80(DI),AX
+	MULQ 224(SP)
+	MOVQ AX,R8
+	MOVQ DX,R9
+	MOVQ 80(DI),AX
+	MULQ 232(SP)
+	MOVQ AX,R10
+	MOVQ DX,R11
+	MOVQ 80(DI),AX
+	MULQ 240(SP)
+	MOVQ AX,R12
+	MOVQ DX,R13
+	MOVQ 80(DI),AX
+	MULQ 248(SP)
+	MOVQ AX,R14
+	MOVQ DX,R15
+	MOVQ 88(DI),AX
+	MULQ 216(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 88(DI),AX
+	MULQ 224(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 88(DI),AX
+	MULQ 232(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 88(DI),AX
+	MULQ 240(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 88(DI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 248(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 96(DI),AX
+	MULQ 216(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 96(DI),AX
+	MULQ 224(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 96(DI),AX
+	MULQ 232(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 96(DI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 240(SP)
+	ADDQ AX,SI
+	ADCQ DX,CX
+	MOVQ 96(DI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 248(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 104(DI),AX
+	MULQ 216(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ 104(DI),AX
+	MULQ 224(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 56(SP),AX
+	MULQ 240(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 56(SP),AX
+	MULQ 248(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 112(DI),AX
+	MULQ 216(SP)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	MOVQ 64(SP),AX
+	MULQ 232(SP)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	MOVQ 64(SP),AX
+	MULQ 240(SP)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	MOVQ 64(SP),AX
+	MULQ 248(SP)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	MOVQ ·REDMASK51(SB),DX // reduce the column sums to five limbs
+	SHLQ $13,CX:SI
+	ANDQ DX,SI
+	SHLQ $13,R9:R8
+	ANDQ DX,R8
+	ADDQ CX,R8
+	SHLQ $13,R11:R10
+	ANDQ DX,R10
+	ADDQ R9,R10
+	SHLQ $13,R13:R12
+	ANDQ DX,R12
+	ADDQ R11,R12
+	SHLQ $13,R15:R14
+	ANDQ DX,R14
+	ADDQ R13,R14
+	IMUL3Q $19,R15,CX
+	ADDQ CX,SI
+	MOVQ SI,CX
+	SHRQ $51,CX
+	ADDQ R8,CX
+	MOVQ CX,R8
+	SHRQ $51,CX
+	ANDQ DX,SI
+	ADDQ R10,CX
+	MOVQ CX,R9
+	SHRQ $51,CX
+	ANDQ DX,R8
+	ADDQ R12,CX
+	MOVQ CX,AX
+	SHRQ $51,CX
+	ANDQ DX,R9
+	ADDQ R14,CX
+	MOVQ CX,R10
+	SHRQ $51,CX
+	ANDQ DX,AX
+	IMUL3Q $19,CX,CX
+	ADDQ CX,SI
+	ANDQ DX,R10
+	MOVQ SI,80(DI) // inout[2] = inout[2] * (216..248(SP))
+	MOVQ R8,88(DI)
+	MOVQ R9,96(DI)
+	MOVQ AX,104(DI)
+	MOVQ R10,112(DI)
+	MOVQ 0(SP),R11 // restore the saved registers and the original SP
+	MOVQ 8(SP),R12
+	MOVQ 16(SP),R13
+	MOVQ 24(SP),R14
+	MOVQ 32(SP),R15
+	MOVQ 40(SP),BX
+	MOVQ 48(SP),BP
+	MOVQ R11,SP
+	MOVQ DI,AX // NOTE(review): AX/DX look like return-register setup carried over from the translated source; the Go declaration returns nothing -- confirm
+	MOVQ SI,DX
+	RET

+ 223 - 0
curve25519/mont25519_amd64.go

@@ -0,0 +1,223 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package curve25519
+
+// These functions are implemented in the .s files. The names of the functions
+// in the rest of the file are also taken from the SUPERCOP sources to help
+// people following along.
+//
+// Field elements are arrays of five uint64 limbs; unpack fills each limb with
+// 51 bits, least significant limb first.
+//
+// cswap exchanges 64-bit words in place when its second argument equals 1,
+// using conditional moves rather than branches. The assembly swaps the word at
+// byte offset k with the word at offset k+80, so it touches memory well past
+// the nominal [5]uint64: the pointer must point into a larger array (mladder
+// passes &work[1], a row of a [5][5]uint64).
+func cswap(*[5]uint64, uint64)
+// ladderstep updates the [5][5]uint64 work array in place; mladder calls it
+// once per scalar bit. NOTE(review): presumably one combined double-and-add
+// step of the Montgomery ladder - confirm against ladderstep_amd64.s.
+func ladderstep(*[5][5]uint64)
+// freeze rewrites inout in place; pack calls it immediately before
+// serialising. NOTE(review): presumably reduces the element to its unique
+// canonical representative with every limb below 2^51 - confirm against the
+// assembly.
+func freeze(inout *[5]uint64)
+// mul sets dest = a * b in the field, folding high partial products back in
+// with a factor of 19. dest may alias a or b: the assembly stores the result
+// only after it has finished reading both inputs.
+func mul(dest, a, b *[5]uint64)
+// square sets out = in * in in the field. out may alias in: the assembly
+// stores the result only after it has finished reading the input.
+func square(out, in *[5]uint64)
+
+// mladder uses a Montgomery ladder to calculate (xr/zr) *= s.
+//
+// On entry only *xr is read; the previous contents of *zr are ignored. The
+// scalar is consumed one bit at a time from bit 254 down to bit 0 (bit 255 is
+// never examined). On return *xr and *zr hold rows 1 and 2 of the ladder
+// state, i.e. the numerator and denominator of the resulting x-coordinate.
+func mladder(xr, zr *[5]uint64, s *[32]byte) {
+	var state [5][5]uint64
+
+	// Row 0 keeps the input x-coordinate, rows 1-2 start as (1, 0) and
+	// rows 3-4 start as (x, 1).
+	state[0] = *xr
+	setint(&state[1], 1)
+	setint(&state[2], 0)
+	state[3] = *xr
+	setint(&state[4], 1)
+
+	var last byte
+	for pos := 254; pos >= 0; pos-- {
+		cur := (s[pos/8] >> uint(pos%8)) & 1
+		// Swap only when this bit differs from the previous one, so the
+		// rows are exchanged lazily instead of swapped back after each step.
+		cswap(&state[1], uint64(cur^last))
+		last = cur
+		ladderstep(&state)
+	}
+
+	*xr = state[1]
+	*zr = state[2]
+}
+
+// scalarMult sets out to the packed x-coordinate obtained by running the
+// Montgomery ladder on base with the scalar in. The scalar is copied and
+// clamped first (low three bits cleared, bit 255 cleared, bit 254 set), so
+// in itself is never modified.
+func scalarMult(out, in, base *[32]byte) {
+	scalar := *in
+	scalar[0] &= 248
+	scalar[31] = (scalar[31] & 127) | 64
+
+	var x, z [5]uint64
+	unpack(&x, base)
+	mladder(&x, &z, &scalar)
+	// The ladder yields the result as the fraction x/z; divide it out.
+	invert(&z, &z)
+	mul(&x, &x, &z)
+	pack(out, &x)
+}
+
+// setint stores v in the lowest limb of r and clears the four higher limbs.
+// v is stored as given; no range check or reduction is applied.
+func setint(r *[5]uint64, v uint64) {
+	*r = [5]uint64{v}
+}
+
+// unpack sets r = x where r consists of 5, 51-bit limbs in little-endian
+// order.
+func unpack(r *[5]uint64, x *[32]byte) {
+	r[0] = uint64(x[0]) |
+		uint64(x[1])<<8 |
+		uint64(x[2])<<16 |
+		uint64(x[3])<<24 |
+		uint64(x[4])<<32 |
+		uint64(x[5])<<40 |
+		uint64(x[6]&7)<<48
+
+	r[1] = uint64(x[6])>>3 |
+		uint64(x[7])<<5 |
+		uint64(x[8])<<13 |
+		uint64(x[9])<<21 |
+		uint64(x[10])<<29 |
+		uint64(x[11])<<37 |
+		uint64(x[12]&63)<<45
+
+	r[2] = uint64(x[12])>>6 |
+		uint64(x[13])<<2 |
+		uint64(x[14])<<10 |
+		uint64(x[15])<<18 |
+		uint64(x[16])<<26 |
+		uint64(x[17])<<34 |
+		uint64(x[18])<<42 |
+		uint64(x[19]&1)<<50
+
+	r[3] = uint64(x[19])>>1 |
+		uint64(x[20])<<7 |
+		uint64(x[21])<<15 |
+		uint64(x[22])<<23 |
+		uint64(x[23])<<31 |
+		uint64(x[24])<<39 |
+		uint64(x[25]&15)<<47
+
+	r[4] = uint64(x[25])>>4 |
+		uint64(x[26])<<4 |
+		uint64(x[27])<<12 |
+		uint64(x[28])<<20 |
+		uint64(x[29])<<28 |
+		uint64(x[30])<<36 |
+		uint64(x[31]&127)<<44
+}
+
+// pack sets out = x where out is the usual, little-endian form of the 5,
+// 51-bit limbs in x. x itself is left untouched: a copy is passed through
+// freeze and then serialised.
+func pack(out *[32]byte, x *[5]uint64) {
+	reduced := *x
+	freeze(&reduced)
+
+	for k, limb := range reduced {
+		// Limb k occupies output bits 51*k .. 51*k+50.
+		first := uint(51 * k)
+		last := (first + 50) / 8
+		idx, lead := first/8, first%8
+
+		if lead == 0 {
+			out[idx] = byte(limb)
+		} else {
+			// This byte already holds the top bits of the previous limb;
+			// merge in the low bits of this one. The left shift leaves the
+			// low `lead` bits clear, so no extra mask is required.
+			out[idx] ^= byte(limb << lead)
+		}
+		for j := idx + 1; j <= last; j++ {
+			out[j] = byte(limb >> (8*(j-idx) - lead))
+		}
+	}
+}
+
+// invert calculates r = x^-1 mod p using Fermat's little theorem, i.e. it
+// raises x to the power 2^255 - 21 with a fixed chain of squarings and
+// multiplications. r may alias x: x is only read by the first few steps and
+// r is written by the very last one.
+//
+// The trailing comments give the exponent of x held by the destination.
+func invert(r *[5]uint64, x *[5]uint64) {
+	var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t [5]uint64
+
+	// pow2k sets t = in^(2^k) by squaring k times (k >= 1); in may be &t.
+	pow2k := func(in *[5]uint64, k int) {
+		square(&t, in)
+		for i := 1; i < k; i++ {
+			square(&t, &t)
+		}
+	}
+
+	square(&z2, x)        // 2
+	pow2k(&z2, 2)         // 8
+	mul(&z9, &t, x)       // 9
+	mul(&z11, &z9, &z2)   // 11
+	square(&t, &z11)      // 22
+	mul(&z2_5_0, &t, &z9) // 2^5 - 2^0 = 31
+
+	pow2k(&z2_5_0, 5)          // 2^10 - 2^5
+	mul(&z2_10_0, &t, &z2_5_0) // 2^10 - 2^0
+
+	pow2k(&z2_10_0, 10)         // 2^20 - 2^10
+	mul(&z2_20_0, &t, &z2_10_0) // 2^20 - 2^0
+
+	pow2k(&z2_20_0, 20)   // 2^40 - 2^20
+	mul(&t, &t, &z2_20_0) // 2^40 - 2^0
+
+	pow2k(&t, 10)               // 2^50 - 2^10
+	mul(&z2_50_0, &t, &z2_10_0) // 2^50 - 2^0
+
+	pow2k(&z2_50_0, 50)          // 2^100 - 2^50
+	mul(&z2_100_0, &t, &z2_50_0) // 2^100 - 2^0
+
+	pow2k(&z2_100_0, 100)  // 2^200 - 2^100
+	mul(&t, &t, &z2_100_0) // 2^200 - 2^0
+
+	pow2k(&t, 50)         // 2^250 - 2^50
+	mul(&t, &t, &z2_50_0) // 2^250 - 2^0
+
+	pow2k(&t, 5)     // 2^255 - 2^5
+	mul(r, &t, &z11) // 2^255 - 21
+}

+ 189 - 0
curve25519/mul_amd64.s

@@ -0,0 +1,189 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This code was translated into a form compatible with 6a from the public
+// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
+
+// func mul(dest, a, b *[5]uint64)
+//
+// mul sets dest = a * b for field elements held as five 64-bit limbs in
+// radix 2^51 (REDMASK51 is 2^51-1). Partial products whose limb indices sum
+// to five or more wrap around to limb (i+j-5) scaled by 19. Each result limb
+// is accumulated as a 128-bit value in a register pair (high:low):
+//
+//	r0 = R9:R8, r1 = R11:R10, r2 = R13:R12, r3 = R15:R14, r4 = BP:BX
+//
+// dest may alias a or b: the stores to dest happen only after the last read
+// of either input.
+TEXT ·mul(SB),0,$128-24
+	MOVQ dest+0(FP), DI
+	MOVQ a+8(FP), SI
+	MOVQ b+16(FP), DX
+
+	// Remember the incoming SP in R11, then move SP up to the next 32-byte
+	// boundary: SP = (SP &^ 31) + 32. The scratch area used below (offsets
+	// 0..79 from the new SP) still fits inside the 128-byte frame.
+	MOVQ SP,R11
+	MOVQ $31,CX
+	NOTQ CX
+	ANDQ CX,SP
+	ADDQ $32,SP
+
+	// Save the original SP and the registers used as accumulators so they
+	// can be restored before returning.
+	MOVQ R11,0(SP)
+	MOVQ R12,8(SP)
+	MOVQ R13,16(SP)
+	MOVQ R14,24(SP)
+	MOVQ R15,32(SP)
+	MOVQ BX,40(SP)
+	MOVQ BP,48(SP)
+	MOVQ DI,56(SP)
+	// MULQ writes DX:AX, so the pointer to b moves to CX.
+	MOVQ DX,CX
+	// r0 = (19*a3)*b2; 19*a3 is kept at 64(SP) for reuse.
+	MOVQ 24(SI),DX
+	IMUL3Q $19,DX,AX
+	MOVQ AX,64(SP)
+	MULQ 16(CX)
+	MOVQ AX,R8
+	MOVQ DX,R9
+	// r0 += (19*a4)*b1; 19*a4 is kept at 72(SP) for reuse.
+	MOVQ 32(SI),DX
+	IMUL3Q $19,DX,AX
+	MOVQ AX,72(SP)
+	MULQ 8(CX)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	// r0 += a0*b0
+	MOVQ 0(SI),AX
+	MULQ 0(CX)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	// r1 = a0*b1
+	MOVQ 0(SI),AX
+	MULQ 8(CX)
+	MOVQ AX,R10
+	MOVQ DX,R11
+	// r2 = a0*b2
+	MOVQ 0(SI),AX
+	MULQ 16(CX)
+	MOVQ AX,R12
+	MOVQ DX,R13
+	// r3 = a0*b3
+	MOVQ 0(SI),AX
+	MULQ 24(CX)
+	MOVQ AX,R14
+	MOVQ DX,R15
+	// r4 = a0*b4
+	MOVQ 0(SI),AX
+	MULQ 32(CX)
+	MOVQ AX,BX
+	MOVQ DX,BP
+	// r1 += a1*b0
+	MOVQ 8(SI),AX
+	MULQ 0(CX)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	// r2 += a1*b1
+	MOVQ 8(SI),AX
+	MULQ 8(CX)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	// r3 += a1*b2
+	MOVQ 8(SI),AX
+	MULQ 16(CX)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	// r4 += a1*b3
+	MOVQ 8(SI),AX
+	MULQ 24(CX)
+	ADDQ AX,BX
+	ADCQ DX,BP
+	// r0 += (19*a1)*b4
+	MOVQ 8(SI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 32(CX)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	// r2 += a2*b0
+	MOVQ 16(SI),AX
+	MULQ 0(CX)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	// r3 += a2*b1
+	MOVQ 16(SI),AX
+	MULQ 8(CX)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	// r4 += a2*b2
+	MOVQ 16(SI),AX
+	MULQ 16(CX)
+	ADDQ AX,BX
+	ADCQ DX,BP
+	// r0 += (19*a2)*b3
+	MOVQ 16(SI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 24(CX)
+	ADDQ AX,R8
+	ADCQ DX,R9
+	// r1 += (19*a2)*b4
+	MOVQ 16(SI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 32(CX)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	// r3 += a3*b0
+	MOVQ 24(SI),AX
+	MULQ 0(CX)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	// r4 += a3*b1
+	MOVQ 24(SI),AX
+	MULQ 8(CX)
+	ADDQ AX,BX
+	ADCQ DX,BP
+	// r1 += (19*a3)*b3
+	MOVQ 64(SP),AX
+	MULQ 24(CX)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	// r2 += (19*a3)*b4
+	MOVQ 64(SP),AX
+	MULQ 32(CX)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	// r4 += a4*b0
+	MOVQ 32(SI),AX
+	MULQ 0(CX)
+	ADDQ AX,BX
+	ADCQ DX,BP
+	// r1 += (19*a4)*b2
+	MOVQ 72(SP),AX
+	MULQ 16(CX)
+	ADDQ AX,R10
+	ADCQ DX,R11
+	// r2 += (19*a4)*b3
+	MOVQ 72(SP),AX
+	MULQ 24(CX)
+	ADDQ AX,R12
+	ADCQ DX,R13
+	// r3 += (19*a4)*b4
+	MOVQ 72(SP),AX
+	MULQ 32(CX)
+	ADDQ AX,R14
+	ADCQ DX,R15
+	// All reads of a and b are done; SI now holds the 51-bit limb mask.
+	// For each accumulator the double-width shift by 13 leaves everything
+	// above bit 50 in the high register, the low register is masked to 51
+	// bits, and the previous accumulator's high part is added in.
+	MOVQ ·REDMASK51(SB),SI
+	SHLQ $13,R9:R8
+	ANDQ SI,R8
+	SHLQ $13,R11:R10
+	ANDQ SI,R10
+	ADDQ R9,R10
+	SHLQ $13,R13:R12
+	ANDQ SI,R12
+	ADDQ R11,R12
+	SHLQ $13,R15:R14
+	ANDQ SI,R14
+	ADDQ R13,R14
+	SHLQ $13,BP:BX
+	ANDQ SI,BX
+	ADDQ R15,BX
+	// The high part of r4 wraps around into limb 0 scaled by 19.
+	IMUL3Q $19,BP,DX
+	ADDQ DX,R8
+	// Carry chain: DX carries the bits above bit 50 from one limb into the
+	// next. The masked limbs end up in R8, CX, R9, AX, R10 (limb 0..4).
+	MOVQ R8,DX
+	SHRQ $51,DX
+	ADDQ R10,DX
+	MOVQ DX,CX
+	SHRQ $51,DX
+	ANDQ SI,R8
+	ADDQ R12,DX
+	MOVQ DX,R9
+	SHRQ $51,DX
+	ANDQ SI,CX
+	ADDQ R14,DX
+	MOVQ DX,AX
+	SHRQ $51,DX
+	ANDQ SI,R9
+	ADDQ BX,DX
+	MOVQ DX,R10
+	SHRQ $51,DX
+	ANDQ SI,AX
+	// The carry out of limb 4 is folded into limb 0 scaled by 19; limb 0 is
+	// not masked again afterwards.
+	IMUL3Q $19,DX,DX
+	ADDQ DX,R8
+	ANDQ SI,R10
+	MOVQ R8,0(DI)
+	MOVQ CX,8(DI)
+	MOVQ R9,16(DI)
+	MOVQ AX,24(DI)
+	MOVQ R10,32(DI)
+	// Restore the saved registers and the original SP.
+	MOVQ 0(SP),R11
+	MOVQ 8(SP),R12
+	MOVQ 16(SP),R13
+	MOVQ 24(SP),R14
+	MOVQ 32(SP),R15
+	MOVQ 40(SP),BX
+	MOVQ 48(SP),BP
+	MOVQ R11,SP
+	// NOTE(review): the Go prototype declares no results, so these two moves
+	// look like leftovers from the translated source - confirm before removing.
+	MOVQ DI,AX
+	MOVQ SI,DX
+	RET

+ 151 - 0
curve25519/square_amd64.s

@@ -0,0 +1,151 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This code was translated into a form compatible with 6a from the public
+// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
+
+// func square(out, in *[5]uint64)
+//
+// square sets out = in * in for a field element held as five 64-bit limbs in
+// radix 2^51 (REDMASK51 is 2^51-1). Cross terms are doubled, and terms whose
+// limb indices sum to five or more wrap around scaled by 19 (hence the
+// factors 19 and 38). Each result limb is accumulated as a 128-bit value in a
+// register pair (high:low):
+//
+//	r0 = R8:CX, r1 = R10:R9, r2 = R12:R11, r3 = R14:R13, r4 = BX:R15
+//
+// out may alias in: the stores to out happen only after the last read of in.
+//
+// The argument names used with FP match the Go declaration
+// (func square(out, in *[5]uint64)) so that go vet's asmdecl check passes.
+TEXT ·square(SB),7,$96-16
+	MOVQ out+0(FP), DI
+	MOVQ in+8(FP), SI
+
+	// Remember the incoming SP in R11, then move SP up to the next 32-byte
+	// boundary: SP = (SP &^ 31) + 32. The save area used below (offsets
+	// 0..55 from the new SP) still fits inside the 96-byte frame.
+	MOVQ SP,R11
+	MOVQ $31,CX
+	NOTQ CX
+	ANDQ CX,SP
+	ADDQ $32, SP
+
+	// Save the original SP and the registers used as accumulators so they
+	// can be restored before returning.
+	MOVQ R11,0(SP)
+	MOVQ R12,8(SP)
+	MOVQ R13,16(SP)
+	MOVQ R14,24(SP)
+	MOVQ R15,32(SP)
+	MOVQ BX,40(SP)
+	MOVQ BP,48(SP)
+	// r0 = a0*a0
+	MOVQ 0(SI),AX
+	MULQ 0(SI)
+	MOVQ AX,CX
+	MOVQ DX,R8
+	// r1 = 2*a0*a1
+	MOVQ 0(SI),AX
+	SHLQ $1,AX
+	MULQ 8(SI)
+	MOVQ AX,R9
+	MOVQ DX,R10
+	// r2 = 2*a0*a2
+	MOVQ 0(SI),AX
+	SHLQ $1,AX
+	MULQ 16(SI)
+	MOVQ AX,R11
+	MOVQ DX,R12
+	// r3 = 2*a0*a3
+	MOVQ 0(SI),AX
+	SHLQ $1,AX
+	MULQ 24(SI)
+	MOVQ AX,R13
+	MOVQ DX,R14
+	// r4 = 2*a0*a4
+	MOVQ 0(SI),AX
+	SHLQ $1,AX
+	MULQ 32(SI)
+	MOVQ AX,R15
+	MOVQ DX,BX
+	// r2 += a1*a1
+	MOVQ 8(SI),AX
+	MULQ 8(SI)
+	ADDQ AX,R11
+	ADCQ DX,R12
+	// r3 += 2*a1*a2
+	MOVQ 8(SI),AX
+	SHLQ $1,AX
+	MULQ 16(SI)
+	ADDQ AX,R13
+	ADCQ DX,R14
+	// r4 += 2*a1*a3
+	MOVQ 8(SI),AX
+	SHLQ $1,AX
+	MULQ 24(SI)
+	ADDQ AX,R15
+	ADCQ DX,BX
+	// r0 += 38*a1*a4
+	MOVQ 8(SI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 32(SI)
+	ADDQ AX,CX
+	ADCQ DX,R8
+	// r4 += a2*a2
+	MOVQ 16(SI),AX
+	MULQ 16(SI)
+	ADDQ AX,R15
+	ADCQ DX,BX
+	// r0 += 38*a2*a3
+	MOVQ 16(SI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 24(SI)
+	ADDQ AX,CX
+	ADCQ DX,R8
+	// r1 += 38*a2*a4
+	MOVQ 16(SI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 32(SI)
+	ADDQ AX,R9
+	ADCQ DX,R10
+	// r1 += 19*a3*a3
+	MOVQ 24(SI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 24(SI)
+	ADDQ AX,R9
+	ADCQ DX,R10
+	// r2 += 38*a3*a4
+	MOVQ 24(SI),DX
+	IMUL3Q $38,DX,AX
+	MULQ 32(SI)
+	ADDQ AX,R11
+	ADCQ DX,R12
+	// r3 += 19*a4*a4
+	MOVQ 32(SI),DX
+	IMUL3Q $19,DX,AX
+	MULQ 32(SI)
+	ADDQ AX,R13
+	ADCQ DX,R14
+	// All reads of the input are done; SI now holds the 51-bit limb mask.
+	// For each accumulator the double-width shift by 13 leaves everything
+	// above bit 50 in the high register, the low register is masked to 51
+	// bits, and the previous accumulator's high part is added in.
+	MOVQ ·REDMASK51(SB),SI
+	SHLQ $13,R8:CX
+	ANDQ SI,CX
+	SHLQ $13,R10:R9
+	ANDQ SI,R9
+	ADDQ R8,R9
+	SHLQ $13,R12:R11
+	ANDQ SI,R11
+	ADDQ R10,R11
+	SHLQ $13,R14:R13
+	ANDQ SI,R13
+	ADDQ R12,R13
+	SHLQ $13,BX:R15
+	ANDQ SI,R15
+	ADDQ R14,R15
+	// The high part of r4 wraps around into limb 0 scaled by 19.
+	IMUL3Q $19,BX,DX
+	ADDQ DX,CX
+	// Carry chain: DX carries the bits above bit 50 from one limb into the
+	// next. The masked limbs end up in CX, R8, R9, AX, R10 (limb 0..4).
+	MOVQ CX,DX
+	SHRQ $51,DX
+	ADDQ R9,DX
+	ANDQ SI,CX
+	MOVQ DX,R8
+	SHRQ $51,DX
+	ADDQ R11,DX
+	ANDQ SI,R8
+	MOVQ DX,R9
+	SHRQ $51,DX
+	ADDQ R13,DX
+	ANDQ SI,R9
+	MOVQ DX,AX
+	SHRQ $51,DX
+	ADDQ R15,DX
+	ANDQ SI,AX
+	MOVQ DX,R10
+	SHRQ $51,DX
+	// The carry out of limb 4 is folded into limb 0 scaled by 19; limb 0 is
+	// not masked again afterwards.
+	IMUL3Q $19,DX,DX
+	ADDQ DX,CX
+	ANDQ SI,R10
+	MOVQ CX,0(DI)
+	MOVQ R8,8(DI)
+	MOVQ R9,16(DI)
+	MOVQ AX,24(DI)
+	MOVQ R10,32(DI)
+	// Restore the saved registers and the original SP.
+	MOVQ 0(SP),R11
+	MOVQ 8(SP),R12
+	MOVQ 16(SP),R13
+	MOVQ 24(SP),R14
+	MOVQ 32(SP),R15
+	MOVQ 40(SP),BX
+	MOVQ 48(SP),BP
+	MOVQ R11,SP
+	// NOTE(review): the Go prototype declares no results, so these two moves
+	// look like leftovers from the translated source - confirm before removing.
+	MOVQ DI,AX
+	MOVQ SI,DX
+	RET