Преглед на файлове

scrypt: 2x faster.

Work on uint32 slices instead of bytes.
Replace usage of Salsa20/8 from salsa package with
the specialized version.

benchmark       old ns/op    new ns/op    delta
BenchmarkKey    266430525    126657130  -52.46%

R=agl
CC=golang-dev
https://golang.org/cl/7139050
Dmitry Chestnykh преди 13 години
родител
ревизия
97488752ad
променени са 2 файла, в които са добавени 219 реда и са изтрити 37 реда
  1. 171 37
      scrypt/scrypt.go
  2. 48 0
      scrypt/scrypt_test.go

+ 171 - 37
scrypt/scrypt.go

@@ -9,69 +9,203 @@ package scrypt
 
 import (
 	"crypto/sha256"
-	"encoding/binary"
 	"errors"
 
 	"code.google.com/p/go.crypto/pbkdf2"
-	"code.google.com/p/go.crypto/salsa20/salsa"
 )
 
 const maxInt = int(^uint(0) >> 1)
 
-// blockCopy copies n bytes from src into dst.
-func blockCopy(dst, src []byte, n int) {
+// blockCopy copies n numbers from src into dst.
+func blockCopy(dst, src []uint32, n int) {
 	copy(dst, src[:n])
 }
 
-// blockXOR XORs bytes from dst with n bytes from src.
-func blockXOR(dst, src []byte, n int) {
+// blockXOR XORs the first n numbers of src into dst.
+func blockXOR(dst, src []uint32, n int) {
 	for i, v := range src[:n] {
 		dst[i] ^= v
 	}
 }
 
-func blockMix(b, y []byte, r int) {
-	var x [64]byte
-
-	blockCopy(x[:], b[(2*r-1)*64:], 64)
-
-	for i := 0; i < 2*r*64; i += 64 {
-		blockXOR(x[:], b[i:], 64)
-		salsa.Core208(&x, &x)
-		blockCopy(y[i:], x[:], 64)
-	}
-
-	for i := 0; i < r; i++ {
-		blockCopy(b[i*64:], y[i*2*64:], 64)
+// salsaXOR applies Salsa20/8 to the XOR of 16 numbers from tmp and in,
+// and puts the result into both tmp and out.
+func salsaXOR(tmp *[16]uint32, in, out []uint32) {
+	w0 := tmp[0] ^ in[0]
+	w1 := tmp[1] ^ in[1]
+	w2 := tmp[2] ^ in[2]
+	w3 := tmp[3] ^ in[3]
+	w4 := tmp[4] ^ in[4]
+	w5 := tmp[5] ^ in[5]
+	w6 := tmp[6] ^ in[6]
+	w7 := tmp[7] ^ in[7]
+	w8 := tmp[8] ^ in[8]
+	w9 := tmp[9] ^ in[9]
+	w10 := tmp[10] ^ in[10]
+	w11 := tmp[11] ^ in[11]
+	w12 := tmp[12] ^ in[12]
+	w13 := tmp[13] ^ in[13]
+	w14 := tmp[14] ^ in[14]
+	w15 := tmp[15] ^ in[15]
+
+	x0, x1, x2, x3, x4, x5, x6, x7, x8 := w0, w1, w2, w3, w4, w5, w6, w7, w8
+	x9, x10, x11, x12, x13, x14, x15 := w9, w10, w11, w12, w13, w14, w15
+
+	for i := 0; i < 8; i += 2 {
+		u := x0 + x12
+		x4 ^= u<<7 | u>>(32-7)
+		u = x4 + x0
+		x8 ^= u<<9 | u>>(32-9)
+		u = x8 + x4
+		x12 ^= u<<13 | u>>(32-13)
+		u = x12 + x8
+		x0 ^= u<<18 | u>>(32-18)
+
+		u = x5 + x1
+		x9 ^= u<<7 | u>>(32-7)
+		u = x9 + x5
+		x13 ^= u<<9 | u>>(32-9)
+		u = x13 + x9
+		x1 ^= u<<13 | u>>(32-13)
+		u = x1 + x13
+		x5 ^= u<<18 | u>>(32-18)
+
+		u = x10 + x6
+		x14 ^= u<<7 | u>>(32-7)
+		u = x14 + x10
+		x2 ^= u<<9 | u>>(32-9)
+		u = x2 + x14
+		x6 ^= u<<13 | u>>(32-13)
+		u = x6 + x2
+		x10 ^= u<<18 | u>>(32-18)
+
+		u = x15 + x11
+		x3 ^= u<<7 | u>>(32-7)
+		u = x3 + x15
+		x7 ^= u<<9 | u>>(32-9)
+		u = x7 + x3
+		x11 ^= u<<13 | u>>(32-13)
+		u = x11 + x7
+		x15 ^= u<<18 | u>>(32-18)
+
+		u = x0 + x3
+		x1 ^= u<<7 | u>>(32-7)
+		u = x1 + x0
+		x2 ^= u<<9 | u>>(32-9)
+		u = x2 + x1
+		x3 ^= u<<13 | u>>(32-13)
+		u = x3 + x2
+		x0 ^= u<<18 | u>>(32-18)
+
+		u = x5 + x4
+		x6 ^= u<<7 | u>>(32-7)
+		u = x6 + x5
+		x7 ^= u<<9 | u>>(32-9)
+		u = x7 + x6
+		x4 ^= u<<13 | u>>(32-13)
+		u = x4 + x7
+		x5 ^= u<<18 | u>>(32-18)
+
+		u = x10 + x9
+		x11 ^= u<<7 | u>>(32-7)
+		u = x11 + x10
+		x8 ^= u<<9 | u>>(32-9)
+		u = x8 + x11
+		x9 ^= u<<13 | u>>(32-13)
+		u = x9 + x8
+		x10 ^= u<<18 | u>>(32-18)
+
+		u = x15 + x14
+		x12 ^= u<<7 | u>>(32-7)
+		u = x12 + x15
+		x13 ^= u<<9 | u>>(32-9)
+		u = x13 + x12
+		x14 ^= u<<13 | u>>(32-13)
+		u = x14 + x13
+		x15 ^= u<<18 | u>>(32-18)
 	}
+	x0 += w0
+	x1 += w1
+	x2 += w2
+	x3 += w3
+	x4 += w4
+	x5 += w5
+	x6 += w6
+	x7 += w7
+	x8 += w8
+	x9 += w9
+	x10 += w10
+	x11 += w11
+	x12 += w12
+	x13 += w13
+	x14 += w14
+	x15 += w15
+
+	out[0], tmp[0] = x0, x0
+	out[1], tmp[1] = x1, x1
+	out[2], tmp[2] = x2, x2
+	out[3], tmp[3] = x3, x3
+	out[4], tmp[4] = x4, x4
+	out[5], tmp[5] = x5, x5
+	out[6], tmp[6] = x6, x6
+	out[7], tmp[7] = x7, x7
+	out[8], tmp[8] = x8, x8
+	out[9], tmp[9] = x9, x9
+	out[10], tmp[10] = x10, x10
+	out[11], tmp[11] = x11, x11
+	out[12], tmp[12] = x12, x12
+	out[13], tmp[13] = x13, x13
+	out[14], tmp[14] = x14, x14
+	out[15], tmp[15] = x15, x15
+}
 
-	for i := 0; i < r; i++ {
-		blockCopy(b[(i+r)*64:], y[(i*2+1)*64:], 64)
+func blockMix(tmp *[16]uint32, in, out []uint32, r int) {
+	blockCopy(tmp[:], in[(2*r-1)*16:], 16)
+	for i := 0; i < 2*r; i += 2 {
+		salsaXOR(tmp, in[i*16:], out[i*8:])
+		salsaXOR(tmp, in[i*16+16:], out[i*8+r*16:])
 	}
 }
 
-func integer(b []byte, r int) uint64 {
-	return binary.LittleEndian.Uint64(b[(2*r-1)*64:])
+func integer(b []uint32, r int) uint64 {
+	j := (2*r - 1) * 16
+	return uint64(b[j]) | uint64(b[j+1])<<32
 }
 
-func smix(b []byte, r, N int, v, xy []byte) {
+func smix(b []byte, r, N int, v, xy []uint32) {
+	var tmp [16]uint32
 	x := xy
-	y := xy[128*r:]
+	y := xy[32*r:]
 
-	blockCopy(x, b, 128*r)
-
-	for i := 0; i < N; i++ {
-		blockCopy(v[i*128*r:], x, 128*r)
-		blockMix(x, y, r)
+	j := 0
+	for i := 0; i < 32*r; i++ {
+		x[i] = uint32(b[j]) | uint32(b[j+1])<<8 | uint32(b[j+2])<<16 | uint32(b[j+3])<<24
+		j += 4
 	}
+	for i := 0; i < N; i += 2 {
+		blockCopy(v[i*(32*r):], x, 32*r)
+		blockMix(&tmp, x, y, r)
 
-	for i := 0; i < N; i++ {
-		j := int(integer(x, r) & uint64(N-1))
-		blockXOR(x, v[j*128*r:], 128*r)
-		blockMix(x, y, r)
+		blockCopy(v[(i+1)*(32*r):], y, 32*r)
+		blockMix(&tmp, y, x, r)
 	}
+	for i := 0; i < N; i += 2 {
+		j := int(integer(x, r) & uint64(N-1))
+		blockXOR(x, v[j*(32*r):], 32*r)
+		blockMix(&tmp, x, y, r)
 
-	blockCopy(b, x, 128*r)
+		j = int(integer(y, r) & uint64(N-1))
+		blockXOR(y, v[j*(32*r):], 32*r)
+		blockMix(&tmp, y, x, r)
+	}
+	j = 0
+	for _, v := range x[:32*r] {
+		b[j+0] = byte(v >> 0)
+		b[j+1] = byte(v >> 8)
+		b[j+2] = byte(v >> 16)
+		b[j+3] = byte(v >> 24)
+		j += 4
+	}
 }
 
 // Key derives a key from the password, salt, and cost parameters, returning
@@ -97,8 +231,8 @@ func Key(password, salt []byte, N, r, p, keyLen int) ([]byte, error) {
 		return nil, errors.New("scrypt: parameters are too large")
 	}
 
-	xy := make([]byte, 256*r)
-	v := make([]byte, 128*r*N)
+	xy := make([]uint32, 64*r)
+	v := make([]uint32, 32*N*r)
 	b := pbkdf2.Key(password, salt, 1, p*128*r, sha256.New)
 
 	for i := 0; i < p; i++ {

+ 48 - 0
scrypt/scrypt_test.go

@@ -17,6 +17,54 @@ type testVector struct {
 }
 
 var good = []testVector{
+	{
+		"password",
+		"salt",
+		2, 10, 10,
+		[]byte{
+			0x48, 0x2c, 0x85, 0x8e, 0x22, 0x90, 0x55, 0xe6, 0x2f,
+			0x41, 0xe0, 0xec, 0x81, 0x9a, 0x5e, 0xe1, 0x8b, 0xdb,
+			0x87, 0x25, 0x1a, 0x53, 0x4f, 0x75, 0xac, 0xd9, 0x5a,
+			0xc5, 0xe5, 0xa, 0xa1, 0x5f,
+		},
+	},
+	{
+		"password",
+		"salt",
+		16, 100, 100,
+		[]byte{
+			0x88, 0xbd, 0x5e, 0xdb, 0x52, 0xd1, 0xdd, 0x0, 0x18,
+			0x87, 0x72, 0xad, 0x36, 0x17, 0x12, 0x90, 0x22, 0x4e,
+			0x74, 0x82, 0x95, 0x25, 0xb1, 0x8d, 0x73, 0x23, 0xa5,
+			0x7f, 0x91, 0x96, 0x3c, 0x37,
+		},
+	},
+	{
+		"this is a long \000 password",
+		"and this is a long \000 salt",
+		16384, 8, 1,
+		[]byte{
+			0xc3, 0xf1, 0x82, 0xee, 0x2d, 0xec, 0x84, 0x6e, 0x70,
+			0xa6, 0x94, 0x2f, 0xb5, 0x29, 0x98, 0x5a, 0x3a, 0x09,
+			0x76, 0x5e, 0xf0, 0x4c, 0x61, 0x29, 0x23, 0xb1, 0x7f,
+			0x18, 0x55, 0x5a, 0x37, 0x07, 0x6d, 0xeb, 0x2b, 0x98,
+			0x30, 0xd6, 0x9d, 0xe5, 0x49, 0x26, 0x51, 0xe4, 0x50,
+			0x6a, 0xe5, 0x77, 0x6d, 0x96, 0xd4, 0x0f, 0x67, 0xaa,
+			0xee, 0x37, 0xe1, 0x77, 0x7b, 0x8a, 0xd5, 0xc3, 0x11,
+			0x14, 0x32, 0xbb, 0x3b, 0x6f, 0x7e, 0x12, 0x64, 0x40,
+			0x18, 0x79, 0xe6, 0x41, 0xae,
+		},
+	},
+	{
+		"p",
+		"s",
+		2, 1, 1,
+		[]byte{
+			0x48, 0xb0, 0xd2, 0xa8, 0xa3, 0x27, 0x26, 0x11, 0x98,
+			0x4c, 0x50, 0xeb, 0xd6, 0x30, 0xaf, 0x52,
+		},
+	},
+
 	{
 		"",
 		"",