Browse Source

go.crypto/sha3: optimize round function

benchmark                        old ns/op     new ns/op     delta
BenchmarkPermutationFunction     1128          733           -35.02%
BenchmarkShake256_1MiB           10004381      6832715       -31.70%

benchmark                        old MB/s     new MB/s     speedup
BenchmarkPermutationFunction     177.23       272.54       1.54x
BenchmarkShake256_1MiB           104.81       153.46       1.46x

LGTM=agl
R=golang-codereviews, agl
CC=golang-codereviews
https://golang.org/cl/156770043
Eric Roshan-Eisner 11 years ago
parent
commit
9b55b542f6
1 changed files with 366 additions and 115 deletions
  1. 366 115
      sha3/keccakf.go

+ 366 - 115
sha3/keccakf.go

@@ -35,125 +35,376 @@ var rc = [24]uint64{
 // keccakF1600 applies the Keccak permutation to a 1600b-wide
 // state represented as a slice of 25 uint64s.
 func keccakF1600(a *[25]uint64) {
-	var t, bc0, bc1, bc2, bc3, bc4 uint64
-	for _, roundConstant := range rc {
-		// θ step
+	// Implementation translated from Keccak-inplace.c
+	// in the keccak reference code.
+	var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64
+
+	for i := 0; i < 24; i += 4 {
+		// Combines the 5 steps in each round into 2 steps.
+		// Unrolls 4 rounds per loop and spreads some steps across rounds.
+
+		// Round 1
+		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
+		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
+		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
+		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
+		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
+		d0 = bc4 ^ (bc1<<1 | bc1>>63)
+		d1 = bc0 ^ (bc2<<1 | bc2>>63)
+		d2 = bc1 ^ (bc3<<1 | bc3>>63)
+		d3 = bc2 ^ (bc4<<1 | bc4>>63)
+		d4 = bc3 ^ (bc0<<1 | bc0>>63)
+
+		bc0 = a[0] ^ d0
+		t = a[6] ^ d1
+		bc1 = t<<44 | t>>(64-44)
+		t = a[12] ^ d2
+		bc2 = t<<43 | t>>(64-43)
+		t = a[18] ^ d3
+		bc3 = t<<21 | t>>(64-21)
+		t = a[24] ^ d4
+		bc4 = t<<14 | t>>(64-14)
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i]
+		a[6] = bc1 ^ (bc3 &^ bc2)
+		a[12] = bc2 ^ (bc4 &^ bc3)
+		a[18] = bc3 ^ (bc0 &^ bc4)
+		a[24] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[10] ^ d0
+		bc2 = t<<3 | t>>(64-3)
+		t = a[16] ^ d1
+		bc3 = t<<45 | t>>(64-45)
+		t = a[22] ^ d2
+		bc4 = t<<61 | t>>(64-61)
+		t = a[3] ^ d3
+		bc0 = t<<28 | t>>(64-28)
+		t = a[9] ^ d4
+		bc1 = t<<20 | t>>(64-20)
+		a[10] = bc0 ^ (bc2 &^ bc1)
+		a[16] = bc1 ^ (bc3 &^ bc2)
+		a[22] = bc2 ^ (bc4 &^ bc3)
+		a[3] = bc3 ^ (bc0 &^ bc4)
+		a[9] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[20] ^ d0
+		bc4 = t<<18 | t>>(64-18)
+		t = a[1] ^ d1
+		bc0 = t<<1 | t>>(64-1)
+		t = a[7] ^ d2
+		bc1 = t<<6 | t>>(64-6)
+		t = a[13] ^ d3
+		bc2 = t<<25 | t>>(64-25)
+		t = a[19] ^ d4
+		bc3 = t<<8 | t>>(64-8)
+		a[20] = bc0 ^ (bc2 &^ bc1)
+		a[1] = bc1 ^ (bc3 &^ bc2)
+		a[7] = bc2 ^ (bc4 &^ bc3)
+		a[13] = bc3 ^ (bc0 &^ bc4)
+		a[19] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[5] ^ d0
+		bc1 = t<<36 | t>>(64-36)
+		t = a[11] ^ d1
+		bc2 = t<<10 | t>>(64-10)
+		t = a[17] ^ d2
+		bc3 = t<<15 | t>>(64-15)
+		t = a[23] ^ d3
+		bc4 = t<<56 | t>>(64-56)
+		t = a[4] ^ d4
+		bc0 = t<<27 | t>>(64-27)
+		a[5] = bc0 ^ (bc2 &^ bc1)
+		a[11] = bc1 ^ (bc3 &^ bc2)
+		a[17] = bc2 ^ (bc4 &^ bc3)
+		a[23] = bc3 ^ (bc0 &^ bc4)
+		a[4] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[15] ^ d0
+		bc3 = t<<41 | t>>(64-41)
+		t = a[21] ^ d1
+		bc4 = t<<2 | t>>(64-2)
+		t = a[2] ^ d2
+		bc0 = t<<62 | t>>(64-62)
+		t = a[8] ^ d3
+		bc1 = t<<55 | t>>(64-55)
+		t = a[14] ^ d4
+		bc2 = t<<39 | t>>(64-39)
+		a[15] = bc0 ^ (bc2 &^ bc1)
+		a[21] = bc1 ^ (bc3 &^ bc2)
+		a[2] = bc2 ^ (bc4 &^ bc3)
+		a[8] = bc3 ^ (bc0 &^ bc4)
+		a[14] = bc4 ^ (bc1 &^ bc0)
+
+		// Round 2
+		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
+		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
+		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
+		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
+		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
+		d0 = bc4 ^ (bc1<<1 | bc1>>63)
+		d1 = bc0 ^ (bc2<<1 | bc2>>63)
+		d2 = bc1 ^ (bc3<<1 | bc3>>63)
+		d3 = bc2 ^ (bc4<<1 | bc4>>63)
+		d4 = bc3 ^ (bc0<<1 | bc0>>63)
+
+		bc0 = a[0] ^ d0
+		t = a[16] ^ d1
+		bc1 = t<<44 | t>>(64-44)
+		t = a[7] ^ d2
+		bc2 = t<<43 | t>>(64-43)
+		t = a[23] ^ d3
+		bc3 = t<<21 | t>>(64-21)
+		t = a[14] ^ d4
+		bc4 = t<<14 | t>>(64-14)
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+1]
+		a[16] = bc1 ^ (bc3 &^ bc2)
+		a[7] = bc2 ^ (bc4 &^ bc3)
+		a[23] = bc3 ^ (bc0 &^ bc4)
+		a[14] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[20] ^ d0
+		bc2 = t<<3 | t>>(64-3)
+		t = a[11] ^ d1
+		bc3 = t<<45 | t>>(64-45)
+		t = a[2] ^ d2
+		bc4 = t<<61 | t>>(64-61)
+		t = a[18] ^ d3
+		bc0 = t<<28 | t>>(64-28)
+		t = a[9] ^ d4
+		bc1 = t<<20 | t>>(64-20)
+		a[20] = bc0 ^ (bc2 &^ bc1)
+		a[11] = bc1 ^ (bc3 &^ bc2)
+		a[2] = bc2 ^ (bc4 &^ bc3)
+		a[18] = bc3 ^ (bc0 &^ bc4)
+		a[9] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[15] ^ d0
+		bc4 = t<<18 | t>>(64-18)
+		t = a[6] ^ d1
+		bc0 = t<<1 | t>>(64-1)
+		t = a[22] ^ d2
+		bc1 = t<<6 | t>>(64-6)
+		t = a[13] ^ d3
+		bc2 = t<<25 | t>>(64-25)
+		t = a[4] ^ d4
+		bc3 = t<<8 | t>>(64-8)
+		a[15] = bc0 ^ (bc2 &^ bc1)
+		a[6] = bc1 ^ (bc3 &^ bc2)
+		a[22] = bc2 ^ (bc4 &^ bc3)
+		a[13] = bc3 ^ (bc0 &^ bc4)
+		a[4] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[10] ^ d0
+		bc1 = t<<36 | t>>(64-36)
+		t = a[1] ^ d1
+		bc2 = t<<10 | t>>(64-10)
+		t = a[17] ^ d2
+		bc3 = t<<15 | t>>(64-15)
+		t = a[8] ^ d3
+		bc4 = t<<56 | t>>(64-56)
+		t = a[24] ^ d4
+		bc0 = t<<27 | t>>(64-27)
+		a[10] = bc0 ^ (bc2 &^ bc1)
+		a[1] = bc1 ^ (bc3 &^ bc2)
+		a[17] = bc2 ^ (bc4 &^ bc3)
+		a[8] = bc3 ^ (bc0 &^ bc4)
+		a[24] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[5] ^ d0
+		bc3 = t<<41 | t>>(64-41)
+		t = a[21] ^ d1
+		bc4 = t<<2 | t>>(64-2)
+		t = a[12] ^ d2
+		bc0 = t<<62 | t>>(64-62)
+		t = a[3] ^ d3
+		bc1 = t<<55 | t>>(64-55)
+		t = a[19] ^ d4
+		bc2 = t<<39 | t>>(64-39)
+		a[5] = bc0 ^ (bc2 &^ bc1)
+		a[21] = bc1 ^ (bc3 &^ bc2)
+		a[12] = bc2 ^ (bc4 &^ bc3)
+		a[3] = bc3 ^ (bc0 &^ bc4)
+		a[19] = bc4 ^ (bc1 &^ bc0)
+
+		// Round 3
+		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
+		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
+		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
+		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
+		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
+		d0 = bc4 ^ (bc1<<1 | bc1>>63)
+		d1 = bc0 ^ (bc2<<1 | bc2>>63)
+		d2 = bc1 ^ (bc3<<1 | bc3>>63)
+		d3 = bc2 ^ (bc4<<1 | bc4>>63)
+		d4 = bc3 ^ (bc0<<1 | bc0>>63)
+
+		bc0 = a[0] ^ d0
+		t = a[11] ^ d1
+		bc1 = t<<44 | t>>(64-44)
+		t = a[22] ^ d2
+		bc2 = t<<43 | t>>(64-43)
+		t = a[8] ^ d3
+		bc3 = t<<21 | t>>(64-21)
+		t = a[19] ^ d4
+		bc4 = t<<14 | t>>(64-14)
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+2]
+		a[11] = bc1 ^ (bc3 &^ bc2)
+		a[22] = bc2 ^ (bc4 &^ bc3)
+		a[8] = bc3 ^ (bc0 &^ bc4)
+		a[19] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[15] ^ d0
+		bc2 = t<<3 | t>>(64-3)
+		t = a[1] ^ d1
+		bc3 = t<<45 | t>>(64-45)
+		t = a[12] ^ d2
+		bc4 = t<<61 | t>>(64-61)
+		t = a[23] ^ d3
+		bc0 = t<<28 | t>>(64-28)
+		t = a[9] ^ d4
+		bc1 = t<<20 | t>>(64-20)
+		a[15] = bc0 ^ (bc2 &^ bc1)
+		a[1] = bc1 ^ (bc3 &^ bc2)
+		a[12] = bc2 ^ (bc4 &^ bc3)
+		a[23] = bc3 ^ (bc0 &^ bc4)
+		a[9] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[5] ^ d0
+		bc4 = t<<18 | t>>(64-18)
+		t = a[16] ^ d1
+		bc0 = t<<1 | t>>(64-1)
+		t = a[2] ^ d2
+		bc1 = t<<6 | t>>(64-6)
+		t = a[13] ^ d3
+		bc2 = t<<25 | t>>(64-25)
+		t = a[24] ^ d4
+		bc3 = t<<8 | t>>(64-8)
+		a[5] = bc0 ^ (bc2 &^ bc1)
+		a[16] = bc1 ^ (bc3 &^ bc2)
+		a[2] = bc2 ^ (bc4 &^ bc3)
+		a[13] = bc3 ^ (bc0 &^ bc4)
+		a[24] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[20] ^ d0
+		bc1 = t<<36 | t>>(64-36)
+		t = a[6] ^ d1
+		bc2 = t<<10 | t>>(64-10)
+		t = a[17] ^ d2
+		bc3 = t<<15 | t>>(64-15)
+		t = a[3] ^ d3
+		bc4 = t<<56 | t>>(64-56)
+		t = a[14] ^ d4
+		bc0 = t<<27 | t>>(64-27)
+		a[20] = bc0 ^ (bc2 &^ bc1)
+		a[6] = bc1 ^ (bc3 &^ bc2)
+		a[17] = bc2 ^ (bc4 &^ bc3)
+		a[3] = bc3 ^ (bc0 &^ bc4)
+		a[14] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[10] ^ d0
+		bc3 = t<<41 | t>>(64-41)
+		t = a[21] ^ d1
+		bc4 = t<<2 | t>>(64-2)
+		t = a[7] ^ d2
+		bc0 = t<<62 | t>>(64-62)
+		t = a[18] ^ d3
+		bc1 = t<<55 | t>>(64-55)
+		t = a[4] ^ d4
+		bc2 = t<<39 | t>>(64-39)
+		a[10] = bc0 ^ (bc2 &^ bc1)
+		a[21] = bc1 ^ (bc3 &^ bc2)
+		a[7] = bc2 ^ (bc4 &^ bc3)
+		a[18] = bc3 ^ (bc0 &^ bc4)
+		a[4] = bc4 ^ (bc1 &^ bc0)
+
+		// Round 4
 		bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20]
 		bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21]
 		bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22]
 		bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23]
 		bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24]
-		t = bc4 ^ (bc1<<1 ^ bc1>>63)
-		a[0] ^= t
-		a[5] ^= t
-		a[10] ^= t
-		a[15] ^= t
-		a[20] ^= t
-		t = bc0 ^ (bc2<<1 ^ bc2>>63)
-		a[1] ^= t
-		a[6] ^= t
-		a[11] ^= t
-		a[16] ^= t
-		a[21] ^= t
-		t = bc1 ^ (bc3<<1 ^ bc3>>63)
-		a[2] ^= t
-		a[7] ^= t
-		a[12] ^= t
-		a[17] ^= t
-		a[22] ^= t
-		t = bc2 ^ (bc4<<1 ^ bc4>>63)
-		a[3] ^= t
-		a[8] ^= t
-		a[13] ^= t
-		a[18] ^= t
-		a[23] ^= t
-		t = bc3 ^ (bc0<<1 ^ bc0>>63)
-		a[4] ^= t
-		a[9] ^= t
-		a[14] ^= t
-		a[19] ^= t
-		a[24] ^= t
-
-		// ρ and π steps
-		t = a[1]
-		t, a[10] = a[10], t<<1^t>>(64-1)
-		t, a[7] = a[7], t<<3^t>>(64-3)
-		t, a[11] = a[11], t<<6^t>>(64-6)
-		t, a[17] = a[17], t<<10^t>>(64-10)
-		t, a[18] = a[18], t<<15^t>>(64-15)
-		t, a[3] = a[3], t<<21^t>>(64-21)
-		t, a[5] = a[5], t<<28^t>>(64-28)
-		t, a[16] = a[16], t<<36^t>>(64-36)
-		t, a[8] = a[8], t<<45^t>>(64-45)
-		t, a[21] = a[21], t<<55^t>>(64-55)
-		t, a[24] = a[24], t<<2^t>>(64-2)
-		t, a[4] = a[4], t<<14^t>>(64-14)
-		t, a[15] = a[15], t<<27^t>>(64-27)
-		t, a[23] = a[23], t<<41^t>>(64-41)
-		t, a[19] = a[19], t<<56^t>>(64-56)
-		t, a[13] = a[13], t<<8^t>>(64-8)
-		t, a[12] = a[12], t<<25^t>>(64-25)
-		t, a[2] = a[2], t<<43^t>>(64-43)
-		t, a[20] = a[20], t<<62^t>>(64-62)
-		t, a[14] = a[14], t<<18^t>>(64-18)
-		t, a[22] = a[22], t<<39^t>>(64-39)
-		t, a[9] = a[9], t<<61^t>>(64-61)
-		t, a[6] = a[6], t<<20^t>>(64-20)
-		a[1] = t<<44 ^ t>>(64-44)
-
-		// χ step
-		bc0 = a[0]
-		bc1 = a[1]
-		bc2 = a[2]
-		bc3 = a[3]
-		bc4 = a[4]
-		a[0] ^= bc2 &^ bc1
-		a[1] ^= bc3 &^ bc2
-		a[2] ^= bc4 &^ bc3
-		a[3] ^= bc0 &^ bc4
-		a[4] ^= bc1 &^ bc0
-		bc0 = a[5]
-		bc1 = a[6]
-		bc2 = a[7]
-		bc3 = a[8]
-		bc4 = a[9]
-		a[5] ^= bc2 &^ bc1
-		a[6] ^= bc3 &^ bc2
-		a[7] ^= bc4 &^ bc3
-		a[8] ^= bc0 &^ bc4
-		a[9] ^= bc1 &^ bc0
-		bc0 = a[10]
-		bc1 = a[11]
-		bc2 = a[12]
-		bc3 = a[13]
-		bc4 = a[14]
-		a[10] ^= bc2 &^ bc1
-		a[11] ^= bc3 &^ bc2
-		a[12] ^= bc4 &^ bc3
-		a[13] ^= bc0 &^ bc4
-		a[14] ^= bc1 &^ bc0
-		bc0 = a[15]
-		bc1 = a[16]
-		bc2 = a[17]
-		bc3 = a[18]
-		bc4 = a[19]
-		a[15] ^= bc2 &^ bc1
-		a[16] ^= bc3 &^ bc2
-		a[17] ^= bc4 &^ bc3
-		a[18] ^= bc0 &^ bc4
-		a[19] ^= bc1 &^ bc0
-		bc0 = a[20]
-		bc1 = a[21]
-		bc2 = a[22]
-		bc3 = a[23]
-		bc4 = a[24]
-		a[20] ^= bc2 &^ bc1
-		a[21] ^= bc3 &^ bc2
-		a[22] ^= bc4 &^ bc3
-		a[23] ^= bc0 &^ bc4
-		a[24] ^= bc1 &^ bc0
-
-		// ι step
-		a[0] ^= roundConstant
+		d0 = bc4 ^ (bc1<<1 | bc1>>63)
+		d1 = bc0 ^ (bc2<<1 | bc2>>63)
+		d2 = bc1 ^ (bc3<<1 | bc3>>63)
+		d3 = bc2 ^ (bc4<<1 | bc4>>63)
+		d4 = bc3 ^ (bc0<<1 | bc0>>63)
+
+		bc0 = a[0] ^ d0
+		t = a[1] ^ d1
+		bc1 = t<<44 | t>>(64-44)
+		t = a[2] ^ d2
+		bc2 = t<<43 | t>>(64-43)
+		t = a[3] ^ d3
+		bc3 = t<<21 | t>>(64-21)
+		t = a[4] ^ d4
+		bc4 = t<<14 | t>>(64-14)
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+3]
+		a[1] = bc1 ^ (bc3 &^ bc2)
+		a[2] = bc2 ^ (bc4 &^ bc3)
+		a[3] = bc3 ^ (bc0 &^ bc4)
+		a[4] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[5] ^ d0
+		bc2 = t<<3 | t>>(64-3)
+		t = a[6] ^ d1
+		bc3 = t<<45 | t>>(64-45)
+		t = a[7] ^ d2
+		bc4 = t<<61 | t>>(64-61)
+		t = a[8] ^ d3
+		bc0 = t<<28 | t>>(64-28)
+		t = a[9] ^ d4
+		bc1 = t<<20 | t>>(64-20)
+		a[5] = bc0 ^ (bc2 &^ bc1)
+		a[6] = bc1 ^ (bc3 &^ bc2)
+		a[7] = bc2 ^ (bc4 &^ bc3)
+		a[8] = bc3 ^ (bc0 &^ bc4)
+		a[9] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[10] ^ d0
+		bc4 = t<<18 | t>>(64-18)
+		t = a[11] ^ d1
+		bc0 = t<<1 | t>>(64-1)
+		t = a[12] ^ d2
+		bc1 = t<<6 | t>>(64-6)
+		t = a[13] ^ d3
+		bc2 = t<<25 | t>>(64-25)
+		t = a[14] ^ d4
+		bc3 = t<<8 | t>>(64-8)
+		a[10] = bc0 ^ (bc2 &^ bc1)
+		a[11] = bc1 ^ (bc3 &^ bc2)
+		a[12] = bc2 ^ (bc4 &^ bc3)
+		a[13] = bc3 ^ (bc0 &^ bc4)
+		a[14] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[15] ^ d0
+		bc1 = t<<36 | t>>(64-36)
+		t = a[16] ^ d1
+		bc2 = t<<10 | t>>(64-10)
+		t = a[17] ^ d2
+		bc3 = t<<15 | t>>(64-15)
+		t = a[18] ^ d3
+		bc4 = t<<56 | t>>(64-56)
+		t = a[19] ^ d4
+		bc0 = t<<27 | t>>(64-27)
+		a[15] = bc0 ^ (bc2 &^ bc1)
+		a[16] = bc1 ^ (bc3 &^ bc2)
+		a[17] = bc2 ^ (bc4 &^ bc3)
+		a[18] = bc3 ^ (bc0 &^ bc4)
+		a[19] = bc4 ^ (bc1 &^ bc0)
+
+		t = a[20] ^ d0
+		bc3 = t<<41 | t>>(64-41)
+		t = a[21] ^ d1
+		bc4 = t<<2 | t>>(64-2)
+		t = a[22] ^ d2
+		bc0 = t<<62 | t>>(64-62)
+		t = a[23] ^ d3
+		bc1 = t<<55 | t>>(64-55)
+		t = a[24] ^ d4
+		bc2 = t<<39 | t>>(64-39)
+		a[20] = bc0 ^ (bc2 &^ bc1)
+		a[21] = bc1 ^ (bc3 &^ bc2)
+		a[22] = bc2 ^ (bc4 &^ bc3)
+		a[23] = bc3 ^ (bc0 &^ bc4)
+		a[24] = bc4 ^ (bc1 &^ bc0)
 	}
 }