Previously, the package worked by copying the input (or the output) into
a buffer, and then XOR'ing (or copying) it into (or out of) the state.
(Except for an input fast path.) There's no need for that! We can XOR
straight into the state, and copy straight out of it, at least on little
endian machines. This is a bit faster, almost halves the state size, and
will make it easier to implement marshaling, but most importantly look
at how much simpler it makes the code!

go: go1.23.0
goos: linux
goarch: amd64
pkg: golang.org/x/crypto/sha3
cpu: AMD Ryzen 7 PRO 8700GE w/ Radeon 780M Graphics
                      │ v0.27.0-2-g42ee18b9637 │    v0.27.0-2-g42ee18b9637-dirty    │
                      │         sec/op         │   sec/op     vs base               │
PermutationFunction-8              270.8n ± 0%   270.4n ± 0%       ~ (p=0.099 n=10)
Sha3_512_MTU-8                     5.762µ ± 0%   5.658µ ± 0%  -1.80% (p=0.000 n=10)
Sha3_384_MTU-8                     4.179µ ± 0%   4.070µ ± 0%  -2.60% (p=0.000 n=10)
Sha3_256_MTU-8                     3.316µ ± 0%   3.214µ ± 0%  -3.08% (p=0.000 n=10)
Sha3_224_MTU-8                     3.175µ ± 0%   3.061µ ± 0%  -3.61% (p=0.000 n=10)
Shake128_MTU-8                     2.779µ ± 0%   2.681µ ± 0%  -3.51% (p=0.000 n=10)
Shake256_MTU-8                     2.947µ ± 0%   2.957µ ± 0%  +0.32% (p=0.000 n=10)
Shake256_16x-8                     44.15µ ± 0%   44.45µ ± 0%  +0.67% (p=0.000 n=10)
Shake256_1MiB-8                    2.319m ± 0%   2.274m ± 0%  -1.93% (p=0.000 n=10)
Sha3_512_1MiB-8                    4.204m ± 0%   4.219m ± 0%  +0.34% (p=0.000 n=10)
geomean                            13.75µ        13.54µ       -1.55%

                      │ v0.27.0-2-g42ee18b9637 │    v0.27.0-2-g42ee18b9637-dirty     │
                      │          B/s           │     B/s       vs base               │
PermutationFunction-8             704.3Mi ± 0%   705.4Mi ± 0%       ~ (p=0.105 n=10)
Sha3_512_MTU-8                    223.5Mi ± 0%   227.6Mi ± 0%  +1.83% (p=0.000 n=10)
Sha3_384_MTU-8                    308.1Mi ± 0%   316.4Mi ± 0%  +2.67% (p=0.000 n=10)
Sha3_256_MTU-8                    388.2Mi ± 0%   400.5Mi ± 0%  +3.17% (p=0.000 n=10)
Sha3_224_MTU-8                    405.5Mi ± 0%   420.7Mi ± 0%  +3.73% (p=0.000 n=10)
Shake128_MTU-8                    463.4Mi ± 0%   480.2Mi ± 0%  +3.64% (p=0.000 n=10)
Shake256_MTU-8                    436.9Mi ± 0%   435.5Mi ± 0%  -0.32% (p=0.000 n=10)
Shake256_16x-8                    353.9Mi ± 0%   351.5Mi ± 0%  -0.66% (p=0.000 n=10)
Shake256_1MiB-8                   431.2Mi ± 0%   439.7Mi ± 0%  +1.97% (p=0.000 n=10)
Sha3_512_1MiB-8                   237.8Mi ± 0%   237.1Mi ± 0%  -0.33% (p=0.000 n=10)
geomean                           375.7Mi        381.6Mi       +1.57%

Even stronger effect when patched on top of CL 616555 (forced on).

go: go1.23.0
goos: darwin
goarch: arm64
pkg: golang.org/x/crypto/sha3
cpu: Apple M2
                      │    old    │               new                   │
                      │   sec/op  │     sec/op     vs base              │
PermutationFunction-8   154.7n ± 2%   153.8n ± 1%        ~ (p=0.469 n=10)
Sha3_512_MTU-8          3.260µ ± 2%   3.143µ ± 2%   -3.60% (p=0.000 n=10)
Sha3_384_MTU-8          2.389µ ± 2%   2.244µ ± 2%   -6.07% (p=0.000 n=10)
Sha3_256_MTU-8          1.950µ ± 2%   1.758µ ± 1%   -9.87% (p=0.000 n=10)
Sha3_224_MTU-8          1.874µ ± 2%   1.686µ ± 1%  -10.06% (p=0.000 n=10)
Shake128_MTU-8          1.827µ ± 3%   1.447µ ± 1%  -20.80% (p=0.000 n=10)
Shake256_MTU-8          1.665µ ± 3%   1.604µ ± 3%   -3.63% (p=0.003 n=10)
Shake256_16x-8          25.14µ ± 1%   25.23µ ± 2%        ~ (p=0.912 n=10)
Shake256_1MiB-8         1.236m ± 2%   1.243m ± 2%        ~ (p=0.631 n=10)
Sha3_512_1MiB-8         2.296m ± 2%   2.305m ± 1%        ~ (p=0.315 n=10)
geomean                 7.906µ        7.467µ        -5.56%

                      │    old    │               new                   │
                      │    B/s    │      B/s       vs base              │
PermutationFunction-8   1.204Gi ± 2%   1.212Gi ± 1%        ~ (p=0.529 n=10)
Sha3_512_MTU-8          394.9Mi ± 2%   409.7Mi ± 2%   +3.73% (p=0.000 n=10)
Sha3_384_MTU-8          539.0Mi ± 2%   573.8Mi ± 2%   +6.45% (p=0.000 n=10)
Sha3_256_MTU-8          660.3Mi ± 2%   732.6Mi ± 1%  +10.95% (p=0.000 n=10)
Sha3_224_MTU-8          687.1Mi ± 2%   763.9Mi ± 1%  +11.17% (p=0.000 n=10)
Shake128_MTU-8          704.7Mi ± 2%   889.6Mi ± 2%  +26.24% (p=0.000 n=10)
Shake256_MTU-8          773.4Mi ± 3%   802.5Mi ± 3%   +3.76% (p=0.004 n=10)
Shake256_16x-8          621.6Mi ± 1%   619.3Mi ± 2%        ~ (p=0.912 n=10)
Shake256_1MiB-8         809.1Mi ± 2%   804.7Mi ± 2%        ~ (p=0.631 n=10)
Sha3_512_1MiB-8         435.6Mi ± 2%   433.9Mi ± 1%        ~ (p=0.315 n=10)
geomean                 653.6Mi        692.0Mi        +5.88%

Change-Id: I33a0a1ddf305c395f99bf17f81473e2f42c5ce42
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/616575
Reviewed-by: Daniel McCarney <daniel@binaryparadox.net>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Andrew Ekstedt <andrew.ekstedt@gmail.com>
This commit is contained in:
Filippo Valsorda 2024-09-29 23:27:16 +02:00 коммит произвёл Gopher Robot
Родитель 7cfb9161e8
Коммит c17aa50fbd
2 изменённых файлов: 49 добавлений и 102 удалений

Просмотреть файл

@ -4,6 +4,14 @@
package sha3
import (
"crypto/subtle"
"encoding/binary"
"unsafe"
"golang.org/x/sys/cpu"
)
// spongeDirection indicates the direction bytes are flowing through the sponge.
type spongeDirection int
@ -14,16 +22,13 @@ const (
spongeSqueezing
)
const (
// maxRate is the maximum size of the internal buffer. SHAKE-256
// currently needs the largest buffer.
maxRate = 168
)
type state struct {
// Generic sponge components.
a [25]uint64 // main state of the hash
rate int // the number of bytes of state to use
a [1600 / 8]byte // main state of the hash
// a[n:rate] is the buffer. If absorbing, it's the remaining space to XOR
// into before running the permutation. If squeezing, it's the remaining
// output to produce before running the permutation.
n, rate int
// dsbyte contains the "domain separation" bits and the first bit of
// the padding. Sections 6.1 and 6.2 of [1] separate the outputs of the
@ -39,10 +44,6 @@ type state struct {
// Extendable-Output Functions (May 2014)"
dsbyte byte
i, n int // storage[i:n] is the buffer, i is only used while squeezing
storage [maxRate]byte
// Specific to SHA-3 and SHAKE.
outputLen int // the default output size in bytes
state spongeDirection // whether the sponge is absorbing or squeezing
}
@ -61,7 +62,7 @@ func (d *state) Reset() {
d.a[i] = 0
}
d.state = spongeAbsorbing
d.i, d.n = 0, 0
d.n = 0
}
func (d *state) clone() *state {
@ -69,22 +70,25 @@ func (d *state) clone() *state {
return &ret
}
// permute applies the KeccakF-1600 permutation. It handles
// any input-output buffering.
// permute applies the KeccakF-1600 permutation.
func (d *state) permute() {
switch d.state {
case spongeAbsorbing:
// If we're absorbing, we need to xor the input into the state
// before applying the permutation.
xorIn(d, d.storage[:d.rate])
d.n = 0
keccakF1600(&d.a)
case spongeSqueezing:
// If we're squeezing, we need to apply the permutation before
// copying more output.
keccakF1600(&d.a)
d.i = 0
copyOut(d, d.storage[:d.rate])
var a *[25]uint64
if cpu.IsBigEndian {
a = new([25]uint64)
for i := range a {
a[i] = binary.LittleEndian.Uint64(d.a[i*8:])
}
} else {
a = (*[25]uint64)(unsafe.Pointer(&d.a))
}
keccakF1600(a)
d.n = 0
if cpu.IsBigEndian {
for i := range a {
binary.LittleEndian.PutUint64(d.a[i*8:], a[i])
}
}
}
@ -92,53 +96,36 @@ func (d *state) permute() {
// the multi-bitrate 10..1 padding rule, and permutes the state.
func (d *state) padAndPermute() {
// Pad with this instance's domain-separator bits. We know that there's
// at least one byte of space in d.buf because, if it were full,
// at least one byte of space in the sponge because, if it were full,
// permute would have been called to empty it. dsbyte also contains the
// first one bit for the padding. See the comment in the state struct.
d.storage[d.n] = d.dsbyte
d.n++
for d.n < d.rate {
d.storage[d.n] = 0
d.n++
}
d.a[d.n] ^= d.dsbyte
// This adds the final one bit for the padding. Because of the way that
// bits are numbered from the LSB upwards, the final bit is the MSB of
// the last byte.
d.storage[d.rate-1] ^= 0x80
d.a[d.rate-1] ^= 0x80
// Apply the permutation
d.permute()
d.state = spongeSqueezing
d.n = d.rate
copyOut(d, d.storage[:d.rate])
}
// Write absorbs more data into the hash's state. It panics if any
// output has already been read.
func (d *state) Write(p []byte) (written int, err error) {
func (d *state) Write(p []byte) (n int, err error) {
if d.state != spongeAbsorbing {
panic("sha3: Write after Read")
}
written = len(p)
n = len(p)
for len(p) > 0 {
if d.n == 0 && len(p) >= d.rate {
// The fast path; absorb a full "rate" bytes of input and apply the permutation.
xorIn(d, p[:d.rate])
p = p[d.rate:]
keccakF1600(&d.a)
} else {
// The slow path; buffer the input until we can fill the sponge, and then xor it in.
todo := d.rate - d.n
if todo > len(p) {
todo = len(p)
}
d.n += copy(d.storage[d.n:], p[:todo])
p = p[todo:]
x := subtle.XORBytes(d.a[d.n:d.rate], d.a[d.n:d.rate], p)
d.n += x
p = p[x:]
// If the sponge is full, apply the permutation.
if d.n == d.rate {
d.permute()
}
// If the sponge is full, apply the permutation.
if d.n == d.rate {
d.permute()
}
}
@ -156,12 +143,12 @@ func (d *state) Read(out []byte) (n int, err error) {
// Now, do the squeezing.
for len(out) > 0 {
n := copy(out, d.storage[d.i:d.n])
d.i += n
out = out[n:]
x := copy(out, d.a[d.n:d.rate])
d.n += x
out = out[x:]
// Apply the permutation if we've squeezed the sponge dry.
if d.i == d.rate {
if d.n == d.rate {
d.permute()
}
}

Просмотреть файл

@ -1,40 +0,0 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package sha3
import (
"crypto/subtle"
"encoding/binary"
"unsafe"
"golang.org/x/sys/cpu"
)
// xorIn XORs the contents of buf into the sponge state d.a, treating buf as
// a sequence of little-endian 64-bit lanes.
func xorIn(d *state, buf []byte) {
	if !cpu.IsBigEndian {
		// Little-endian host: view the lanes as a flat byte array and XOR
		// the input straight into it.
		stateBytes := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a))
		subtle.XORBytes(stateBytes[:], stateBytes[:], buf)
		return
	}
	// Big-endian host: decode one 8-byte lane at a time. Trailing bytes that
	// do not fill a whole lane are not absorbed.
	for lane := 0; len(buf) >= 8; lane, buf = lane+1, buf[8:] {
		d.a[lane] ^= binary.LittleEndian.Uint64(buf)
	}
}
// copyOut writes the sponge state d.a into b as little-endian 64-bit lanes.
func copyOut(d *state, b []byte) {
	if !cpu.IsBigEndian {
		// Little-endian host: view the lanes as a flat byte array and copy
		// as many bytes as fit in b.
		stateBytes := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a))
		copy(b, stateBytes[:])
		return
	}
	// Big-endian host: encode one lane per 8 bytes of b, stopping once fewer
	// than 8 bytes of b remain.
	for lane := 0; len(b) >= 8; lane, b = lane+1, b[8:] {
		binary.LittleEndian.PutUint64(b, d.a[lane])
	}
}