sha3: avoid buffer copy
Previously, the package worked by copying the input (or the output) into a buffer, and then XOR'ing (or copying) it into (or out of) the state. (Except for an input fast path.) There's no need for that! We can XOR straight into the state, and copy straight out of it, at least on little endian machines. This is a bit faster, almost halves the state size, and will make it easier to implement marshaling, but most importantly look at how much simpler it makes the code! go: go1.23.0 goos: linux goarch: amd64 pkg: golang.org/x/crypto/sha3 cpu: AMD Ryzen 7 PRO 8700GE w/ Radeon 780M Graphics │ v0.27.0-2-g42ee18b9637 │ v0.27.0-2-g42ee18b9637-dirty │ │ sec/op │ sec/op vs base │ PermutationFunction-8 270.8n ± 0% 270.4n ± 0% ~ (p=0.099 n=10) Sha3_512_MTU-8 5.762µ ± 0% 5.658µ ± 0% -1.80% (p=0.000 n=10) Sha3_384_MTU-8 4.179µ ± 0% 4.070µ ± 0% -2.60% (p=0.000 n=10) Sha3_256_MTU-8 3.316µ ± 0% 3.214µ ± 0% -3.08% (p=0.000 n=10) Sha3_224_MTU-8 3.175µ ± 0% 3.061µ ± 0% -3.61% (p=0.000 n=10) Shake128_MTU-8 2.779µ ± 0% 2.681µ ± 0% -3.51% (p=0.000 n=10) Shake256_MTU-8 2.947µ ± 0% 2.957µ ± 0% +0.32% (p=0.000 n=10) Shake256_16x-8 44.15µ ± 0% 44.45µ ± 0% +0.67% (p=0.000 n=10) Shake256_1MiB-8 2.319m ± 0% 2.274m ± 0% -1.93% (p=0.000 n=10) Sha3_512_1MiB-8 4.204m ± 0% 4.219m ± 0% +0.34% (p=0.000 n=10) geomean 13.75µ 13.54µ -1.55% │ v0.27.0-2-g42ee18b9637 │ v0.27.0-2-g42ee18b9637-dirty │ │ B/s │ B/s vs base │ PermutationFunction-8 704.3Mi ± 0% 705.4Mi ± 0% ~ (p=0.105 n=10) Sha3_512_MTU-8 223.5Mi ± 0% 227.6Mi ± 0% +1.83% (p=0.000 n=10) Sha3_384_MTU-8 308.1Mi ± 0% 316.4Mi ± 0% +2.67% (p=0.000 n=10) Sha3_256_MTU-8 388.2Mi ± 0% 400.5Mi ± 0% +3.17% (p=0.000 n=10) Sha3_224_MTU-8 405.5Mi ± 0% 420.7Mi ± 0% +3.73% (p=0.000 n=10) Shake128_MTU-8 463.4Mi ± 0% 480.2Mi ± 0% +3.64% (p=0.000 n=10) Shake256_MTU-8 436.9Mi ± 0% 435.5Mi ± 0% -0.32% (p=0.000 n=10) Shake256_16x-8 353.9Mi ± 0% 351.5Mi ± 0% -0.66% (p=0.000 n=10) Shake256_1MiB-8 431.2Mi ± 0% 439.7Mi ± 0% +1.97% (p=0.000 n=10) Sha3_512_1MiB-8 237.8Mi ± 0% 237.1Mi ± 0% -0.33% (p=0.000 n=10) geomean 375.7Mi 381.6Mi +1.57% Even stronger effect when patched on top of CL 616555 (forced on). go: go1.23.0 goos: darwin goarch: arm64 pkg: golang.org/x/crypto/sha3 cpu: Apple M2 │ old │ new │ │ sec/op │ sec/op vs base │ PermutationFunction-8 154.7n ± 2% 153.8n ± 1% ~ (p=0.469 n=10) Sha3_512_MTU-8 3.260µ ± 2% 3.143µ ± 2% -3.60% (p=0.000 n=10) Sha3_384_MTU-8 2.389µ ± 2% 2.244µ ± 2% -6.07% (p=0.000 n=10) Sha3_256_MTU-8 1.950µ ± 2% 1.758µ ± 1% -9.87% (p=0.000 n=10) Sha3_224_MTU-8 1.874µ ± 2% 1.686µ ± 1% -10.06% (p=0.000 n=10) Shake128_MTU-8 1.827µ ± 3% 1.447µ ± 1% -20.80% (p=0.000 n=10) Shake256_MTU-8 1.665µ ± 3% 1.604µ ± 3% -3.63% (p=0.003 n=10) Shake256_16x-8 25.14µ ± 1% 25.23µ ± 2% ~ (p=0.912 n=10) Shake256_1MiB-8 1.236m ± 2% 1.243m ± 2% ~ (p=0.631 n=10) Sha3_512_1MiB-8 2.296m ± 2% 2.305m ± 1% ~ (p=0.315 n=10) geomean 7.906µ 7.467µ -5.56% │ old │ new │ │ B/op │ B/op vs base │ PermutationFunction-8 1.204Gi ± 2% 1.212Gi ± 1% ~ (p=0.529 n=10) Sha3_512_MTU-8 394.9Mi ± 2% 409.7Mi ± 2% +3.73% (p=0.000 n=10) Sha3_384_MTU-8 539.0Mi ± 2% 573.8Mi ± 2% +6.45% (p=0.000 n=10) Sha3_256_MTU-8 660.3Mi ± 2% 732.6Mi ± 1% +10.95% (p=0.000 n=10) Sha3_224_MTU-8 687.1Mi ± 2% 763.9Mi ± 1% +11.17% (p=0.000 n=10) Shake128_MTU-8 704.7Mi ± 2% 889.6Mi ± 2% +26.24% (p=0.000 n=10) Shake256_MTU-8 773.4Mi ± 3% 802.5Mi ± 3% +3.76% (p=0.004 n=10) Shake256_16x-8 621.6Mi ± 1% 619.3Mi ± 2% ~ (p=0.912 n=10) Shake256_1MiB-8 809.1Mi ± 2% 804.7Mi ± 2% ~ (p=0.631 n=10) Sha3_512_1MiB-8 435.6Mi ± 2% 433.9Mi ± 1% ~ (p=0.315 n=10) geomean 653.6Mi 692.0Mi +5.88% Change-Id: I33a0a1ddf305c395f99bf17f81473e2f42c5ce42 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/616575 Reviewed-by: Daniel McCarney <daniel@binaryparadox.net> Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: Roland Shoemaker <roland@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Filippo Valsorda <filippo@golang.org> Reviewed-by: Andrew Ekstedt <andrew.ekstedt@gmail.com>
This commit is contained in:
Родитель
7cfb9161e8
Коммит
c17aa50fbd
111
sha3/sha3.go
111
sha3/sha3.go
|
@ -4,6 +4,14 @@
|
|||
|
||||
package sha3
|
||||
|
||||
import (
|
||||
"crypto/subtle"
|
||||
"encoding/binary"
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/sys/cpu"
|
||||
)
|
||||
|
||||
// spongeDirection indicates the direction bytes are flowing through the sponge.
|
||||
type spongeDirection int
|
||||
|
||||
|
@ -14,16 +22,13 @@ const (
|
|||
spongeSqueezing
|
||||
)
|
||||
|
||||
const (
|
||||
// maxRate is the maximum size of the internal buffer. SHAKE-256
|
||||
// currently needs the largest buffer.
|
||||
maxRate = 168
|
||||
)
|
||||
|
||||
type state struct {
|
||||
// Generic sponge components.
|
||||
a [25]uint64 // main state of the hash
|
||||
rate int // the number of bytes of state to use
|
||||
a [1600 / 8]byte // main state of the hash
|
||||
|
||||
// a[n:rate] is the buffer. If absorbing, it's the remaining space to XOR
|
||||
// into before running the permutation. If squeezing, it's the remaining
|
||||
// output to produce before running the permutation.
|
||||
n, rate int
|
||||
|
||||
// dsbyte contains the "domain separation" bits and the first bit of
|
||||
// the padding. Sections 6.1 and 6.2 of [1] separate the outputs of the
|
||||
|
@ -39,10 +44,6 @@ type state struct {
|
|||
// Extendable-Output Functions (May 2014)"
|
||||
dsbyte byte
|
||||
|
||||
i, n int // storage[i:n] is the buffer, i is only used while squeezing
|
||||
storage [maxRate]byte
|
||||
|
||||
// Specific to SHA-3 and SHAKE.
|
||||
outputLen int // the default output size in bytes
|
||||
state spongeDirection // whether the sponge is absorbing or squeezing
|
||||
}
|
||||
|
@ -61,7 +62,7 @@ func (d *state) Reset() {
|
|||
d.a[i] = 0
|
||||
}
|
||||
d.state = spongeAbsorbing
|
||||
d.i, d.n = 0, 0
|
||||
d.n = 0
|
||||
}
|
||||
|
||||
func (d *state) clone() *state {
|
||||
|
@ -69,22 +70,25 @@ func (d *state) clone() *state {
|
|||
return &ret
|
||||
}
|
||||
|
||||
// permute applies the KeccakF-1600 permutation. It handles
|
||||
// any input-output buffering.
|
||||
// permute applies the KeccakF-1600 permutation.
|
||||
func (d *state) permute() {
|
||||
switch d.state {
|
||||
case spongeAbsorbing:
|
||||
// If we're absorbing, we need to xor the input into the state
|
||||
// before applying the permutation.
|
||||
xorIn(d, d.storage[:d.rate])
|
||||
d.n = 0
|
||||
keccakF1600(&d.a)
|
||||
case spongeSqueezing:
|
||||
// If we're squeezing, we need to apply the permutation before
|
||||
// copying more output.
|
||||
keccakF1600(&d.a)
|
||||
d.i = 0
|
||||
copyOut(d, d.storage[:d.rate])
|
||||
var a *[25]uint64
|
||||
if cpu.IsBigEndian {
|
||||
a = new([25]uint64)
|
||||
for i := range a {
|
||||
a[i] = binary.LittleEndian.Uint64(d.a[i*8:])
|
||||
}
|
||||
} else {
|
||||
a = (*[25]uint64)(unsafe.Pointer(&d.a))
|
||||
}
|
||||
|
||||
keccakF1600(a)
|
||||
d.n = 0
|
||||
|
||||
if cpu.IsBigEndian {
|
||||
for i := range a {
|
||||
binary.LittleEndian.PutUint64(d.a[i*8:], a[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -92,53 +96,36 @@ func (d *state) permute() {
|
|||
// the multi-bitrate 10..1 padding rule, and permutes the state.
|
||||
func (d *state) padAndPermute() {
|
||||
// Pad with this instance's domain-separator bits. We know that there's
|
||||
// at least one byte of space in d.buf because, if it were full,
|
||||
// at least one byte of space in the sponge because, if it were full,
|
||||
// permute would have been called to empty it. dsbyte also contains the
|
||||
// first one bit for the padding. See the comment in the state struct.
|
||||
d.storage[d.n] = d.dsbyte
|
||||
d.n++
|
||||
for d.n < d.rate {
|
||||
d.storage[d.n] = 0
|
||||
d.n++
|
||||
}
|
||||
d.a[d.n] ^= d.dsbyte
|
||||
// This adds the final one bit for the padding. Because of the way that
|
||||
// bits are numbered from the LSB upwards, the final bit is the MSB of
|
||||
// the last byte.
|
||||
d.storage[d.rate-1] ^= 0x80
|
||||
d.a[d.rate-1] ^= 0x80
|
||||
// Apply the permutation
|
||||
d.permute()
|
||||
d.state = spongeSqueezing
|
||||
d.n = d.rate
|
||||
copyOut(d, d.storage[:d.rate])
|
||||
}
|
||||
|
||||
// Write absorbs more data into the hash's state. It panics if any
|
||||
// output has already been read.
|
||||
func (d *state) Write(p []byte) (written int, err error) {
|
||||
func (d *state) Write(p []byte) (n int, err error) {
|
||||
if d.state != spongeAbsorbing {
|
||||
panic("sha3: Write after Read")
|
||||
}
|
||||
written = len(p)
|
||||
|
||||
n = len(p)
|
||||
|
||||
for len(p) > 0 {
|
||||
if d.n == 0 && len(p) >= d.rate {
|
||||
// The fast path; absorb a full "rate" bytes of input and apply the permutation.
|
||||
xorIn(d, p[:d.rate])
|
||||
p = p[d.rate:]
|
||||
keccakF1600(&d.a)
|
||||
} else {
|
||||
// The slow path; buffer the input until we can fill the sponge, and then xor it in.
|
||||
todo := d.rate - d.n
|
||||
if todo > len(p) {
|
||||
todo = len(p)
|
||||
}
|
||||
d.n += copy(d.storage[d.n:], p[:todo])
|
||||
p = p[todo:]
|
||||
x := subtle.XORBytes(d.a[d.n:d.rate], d.a[d.n:d.rate], p)
|
||||
d.n += x
|
||||
p = p[x:]
|
||||
|
||||
// If the sponge is full, apply the permutation.
|
||||
if d.n == d.rate {
|
||||
d.permute()
|
||||
}
|
||||
// If the sponge is full, apply the permutation.
|
||||
if d.n == d.rate {
|
||||
d.permute()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -156,12 +143,12 @@ func (d *state) Read(out []byte) (n int, err error) {
|
|||
|
||||
// Now, do the squeezing.
|
||||
for len(out) > 0 {
|
||||
n := copy(out, d.storage[d.i:d.n])
|
||||
d.i += n
|
||||
out = out[n:]
|
||||
x := copy(out, d.a[d.n:d.rate])
|
||||
d.n += x
|
||||
out = out[x:]
|
||||
|
||||
// Apply the permutation if we've squeezed the sponge dry.
|
||||
if d.i == d.rate {
|
||||
if d.n == d.rate {
|
||||
d.permute()
|
||||
}
|
||||
}
|
||||
|
|
40
sha3/xor.go
40
sha3/xor.go
|
@ -1,40 +0,0 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package sha3
|
||||
|
||||
import (
|
||||
"crypto/subtle"
|
||||
"encoding/binary"
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/sys/cpu"
|
||||
)
|
||||
|
||||
// xorIn xors the bytes in buf into the state.
|
||||
func xorIn(d *state, buf []byte) {
|
||||
if cpu.IsBigEndian {
|
||||
for i := 0; len(buf) >= 8; i++ {
|
||||
a := binary.LittleEndian.Uint64(buf)
|
||||
d.a[i] ^= a
|
||||
buf = buf[8:]
|
||||
}
|
||||
} else {
|
||||
ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a))
|
||||
subtle.XORBytes(ab[:], ab[:], buf)
|
||||
}
|
||||
}
|
||||
|
||||
// copyOut copies uint64s to a byte buffer.
|
||||
func copyOut(d *state, b []byte) {
|
||||
if cpu.IsBigEndian {
|
||||
for i := 0; len(b) >= 8; i++ {
|
||||
binary.LittleEndian.PutUint64(b, d.a[i])
|
||||
b = b[8:]
|
||||
}
|
||||
} else {
|
||||
ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a))
|
||||
copy(b, ab[:])
|
||||
}
|
||||
}
|
Загрузка…
Ссылка в новой задаче