internal/chacha20: improve chacha20 performance on ppc64le
This improves the performance of the asm implementation for chacha20 on ppc64le by updating to the vsx implementation provided in cryptogams. The previous implementation was found not to perform as well as it could on power9. The new implementation improves performance on both power8 and power9.

Power9 improvement with this change, compared to the current implementation:

name               old time/op  new time/op  delta
ChaCha20/32-64     361ns ± 0%   225ns ± 0%   -37.67%  (p=1.000 n=1+1)
ChaCha20/63-64     364ns ± 0%   229ns ± 0%   -37.09%  (p=1.000 n=1+1)
ChaCha20/64-64     364ns ± 0%   231ns ± 0%   -36.54%  (p=1.000 n=1+1)
ChaCha20/256-64    332ns ± 0%   199ns ± 0%   -40.06%  (p=1.000 n=1+1)
ChaCha20/1024-64   1.24µs ± 0%  0.70µs ± 0%  -43.23%  (p=1.000 n=1+1)
ChaCha20/1350-64   1.89µs ± 0%  1.03µs ± 0%  -45.35%  (p=1.000 n=1+1)
ChaCha20/65536-64  77.0µs ± 0%  42.5µs ± 0%  -44.83%  (p=1.000 n=1+1)

This is discussed in issue golang/go#25051.

A few asm instructions (vmrgew and vmrgow) were only added in Go 1.14, so they are encoded using WORD for now.

Change-Id: I2b192a63cf46b0b20195e60e4412c43c5dd14ad8
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/195959
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
This commit is contained in:
Родитель
34f69633bf
Коммит
af544f31c8
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -2,7 +2,7 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !ppc64le,!arm64,!s390x arm64,!go1.11 gccgo appengine
|
||||
// +build !arm64,!s390x,!ppc64le arm64,!go1.11 gccgo appengine
|
||||
|
||||
package chacha20
|
||||
|
||||
|
|
|
@ -6,22 +6,24 @@
|
|||
|
||||
package chacha20
|
||||
|
||||
import "encoding/binary"
|
||||
|
||||
const (
|
||||
bufSize = 256
|
||||
haveAsm = true
|
||||
import (
|
||||
"encoding/binary"
|
||||
)
|
||||
|
||||
var haveAsm = true
|
||||
|
||||
const bufSize = 256
|
||||
|
||||
//go:noescape
|
||||
func chaCha20_ctr32_vmx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
|
||||
func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
|
||||
|
||||
func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
|
||||
// This implementation can handle buffers that aren't multiples of
|
||||
// 256.
|
||||
if len(src) >= bufSize {
|
||||
chaCha20_ctr32_vmx(&dst[0], &src[0], len(src)-len(src)%bufSize, &c.key, &c.counter)
|
||||
}
|
||||
if len(src)%bufSize != 0 {
|
||||
chaCha20_ctr32_vmx(&c.buf[0], &c.buf[0], bufSize, &c.key, &c.counter)
|
||||
chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter)
|
||||
} else if len(src)%bufSize != 0 {
|
||||
chaCha20_ctr32_vsx(&c.buf[0], &c.buf[0], bufSize, &c.key, &c.counter)
|
||||
start := len(src) - len(src)%bufSize
|
||||
ts, td, tb := src[start:], dst[start:], c.buf[:]
|
||||
// Unroll loop to XOR 32 bytes per iteration.
|
||||
|
@ -46,7 +48,6 @@ func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
|
|||
td[i] = tb[i] ^ v
|
||||
}
|
||||
c.len = bufSize - (len(src) % bufSize)
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче