x/crypto/chacha20: cleanup chacha_ppc64le.s
- Adding PCALIGN before the loops - Changing WORD directive with corresponding Vector Merge EVEN/ODD word instructions - Replacing Branch Conditional (BC) with its extended mnemonic form BDNZ - VPERMXOR instruction usage in place of VXOR instructions followed by VRLW (rotate left) for cases of rotating in multiples of 8. This replacements give performace improvement both in time and space of around 7%-8% as listed below using benchstat tool. goos: linux goarch: ppc64le pkg: golang.org/x/crypto/chacha20 cpu: POWER10 | chacha20.prev.out | chacha20.new.out | | sec/op | sec/op vs base | ChaCha20/64 171.9n ± 0% 156.6n ± 1% -8.90% (p=0.002 n=6) ChaCha20/256 165.5n ± 0% 152.4n ± 0% -7.92% (p=0.002 n=6) ChaCha20/10x25 505.8n ± 0% 504.3n ± 2% -0.32% (p=0.589 n=6) ChaCha20/4096 2.265µ ± 0% 2.052µ ± 0% -9.40% (p=0.002 n=6) ChaCha20/100x40 5.359µ ± 3% 5.018µ ± 2% -6.37% (p=0.002 n=6) ChaCha20/65536 35.71µ ± 0% 32.29µ ± 0% -9.57% (p=0.002 n=6) ChaCha20/1000x65 44.63µ ± 0% 41.05µ ± 0% -8.02% (p=0.002 n=6) geomean 2.235µ 2.073µ -7.26% | chacha20.prev.out | chacha20.new.out | | B/s | B/s vs base | ChaCha20/64 355.1Mi ± 0% 389.8Mi ± 1% +9.78% (p=0.002 n=6) ChaCha20/256 1.440Gi ± 0% 1.565Gi ± 0% +8.62% (p=0.002 n=6) ChaCha20/10x25 471.3Mi ± 0% 472.8Mi ± 2% +0.31% (p=0.589 n=6) ChaCha20/4096 1.684Gi ± 0% 1.859Gi ± 0% +10.38% (p=0.002 n=6) ChaCha20/100x40 711.8Mi ± 3% 760.3Mi ± 2% +6.80% (p=0.002 n=6) ChaCha20/65536 1.709Gi ± 0% 1.890Gi ± 0% +10.59% (p=0.002 n=6) ChaCha20/1000x65 1.356Gi ± 0% 1.475Gi ± 0% +8.72% (p=0.002 n=6) geomean 957.3Mi 1.008Gi +7.83% Change-Id: Ib31cb10a2a11eacdacf0272fbfd887eb5ccd8bcb Reviewed-on: https://go-review.googlesource.com/c/crypto/+/564797 Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Paul Murphy <murp@ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
Родитель
b91329d961
Коммит
8d0d405eed
|
@ -33,6 +33,9 @@
|
|||
#define CONSTBASE R16
|
||||
#define BLOCKS R17
|
||||
|
||||
// for VPERMXOR
|
||||
#define MASK R18
|
||||
|
||||
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
|
||||
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
|
||||
DATA consts<>+0x10(SB)/8, $0x0000000000000001
|
||||
|
@ -53,7 +56,11 @@ DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
|
|||
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
|
||||
DATA consts<>+0x90(SB)/8, $0x0000000100000000
|
||||
DATA consts<>+0x98(SB)/8, $0x0000000300000002
|
||||
GLOBL consts<>(SB), RODATA, $0xa0
|
||||
DATA consts<>+0xa0(SB)/8, $0x5566774411223300
|
||||
DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
|
||||
DATA consts<>+0xb0(SB)/8, $0x6677445522330011
|
||||
DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
|
||||
GLOBL consts<>(SB), RODATA, $0xc0
|
||||
|
||||
//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
|
||||
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
|
||||
|
@ -70,6 +77,9 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
|
|||
MOVD $48, R10
|
||||
MOVD $64, R11
|
||||
SRD $6, LEN, BLOCKS
|
||||
// for VPERMXOR
|
||||
MOVD $consts<>+0xa0(SB), MASK
|
||||
MOVD $16, R20
|
||||
// V16
|
||||
LXVW4X (CONSTBASE)(R0), VS48
|
||||
ADD $80,CONSTBASE
|
||||
|
@ -87,6 +97,10 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
|
|||
// V28
|
||||
LXVW4X (CONSTBASE)(R11), VS60
|
||||
|
||||
// Load mask constants for VPERMXOR
|
||||
LXVW4X (MASK)(R0), V20
|
||||
LXVW4X (MASK)(R20), V21
|
||||
|
||||
// splat slot from V19 -> V26
|
||||
VSPLTW $0, V19, V26
|
||||
|
||||
|
@ -97,7 +111,7 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
|
|||
|
||||
MOVD $10, R14
|
||||
MOVD R14, CTR
|
||||
|
||||
PCALIGN $16
|
||||
loop_outer_vsx:
|
||||
// V0, V1, V2, V3
|
||||
LXVW4X (R0)(CONSTBASE), VS32
|
||||
|
@ -128,22 +142,17 @@ loop_outer_vsx:
|
|||
VSPLTISW $12, V28
|
||||
VSPLTISW $8, V29
|
||||
VSPLTISW $7, V30
|
||||
|
||||
PCALIGN $16
|
||||
loop_vsx:
|
||||
VADDUWM V0, V4, V0
|
||||
VADDUWM V1, V5, V1
|
||||
VADDUWM V2, V6, V2
|
||||
VADDUWM V3, V7, V3
|
||||
|
||||
VXOR V12, V0, V12
|
||||
VXOR V13, V1, V13
|
||||
VXOR V14, V2, V14
|
||||
VXOR V15, V3, V15
|
||||
|
||||
VRLW V12, V27, V12
|
||||
VRLW V13, V27, V13
|
||||
VRLW V14, V27, V14
|
||||
VRLW V15, V27, V15
|
||||
VPERMXOR V12, V0, V21, V12
|
||||
VPERMXOR V13, V1, V21, V13
|
||||
VPERMXOR V14, V2, V21, V14
|
||||
VPERMXOR V15, V3, V21, V15
|
||||
|
||||
VADDUWM V8, V12, V8
|
||||
VADDUWM V9, V13, V9
|
||||
|
@ -165,15 +174,10 @@ loop_vsx:
|
|||
VADDUWM V2, V6, V2
|
||||
VADDUWM V3, V7, V3
|
||||
|
||||
VXOR V12, V0, V12
|
||||
VXOR V13, V1, V13
|
||||
VXOR V14, V2, V14
|
||||
VXOR V15, V3, V15
|
||||
|
||||
VRLW V12, V29, V12
|
||||
VRLW V13, V29, V13
|
||||
VRLW V14, V29, V14
|
||||
VRLW V15, V29, V15
|
||||
VPERMXOR V12, V0, V20, V12
|
||||
VPERMXOR V13, V1, V20, V13
|
||||
VPERMXOR V14, V2, V20, V14
|
||||
VPERMXOR V15, V3, V20, V15
|
||||
|
||||
VADDUWM V8, V12, V8
|
||||
VADDUWM V9, V13, V9
|
||||
|
@ -195,15 +199,10 @@ loop_vsx:
|
|||
VADDUWM V2, V7, V2
|
||||
VADDUWM V3, V4, V3
|
||||
|
||||
VXOR V15, V0, V15
|
||||
VXOR V12, V1, V12
|
||||
VXOR V13, V2, V13
|
||||
VXOR V14, V3, V14
|
||||
|
||||
VRLW V15, V27, V15
|
||||
VRLW V12, V27, V12
|
||||
VRLW V13, V27, V13
|
||||
VRLW V14, V27, V14
|
||||
VPERMXOR V15, V0, V21, V15
|
||||
VPERMXOR V12, V1, V21, V12
|
||||
VPERMXOR V13, V2, V21, V13
|
||||
VPERMXOR V14, V3, V21, V14
|
||||
|
||||
VADDUWM V10, V15, V10
|
||||
VADDUWM V11, V12, V11
|
||||
|
@ -225,15 +224,10 @@ loop_vsx:
|
|||
VADDUWM V2, V7, V2
|
||||
VADDUWM V3, V4, V3
|
||||
|
||||
VXOR V15, V0, V15
|
||||
VXOR V12, V1, V12
|
||||
VXOR V13, V2, V13
|
||||
VXOR V14, V3, V14
|
||||
|
||||
VRLW V15, V29, V15
|
||||
VRLW V12, V29, V12
|
||||
VRLW V13, V29, V13
|
||||
VRLW V14, V29, V14
|
||||
VPERMXOR V15, V0, V20, V15
|
||||
VPERMXOR V12, V1, V20, V12
|
||||
VPERMXOR V13, V2, V20, V13
|
||||
VPERMXOR V14, V3, V20, V14
|
||||
|
||||
VADDUWM V10, V15, V10
|
||||
VADDUWM V11, V12, V11
|
||||
|
@ -249,48 +243,48 @@ loop_vsx:
|
|||
VRLW V6, V30, V6
|
||||
VRLW V7, V30, V7
|
||||
VRLW V4, V30, V4
|
||||
BC 16, LT, loop_vsx
|
||||
BDNZ loop_vsx
|
||||
|
||||
VADDUWM V12, V26, V12
|
||||
|
||||
WORD $0x13600F8C // VMRGEW V0, V1, V27
|
||||
WORD $0x13821F8C // VMRGEW V2, V3, V28
|
||||
VMRGEW V0, V1, V27
|
||||
VMRGEW V2, V3, V28
|
||||
|
||||
WORD $0x10000E8C // VMRGOW V0, V1, V0
|
||||
WORD $0x10421E8C // VMRGOW V2, V3, V2
|
||||
VMRGOW V0, V1, V0
|
||||
VMRGOW V2, V3, V2
|
||||
|
||||
WORD $0x13A42F8C // VMRGEW V4, V5, V29
|
||||
WORD $0x13C63F8C // VMRGEW V6, V7, V30
|
||||
VMRGEW V4, V5, V29
|
||||
VMRGEW V6, V7, V30
|
||||
|
||||
XXPERMDI VS32, VS34, $0, VS33
|
||||
XXPERMDI VS32, VS34, $3, VS35
|
||||
XXPERMDI VS59, VS60, $0, VS32
|
||||
XXPERMDI VS59, VS60, $3, VS34
|
||||
|
||||
WORD $0x10842E8C // VMRGOW V4, V5, V4
|
||||
WORD $0x10C63E8C // VMRGOW V6, V7, V6
|
||||
VMRGOW V4, V5, V4
|
||||
VMRGOW V6, V7, V6
|
||||
|
||||
WORD $0x13684F8C // VMRGEW V8, V9, V27
|
||||
WORD $0x138A5F8C // VMRGEW V10, V11, V28
|
||||
VMRGEW V8, V9, V27
|
||||
VMRGEW V10, V11, V28
|
||||
|
||||
XXPERMDI VS36, VS38, $0, VS37
|
||||
XXPERMDI VS36, VS38, $3, VS39
|
||||
XXPERMDI VS61, VS62, $0, VS36
|
||||
XXPERMDI VS61, VS62, $3, VS38
|
||||
|
||||
WORD $0x11084E8C // VMRGOW V8, V9, V8
|
||||
WORD $0x114A5E8C // VMRGOW V10, V11, V10
|
||||
VMRGOW V8, V9, V8
|
||||
VMRGOW V10, V11, V10
|
||||
|
||||
WORD $0x13AC6F8C // VMRGEW V12, V13, V29
|
||||
WORD $0x13CE7F8C // VMRGEW V14, V15, V30
|
||||
VMRGEW V12, V13, V29
|
||||
VMRGEW V14, V15, V30
|
||||
|
||||
XXPERMDI VS40, VS42, $0, VS41
|
||||
XXPERMDI VS40, VS42, $3, VS43
|
||||
XXPERMDI VS59, VS60, $0, VS40
|
||||
XXPERMDI VS59, VS60, $3, VS42
|
||||
|
||||
WORD $0x118C6E8C // VMRGOW V12, V13, V12
|
||||
WORD $0x11CE7E8C // VMRGOW V14, V15, V14
|
||||
VMRGOW V12, V13, V12
|
||||
VMRGOW V14, V15, V14
|
||||
|
||||
VSPLTISW $4, V27
|
||||
VADDUWM V26, V27, V26
|
||||
|
@ -431,7 +425,7 @@ tail_vsx:
|
|||
ADD $-1, R11, R12
|
||||
ADD $-1, INP
|
||||
ADD $-1, OUT
|
||||
|
||||
PCALIGN $16
|
||||
looptail_vsx:
|
||||
// Copying the result to OUT
|
||||
// in bytes.
|
||||
|
@ -439,7 +433,7 @@ looptail_vsx:
|
|||
MOVBZU 1(INP), TMP
|
||||
XOR KEY, TMP, KEY
|
||||
MOVBU KEY, 1(OUT)
|
||||
BC 16, LT, looptail_vsx
|
||||
BDNZ looptail_vsx
|
||||
|
||||
// Clear the stack values
|
||||
STXVW4X VS48, (R11)(R0)
|
||||
|
|
Загрузка…
Ссылка в новой задаче