arm64/crypto: issue aese/aesmc instructions in pairs
This changes the AES core transform implementations to issue aese/aesmc (and aesd/aesimc) in pairs. This enables a micro-architectural optimization in recent Cortex-A5x cores that improves performance by 50-90%.

Measured performance in cycles per byte (Cortex-A57):

            CBC enc   CBC dec   CTR
  before    3.64      1.34      1.32
  after     1.95      0.85      0.93

Note that this results in a ~5% performance decrease for older cores.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
This commit is contained in:
Родитель
b63dbef93f
Коммит
4a97abd443
|
@ -101,19 +101,19 @@ ENTRY(ce_aes_ccm_final)
|
||||||
0: mov v4.16b, v3.16b
|
0: mov v4.16b, v3.16b
|
||||||
1: ld1 {v5.2d}, [x2], #16 /* load next round key */
|
1: ld1 {v5.2d}, [x2], #16 /* load next round key */
|
||||||
aese v0.16b, v4.16b
|
aese v0.16b, v4.16b
|
||||||
aese v1.16b, v4.16b
|
|
||||||
aesmc v0.16b, v0.16b
|
aesmc v0.16b, v0.16b
|
||||||
|
aese v1.16b, v4.16b
|
||||||
aesmc v1.16b, v1.16b
|
aesmc v1.16b, v1.16b
|
||||||
2: ld1 {v3.2d}, [x2], #16 /* load next round key */
|
2: ld1 {v3.2d}, [x2], #16 /* load next round key */
|
||||||
aese v0.16b, v5.16b
|
aese v0.16b, v5.16b
|
||||||
aese v1.16b, v5.16b
|
|
||||||
aesmc v0.16b, v0.16b
|
aesmc v0.16b, v0.16b
|
||||||
|
aese v1.16b, v5.16b
|
||||||
aesmc v1.16b, v1.16b
|
aesmc v1.16b, v1.16b
|
||||||
3: ld1 {v4.2d}, [x2], #16 /* load next round key */
|
3: ld1 {v4.2d}, [x2], #16 /* load next round key */
|
||||||
subs w3, w3, #3
|
subs w3, w3, #3
|
||||||
aese v0.16b, v3.16b
|
aese v0.16b, v3.16b
|
||||||
aese v1.16b, v3.16b
|
|
||||||
aesmc v0.16b, v0.16b
|
aesmc v0.16b, v0.16b
|
||||||
|
aese v1.16b, v3.16b
|
||||||
aesmc v1.16b, v1.16b
|
aesmc v1.16b, v1.16b
|
||||||
bpl 1b
|
bpl 1b
|
||||||
aese v0.16b, v4.16b
|
aese v0.16b, v4.16b
|
||||||
|
@ -146,19 +146,19 @@ ENDPROC(ce_aes_ccm_final)
|
||||||
ld1 {v5.2d}, [x10], #16 /* load 2nd round key */
|
ld1 {v5.2d}, [x10], #16 /* load 2nd round key */
|
||||||
2: /* inner loop: 3 rounds, 2x interleaved */
|
2: /* inner loop: 3 rounds, 2x interleaved */
|
||||||
aese v0.16b, v4.16b
|
aese v0.16b, v4.16b
|
||||||
aese v1.16b, v4.16b
|
|
||||||
aesmc v0.16b, v0.16b
|
aesmc v0.16b, v0.16b
|
||||||
|
aese v1.16b, v4.16b
|
||||||
aesmc v1.16b, v1.16b
|
aesmc v1.16b, v1.16b
|
||||||
3: ld1 {v3.2d}, [x10], #16 /* load next round key */
|
3: ld1 {v3.2d}, [x10], #16 /* load next round key */
|
||||||
aese v0.16b, v5.16b
|
aese v0.16b, v5.16b
|
||||||
aese v1.16b, v5.16b
|
|
||||||
aesmc v0.16b, v0.16b
|
aesmc v0.16b, v0.16b
|
||||||
|
aese v1.16b, v5.16b
|
||||||
aesmc v1.16b, v1.16b
|
aesmc v1.16b, v1.16b
|
||||||
4: ld1 {v4.2d}, [x10], #16 /* load next round key */
|
4: ld1 {v4.2d}, [x10], #16 /* load next round key */
|
||||||
subs w7, w7, #3
|
subs w7, w7, #3
|
||||||
aese v0.16b, v3.16b
|
aese v0.16b, v3.16b
|
||||||
aese v1.16b, v3.16b
|
|
||||||
aesmc v0.16b, v0.16b
|
aesmc v0.16b, v0.16b
|
||||||
|
aese v1.16b, v3.16b
|
||||||
aesmc v1.16b, v1.16b
|
aesmc v1.16b, v1.16b
|
||||||
ld1 {v5.2d}, [x10], #16 /* load next round key */
|
ld1 {v5.2d}, [x10], #16 /* load next round key */
|
||||||
bpl 2b
|
bpl 2b
|
||||||
|
|
|
@ -45,18 +45,14 @@
|
||||||
|
|
||||||
.macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
|
.macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
|
||||||
aes\de \i0\().16b, \k\().16b
|
aes\de \i0\().16b, \k\().16b
|
||||||
.ifnb \i1
|
|
||||||
aes\de \i1\().16b, \k\().16b
|
|
||||||
.ifnb \i3
|
|
||||||
aes\de \i2\().16b, \k\().16b
|
|
||||||
aes\de \i3\().16b, \k\().16b
|
|
||||||
.endif
|
|
||||||
.endif
|
|
||||||
aes\mc \i0\().16b, \i0\().16b
|
aes\mc \i0\().16b, \i0\().16b
|
||||||
.ifnb \i1
|
.ifnb \i1
|
||||||
|
aes\de \i1\().16b, \k\().16b
|
||||||
aes\mc \i1\().16b, \i1\().16b
|
aes\mc \i1\().16b, \i1\().16b
|
||||||
.ifnb \i3
|
.ifnb \i3
|
||||||
|
aes\de \i2\().16b, \k\().16b
|
||||||
aes\mc \i2\().16b, \i2\().16b
|
aes\mc \i2\().16b, \i2\().16b
|
||||||
|
aes\de \i3\().16b, \k\().16b
|
||||||
aes\mc \i3\().16b, \i3\().16b
|
aes\mc \i3\().16b, \i3\().16b
|
||||||
.endif
|
.endif
|
||||||
.endif
|
.endif
|
||||||
|
|
Загрузка…
Ссылка в новой задаче