crypto: arm64/aes-blk - revert NEON yield for skciphers

The reasoning of commit f10dc56c64 ("crypto: arm64 - revert NEON yield
for fast AEAD implementations") applies equally to skciphers: the walk
API already guarantees that the input size of each call into the NEON
code is bounded to the size of a page, and so there is no need for an
additional TIF_NEED_RESCHED flag check inside the inner loop. So revert
the skcipher changes to aes-modes.S (but retain the mac ones).

This partially reverts commit 0c8f838a52.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Ard Biesheuvel 2018-09-10 16:41:13 +02:00 committed by Herbert Xu
Parent 557ecb4543
Commit 6e7de6af91
1 changed file with 108 additions and 173 deletions

View file

@ -14,12 +14,12 @@
.align 4 .align 4
aes_encrypt_block4x: aes_encrypt_block4x:
encrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7 encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret ret
ENDPROC(aes_encrypt_block4x) ENDPROC(aes_encrypt_block4x)
aes_decrypt_block4x: aes_decrypt_block4x:
decrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7 decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret ret
ENDPROC(aes_decrypt_block4x) ENDPROC(aes_decrypt_block4x)
@ -31,71 +31,57 @@ ENDPROC(aes_decrypt_block4x)
*/ */
AES_ENTRY(aes_ecb_encrypt) AES_ENTRY(aes_ecb_encrypt)
frame_push 5 stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0 enc_prepare w3, x2, x5
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
.Lecbencrestart:
enc_prepare w22, x21, x5
.LecbencloopNx: .LecbencloopNx:
subs w23, w23, #4 subs w4, w4, #4
bmi .Lecbenc1x bmi .Lecbenc1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
bl aes_encrypt_block4x bl aes_encrypt_block4x
st1 {v0.16b-v3.16b}, [x19], #64 st1 {v0.16b-v3.16b}, [x0], #64
cond_yield_neon .Lecbencrestart
b .LecbencloopNx b .LecbencloopNx
.Lecbenc1x: .Lecbenc1x:
adds w23, w23, #4 adds w4, w4, #4
beq .Lecbencout beq .Lecbencout
.Lecbencloop: .Lecbencloop:
ld1 {v0.16b}, [x20], #16 /* get next pt block */ ld1 {v0.16b}, [x1], #16 /* get next pt block */
encrypt_block v0, w22, x21, x5, w6 encrypt_block v0, w3, x2, x5, w6
st1 {v0.16b}, [x19], #16 st1 {v0.16b}, [x0], #16
subs w23, w23, #1 subs w4, w4, #1
bne .Lecbencloop bne .Lecbencloop
.Lecbencout: .Lecbencout:
frame_pop ldp x29, x30, [sp], #16
ret ret
AES_ENDPROC(aes_ecb_encrypt) AES_ENDPROC(aes_ecb_encrypt)
AES_ENTRY(aes_ecb_decrypt) AES_ENTRY(aes_ecb_decrypt)
frame_push 5 stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0 dec_prepare w3, x2, x5
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
.Lecbdecrestart:
dec_prepare w22, x21, x5
.LecbdecloopNx: .LecbdecloopNx:
subs w23, w23, #4 subs w4, w4, #4
bmi .Lecbdec1x bmi .Lecbdec1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
bl aes_decrypt_block4x bl aes_decrypt_block4x
st1 {v0.16b-v3.16b}, [x19], #64 st1 {v0.16b-v3.16b}, [x0], #64
cond_yield_neon .Lecbdecrestart
b .LecbdecloopNx b .LecbdecloopNx
.Lecbdec1x: .Lecbdec1x:
adds w23, w23, #4 adds w4, w4, #4
beq .Lecbdecout beq .Lecbdecout
.Lecbdecloop: .Lecbdecloop:
ld1 {v0.16b}, [x20], #16 /* get next ct block */ ld1 {v0.16b}, [x1], #16 /* get next ct block */
decrypt_block v0, w22, x21, x5, w6 decrypt_block v0, w3, x2, x5, w6
st1 {v0.16b}, [x19], #16 st1 {v0.16b}, [x0], #16
subs w23, w23, #1 subs w4, w4, #1
bne .Lecbdecloop bne .Lecbdecloop
.Lecbdecout: .Lecbdecout:
frame_pop ldp x29, x30, [sp], #16
ret ret
AES_ENDPROC(aes_ecb_decrypt) AES_ENDPROC(aes_ecb_decrypt)
@ -108,100 +94,78 @@ AES_ENDPROC(aes_ecb_decrypt)
*/ */
AES_ENTRY(aes_cbc_encrypt) AES_ENTRY(aes_cbc_encrypt)
frame_push 6 ld1 {v4.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
.Lcbcencrestart:
ld1 {v4.16b}, [x24] /* get iv */
enc_prepare w22, x21, x6
.Lcbcencloop4x: .Lcbcencloop4x:
subs w23, w23, #4 subs w4, w4, #4
bmi .Lcbcenc1x bmi .Lcbcenc1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */ eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */
encrypt_block v0, w22, x21, x6, w7 encrypt_block v0, w3, x2, x6, w7
eor v1.16b, v1.16b, v0.16b eor v1.16b, v1.16b, v0.16b
encrypt_block v1, w22, x21, x6, w7 encrypt_block v1, w3, x2, x6, w7
eor v2.16b, v2.16b, v1.16b eor v2.16b, v2.16b, v1.16b
encrypt_block v2, w22, x21, x6, w7 encrypt_block v2, w3, x2, x6, w7
eor v3.16b, v3.16b, v2.16b eor v3.16b, v3.16b, v2.16b
encrypt_block v3, w22, x21, x6, w7 encrypt_block v3, w3, x2, x6, w7
st1 {v0.16b-v3.16b}, [x19], #64 st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v3.16b mov v4.16b, v3.16b
st1 {v4.16b}, [x24] /* return iv */
cond_yield_neon .Lcbcencrestart
b .Lcbcencloop4x b .Lcbcencloop4x
.Lcbcenc1x: .Lcbcenc1x:
adds w23, w23, #4 adds w4, w4, #4
beq .Lcbcencout beq .Lcbcencout
.Lcbcencloop: .Lcbcencloop:
ld1 {v0.16b}, [x20], #16 /* get next pt block */ ld1 {v0.16b}, [x1], #16 /* get next pt block */
eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */ eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */
encrypt_block v4, w22, x21, x6, w7 encrypt_block v4, w3, x2, x6, w7
st1 {v4.16b}, [x19], #16 st1 {v4.16b}, [x0], #16
subs w23, w23, #1 subs w4, w4, #1
bne .Lcbcencloop bne .Lcbcencloop
.Lcbcencout: .Lcbcencout:
st1 {v4.16b}, [x24] /* return iv */ st1 {v4.16b}, [x5] /* return iv */
frame_pop
ret ret
AES_ENDPROC(aes_cbc_encrypt) AES_ENDPROC(aes_cbc_encrypt)
AES_ENTRY(aes_cbc_decrypt) AES_ENTRY(aes_cbc_decrypt)
frame_push 6 stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0 ld1 {v7.16b}, [x5] /* get iv */
mov x20, x1 dec_prepare w3, x2, x6
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
.Lcbcdecrestart:
ld1 {v7.16b}, [x24] /* get iv */
dec_prepare w22, x21, x6
.LcbcdecloopNx: .LcbcdecloopNx:
subs w23, w23, #4 subs w4, w4, #4
bmi .Lcbcdec1x bmi .Lcbcdec1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
mov v4.16b, v0.16b mov v4.16b, v0.16b
mov v5.16b, v1.16b mov v5.16b, v1.16b
mov v6.16b, v2.16b mov v6.16b, v2.16b
bl aes_decrypt_block4x bl aes_decrypt_block4x
sub x20, x20, #16 sub x1, x1, #16
eor v0.16b, v0.16b, v7.16b eor v0.16b, v0.16b, v7.16b
eor v1.16b, v1.16b, v4.16b eor v1.16b, v1.16b, v4.16b
ld1 {v7.16b}, [x20], #16 /* reload 1 ct block */ ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
eor v2.16b, v2.16b, v5.16b eor v2.16b, v2.16b, v5.16b
eor v3.16b, v3.16b, v6.16b eor v3.16b, v3.16b, v6.16b
st1 {v0.16b-v3.16b}, [x19], #64 st1 {v0.16b-v3.16b}, [x0], #64
st1 {v7.16b}, [x24] /* return iv */
cond_yield_neon .Lcbcdecrestart
b .LcbcdecloopNx b .LcbcdecloopNx
.Lcbcdec1x: .Lcbcdec1x:
adds w23, w23, #4 adds w4, w4, #4
beq .Lcbcdecout beq .Lcbcdecout
.Lcbcdecloop: .Lcbcdecloop:
ld1 {v1.16b}, [x20], #16 /* get next ct block */ ld1 {v1.16b}, [x1], #16 /* get next ct block */
mov v0.16b, v1.16b /* ...and copy to v0 */ mov v0.16b, v1.16b /* ...and copy to v0 */
decrypt_block v0, w22, x21, x6, w7 decrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
mov v7.16b, v1.16b /* ct is next iv */ mov v7.16b, v1.16b /* ct is next iv */
st1 {v0.16b}, [x19], #16 st1 {v0.16b}, [x0], #16
subs w23, w23, #1 subs w4, w4, #1
bne .Lcbcdecloop bne .Lcbcdecloop
.Lcbcdecout: .Lcbcdecout:
st1 {v7.16b}, [x24] /* return iv */ st1 {v7.16b}, [x5] /* return iv */
frame_pop ldp x29, x30, [sp], #16
ret ret
AES_ENDPROC(aes_cbc_decrypt) AES_ENDPROC(aes_cbc_decrypt)
@ -212,26 +176,19 @@ AES_ENDPROC(aes_cbc_decrypt)
*/ */
AES_ENTRY(aes_ctr_encrypt) AES_ENTRY(aes_ctr_encrypt)
frame_push 6 stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0 enc_prepare w3, x2, x6
mov x20, x1 ld1 {v4.16b}, [x5]
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
.Lctrrestart:
enc_prepare w22, x21, x6
ld1 {v4.16b}, [x24]
umov x6, v4.d[1] /* keep swabbed ctr in reg */ umov x6, v4.d[1] /* keep swabbed ctr in reg */
rev x6, x6 rev x6, x6
cmn w6, w4 /* 32 bit overflow? */
bcs .Lctrloop
.LctrloopNx: .LctrloopNx:
subs w23, w23, #4 subs w4, w4, #4
bmi .Lctr1x bmi .Lctr1x
cmn w6, #4 /* 32 bit overflow? */
bcs .Lctr1x
add w7, w6, #1 add w7, w6, #1
mov v0.16b, v4.16b mov v0.16b, v4.16b
add w8, w6, #2 add w8, w6, #2
@ -245,27 +202,25 @@ AES_ENTRY(aes_ctr_encrypt)
rev w9, w9 rev w9, w9
mov v2.s[3], w8 mov v2.s[3], w8
mov v3.s[3], w9 mov v3.s[3], w9
ld1 {v5.16b-v7.16b}, [x20], #48 /* get 3 input blocks */ ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
bl aes_encrypt_block4x bl aes_encrypt_block4x
eor v0.16b, v5.16b, v0.16b eor v0.16b, v5.16b, v0.16b
ld1 {v5.16b}, [x20], #16 /* get 1 input block */ ld1 {v5.16b}, [x1], #16 /* get 1 input block */
eor v1.16b, v6.16b, v1.16b eor v1.16b, v6.16b, v1.16b
eor v2.16b, v7.16b, v2.16b eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b eor v3.16b, v5.16b, v3.16b
st1 {v0.16b-v3.16b}, [x19], #64 st1 {v0.16b-v3.16b}, [x0], #64
add x6, x6, #4 add x6, x6, #4
rev x7, x6 rev x7, x6
ins v4.d[1], x7 ins v4.d[1], x7
cbz w23, .Lctrout cbz w4, .Lctrout
st1 {v4.16b}, [x24] /* return next CTR value */
cond_yield_neon .Lctrrestart
b .LctrloopNx b .LctrloopNx
.Lctr1x: .Lctr1x:
adds w23, w23, #4 adds w4, w4, #4
beq .Lctrout beq .Lctrout
.Lctrloop: .Lctrloop:
mov v0.16b, v4.16b mov v0.16b, v4.16b
encrypt_block v0, w22, x21, x8, w7 encrypt_block v0, w3, x2, x8, w7
adds x6, x6, #1 /* increment BE ctr */ adds x6, x6, #1 /* increment BE ctr */
rev x7, x6 rev x7, x6
@ -273,22 +228,22 @@ AES_ENTRY(aes_ctr_encrypt)
bcs .Lctrcarry /* overflow? */ bcs .Lctrcarry /* overflow? */
.Lctrcarrydone: .Lctrcarrydone:
subs w23, w23, #1 subs w4, w4, #1
bmi .Lctrtailblock /* blocks <0 means tail block */ bmi .Lctrtailblock /* blocks <0 means tail block */
ld1 {v3.16b}, [x20], #16 ld1 {v3.16b}, [x1], #16
eor v3.16b, v0.16b, v3.16b eor v3.16b, v0.16b, v3.16b
st1 {v3.16b}, [x19], #16 st1 {v3.16b}, [x0], #16
bne .Lctrloop bne .Lctrloop
.Lctrout: .Lctrout:
st1 {v4.16b}, [x24] /* return next CTR value */ st1 {v4.16b}, [x5] /* return next CTR value */
.Lctrret: ldp x29, x30, [sp], #16
frame_pop
ret ret
.Lctrtailblock: .Lctrtailblock:
st1 {v0.16b}, [x19] st1 {v0.16b}, [x0]
b .Lctrret ldp x29, x30, [sp], #16
ret
.Lctrcarry: .Lctrcarry:
umov x7, v4.d[0] /* load upper word of ctr */ umov x7, v4.d[0] /* load upper word of ctr */
@ -321,16 +276,10 @@ CPU_LE( .quad 1, 0x87 )
CPU_BE( .quad 0x87, 1 ) CPU_BE( .quad 0x87, 1 )
AES_ENTRY(aes_xts_encrypt) AES_ENTRY(aes_xts_encrypt)
frame_push 6 stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0 ld1 {v4.16b}, [x6]
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x6
ld1 {v4.16b}, [x24]
cbz w7, .Lxtsencnotfirst cbz w7, .Lxtsencnotfirst
enc_prepare w3, x5, x8 enc_prepare w3, x5, x8
@ -339,17 +288,15 @@ AES_ENTRY(aes_xts_encrypt)
ldr q7, .Lxts_mul_x ldr q7, .Lxts_mul_x
b .LxtsencNx b .LxtsencNx
.Lxtsencrestart:
ld1 {v4.16b}, [x24]
.Lxtsencnotfirst: .Lxtsencnotfirst:
enc_prepare w22, x21, x8 enc_prepare w3, x2, x8
.LxtsencloopNx: .LxtsencloopNx:
ldr q7, .Lxts_mul_x ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8 next_tweak v4, v4, v7, v8
.LxtsencNx: .LxtsencNx:
subs w23, w23, #4 subs w4, w4, #4
bmi .Lxtsenc1x bmi .Lxtsenc1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
next_tweak v5, v4, v7, v8 next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b eor v0.16b, v0.16b, v4.16b
next_tweak v6, v5, v7, v8 next_tweak v6, v5, v7, v8
@ -362,43 +309,35 @@ AES_ENTRY(aes_xts_encrypt)
eor v0.16b, v0.16b, v4.16b eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b eor v2.16b, v2.16b, v6.16b
st1 {v0.16b-v3.16b}, [x19], #64 st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b mov v4.16b, v7.16b
cbz w23, .Lxtsencout cbz w4, .Lxtsencout
st1 {v4.16b}, [x24]
cond_yield_neon .Lxtsencrestart
b .LxtsencloopNx b .LxtsencloopNx
.Lxtsenc1x: .Lxtsenc1x:
adds w23, w23, #4 adds w4, w4, #4
beq .Lxtsencout beq .Lxtsencout
.Lxtsencloop: .Lxtsencloop:
ld1 {v1.16b}, [x20], #16 ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b eor v0.16b, v1.16b, v4.16b
encrypt_block v0, w22, x21, x8, w7 encrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b eor v0.16b, v0.16b, v4.16b
st1 {v0.16b}, [x19], #16 st1 {v0.16b}, [x0], #16
subs w23, w23, #1 subs w4, w4, #1
beq .Lxtsencout beq .Lxtsencout
next_tweak v4, v4, v7, v8 next_tweak v4, v4, v7, v8
b .Lxtsencloop b .Lxtsencloop
.Lxtsencout: .Lxtsencout:
st1 {v4.16b}, [x24] st1 {v4.16b}, [x6]
frame_pop ldp x29, x30, [sp], #16
ret ret
AES_ENDPROC(aes_xts_encrypt) AES_ENDPROC(aes_xts_encrypt)
AES_ENTRY(aes_xts_decrypt) AES_ENTRY(aes_xts_decrypt)
frame_push 6 stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0 ld1 {v4.16b}, [x6]
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x6
ld1 {v4.16b}, [x24]
cbz w7, .Lxtsdecnotfirst cbz w7, .Lxtsdecnotfirst
enc_prepare w3, x5, x8 enc_prepare w3, x5, x8
@ -407,17 +346,15 @@ AES_ENTRY(aes_xts_decrypt)
ldr q7, .Lxts_mul_x ldr q7, .Lxts_mul_x
b .LxtsdecNx b .LxtsdecNx
.Lxtsdecrestart:
ld1 {v4.16b}, [x24]
.Lxtsdecnotfirst: .Lxtsdecnotfirst:
dec_prepare w22, x21, x8 dec_prepare w3, x2, x8
.LxtsdecloopNx: .LxtsdecloopNx:
ldr q7, .Lxts_mul_x ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8 next_tweak v4, v4, v7, v8
.LxtsdecNx: .LxtsdecNx:
subs w23, w23, #4 subs w4, w4, #4
bmi .Lxtsdec1x bmi .Lxtsdec1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
next_tweak v5, v4, v7, v8 next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b eor v0.16b, v0.16b, v4.16b
next_tweak v6, v5, v7, v8 next_tweak v6, v5, v7, v8
@ -430,28 +367,26 @@ AES_ENTRY(aes_xts_decrypt)
eor v0.16b, v0.16b, v4.16b eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b eor v2.16b, v2.16b, v6.16b
st1 {v0.16b-v3.16b}, [x19], #64 st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b mov v4.16b, v7.16b
cbz w23, .Lxtsdecout cbz w4, .Lxtsdecout
st1 {v4.16b}, [x24]
cond_yield_neon .Lxtsdecrestart
b .LxtsdecloopNx b .LxtsdecloopNx
.Lxtsdec1x: .Lxtsdec1x:
adds w23, w23, #4 adds w4, w4, #4
beq .Lxtsdecout beq .Lxtsdecout
.Lxtsdecloop: .Lxtsdecloop:
ld1 {v1.16b}, [x20], #16 ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b eor v0.16b, v1.16b, v4.16b
decrypt_block v0, w22, x21, x8, w7 decrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b eor v0.16b, v0.16b, v4.16b
st1 {v0.16b}, [x19], #16 st1 {v0.16b}, [x0], #16
subs w23, w23, #1 subs w4, w4, #1
beq .Lxtsdecout beq .Lxtsdecout
next_tweak v4, v4, v7, v8 next_tweak v4, v4, v7, v8
b .Lxtsdecloop b .Lxtsdecloop
.Lxtsdecout: .Lxtsdecout:
st1 {v4.16b}, [x24] st1 {v4.16b}, [x6]
frame_pop ldp x29, x30, [sp], #16
ret ret
AES_ENDPROC(aes_xts_decrypt) AES_ENDPROC(aes_xts_decrypt)