/* rijndael-armv8-aarch64-ce.S - ARMv8/CE accelerated AES
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#if defined(__AARCH64EL__) && \
    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)

.cpu generic+simd+crypto

.text

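/* GET_DATA_POINTER loads the runtime address of 'name' through the GOT
 * (adrp + ldr with :got:/:got_lo12: relocations), so the code stays
 * position independent. */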
#define GET_DATA_POINTER(reg, name) \
  adrp reg, :got:name ; \
  ldr reg, [reg, #:got_lo12:name] ;

/* Register macros */

#define vk0 v17
#define vk1 v18
#define vk2 v19
#define vk3 v20
#define vk4 v21
#define vk5 v22
#define vk6 v23
#define vk7 v24
#define vk8 v25
#define vk9 v26
#define vk10 v27
#define vk11 v28
#define vk12 v29
#define vk13 v30
#define vk14 v31


/* AES macros */

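/* aes_preload_keys loads the whole key schedule into v17-v31: 11 round
 * keys for AES-128 (nrounds == 10), 13 for AES-192 (== 12), 15 for
 * AES-256 (== 14).  The NZCV flags from 'cmp nrounds, #12' are left
 * intact, so callers dispatch with b.eq (192-bit) / b.hi (256-bit)
 * right after expanding this macro and fall through for 128-bit. */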
#define aes_preload_keys(keysched, nrounds) \
  cmp nrounds, #12; \
  ld1 {vk0.16b-vk3.16b}, [keysched], #64; \
  ld1 {vk4.16b-vk7.16b}, [keysched], #64; \
  ld1 {vk8.16b-vk10.16b}, [keysched], #48; \
  b.lo 1f; \
  ld1 {vk11.16b-vk12.16b}, [keysched], #32; \
  b.eq 1f; \
  ld1 {vk13.16b-vk14.16b}, [keysched]; \
1: ;

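/* do_aes_one{128,192,256} run a single block through the full cipher.
 * 'ed'/'mcimc' select the instruction pair: e/mc (aese + aesmc) for
 * encryption, d/imc (aesd + aesimc) for decryption.  The last round has
 * no MixColumns, so it is just aese/aesd followed by an eor with the
 * final round key. */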
#define do_aes_one128(ed, mcimc, vo, vb) \
  aes##ed vb.16b, vk0.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk1.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk2.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk3.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk4.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk5.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk6.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk7.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk8.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk9.16b; \
  eor vo.16b, vb.16b, vk10.16b;

#define do_aes_one192(ed, mcimc, vo, vb) \
  aes##ed vb.16b, vk0.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk1.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk2.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk3.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk4.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk5.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk6.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk7.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk8.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk9.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk10.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk11.16b; \
  eor vo.16b, vb.16b, vk12.16b;

#define do_aes_one256(ed, mcimc, vo, vb) \
  aes##ed vb.16b, vk0.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk1.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk2.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk3.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk4.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk5.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk6.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk7.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk8.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk9.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk10.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk11.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk12.16b; \
  aes##mcimc vb.16b, vb.16b; \
  aes##ed vb.16b, vk13.16b; \
  eor vo.16b, vb.16b, vk14.16b;

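/* Four-block variants.  aes_round_4/aes_lastround_4 apply one round to
 * four independent blocks; do_aes_4_{128,192,256} chain them for the
 * full cipher.  Interleaving four blocks lets independent aese/aesmc
 * pairs issue back to back, keeping the crypto unit busy. */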
#define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \
  aes##ed b0.16b, key.16b; \
  aes##mcimc b0.16b, b0.16b; \
  aes##ed b1.16b, key.16b; \
  aes##mcimc b1.16b, b1.16b; \
  aes##ed b2.16b, key.16b; \
  aes##mcimc b2.16b, b2.16b; \
  aes##ed b3.16b, key.16b; \
  aes##mcimc b3.16b, b3.16b;

#define aes_lastround_4(ed, b0, b1, b2, b3, key1, key2) \
  aes##ed b0.16b, key1.16b; \
  eor b0.16b, b0.16b, key2.16b; \
  aes##ed b1.16b, key1.16b; \
  eor b1.16b, b1.16b, key2.16b; \
  aes##ed b2.16b, key1.16b; \
  eor b2.16b, b2.16b, key2.16b; \
  aes##ed b3.16b, key1.16b; \
  eor b3.16b, b3.16b, key2.16b;

#define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
  aes_lastround_4(ed, b0, b1, b2, b3, vk9, vk10);

#define do_aes_4_192(ed, mcimc, b0, b1, b2, b3) \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \
  aes_lastround_4(ed, b0, b1, b2, b3, vk11, vk12);

#define do_aes_4_256(ed, mcimc, b0, b1, b2, b3) \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk11); \
  aes_round_4(ed, mcimc, b0, b1, b2, b3, vk12); \
  aes_lastround_4(ed, b0, b1, b2, b3, vk13, vk14);


/* Other functional macros */

#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;

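/* aes_clear_keys wipes the expanded key from v17-v31 once a function is
 * done with it.  It mirrors the conditional layout of aes_preload_keys:
 * vk0-vk10 are always cleared, vk11-vk14 only for the longer key sizes
 * that actually loaded them. */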
#define aes_clear_keys(nrounds) \
  cmp nrounds, #12; \
  CLEAR_REG(vk0); \
  CLEAR_REG(vk1); \
  CLEAR_REG(vk2); \
  CLEAR_REG(vk3); \
  CLEAR_REG(vk4); \
  CLEAR_REG(vk5); \
  CLEAR_REG(vk6); \
  CLEAR_REG(vk7); \
  CLEAR_REG(vk9); \
  CLEAR_REG(vk8); \
  CLEAR_REG(vk10); \
  b.lo 1f; \
  CLEAR_REG(vk11); \
  CLEAR_REG(vk12); \
  b.eq 1f; \
  CLEAR_REG(vk13); \
  CLEAR_REG(vk14); \
1: ;


/*
 * unsigned int _gcry_aes_enc_armv8_ce(void *keysched, byte *dst,
 *                                     const byte *src,
 *                                     unsigned int nrounds);
 */
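/* Illustrative call from C (hypothetical caller, for exposition only):
 *
 *   byte out[16];
 *   unsigned int burn_depth;
 *   burn_depth = _gcry_aes_enc_armv8_ce (keysched, out, in, nrounds);
 *
 * keysched points at the expanded encryption key schedule, nrounds is
 * 10, 12 or 14 for AES-128/192/256, and the routine returns 0 (see the
 * 'mov x0, #0' below) since it keeps nothing sensitive on the stack. */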
.align 3
.globl _gcry_aes_enc_armv8_ce
.type _gcry_aes_enc_armv8_ce,%function;
_gcry_aes_enc_armv8_ce:
  /* input:
   *    x0: keysched
   *    x1: dst
   *    x2: src
   *    w3: nrounds
   */

  aes_preload_keys(x0, w3);

  ld1 {v0.16b}, [x2]

  b.hi .Lenc1_256
  b.eq .Lenc1_192

.Lenc1_128:
  do_aes_one128(e, mc, v0, v0);

.Lenc1_tail:
  CLEAR_REG(vk0)
  CLEAR_REG(vk1)
  CLEAR_REG(vk2)
  CLEAR_REG(vk3)
  CLEAR_REG(vk4)
  CLEAR_REG(vk5)
  CLEAR_REG(vk6)
  CLEAR_REG(vk7)
  CLEAR_REG(vk8)
  CLEAR_REG(vk9)
  CLEAR_REG(vk10)
  st1 {v0.16b}, [x1]
  CLEAR_REG(v0)

  mov x0, #0
  ret

.Lenc1_192:
  do_aes_one192(e, mc, v0, v0);

  CLEAR_REG(vk11)
  CLEAR_REG(vk12)
  b .Lenc1_tail

.Lenc1_256:
  do_aes_one256(e, mc, v0, v0);

  CLEAR_REG(vk11)
  CLEAR_REG(vk12)
  CLEAR_REG(vk13)
  CLEAR_REG(vk14)
  b .Lenc1_tail
.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;

/*
 * unsigned int _gcry_aes_dec_armv8_ce(void *keysched, byte *dst,
 *                                     const byte *src,
 *                                     unsigned int nrounds);
 */
.align 3
.globl _gcry_aes_dec_armv8_ce
.type _gcry_aes_dec_armv8_ce,%function;
_gcry_aes_dec_armv8_ce:
  /* input:
   *    x0: keysched
   *    x1: dst
   *    x2: src
   *    w3: nrounds
   */

  aes_preload_keys(x0, w3);

  ld1 {v0.16b}, [x2]

  b.hi .Ldec1_256
  b.eq .Ldec1_192

.Ldec1_128:
  do_aes_one128(d, imc, v0, v0);

.Ldec1_tail:
  CLEAR_REG(vk0)
  CLEAR_REG(vk1)
  CLEAR_REG(vk2)
  CLEAR_REG(vk3)
  CLEAR_REG(vk4)
  CLEAR_REG(vk5)
  CLEAR_REG(vk6)
  CLEAR_REG(vk7)
  CLEAR_REG(vk8)
  CLEAR_REG(vk9)
  CLEAR_REG(vk10)
  st1 {v0.16b}, [x1]
  CLEAR_REG(v0)

  mov x0, #0
  ret

.Ldec1_192:
  do_aes_one192(d, imc, v0, v0);

  CLEAR_REG(vk11)
  CLEAR_REG(vk12)
  b .Ldec1_tail

.Ldec1_256:
  do_aes_one256(d, imc, v0, v0);

  CLEAR_REG(vk11)
  CLEAR_REG(vk12)
  CLEAR_REG(vk13)
  CLEAR_REG(vk14)
  b .Ldec1_tail
.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;

/*
 * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
 *                                  unsigned char *outbuf,
 *                                  const unsigned char *inbuf,
 *                                  unsigned char *iv, size_t nblocks,
 *                                  int cbc_mac, unsigned int nrounds);
 */

.align 3
.globl _gcry_aes_cbc_enc_armv8_ce
.type _gcry_aes_cbc_enc_armv8_ce,%function;
_gcry_aes_cbc_enc_armv8_ce:
  /* input:
   *    x0: keysched
   *    x1: outbuf
   *    x2: inbuf
   *    x3: iv
   *    x4: nblocks
   *    w5: cbc_mac
   *    w6: nrounds
   */

  cbz x4, .Lcbc_enc_skip

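  /* x5 becomes the output stride: 16 bytes normally, 0 when cbc_mac is
   * non-zero, so in CBC-MAC mode every ciphertext block overwrites the
   * previous one and only the final block survives. */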
  cmp w5, #0
  ld1 {v1.16b}, [x3] /* load IV */
  cset x5, eq

  aes_preload_keys(x0, w6);
  lsl x5, x5, #4

  b.eq .Lcbc_enc_loop192
  b.hi .Lcbc_enc_loop256

#define CBC_ENC(bits) \
.Lcbc_enc_loop##bits: \
  ld1 {v0.16b}, [x2], #16; /* load plaintext */ \
  eor v1.16b, v0.16b, v1.16b; \
  sub x4, x4, #1; \
  \
  do_aes_one##bits(e, mc, v1, v1); \
  \
  st1 {v1.16b}, [x1], x5; /* store ciphertext */ \
  \
  cbnz x4, .Lcbc_enc_loop##bits; \
  b .Lcbc_enc_done;

  CBC_ENC(128)
  CBC_ENC(192)
  CBC_ENC(256)

#undef CBC_ENC

.Lcbc_enc_done:
  aes_clear_keys(w6)

  st1 {v1.16b}, [x3] /* store IV */

  CLEAR_REG(v1)
  CLEAR_REG(v0)

.Lcbc_enc_skip:
  ret
.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;

/*
 * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
 *                                  unsigned char *outbuf,
 *                                  const unsigned char *inbuf,
 *                                  unsigned char *iv, size_t nblocks,
 *                                  unsigned int nrounds);
 */

.align 3
.globl _gcry_aes_cbc_dec_armv8_ce
.type _gcry_aes_cbc_dec_armv8_ce,%function;
_gcry_aes_cbc_dec_armv8_ce:
  /* input:
   *    x0: keysched
   *    x1: outbuf
   *    x2: inbuf
   *    x3: iv
   *    x4: nblocks
   *    w5: nrounds
   */

  cbz x4, .Lcbc_dec_skip

  ld1 {v0.16b}, [x3] /* load IV */

  aes_preload_keys(x0, w5);

  b.eq .Lcbc_dec_entry_192
  b.hi .Lcbc_dec_entry_256

#define CBC_DEC(bits) \
.Lcbc_dec_entry_##bits: \
  cmp x4, #4; \
  b.lo .Lcbc_dec_loop_##bits; \
  \
.Lcbc_dec_loop4_##bits: \
  \
  ld1 {v1.16b-v4.16b}, [x2], #64; /* load ciphertext */ \
  sub x4, x4, #4; \
  mov v5.16b, v1.16b; \
  mov v6.16b, v2.16b; \
  mov v7.16b, v3.16b; \
  mov v16.16b, v4.16b; \
  cmp x4, #4; \
  \
  do_aes_4_##bits(d, imc, v1, v2, v3, v4); \
  \
  eor v1.16b, v1.16b, v0.16b; \
  eor v2.16b, v2.16b, v5.16b; \
  st1 {v1.16b-v2.16b}, [x1], #32; /* store plaintext */ \
  eor v3.16b, v3.16b, v6.16b; \
  eor v4.16b, v4.16b, v7.16b; \
  mov v0.16b, v16.16b; /* next IV */ \
  st1 {v3.16b-v4.16b}, [x1], #32; /* store plaintext */ \
  \
  b.hs .Lcbc_dec_loop4_##bits; \
  CLEAR_REG(v3); \
  CLEAR_REG(v4); \
  CLEAR_REG(v5); \
  CLEAR_REG(v6); \
  CLEAR_REG(v7); \
  CLEAR_REG(v16); \
  cbz x4, .Lcbc_dec_done; \
  \
.Lcbc_dec_loop_##bits: \
  ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
  sub x4, x4, #1; \
  mov v2.16b, v1.16b; \
  \
  do_aes_one##bits(d, imc, v1, v1); \
  \
  eor v1.16b, v1.16b, v0.16b; \
  mov v0.16b, v2.16b; \
  st1 {v1.16b}, [x1], #16; /* store plaintext */ \
  \
  cbnz x4, .Lcbc_dec_loop_##bits; \
  b .Lcbc_dec_done;

  CBC_DEC(128)
  CBC_DEC(192)
  CBC_DEC(256)

#undef CBC_DEC

.Lcbc_dec_done:
  aes_clear_keys(w5)

  st1 {v0.16b}, [x3] /* store IV */

  CLEAR_REG(v0)
  CLEAR_REG(v1)
  CLEAR_REG(v2)

.Lcbc_dec_skip:
  ret
.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;

/*
 * void _gcry_aes_ctr_enc_armv8_ce (const void *keysched,
 *                                  unsigned char *outbuf,
 *                                  const unsigned char *inbuf,
 *                                  unsigned char *iv, size_t nblocks,
 *                                  unsigned int nrounds);
 */

.align 3
.globl _gcry_aes_ctr_enc_armv8_ce
.type _gcry_aes_ctr_enc_armv8_ce,%function;
_gcry_aes_ctr_enc_armv8_ce:
  /* input:
   *    x0: keysched
   *    x1: outbuf
   *    x2: inbuf
   *    x3: iv
   *    x4: nblocks
   *    w5: nrounds
   */

  cbz x4, .Lctr_enc_skip

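  /* The counter is kept both as a vector (v0, big endian as stored in
   * memory) and byte swapped in x9:x10 (x10 = low 64 bits); v16 is the
   * constant that adds 1 to the low half.  The four-block path bumps
   * the counter with vector adds as long as the low word cannot wrap;
   * the cmp against 0xfffffffffffffffc detects that one of the next
   * four increments would carry into the high word, in which case the
   * counter is advanced one block at a time with adds/adc. */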
  mov x6, #1
  movi v16.16b, #0
  mov v16.D[1], x6

  /* load IV */
  ldp x9, x10, [x3]
  ld1 {v0.16b}, [x3]
  rev x9, x9
  rev x10, x10

  aes_preload_keys(x0, w5);

  b.eq .Lctr_enc_entry_192
  b.hi .Lctr_enc_entry_256

#define CTR_ENC(bits) \
.Lctr_enc_entry_##bits: \
  cmp x4, #4; \
  b.lo .Lctr_enc_loop_##bits; \
  \
.Lctr_enc_loop4_##bits: \
  cmp x10, #0xfffffffffffffffc; \
  sub x4, x4, #4; \
  b.lo .Lctr_enc_loop4_##bits##_nocarry; \
  \
  adds x10, x10, #1; \
  mov v1.16b, v0.16b; \
  adc x9, x9, xzr; \
  mov v2.D[1], x10; \
  mov v2.D[0], x9; \
  \
  adds x10, x10, #1; \
  rev64 v2.16b, v2.16b; \
  adc x9, x9, xzr; \
  mov v3.D[1], x10; \
  mov v3.D[0], x9; \
  \
  adds x10, x10, #1; \
  rev64 v3.16b, v3.16b; \
  adc x9, x9, xzr; \
  mov v4.D[1], x10; \
  mov v4.D[0], x9; \
  \
  adds x10, x10, #1; \
  rev64 v4.16b, v4.16b; \
  adc x9, x9, xzr; \
  mov v0.D[1], x10; \
  mov v0.D[0], x9; \
  rev64 v0.16b, v0.16b; \
  \
  b .Lctr_enc_loop4_##bits##_store_ctr; \
  \
.Lctr_enc_loop4_##bits##_nocarry: \
  \
  add v3.2d, v16.2d, v16.2d; /* 2 */ \
  rev64 v6.16b, v0.16b; \
  add x10, x10, #4; \
  add v4.2d, v3.2d, v16.2d; /* 3 */ \
  add v0.2d, v3.2d, v3.2d; /* 4 */ \
  rev64 v1.16b, v6.16b; \
  add v2.2d, v6.2d, v16.2d; \
  add v3.2d, v6.2d, v3.2d; \
  add v4.2d, v6.2d, v4.2d; \
  add v0.2d, v6.2d, v0.2d; \
  rev64 v2.16b, v2.16b; \
  rev64 v3.16b, v3.16b; \
  rev64 v0.16b, v0.16b; \
  rev64 v4.16b, v4.16b; \
  \
.Lctr_enc_loop4_##bits##_store_ctr: \
  \
  st1 {v0.16b}, [x3]; \
  cmp x4, #4; \
  ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \
  \
  do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
  \
  eor v1.16b, v1.16b, v5.16b; \
  ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \
  eor v2.16b, v2.16b, v6.16b; \
  eor v3.16b, v3.16b, v7.16b; \
  eor v4.16b, v4.16b, v5.16b; \
  st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \
  \
  b.hs .Lctr_enc_loop4_##bits; \
  CLEAR_REG(v3); \
  CLEAR_REG(v4); \
  CLEAR_REG(v5); \
  CLEAR_REG(v6); \
  CLEAR_REG(v7); \
  cbz x4, .Lctr_enc_done; \
  \
.Lctr_enc_loop_##bits: \
  \
  adds x10, x10, #1; \
  mov v1.16b, v0.16b; \
  adc x9, x9, xzr; \
  mov v0.D[1], x10; \
  mov v0.D[0], x9; \
  sub x4, x4, #1; \
  ld1 {v2.16b}, [x2], #16; /* load ciphertext */ \
  rev64 v0.16b, v0.16b; \
  \
  do_aes_one##bits(e, mc, v1, v1); \
  \
  eor v1.16b, v2.16b, v1.16b; \
  st1 {v1.16b}, [x1], #16; /* store plaintext */ \
  \
  cbnz x4, .Lctr_enc_loop_##bits; \
  b .Lctr_enc_done;

  CTR_ENC(128)
  CTR_ENC(192)
  CTR_ENC(256)

#undef CTR_ENC

.Lctr_enc_done:
  aes_clear_keys(w5)

  st1 {v0.16b}, [x3] /* store IV */

  CLEAR_REG(v0)
  CLEAR_REG(v1)
  CLEAR_REG(v2)

.Lctr_enc_skip:
  ret

.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;

/*
 * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
 *                                  unsigned char *outbuf,
 *                                  const unsigned char *inbuf,
 *                                  unsigned char *iv, size_t nblocks,
 *                                  unsigned int nrounds);
 */

.align 3
.globl _gcry_aes_cfb_enc_armv8_ce
.type _gcry_aes_cfb_enc_armv8_ce,%function;
_gcry_aes_cfb_enc_armv8_ce:
  /* input:
   *    x0: keysched
   *    x1: outbuf
   *    x2: inbuf
   *    x3: iv
   *    x4: nblocks
   *    w5: nrounds
   */

  cbz x4, .Lcfb_enc_skip

  /* load IV */
  ld1 {v0.16b}, [x3]

  aes_preload_keys(x0, w5);

  b.eq .Lcfb_enc_entry_192
  b.hi .Lcfb_enc_entry_256

#define CFB_ENC(bits) \
.Lcfb_enc_entry_##bits: \
.Lcfb_enc_loop_##bits: \
  ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
  sub x4, x4, #1; \
  \
  do_aes_one##bits(e, mc, v0, v0); \
  \
  eor v0.16b, v1.16b, v0.16b; \
  st1 {v0.16b}, [x1], #16; /* store ciphertext */ \
  \
  cbnz x4, .Lcfb_enc_loop_##bits; \
  b .Lcfb_enc_done;

  CFB_ENC(128)
  CFB_ENC(192)
  CFB_ENC(256)

#undef CFB_ENC

.Lcfb_enc_done:
  aes_clear_keys(w5)

  st1 {v0.16b}, [x3] /* store IV */

  CLEAR_REG(v0)
  CLEAR_REG(v1)

.Lcfb_enc_skip:
  ret
.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;

/*
 * void _gcry_aes_cfb_dec_armv8_ce (const void *keysched,
 *                                  unsigned char *outbuf,
 *                                  const unsigned char *inbuf,
 *                                  unsigned char *iv, size_t nblocks,
 *                                  unsigned int nrounds);
 */

.align 3
.globl _gcry_aes_cfb_dec_armv8_ce
.type _gcry_aes_cfb_dec_armv8_ce,%function;
_gcry_aes_cfb_dec_armv8_ce:
  /* input:
   *    x0: keysched
   *    x1: outbuf
   *    x2: inbuf
   *    x3: iv
   *    x4: nblocks
   *    w5: nrounds
   */

  cbz x4, .Lcfb_dec_skip

  /* load IV */
  ld1 {v0.16b}, [x3]

  aes_preload_keys(x0, w5);

  b.eq .Lcfb_dec_entry_192
  b.hi .Lcfb_dec_entry_256

#define CFB_DEC(bits) \
.Lcfb_dec_entry_##bits: \
  cmp x4, #4; \
  b.lo .Lcfb_dec_loop_##bits; \
  \
.Lcfb_dec_loop4_##bits: \
  \
  ld1 {v2.16b-v4.16b}, [x2], #48; /* load ciphertext */ \
  mov v1.16b, v0.16b; \
  sub x4, x4, #4; \
  cmp x4, #4; \
  mov v5.16b, v2.16b; \
  mov v6.16b, v3.16b; \
  mov v7.16b, v4.16b; \
  ld1 {v0.16b}, [x2], #16; /* load next IV / ciphertext */ \
  \
  do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
  \
  eor v1.16b, v1.16b, v5.16b; \
  eor v2.16b, v2.16b, v6.16b; \
  eor v3.16b, v3.16b, v7.16b; \
  eor v4.16b, v4.16b, v0.16b; \
  st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \
  \
  b.hs .Lcfb_dec_loop4_##bits; \
  CLEAR_REG(v3); \
  CLEAR_REG(v4); \
  CLEAR_REG(v5); \
  CLEAR_REG(v6); \
  CLEAR_REG(v7); \
  cbz x4, .Lcfb_dec_done; \
  \
.Lcfb_dec_loop_##bits: \
  \
  ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
  \
  sub x4, x4, #1; \
  \
  do_aes_one##bits(e, mc, v0, v0); \
  \
  eor v2.16b, v1.16b, v0.16b; \
  mov v0.16b, v1.16b; \
  st1 {v2.16b}, [x1], #16; /* store plaintext */ \
  \
  cbnz x4, .Lcfb_dec_loop_##bits; \
  b .Lcfb_dec_done;

  CFB_DEC(128)
  CFB_DEC(192)
  CFB_DEC(256)

#undef CFB_DEC

.Lcfb_dec_done:
  aes_clear_keys(w5)

  st1 {v0.16b}, [x3] /* store IV */

  CLEAR_REG(v0)
  CLEAR_REG(v1)
  CLEAR_REG(v2)

.Lcfb_dec_skip:
  ret
.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;

/*
 * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
 *                                  unsigned char *outbuf,
 *                                  const unsigned char *inbuf,
 *                                  unsigned char *offset,
 *                                  unsigned char *checksum,
 *                                  unsigned char *L_table,
 *                                  size_t nblocks,
 *                                  unsigned int nrounds,
 *                                  unsigned int blkn);
 */

.align 3
.globl _gcry_aes_ocb_enc_armv8_ce
.type _gcry_aes_ocb_enc_armv8_ce,%function;
_gcry_aes_ocb_enc_armv8_ce:
  /* input:
   *    x0: keysched
   *    x1: outbuf
   *    x2: inbuf
   *    x3: offset
   *    x4: checksum
   *    x5: Ltable
   *    x6: nblocks (0 < nblocks <= 32)
   *    w7: nrounds
   *    sp+0: blkn => w12
   */

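  /* w12 (blkn) is advanced so that it holds the 1-based index i of the
   * block being processed.  ntz(i), the number of trailing zero bits of
   * i, is computed as clz(rbit(i)) and used to fetch the 16-byte
   * L_{ntz(i)} entry from L_table, hence the 'lsl #4' when indexing. */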
  ldr w12, [sp]
  ld1 {v0.16b}, [x3] /* load offset */
  ld1 {v16.16b}, [x4] /* load checksum */

  aes_preload_keys(x0, w7);

  b.eq .Locb_enc_entry_192
  b.hi .Locb_enc_entry_256

#define OCB_ENC(bits, ...) \
.Locb_enc_entry_##bits: \
  cmp x6, #4; \
  add x12, x12, #1; \
  b.lo .Locb_enc_loop_##bits; \
  \
.Locb_enc_loop4_##bits: \
  \
  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
  /* Checksum_i = Checksum_{i-1} xor P_i  */ \
  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */ \
  \
  add w9, w12, #1; \
  add w10, w12, #2; \
  add w11, w12, #3; \
  rbit w8, w12; \
  add w12, w12, #4; \
  rbit w9, w9; \
  rbit w10, w10; \
  rbit w11, w11; \
  clz w8, w8; /* ntz(i+0) */ \
  clz w9, w9; /* ntz(i+1) */ \
  clz w10, w10; /* ntz(i+2) */ \
  clz w11, w11; /* ntz(i+3) */ \
  add x8, x5, x8, lsl #4; \
  ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \
  add x9, x5, x9, lsl #4; \
  add x10, x5, x10, lsl #4; \
  add x11, x5, x11, lsl #4; \
  \
  sub x6, x6, #4; \
  \
  ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
  eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \
  ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \
  eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \
  ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \
  eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \
  eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \
  ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \
  eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \
  eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \
  eor v1.16b, v1.16b, v5.16b; /* P_i+0 xor Offset_i+0 */ \
  eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \
  eor v2.16b, v2.16b, v6.16b; /* P_i+1 xor Offset_i+1 */ \
  eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \
  cmp x6, #4; \
  eor v3.16b, v3.16b, v7.16b; /* P_i+2 xor Offset_i+2 */ \
  eor v4.16b, v4.16b, v0.16b; /* P_i+3 xor Offset_i+3 */ \
  \
  do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
  \
  eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \
  eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \
  eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \
  eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \
  st1 {v1.16b-v4.16b}, [x1], #64; \
  \
  b.hs .Locb_enc_loop4_##bits; \
  CLEAR_REG(v3); \
  CLEAR_REG(v4); \
  CLEAR_REG(v5); \
  CLEAR_REG(v6); \
  CLEAR_REG(v7); \
  cbz x6, .Locb_enc_done; \
  \
.Locb_enc_loop_##bits: \
  \
  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
  /* Checksum_i = Checksum_{i-1} xor P_i  */ \
  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */ \
  \
  rbit x8, x12; \
  add x12, x12, #1; \
  clz x8, x8; /* ntz(i) */ \
  add x8, x5, x8, lsl #4; \
  \
  ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
  ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
  sub x6, x6, #1; \
  eor v0.16b, v0.16b, v2.16b; \
  eor v16.16b, v16.16b, v1.16b; \
  eor v1.16b, v1.16b, v0.16b; \
  \
  do_aes_one##bits(e, mc, v1, v1); \
  \
  eor v1.16b, v1.16b, v0.16b; \
  st1 {v1.16b}, [x1], #16; /* store ciphertext */ \
  \
  cbnz x6, .Locb_enc_loop_##bits; \
  b .Locb_enc_done;

  OCB_ENC(128)
  OCB_ENC(192)
  OCB_ENC(256)

#undef OCB_ENC

.Locb_enc_done:
  aes_clear_keys(w7)

  st1 {v16.16b}, [x4] /* store checksum */
  st1 {v0.16b}, [x3] /* store offset */

  CLEAR_REG(v0)
  CLEAR_REG(v1)
  CLEAR_REG(v2)
  CLEAR_REG(v16)

  ret
.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;

/*
 * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
 *                                  unsigned char *outbuf,
 *                                  const unsigned char *inbuf,
 *                                  unsigned char *offset,
 *                                  unsigned char *checksum,
 *                                  unsigned char *L_table,
 *                                  size_t nblocks,
 *                                  unsigned int nrounds,
 *                                  unsigned int blkn);
 */

.align 3
.globl _gcry_aes_ocb_dec_armv8_ce
.type _gcry_aes_ocb_dec_armv8_ce,%function;
_gcry_aes_ocb_dec_armv8_ce:
  /* input:
   *    x0: keysched
   *    x1: outbuf
   *    x2: inbuf
   *    x3: offset
   *    x4: checksum
   *    x5: Ltable
   *    x6: nblocks (0 < nblocks <= 32)
   *    w7: nrounds
   *    sp+0: blkn => w12
   */

  ldr w12, [sp]
  ld1 {v0.16b}, [x3] /* load offset */
  ld1 {v16.16b}, [x4] /* load checksum */

  aes_preload_keys(x0, w7);

  b.eq .Locb_dec_entry_192
  b.hi .Locb_dec_entry_256

#define OCB_DEC(bits) \
.Locb_dec_entry_##bits: \
  cmp x6, #4; \
  add w12, w12, #1; \
  b.lo .Locb_dec_loop_##bits; \
  \
.Locb_dec_loop4_##bits: \
  \
  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
  /* Checksum_i = Checksum_{i-1} xor P_i  */ \
  \
  add w9, w12, #1; \
  add w10, w12, #2; \
  add w11, w12, #3; \
  rbit w8, w12; \
  add w12, w12, #4; \
  rbit w9, w9; \
  rbit w10, w10; \
  rbit w11, w11; \
  clz w8, w8; /* ntz(i+0) */ \
  clz w9, w9; /* ntz(i+1) */ \
  clz w10, w10; /* ntz(i+2) */ \
  clz w11, w11; /* ntz(i+3) */ \
  add x8, x5, x8, lsl #4; \
  ld1 {v1.16b-v4.16b}, [x2], #64; /* load C_i+<0-3> */ \
  add x9, x5, x9, lsl #4; \
  add x10, x5, x10, lsl #4; \
  add x11, x5, x11, lsl #4; \
  \
  sub x6, x6, #4; \
  \
  ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
  ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \
  ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \
  eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \
  ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \
  eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \
  eor v1.16b, v1.16b, v5.16b; /* C_i+0 xor Offset_i+0 */ \
  eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \
  eor v2.16b, v2.16b, v6.16b; /* C_i+1 xor Offset_i+1 */ \
  eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \
  cmp x6, #4; \
  eor v3.16b, v3.16b, v7.16b; /* C_i+2 xor Offset_i+2 */ \
  eor v4.16b, v4.16b, v0.16b; /* C_i+3 xor Offset_i+3 */ \
  \
  do_aes_4_##bits(d, imc, v1, v2, v3, v4); \
  \
  eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \
  eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \
  eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \
  eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \
  eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \
  eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \
  eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \
  eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \
  st1 {v1.16b-v4.16b}, [x1], #64; \
  \
  b.hs .Locb_dec_loop4_##bits; \
  CLEAR_REG(v3); \
  CLEAR_REG(v4); \
  CLEAR_REG(v5); \
  CLEAR_REG(v6); \
  CLEAR_REG(v7); \
  cbz x6, .Locb_dec_done; \
  \
.Locb_dec_loop_##bits: \
  \
  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
  /* Checksum_i = Checksum_{i-1} xor P_i  */ \
  \
  rbit w8, w12; \
  add w12, w12, #1; \
  clz w8, w8; /* ntz(i) */ \
  add x8, x5, x8, lsl #4; \
  \
  ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
  ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
  sub x6, x6, #1; \
  eor v0.16b, v0.16b, v2.16b; \
  eor v1.16b, v1.16b, v0.16b; \
  \
  do_aes_one##bits(d, imc, v1, v1) \
  \
  eor v1.16b, v1.16b, v0.16b; \
  st1 {v1.16b}, [x1], #16; /* store plaintext */ \
  eor v16.16b, v16.16b, v1.16b; \
  \
  cbnz x6, .Locb_dec_loop_##bits; \
  b .Locb_dec_done;

  OCB_DEC(128)
  OCB_DEC(192)
  OCB_DEC(256)

#undef OCB_DEC

.Locb_dec_done:
  aes_clear_keys(w7)

  st1 {v16.16b}, [x4] /* store checksum */
  st1 {v0.16b}, [x3] /* store offset */

  CLEAR_REG(v0)
  CLEAR_REG(v1)
  CLEAR_REG(v2)
  CLEAR_REG(v16)

  ret
.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;

/*
 * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
 *                                   const unsigned char *abuf,
 *                                   unsigned char *offset,
 *                                   unsigned char *checksum,
 *                                   unsigned char *L_table,
 *                                   size_t nblocks,
 *                                   unsigned int nrounds,
 *                                   unsigned int blkn);
 */

.align 3
.globl _gcry_aes_ocb_auth_armv8_ce
.type _gcry_aes_ocb_auth_armv8_ce,%function;
_gcry_aes_ocb_auth_armv8_ce:
  /* input:
   *    x0: keysched
   *    x1: abuf
   *    x2: offset => x3
   *    x3: checksum => x4
   *    x4: Ltable => x5
   *    x5: nblocks => x6 (0 < nblocks <= 32)
   *    w6: nrounds => w7
   *    w7: blkn => w12
   */
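  /* Shuffle the arguments up by one register so that the layout below
   * matches _gcry_aes_ocb_enc/dec: offset in x3, checksum in x4, Ltable
   * in x5, nblocks in x6, nrounds in w7 and blkn in w12. */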
  mov x12, x7
  mov x7, x6
  mov x6, x5
  mov x5, x4
  mov x4, x3
  mov x3, x2

  aes_preload_keys(x0, w7);

  ld1 {v0.16b}, [x3] /* load offset */
  ld1 {v16.16b}, [x4] /* load checksum */

  b.eq .Locb_auth_entry_192
  b.hi .Locb_auth_entry_256

#define OCB_AUTH(bits) \
.Locb_auth_entry_##bits: \
  cmp x6, #4; \
  add w12, w12, #1; \
  b.lo .Locb_auth_loop_##bits; \
  \
.Locb_auth_loop4_##bits: \
  \
  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
  /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */ \
  \
  add w9, w12, #1; \
  add w10, w12, #2; \
  add w11, w12, #3; \
  rbit w8, w12; \
  add w12, w12, #4; \
  rbit w9, w9; \
  rbit w10, w10; \
  rbit w11, w11; \
  clz w8, w8; /* ntz(i+0) */ \
  clz w9, w9; /* ntz(i+1) */ \
  clz w10, w10; /* ntz(i+2) */ \
  clz w11, w11; /* ntz(i+3) */ \
  add x8, x5, x8, lsl #4; \
  ld1 {v1.16b-v4.16b}, [x1], #64; /* load A_i+<0-3> */ \
  add x9, x5, x9, lsl #4; \
  add x10, x5, x10, lsl #4; \
  add x11, x5, x11, lsl #4; \
  \
  sub x6, x6, #4; \
  \
  ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
  ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \
  ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \
  eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \
  ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \
  eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \
  eor v1.16b, v1.16b, v5.16b; /* A_i+0 xor Offset_i+0 */ \
  eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \
  eor v2.16b, v2.16b, v6.16b; /* A_i+1 xor Offset_i+1 */ \
  eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \
  cmp x6, #4; \
  eor v3.16b, v3.16b, v7.16b; /* A_i+2 xor Offset_i+2 */ \
  eor v4.16b, v4.16b, v0.16b; /* A_i+3 xor Offset_i+3 */ \
  \
  do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
  \
  eor v1.16b, v1.16b, v2.16b; \
  eor v16.16b, v16.16b, v3.16b; \
  eor v1.16b, v1.16b, v4.16b; \
  eor v16.16b, v16.16b, v1.16b; \
  \
  b.hs .Locb_auth_loop4_##bits; \
  CLEAR_REG(v3); \
  CLEAR_REG(v4); \
  CLEAR_REG(v5); \
  CLEAR_REG(v6); \
  CLEAR_REG(v7); \
  cbz x6, .Locb_auth_done; \
  \
.Locb_auth_loop_##bits: \
  \
  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
  /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */ \
  \
  rbit w8, w12; \
  add w12, w12, #1; \
  clz w8, w8; /* ntz(i) */ \
  add x8, x5, x8, lsl #4; \
  \
  ld1 {v1.16b}, [x1], #16; /* load aadtext */ \
  ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
  sub x6, x6, #1; \
  eor v0.16b, v0.16b, v2.16b; \
  eor v1.16b, v1.16b, v0.16b; \
  \
  do_aes_one##bits(e, mc, v1, v1) \
  \
  eor v16.16b, v16.16b, v1.16b; \
  \
  cbnz x6, .Locb_auth_loop_##bits; \
  b .Locb_auth_done;

  OCB_AUTH(128)
  OCB_AUTH(192)
  OCB_AUTH(256)

#undef OCB_AUTH

.Locb_auth_done:
  aes_clear_keys(w7)

  st1 {v16.16b}, [x4] /* store checksum */
  st1 {v0.16b}, [x3] /* store offset */

  CLEAR_REG(v0)
  CLEAR_REG(v1)
  CLEAR_REG(v2)
  CLEAR_REG(v16)

  ret
.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;

/*
 * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
 */
.align 3
.globl _gcry_aes_sbox4_armv8_ce
.type _gcry_aes_sbox4_armv8_ce,%function;
_gcry_aes_sbox4_armv8_ce:
  /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
   * Cryptology — CT-RSA 2015" for details.
   */
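  /* aese with an all-zero round key reduces to SubBytes + ShiftRows,
   * since AddRoundKey with zero is a no-op.  The twelve 0x52 filler
   * bytes substitute to 0x00, so afterwards each 32-bit lane holds at
   * most one substituted input byte; addv sums the lanes and collects
   * the S-box of all four bytes of w0 back into lane 0. */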
  movi v0.16b, #0x52
  movi v1.16b, #0
  mov v0.S[0], w0

  aese v0.16b, v1.16b

  addv s0, v0.4s
  mov w0, v0.S[0]
  CLEAR_REG(v0)
  ret
.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;


/*
 * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src);
 */
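/* Applies the AES InvMixColumns transformation (aesimc) to one 16-byte
 * block; in this backend such a helper is typically used when turning
 * encryption round keys into the decryption key schedule. */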
.align 3
.globl _gcry_aes_invmixcol_armv8_ce
.type _gcry_aes_invmixcol_armv8_ce,%function;
_gcry_aes_invmixcol_armv8_ce:
  ld1 {v0.16b}, [x1]
  aesimc v0.16b, v0.16b
  st1 {v0.16b}, [x0]
  CLEAR_REG(v0)
  ret
.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce;

#endif