crypto: x86 - Add optimized MORUS implementations

This patch adds optimized implementations of MORUS-640 and MORUS-1280,
utilizing the SSE2 and AVX2 x86 extensions.

For MORUS-1280 (which operates on 256-bit blocks) we provide both AVX2
and SSE2 implementation. Although SSE2 MORUS-1280 is slower than AVX2
MORUS-1280, it is comparable in speed to the SSE2 MORUS-640.

Signed-off-by: Ondrej Mosnacek <omosnacek@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Ondrej Mosnacek 2018-05-11 14:19:12 +02:00 коммит произвёл Herbert Xu
Родитель 56e8e57fc3
Коммит 6ecc9d9ff9
8 изменённых файлов: 2370 добавлений и 0 удалений

Просмотреть файл

@ -42,6 +42,9 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o
obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
# These modules require assembler to support AVX. # These modules require assembler to support AVX.
ifeq ($(avx_supported),yes) ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@ -59,6 +62,8 @@ ifeq ($(avx2_supported),yes)
obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/ obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/
obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/ obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/
obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/ obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/
obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o
endif endif
aes-i586-y := aes-i586-asm_32.o aes_glue.o aes-i586-y := aes-i586-asm_32.o aes_glue.o
@ -80,6 +85,9 @@ aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o
aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
ifeq ($(avx_supported),yes) ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
camellia_aesni_avx_glue.o camellia_aesni_avx_glue.o
@ -95,6 +103,8 @@ ifeq ($(avx2_supported),yes)
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
chacha20-x86_64-y += chacha20-avx2-x86_64.o chacha20-x86_64-y += chacha20-avx2-x86_64.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
endif endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o

Просмотреть файл

@ -0,0 +1,621 @@
/*
* AVX2 implementation of MORUS-1280
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/frame.h>
#define SHUFFLE_MASK(i0, i1, i2, i3) \
(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
#define STATE0 %ymm0
#define STATE0_LOW %xmm0
#define STATE1 %ymm1
#define STATE2 %ymm2
#define STATE3 %ymm3
#define STATE4 %ymm4
#define KEY %ymm5
#define MSG %ymm5
#define MSG_LOW %xmm5
#define T0 %ymm6
#define T0_LOW %xmm6
#define T1 %ymm7
.section .rodata.cst32.morus1280_const, "aM", @progbits, 32
.align 32
.Lmorus1280_const:
.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32
.align 32
.Lmorus1280_counter:
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
.text
.macro morus1280_round s0, s1, s2, s3, s4, b, w
vpand \s1, \s2, T0
vpxor T0, \s0, \s0
vpxor \s3, \s0, \s0
vpsllq $\b, \s0, T0
vpsrlq $(64 - \b), \s0, \s0
vpxor T0, \s0, \s0
vpermq $\w, \s3, \s3
.endm
/*
* __morus1280_update: internal ABI
* input:
* STATE[0-4] - input state
* MSG - message block
* output:
* STATE[0-4] - output state
* changed:
* T0
*/
__morus1280_update:
morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
vpxor MSG, STATE1, STATE1
morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
vpxor MSG, STATE2, STATE2
morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
vpxor MSG, STATE3, STATE3
morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2
vpxor MSG, STATE4, STATE4
morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1
ret
ENDPROC(__morus1280_update)
/*
* __morus1280_update_zero: internal ABI
* input:
* STATE[0-4] - input state
* output:
* STATE[0-4] - output state
* changed:
* T0
*/
__morus1280_update_zero:
morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2
morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1
ret
ENDPROC(__morus1280_update_zero)
/*
* __load_partial: internal ABI
* input:
* %rsi - src
* %rcx - bytes
* output:
* MSG - message block
* changed:
* %r8
* %r9
*/
__load_partial:
xor %r9, %r9
vpxor MSG, MSG, MSG
mov %rcx, %r8
and $0x1, %r8
jz .Lld_partial_1
mov %rcx, %r8
and $0x1E, %r8
add %rsi, %r8
mov (%r8), %r9b
.Lld_partial_1:
mov %rcx, %r8
and $0x2, %r8
jz .Lld_partial_2
mov %rcx, %r8
and $0x1C, %r8
add %rsi, %r8
shl $16, %r9
mov (%r8), %r9w
.Lld_partial_2:
mov %rcx, %r8
and $0x4, %r8
jz .Lld_partial_4
mov %rcx, %r8
and $0x18, %r8
add %rsi, %r8
shl $32, %r9
mov (%r8), %r8d
xor %r8, %r9
.Lld_partial_4:
movq %r9, MSG_LOW
mov %rcx, %r8
and $0x8, %r8
jz .Lld_partial_8
mov %rcx, %r8
and $0x10, %r8
add %rsi, %r8
pshufd $MASK2, MSG_LOW, MSG_LOW
pinsrq $0, (%r8), MSG_LOW
.Lld_partial_8:
mov %rcx, %r8
and $0x10, %r8
jz .Lld_partial_16
vpermq $MASK2, MSG, MSG
movdqu (%rsi), MSG_LOW
.Lld_partial_16:
ret
ENDPROC(__load_partial)
/*
* __store_partial: internal ABI
* input:
* %rdx - dst
* %rcx - bytes
* output:
* T0 - message block
* changed:
* %r8
* %r9
* %r10
*/
__store_partial:
mov %rcx, %r8
mov %rdx, %r9
cmp $16, %r8
jl .Lst_partial_16
movdqu T0_LOW, (%r9)
vpermq $MASK2, T0, T0
sub $16, %r8
add $16, %r9
.Lst_partial_16:
movq T0_LOW, %r10
cmp $8, %r8
jl .Lst_partial_8
mov %r10, (%r9)
pextrq $1, T0_LOW, %r10
sub $8, %r8
add $8, %r9
.Lst_partial_8:
cmp $4, %r8
jl .Lst_partial_4
mov %r10d, (%r9)
shr $32, %r10
sub $4, %r8
add $4, %r9
.Lst_partial_4:
cmp $2, %r8
jl .Lst_partial_2
mov %r10w, (%r9)
shr $16, %r10
sub $2, %r8
add $2, %r9
.Lst_partial_2:
cmp $1, %r8
jl .Lst_partial_1
mov %r10b, (%r9)
.Lst_partial_1:
ret
ENDPROC(__store_partial)
/*
* void crypto_morus1280_avx2_init(void *state, const void *key,
* const void *iv);
*/
ENTRY(crypto_morus1280_avx2_init)
FRAME_BEGIN
/* load IV: */
vpxor STATE0, STATE0, STATE0
movdqu (%rdx), STATE0_LOW
/* load key: */
vmovdqu (%rsi), KEY
vmovdqa KEY, STATE1
/* load all ones: */
vpcmpeqd STATE2, STATE2, STATE2
/* load all zeros: */
vpxor STATE3, STATE3, STATE3
/* load the constant: */
vmovdqa .Lmorus1280_const, STATE4
/* update 16 times with zero: */
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
/* xor-in the key again after updates: */
vpxor KEY, STATE1, STATE1
/* store the state: */
vmovdqu STATE0, (0 * 32)(%rdi)
vmovdqu STATE1, (1 * 32)(%rdi)
vmovdqu STATE2, (2 * 32)(%rdi)
vmovdqu STATE3, (3 * 32)(%rdi)
vmovdqu STATE4, (4 * 32)(%rdi)
FRAME_END
ret
ENDPROC(crypto_morus1280_avx2_init)
/*
* void crypto_morus1280_avx2_ad(void *state, const void *data,
* unsigned int length);
*/
ENTRY(crypto_morus1280_avx2_ad)
FRAME_BEGIN
cmp $32, %rdx
jb .Lad_out
/* load the state: */
vmovdqu (0 * 32)(%rdi), STATE0
vmovdqu (1 * 32)(%rdi), STATE1
vmovdqu (2 * 32)(%rdi), STATE2
vmovdqu (3 * 32)(%rdi), STATE3
vmovdqu (4 * 32)(%rdi), STATE4
mov %rsi, %r8
and $0x1F, %r8
jnz .Lad_u_loop
.align 4
.Lad_a_loop:
vmovdqa (%rsi), MSG
call __morus1280_update
sub $32, %rdx
add $32, %rsi
cmp $32, %rdx
jge .Lad_a_loop
jmp .Lad_cont
.align 4
.Lad_u_loop:
vmovdqu (%rsi), MSG
call __morus1280_update
sub $32, %rdx
add $32, %rsi
cmp $32, %rdx
jge .Lad_u_loop
.Lad_cont:
/* store the state: */
vmovdqu STATE0, (0 * 32)(%rdi)
vmovdqu STATE1, (1 * 32)(%rdi)
vmovdqu STATE2, (2 * 32)(%rdi)
vmovdqu STATE3, (3 * 32)(%rdi)
vmovdqu STATE4, (4 * 32)(%rdi)
.Lad_out:
FRAME_END
ret
ENDPROC(crypto_morus1280_avx2_ad)
/*
* void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus1280_avx2_enc)
FRAME_BEGIN
cmp $32, %rcx
jb .Lenc_out
/* load the state: */
vmovdqu (0 * 32)(%rdi), STATE0
vmovdqu (1 * 32)(%rdi), STATE1
vmovdqu (2 * 32)(%rdi), STATE2
vmovdqu (3 * 32)(%rdi), STATE3
vmovdqu (4 * 32)(%rdi), STATE4
mov %rsi, %r8
or %rdx, %r8
and $0x1F, %r8
jnz .Lenc_u_loop
.align 4
.Lenc_a_loop:
vmovdqa (%rsi), MSG
vmovdqa MSG, T0
vpxor STATE0, T0, T0
vpermq $MASK3, STATE1, T1
vpxor T1, T0, T0
vpand STATE2, STATE3, T1
vpxor T1, T0, T0
vmovdqa T0, (%rdx)
call __morus1280_update
sub $32, %rcx
add $32, %rsi
add $32, %rdx
cmp $32, %rcx
jge .Lenc_a_loop
jmp .Lenc_cont
.align 4
.Lenc_u_loop:
vmovdqu (%rsi), MSG
vmovdqa MSG, T0
vpxor STATE0, T0, T0
vpermq $MASK3, STATE1, T1
vpxor T1, T0, T0
vpand STATE2, STATE3, T1
vpxor T1, T0, T0
vmovdqu T0, (%rdx)
call __morus1280_update
sub $32, %rcx
add $32, %rsi
add $32, %rdx
cmp $32, %rcx
jge .Lenc_u_loop
.Lenc_cont:
/* store the state: */
vmovdqu STATE0, (0 * 32)(%rdi)
vmovdqu STATE1, (1 * 32)(%rdi)
vmovdqu STATE2, (2 * 32)(%rdi)
vmovdqu STATE3, (3 * 32)(%rdi)
vmovdqu STATE4, (4 * 32)(%rdi)
.Lenc_out:
FRAME_END
ret
ENDPROC(crypto_morus1280_avx2_enc)
/*
* void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus1280_avx2_enc_tail)
FRAME_BEGIN
/* load the state: */
vmovdqu (0 * 32)(%rdi), STATE0
vmovdqu (1 * 32)(%rdi), STATE1
vmovdqu (2 * 32)(%rdi), STATE2
vmovdqu (3 * 32)(%rdi), STATE3
vmovdqu (4 * 32)(%rdi), STATE4
/* encrypt message: */
call __load_partial
vmovdqa MSG, T0
vpxor STATE0, T0, T0
vpermq $MASK3, STATE1, T1
vpxor T1, T0, T0
vpand STATE2, STATE3, T1
vpxor T1, T0, T0
call __store_partial
call __morus1280_update
/* store the state: */
vmovdqu STATE0, (0 * 32)(%rdi)
vmovdqu STATE1, (1 * 32)(%rdi)
vmovdqu STATE2, (2 * 32)(%rdi)
vmovdqu STATE3, (3 * 32)(%rdi)
vmovdqu STATE4, (4 * 32)(%rdi)
FRAME_END
ENDPROC(crypto_morus1280_avx2_enc_tail)
/*
* void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus1280_avx2_dec)
FRAME_BEGIN
cmp $32, %rcx
jb .Ldec_out
/* load the state: */
vmovdqu (0 * 32)(%rdi), STATE0
vmovdqu (1 * 32)(%rdi), STATE1
vmovdqu (2 * 32)(%rdi), STATE2
vmovdqu (3 * 32)(%rdi), STATE3
vmovdqu (4 * 32)(%rdi), STATE4
mov %rsi, %r8
or %rdx, %r8
and $0x1F, %r8
jnz .Ldec_u_loop
.align 4
.Ldec_a_loop:
vmovdqa (%rsi), MSG
vpxor STATE0, MSG, MSG
vpermq $MASK3, STATE1, T0
vpxor T0, MSG, MSG
vpand STATE2, STATE3, T0
vpxor T0, MSG, MSG
vmovdqa MSG, (%rdx)
call __morus1280_update
sub $32, %rcx
add $32, %rsi
add $32, %rdx
cmp $32, %rcx
jge .Ldec_a_loop
jmp .Ldec_cont
.align 4
.Ldec_u_loop:
vmovdqu (%rsi), MSG
vpxor STATE0, MSG, MSG
vpermq $MASK3, STATE1, T0
vpxor T0, MSG, MSG
vpand STATE2, STATE3, T0
vpxor T0, MSG, MSG
vmovdqu MSG, (%rdx)
call __morus1280_update
sub $32, %rcx
add $32, %rsi
add $32, %rdx
cmp $32, %rcx
jge .Ldec_u_loop
.Ldec_cont:
/* store the state: */
vmovdqu STATE0, (0 * 32)(%rdi)
vmovdqu STATE1, (1 * 32)(%rdi)
vmovdqu STATE2, (2 * 32)(%rdi)
vmovdqu STATE3, (3 * 32)(%rdi)
vmovdqu STATE4, (4 * 32)(%rdi)
.Ldec_out:
FRAME_END
ret
ENDPROC(crypto_morus1280_avx2_dec)
/*
* void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus1280_avx2_dec_tail)
FRAME_BEGIN
/* load the state: */
vmovdqu (0 * 32)(%rdi), STATE0
vmovdqu (1 * 32)(%rdi), STATE1
vmovdqu (2 * 32)(%rdi), STATE2
vmovdqu (3 * 32)(%rdi), STATE3
vmovdqu (4 * 32)(%rdi), STATE4
/* decrypt message: */
call __load_partial
vpxor STATE0, MSG, MSG
vpermq $MASK3, STATE1, T0
vpxor T0, MSG, MSG
vpand STATE2, STATE3, T0
vpxor T0, MSG, MSG
vmovdqa MSG, T0
call __store_partial
/* mask with byte count: */
movq %rcx, T0_LOW
vpbroadcastb T0_LOW, T0
vmovdqa .Lmorus1280_counter, T1
vpcmpgtb T1, T0, T0
vpand T0, MSG, MSG
call __morus1280_update
/* store the state: */
vmovdqu STATE0, (0 * 32)(%rdi)
vmovdqu STATE1, (1 * 32)(%rdi)
vmovdqu STATE2, (2 * 32)(%rdi)
vmovdqu STATE3, (3 * 32)(%rdi)
vmovdqu STATE4, (4 * 32)(%rdi)
FRAME_END
ret
ENDPROC(crypto_morus1280_avx2_dec_tail)
/*
* void crypto_morus1280_avx2_final(void *state, void *tag_xor,
* u64 assoclen, u64 cryptlen);
*/
ENTRY(crypto_morus1280_avx2_final)
FRAME_BEGIN
/* load the state: */
vmovdqu (0 * 32)(%rdi), STATE0
vmovdqu (1 * 32)(%rdi), STATE1
vmovdqu (2 * 32)(%rdi), STATE2
vmovdqu (3 * 32)(%rdi), STATE3
vmovdqu (4 * 32)(%rdi), STATE4
/* xor state[0] into state[4]: */
vpxor STATE0, STATE4, STATE4
/* prepare length block: */
vpxor MSG, MSG, MSG
vpinsrq $0, %rdx, MSG_LOW, MSG_LOW
vpinsrq $1, %rcx, MSG_LOW, MSG_LOW
vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */
/* update state: */
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
/* xor tag: */
vmovdqu (%rsi), MSG
vpxor STATE0, MSG, MSG
vpermq $MASK3, STATE1, T0
vpxor T0, MSG, MSG
vpand STATE2, STATE3, T0
vpxor T0, MSG, MSG
vmovdqu MSG, (%rsi)
FRAME_END
ret
ENDPROC(crypto_morus1280_avx2_final)

Просмотреть файл

@ -0,0 +1,68 @@
/*
* The MORUS-1280 Authenticated-Encryption Algorithm
* Glue for AVX2 implementation
*
* Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*/
#include <crypto/internal/aead.h>
#include <crypto/morus1280_glue.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/cpu_device_id.h>
asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key,
const void *iv);
asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data,
unsigned int length);
asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor,
u64 assoclen, u64 cryptlen);
MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400);
static const struct x86_cpu_id avx2_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_AVX2),
{}
};
MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id);
static int __init crypto_morus1280_avx2_module_init(void)
{
if (!x86_match_cpu(avx2_cpu_id))
return -ENODEV;
return crypto_register_aeads(crypto_morus1280_avx2_algs,
ARRAY_SIZE(crypto_morus1280_avx2_algs));
}
static void __exit crypto_morus1280_avx2_module_exit(void)
{
crypto_unregister_aeads(crypto_morus1280_avx2_algs,
ARRAY_SIZE(crypto_morus1280_avx2_algs));
}
module_init(crypto_morus1280_avx2_module_init);
module_exit(crypto_morus1280_avx2_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation");
MODULE_ALIAS_CRYPTO("morus1280");
MODULE_ALIAS_CRYPTO("morus1280-avx2");

Просмотреть файл

@ -0,0 +1,895 @@
/*
* SSE2 implementation of MORUS-1280
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/frame.h>
#define SHUFFLE_MASK(i0, i1, i2, i3) \
(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
#define STATE0_LO %xmm0
#define STATE0_HI %xmm1
#define STATE1_LO %xmm2
#define STATE1_HI %xmm3
#define STATE2_LO %xmm4
#define STATE2_HI %xmm5
#define STATE3_LO %xmm6
#define STATE3_HI %xmm7
#define STATE4_LO %xmm8
#define STATE4_HI %xmm9
#define KEY_LO %xmm10
#define KEY_HI %xmm11
#define MSG_LO %xmm10
#define MSG_HI %xmm11
#define T0_LO %xmm12
#define T0_HI %xmm13
#define T1_LO %xmm14
#define T1_HI %xmm15
.section .rodata.cst16.morus640_const, "aM", @progbits, 16
.align 16
.Lmorus640_const_0:
.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Lmorus640_const_1:
.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
.align 16
.Lmorus640_counter_0:
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.Lmorus640_counter_1:
.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
.text
.macro rol1 hi, lo
/*
* HI_1 | HI_0 || LO_1 | LO_0
* ==>
* HI_0 | HI_1 || LO_1 | LO_0
* ==>
* HI_0 | LO_1 || LO_0 | HI_1
*/
pshufd $MASK2, \hi, \hi
movdqa \hi, T0_LO
punpcklqdq \lo, T0_LO
punpckhqdq \hi, \lo
movdqa \lo, \hi
movdqa T0_LO, \lo
.endm
.macro rol2 hi, lo
movdqa \lo, T0_LO
movdqa \hi, \lo
movdqa T0_LO, \hi
.endm
.macro rol3 hi, lo
/*
* HI_1 | HI_0 || LO_1 | LO_0
* ==>
* HI_0 | HI_1 || LO_1 | LO_0
* ==>
* LO_0 | HI_1 || HI_0 | LO_1
*/
pshufd $MASK2, \hi, \hi
movdqa \lo, T0_LO
punpckhqdq \hi, T0_LO
punpcklqdq \lo, \hi
movdqa T0_LO, \lo
.endm
.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
movdqa \s1_l, T0_LO
pand \s2_l, T0_LO
pxor T0_LO, \s0_l
movdqa \s1_h, T0_LO
pand \s2_h, T0_LO
pxor T0_LO, \s0_h
pxor \s3_l, \s0_l
pxor \s3_h, \s0_h
movdqa \s0_l, T0_LO
psllq $\b, T0_LO
psrlq $(64 - \b), \s0_l
pxor T0_LO, \s0_l
movdqa \s0_h, T0_LO
psllq $\b, T0_LO
psrlq $(64 - \b), \s0_h
pxor T0_LO, \s0_h
\w \s3_h, \s3_l
.endm
/*
* __morus1280_update: internal ABI
* input:
* STATE[0-4] - input state
* MSG - message block
* output:
* STATE[0-4] - output state
* changed:
* T0
*/
__morus1280_update:
morus1280_round \
STATE0_LO, STATE0_HI, \
STATE1_LO, STATE1_HI, \
STATE2_LO, STATE2_HI, \
STATE3_LO, STATE3_HI, \
STATE4_LO, STATE4_HI, \
13, rol1
pxor MSG_LO, STATE1_LO
pxor MSG_HI, STATE1_HI
morus1280_round \
STATE1_LO, STATE1_HI, \
STATE2_LO, STATE2_HI, \
STATE3_LO, STATE3_HI, \
STATE4_LO, STATE4_HI, \
STATE0_LO, STATE0_HI, \
46, rol2
pxor MSG_LO, STATE2_LO
pxor MSG_HI, STATE2_HI
morus1280_round \
STATE2_LO, STATE2_HI, \
STATE3_LO, STATE3_HI, \
STATE4_LO, STATE4_HI, \
STATE0_LO, STATE0_HI, \
STATE1_LO, STATE1_HI, \
38, rol3
pxor MSG_LO, STATE3_LO
pxor MSG_HI, STATE3_HI
morus1280_round \
STATE3_LO, STATE3_HI, \
STATE4_LO, STATE4_HI, \
STATE0_LO, STATE0_HI, \
STATE1_LO, STATE1_HI, \
STATE2_LO, STATE2_HI, \
7, rol2
pxor MSG_LO, STATE4_LO
pxor MSG_HI, STATE4_HI
morus1280_round \
STATE4_LO, STATE4_HI, \
STATE0_LO, STATE0_HI, \
STATE1_LO, STATE1_HI, \
STATE2_LO, STATE2_HI, \
STATE3_LO, STATE3_HI, \
4, rol1
ret
ENDPROC(__morus1280_update)
/*
* __morus1280_update_zero: internal ABI
* input:
* STATE[0-4] - input state
* output:
* STATE[0-4] - output state
* changed:
* T0
*/
__morus1280_update_zero:
morus1280_round \
STATE0_LO, STATE0_HI, \
STATE1_LO, STATE1_HI, \
STATE2_LO, STATE2_HI, \
STATE3_LO, STATE3_HI, \
STATE4_LO, STATE4_HI, \
13, rol1
morus1280_round \
STATE1_LO, STATE1_HI, \
STATE2_LO, STATE2_HI, \
STATE3_LO, STATE3_HI, \
STATE4_LO, STATE4_HI, \
STATE0_LO, STATE0_HI, \
46, rol2
morus1280_round \
STATE2_LO, STATE2_HI, \
STATE3_LO, STATE3_HI, \
STATE4_LO, STATE4_HI, \
STATE0_LO, STATE0_HI, \
STATE1_LO, STATE1_HI, \
38, rol3
morus1280_round \
STATE3_LO, STATE3_HI, \
STATE4_LO, STATE4_HI, \
STATE0_LO, STATE0_HI, \
STATE1_LO, STATE1_HI, \
STATE2_LO, STATE2_HI, \
7, rol2
morus1280_round \
STATE4_LO, STATE4_HI, \
STATE0_LO, STATE0_HI, \
STATE1_LO, STATE1_HI, \
STATE2_LO, STATE2_HI, \
STATE3_LO, STATE3_HI, \
4, rol1
ret
ENDPROC(__morus1280_update_zero)
/*
* __load_partial: internal ABI
* input:
* %rsi - src
* %rcx - bytes
* output:
* MSG - message block
* changed:
* %r8
* %r9
*/
__load_partial:
xor %r9, %r9
pxor MSG_LO, MSG_LO
pxor MSG_HI, MSG_HI
mov %rcx, %r8
and $0x1, %r8
jz .Lld_partial_1
mov %rcx, %r8
and $0x1E, %r8
add %rsi, %r8
mov (%r8), %r9b
.Lld_partial_1:
mov %rcx, %r8
and $0x2, %r8
jz .Lld_partial_2
mov %rcx, %r8
and $0x1C, %r8
add %rsi, %r8
shl $16, %r9
mov (%r8), %r9w
.Lld_partial_2:
mov %rcx, %r8
and $0x4, %r8
jz .Lld_partial_4
mov %rcx, %r8
and $0x18, %r8
add %rsi, %r8
shl $32, %r9
mov (%r8), %r8d
xor %r8, %r9
.Lld_partial_4:
movq %r9, MSG_LO
mov %rcx, %r8
and $0x8, %r8
jz .Lld_partial_8
mov %rcx, %r8
and $0x10, %r8
add %rsi, %r8
pslldq $8, MSG_LO
movq (%r8), T0_LO
pxor T0_LO, MSG_LO
.Lld_partial_8:
mov %rcx, %r8
and $0x10, %r8
jz .Lld_partial_16
movdqa MSG_LO, MSG_HI
movdqu (%rsi), MSG_LO
.Lld_partial_16:
ret
ENDPROC(__load_partial)
/*
* __store_partial: internal ABI
* input:
* %rdx - dst
* %rcx - bytes
* output:
* T0 - message block
* changed:
* %r8
* %r9
* %r10
*/
__store_partial:
mov %rcx, %r8
mov %rdx, %r9
cmp $16, %r8
jl .Lst_partial_16
movdqu T0_LO, (%r9)
movdqa T0_HI, T0_LO
sub $16, %r8
add $16, %r9
.Lst_partial_16:
movq T0_LO, %r10
cmp $8, %r8
jl .Lst_partial_8
mov %r10, (%r9)
psrldq $8, T0_LO
movq T0_LO, %r10
sub $8, %r8
add $8, %r9
.Lst_partial_8:
cmp $4, %r8
jl .Lst_partial_4
mov %r10d, (%r9)
shr $32, %r10
sub $4, %r8
add $4, %r9
.Lst_partial_4:
cmp $2, %r8
jl .Lst_partial_2
mov %r10w, (%r9)
shr $16, %r10
sub $2, %r8
add $2, %r9
.Lst_partial_2:
cmp $1, %r8
jl .Lst_partial_1
mov %r10b, (%r9)
.Lst_partial_1:
ret
ENDPROC(__store_partial)
/*
* void crypto_morus1280_sse2_init(void *state, const void *key,
* const void *iv);
*/
ENTRY(crypto_morus1280_sse2_init)
FRAME_BEGIN
/* load IV: */
pxor STATE0_HI, STATE0_HI
movdqu (%rdx), STATE0_LO
/* load key: */
movdqu 0(%rsi), KEY_LO
movdqu 16(%rsi), KEY_HI
movdqa KEY_LO, STATE1_LO
movdqa KEY_HI, STATE1_HI
/* load all ones: */
pcmpeqd STATE2_LO, STATE2_LO
pcmpeqd STATE2_HI, STATE2_HI
/* load all zeros: */
pxor STATE3_LO, STATE3_LO
pxor STATE3_HI, STATE3_HI
/* load the constant: */
movdqa .Lmorus640_const_0, STATE4_LO
movdqa .Lmorus640_const_1, STATE4_HI
/* update 16 times with zero: */
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
call __morus1280_update_zero
/* xor-in the key again after updates: */
pxor KEY_LO, STATE1_LO
pxor KEY_HI, STATE1_HI
/* store the state: */
movdqu STATE0_LO, (0 * 16)(%rdi)
movdqu STATE0_HI, (1 * 16)(%rdi)
movdqu STATE1_LO, (2 * 16)(%rdi)
movdqu STATE1_HI, (3 * 16)(%rdi)
movdqu STATE2_LO, (4 * 16)(%rdi)
movdqu STATE2_HI, (5 * 16)(%rdi)
movdqu STATE3_LO, (6 * 16)(%rdi)
movdqu STATE3_HI, (7 * 16)(%rdi)
movdqu STATE4_LO, (8 * 16)(%rdi)
movdqu STATE4_HI, (9 * 16)(%rdi)
FRAME_END
ret
ENDPROC(crypto_morus1280_sse2_init)
/*
* void crypto_morus1280_sse2_ad(void *state, const void *data,
* unsigned int length);
*/
ENTRY(crypto_morus1280_sse2_ad)
FRAME_BEGIN
cmp $32, %rdx
jb .Lad_out
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0_LO
movdqu (1 * 16)(%rdi), STATE0_HI
movdqu (2 * 16)(%rdi), STATE1_LO
movdqu (3 * 16)(%rdi), STATE1_HI
movdqu (4 * 16)(%rdi), STATE2_LO
movdqu (5 * 16)(%rdi), STATE2_HI
movdqu (6 * 16)(%rdi), STATE3_LO
movdqu (7 * 16)(%rdi), STATE3_HI
movdqu (8 * 16)(%rdi), STATE4_LO
movdqu (9 * 16)(%rdi), STATE4_HI
mov %rsi, %r8
and $0xF, %r8
jnz .Lad_u_loop
.align 4
.Lad_a_loop:
movdqa 0(%rsi), MSG_LO
movdqa 16(%rsi), MSG_HI
call __morus1280_update
sub $32, %rdx
add $32, %rsi
cmp $32, %rdx
jge .Lad_a_loop
jmp .Lad_cont
.align 4
.Lad_u_loop:
movdqu 0(%rsi), MSG_LO
movdqu 16(%rsi), MSG_HI
call __morus1280_update
sub $32, %rdx
add $32, %rsi
cmp $32, %rdx
jge .Lad_u_loop
.Lad_cont:
/* store the state: */
movdqu STATE0_LO, (0 * 16)(%rdi)
movdqu STATE0_HI, (1 * 16)(%rdi)
movdqu STATE1_LO, (2 * 16)(%rdi)
movdqu STATE1_HI, (3 * 16)(%rdi)
movdqu STATE2_LO, (4 * 16)(%rdi)
movdqu STATE2_HI, (5 * 16)(%rdi)
movdqu STATE3_LO, (6 * 16)(%rdi)
movdqu STATE3_HI, (7 * 16)(%rdi)
movdqu STATE4_LO, (8 * 16)(%rdi)
movdqu STATE4_HI, (9 * 16)(%rdi)
.Lad_out:
FRAME_END
ret
ENDPROC(crypto_morus1280_sse2_ad)
/*
* void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus1280_sse2_enc)
FRAME_BEGIN
cmp $32, %rcx
jb .Lenc_out
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0_LO
movdqu (1 * 16)(%rdi), STATE0_HI
movdqu (2 * 16)(%rdi), STATE1_LO
movdqu (3 * 16)(%rdi), STATE1_HI
movdqu (4 * 16)(%rdi), STATE2_LO
movdqu (5 * 16)(%rdi), STATE2_HI
movdqu (6 * 16)(%rdi), STATE3_LO
movdqu (7 * 16)(%rdi), STATE3_HI
movdqu (8 * 16)(%rdi), STATE4_LO
movdqu (9 * 16)(%rdi), STATE4_HI
mov %rsi, %r8
or %rdx, %r8
and $0xF, %r8
jnz .Lenc_u_loop
.align 4
.Lenc_a_loop:
movdqa 0(%rsi), MSG_LO
movdqa 16(%rsi), MSG_HI
movdqa STATE1_LO, T1_LO
movdqa STATE1_HI, T1_HI
rol3 T1_HI, T1_LO
movdqa MSG_LO, T0_LO
movdqa MSG_HI, T0_HI
pxor T1_LO, T0_LO
pxor T1_HI, T0_HI
pxor STATE0_LO, T0_LO
pxor STATE0_HI, T0_HI
movdqa STATE2_LO, T1_LO
movdqa STATE2_HI, T1_HI
pand STATE3_LO, T1_LO
pand STATE3_HI, T1_HI
pxor T1_LO, T0_LO
pxor T1_HI, T0_HI
movdqa T0_LO, 0(%rdx)
movdqa T0_HI, 16(%rdx)
call __morus1280_update
sub $32, %rcx
add $32, %rsi
add $32, %rdx
cmp $32, %rcx
jge .Lenc_a_loop
jmp .Lenc_cont
.align 4
.Lenc_u_loop:
movdqu 0(%rsi), MSG_LO
movdqu 16(%rsi), MSG_HI
movdqa STATE1_LO, T1_LO
movdqa STATE1_HI, T1_HI
rol3 T1_HI, T1_LO
movdqa MSG_LO, T0_LO
movdqa MSG_HI, T0_HI
pxor T1_LO, T0_LO
pxor T1_HI, T0_HI
pxor STATE0_LO, T0_LO
pxor STATE0_HI, T0_HI
movdqa STATE2_LO, T1_LO
movdqa STATE2_HI, T1_HI
pand STATE3_LO, T1_LO
pand STATE3_HI, T1_HI
pxor T1_LO, T0_LO
pxor T1_HI, T0_HI
movdqu T0_LO, 0(%rdx)
movdqu T0_HI, 16(%rdx)
call __morus1280_update
sub $32, %rcx
add $32, %rsi
add $32, %rdx
cmp $32, %rcx
jge .Lenc_u_loop
.Lenc_cont:
/* store the state: */
movdqu STATE0_LO, (0 * 16)(%rdi)
movdqu STATE0_HI, (1 * 16)(%rdi)
movdqu STATE1_LO, (2 * 16)(%rdi)
movdqu STATE1_HI, (3 * 16)(%rdi)
movdqu STATE2_LO, (4 * 16)(%rdi)
movdqu STATE2_HI, (5 * 16)(%rdi)
movdqu STATE3_LO, (6 * 16)(%rdi)
movdqu STATE3_HI, (7 * 16)(%rdi)
movdqu STATE4_LO, (8 * 16)(%rdi)
movdqu STATE4_HI, (9 * 16)(%rdi)
.Lenc_out:
FRAME_END
ret
ENDPROC(crypto_morus1280_sse2_enc)
/*
* void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus1280_sse2_enc_tail)
FRAME_BEGIN
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0_LO
movdqu (1 * 16)(%rdi), STATE0_HI
movdqu (2 * 16)(%rdi), STATE1_LO
movdqu (3 * 16)(%rdi), STATE1_HI
movdqu (4 * 16)(%rdi), STATE2_LO
movdqu (5 * 16)(%rdi), STATE2_HI
movdqu (6 * 16)(%rdi), STATE3_LO
movdqu (7 * 16)(%rdi), STATE3_HI
movdqu (8 * 16)(%rdi), STATE4_LO
movdqu (9 * 16)(%rdi), STATE4_HI
/* encrypt message: */
call __load_partial
movdqa STATE1_LO, T1_LO
movdqa STATE1_HI, T1_HI
rol3 T1_HI, T1_LO
movdqa MSG_LO, T0_LO
movdqa MSG_HI, T0_HI
pxor T1_LO, T0_LO
pxor T1_HI, T0_HI
pxor STATE0_LO, T0_LO
pxor STATE0_HI, T0_HI
movdqa STATE2_LO, T1_LO
movdqa STATE2_HI, T1_HI
pand STATE3_LO, T1_LO
pand STATE3_HI, T1_HI
pxor T1_LO, T0_LO
pxor T1_HI, T0_HI
call __store_partial
call __morus1280_update
/* store the state: */
movdqu STATE0_LO, (0 * 16)(%rdi)
movdqu STATE0_HI, (1 * 16)(%rdi)
movdqu STATE1_LO, (2 * 16)(%rdi)
movdqu STATE1_HI, (3 * 16)(%rdi)
movdqu STATE2_LO, (4 * 16)(%rdi)
movdqu STATE2_HI, (5 * 16)(%rdi)
movdqu STATE3_LO, (6 * 16)(%rdi)
movdqu STATE3_HI, (7 * 16)(%rdi)
movdqu STATE4_LO, (8 * 16)(%rdi)
movdqu STATE4_HI, (9 * 16)(%rdi)
FRAME_END
ENDPROC(crypto_morus1280_sse2_enc_tail)
/*
* void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus1280_sse2_dec)
FRAME_BEGIN
cmp $32, %rcx
jb .Ldec_out
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0_LO
movdqu (1 * 16)(%rdi), STATE0_HI
movdqu (2 * 16)(%rdi), STATE1_LO
movdqu (3 * 16)(%rdi), STATE1_HI
movdqu (4 * 16)(%rdi), STATE2_LO
movdqu (5 * 16)(%rdi), STATE2_HI
movdqu (6 * 16)(%rdi), STATE3_LO
movdqu (7 * 16)(%rdi), STATE3_HI
movdqu (8 * 16)(%rdi), STATE4_LO
movdqu (9 * 16)(%rdi), STATE4_HI
mov %rsi, %r8
or %rdx, %r8
and $0xF, %r8
jnz .Ldec_u_loop
.align 4
.Ldec_a_loop:
movdqa 0(%rsi), MSG_LO
movdqa 16(%rsi), MSG_HI
pxor STATE0_LO, MSG_LO
pxor STATE0_HI, MSG_HI
movdqa STATE1_LO, T1_LO
movdqa STATE1_HI, T1_HI
rol3 T1_HI, T1_LO
pxor T1_LO, MSG_LO
pxor T1_HI, MSG_HI
movdqa STATE2_LO, T1_LO
movdqa STATE2_HI, T1_HI
pand STATE3_LO, T1_LO
pand STATE3_HI, T1_HI
pxor T1_LO, MSG_LO
pxor T1_HI, MSG_HI
movdqa MSG_LO, 0(%rdx)
movdqa MSG_HI, 16(%rdx)
call __morus1280_update
sub $32, %rcx
add $32, %rsi
add $32, %rdx
cmp $32, %rcx
jge .Ldec_a_loop
jmp .Ldec_cont
.align 4
.Ldec_u_loop:
movdqu 0(%rsi), MSG_LO
movdqu 16(%rsi), MSG_HI
pxor STATE0_LO, MSG_LO
pxor STATE0_HI, MSG_HI
movdqa STATE1_LO, T1_LO
movdqa STATE1_HI, T1_HI
rol3 T1_HI, T1_LO
pxor T1_LO, MSG_LO
pxor T1_HI, MSG_HI
movdqa STATE2_LO, T1_LO
movdqa STATE2_HI, T1_HI
pand STATE3_LO, T1_LO
pand STATE3_HI, T1_HI
pxor T1_LO, MSG_LO
pxor T1_HI, MSG_HI
movdqu MSG_LO, 0(%rdx)
movdqu MSG_HI, 16(%rdx)
call __morus1280_update
sub $32, %rcx
add $32, %rsi
add $32, %rdx
cmp $32, %rcx
jge .Ldec_u_loop
.Ldec_cont:
/* store the state: */
movdqu STATE0_LO, (0 * 16)(%rdi)
movdqu STATE0_HI, (1 * 16)(%rdi)
movdqu STATE1_LO, (2 * 16)(%rdi)
movdqu STATE1_HI, (3 * 16)(%rdi)
movdqu STATE2_LO, (4 * 16)(%rdi)
movdqu STATE2_HI, (5 * 16)(%rdi)
movdqu STATE3_LO, (6 * 16)(%rdi)
movdqu STATE3_HI, (7 * 16)(%rdi)
movdqu STATE4_LO, (8 * 16)(%rdi)
movdqu STATE4_HI, (9 * 16)(%rdi)
.Ldec_out:
FRAME_END
ret
ENDPROC(crypto_morus1280_sse2_dec)
/*
* void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus1280_sse2_dec_tail)
FRAME_BEGIN
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0_LO
movdqu (1 * 16)(%rdi), STATE0_HI
movdqu (2 * 16)(%rdi), STATE1_LO
movdqu (3 * 16)(%rdi), STATE1_HI
movdqu (4 * 16)(%rdi), STATE2_LO
movdqu (5 * 16)(%rdi), STATE2_HI
movdqu (6 * 16)(%rdi), STATE3_LO
movdqu (7 * 16)(%rdi), STATE3_HI
movdqu (8 * 16)(%rdi), STATE4_LO
movdqu (9 * 16)(%rdi), STATE4_HI
/* decrypt message: */
call __load_partial
pxor STATE0_LO, MSG_LO
pxor STATE0_HI, MSG_HI
movdqa STATE1_LO, T1_LO
movdqa STATE1_HI, T1_HI
rol3 T1_HI, T1_LO
pxor T1_LO, MSG_LO
pxor T1_HI, MSG_HI
movdqa STATE2_LO, T1_LO
movdqa STATE2_HI, T1_HI
pand STATE3_LO, T1_LO
pand STATE3_HI, T1_HI
pxor T1_LO, MSG_LO
pxor T1_HI, MSG_HI
movdqa MSG_LO, T0_LO
movdqa MSG_HI, T0_HI
call __store_partial
/* mask with byte count: */
movq %rcx, T0_LO
punpcklbw T0_LO, T0_LO
punpcklbw T0_LO, T0_LO
punpcklbw T0_LO, T0_LO
punpcklbw T0_LO, T0_LO
movdqa T0_LO, T0_HI
movdqa .Lmorus640_counter_0, T1_LO
movdqa .Lmorus640_counter_1, T1_HI
pcmpgtb T1_LO, T0_LO
pcmpgtb T1_HI, T0_HI
pand T0_LO, MSG_LO
pand T0_HI, MSG_HI
call __morus1280_update
/* store the state: */
movdqu STATE0_LO, (0 * 16)(%rdi)
movdqu STATE0_HI, (1 * 16)(%rdi)
movdqu STATE1_LO, (2 * 16)(%rdi)
movdqu STATE1_HI, (3 * 16)(%rdi)
movdqu STATE2_LO, (4 * 16)(%rdi)
movdqu STATE2_HI, (5 * 16)(%rdi)
movdqu STATE3_LO, (6 * 16)(%rdi)
movdqu STATE3_HI, (7 * 16)(%rdi)
movdqu STATE4_LO, (8 * 16)(%rdi)
movdqu STATE4_HI, (9 * 16)(%rdi)
FRAME_END
ret
ENDPROC(crypto_morus1280_sse2_dec_tail)
/*
* void crypto_morus1280_sse2_final(void *state, void *tag_xor,
* u64 assoclen, u64 cryptlen);
*/
ENTRY(crypto_morus1280_sse2_final)
FRAME_BEGIN
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0_LO
movdqu (1 * 16)(%rdi), STATE0_HI
movdqu (2 * 16)(%rdi), STATE1_LO
movdqu (3 * 16)(%rdi), STATE1_HI
movdqu (4 * 16)(%rdi), STATE2_LO
movdqu (5 * 16)(%rdi), STATE2_HI
movdqu (6 * 16)(%rdi), STATE3_LO
movdqu (7 * 16)(%rdi), STATE3_HI
movdqu (8 * 16)(%rdi), STATE4_LO
movdqu (9 * 16)(%rdi), STATE4_HI
/* xor state[0] into state[4]: */
pxor STATE0_LO, STATE4_LO
pxor STATE0_HI, STATE4_HI
/* prepare length block: */
movq %rdx, MSG_LO
movq %rcx, T0_LO
pslldq $8, T0_LO
pxor T0_LO, MSG_LO
psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
pxor MSG_HI, MSG_HI
/* update state: */
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
call __morus1280_update
/* xor tag: */
movdqu 0(%rsi), MSG_LO
movdqu 16(%rsi), MSG_HI
pxor STATE0_LO, MSG_LO
pxor STATE0_HI, MSG_HI
movdqa STATE1_LO, T0_LO
movdqa STATE1_HI, T0_HI
rol3 T0_HI, T0_LO
pxor T0_LO, MSG_LO
pxor T0_HI, MSG_HI
movdqa STATE2_LO, T0_LO
movdqa STATE2_HI, T0_HI
pand STATE3_LO, T0_LO
pand STATE3_HI, T0_HI
pxor T0_LO, MSG_LO
pxor T0_HI, MSG_HI
movdqu MSG_LO, 0(%rsi)
movdqu MSG_HI, 16(%rsi)
FRAME_END
ret
ENDPROC(crypto_morus1280_sse2_final)

Просмотреть файл

@ -0,0 +1,68 @@
/*
* The MORUS-1280 Authenticated-Encryption Algorithm
* Glue for SSE2 implementation
*
* Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*/
#include <crypto/internal/aead.h>
#include <crypto/morus1280_glue.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/cpu_device_id.h>
asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key,
const void *iv);
asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data,
unsigned int length);
asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor,
u64 assoclen, u64 cryptlen);
MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350);
static const struct x86_cpu_id sse2_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_XMM2),
{}
};
MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id);
static int __init crypto_morus1280_sse2_module_init(void)
{
if (!x86_match_cpu(sse2_cpu_id))
return -ENODEV;
return crypto_register_aeads(crypto_morus1280_sse2_algs,
ARRAY_SIZE(crypto_morus1280_sse2_algs));
}
static void __exit crypto_morus1280_sse2_module_exit(void)
{
crypto_unregister_aeads(crypto_morus1280_sse2_algs,
ARRAY_SIZE(crypto_morus1280_sse2_algs));
}
module_init(crypto_morus1280_sse2_module_init);
module_exit(crypto_morus1280_sse2_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation");
MODULE_ALIAS_CRYPTO("morus1280");
MODULE_ALIAS_CRYPTO("morus1280-sse2");

Просмотреть файл

@ -0,0 +1,614 @@
/*
* SSE2 implementation of MORUS-640
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/frame.h>
#define SHUFFLE_MASK(i0, i1, i2, i3) \
(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
#define STATE0 %xmm0
#define STATE1 %xmm1
#define STATE2 %xmm2
#define STATE3 %xmm3
#define STATE4 %xmm4
#define KEY %xmm5
#define MSG %xmm5
#define T0 %xmm6
#define T1 %xmm7
.section .rodata.cst16.morus640_const, "aM", @progbits, 32
.align 16
.Lmorus640_const_0:
.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Lmorus640_const_1:
.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
.align 16
.Lmorus640_counter:
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.text
.macro morus640_round s0, s1, s2, s3, s4, b, w
movdqa \s1, T0
pand \s2, T0
pxor T0, \s0
pxor \s3, \s0
movdqa \s0, T0
pslld $\b, T0
psrld $(32 - \b), \s0
pxor T0, \s0
pshufd $\w, \s3, \s3
.endm
/*
* __morus640_update: internal ABI
* input:
* STATE[0-4] - input state
* MSG - message block
* output:
* STATE[0-4] - output state
* changed:
* T0
*/
__morus640_update:
morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1
pxor MSG, STATE1
morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
pxor MSG, STATE2
morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3
pxor MSG, STATE3
morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
pxor MSG, STATE4
morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
ret
ENDPROC(__morus640_update)
/*
* __morus640_update_zero: internal ABI
* input:
* STATE[0-4] - input state
* output:
* STATE[0-4] - output state
* changed:
* T0
*/
__morus640_update_zero:
morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1
morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3
morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
ret
ENDPROC(__morus640_update_zero)
/*
* __load_partial: internal ABI
* input:
* %rsi - src
* %rcx - bytes
* output:
* MSG - message block
* changed:
* T0
* %r8
* %r9
*/
__load_partial:
xor %r9, %r9
pxor MSG, MSG
mov %rcx, %r8
and $0x1, %r8
jz .Lld_partial_1
mov %rcx, %r8
and $0x1E, %r8
add %rsi, %r8
mov (%r8), %r9b
.Lld_partial_1:
mov %rcx, %r8
and $0x2, %r8
jz .Lld_partial_2
mov %rcx, %r8
and $0x1C, %r8
add %rsi, %r8
shl $16, %r9
mov (%r8), %r9w
.Lld_partial_2:
mov %rcx, %r8
and $0x4, %r8
jz .Lld_partial_4
mov %rcx, %r8
and $0x18, %r8
add %rsi, %r8
shl $32, %r9
mov (%r8), %r8d
xor %r8, %r9
.Lld_partial_4:
movq %r9, MSG
mov %rcx, %r8
and $0x8, %r8
jz .Lld_partial_8
mov %rcx, %r8
and $0x10, %r8
add %rsi, %r8
pslldq $8, MSG
movq (%r8), T0
pxor T0, MSG
.Lld_partial_8:
ret
ENDPROC(__load_partial)
/*
* __store_partial: internal ABI
* input:
* %rdx - dst
* %rcx - bytes
* output:
* T0 - message block
* changed:
* %r8
* %r9
* %r10
*/
__store_partial:
mov %rcx, %r8
mov %rdx, %r9
movq T0, %r10
cmp $8, %r8
jl .Lst_partial_8
mov %r10, (%r9)
psrldq $8, T0
movq T0, %r10
sub $8, %r8
add $8, %r9
.Lst_partial_8:
cmp $4, %r8
jl .Lst_partial_4
mov %r10d, (%r9)
shr $32, %r10
sub $4, %r8
add $4, %r9
.Lst_partial_4:
cmp $2, %r8
jl .Lst_partial_2
mov %r10w, (%r9)
shr $16, %r10
sub $2, %r8
add $2, %r9
.Lst_partial_2:
cmp $1, %r8
jl .Lst_partial_1
mov %r10b, (%r9)
.Lst_partial_1:
ret
ENDPROC(__store_partial)
/*
* void crypto_morus640_sse2_init(void *state, const void *key, const void *iv);
*/
ENTRY(crypto_morus640_sse2_init)
FRAME_BEGIN
/* load IV: */
movdqu (%rdx), STATE0
/* load key: */
movdqu (%rsi), KEY
movdqa KEY, STATE1
/* load all ones: */
pcmpeqd STATE2, STATE2
/* load the constants: */
movdqa .Lmorus640_const_0, STATE3
movdqa .Lmorus640_const_1, STATE4
/* update 16 times with zero: */
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
call __morus640_update_zero
/* xor-in the key again after updates: */
pxor KEY, STATE1
/* store the state: */
movdqu STATE0, (0 * 16)(%rdi)
movdqu STATE1, (1 * 16)(%rdi)
movdqu STATE2, (2 * 16)(%rdi)
movdqu STATE3, (3 * 16)(%rdi)
movdqu STATE4, (4 * 16)(%rdi)
FRAME_END
ret
ENDPROC(crypto_morus640_sse2_init)
/*
* void crypto_morus640_sse2_ad(void *state, const void *data,
* unsigned int length);
*/
ENTRY(crypto_morus640_sse2_ad)
FRAME_BEGIN
cmp $16, %rdx
jb .Lad_out
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0
movdqu (1 * 16)(%rdi), STATE1
movdqu (2 * 16)(%rdi), STATE2
movdqu (3 * 16)(%rdi), STATE3
movdqu (4 * 16)(%rdi), STATE4
mov %rsi, %r8
and $0xF, %r8
jnz .Lad_u_loop
.align 4
.Lad_a_loop:
movdqa (%rsi), MSG
call __morus640_update
sub $16, %rdx
add $16, %rsi
cmp $16, %rdx
jge .Lad_a_loop
jmp .Lad_cont
.align 4
.Lad_u_loop:
movdqu (%rsi), MSG
call __morus640_update
sub $16, %rdx
add $16, %rsi
cmp $16, %rdx
jge .Lad_u_loop
.Lad_cont:
/* store the state: */
movdqu STATE0, (0 * 16)(%rdi)
movdqu STATE1, (1 * 16)(%rdi)
movdqu STATE2, (2 * 16)(%rdi)
movdqu STATE3, (3 * 16)(%rdi)
movdqu STATE4, (4 * 16)(%rdi)
.Lad_out:
FRAME_END
ret
ENDPROC(crypto_morus640_sse2_ad)
/*
* void crypto_morus640_sse2_enc(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus640_sse2_enc)
FRAME_BEGIN
cmp $16, %rcx
jb .Lenc_out
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0
movdqu (1 * 16)(%rdi), STATE1
movdqu (2 * 16)(%rdi), STATE2
movdqu (3 * 16)(%rdi), STATE3
movdqu (4 * 16)(%rdi), STATE4
mov %rsi, %r8
or %rdx, %r8
and $0xF, %r8
jnz .Lenc_u_loop
.align 4
.Lenc_a_loop:
movdqa (%rsi), MSG
movdqa MSG, T0
pxor STATE0, T0
pshufd $MASK3, STATE1, T1
pxor T1, T0
movdqa STATE2, T1
pand STATE3, T1
pxor T1, T0
movdqa T0, (%rdx)
call __morus640_update
sub $16, %rcx
add $16, %rsi
add $16, %rdx
cmp $16, %rcx
jge .Lenc_a_loop
jmp .Lenc_cont
.align 4
.Lenc_u_loop:
movdqu (%rsi), MSG
movdqa MSG, T0
pxor STATE0, T0
pshufd $MASK3, STATE1, T1
pxor T1, T0
movdqa STATE2, T1
pand STATE3, T1
pxor T1, T0
movdqu T0, (%rdx)
call __morus640_update
sub $16, %rcx
add $16, %rsi
add $16, %rdx
cmp $16, %rcx
jge .Lenc_u_loop
.Lenc_cont:
/* store the state: */
movdqu STATE0, (0 * 16)(%rdi)
movdqu STATE1, (1 * 16)(%rdi)
movdqu STATE2, (2 * 16)(%rdi)
movdqu STATE3, (3 * 16)(%rdi)
movdqu STATE4, (4 * 16)(%rdi)
.Lenc_out:
FRAME_END
ret
ENDPROC(crypto_morus640_sse2_enc)
/*
* void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus640_sse2_enc_tail)
FRAME_BEGIN
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0
movdqu (1 * 16)(%rdi), STATE1
movdqu (2 * 16)(%rdi), STATE2
movdqu (3 * 16)(%rdi), STATE3
movdqu (4 * 16)(%rdi), STATE4
/* encrypt message: */
call __load_partial
movdqa MSG, T0
pxor STATE0, T0
pshufd $MASK3, STATE1, T1
pxor T1, T0
movdqa STATE2, T1
pand STATE3, T1
pxor T1, T0
call __store_partial
call __morus640_update
/* store the state: */
movdqu STATE0, (0 * 16)(%rdi)
movdqu STATE1, (1 * 16)(%rdi)
movdqu STATE2, (2 * 16)(%rdi)
movdqu STATE3, (3 * 16)(%rdi)
movdqu STATE4, (4 * 16)(%rdi)
FRAME_END
ENDPROC(crypto_morus640_sse2_enc_tail)
/*
* void crypto_morus640_sse2_dec(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus640_sse2_dec)
FRAME_BEGIN
cmp $16, %rcx
jb .Ldec_out
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0
movdqu (1 * 16)(%rdi), STATE1
movdqu (2 * 16)(%rdi), STATE2
movdqu (3 * 16)(%rdi), STATE3
movdqu (4 * 16)(%rdi), STATE4
mov %rsi, %r8
or %rdx, %r8
and $0xF, %r8
jnz .Ldec_u_loop
.align 4
.Ldec_a_loop:
movdqa (%rsi), MSG
pxor STATE0, MSG
pshufd $MASK3, STATE1, T0
pxor T0, MSG
movdqa STATE2, T0
pand STATE3, T0
pxor T0, MSG
movdqa MSG, (%rdx)
call __morus640_update
sub $16, %rcx
add $16, %rsi
add $16, %rdx
cmp $16, %rcx
jge .Ldec_a_loop
jmp .Ldec_cont
.align 4
.Ldec_u_loop:
movdqu (%rsi), MSG
pxor STATE0, MSG
pshufd $MASK3, STATE1, T0
pxor T0, MSG
movdqa STATE2, T0
pand STATE3, T0
pxor T0, MSG
movdqu MSG, (%rdx)
call __morus640_update
sub $16, %rcx
add $16, %rsi
add $16, %rdx
cmp $16, %rcx
jge .Ldec_u_loop
.Ldec_cont:
/* store the state: */
movdqu STATE0, (0 * 16)(%rdi)
movdqu STATE1, (1 * 16)(%rdi)
movdqu STATE2, (2 * 16)(%rdi)
movdqu STATE3, (3 * 16)(%rdi)
movdqu STATE4, (4 * 16)(%rdi)
.Ldec_out:
FRAME_END
ret
ENDPROC(crypto_morus640_sse2_dec)
/*
* void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst,
* unsigned int length);
*/
ENTRY(crypto_morus640_sse2_dec_tail)
FRAME_BEGIN
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0
movdqu (1 * 16)(%rdi), STATE1
movdqu (2 * 16)(%rdi), STATE2
movdqu (3 * 16)(%rdi), STATE3
movdqu (4 * 16)(%rdi), STATE4
/* decrypt message: */
call __load_partial
pxor STATE0, MSG
pshufd $MASK3, STATE1, T0
pxor T0, MSG
movdqa STATE2, T0
pand STATE3, T0
pxor T0, MSG
movdqa MSG, T0
call __store_partial
/* mask with byte count: */
movq %rcx, T0
punpcklbw T0, T0
punpcklbw T0, T0
punpcklbw T0, T0
punpcklbw T0, T0
movdqa .Lmorus640_counter, T1
pcmpgtb T1, T0
pand T0, MSG
call __morus640_update
/* store the state: */
movdqu STATE0, (0 * 16)(%rdi)
movdqu STATE1, (1 * 16)(%rdi)
movdqu STATE2, (2 * 16)(%rdi)
movdqu STATE3, (3 * 16)(%rdi)
movdqu STATE4, (4 * 16)(%rdi)
FRAME_END
ret
ENDPROC(crypto_morus640_sse2_dec_tail)
/*
* void crypto_morus640_sse2_final(void *state, void *tag_xor,
* u64 assoclen, u64 cryptlen);
*/
ENTRY(crypto_morus640_sse2_final)
FRAME_BEGIN
/* load the state: */
movdqu (0 * 16)(%rdi), STATE0
movdqu (1 * 16)(%rdi), STATE1
movdqu (2 * 16)(%rdi), STATE2
movdqu (3 * 16)(%rdi), STATE3
movdqu (4 * 16)(%rdi), STATE4
/* xor state[0] into state[4]: */
pxor STATE0, STATE4
/* prepare length block: */
movq %rdx, MSG
movq %rcx, T0
pslldq $8, T0
pxor T0, MSG
psllq $3, MSG /* multiply by 8 (to get bit count) */
/* update state: */
call __morus640_update
call __morus640_update
call __morus640_update
call __morus640_update
call __morus640_update
call __morus640_update
call __morus640_update
call __morus640_update
call __morus640_update
call __morus640_update
/* xor tag: */
movdqu (%rsi), MSG
pxor STATE0, MSG
pshufd $MASK3, STATE1, T0
pxor T0, MSG
movdqa STATE2, T0
pand STATE3, T0
pxor T0, MSG
movdqu MSG, (%rsi)
FRAME_END
ret
ENDPROC(crypto_morus640_sse2_final)

Просмотреть файл

@ -0,0 +1,68 @@
/*
* The MORUS-640 Authenticated-Encryption Algorithm
* Glue for SSE2 implementation
*
* Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*/
#include <crypto/internal/aead.h>
#include <crypto/morus640_glue.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/cpu_device_id.h>
asmlinkage void crypto_morus640_sse2_init(void *state, const void *key,
const void *iv);
asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data,
unsigned int length);
asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src,
void *dst, unsigned int length);
asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor,
u64 assoclen, u64 cryptlen);
MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400);
static const struct x86_cpu_id sse2_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_XMM2),
{}
};
MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id);
static int __init crypto_morus640_sse2_module_init(void)
{
if (!x86_match_cpu(sse2_cpu_id))
return -ENODEV;
return crypto_register_aeads(crypto_morus640_sse2_algs,
ARRAY_SIZE(crypto_morus640_sse2_algs));
}
static void __exit crypto_morus640_sse2_module_exit(void)
{
crypto_unregister_aeads(crypto_morus640_sse2_algs,
ARRAY_SIZE(crypto_morus640_sse2_algs));
}
module_init(crypto_morus640_sse2_module_init);
module_exit(crypto_morus640_sse2_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation");
MODULE_ALIAS_CRYPTO("morus640");
MODULE_ALIAS_CRYPTO("morus640-sse2");

Просмотреть файл

@ -348,6 +348,14 @@ config CRYPTO_MORUS640_GLUE
Common glue for SIMD optimizations of the MORUS-640 dedicated AEAD Common glue for SIMD optimizations of the MORUS-640 dedicated AEAD
algorithm. algorithm.
config CRYPTO_MORUS640_SSE2
tristate "MORUS-640 AEAD algorithm (x86_64 SSE2 implementation)"
depends on X86 && 64BIT
select CRYPTO_AEAD
select CRYPTO_MORUS640_GLUE
help
SSE2 implementation of the MORUS-640 dedicated AEAD algorithm.
config CRYPTO_MORUS1280 config CRYPTO_MORUS1280
tristate "MORUS-1280 AEAD algorithm" tristate "MORUS-1280 AEAD algorithm"
select CRYPTO_AEAD select CRYPTO_AEAD
@ -362,6 +370,24 @@ config CRYPTO_MORUS1280_GLUE
Common glue for SIMD optimizations of the MORUS-1280 dedicated AEAD Common glue for SIMD optimizations of the MORUS-1280 dedicated AEAD
algorithm. algorithm.
config CRYPTO_MORUS1280_SSE2
tristate "MORUS-1280 AEAD algorithm (x86_64 SSE2 implementation)"
depends on X86 && 64BIT
select CRYPTO_AEAD
select CRYPTO_MORUS1280_GLUE
help
SSE2 optimizedimplementation of the MORUS-1280 dedicated AEAD
algorithm.
config CRYPTO_MORUS1280_AVX2
tristate "MORUS-1280 AEAD algorithm (x86_64 AVX2 implementation)"
depends on X86 && 64BIT
select CRYPTO_AEAD
select CRYPTO_MORUS1280_GLUE
help
AVX2 optimized implementation of the MORUS-1280 dedicated AEAD
algorithm.
config CRYPTO_SEQIV config CRYPTO_SEQIV
tristate "Sequence Number IV Generator" tristate "Sequence Number IV Generator"
select CRYPTO_AEAD select CRYPTO_AEAD