arm64/crypto: improve performance of GHASH algorithm
This patches modifies the GHASH secure hash implementation to switch to a faster, polynomial multiplication based reduction instead of one that uses shifts and rotates. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
This commit is contained in:
Родитель
6aa8b209f5
Коммит
b913a6404c
|
@ -3,14 +3,6 @@
|
|||
*
|
||||
* Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
|
||||
*
|
||||
* Copyright (c) 2009 Intel Corp.
|
||||
* Author: Huang Ying <ying.huang@intel.com>
|
||||
* Vinodh Gopal
|
||||
* Erdinc Ozturk
|
||||
* Deniz Karakoyunlu
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published
|
||||
* by the Free Software Foundation.
|
||||
|
@ -19,13 +11,15 @@
|
|||
#include <linux/linkage.h>
|
||||
#include <asm/assembler.h>
|
||||
|
||||
DATA .req v0
|
||||
SHASH .req v1
|
||||
IN1 .req v2
|
||||
SHASH .req v0
|
||||
SHASH2 .req v1
|
||||
T1 .req v2
|
||||
T2 .req v3
|
||||
T3 .req v4
|
||||
VZR .req v5
|
||||
MASK .req v4
|
||||
XL .req v5
|
||||
XM .req v6
|
||||
XH .req v7
|
||||
IN1 .req v7
|
||||
|
||||
.text
|
||||
.arch armv8-a+crypto
|
||||
|
@ -35,61 +29,51 @@
|
|||
* struct ghash_key const *k, const char *head)
|
||||
*/
|
||||
ENTRY(pmull_ghash_update)
|
||||
ld1 {DATA.16b}, [x1]
|
||||
ld1 {SHASH.16b}, [x3]
|
||||
eor VZR.16b, VZR.16b, VZR.16b
|
||||
ld1 {XL.16b}, [x1]
|
||||
movi MASK.16b, #0xe1
|
||||
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
|
||||
shl MASK.2d, MASK.2d, #57
|
||||
eor SHASH2.16b, SHASH2.16b, SHASH.16b
|
||||
|
||||
/* do the head block first, if supplied */
|
||||
cbz x4, 0f
|
||||
ld1 {IN1.2d}, [x4]
|
||||
ld1 {T1.2d}, [x4]
|
||||
b 1f
|
||||
|
||||
0: ld1 {IN1.2d}, [x2], #16
|
||||
0: ld1 {T1.2d}, [x2], #16
|
||||
sub w0, w0, #1
|
||||
1: ext IN1.16b, IN1.16b, IN1.16b, #8
|
||||
CPU_LE( rev64 IN1.16b, IN1.16b )
|
||||
eor DATA.16b, DATA.16b, IN1.16b
|
||||
|
||||
/* multiply DATA by SHASH in GF(2^128) */
|
||||
ext T2.16b, DATA.16b, DATA.16b, #8
|
||||
ext T3.16b, SHASH.16b, SHASH.16b, #8
|
||||
eor T2.16b, T2.16b, DATA.16b
|
||||
eor T3.16b, T3.16b, SHASH.16b
|
||||
1: /* multiply XL by SHASH in GF(2^128) */
|
||||
CPU_LE( rev64 T1.16b, T1.16b )
|
||||
|
||||
pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1
|
||||
pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0
|
||||
pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0)
|
||||
eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0)
|
||||
eor T2.16b, T2.16b, DATA.16b
|
||||
|
||||
ext T3.16b, VZR.16b, T2.16b, #8
|
||||
ext T2.16b, T2.16b, VZR.16b, #8
|
||||
eor DATA.16b, DATA.16b, T3.16b
|
||||
eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of
|
||||
// carry-less multiplication
|
||||
|
||||
/* first phase of the reduction */
|
||||
shl T3.2d, DATA.2d, #1
|
||||
eor T3.16b, T3.16b, DATA.16b
|
||||
shl T3.2d, T3.2d, #5
|
||||
eor T3.16b, T3.16b, DATA.16b
|
||||
shl T3.2d, T3.2d, #57
|
||||
ext T2.16b, VZR.16b, T3.16b, #8
|
||||
ext T3.16b, T3.16b, VZR.16b, #8
|
||||
eor DATA.16b, DATA.16b, T2.16b
|
||||
eor T1.16b, T1.16b, T3.16b
|
||||
|
||||
/* second phase of the reduction */
|
||||
ushr T2.2d, DATA.2d, #5
|
||||
eor T2.16b, T2.16b, DATA.16b
|
||||
ushr T2.2d, T2.2d, #1
|
||||
eor T2.16b, T2.16b, DATA.16b
|
||||
ushr T2.2d, T2.2d, #1
|
||||
ext T2.16b, XL.16b, XL.16b, #8
|
||||
ext IN1.16b, T1.16b, T1.16b, #8
|
||||
eor T1.16b, T1.16b, T2.16b
|
||||
eor DATA.16b, DATA.16b, T1.16b
|
||||
eor XL.16b, XL.16b, IN1.16b
|
||||
|
||||
pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
|
||||
eor T1.16b, T1.16b, XL.16b
|
||||
pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
|
||||
pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
|
||||
|
||||
ext T1.16b, XL.16b, XH.16b, #8
|
||||
eor T2.16b, XL.16b, XH.16b
|
||||
eor XM.16b, XM.16b, T1.16b
|
||||
eor XM.16b, XM.16b, T2.16b
|
||||
pmull T2.1q, XL.1d, MASK.1d
|
||||
|
||||
mov XH.d[0], XM.d[1]
|
||||
mov XM.d[1], XL.d[0]
|
||||
|
||||
eor XL.16b, XM.16b, T2.16b
|
||||
ext T2.16b, XL.16b, XL.16b, #8
|
||||
pmull XL.1q, XL.1d, MASK.1d
|
||||
eor T2.16b, T2.16b, XH.16b
|
||||
eor XL.16b, XL.16b, T2.16b
|
||||
|
||||
cbnz w0, 0b
|
||||
|
||||
st1 {DATA.16b}, [x1]
|
||||
st1 {XL.16b}, [x1]
|
||||
ret
|
||||
ENDPROC(pmull_ghash_update)
|
||||
|
|
|
@ -67,7 +67,7 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
|
|||
blocks = len / GHASH_BLOCK_SIZE;
|
||||
len %= GHASH_BLOCK_SIZE;
|
||||
|
||||
kernel_neon_begin_partial(6);
|
||||
kernel_neon_begin_partial(8);
|
||||
pmull_ghash_update(blocks, ctx->digest, src, key,
|
||||
partial ? ctx->buf : NULL);
|
||||
kernel_neon_end();
|
||||
|
@ -89,7 +89,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
|
|||
|
||||
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
|
||||
|
||||
kernel_neon_begin_partial(6);
|
||||
kernel_neon_begin_partial(8);
|
||||
pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
|
||||
kernel_neon_end();
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче