gecko-dev/security/nss/lib/freebl/intel-gcm.s


# LICENSE:
# This submission to NSS is to be made available under the terms of the
# Mozilla Public License, v. 2.0. You can obtain one at http://mozilla.org/MPL/2.0/.
################################################################################
# Copyright(c) 2012, Intel Corp.
.align 16
.Lone:
.quad 1,0
.Ltwo:
.quad 2,0
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lshuff_mask:
.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.Lpoly:
.quad 0x1, 0xc200000000000000
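# .Lone and .Ltwo are counter-increment constants, .Lbswap_mask reverses the
# byte order of a 16-byte block, .Lshuff_mask is a pshufb control that
# broadcasts byte 15, and .Lpoly holds the constants used to reduce carry-less
# products modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1 (in the
# bit-reflected representation these routines work in).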
################################################################################
# Generates the final GCM tag
# void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG);
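# In outline, this routine folds the bit lengths of the ciphertext and the AAD
# into the running GHASH state, multiplies once more by the stored hash key,
# and XORs the result with the encrypted pre-counter block X0.  Roughly, as a
# C-like sketch (load16/store16/byteswap are illustrative helpers, not part of
# the original):
#   t     = byteswap(load16(Tp));
#   t.lo ^= Mlen * 8;                 // message length in bits (low qword)
#   t.hi ^= Alen * 8;                 // AAD length in bits (high qword)
#   t     = GFMUL(t, load16(Htbl));   // one final GHASH step
#   store16(TAG, byteswap(t) ^ load16(X0));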
.type intel_aes_gcmTAG,@function
.globl intel_aes_gcmTAG
.align 16
intel_aes_gcmTAG:
.set Htbl, %rdi
.set Tp, %rsi
.set Mlen, %rdx
.set Alen, %rcx
.set X0, %r8
.set TAG, %r9
.set T,%xmm0
.set TMP0,%xmm1
vmovdqu (Tp), T
vpshufb .Lbswap_mask(%rip), T, T
vpxor TMP0, TMP0, TMP0
shl $3, Mlen
shl $3, Alen
vpinsrq $0, Mlen, TMP0, TMP0
vpinsrq $1, Alen, TMP0, TMP0
vpxor TMP0, T, T
vmovdqu (Htbl), TMP0
call GFMUL
vpshufb .Lbswap_mask(%rip), T, T
vpxor (X0), T, T
vmovdqu T, (TAG)
ret
.size intel_aes_gcmTAG, .-intel_aes_gcmTAG
################################################################################
# Generates the H table
# void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);
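# The hash key is H = AES_K(0^128): loading round key 0 into T is the initial
# AddRoundKey of an all-zero block, and the vaesenc chain below finishes the
# encryption.  Htbl layout produced here: offsets 0..112 hold eight
# consecutive powers of H (in the shifted representation computed below);
# offsets 128..240 hold, for each power, the XOR of its high and low 64-bit
# halves, which the Karatsuba multiplications in the other routines consume.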
.type intel_aes_gcmINIT,@function
.globl intel_aes_gcmINIT
.align 16
intel_aes_gcmINIT:
.set Htbl, %rdi
.set KS, %rsi
.set NR, %edx
.set T,%xmm0
.set TMP0,%xmm1
CALCULATE_POWERS_OF_H:
vmovdqu 16*0(KS), T
vaesenc 16*1(KS), T, T
vaesenc 16*2(KS), T, T
vaesenc 16*3(KS), T, T
vaesenc 16*4(KS), T, T
vaesenc 16*5(KS), T, T
vaesenc 16*6(KS), T, T
vaesenc 16*7(KS), T, T
vaesenc 16*8(KS), T, T
vaesenc 16*9(KS), T, T
vmovdqu 16*10(KS), TMP0
cmp $10, NR
je .LH0done
vaesenc 16*10(KS), T, T
vaesenc 16*11(KS), T, T
vmovdqu 16*12(KS), TMP0
cmp $12, NR
je .LH0done
vaesenc 16*12(KS), T, T
vaesenc 16*13(KS), T, T
vmovdqu 16*14(KS), TMP0
.LH0done:
vaesenclast TMP0, T, T
vpshufb .Lbswap_mask(%rip), T, T
vmovdqu T, TMP0
# Calculate H' = GFMUL(H, 2)
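# The sequence below shifts H left by one bit as a 128-bit value and, if the
# bit shifted out of the top was set, XORs in the reduction constants from
# .Lpoly.  The vpshufb/0xff00 trick builds that all-ones-or-all-zeros mask
# without a branch: byte 15 of (H >> 7 per dword), which is 0x01 or 0x00, is
# broadcast and then used to select either 0xff or 0x00 from the 0xff00
# constant for every byte.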
vpsrld $7 , T , %xmm3
vmovdqu .Lshuff_mask(%rip), %xmm4
vpshufb %xmm4, %xmm3 , %xmm3
movq $0xff00 , %rax
vmovq %rax, %xmm4
vpshufb %xmm3, %xmm4 , %xmm4
vmovdqu .Lpoly(%rip), %xmm5
vpand %xmm4, %xmm5, %xmm5
vpsrld $31, T, %xmm3
vpslld $1, T, %xmm4
vpslldq $4, %xmm3, %xmm3
vpxor %xmm3, %xmm4, T # T now holds p(x)<<1
# fold the conditional reduction term in %xmm5 into p(x)<<1
vpxor %xmm5, T, T
vmovdqu T, TMP0
vmovdqu T, (Htbl) # H * 2
call GFMUL
vmovdqu T, 16(Htbl) # H^2 * 2
call GFMUL
vmovdqu T, 32(Htbl) # H^3 * 2
call GFMUL
vmovdqu T, 48(Htbl) # H^4 * 2
call GFMUL
vmovdqu T, 64(Htbl) # H^5 * 2
call GFMUL
vmovdqu T, 80(Htbl) # H^6 * 2
call GFMUL
vmovdqu T, 96(Htbl) # H^7 * 2
call GFMUL
vmovdqu T, 112(Htbl) # H^8 * 2
# Precalculations for the reduce 4 step
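# For each power stored above, store the XOR of its high and low 64-bit halves
# at Htbl+128 onward.  With these on hand, every per-block Karatsuba multiply
# below needs only three VPCLMULQDQs instead of four.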
vpshufd $78, (Htbl), %xmm8
vpshufd $78, 16(Htbl), %xmm9
vpshufd $78, 32(Htbl), %xmm10
vpshufd $78, 48(Htbl), %xmm11
vpshufd $78, 64(Htbl), %xmm12
vpshufd $78, 80(Htbl), %xmm13
vpshufd $78, 96(Htbl), %xmm14
vpshufd $78, 112(Htbl), %xmm15
vpxor (Htbl), %xmm8, %xmm8
vpxor 16(Htbl), %xmm9, %xmm9
vpxor 32(Htbl), %xmm10, %xmm10
vpxor 48(Htbl), %xmm11, %xmm11
vpxor 64(Htbl), %xmm12, %xmm12
vpxor 80(Htbl), %xmm13, %xmm13
vpxor 96(Htbl), %xmm14, %xmm14
vpxor 112(Htbl), %xmm15, %xmm15
vmovdqu %xmm8, 128(Htbl)
vmovdqu %xmm9, 144(Htbl)
vmovdqu %xmm10, 160(Htbl)
vmovdqu %xmm11, 176(Htbl)
vmovdqu %xmm12, 192(Htbl)
vmovdqu %xmm13, 208(Htbl)
vmovdqu %xmm14, 224(Htbl)
vmovdqu %xmm15, 240(Htbl)
ret
.size intel_aes_gcmINIT, .-intel_aes_gcmINIT
################################################################################
# Authenticate only
# void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
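# Alen is in bytes and is consumed in whole 16-byte blocks.  The data is
# hashed eight blocks per iteration; a prefix of len%128 bytes, if any, is
# hashed first so the main loop always sees full 8-block groups, and the
# reduction of the running accumulator is folded into the same iteration
# (the "reduction stage" comments below).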
.globl intel_aes_gcmAAD
.type intel_aes_gcmAAD,@function
.align 16
intel_aes_gcmAAD:
.set DATA, %xmm0
.set T, %xmm1
.set BSWAP_MASK, %xmm2
.set TMP0, %xmm3
.set TMP1, %xmm4
.set TMP2, %xmm5
.set TMP3, %xmm6
.set TMP4, %xmm7
.set Xhi, %xmm9
.set Htbl, %rdi
.set inp, %rsi
.set len, %rdx
.set Tp, %rcx
.set hlp0, %r11
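# KARATSUBA_AAD i: multiply DATA by the Htbl entry at 16*i using Karatsuba,
# accumulating the low-half products in TMP0, the high-half products in TMP1,
# and the middle terms (built from the precomputed hi^lo values at Htbl+128)
# in TMP2.  The three accumulators are recombined at .Lred1 and at the
# "karatsuba fixup" below.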
.macro KARATSUBA_AAD i
vpclmulqdq $0x00, 16*\i(Htbl), DATA, TMP3
vpxor TMP3, TMP0, TMP0
vpclmulqdq $0x11, 16*\i(Htbl), DATA, TMP3
vpxor TMP3, TMP1, TMP1
vpshufd $78, DATA, TMP3
vpxor DATA, TMP3, TMP3
vpclmulqdq $0x00, 16*(\i+8)(Htbl), TMP3, TMP3
vpxor TMP3, TMP2, TMP2
.endm
test len, len
jnz .LbeginAAD
ret
.LbeginAAD:
push hlp0
vzeroupper
vmovdqa .Lbswap_mask(%rip), BSWAP_MASK
vpxor Xhi, Xhi, Xhi
vmovdqu (Tp),T
vpshufb BSWAP_MASK,T,T
# We hash 8 blocks per iteration; if the total number of blocks is not a multiple of 8, we hash the first n%8 blocks first
mov len, hlp0
and $~-128, hlp0
jz .Lmod_loop
sub hlp0, len
sub $16, hlp0
#hash first prefix block
vmovdqu (inp), DATA
vpshufb BSWAP_MASK, DATA, DATA
vpxor T, DATA, DATA
vpclmulqdq $0x00, (Htbl, hlp0), DATA, TMP0
vpclmulqdq $0x11, (Htbl, hlp0), DATA, TMP1
vpshufd $78, DATA, TMP2
vpxor DATA, TMP2, TMP2
vpclmulqdq $0x00, 16*8(Htbl, hlp0), TMP2, TMP2
lea 16(inp), inp
test hlp0, hlp0
jnz .Lpre_loop
jmp .Lred1
#hash remaining prefix blocks (up to 7 total prefix blocks)
.align 64
.Lpre_loop:
sub $16, hlp0
vmovdqu (inp),DATA # next data block
vpshufb BSWAP_MASK,DATA,DATA
vpclmulqdq $0x00, (Htbl,hlp0), DATA, TMP3
vpxor TMP3, TMP0, TMP0
vpclmulqdq $0x11, (Htbl,hlp0), DATA, TMP3
vpxor TMP3, TMP1, TMP1
vpshufd $78, DATA, TMP3
vpxor DATA, TMP3, TMP3
vpclmulqdq $0x00, 16*8(Htbl,hlp0), TMP3, TMP3
vpxor TMP3, TMP2, TMP2
test hlp0, hlp0
lea 16(inp), inp
jnz .Lpre_loop
.Lred1:
vpxor TMP0, TMP2, TMP2
vpxor TMP1, TMP2, TMP2
vpsrldq $8, TMP2, TMP3
vpslldq $8, TMP2, TMP2
vpxor TMP3, TMP1, Xhi
vpxor TMP2, TMP0, T
.align 64
.Lmod_loop:
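# Main loop: hash eight blocks per pass.  The two-stage reduction of the
# previous accumulator T (the "reduction stage 1/2" steps) is interleaved with
# the new multiplications, which helps hide VPCLMULQDQ latency; Xhi carries
# the high half of the previous 256-bit product until it is folded in.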
sub $0x80, len
jb .Ldone
vmovdqu 16*7(inp),DATA # Ii
vpshufb BSWAP_MASK,DATA,DATA
vpclmulqdq $0x00, (Htbl), DATA, TMP0
vpclmulqdq $0x11, (Htbl), DATA, TMP1
vpshufd $78, DATA, TMP2
vpxor DATA, TMP2, TMP2
vpclmulqdq $0x00, 16*8(Htbl), TMP2, TMP2
#########################################################
vmovdqu 16*6(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA
KARATSUBA_AAD 1
#########################################################
vmovdqu 16*5(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA
vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 1a
vpalignr $8, T, T, T
KARATSUBA_AAD 2
vpxor TMP4, T, T #reduction stage 1b
#########################################################
vmovdqu 16*4(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA
KARATSUBA_AAD 3
#########################################################
vmovdqu 16*3(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA
vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 2a
vpalignr $8, T, T, T
KARATSUBA_AAD 4
vpxor TMP4, T, T #reduction stage 2b
#########################################################
vmovdqu 16*2(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA
KARATSUBA_AAD 5
vpxor Xhi, T, T #reduction finalize
#########################################################
vmovdqu 16*1(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA
KARATSUBA_AAD 6
#########################################################
vmovdqu 16*0(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA
vpxor T,DATA,DATA
KARATSUBA_AAD 7
#########################################################
vpxor TMP0, TMP2, TMP2 # karatsuba fixup
vpxor TMP1, TMP2, TMP2
vpsrldq $8, TMP2, TMP3
vpslldq $8, TMP2, TMP2
vpxor TMP3, TMP1, Xhi
vpxor TMP2, TMP0, T
lea 16*8(inp), inp
jmp .Lmod_loop
#########################################################
.Ldone:
vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
vpalignr $8, T, T, T
vpxor TMP3, T, T
vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
vpalignr $8, T, T, T
vpxor TMP3, T, T
vpxor Xhi, T, T
.Lsave:
vpshufb BSWAP_MASK,T, T
vmovdqu T,(Tp)
vzeroupper
pop hlp0
ret
.size intel_aes_gcmAAD,.-intel_aes_gcmAAD
################################################################################
# Encrypt and Authenticate
# void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
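# The routine reads its state from fixed offsets in Gctx: the running GHASH
# value T at offset 272, the counter block at offset 288, and a pointer to the
# AES key schedule at offset 304; the round count NR comes from offset 244 of
# that key schedule (AESContext->Nr).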
.type intel_aes_gcmENC,@function
.globl intel_aes_gcmENC
.align 16
intel_aes_gcmENC:
.set PT,%rdi
.set CT,%rsi
.set Htbl, %rdx
.set len, %rcx
.set KS,%r9
.set NR,%r10d
.set Gctx, %rdx
.set T,%xmm0
.set TMP0,%xmm1
.set TMP1,%xmm2
.set TMP2,%xmm3
.set TMP3,%xmm4
.set TMP4,%xmm5
.set TMP5,%xmm6
.set CTR0,%xmm7
.set CTR1,%xmm8
.set CTR2,%xmm9
.set CTR3,%xmm10
.set CTR4,%xmm11
.set CTR5,%xmm12
.set CTR6,%xmm13
.set CTR7,%xmm14
.set CTR,%xmm15
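# ROUND i     - apply round key i to all eight counter blocks.
# ROUNDMUL i  - the same AES round, interleaved with one Karatsuba GHASH
#               multiply of the saved ciphertext block at i*16(%rsp) by the
#               Htbl entry at i*16 (high products accumulate in TMP1, low
#               products in TMP2, middle terms in TMP0).
# KARATSUBA i - the GHASH multiply alone, used when there is no more data to
#               encrypt.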
.macro ROUND i
vmovdqu \i*16(KS), TMP3
vaesenc TMP3, CTR0, CTR0
vaesenc TMP3, CTR1, CTR1
vaesenc TMP3, CTR2, CTR2
vaesenc TMP3, CTR3, CTR3
vaesenc TMP3, CTR4, CTR4
vaesenc TMP3, CTR5, CTR5
vaesenc TMP3, CTR6, CTR6
vaesenc TMP3, CTR7, CTR7
.endm
.macro ROUNDMUL i
vmovdqu \i*16(%rsp), TMP5
vmovdqu \i*16(KS), TMP3
vaesenc TMP3, CTR0, CTR0
vaesenc TMP3, CTR1, CTR1
vaesenc TMP3, CTR2, CTR2
vaesenc TMP3, CTR3, CTR3
vpshufd $78, TMP5, TMP4
vpxor TMP5, TMP4, TMP4
vaesenc TMP3, CTR4, CTR4
vaesenc TMP3, CTR5, CTR5
vaesenc TMP3, CTR6, CTR6
vaesenc TMP3, CTR7, CTR7
vpclmulqdq $0x00, 128+\i*16(Htbl), TMP4, TMP3
vpxor TMP3, TMP0, TMP0
vmovdqa \i*16(Htbl), TMP4
vpclmulqdq $0x11, TMP4, TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, TMP4, TMP5, TMP3
vpxor TMP3, TMP2, TMP2
.endm
.macro KARATSUBA i
vmovdqu \i*16(%rsp), TMP5
vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
vpxor TMP3, TMP2, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
vpxor TMP3, TMP0, TMP0
.endm
test len, len
jnz .Lbegin
ret
.Lbegin:
vzeroupper
push %rbp
push %rbx
movq %rsp, %rbp
sub $128, %rsp
andq $-16, %rsp
vmovdqu 288(Gctx), CTR
vmovdqu 272(Gctx), T
mov 304(Gctx), KS
# AESContext->Nr
mov 244(KS), NR
vpshufb .Lbswap_mask(%rip), CTR, CTR
vpshufb .Lbswap_mask(%rip), T, T
cmp $128, len
jb .LDataSingles
# Encrypt the first eight blocks
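# The first eight blocks are encrypted up front, without hashing, so that the
# main loop (.LDataOctets) can hash the previous iteration's ciphertext while
# producing the next eight blocks of ciphertext.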
sub $128, len
vmovdqa CTR, CTR0
vpaddd .Lone(%rip), CTR0, CTR1
vpaddd .Ltwo(%rip), CTR0, CTR2
vpaddd .Lone(%rip), CTR2, CTR3
vpaddd .Ltwo(%rip), CTR2, CTR4
vpaddd .Lone(%rip), CTR4, CTR5
vpaddd .Ltwo(%rip), CTR4, CTR6
vpaddd .Lone(%rip), CTR6, CTR7
vpaddd .Ltwo(%rip), CTR6, CTR
vpshufb .Lbswap_mask(%rip), CTR0, CTR0
vpshufb .Lbswap_mask(%rip), CTR1, CTR1
vpshufb .Lbswap_mask(%rip), CTR2, CTR2
vpshufb .Lbswap_mask(%rip), CTR3, CTR3
vpshufb .Lbswap_mask(%rip), CTR4, CTR4
vpshufb .Lbswap_mask(%rip), CTR5, CTR5
vpshufb .Lbswap_mask(%rip), CTR6, CTR6
vpshufb .Lbswap_mask(%rip), CTR7, CTR7
vpxor (KS), CTR0, CTR0
vpxor (KS), CTR1, CTR1
vpxor (KS), CTR2, CTR2
vpxor (KS), CTR3, CTR3
vpxor (KS), CTR4, CTR4
vpxor (KS), CTR5, CTR5
vpxor (KS), CTR6, CTR6
vpxor (KS), CTR7, CTR7
ROUND 1
ROUND 2
ROUND 3
ROUND 4
ROUND 5
ROUND 6
ROUND 7
ROUND 8
ROUND 9
vmovdqu 160(KS), TMP5
cmp $12, NR
jb .LLast1
ROUND 10
ROUND 11
vmovdqu 192(KS), TMP5
cmp $14, NR
jb .LLast1
ROUND 12
ROUND 13
vmovdqu 224(KS), TMP5
.LLast1:
vpxor (PT), TMP5, TMP3
vaesenclast TMP3, CTR0, CTR0
vpxor 16(PT), TMP5, TMP3
vaesenclast TMP3, CTR1, CTR1
vpxor 32(PT), TMP5, TMP3
vaesenclast TMP3, CTR2, CTR2
vpxor 48(PT), TMP5, TMP3
vaesenclast TMP3, CTR3, CTR3
vpxor 64(PT), TMP5, TMP3
vaesenclast TMP3, CTR4, CTR4
vpxor 80(PT), TMP5, TMP3
vaesenclast TMP3, CTR5, CTR5
vpxor 96(PT), TMP5, TMP3
vaesenclast TMP3, CTR6, CTR6
vpxor 112(PT), TMP5, TMP3
vaesenclast TMP3, CTR7, CTR7
vmovdqu .Lbswap_mask(%rip), TMP3
vmovdqu CTR0, (CT)
vpshufb TMP3, CTR0, CTR0
vmovdqu CTR1, 16(CT)
vpshufb TMP3, CTR1, CTR1
vmovdqu CTR2, 32(CT)
vpshufb TMP3, CTR2, CTR2
vmovdqu CTR3, 48(CT)
vpshufb TMP3, CTR3, CTR3
vmovdqu CTR4, 64(CT)
vpshufb TMP3, CTR4, CTR4
vmovdqu CTR5, 80(CT)
vpshufb TMP3, CTR5, CTR5
vmovdqu CTR6, 96(CT)
vpshufb TMP3, CTR6, CTR6
vmovdqu CTR7, 112(CT)
vpshufb TMP3, CTR7, CTR7
lea 128(CT), CT
lea 128(PT), PT
jmp .LDataOctets
# Encrypt 8 blocks per iteration while hashing the previous 8 blocks
.align 64
.LDataOctets:
cmp $128, len
jb .LEndOctets
sub $128, len
vmovdqa CTR7, TMP5
vmovdqa CTR6, 1*16(%rsp)
vmovdqa CTR5, 2*16(%rsp)
vmovdqa CTR4, 3*16(%rsp)
vmovdqa CTR3, 4*16(%rsp)
vmovdqa CTR2, 5*16(%rsp)
vmovdqa CTR1, 6*16(%rsp)
vmovdqa CTR0, 7*16(%rsp)
vmovdqa CTR, CTR0
vpaddd .Lone(%rip), CTR0, CTR1
vpaddd .Ltwo(%rip), CTR0, CTR2
vpaddd .Lone(%rip), CTR2, CTR3
vpaddd .Ltwo(%rip), CTR2, CTR4
vpaddd .Lone(%rip), CTR4, CTR5
vpaddd .Ltwo(%rip), CTR4, CTR6
vpaddd .Lone(%rip), CTR6, CTR7
vpaddd .Ltwo(%rip), CTR6, CTR
vmovdqu (KS), TMP4
vpshufb TMP3, CTR0, CTR0
vpxor TMP4, CTR0, CTR0
vpshufb TMP3, CTR1, CTR1
vpxor TMP4, CTR1, CTR1
vpshufb TMP3, CTR2, CTR2
vpxor TMP4, CTR2, CTR2
vpshufb TMP3, CTR3, CTR3
vpxor TMP4, CTR3, CTR3
vpshufb TMP3, CTR4, CTR4
vpxor TMP4, CTR4, CTR4
vpshufb TMP3, CTR5, CTR5
vpxor TMP4, CTR5, CTR5
vpshufb TMP3, CTR6, CTR6
vpxor TMP4, CTR6, CTR6
vpshufb TMP3, CTR7, CTR7
vpxor TMP4, CTR7, CTR7
vmovdqu 16*0(Htbl), TMP3
vpclmulqdq $0x11, TMP3, TMP5, TMP1
vpclmulqdq $0x00, TMP3, TMP5, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vmovdqu 128+0*16(Htbl), TMP3
vpclmulqdq $0x00, TMP3, TMP5, TMP0
ROUNDMUL 1
ROUNDMUL 2
ROUNDMUL 3
ROUNDMUL 4
ROUNDMUL 5
ROUNDMUL 6
vpxor 7*16(%rsp), T, TMP5
vmovdqu 7*16(KS), TMP3
vaesenc TMP3, CTR0, CTR0
vaesenc TMP3, CTR1, CTR1
vaesenc TMP3, CTR2, CTR2
vaesenc TMP3, CTR3, CTR3
vpshufd $78, TMP5, TMP4
vpxor TMP5, TMP4, TMP4
vaesenc TMP3, CTR4, CTR4
vaesenc TMP3, CTR5, CTR5
vaesenc TMP3, CTR6, CTR6
vaesenc TMP3, CTR7, CTR7
vpclmulqdq $0x11, 7*16(Htbl), TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, 7*16(Htbl), TMP5, TMP3
vpxor TMP3, TMP2, TMP2
vpclmulqdq $0x00, 128+7*16(Htbl), TMP4, TMP3
vpxor TMP3, TMP0, TMP0
ROUND 8
vmovdqa .Lpoly(%rip), TMP5
vpxor TMP1, TMP0, TMP0
vpxor TMP2, TMP0, TMP0
vpsrldq $8, TMP0, TMP3
vpxor TMP3, TMP1, TMP4
vpslldq $8, TMP0, TMP3
vpxor TMP3, TMP2, T
vpclmulqdq $0x10, TMP5, T, TMP1
vpalignr $8, T, T, T
vpxor T, TMP1, T
ROUND 9
vpclmulqdq $0x10, TMP5, T, TMP1
vpalignr $8, T, T, T
vpxor T, TMP1, T
vmovdqu 160(KS), TMP5
cmp $10, NR
jbe .LLast2
ROUND 10
ROUND 11
vmovdqu 192(KS), TMP5
cmp $12, NR
jbe .LLast2
ROUND 12
ROUND 13
vmovdqu 224(KS), TMP5
.LLast2:
vpxor (PT), TMP5, TMP3
vaesenclast TMP3, CTR0, CTR0
vpxor 16(PT), TMP5, TMP3
vaesenclast TMP3, CTR1, CTR1
vpxor 32(PT), TMP5, TMP3
vaesenclast TMP3, CTR2, CTR2
vpxor 48(PT), TMP5, TMP3
vaesenclast TMP3, CTR3, CTR3
vpxor 64(PT), TMP5, TMP3
vaesenclast TMP3, CTR4, CTR4
vpxor 80(PT), TMP5, TMP3
vaesenclast TMP3, CTR5, CTR5
vpxor 96(PT), TMP5, TMP3
vaesenclast TMP3, CTR6, CTR6
vpxor 112(PT), TMP5, TMP3
vaesenclast TMP3, CTR7, CTR7
vmovdqu .Lbswap_mask(%rip), TMP3
vmovdqu CTR0, (CT)
vpshufb TMP3, CTR0, CTR0
vmovdqu CTR1, 16(CT)
vpshufb TMP3, CTR1, CTR1
vmovdqu CTR2, 32(CT)
vpshufb TMP3, CTR2, CTR2
vmovdqu CTR3, 48(CT)
vpshufb TMP3, CTR3, CTR3
vmovdqu CTR4, 64(CT)
vpshufb TMP3, CTR4, CTR4
vmovdqu CTR5, 80(CT)
vpshufb TMP3, CTR5, CTR5
vmovdqu CTR6, 96(CT)
vpshufb TMP3, CTR6, CTR6
vmovdqu CTR7,112(CT)
vpshufb TMP3, CTR7, CTR7
vpxor TMP4, T, T
lea 128(CT), CT
lea 128(PT), PT
jmp .LDataOctets
.LEndOctets:
vmovdqa CTR7, TMP5
vmovdqa CTR6, 1*16(%rsp)
vmovdqa CTR5, 2*16(%rsp)
vmovdqa CTR4, 3*16(%rsp)
vmovdqa CTR3, 4*16(%rsp)
vmovdqa CTR2, 5*16(%rsp)
vmovdqa CTR1, 6*16(%rsp)
vmovdqa CTR0, 7*16(%rsp)
vmovdqu 16*0(Htbl), TMP3
vpclmulqdq $0x11, TMP3, TMP5, TMP1
vpclmulqdq $0x00, TMP3, TMP5, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vmovdqu 128+0*16(Htbl), TMP3
vpclmulqdq $0x00, TMP3, TMP5, TMP0
KARATSUBA 1
KARATSUBA 2
KARATSUBA 3
KARATSUBA 4
KARATSUBA 5
KARATSUBA 6
vmovdqu 7*16(%rsp), TMP5
vpxor T, TMP5, TMP5
vmovdqu 16*7(Htbl), TMP4
vpclmulqdq $0x11, TMP4, TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, TMP4, TMP5, TMP3
vpxor TMP3, TMP2, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vmovdqu 128+7*16(Htbl), TMP4
vpclmulqdq $0x00, TMP4, TMP5, TMP3
vpxor TMP3, TMP0, TMP0
vpxor TMP1, TMP0, TMP0
vpxor TMP2, TMP0, TMP0
vpsrldq $8, TMP0, TMP3
vpxor TMP3, TMP1, TMP4
vpslldq $8, TMP0, TMP3
vpxor TMP3, TMP2, T
vmovdqa .Lpoly(%rip), TMP2
vpalignr $8, T, T, TMP1
vpclmulqdq $0x10, TMP2, T, T
vpxor T, TMP1, T
vpalignr $8, T, T, TMP1
vpclmulqdq $0x10, TMP2, T, T
vpxor T, TMP1, T
vpxor TMP4, T, T
#Here we encrypt any remaining whole blocks, one at a time
.LDataSingles:
cmp $16, len
jb .LDataTail
sub $16, len
vpshufb .Lbswap_mask(%rip), CTR, TMP1
vpaddd .Lone(%rip), CTR, CTR
vpxor (KS), TMP1, TMP1
vaesenc 16*1(KS), TMP1, TMP1
vaesenc 16*2(KS), TMP1, TMP1
vaesenc 16*3(KS), TMP1, TMP1
vaesenc 16*4(KS), TMP1, TMP1
vaesenc 16*5(KS), TMP1, TMP1
vaesenc 16*6(KS), TMP1, TMP1
vaesenc 16*7(KS), TMP1, TMP1
vaesenc 16*8(KS), TMP1, TMP1
vaesenc 16*9(KS), TMP1, TMP1
vmovdqu 16*10(KS), TMP2
cmp $10, NR
je .LLast3
vaesenc 16*10(KS), TMP1, TMP1
vaesenc 16*11(KS), TMP1, TMP1
vmovdqu 16*12(KS), TMP2
cmp $12, NR
je .LLast3
vaesenc 16*12(KS), TMP1, TMP1
vaesenc 16*13(KS), TMP1, TMP1
vmovdqu 16*14(KS), TMP2
.LLast3:
vaesenclast TMP2, TMP1, TMP1
vpxor (PT), TMP1, TMP1
vmovdqu TMP1, (CT)
addq $16, CT
addq $16, PT
vpshufb .Lbswap_mask(%rip), TMP1, TMP1
vpxor TMP1, T, T
vmovdqu (Htbl), TMP0
call GFMUL
jmp .LDataSingles
#Here we encrypt the final partial block, if there is one
.LDataTail:
test len, len
jz DATA_END
# First prepare the counter block
vpshufb .Lbswap_mask(%rip), CTR, TMP1
vpaddd .Lone(%rip), CTR, CTR
vpxor (KS), TMP1, TMP1
vaesenc 16*1(KS), TMP1, TMP1
vaesenc 16*2(KS), TMP1, TMP1
vaesenc 16*3(KS), TMP1, TMP1
vaesenc 16*4(KS), TMP1, TMP1
vaesenc 16*5(KS), TMP1, TMP1
vaesenc 16*6(KS), TMP1, TMP1
vaesenc 16*7(KS), TMP1, TMP1
vaesenc 16*8(KS), TMP1, TMP1
vaesenc 16*9(KS), TMP1, TMP1
vmovdqu 16*10(KS), TMP2
cmp $10, NR
je .LLast4
vaesenc 16*10(KS), TMP1, TMP1
vaesenc 16*11(KS), TMP1, TMP1
vmovdqu 16*12(KS), TMP2
cmp $12, NR
je .LLast4
vaesenc 16*12(KS), TMP1, TMP1
vaesenc 16*13(KS), TMP1, TMP1
vmovdqu 16*14(KS), TMP2
.LLast4:
vaesenclast TMP2, TMP1, TMP1
#Zero a temp location
vpxor TMP2, TMP2, TMP2
vmovdqa TMP2, (%rsp)
# Copy the required bytes only (could probably use rep movsb)
xor KS, KS
.LEncCpy:
cmp KS, len
je .LEncCpyEnd
movb (PT, KS, 1), %r8b
movb %r8b, (%rsp, KS, 1)
inc KS
jmp .LEncCpy
.LEncCpyEnd:
# Xor with the counter block
vpxor (%rsp), TMP1, TMP0
# Again, store at temp location
vmovdqa TMP0, (%rsp)
# Copy only the required bytes to CT, and zero the rest for the hash
xor KS, KS
.LEncCpy2:
cmp KS, len
je .LEncCpy3
movb (%rsp, KS, 1), %r8b
movb %r8b, (CT, KS, 1)
inc KS
jmp .LEncCpy2
.LEncCpy3:
cmp $16, KS
je .LEndCpy3
movb $0, (%rsp, KS, 1)
inc KS
jmp .LEncCpy3
.LEndCpy3:
vmovdqa (%rsp), TMP0
vpshufb .Lbswap_mask(%rip), TMP0, TMP0
vpxor TMP0, T, T
vmovdqu (Htbl), TMP0
call GFMUL
DATA_END:
vpshufb .Lbswap_mask(%rip), T, T
vpshufb .Lbswap_mask(%rip), CTR, CTR
vmovdqu T, 272(Gctx)
vmovdqu CTR, 288(Gctx)
movq %rbp, %rsp
popq %rbx
popq %rbp
ret
.size intel_aes_gcmENC, .-intel_aes_gcmENC
#########################
# Decrypt and Authenticate
# void intel_aes_gcmDEC(uint8_t* CT, uint8_t* PT, void *Gctx, uint64_t len);
.type intel_aes_gcmDEC,@function
.globl intel_aes_gcmDEC
.align 16
intel_aes_gcmDEC:
# parameter 1: CT # input
# parameter 2: PT # output
# parameter 3: %rdx # Gctx
# parameter 4: %rcx # len
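# Decryption hashes the ciphertext directly, so each 8-block group can be
# hashed and decrypted in the same pass (see DEC_KARATSUBA below), unlike
# encryption, which must wait one iteration for its ciphertext.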
.macro DEC_KARATSUBA i
vmovdqu (7-\i)*16(CT), TMP5
vpshufb .Lbswap_mask(%rip), TMP5, TMP5
vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
vpxor TMP3, TMP2, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
vpxor TMP3, TMP0, TMP0
.endm
.set PT,%rsi
.set CT,%rdi
.set Htbl, %rdx
.set len, %rcx
.set KS,%r9
.set NR,%r10d
.set Gctx, %rdx
.set T,%xmm0
.set TMP0,%xmm1
.set TMP1,%xmm2
.set TMP2,%xmm3
.set TMP3,%xmm4
.set TMP4,%xmm5
.set TMP5,%xmm6
.set CTR0,%xmm7
.set CTR1,%xmm8
.set CTR2,%xmm9
.set CTR3,%xmm10
.set CTR4,%xmm11
.set CTR5,%xmm12
.set CTR6,%xmm13
.set CTR7,%xmm14
.set CTR,%xmm15
test len, len
jnz .LbeginDec
ret
.LbeginDec:
pushq %rbp
pushq %rbx
movq %rsp, %rbp
sub $128, %rsp
andq $-16, %rsp
vmovdqu 288(Gctx), CTR
vmovdqu 272(Gctx), T
mov 304(Gctx), KS
# AESContext->Nr
mov 244(KS), NR
vpshufb .Lbswap_mask(%rip), CTR, CTR
vpshufb .Lbswap_mask(%rip), T, T
vmovdqu .Lbswap_mask(%rip), TMP3
jmp .LDECOctets
# Decrypt 8 blocks each time while hashing them at the same time
.align 64
.LDECOctets:
cmp $128, len
jb .LDECSingles
sub $128, len
vmovdqa CTR, CTR0
vpaddd .Lone(%rip), CTR0, CTR1
vpaddd .Ltwo(%rip), CTR0, CTR2
vpaddd .Lone(%rip), CTR2, CTR3
vpaddd .Ltwo(%rip), CTR2, CTR4
vpaddd .Lone(%rip), CTR4, CTR5
vpaddd .Ltwo(%rip), CTR4, CTR6
vpaddd .Lone(%rip), CTR6, CTR7
vpaddd .Ltwo(%rip), CTR6, CTR
vpshufb TMP3, CTR0, CTR0
vpshufb TMP3, CTR1, CTR1
vpshufb TMP3, CTR2, CTR2
vpshufb TMP3, CTR3, CTR3
vpshufb TMP3, CTR4, CTR4
vpshufb TMP3, CTR5, CTR5
vpshufb TMP3, CTR6, CTR6
vpshufb TMP3, CTR7, CTR7
vmovdqu (KS), TMP3
vpxor TMP3, CTR0, CTR0
vpxor TMP3, CTR1, CTR1
vpxor TMP3, CTR2, CTR2
vpxor TMP3, CTR3, CTR3
vpxor TMP3, CTR4, CTR4
vpxor TMP3, CTR5, CTR5
vpxor TMP3, CTR6, CTR6
vpxor TMP3, CTR7, CTR7
vmovdqu 7*16(CT), TMP5
vpshufb .Lbswap_mask(%rip), TMP5, TMP5
vmovdqu 16*0(Htbl), TMP3
vpclmulqdq $0x11, TMP3, TMP5, TMP1
vpclmulqdq $0x00, TMP3, TMP5, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vmovdqu 128+0*16(Htbl), TMP3
vpclmulqdq $0x00, TMP3, TMP5, TMP0
ROUND 1
DEC_KARATSUBA 1
ROUND 2
DEC_KARATSUBA 2
ROUND 3
DEC_KARATSUBA 3
ROUND 4
DEC_KARATSUBA 4
ROUND 5
DEC_KARATSUBA 5
ROUND 6
DEC_KARATSUBA 6
ROUND 7
vmovdqu 0*16(CT), TMP5
vpshufb .Lbswap_mask(%rip), TMP5, TMP5
vpxor T, TMP5, TMP5
vmovdqu 16*7(Htbl), TMP4
vpclmulqdq $0x11, TMP4, TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, TMP4, TMP5, TMP3
vpxor TMP3, TMP2, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vmovdqu 128+7*16(Htbl), TMP4
vpclmulqdq $0x00, TMP4, TMP5, TMP3
vpxor TMP3, TMP0, TMP0
ROUND 8
vpxor TMP1, TMP0, TMP0
vpxor TMP2, TMP0, TMP0
vpsrldq $8, TMP0, TMP3
vpxor TMP3, TMP1, TMP4
vpslldq $8, TMP0, TMP3
vpxor TMP3, TMP2, T
vmovdqa .Lpoly(%rip), TMP2
vpalignr $8, T, T, TMP1
vpclmulqdq $0x10, TMP2, T, T
vpxor T, TMP1, T
ROUND 9
vpalignr $8, T, T, TMP1
vpclmulqdq $0x10, TMP2, T, T
vpxor T, TMP1, T
vmovdqu 160(KS), TMP5
cmp $10, NR
jbe .LDECLast1
ROUND 10
ROUND 11
vmovdqu 192(KS), TMP5
cmp $12, NR
jbe .LDECLast1
ROUND 12
ROUND 13
vmovdqu 224(KS), TMP5
.LDECLast1:
vpxor (CT), TMP5, TMP3
vaesenclast TMP3, CTR0, CTR0
vpxor 16(CT), TMP5, TMP3
vaesenclast TMP3, CTR1, CTR1
vpxor 32(CT), TMP5, TMP3
vaesenclast TMP3, CTR2, CTR2
vpxor 48(CT), TMP5, TMP3
vaesenclast TMP3, CTR3, CTR3
vpxor 64(CT), TMP5, TMP3
vaesenclast TMP3, CTR4, CTR4
vpxor 80(CT), TMP5, TMP3
vaesenclast TMP3, CTR5, CTR5
vpxor 96(CT), TMP5, TMP3
vaesenclast TMP3, CTR6, CTR6
vpxor 112(CT), TMP5, TMP3
vaesenclast TMP3, CTR7, CTR7
vmovdqu .Lbswap_mask(%rip), TMP3
vmovdqu CTR0, (PT)
vmovdqu CTR1, 16(PT)
vmovdqu CTR2, 32(PT)
vmovdqu CTR3, 48(PT)
vmovdqu CTR4, 64(PT)
vmovdqu CTR5, 80(PT)
vmovdqu CTR6, 96(PT)
vmovdqu CTR7,112(PT)
vpxor TMP4, T, T
lea 128(CT), CT
lea 128(PT), PT
jmp .LDECOctets
#Here we decrypt and hash any remaining whole blocks, one at a time
.LDECSingles:
cmp $16, len
jb .LDECTail
sub $16, len
vmovdqu (CT), TMP1
vpshufb .Lbswap_mask(%rip), TMP1, TMP1
vpxor TMP1, T, T
vmovdqu (Htbl), TMP0
call GFMUL
vpshufb .Lbswap_mask(%rip), CTR, TMP1
vpaddd .Lone(%rip), CTR, CTR
vpxor (KS), TMP1, TMP1
vaesenc 16*1(KS), TMP1, TMP1
vaesenc 16*2(KS), TMP1, TMP1
vaesenc 16*3(KS), TMP1, TMP1
vaesenc 16*4(KS), TMP1, TMP1
vaesenc 16*5(KS), TMP1, TMP1
vaesenc 16*6(KS), TMP1, TMP1
vaesenc 16*7(KS), TMP1, TMP1
vaesenc 16*8(KS), TMP1, TMP1
vaesenc 16*9(KS), TMP1, TMP1
vmovdqu 16*10(KS), TMP2
cmp $10, NR
je .LDECLast2
vaesenc 16*10(KS), TMP1, TMP1
vaesenc 16*11(KS), TMP1, TMP1
vmovdqu 16*12(KS), TMP2
cmp $12, NR
je .LDECLast2
vaesenc 16*12(KS), TMP1, TMP1
vaesenc 16*13(KS), TMP1, TMP1
vmovdqu 16*14(KS), TMP2
.LDECLast2:
vaesenclast TMP2, TMP1, TMP1
vpxor (CT), TMP1, TMP1
vmovdqu TMP1, (PT)
addq $16, CT
addq $16, PT
jmp .LDECSingles
#Here we decrypt the final partial block, if there is one
.LDECTail:
test len, len
jz .LDEC_END
vpshufb .Lbswap_mask(%rip), CTR, TMP1
vpaddd .Lone(%rip), CTR, CTR
vpxor (KS), TMP1, TMP1
vaesenc 16*1(KS), TMP1, TMP1
vaesenc 16*2(KS), TMP1, TMP1
vaesenc 16*3(KS), TMP1, TMP1
vaesenc 16*4(KS), TMP1, TMP1
vaesenc 16*5(KS), TMP1, TMP1
vaesenc 16*6(KS), TMP1, TMP1
vaesenc 16*7(KS), TMP1, TMP1
vaesenc 16*8(KS), TMP1, TMP1
vaesenc 16*9(KS), TMP1, TMP1
vmovdqu 16*10(KS), TMP2
cmp $10, NR
je .LDECLast3
vaesenc 16*10(KS), TMP1, TMP1
vaesenc 16*11(KS), TMP1, TMP1
vmovdqu 16*12(KS), TMP2
cmp $12, NR
je .LDECLast3
vaesenc 16*12(KS), TMP1, TMP1
vaesenc 16*13(KS), TMP1, TMP1
vmovdqu 16*14(KS), TMP2
.LDECLast3:
vaesenclast TMP2, TMP1, TMP1
vpxor TMP2, TMP2, TMP2
vmovdqa TMP2, (%rsp)
# Copy the required bytes only (could probably use rep movsb)
xor KS, KS
.LDecCpy:
cmp KS, len
je .LDecCpy2
movb (CT, KS, 1), %r8b
movb %r8b, (%rsp, KS, 1)
inc KS
jmp .LDecCpy
.LDecCpy2:
cmp $16, KS
je .LDecCpyEnd
movb $0, (%rsp, KS, 1)
inc KS
jmp .LDecCpy2
.LDecCpyEnd:
# Xor with the counter block
vmovdqa (%rsp), TMP0
vpxor TMP0, TMP1, TMP1
# Again, store at temp location
vmovdqa TMP1, (%rsp)
# Copy only the required bytes to PT, and zero the rest for the hash
xor KS, KS
.LDecCpy3:
cmp KS, len
je .LDecCpyEnd3
movb (%rsp, KS, 1), %r8b
movb %r8b, (PT, KS, 1)
inc KS
jmp .LDecCpy3
.LDecCpyEnd3:
vpshufb .Lbswap_mask(%rip), TMP0, TMP0
vpxor TMP0, T, T
vmovdqu (Htbl), TMP0
call GFMUL
.LDEC_END:
vpshufb .Lbswap_mask(%rip), T, T
vpshufb .Lbswap_mask(%rip), CTR, CTR
vmovdqu T, 272(Gctx)
vmovdqu CTR, 288(Gctx)
movq %rbp, %rsp
popq %rbx
popq %rbp
ret
.size intel_aes_gcmDEC, .-intel_aes_gcmDEC
#########################
# a = T
# b = TMP0 (remains unchanged)
# res = T
# also uses TMP1, TMP2, TMP3 and TMP4
# __m128i GFMUL(__m128i A, __m128i B);
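# GFMUL multiplies T by TMP0 in GF(2^128): a Karatsuba carry-less multiply
# (three VPCLMULQDQs) followed by a two-fold reduction with the 0xc2...
# constant from .Lpoly.  For reference, a rough equivalent with C intrinsics
# (an illustrative sketch, not part of the build) would be:
#   __m128i lo  = _mm_clmulepi64_si128(a, b, 0x00);
#   __m128i hi  = _mm_clmulepi64_si128(a, b, 0x11);
#   __m128i mid = _mm_clmulepi64_si128(
#                     _mm_xor_si128(a, _mm_shuffle_epi32(a, 78)),
#                     _mm_xor_si128(b, _mm_shuffle_epi32(b, 78)), 0x00);
#   mid = _mm_xor_si128(mid, _mm_xor_si128(lo, hi));
#   lo  = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
#   hi  = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
#   // ...followed by two folds of lo with the .Lpoly constant, then XOR with hi.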
.type GFMUL,@function
.globl GFMUL
GFMUL:
vpclmulqdq $0x00, TMP0, T, TMP1
vpclmulqdq $0x11, TMP0, T, TMP4
vpshufd $78, T, TMP2
vpshufd $78, TMP0, TMP3
vpxor T, TMP2, TMP2
vpxor TMP0, TMP3, TMP3
vpclmulqdq $0x00, TMP3, TMP2, TMP2
vpxor TMP1, TMP2, TMP2
vpxor TMP4, TMP2, TMP2
vpslldq $8, TMP2, TMP3
vpsrldq $8, TMP2, TMP2
vpxor TMP3, TMP1, TMP1
vpxor TMP2, TMP4, TMP4
vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
vpshufd $78, TMP1, TMP3
vpxor TMP3, TMP2, TMP1
vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
vpshufd $78, TMP1, TMP3
vpxor TMP3, TMP2, TMP1
vpxor TMP4, TMP1, T
ret
.size GFMUL, .-GFMUL