// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.

//go:build gc && !purego

#include "textflag.h"

// func polyHashADInternal<>()
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
// Hack: Must declare #define macros inside of a function due to Avo constraints
// ROL rotates the uint32s in register R left by N bits, using temporary T.
#define ROL(N, R, T) \
MOVO R, T; \
PSLLL $(N), T; \
PSRLL $(32-(N)), R; \
PXOR T, R

// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
#ifdef GOAMD64_v2
#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
#else
#define ROL8(R, T) ROL(8, R, T)
#endif

// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
#ifdef GOAMD64_v2
#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
#else
#define ROL16(R, T) ROL(16, R, T)
#endif
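
// The ROL macro implements a rotate-left on each 32-bit lane of an XMM
// register as (x << n) | (x >> (32-n)); on GOAMD64_v2, ROL8/ROL16 use a
// single PSHUFB byte shuffle instead, since byte-granular rotates are pure
// byte permutations. A minimal Go sketch of the per-lane operation
// (illustrative only; not part of this generated file):
//
//	import "math/bits"
//
//	func rol(x uint32, n int) uint32 {
//		return bits.RotateLeft32(x, n) // == x<<n | x>>(32-n)
//	}
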
XORQ R10, R10
XORQ R11, R11
XORQ R12, R12
CMPQ R9, $0x0d
JNE hashADLoop
MOVQ (CX), R10
MOVQ 5(CX), R11
SHRQ $0x18, R11
MOVQ $0x00000001, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
RET
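
// The MULQ/IMULQ ladder above computes h = (h * r) mod 2^130 - 5, with the
// accumulator h in R10:R11:R12 and the clamped key half r at 0(BP):8(BP)
// (the CMPQ $0x0d fast path handles a 13-byte AAD, e.g. a TLS record header,
// as a single zero-padded block). A Go sketch of the same multiply-and-reduce
// (illustrative only; polyMul is a hypothetical helper, not part of this file):
//
//	import "math/bits"
//
//	func polyMul(h0, h1, h2, r0, r1 uint64) (uint64, uint64, uint64) {
//		// Schoolbook multiply of (h2:h1:h0) by (r1:r0); h2 and the
//		// clamped r are small, so h2*r0 and h2*r1 fit in one limb each.
//		hi00, lo00 := bits.Mul64(h0, r0)
//		hi01, lo01 := bits.Mul64(h0, r1)
//		hi10, lo10 := bits.Mul64(h1, r0)
//		hi11, lo11 := bits.Mul64(h1, r1)
//		t0 := lo00
//		t1, c := bits.Add64(hi00, lo01, 0)
//		t2, c2 := bits.Add64(hi01, lo11, c)
//		t3 := hi11 + c2
//		t1, c = bits.Add64(t1, lo10, 0)
//		t2, c2 = bits.Add64(t2, hi10, c)
//		t3 += c2
//		t2, c = bits.Add64(t2, h2*r0, 0)
//		t3 += h2*r1 + c
//		// Fold the bits at 2^130 and above back in using 2^130 = 5 (mod p):
//		// add 4x then x, where x = carry>>2 (the ANDQ $-4 / SHRQ $0x02 pair).
//		h0, c = bits.Add64(t0, t2&^3, 0)
//		h1, c = bits.Add64(t1, t3, c)
//		h2 = t2&3 + c
//		h0, c = bits.Add64(h0, t2>>2|t3<<62, 0)
//		h1, c = bits.Add64(h1, t3>>2, c)
//		h2 += c
//		return h0, h1, h2
//	}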

hashADLoop:
// Hash in 16 byte chunks
CMPQ R9, $0x10
JB hashADTail
ADDQ (CX), R10
ADCQ 8(CX), R11
ADCQ $0x01, R12
LEAQ 16(CX), CX
SUBQ $0x10, R9
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
JMP hashADLoop

hashADTail:
CMPQ R9, $0x00
JE hashADDone

// Hash last < 16 byte tail
XORQ R13, R13
XORQ R14, R14
XORQ R15, R15
ADDQ R9, CX

hashADTailLoop:
SHLQ $0x08, R13, R14
SHLQ $0x08, R13
MOVB -1(CX), R15
XORQ R15, R13
DECQ CX
DECQ R9
JNE hashADTailLoop
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12

hashADDone:
RET
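
// Equivalent Go for the whole routine (an illustrative sketch; hashAD and
// accumulate are hypothetical helpers): each 16-byte block is added to the
// accumulator together with a 2^128 pad bit, then multiplied by r; a short
// tail is zero-padded to 16 bytes first, per the RFC 8439 AEAD construction.
//
//	import "encoding/binary"
//
//	func hashAD(h *[3]uint64, r [2]uint64, ad []byte) {
//		for len(ad) >= 16 {
//			lo := binary.LittleEndian.Uint64(ad)
//			hi := binary.LittleEndian.Uint64(ad[8:])
//			accumulate(h, r, lo, hi, 1) // add block + 2^128 bit, then h *= r
//			ad = ad[16:]
//		}
//		if len(ad) > 0 {
//			var block [16]byte // zero padding
//			copy(block[:], ad)
//			accumulate(h, r, binary.LittleEndian.Uint64(block[:]),
//				binary.LittleEndian.Uint64(block[8:]), 1)
//		}
//	}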

// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
// Requires: AVX, AVX2, BMI2, CMOV, SSE2
TEXT ·chacha20Poly1305Open(SB), $288-97
// For aligned stack access: BP = (SP + 32) &^ 31 gives a 32-byte-aligned scratch base
MOVQ SP, BP
ADDQ $0x20, BP
ANDQ $-32, BP
MOVQ dst_base+0(FP), DI
MOVQ key_base+24(FP), R8
MOVQ src_base+48(FP), SI
MOVQ src_len+56(FP), BX
MOVQ ad_base+72(FP), CX

// Check for AVX2 support
CMPB ·useAVX2+0(SB), $0x01
JE chacha20Poly1305Open_AVX2

// Special optimization, for very short buffers
CMPQ BX, $0x80
JBE openSSE128

// For long buffers, prepare the poly key first
MOVOU ·chacha20Constants<>+0(SB), X0
MOVOU 16(R8), X3
MOVOU 32(R8), X6
MOVOU 48(R8), X9
MOVO X9, X13

// Store state on stack for future use
MOVO X3, 32(BP)
MOVO X6, 48(BP)
MOVO X9, 128(BP)
MOVQ $0x0000000a, R9

openSSEPreparePolyKey:
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
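
// The raw BYTE sequences in this file hand-encode PALIGNR (SSSE3). The run
// below decodes to
//	PALIGNR $0x04, X3, X3
//	PALIGNR $0x08, X6, X6
//	PALIGNR $0x0c, X9, X9
// rotating the b, c and d rows so the next pair of quarter-rounds operates
// on the diagonals; the matching $0x0c/$0x08/$0x04 run after it rotates the
// rows back. The same pattern repeats throughout the file.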
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
DECQ R9
|
|
JNE openSSEPreparePolyKey
|
|
|
|
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
|
|
PADDL ·chacha20Constants<>+0(SB), X0
|
|
PADDL 32(BP), X3
|
|
|
|
// Clamp and store the key
|
|
PAND ·polyClampMask<>+0(SB), X0
|
|
MOVO X0, (BP)
|
|
MOVO X3, 16(BP)
|
|
|
|
// Hash AAD
|
|
MOVQ ad_len+80(FP), R9
|
|
CALL polyHashADInternal<>(SB)
|
|
|
|
openSSEMainLoop:
|
|
CMPQ BX, $0x00000100
|
|
JB openSSEMainLoopDone
|
|
|
|
// Load state, increment counter blocks
|
|
MOVO ·chacha20Constants<>+0(SB), X0
|
|
MOVO 32(BP), X3
|
|
MOVO 48(BP), X6
|
|
MOVO 128(BP), X9
|
|
PADDL ·sseIncMask<>+0(SB), X9
|
|
MOVO X0, X1
|
|
MOVO X3, X4
|
|
MOVO X6, X7
|
|
MOVO X9, X10
|
|
PADDL ·sseIncMask<>+0(SB), X10
|
|
MOVO X1, X2
|
|
MOVO X4, X5
|
|
MOVO X7, X8
|
|
MOVO X10, X11
|
|
PADDL ·sseIncMask<>+0(SB), X11
|
|
MOVO X2, X12
|
|
MOVO X5, X13
|
|
MOVO X8, X14
|
|
MOVO X11, X15
|
|
PADDL ·sseIncMask<>+0(SB), X15
|
|
|
|
// Store counters
|
|
MOVO X9, 80(BP)
|
|
MOVO X10, 96(BP)
|
|
MOVO X11, 112(BP)
|
|
MOVO X15, 128(BP)
|
|
|
|
// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash
|
|
// 2 blocks, and for the remaining 4 only 1 block - for a total of 16
|
|
MOVQ $0x00000004, CX
|
|
MOVQ SI, R9
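
// openSSEInternalLoop below interleaves four-block-wide ChaCha20 quarter-
// rounds with Poly1305 updates on the input. Each PADDD/PXOR/ROL16/ROL8 and
// PSLLL+PSRLL group computes, per 32-bit lane, one step of the standard
// quarter-round (a reference sketch, not part of this file):
//
//	import "math/bits"
//
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b; d ^= a; d = bits.RotateLeft32(d, 16)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 12) // PSLLL $0x0c / PSRLL $0x14
//		a += b; d ^= a; d = bits.RotateLeft32(d, 8)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 7)  // PSLLL $0x07 / PSRLL $0x19
//		return a, b, c, d
//	}
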
openSSEInternalLoop:
|
|
MOVO X14, 64(BP)
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X3
|
|
PXOR X14, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X3
|
|
PXOR X14, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X4
|
|
PXOR X14, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X4
|
|
PXOR X14, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X5
|
|
PXOR X14, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X5
|
|
PXOR X14, X5
|
|
MOVO 64(BP), X14
|
|
MOVO X7, 64(BP)
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL16(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x0c, X7
|
|
PSRLL $0x14, X13
|
|
PXOR X7, X13
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL8(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x07, X7
|
|
PSRLL $0x19, X13
|
|
PXOR X7, X13
|
|
MOVO 64(BP), X7
|
|
ADDQ (R9), R10
|
|
ADCQ 8(R9), R11
|
|
ADCQ $0x01, R12
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x0c
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
LEAQ 16(R9), R9
|
|
MOVO X14, 64(BP)
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X3
|
|
PXOR X14, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X3
|
|
PXOR X14, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X4
|
|
PXOR X14, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X4
|
|
PXOR X14, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X5
|
|
PXOR X14, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X5
|
|
PXOR X14, X5
|
|
MOVO 64(BP), X14
|
|
MOVO X7, 64(BP)
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL16(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x0c, X7
|
|
PSRLL $0x14, X13
|
|
PXOR X7, X13
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL8(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x07, X7
|
|
PSRLL $0x19, X13
|
|
PXOR X7, X13
|
|
MOVO 64(BP), X7
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x04
|
|
DECQ CX
|
|
JGE openSSEInternalLoop
|
|
ADDQ (R9), R10
|
|
ADCQ 8(R9), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(R9), R9
|
|
CMPQ CX, $-6
|
|
JG openSSEInternalLoop
|
|
|
|
// Add in the state
|
|
PADDD ·chacha20Constants<>+0(SB), X0
|
|
PADDD ·chacha20Constants<>+0(SB), X1
|
|
PADDD ·chacha20Constants<>+0(SB), X2
|
|
PADDD ·chacha20Constants<>+0(SB), X12
|
|
PADDD 32(BP), X3
|
|
PADDD 32(BP), X4
|
|
PADDD 32(BP), X5
|
|
PADDD 32(BP), X13
|
|
PADDD 48(BP), X6
|
|
PADDD 48(BP), X7
|
|
PADDD 48(BP), X8
|
|
PADDD 48(BP), X14
|
|
PADDD 80(BP), X9
|
|
PADDD 96(BP), X10
|
|
PADDD 112(BP), X11
|
|
PADDD 128(BP), X15
|
|
|
|
// Load - xor - store
|
|
MOVO X15, 64(BP)
|
|
MOVOU (SI), X15
|
|
PXOR X15, X0
|
|
MOVOU X0, (DI)
|
|
MOVOU 16(SI), X15
|
|
PXOR X15, X3
|
|
MOVOU X3, 16(DI)
|
|
MOVOU 32(SI), X15
|
|
PXOR X15, X6
|
|
MOVOU X6, 32(DI)
|
|
MOVOU 48(SI), X15
|
|
PXOR X15, X9
|
|
MOVOU X9, 48(DI)
|
|
MOVOU 64(SI), X9
|
|
PXOR X9, X1
|
|
MOVOU X1, 64(DI)
|
|
MOVOU 80(SI), X9
|
|
PXOR X9, X4
|
|
MOVOU X4, 80(DI)
|
|
MOVOU 96(SI), X9
|
|
PXOR X9, X7
|
|
MOVOU X7, 96(DI)
|
|
MOVOU 112(SI), X9
|
|
PXOR X9, X10
|
|
MOVOU X10, 112(DI)
|
|
MOVOU 128(SI), X9
|
|
PXOR X9, X2
|
|
MOVOU X2, 128(DI)
|
|
MOVOU 144(SI), X9
|
|
PXOR X9, X5
|
|
MOVOU X5, 144(DI)
|
|
MOVOU 160(SI), X9
|
|
PXOR X9, X8
|
|
MOVOU X8, 160(DI)
|
|
MOVOU 176(SI), X9
|
|
PXOR X9, X11
|
|
MOVOU X11, 176(DI)
|
|
MOVOU 192(SI), X9
|
|
PXOR X9, X12
|
|
MOVOU X12, 192(DI)
|
|
MOVOU 208(SI), X9
|
|
PXOR X9, X13
|
|
MOVOU X13, 208(DI)
|
|
MOVOU 224(SI), X9
|
|
PXOR X9, X14
|
|
MOVOU X14, 224(DI)
|
|
MOVOU 240(SI), X9
|
|
PXOR 64(BP), X9
|
|
MOVOU X9, 240(DI)
|
|
LEAQ 256(SI), SI
|
|
LEAQ 256(DI), DI
|
|
SUBQ $0x00000100, BX
|
|
JMP openSSEMainLoop
|
|
|
|
openSSEMainLoopDone:
|
|
// Handle the various tail sizes efficiently
|
|
TESTQ BX, BX
|
|
JE openSSEFinalize
|
|
CMPQ BX, $0x40
|
|
JBE openSSETail64
|
|
CMPQ BX, $0x80
|
|
JBE openSSETail128
|
|
CMPQ BX, $0xc0
|
|
JBE openSSETail192
|
|
JMP openSSETail256
|
|
|
|
openSSEFinalize:
|
|
// Hash in the PT and AAD lengths (the final Poly1305 length block)
|
|
ADDQ ad_len+80(FP), R10
|
|
ADCQ src_len+56(FP), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
|
|
// Final reduce
|
|
MOVQ R10, R13
|
|
MOVQ R11, R14
|
|
MOVQ R12, R15
|
|
SUBQ $-5, R10
|
|
SBBQ $-1, R11
|
|
SBBQ $0x03, R12
|
|
CMOVQCS R13, R10
|
|
CMOVQCS R14, R11
|
|
CMOVQCS R15, R12
|
|
|
|
// Add in the "s" part of the key
|
|
ADDQ 16(BP), R10
|
|
ADCQ 24(BP), R11
|
|
|
|
// Finally, constant-time compare to the tag at the end of the message
|
|
XORQ AX, AX
|
|
MOVQ $0x00000001, DX
|
|
XORQ (SI), R10
|
|
XORQ 8(SI), R11
|
|
ORQ R11, R10
|
|
CMOVQEQ DX, AX
|
|
|
|
// Return true iff tags are equal
|
|
MOVB AX, ret+96(FP)
|
|
RET
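
// A Go sketch of the finalize-and-verify step above (illustrative only;
// finalizeAndVerify is a hypothetical helper): subtract p = 2^130 - 5 once,
// keep the reduced value only if the subtraction did not borrow (CMOVQCS),
// add the "s" half of the key, and compare against the tag in constant time.
//
//	import (
//		"crypto/subtle"
//		"encoding/binary"
//		"math/bits"
//	)
//
//	func finalizeAndVerify(h0, h1, h2, s0, s1 uint64, tag *[16]byte) bool {
//		t0, b := bits.Sub64(h0, 0xfffffffffffffffb, 0)
//		t1, b2 := bits.Sub64(h1, 0xffffffffffffffff, b)
//		_, b3 := bits.Sub64(h2, 3, b2)
//		mask := b3 - 1 // all ones when h >= p, zero when it borrowed
//		h0 = h0&^mask | t0&mask
//		h1 = h1&^mask | t1&mask
//		h0, c := bits.Add64(h0, s0, 0)
//		h1, _ = bits.Add64(h1, s1, c)
//		var computed [16]byte
//		binary.LittleEndian.PutUint64(computed[0:], h0)
//		binary.LittleEndian.PutUint64(computed[8:], h1)
//		return subtle.ConstantTimeCompare(computed[:], tag[:]) == 1
//	}
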
openSSE128:
|
|
MOVOU ·chacha20Constants<>+0(SB), X0
|
|
MOVOU 16(R8), X3
|
|
MOVOU 32(R8), X6
|
|
MOVOU 48(R8), X9
|
|
MOVO X0, X1
|
|
MOVO X3, X4
|
|
MOVO X6, X7
|
|
MOVO X9, X10
|
|
PADDL ·sseIncMask<>+0(SB), X10
|
|
MOVO X1, X2
|
|
MOVO X4, X5
|
|
MOVO X7, X8
|
|
MOVO X10, X11
|
|
PADDL ·sseIncMask<>+0(SB), X11
|
|
MOVO X3, X13
|
|
MOVO X6, X14
|
|
MOVO X10, X15
|
|
MOVQ $0x0000000a, R9
|
|
|
|
openSSE128InnerCipherLoop:
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X5
|
|
PXOR X12, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X5
|
|
PXOR X12, X5
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X5
|
|
PXOR X12, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X5
|
|
PXOR X12, X5
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
DECQ R9
|
|
JNE openSSE128InnerCipherLoop
|
|
|
|
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
|
|
PADDL ·chacha20Constants<>+0(SB), X0
|
|
PADDL ·chacha20Constants<>+0(SB), X1
|
|
PADDL ·chacha20Constants<>+0(SB), X2
|
|
PADDL X13, X3
|
|
PADDL X13, X4
|
|
PADDL X13, X5
|
|
PADDL X14, X7
|
|
PADDL X14, X8
|
|
PADDL X15, X10
|
|
PADDL ·sseIncMask<>+0(SB), X15
|
|
PADDL X15, X11
|
|
|
|
// Clamp and store the key
|
|
PAND ·polyClampMask<>+0(SB), X0
|
|
MOVOU X0, (BP)
|
|
MOVOU X3, 16(BP)
|
|
|
|
// Hash
|
|
MOVQ ad_len+80(FP), R9
|
|
CALL polyHashADInternal<>(SB)
|
|
|
|
openSSE128Open:
|
|
CMPQ BX, $0x10
|
|
JB openSSETail16
|
|
SUBQ $0x10, BX
|
|
|
|
// Load for hashing
|
|
ADDQ (SI), R10
|
|
ADCQ 8(SI), R11
|
|
ADCQ $0x01, R12
|
|
|
|
// Load for decryption
|
|
MOVOU (SI), X12
|
|
PXOR X12, X1
|
|
MOVOU X1, (DI)
|
|
LEAQ 16(SI), SI
|
|
LEAQ 16(DI), DI
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
|
|
// Shift the stream "left": rotate the remaining keystream blocks down so X1 always holds the next 16 bytes
|
|
MOVO X4, X1
|
|
MOVO X7, X4
|
|
MOVO X10, X7
|
|
MOVO X2, X10
|
|
MOVO X5, X2
|
|
MOVO X8, X5
|
|
MOVO X11, X8
|
|
JMP openSSE128Open
|
|
|
|
openSSETail16:
|
|
TESTQ BX, BX
|
|
JE openSSEFinalize
|
|
|
|
// We can safely load the CT from the end, because it is padded with the MAC
|
|
MOVQ BX, R9
|
|
SHLQ $0x04, R9
|
|
LEAQ ·andMask<>+0(SB), R13
|
|
MOVOU (SI), X12
|
|
ADDQ BX, SI
|
|
PAND -16(R13)(R9*1), X12
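// ·andMask is a table of 15 16-byte masks with 1..15 leading 0xff bytes;
// R9 = BX << 4 indexes it, so the PAND above keeps exactly the BX remaining
// ciphertext bytes and zeroes the tag bytes that the full 16-byte load
// pulled in past the end.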
MOVO X12, 64(BP)
|
|
MOVQ X12, R13
|
|
MOVQ 72(BP), R14
|
|
PXOR X1, X12
|
|
|
|
// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
|
|
openSSETail16Store:
|
|
MOVQ X12, R8
|
|
MOVB R8, (DI)
|
|
PSRLDQ $0x01, X12
|
|
INCQ DI
|
|
DECQ BX
|
|
JNE openSSETail16Store
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
JMP openSSEFinalize
|
|
|
|
openSSETail64:
|
|
MOVO ·chacha20Constants<>+0(SB), X0
|
|
MOVO 32(BP), X3
|
|
MOVO 48(BP), X6
|
|
MOVO 128(BP), X9
|
|
PADDL ·sseIncMask<>+0(SB), X9
|
|
MOVO X9, 80(BP)
|
|
XORQ R9, R9
|
|
MOVQ BX, CX
|
|
CMPQ CX, $0x10
|
|
JB openSSETail64LoopB
|
|
|
|
openSSETail64LoopA:
|
|
ADDQ (SI)(R9*1), R10
|
|
ADCQ 8(SI)(R9*1), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
SUBQ $0x10, CX
|
|
|
|
openSSETail64LoopB:
|
|
ADDQ $0x10, R9
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
CMPQ CX, $0x10
|
|
JAE openSSETail64LoopA
|
|
CMPQ R9, $0xa0
|
|
JNE openSSETail64LoopB
|
|
PADDL ·chacha20Constants<>+0(SB), X0
|
|
PADDL 32(BP), X3
|
|
PADDL 48(BP), X6
|
|
PADDL 80(BP), X9
|
|
|
|
openSSETail64DecLoop:
|
|
CMPQ BX, $0x10
|
|
JB openSSETail64DecLoopDone
|
|
SUBQ $0x10, BX
|
|
MOVOU (SI), X12
|
|
PXOR X12, X0
|
|
MOVOU X0, (DI)
|
|
LEAQ 16(SI), SI
|
|
LEAQ 16(DI), DI
|
|
MOVO X3, X0
|
|
MOVO X6, X3
|
|
MOVO X9, X6
|
|
JMP openSSETail64DecLoop
|
|
|
|
openSSETail64DecLoopDone:
|
|
MOVO X0, X1
|
|
JMP openSSETail16
|
|
|
|
openSSETail128:
|
|
MOVO ·chacha20Constants<>+0(SB), X1
|
|
MOVO 32(BP), X4
|
|
MOVO 48(BP), X7
|
|
MOVO 128(BP), X10
|
|
PADDL ·sseIncMask<>+0(SB), X10
|
|
MOVO X10, 80(BP)
|
|
MOVO X1, X0
|
|
MOVO X4, X3
|
|
MOVO X7, X6
|
|
MOVO X10, X9
|
|
PADDL ·sseIncMask<>+0(SB), X9
|
|
MOVO X9, 96(BP)
|
|
XORQ R9, R9
|
|
MOVQ BX, CX
|
|
ANDQ $-16, CX
|
|
|
|
openSSETail128LoopA:
|
|
ADDQ (SI)(R9*1), R10
|
|
ADCQ 8(SI)(R9*1), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
|
|
openSSETail128LoopB:
|
|
ADDQ $0x10, R9
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x0c
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x04
|
|
CMPQ R9, CX
|
|
JB openSSETail128LoopA
|
|
CMPQ R9, $0xa0
|
|
JNE openSSETail128LoopB
|
|
PADDL ·chacha20Constants<>+0(SB), X0
|
|
PADDL ·chacha20Constants<>+0(SB), X1
|
|
PADDL 32(BP), X3
|
|
PADDL 32(BP), X4
|
|
PADDL 48(BP), X6
|
|
PADDL 48(BP), X7
|
|
PADDL 96(BP), X9
|
|
PADDL 80(BP), X10
|
|
MOVOU (SI), X12
|
|
MOVOU 16(SI), X13
|
|
MOVOU 32(SI), X14
|
|
MOVOU 48(SI), X15
|
|
PXOR X12, X1
|
|
PXOR X13, X4
|
|
PXOR X14, X7
|
|
PXOR X15, X10
|
|
MOVOU X1, (DI)
|
|
MOVOU X4, 16(DI)
|
|
MOVOU X7, 32(DI)
|
|
MOVOU X10, 48(DI)
|
|
SUBQ $0x40, BX
|
|
LEAQ 64(SI), SI
|
|
LEAQ 64(DI), DI
|
|
JMP openSSETail64DecLoop
|
|
|
|
openSSETail192:
|
|
MOVO ·chacha20Constants<>+0(SB), X2
|
|
MOVO 32(BP), X5
|
|
MOVO 48(BP), X8
|
|
MOVO 128(BP), X11
|
|
PADDL ·sseIncMask<>+0(SB), X11
|
|
MOVO X11, 80(BP)
|
|
MOVO X2, X1
|
|
MOVO X5, X4
|
|
MOVO X8, X7
|
|
MOVO X11, X10
|
|
PADDL ·sseIncMask<>+0(SB), X10
|
|
MOVO X10, 96(BP)
|
|
MOVO X1, X0
|
|
MOVO X4, X3
|
|
MOVO X7, X6
|
|
MOVO X10, X9
|
|
PADDL ·sseIncMask<>+0(SB), X9
|
|
MOVO X9, 112(BP)
|
|
MOVQ BX, CX
|
|
MOVQ $0x000000a0, R9
|
|
CMPQ CX, $0xa0
|
|
CMOVQGT R9, CX
|
|
ANDQ $-16, CX
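// Hash at most 160 bytes (ten 16-byte blocks) while running the cipher
// rounds: CX = min(BX, 0xa0), rounded down to a multiple of 16.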
XORQ R9, R9
|
|
|
|
openSSLTail192LoopA:
|
|
ADDQ (SI)(R9*1), R10
|
|
ADCQ 8(SI)(R9*1), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
|
|
openSSLTail192LoopB:
|
|
ADDQ $0x10, R9
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X5
|
|
PXOR X12, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X5
|
|
PXOR X12, X5
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X5
|
|
PXOR X12, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X5
|
|
PXOR X12, X5
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
CMPQ R9, CX
|
|
JB openSSLTail192LoopA
|
|
CMPQ R9, $0xa0
|
|
JNE openSSLTail192LoopB
|
|
CMPQ BX, $0xb0
|
|
JB openSSLTail192Store
|
|
ADDQ 160(SI), R10
|
|
ADCQ 168(SI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
CMPQ BX, $0xc0
|
|
JB openSSLTail192Store
|
|
ADDQ 176(SI), R10
|
|
ADCQ 184(SI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
|
|
openSSLTail192Store:
|
|
PADDL ·chacha20Constants<>+0(SB), X0
|
|
PADDL ·chacha20Constants<>+0(SB), X1
|
|
PADDL ·chacha20Constants<>+0(SB), X2
|
|
PADDL 32(BP), X3
|
|
PADDL 32(BP), X4
|
|
PADDL 32(BP), X5
|
|
PADDL 48(BP), X6
|
|
PADDL 48(BP), X7
|
|
PADDL 48(BP), X8
|
|
PADDL 112(BP), X9
|
|
PADDL 96(BP), X10
|
|
PADDL 80(BP), X11
|
|
MOVOU (SI), X12
|
|
MOVOU 16(SI), X13
|
|
MOVOU 32(SI), X14
|
|
MOVOU 48(SI), X15
|
|
PXOR X12, X2
|
|
PXOR X13, X5
|
|
PXOR X14, X8
|
|
PXOR X15, X11
|
|
MOVOU X2, (DI)
|
|
MOVOU X5, 16(DI)
|
|
MOVOU X8, 32(DI)
|
|
MOVOU X11, 48(DI)
|
|
MOVOU 64(SI), X12
|
|
MOVOU 80(SI), X13
|
|
MOVOU 96(SI), X14
|
|
MOVOU 112(SI), X15
|
|
PXOR X12, X1
|
|
PXOR X13, X4
|
|
PXOR X14, X7
|
|
PXOR X15, X10
|
|
MOVOU X1, 64(DI)
|
|
MOVOU X4, 80(DI)
|
|
MOVOU X7, 96(DI)
|
|
MOVOU X10, 112(DI)
|
|
SUBQ $0x80, BX
|
|
LEAQ 128(SI), SI
|
|
LEAQ 128(DI), DI
|
|
JMP openSSETail64DecLoop
|
|
|
|
openSSETail256:
|
|
MOVO ·chacha20Constants<>+0(SB), X0
|
|
MOVO 32(BP), X3
|
|
MOVO 48(BP), X6
|
|
MOVO 128(BP), X9
|
|
PADDL ·sseIncMask<>+0(SB), X9
|
|
MOVO X0, X1
|
|
MOVO X3, X4
|
|
MOVO X6, X7
|
|
MOVO X9, X10
|
|
PADDL ·sseIncMask<>+0(SB), X10
|
|
MOVO X1, X2
|
|
MOVO X4, X5
|
|
MOVO X7, X8
|
|
MOVO X10, X11
|
|
PADDL ·sseIncMask<>+0(SB), X11
|
|
MOVO X2, X12
|
|
MOVO X5, X13
|
|
MOVO X8, X14
|
|
MOVO X11, X15
|
|
PADDL ·sseIncMask<>+0(SB), X15
|
|
|
|
// Store counters
|
|
MOVO X9, 80(BP)
|
|
MOVO X10, 96(BP)
|
|
MOVO X11, 112(BP)
|
|
MOVO X15, 128(BP)
|
|
XORQ R9, R9
|
|
|
|
openSSETail256Loop:
|
|
ADDQ (SI)(R9*1), R10
|
|
ADCQ 8(SI)(R9*1), R11
|
|
ADCQ $0x01, R12
|
|
MOVO X14, 64(BP)
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X3
|
|
PXOR X14, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X3
|
|
PXOR X14, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X4
|
|
PXOR X14, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X4
|
|
PXOR X14, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X5
|
|
PXOR X14, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X5
|
|
PXOR X14, X5
|
|
MOVO 64(BP), X14
|
|
MOVO X7, 64(BP)
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL16(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x0c, X7
|
|
PSRLL $0x14, X13
|
|
PXOR X7, X13
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL8(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x07, X7
|
|
PSRLL $0x19, X13
|
|
PXOR X7, X13
|
|
MOVO 64(BP), X7
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x0c
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
MOVO X14, 64(BP)
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X3
|
|
PXOR X14, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X3
|
|
PXOR X14, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X4
|
|
PXOR X14, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X4
|
|
PXOR X14, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X5
|
|
PXOR X14, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X5
|
|
PXOR X14, X5
|
|
MOVO 64(BP), X14
|
|
MOVO X7, 64(BP)
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL16(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x0c, X7
|
|
PSRLL $0x14, X13
|
|
PXOR X7, X13
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL8(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x07, X7
|
|
PSRLL $0x19, X13
|
|
PXOR X7, X13
|
|
MOVO 64(BP), X7
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x04
|
|
ADDQ $0x10, R9
|
|
CMPQ R9, $0xa0
|
|
JB openSSETail256Loop
|
|
MOVQ BX, CX
|
|
ANDQ $-16, CX
|
|
|
|
openSSETail256HashLoop:
|
|
ADDQ (SI)(R9*1), R10
|
|
ADCQ 8(SI)(R9*1), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ $0x10, R9
|
|
CMPQ R9, CX
|
|
JB openSSETail256HashLoop
|
|
|
|
// Add in the state
|
|
PADDD ·chacha20Constants<>+0(SB), X0
|
|
PADDD ·chacha20Constants<>+0(SB), X1
|
|
PADDD ·chacha20Constants<>+0(SB), X2
|
|
PADDD ·chacha20Constants<>+0(SB), X12
|
|
PADDD 32(BP), X3
|
|
PADDD 32(BP), X4
|
|
PADDD 32(BP), X5
|
|
PADDD 32(BP), X13
|
|
PADDD 48(BP), X6
|
|
PADDD 48(BP), X7
|
|
PADDD 48(BP), X8
|
|
PADDD 48(BP), X14
|
|
PADDD 80(BP), X9
|
|
PADDD 96(BP), X10
|
|
PADDD 112(BP), X11
|
|
PADDD 128(BP), X15
|
|
MOVO X15, 64(BP)
|
|
|
|
// Load - xor - store
|
|
MOVOU (SI), X15
|
|
PXOR X15, X0
|
|
MOVOU 16(SI), X15
|
|
PXOR X15, X3
|
|
MOVOU 32(SI), X15
|
|
PXOR X15, X6
|
|
MOVOU 48(SI), X15
|
|
PXOR X15, X9
|
|
MOVOU X0, (DI)
|
|
MOVOU X3, 16(DI)
|
|
MOVOU X6, 32(DI)
|
|
MOVOU X9, 48(DI)
|
|
MOVOU 64(SI), X0
|
|
MOVOU 80(SI), X3
|
|
MOVOU 96(SI), X6
|
|
MOVOU 112(SI), X9
|
|
PXOR X0, X1
|
|
PXOR X3, X4
|
|
PXOR X6, X7
|
|
PXOR X9, X10
|
|
MOVOU X1, 64(DI)
|
|
MOVOU X4, 80(DI)
|
|
MOVOU X7, 96(DI)
|
|
MOVOU X10, 112(DI)
|
|
MOVOU 128(SI), X0
|
|
MOVOU 144(SI), X3
|
|
MOVOU 160(SI), X6
|
|
MOVOU 176(SI), X9
|
|
PXOR X0, X2
|
|
PXOR X3, X5
|
|
PXOR X6, X8
|
|
PXOR X9, X11
|
|
MOVOU X2, 128(DI)
|
|
MOVOU X5, 144(DI)
|
|
MOVOU X8, 160(DI)
|
|
MOVOU X11, 176(DI)
|
|
LEAQ 192(SI), SI
|
|
LEAQ 192(DI), DI
|
|
SUBQ $0xc0, BX
|
|
MOVO X12, X0
|
|
MOVO X13, X3
|
|
MOVO X14, X6
|
|
MOVO 64(BP), X9
|
|
JMP openSSETail64DecLoop
|
|
|
|
chacha20Poly1305Open_AVX2:
|
|
VZEROUPPER
|
|
VMOVDQU ·chacha20Constants<>+0(SB), Y0
|
|
BYTE $0xc4
|
|
BYTE $0x42
|
|
BYTE $0x7d
|
|
BYTE $0x5a
|
|
BYTE $0x70
|
|
BYTE $0x10
|
|
BYTE $0xc4
|
|
BYTE $0x42
|
|
BYTE $0x7d
|
|
BYTE $0x5a
|
|
BYTE $0x60
|
|
BYTE $0x20
|
|
BYTE $0xc4
|
|
BYTE $0xc2
|
|
BYTE $0x7d
|
|
BYTE $0x5a
|
|
BYTE $0x60
|
|
BYTE $0x30
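
// The BYTE runs above hand-encode three VEX VBROADCASTI128 loads:
//	VBROADCASTI128 16(R8), Y14
//	VBROADCASTI128 32(R8), Y12
//	VBROADCASTI128 48(R8), Y4
// replicating each 128-bit row of the ChaCha20 state into both halves of a
// YMM register.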
VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
|
|
|
|
// Special optimization, for very short buffers
|
|
CMPQ BX, $0xc0
|
|
JBE openAVX2192
|
|
CMPQ BX, $0x00000140
|
|
JBE openAVX2320
|
|
|
|
// For the general case, prepare the poly key first; as a byproduct we get 64 bytes of cipher stream
|
|
VMOVDQA Y14, 32(BP)
|
|
VMOVDQA Y12, 64(BP)
|
|
VMOVDQA Y4, 192(BP)
|
|
MOVQ $0x0000000a, R9
|
|
|
|
openAVX2PreparePolyKey:
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPALIGNR $0x04, Y14, Y14, Y14
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x0c, Y4, Y4, Y4
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPALIGNR $0x0c, Y14, Y14, Y14
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x04, Y4, Y4, Y4
|
|
DECQ R9
|
|
JNE openAVX2PreparePolyKey
|
|
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
|
|
VPADDD 32(BP), Y14, Y14
|
|
VPADDD 64(BP), Y12, Y12
|
|
VPADDD 192(BP), Y4, Y4
|
|
VPERM2I128 $0x02, Y0, Y14, Y3
|
|
|
|
// Clamp and store poly key
|
|
VPAND ·polyClampMask<>+0(SB), Y3, Y3
|
|
VMOVDQA Y3, (BP)
|
|
|
|
// Stream for the first 64 bytes
|
|
VPERM2I128 $0x13, Y0, Y14, Y0
|
|
VPERM2I128 $0x13, Y12, Y4, Y14
|
|
|
|
// Hash AD + first 64 bytes
|
|
MOVQ ad_len+80(FP), R9
|
|
CALL polyHashADInternal<>(SB)
|
|
XORQ CX, CX
|
|
|
|
openAVX2InitialHash64:
|
|
ADDQ (SI)(CX*1), R10
|
|
ADCQ 8(SI)(CX*1), R11
|
|
ADCQ $0x01, R12
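// With BMI2 available, the Poly1305 ladder uses MULXQ, which multiplies by
// DX and writes the product to an explicit register pair, instead of the
// MULQ/AX/DX dance of the SSE path; the math is the same h = (h * r) mod 2^130 - 5.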
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ $0x10, CX
|
|
CMPQ CX, $0x40
|
|
JNE openAVX2InitialHash64
|
|
|
|
// Decrypt the first 64 bytes
|
|
VPXOR (SI), Y0, Y0
|
|
VPXOR 32(SI), Y14, Y14
|
|
VMOVDQU Y0, (DI)
|
|
VMOVDQU Y14, 32(DI)
|
|
LEAQ 64(SI), SI
|
|
LEAQ 64(DI), DI
|
|
SUBQ $0x40, BX
|
|
|
|
openAVX2MainLoop:
|
|
CMPQ BX, $0x00000200
|
|
JB openAVX2MainLoopDone
|
|
|
|
// Load state, increment counter blocks, store the incremented counters
|
|
VMOVDQU ·chacha20Constants<>+0(SB), Y0
|
|
VMOVDQA Y0, Y5
|
|
VMOVDQA Y0, Y6
|
|
VMOVDQA Y0, Y7
|
|
VMOVDQA 32(BP), Y14
|
|
VMOVDQA Y14, Y9
|
|
VMOVDQA Y14, Y10
|
|
VMOVDQA Y14, Y11
|
|
VMOVDQA 64(BP), Y12
|
|
VMOVDQA Y12, Y13
|
|
VMOVDQA Y12, Y8
|
|
VMOVDQA Y12, Y15
|
|
VMOVDQA 192(BP), Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
|
|
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
|
|
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
|
|
VMOVDQA Y4, 96(BP)
|
|
VMOVDQA Y1, 128(BP)
|
|
VMOVDQA Y2, 160(BP)
|
|
VMOVDQA Y3, 192(BP)
|
|
XORQ CX, CX
|
|
|
|
openAVX2InternalLoop:
|
|
ADDQ (SI)(CX*1), R10
|
|
ADCQ 8(SI)(CX*1), R11
|
|
ADCQ $0x01, R12
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y3, Y3
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x0c, Y14, Y15
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x0c, Y9, Y15
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x0c, Y10, Y15
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x0c, Y11, Y15
|
|
VPSRLD $0x14, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y3, Y3
|
|
ADDQ 16(SI)(CX*1), R10
|
|
ADCQ 24(SI)(CX*1), R11
|
|
ADCQ $0x01, R12
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x07, Y14, Y15
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x07, Y9, Y15
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x07, Y10, Y15
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x07, Y11, Y15
|
|
VPSRLD $0x19, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
VPALIGNR $0x04, Y14, Y14, Y14
|
|
VPALIGNR $0x04, Y9, Y9, Y9
|
|
VPALIGNR $0x04, Y10, Y10, Y10
|
|
VPALIGNR $0x04, Y11, Y11, Y11
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x08, Y15, Y15, Y15
|
|
VPALIGNR $0x0c, Y4, Y4, Y4
|
|
VPALIGNR $0x0c, Y1, Y1, Y1
|
|
VPALIGNR $0x0c, Y2, Y2, Y2
|
|
VPALIGNR $0x0c, Y3, Y3, Y3
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y3, Y3
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
ADDQ 32(SI)(CX*1), R10
|
|
ADCQ 40(SI)(CX*1), R11
|
|
ADCQ $0x01, R12
|
|
LEAQ 48(CX), CX
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x0c, Y14, Y15
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x0c, Y9, Y15
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x0c, Y10, Y15
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x0c, Y11, Y15
|
|
VPSRLD $0x14, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y3, Y3
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x07, Y14, Y15
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x07, Y9, Y15
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x07, Y10, Y15
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x07, Y11, Y15
|
|
VPSRLD $0x19, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
VPALIGNR $0x0c, Y14, Y14, Y14
|
|
VPALIGNR $0x0c, Y9, Y9, Y9
|
|
VPALIGNR $0x0c, Y10, Y10, Y10
|
|
VPALIGNR $0x0c, Y11, Y11, Y11
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x08, Y15, Y15, Y15
|
|
VPALIGNR $0x04, Y4, Y4, Y4
|
|
VPALIGNR $0x04, Y1, Y1, Y1
|
|
VPALIGNR $0x04, Y2, Y2, Y2
|
|
VPALIGNR $0x04, Y3, Y3, Y3
|
|
CMPQ CX, $0x000001e0
|
|
JNE openAVX2InternalLoop
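	// Each trip through the loop above runs one ChaCha20 double round on
	// all four YMM register groups (eight 64 byte blocks) while absorbing
	// 48 bytes of ciphertext into the Poly1305 state; CX advances by 48
	// per iteration, so the ten double rounds hash 480 (0x1e0) bytes.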
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
|
|
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
|
|
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
|
|
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
|
|
VPADDD 32(BP), Y14, Y14
|
|
VPADDD 32(BP), Y9, Y9
|
|
VPADDD 32(BP), Y10, Y10
|
|
VPADDD 32(BP), Y11, Y11
|
|
VPADDD 64(BP), Y12, Y12
|
|
VPADDD 64(BP), Y13, Y13
|
|
VPADDD 64(BP), Y8, Y8
|
|
VPADDD 64(BP), Y15, Y15
|
|
VPADDD 96(BP), Y4, Y4
|
|
VPADDD 128(BP), Y1, Y1
|
|
VPADDD 160(BP), Y2, Y2
|
|
VPADDD 192(BP), Y3, Y3
|
|
VMOVDQA Y15, 224(BP)

	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
ADDQ 480(SI), R10
|
|
ADCQ 488(SI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
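	// The VPERM2I128 $0x02/$0x13 pairs below reassemble the lane-interleaved
	// AVX2 state into contiguous 64 byte keystream blocks, which are then
	// XORed against the ciphertext and written out.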
VPERM2I128 $0x02, Y0, Y14, Y15
|
|
VPERM2I128 $0x13, Y0, Y14, Y14
|
|
VPERM2I128 $0x02, Y12, Y4, Y0
|
|
VPERM2I128 $0x13, Y12, Y4, Y12
|
|
VPXOR (SI), Y15, Y15
|
|
VPXOR 32(SI), Y0, Y0
|
|
VPXOR 64(SI), Y14, Y14
|
|
VPXOR 96(SI), Y12, Y12
|
|
VMOVDQU Y15, (DI)
|
|
VMOVDQU Y0, 32(DI)
|
|
VMOVDQU Y14, 64(DI)
|
|
VMOVDQU Y12, 96(DI)
|
|
VPERM2I128 $0x02, Y5, Y9, Y0
|
|
VPERM2I128 $0x02, Y13, Y1, Y14
|
|
VPERM2I128 $0x13, Y5, Y9, Y12
|
|
VPERM2I128 $0x13, Y13, Y1, Y4
|
|
VPXOR 128(SI), Y0, Y0
|
|
VPXOR 160(SI), Y14, Y14
|
|
VPXOR 192(SI), Y12, Y12
|
|
VPXOR 224(SI), Y4, Y4
|
|
VMOVDQU Y0, 128(DI)
|
|
VMOVDQU Y14, 160(DI)
|
|
VMOVDQU Y12, 192(DI)
|
|
VMOVDQU Y4, 224(DI)

	// and the second 16 bytes of those remaining 32 here
ADDQ 496(SI), R10
|
|
ADCQ 504(SI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
VPERM2I128 $0x02, Y6, Y10, Y0
|
|
VPERM2I128 $0x02, Y8, Y2, Y14
|
|
VPERM2I128 $0x13, Y6, Y10, Y12
|
|
VPERM2I128 $0x13, Y8, Y2, Y4
|
|
VPXOR 256(SI), Y0, Y0
|
|
VPXOR 288(SI), Y14, Y14
|
|
VPXOR 320(SI), Y12, Y12
|
|
VPXOR 352(SI), Y4, Y4
|
|
VMOVDQU Y0, 256(DI)
|
|
VMOVDQU Y14, 288(DI)
|
|
VMOVDQU Y12, 320(DI)
|
|
VMOVDQU Y4, 352(DI)
|
|
VPERM2I128 $0x02, Y7, Y11, Y0
|
|
VPERM2I128 $0x02, 224(BP), Y3, Y14
|
|
VPERM2I128 $0x13, Y7, Y11, Y12
|
|
VPERM2I128 $0x13, 224(BP), Y3, Y4
|
|
VPXOR 384(SI), Y0, Y0
|
|
VPXOR 416(SI), Y14, Y14
|
|
VPXOR 448(SI), Y12, Y12
|
|
VPXOR 480(SI), Y4, Y4
|
|
VMOVDQU Y0, 384(DI)
|
|
VMOVDQU Y14, 416(DI)
|
|
VMOVDQU Y12, 448(DI)
|
|
VMOVDQU Y4, 480(DI)
|
|
LEAQ 512(SI), SI
|
|
LEAQ 512(DI), DI
|
|
SUBQ $0x00000200, BX
|
|
JMP openAVX2MainLoop

openAVX2MainLoopDone:
	// Handle the various tail sizes efficiently
TESTQ BX, BX
|
|
JE openSSEFinalize
|
|
CMPQ BX, $0x80
|
|
JBE openAVX2Tail128
|
|
CMPQ BX, $0x00000100
|
|
JBE openAVX2Tail256
|
|
CMPQ BX, $0x00000180
|
|
JBE openAVX2Tail384
|
|
JMP openAVX2Tail512
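	// Remainders of at most 128, 256 or 384 bytes were dispatched to
	// dedicated tails above; anything larger is finished by openAVX2Tail512.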

openAVX2192:
VMOVDQA Y0, Y5
|
|
VMOVDQA Y14, Y9
|
|
VMOVDQA Y12, Y13
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
|
|
VMOVDQA Y0, Y6
|
|
VMOVDQA Y14, Y10
|
|
VMOVDQA Y12, Y8
|
|
VMOVDQA Y4, Y2
|
|
VMOVDQA Y1, Y15
|
|
MOVQ $0x0000000a, R9
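	// R9 counts double rounds: ten trips through the loop below, each a
	// column round plus a diagonal round, give the full 20 ChaCha20 rounds.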

openAVX2192InnerCipherLoop:
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPALIGNR $0x04, Y14, Y14, Y14
|
|
VPALIGNR $0x04, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x0c, Y4, Y4, Y4
|
|
VPALIGNR $0x0c, Y1, Y1, Y1
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPALIGNR $0x0c, Y14, Y14, Y14
|
|
VPALIGNR $0x0c, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x04, Y4, Y4, Y4
|
|
VPALIGNR $0x04, Y1, Y1, Y1
|
|
DECQ R9
|
|
JNE openAVX2192InnerCipherLoop
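	// Add the initial state (saved in Y6/Y10/Y8/Y2/Y15 before the loop)
	// back in to finalize the keystream.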
VPADDD Y6, Y0, Y0
|
|
VPADDD Y6, Y5, Y5
|
|
VPADDD Y10, Y14, Y14
|
|
VPADDD Y10, Y9, Y9
|
|
VPADDD Y8, Y12, Y12
|
|
VPADDD Y8, Y13, Y13
|
|
VPADDD Y2, Y4, Y4
|
|
VPADDD Y15, Y1, Y1
|
|
VPERM2I128 $0x02, Y0, Y14, Y3

	// Clamp and store poly key
VPAND ·polyClampMask<>+0(SB), Y3, Y3
|
|
VMOVDQA Y3, (BP)

	// Stream for up to 192 bytes
VPERM2I128 $0x13, Y0, Y14, Y0
|
|
VPERM2I128 $0x13, Y12, Y4, Y14
|
|
VPERM2I128 $0x02, Y5, Y9, Y12
|
|
VPERM2I128 $0x02, Y13, Y1, Y4
|
|
VPERM2I128 $0x13, Y5, Y9, Y5
|
|
VPERM2I128 $0x13, Y13, Y1, Y9

openAVX2ShortOpen:
	// Hash the additional data
MOVQ ad_len+80(FP), R9
|
|
CALL polyHashADInternal<>(SB)
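	// Decrypt 32 bytes per iteration: absorb two 16 byte Poly1305 blocks of
	// ciphertext, XOR one YMM register of keystream, then rotate the
	// remaining keystream registers down into Y0.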

openAVX2ShortOpenLoop:
CMPQ BX, $0x20
|
|
JB openAVX2ShortTail32
|
|
SUBQ $0x20, BX

	// Load for hashing
ADDQ (SI), R10
|
|
ADCQ 8(SI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ 16(SI), R10
|
|
ADCQ 24(SI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12

	// Load for decryption
VPXOR (SI), Y0, Y0
|
|
VMOVDQU Y0, (DI)
|
|
LEAQ 32(SI), SI
|
|
LEAQ 32(DI), DI

	// Shift stream left
VMOVDQA Y14, Y0
|
|
VMOVDQA Y12, Y14
|
|
VMOVDQA Y4, Y12
|
|
VMOVDQA Y5, Y4
|
|
VMOVDQA Y9, Y5
|
|
VMOVDQA Y13, Y9
|
|
VMOVDQA Y1, Y13
|
|
VMOVDQA Y6, Y1
|
|
VMOVDQA Y10, Y6
|
|
JMP openAVX2ShortOpenLoop

openAVX2ShortTail32:
CMPQ BX, $0x10
|
|
VMOVDQA X0, X1
|
|
JB openAVX2ShortDone
|
|
SUBQ $0x10, BX

	// Load for hashing
ADDQ (SI), R10
|
|
ADCQ 8(SI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12

	// Load for decryption
VPXOR (SI), X0, X12
|
|
VMOVDQU X12, (DI)
|
|
LEAQ 16(SI), SI
|
|
LEAQ 16(DI), DI
|
|
VPERM2I128 $0x11, Y0, Y0, Y0
|
|
VMOVDQA X0, X1
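	// The next keystream block is kept in X1; openSSETail16 presumably
	// finishes any remaining sub-16-byte tail with it.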

openAVX2ShortDone:
VZEROUPPER
|
|
JMP openSSETail16

openAVX2320:
VMOVDQA Y0, Y5
|
|
VMOVDQA Y14, Y9
|
|
VMOVDQA Y12, Y13
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
|
|
VMOVDQA Y0, Y6
|
|
VMOVDQA Y14, Y10
|
|
VMOVDQA Y12, Y8
|
|
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
|
|
VMOVDQA Y14, Y7
|
|
VMOVDQA Y12, Y11
|
|
VMOVDQA Y4, Y15
|
|
MOVQ $0x0000000a, R9

openAVX2320InnerCipherLoop:
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x0c, Y10, Y3
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y3, Y10, Y10
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x07, Y10, Y3
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y3, Y10, Y10
|
|
VPALIGNR $0x04, Y14, Y14, Y14
|
|
VPALIGNR $0x04, Y9, Y9, Y9
|
|
VPALIGNR $0x04, Y10, Y10, Y10
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x0c, Y4, Y4, Y4
|
|
VPALIGNR $0x0c, Y1, Y1, Y1
|
|
VPALIGNR $0x0c, Y2, Y2, Y2
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x0c, Y10, Y3
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y3, Y10, Y10
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x07, Y10, Y3
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y3, Y10, Y10
|
|
VPALIGNR $0x0c, Y14, Y14, Y14
|
|
VPALIGNR $0x0c, Y9, Y9, Y9
|
|
VPALIGNR $0x0c, Y10, Y10, Y10
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x04, Y4, Y4, Y4
|
|
VPALIGNR $0x04, Y1, Y1, Y1
|
|
VPALIGNR $0x04, Y2, Y2, Y2
|
|
DECQ R9
|
|
JNE openAVX2320InnerCipherLoop
|
|
VMOVDQA ·chacha20Constants<>+0(SB), Y3
|
|
VPADDD Y3, Y0, Y0
|
|
VPADDD Y3, Y5, Y5
|
|
VPADDD Y3, Y6, Y6
|
|
VPADDD Y7, Y14, Y14
|
|
VPADDD Y7, Y9, Y9
|
|
VPADDD Y7, Y10, Y10
|
|
VPADDD Y11, Y12, Y12
|
|
VPADDD Y11, Y13, Y13
|
|
VPADDD Y11, Y8, Y8
|
|
VMOVDQA ·avx2IncMask<>+0(SB), Y3
|
|
VPADDD Y15, Y4, Y4
|
|
VPADDD Y3, Y15, Y15
|
|
VPADDD Y15, Y1, Y1
|
|
VPADDD Y3, Y15, Y15
|
|
VPADDD Y15, Y2, Y2

	// Clamp and store poly key
VPERM2I128 $0x02, Y0, Y14, Y3
|
|
VPAND ·polyClampMask<>+0(SB), Y3, Y3
|
|
VMOVDQA Y3, (BP)

	// Stream for up to 320 bytes
VPERM2I128 $0x13, Y0, Y14, Y0
|
|
VPERM2I128 $0x13, Y12, Y4, Y14
|
|
VPERM2I128 $0x02, Y5, Y9, Y12
|
|
VPERM2I128 $0x02, Y13, Y1, Y4
|
|
VPERM2I128 $0x13, Y5, Y9, Y5
|
|
VPERM2I128 $0x13, Y13, Y1, Y9
|
|
VPERM2I128 $0x02, Y6, Y10, Y13
|
|
VPERM2I128 $0x02, Y8, Y2, Y1
|
|
VPERM2I128 $0x13, Y6, Y10, Y6
|
|
VPERM2I128 $0x13, Y8, Y2, Y10
|
|
JMP openAVX2ShortOpen

openAVX2Tail128:
	// Need to decrypt up to 128 bytes - prepare two blocks
VMOVDQA ·chacha20Constants<>+0(SB), Y5
|
|
VMOVDQA 32(BP), Y9
|
|
VMOVDQA 64(BP), Y13
|
|
VMOVDQA 192(BP), Y1
|
|
VPADDD ·avx2IncMask<>+0(SB), Y1, Y1
|
|
VMOVDQA Y1, Y4
|
|
XORQ R9, R9
|
|
MOVQ BX, CX
|
|
ANDQ $-16, CX
|
|
TESTQ CX, CX
|
|
JE openAVX2Tail128LoopB
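	// LoopA absorbs one 16 byte ciphertext block per double round; entry
	// jumps straight to LoopB when there is nothing left to hash, and the
	// rounds continue until R9 reaches 0xa0 (ten double rounds).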

openAVX2Tail128LoopA:
ADDQ (SI)(R9*1), R10
|
|
ADCQ 8(SI)(R9*1), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12

openAVX2Tail128LoopB:
ADDQ $0x10, R9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPALIGNR $0x04, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x0c, Y1, Y1, Y1
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPALIGNR $0x0c, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x04, Y1, Y1, Y1
|
|
CMPQ R9, CX
|
|
JB openAVX2Tail128LoopA
|
|
CMPQ R9, $0xa0
|
|
JNE openAVX2Tail128LoopB
|
|
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
|
|
VPADDD 32(BP), Y9, Y9
|
|
VPADDD 64(BP), Y13, Y13
|
|
VPADDD Y4, Y1, Y1
|
|
VPERM2I128 $0x02, Y5, Y9, Y0
|
|
VPERM2I128 $0x02, Y13, Y1, Y14
|
|
VPERM2I128 $0x13, Y5, Y9, Y12
|
|
VPERM2I128 $0x13, Y13, Y1, Y4

openAVX2TailLoop:
CMPQ BX, $0x20
|
|
JB openAVX2Tail
|
|
SUBQ $0x20, BX

	// Load for decryption
VPXOR (SI), Y0, Y0
|
|
VMOVDQU Y0, (DI)
|
|
LEAQ 32(SI), SI
|
|
LEAQ 32(DI), DI
|
|
VMOVDQA Y14, Y0
|
|
VMOVDQA Y12, Y14
|
|
VMOVDQA Y4, Y12
|
|
JMP openAVX2TailLoop

openAVX2Tail:
CMPQ BX, $0x10
|
|
VMOVDQA X0, X1
|
|
JB openAVX2TailDone
|
|
SUBQ $0x10, BX

	// Load for decryption
VPXOR (SI), X0, X12
|
|
VMOVDQU X12, (DI)
|
|
LEAQ 16(SI), SI
|
|
LEAQ 16(DI), DI
|
|
VPERM2I128 $0x11, Y0, Y0, Y0
|
|
VMOVDQA X0, X1

openAVX2TailDone:
VZEROUPPER
|
|
JMP openSSETail16

openAVX2Tail256:
VMOVDQA ·chacha20Constants<>+0(SB), Y0
|
|
VMOVDQA Y0, Y5
|
|
VMOVDQA 32(BP), Y14
|
|
VMOVDQA Y14, Y9
|
|
VMOVDQA 64(BP), Y12
|
|
VMOVDQA Y12, Y13
|
|
VMOVDQA 192(BP), Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
|
|
VMOVDQA Y4, Y7
|
|
VMOVDQA Y1, Y11

	// Compute the number of iterations that will hash data
MOVQ BX, 224(BP)
|
|
MOVQ BX, CX
|
|
SUBQ $0x80, CX
|
|
SHRQ $0x04, CX
|
|
MOVQ $0x0000000a, R9
|
|
CMPQ CX, $0x0a
|
|
CMOVQGT R9, CX
|
|
MOVQ SI, BX
|
|
XORQ R9, R9
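	// CX now holds min(10, (BX-128)/16): how many of the ten double-round
	// iterations below also absorb one 16 byte Poly1305 block.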

openAVX2Tail256LoopA:
ADDQ (BX), R10
|
|
ADCQ 8(BX), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(BX), BX

openAVX2Tail256LoopB:
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPALIGNR $0x04, Y14, Y14, Y14
|
|
VPALIGNR $0x04, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x0c, Y4, Y4, Y4
|
|
VPALIGNR $0x0c, Y1, Y1, Y1
|
|
INCQ R9
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPALIGNR $0x0c, Y14, Y14, Y14
|
|
VPALIGNR $0x0c, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x04, Y4, Y4, Y4
|
|
VPALIGNR $0x04, Y1, Y1, Y1
|
|
CMPQ R9, CX
|
|
JB openAVX2Tail256LoopA
|
|
CMPQ R9, $0x0a
|
|
JNE openAVX2Tail256LoopB
|
|
MOVQ BX, R9
|
|
SUBQ SI, BX
|
|
MOVQ BX, CX
|
|
MOVQ 224(BP), BX
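	// R9 points at the first unhashed ciphertext byte; the loop below
	// absorbs the remaining whole 16 byte blocks before the final XOR.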

openAVX2Tail256Hash:
ADDQ $0x10, CX
|
|
CMPQ CX, BX
|
|
JGT openAVX2Tail256HashEnd
|
|
ADDQ (R9), R10
|
|
ADCQ 8(R9), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(R9), R9
|
|
JMP openAVX2Tail256Hash

openAVX2Tail256HashEnd:
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
|
|
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
|
|
VPADDD 32(BP), Y14, Y14
|
|
VPADDD 32(BP), Y9, Y9
|
|
VPADDD 64(BP), Y12, Y12
|
|
VPADDD 64(BP), Y13, Y13
|
|
VPADDD Y7, Y4, Y4
|
|
VPADDD Y11, Y1, Y1
|
|
VPERM2I128 $0x02, Y0, Y14, Y6
|
|
VPERM2I128 $0x02, Y12, Y4, Y10
|
|
VPERM2I128 $0x13, Y0, Y14, Y8
|
|
VPERM2I128 $0x13, Y12, Y4, Y2
|
|
VPERM2I128 $0x02, Y5, Y9, Y0
|
|
VPERM2I128 $0x02, Y13, Y1, Y14
|
|
VPERM2I128 $0x13, Y5, Y9, Y12
|
|
VPERM2I128 $0x13, Y13, Y1, Y4
|
|
VPXOR (SI), Y6, Y6
|
|
VPXOR 32(SI), Y10, Y10
|
|
VPXOR 64(SI), Y8, Y8
|
|
VPXOR 96(SI), Y2, Y2
|
|
VMOVDQU Y6, (DI)
|
|
VMOVDQU Y10, 32(DI)
|
|
VMOVDQU Y8, 64(DI)
|
|
VMOVDQU Y2, 96(DI)
|
|
LEAQ 128(SI), SI
|
|
LEAQ 128(DI), DI
|
|
SUBQ $0x80, BX
|
|
JMP openAVX2TailLoop

openAVX2Tail384:
	// Need to decrypt up to 384 bytes - prepare six blocks
VMOVDQA ·chacha20Constants<>+0(SB), Y0
|
|
VMOVDQA Y0, Y5
|
|
VMOVDQA Y0, Y6
|
|
VMOVDQA 32(BP), Y14
|
|
VMOVDQA Y14, Y9
|
|
VMOVDQA Y14, Y10
|
|
VMOVDQA 64(BP), Y12
|
|
VMOVDQA Y12, Y13
|
|
VMOVDQA Y12, Y8
|
|
VMOVDQA 192(BP), Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
|
|
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
|
|
VMOVDQA Y4, 96(BP)
|
|
VMOVDQA Y1, 128(BP)
|
|
VMOVDQA Y2, 160(BP)

	// Compute the number of iterations that will hash two blocks of data
MOVQ BX, 224(BP)
|
|
MOVQ BX, CX
|
|
SUBQ $0x00000100, CX
|
|
SHRQ $0x04, CX
|
|
ADDQ $0x06, CX
|
|
MOVQ $0x0000000a, R9
|
|
CMPQ CX, $0x0a
|
|
CMOVQGT R9, CX
|
|
MOVQ SI, BX
|
|
XORQ R9, R9
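	// CX = min(10, (BX-256)/16 + 6): double-round iterations that also
	// absorb Poly1305 blocks (a full trip through both loop halves hashes
	// two 16 byte blocks).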

openAVX2Tail384LoopB:
ADDQ (BX), R10
|
|
ADCQ 8(BX), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(BX), BX

openAVX2Tail384LoopA:
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x0c, Y10, Y3
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y3, Y10, Y10
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x07, Y10, Y3
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y3, Y10, Y10
|
|
VPALIGNR $0x04, Y14, Y14, Y14
|
|
VPALIGNR $0x04, Y9, Y9, Y9
|
|
VPALIGNR $0x04, Y10, Y10, Y10
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x0c, Y4, Y4, Y4
|
|
VPALIGNR $0x0c, Y1, Y1, Y1
|
|
VPALIGNR $0x0c, Y2, Y2, Y2
|
|
ADDQ (BX), R10
|
|
ADCQ 8(BX), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(BX), BX
|
|
INCQ R9
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x0c, Y10, Y3
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y3, Y10, Y10
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x07, Y10, Y3
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y3, Y10, Y10
|
|
VPALIGNR $0x0c, Y14, Y14, Y14
|
|
VPALIGNR $0x0c, Y9, Y9, Y9
|
|
VPALIGNR $0x0c, Y10, Y10, Y10
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x04, Y4, Y4, Y4
|
|
VPALIGNR $0x04, Y1, Y1, Y1
|
|
VPALIGNR $0x04, Y2, Y2, Y2
|
|
CMPQ R9, CX
|
|
JB openAVX2Tail384LoopB
|
|
CMPQ R9, $0x0a
|
|
JNE openAVX2Tail384LoopA
|
|
MOVQ BX, R9
|
|
SUBQ SI, BX
|
|
MOVQ BX, CX
|
|
MOVQ 224(BP), BX

openAVX2Tail384Hash:
ADDQ $0x10, CX
|
|
CMPQ CX, BX
|
|
JGT openAVX2Tail384HashEnd
|
|
ADDQ (R9), R10
|
|
ADCQ 8(R9), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(R9), R9
|
|
JMP openAVX2Tail384Hash

openAVX2Tail384HashEnd:
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
|
|
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
|
|
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
|
|
VPADDD 32(BP), Y14, Y14
|
|
VPADDD 32(BP), Y9, Y9
|
|
VPADDD 32(BP), Y10, Y10
|
|
VPADDD 64(BP), Y12, Y12
|
|
VPADDD 64(BP), Y13, Y13
|
|
VPADDD 64(BP), Y8, Y8
|
|
VPADDD 96(BP), Y4, Y4
|
|
VPADDD 128(BP), Y1, Y1
|
|
VPADDD 160(BP), Y2, Y2
|
|
VPERM2I128 $0x02, Y0, Y14, Y3
|
|
VPERM2I128 $0x02, Y12, Y4, Y7
|
|
VPERM2I128 $0x13, Y0, Y14, Y11
|
|
VPERM2I128 $0x13, Y12, Y4, Y15
|
|
VPXOR (SI), Y3, Y3
|
|
VPXOR 32(SI), Y7, Y7
|
|
VPXOR 64(SI), Y11, Y11
|
|
VPXOR 96(SI), Y15, Y15
|
|
VMOVDQU Y3, (DI)
|
|
VMOVDQU Y7, 32(DI)
|
|
VMOVDQU Y11, 64(DI)
|
|
VMOVDQU Y15, 96(DI)
|
|
VPERM2I128 $0x02, Y5, Y9, Y3
|
|
VPERM2I128 $0x02, Y13, Y1, Y7
|
|
VPERM2I128 $0x13, Y5, Y9, Y11
|
|
VPERM2I128 $0x13, Y13, Y1, Y15
|
|
VPXOR 128(SI), Y3, Y3
|
|
VPXOR 160(SI), Y7, Y7
|
|
VPXOR 192(SI), Y11, Y11
|
|
VPXOR 224(SI), Y15, Y15
|
|
VMOVDQU Y3, 128(DI)
|
|
VMOVDQU Y7, 160(DI)
|
|
VMOVDQU Y11, 192(DI)
|
|
VMOVDQU Y15, 224(DI)
|
|
VPERM2I128 $0x02, Y6, Y10, Y0
|
|
VPERM2I128 $0x02, Y8, Y2, Y14
|
|
VPERM2I128 $0x13, Y6, Y10, Y12
|
|
VPERM2I128 $0x13, Y8, Y2, Y4
|
|
LEAQ 256(SI), SI
|
|
LEAQ 256(DI), DI
|
|
SUBQ $0x00000100, BX
|
|
JMP openAVX2TailLoop

openAVX2Tail512:
VMOVDQU ·chacha20Constants<>+0(SB), Y0
|
|
VMOVDQA Y0, Y5
|
|
VMOVDQA Y0, Y6
|
|
VMOVDQA Y0, Y7
|
|
VMOVDQA 32(BP), Y14
|
|
VMOVDQA Y14, Y9
|
|
VMOVDQA Y14, Y10
|
|
VMOVDQA Y14, Y11
|
|
VMOVDQA 64(BP), Y12
|
|
VMOVDQA Y12, Y13
|
|
VMOVDQA Y12, Y8
|
|
VMOVDQA Y12, Y15
|
|
VMOVDQA 192(BP), Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
|
|
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
|
|
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
|
|
VMOVDQA Y4, 96(BP)
|
|
VMOVDQA Y1, 128(BP)
|
|
VMOVDQA Y2, 160(BP)
|
|
VMOVDQA Y3, 192(BP)
|
|
XORQ CX, CX
|
|
MOVQ SI, R9

openAVX2Tail512LoopB:
ADDQ (R9), R10
|
|
ADCQ 8(R9), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(R9), R9

openAVX2Tail512LoopA:
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y3, Y3
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x0c, Y14, Y15
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x0c, Y9, Y15
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x0c, Y10, Y15
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x0c, Y11, Y15
|
|
VPSRLD $0x14, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
ADDQ (R9), R10
|
|
ADCQ 8(R9), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y3, Y3
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x07, Y14, Y15
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x07, Y9, Y15
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x07, Y10, Y15
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x07, Y11, Y15
|
|
VPSRLD $0x19, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
VPALIGNR $0x04, Y14, Y14, Y14
|
|
VPALIGNR $0x04, Y9, Y9, Y9
|
|
VPALIGNR $0x04, Y10, Y10, Y10
|
|
VPALIGNR $0x04, Y11, Y11, Y11
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x08, Y15, Y15, Y15
|
|
VPALIGNR $0x0c, Y4, Y4, Y4
|
|
VPALIGNR $0x0c, Y1, Y1, Y1
|
|
VPALIGNR $0x0c, Y2, Y2, Y2
|
|
VPALIGNR $0x0c, Y3, Y3, Y3
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y3, Y3
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
ADDQ 16(R9), R10
|
|
ADCQ 24(R9), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 32(R9), R9
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x0c, Y14, Y15
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x0c, Y9, Y15
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x0c, Y10, Y15
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x0c, Y11, Y15
|
|
VPSRLD $0x14, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y3, Y3
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x07, Y14, Y15
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x07, Y9, Y15
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x07, Y10, Y15
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x07, Y11, Y15
|
|
VPSRLD $0x19, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
VPALIGNR $0x0c, Y14, Y14, Y14
|
|
VPALIGNR $0x0c, Y9, Y9, Y9
|
|
VPALIGNR $0x0c, Y10, Y10, Y10
|
|
VPALIGNR $0x0c, Y11, Y11, Y11
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x08, Y15, Y15, Y15
|
|
VPALIGNR $0x04, Y4, Y4, Y4
|
|
VPALIGNR $0x04, Y1, Y1, Y1
|
|
VPALIGNR $0x04, Y2, Y2, Y2
|
|
VPALIGNR $0x04, Y3, Y3, Y3
|
|
INCQ CX
|
|
CMPQ CX, $0x04
|
|
JLT openAVX2Tail512LoopB
|
|
CMPQ CX, $0x0a
|
|
JNE openAVX2Tail512LoopA
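	// The first four trips entered through LoopB and hashed 48 bytes each;
	// the remaining six hashed 32 each, so 0x180 bytes of ciphertext are
	// absorbed by the time the ten double rounds finish.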
MOVQ BX, CX
|
|
SUBQ $0x00000180, CX
|
|
ANDQ $-16, CX
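	// CX = ciphertext bytes beyond the 0x180 already absorbed, rounded down
	// to whole 16 byte Poly1305 blocks; the loop below hashes them.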

openAVX2Tail512HashLoop:
TESTQ CX, CX
|
|
JE openAVX2Tail512HashEnd
|
|
ADDQ (R9), R10
|
|
ADCQ 8(R9), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(R9), R9
|
|
SUBQ $0x10, CX
|
|
JMP openAVX2Tail512HashLoop

openAVX2Tail512HashEnd:
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
|
|
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
|
|
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
|
|
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
|
|
VPADDD 32(BP), Y14, Y14
|
|
VPADDD 32(BP), Y9, Y9
|
|
VPADDD 32(BP), Y10, Y10
|
|
VPADDD 32(BP), Y11, Y11
|
|
VPADDD 64(BP), Y12, Y12
|
|
VPADDD 64(BP), Y13, Y13
|
|
VPADDD 64(BP), Y8, Y8
|
|
VPADDD 64(BP), Y15, Y15
|
|
VPADDD 96(BP), Y4, Y4
|
|
VPADDD 128(BP), Y1, Y1
|
|
VPADDD 160(BP), Y2, Y2
|
|
VPADDD 192(BP), Y3, Y3
|
|
VMOVDQA Y15, 224(BP)
|
|
VPERM2I128 $0x02, Y0, Y14, Y15
|
|
VPERM2I128 $0x13, Y0, Y14, Y14
|
|
VPERM2I128 $0x02, Y12, Y4, Y0
|
|
VPERM2I128 $0x13, Y12, Y4, Y12
|
|
VPXOR (SI), Y15, Y15
|
|
VPXOR 32(SI), Y0, Y0
|
|
VPXOR 64(SI), Y14, Y14
|
|
VPXOR 96(SI), Y12, Y12
|
|
VMOVDQU Y15, (DI)
|
|
VMOVDQU Y0, 32(DI)
|
|
VMOVDQU Y14, 64(DI)
|
|
VMOVDQU Y12, 96(DI)
|
|
VPERM2I128 $0x02, Y5, Y9, Y0
|
|
VPERM2I128 $0x02, Y13, Y1, Y14
|
|
VPERM2I128 $0x13, Y5, Y9, Y12
|
|
VPERM2I128 $0x13, Y13, Y1, Y4
|
|
VPXOR 128(SI), Y0, Y0
|
|
VPXOR 160(SI), Y14, Y14
|
|
VPXOR 192(SI), Y12, Y12
|
|
VPXOR 224(SI), Y4, Y4
|
|
VMOVDQU Y0, 128(DI)
|
|
VMOVDQU Y14, 160(DI)
|
|
VMOVDQU Y12, 192(DI)
|
|
VMOVDQU Y4, 224(DI)
|
|
VPERM2I128 $0x02, Y6, Y10, Y0
|
|
VPERM2I128 $0x02, Y8, Y2, Y14
|
|
VPERM2I128 $0x13, Y6, Y10, Y12
|
|
VPERM2I128 $0x13, Y8, Y2, Y4
|
|
VPXOR 256(SI), Y0, Y0
|
|
VPXOR 288(SI), Y14, Y14
|
|
VPXOR 320(SI), Y12, Y12
|
|
VPXOR 352(SI), Y4, Y4
|
|
VMOVDQU Y0, 256(DI)
|
|
VMOVDQU Y14, 288(DI)
|
|
VMOVDQU Y12, 320(DI)
|
|
VMOVDQU Y4, 352(DI)
|
|
VPERM2I128 $0x02, Y7, Y11, Y0
|
|
VPERM2I128 $0x02, 224(BP), Y3, Y14
|
|
VPERM2I128 $0x13, Y7, Y11, Y12
|
|
VPERM2I128 $0x13, 224(BP), Y3, Y4
|
|
LEAQ 384(SI), SI
|
|
LEAQ 384(DI), DI
|
|
SUBQ $0x00000180, BX
|
|
JMP openAVX2TailLoop

DATA ·chacha20Constants<>+0(SB)/4, $0x61707865
DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574
DATA ·chacha20Constants<>+16(SB)/4, $0x61707865
DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574
GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32

DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff
DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff
GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32

DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001
DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000
GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16

DATA ·andMask<>+0(SB)/8, $0x00000000000000ff
DATA ·andMask<>+8(SB)/8, $0x0000000000000000
DATA ·andMask<>+16(SB)/8, $0x000000000000ffff
DATA ·andMask<>+24(SB)/8, $0x0000000000000000
DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+40(SB)/8, $0x0000000000000000
DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+56(SB)/8, $0x0000000000000000
DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+72(SB)/8, $0x0000000000000000
DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+88(SB)/8, $0x0000000000000000
DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff
DATA ·andMask<>+104(SB)/8, $0x0000000000000000
DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+120(SB)/8, $0x0000000000000000
DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+136(SB)/8, $0x00000000000000ff
DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+152(SB)/8, $0x000000000000ffff
DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff
GLOBL ·andMask<>(SB), RODATA|NOPTR, $240

DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000
DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001
DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32

DATA ·rol16<>+0(SB)/8, $0x0504070601000302
DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
DATA ·rol16<>+16(SB)/8, $0x0504070601000302
DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
GLOBL ·rol16<>(SB), RODATA|NOPTR, $32
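// rol16 above and rol8 below are VPSHUFB shuffle masks that rotate each
// 32-bit lane left by 16 and 8 bits respectively, replacing a
// shift-and-XOR rotate with a single byte permutation.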

DATA ·rol8<>+0(SB)/8, $0x0605040702010003
DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
DATA ·rol8<>+16(SB)/8, $0x0605040702010003
DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
GLOBL ·rol8<>(SB), RODATA|NOPTR, $32

DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002
DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002
DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32

// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
// Requires: AVX, AVX2, BMI2, CMOV, SSE2
TEXT ·chacha20Poly1305Seal(SB), $288-96
|
|
MOVQ SP, BP
|
|
ADDQ $0x20, BP
|
|
ANDQ $-32, BP
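	// BP is bumped and masked so the 288 byte scratch frame is 32 byte
	// aligned for the aligned vector stores below.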
MOVQ dst_base+0(FP), DI
|
|
MOVQ key_base+24(FP), R8
|
|
MOVQ src_base+48(FP), SI
|
|
MOVQ src_len+56(FP), BX
|
|
MOVQ ad_base+72(FP), CX
|
|
CMPB ·useAVX2+0(SB), $0x01
|
|
JE chacha20Poly1305Seal_AVX2

	// Special optimization for very short buffers
CMPQ BX, $0x80
|
|
JBE sealSSE128

	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
MOVOU ·chacha20Constants<>+0(SB), X0
|
|
MOVOU 16(R8), X3
|
|
MOVOU 32(R8), X6
|
|
MOVOU 48(R8), X9

	// Store state on stack for future use
MOVO X3, 32(BP)
|
|
MOVO X6, 48(BP)

	// Load state, increment counter blocks
MOVO X0, X1
|
|
MOVO X3, X4
|
|
MOVO X6, X7
|
|
MOVO X9, X10
|
|
PADDL ·sseIncMask<>+0(SB), X10
|
|
MOVO X1, X2
|
|
MOVO X4, X5
|
|
MOVO X7, X8
|
|
MOVO X10, X11
|
|
PADDL ·sseIncMask<>+0(SB), X11
|
|
MOVO X2, X12
|
|
MOVO X5, X13
|
|
MOVO X8, X14
|
|
MOVO X11, X15
|
|
PADDL ·sseIncMask<>+0(SB), X15

	// Store counters
MOVO X9, 80(BP)
|
|
MOVO X10, 96(BP)
|
|
MOVO X11, 112(BP)
|
|
MOVO X15, 128(BP)
|
|
MOVQ $0x0000000a, R9

sealSSEIntroLoop:
MOVO X14, 64(BP)
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X3
|
|
PXOR X14, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X3
|
|
PXOR X14, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X4
|
|
PXOR X14, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X4
|
|
PXOR X14, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X5
|
|
PXOR X14, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X5
|
|
PXOR X14, X5
|
|
MOVO 64(BP), X14
|
|
MOVO X7, 64(BP)
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL16(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x0c, X7
|
|
PSRLL $0x14, X13
|
|
PXOR X7, X13
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL8(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x07, X7
|
|
PSRLL $0x19, X13
|
|
PXOR X7, X13
|
|
MOVO 64(BP), X7
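	// The BYTE sequences below hand-encode PALIGNR (66 [45] 0F 3A 0F /r imm8),
	// rotating the row registers by 4, 8 and 12 bytes to diagonalize the
	// state for the second half of the double round.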
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x0c
|
|
MOVO X14, 64(BP)
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X3
|
|
PXOR X14, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X3
|
|
PXOR X14, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X4
|
|
PXOR X14, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X4
|
|
PXOR X14, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X5
|
|
PXOR X14, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X5
|
|
PXOR X14, X5
|
|
MOVO 64(BP), X14
|
|
MOVO X7, 64(BP)
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL16(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x0c, X7
|
|
PSRLL $0x14, X13
|
|
PXOR X7, X13
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL8(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x07, X7
|
|
PSRLL $0x19, X13
|
|
PXOR X7, X13
|
|
MOVO 64(BP), X7
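	// Matching raw-byte PALIGNR run, rotating by 12, 8 and 4 to undo the
	// diagonalization.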
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x04
|
|
DECQ R9
|
|
JNE sealSSEIntroLoop

	// Add in the state
PADDD ·chacha20Constants<>+0(SB), X0
|
|
PADDD ·chacha20Constants<>+0(SB), X1
|
|
PADDD ·chacha20Constants<>+0(SB), X2
|
|
PADDD ·chacha20Constants<>+0(SB), X12
|
|
PADDD 32(BP), X3
|
|
PADDD 32(BP), X4
|
|
PADDD 32(BP), X5
|
|
PADDD 32(BP), X13
|
|
PADDD 48(BP), X7
|
|
PADDD 48(BP), X8
|
|
PADDD 48(BP), X14
|
|
PADDD 96(BP), X10
|
|
PADDD 112(BP), X11
|
|
PADDD 128(BP), X15

	// Clamp and store the key
PAND ·polyClampMask<>+0(SB), X0
|
|
MOVO X0, (BP)
|
|
MOVO X3, 16(BP)

	// Hash AAD
MOVQ ad_len+80(FP), R9
|
|
CALL polyHashADInternal<>(SB)
|
|
MOVOU (SI), X0
|
|
MOVOU 16(SI), X3
|
|
MOVOU 32(SI), X6
|
|
MOVOU 48(SI), X9
|
|
PXOR X0, X1
|
|
PXOR X3, X4
|
|
PXOR X6, X7
|
|
PXOR X9, X10
|
|
MOVOU X1, (DI)
|
|
MOVOU X4, 16(DI)
|
|
MOVOU X7, 32(DI)
|
|
MOVOU X10, 48(DI)
|
|
MOVOU 64(SI), X0
|
|
MOVOU 80(SI), X3
|
|
MOVOU 96(SI), X6
|
|
MOVOU 112(SI), X9
|
|
PXOR X0, X2
|
|
PXOR X3, X5
|
|
PXOR X6, X8
|
|
PXOR X9, X11
|
|
MOVOU X2, 64(DI)
|
|
MOVOU X5, 80(DI)
|
|
MOVOU X8, 96(DI)
|
|
MOVOU X11, 112(DI)
|
|
MOVQ $0x00000080, CX
|
|
SUBQ $0x80, BX
|
|
LEAQ 128(SI), SI
|
|
MOVO X12, X1
|
|
MOVO X13, X4
|
|
MOVO X14, X7
|
|
MOVO X15, X10
|
|
CMPQ BX, $0x40
|
|
JBE sealSSE128SealHash
|
|
MOVOU (SI), X0
|
|
MOVOU 16(SI), X3
|
|
MOVOU 32(SI), X6
|
|
MOVOU 48(SI), X9
|
|
PXOR X0, X12
|
|
PXOR X3, X13
|
|
PXOR X6, X14
|
|
PXOR X9, X15
|
|
MOVOU X12, 128(DI)
|
|
MOVOU X13, 144(DI)
|
|
MOVOU X14, 160(DI)
|
|
MOVOU X15, 176(DI)
|
|
ADDQ $0x40, CX
|
|
SUBQ $0x40, BX
|
|
LEAQ 64(SI), SI
|
|
MOVQ $0x00000002, CX
|
|
MOVQ $0x00000008, R9
|
|
CMPQ BX, $0x40
|
|
JBE sealSSETail64
|
|
CMPQ BX, $0x80
|
|
JBE sealSSETail128
|
|
CMPQ BX, $0xc0
|
|
JBE sealSSETail192

sealSSEMainLoop:
	// Load state, increment counter blocks
MOVO ·chacha20Constants<>+0(SB), X0
|
|
MOVO 32(BP), X3
|
|
MOVO 48(BP), X6
|
|
MOVO 128(BP), X9
|
|
PADDL ·sseIncMask<>+0(SB), X9
|
|
MOVO X0, X1
|
|
MOVO X3, X4
|
|
MOVO X6, X7
|
|
MOVO X9, X10
|
|
PADDL ·sseIncMask<>+0(SB), X10
|
|
MOVO X1, X2
|
|
MOVO X4, X5
|
|
MOVO X7, X8
|
|
MOVO X10, X11
|
|
PADDL ·sseIncMask<>+0(SB), X11
|
|
MOVO X2, X12
|
|
MOVO X5, X13
|
|
MOVO X8, X14
|
|
MOVO X11, X15
|
|
PADDL ·sseIncMask<>+0(SB), X15

	// Store counters
MOVO X9, 80(BP)
|
|
MOVO X10, 96(BP)
|
|
MOVO X11, 112(BP)
|
|
MOVO X15, 128(BP)

sealSSEInnerLoop:
MOVO X14, 64(BP)
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X3
|
|
PXOR X14, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X3
|
|
PXOR X14, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X4
|
|
PXOR X14, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X4
|
|
PXOR X14, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X5
|
|
PXOR X14, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X5
|
|
PXOR X14, X5
|
|
MOVO 64(BP), X14
|
|
MOVO X7, 64(BP)
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL16(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x0c, X7
|
|
PSRLL $0x14, X13
|
|
PXOR X7, X13
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL8(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x07, X7
|
|
PSRLL $0x19, X13
|
|
PXOR X7, X13
|
|
MOVO 64(BP), X7
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
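	// Hand-encoded PALIGNR diagonalization again (same 4/8/12 pattern as the
	// intro loop), interleaved with the Poly1305 absorption just above.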
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x0c
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
LEAQ 16(DI), DI
|
|
MOVO X14, 64(BP)
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X3
|
|
PXOR X14, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X14)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X3
|
|
PXOR X14, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X4
|
|
PXOR X14, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X14)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X4
|
|
PXOR X14, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x0c, X14
|
|
PSRLL $0x14, X5
|
|
PXOR X14, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X14)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X14
|
|
PSLLL $0x07, X14
|
|
PSRLL $0x19, X5
|
|
PXOR X14, X5
|
|
MOVO 64(BP), X14
|
|
MOVO X7, 64(BP)
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL16(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x0c, X7
|
|
PSRLL $0x14, X13
|
|
PXOR X7, X13
|
|
PADDD X13, X12
|
|
PXOR X12, X15
|
|
ROL8(X15, X7)
|
|
PADDD X15, X14
|
|
PXOR X14, X13
|
|
MOVO X13, X7
|
|
PSLLL $0x07, X7
|
|
PSRLL $0x19, X13
|
|
PXOR X7, X13
|
|
MOVO 64(BP), X7
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x04
|
|
DECQ R9
|
|
JGE sealSSEInnerLoop
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(DI), DI
|
|
DECQ CX
|
|
JG sealSSEInnerLoop
|
|
|
|
// Add in the state
|
|
PADDD ·chacha20Constants<>+0(SB), X0
|
|
PADDD ·chacha20Constants<>+0(SB), X1
|
|
PADDD ·chacha20Constants<>+0(SB), X2
|
|
PADDD ·chacha20Constants<>+0(SB), X12
|
|
PADDD 32(BP), X3
|
|
PADDD 32(BP), X4
|
|
PADDD 32(BP), X5
|
|
PADDD 32(BP), X13
|
|
PADDD 48(BP), X6
|
|
PADDD 48(BP), X7
|
|
PADDD 48(BP), X8
|
|
PADDD 48(BP), X14
|
|
PADDD 80(BP), X9
|
|
PADDD 96(BP), X10
|
|
PADDD 112(BP), X11
|
|
PADDD 128(BP), X15
|
|
MOVO X15, 64(BP)
|
|
|
|
// Load - XOR - store
|
|
MOVOU (SI), X15
|
|
PXOR X15, X0
|
|
MOVOU 16(SI), X15
|
|
PXOR X15, X3
|
|
MOVOU 32(SI), X15
|
|
PXOR X15, X6
|
|
MOVOU 48(SI), X15
|
|
PXOR X15, X9
|
|
MOVOU X0, (DI)
|
|
MOVOU X3, 16(DI)
|
|
MOVOU X6, 32(DI)
|
|
MOVOU X9, 48(DI)
|
|
MOVO 64(BP), X15
|
|
MOVOU 64(SI), X0
|
|
MOVOU 80(SI), X3
|
|
MOVOU 96(SI), X6
|
|
MOVOU 112(SI), X9
|
|
PXOR X0, X1
|
|
PXOR X3, X4
|
|
PXOR X6, X7
|
|
PXOR X9, X10
|
|
MOVOU X1, 64(DI)
|
|
MOVOU X4, 80(DI)
|
|
MOVOU X7, 96(DI)
|
|
MOVOU X10, 112(DI)
|
|
MOVOU 128(SI), X0
|
|
MOVOU 144(SI), X3
|
|
MOVOU 160(SI), X6
|
|
MOVOU 176(SI), X9
|
|
PXOR X0, X2
|
|
PXOR X3, X5
|
|
PXOR X6, X8
|
|
PXOR X9, X11
|
|
MOVOU X2, 128(DI)
|
|
MOVOU X5, 144(DI)
|
|
MOVOU X8, 160(DI)
|
|
MOVOU X11, 176(DI)
|
|
ADDQ $0xc0, SI
|
|
MOVQ $0x000000c0, CX
|
|
SUBQ $0xc0, BX
|
|
MOVO X12, X1
|
|
MOVO X13, X4
|
|
MOVO X14, X7
|
|
MOVO X15, X10
|
|
CMPQ BX, $0x40
|
|
JBE sealSSE128SealHash
|
|
MOVOU (SI), X0
|
|
MOVOU 16(SI), X3
|
|
MOVOU 32(SI), X6
|
|
MOVOU 48(SI), X9
|
|
PXOR X0, X12
|
|
PXOR X3, X13
|
|
PXOR X6, X14
|
|
PXOR X9, X15
|
|
MOVOU X12, 192(DI)
|
|
MOVOU X13, 208(DI)
|
|
MOVOU X14, 224(DI)
|
|
MOVOU X15, 240(DI)
|
|
LEAQ 64(SI), SI
|
|
SUBQ $0x40, BX
|
|
MOVQ $0x00000006, CX
|
|
MOVQ $0x00000004, R9
|
|
CMPQ BX, $0xc0
|
|
JG sealSSEMainLoop
|
|
MOVQ BX, CX
|
|
TESTQ BX, BX
|
|
JE sealSSE128SealHash
|
|
MOVQ $0x00000006, CX
|
|
CMPQ BX, $0x40
|
|
JBE sealSSETail64
|
|
CMPQ BX, $0x80
|
|
JBE sealSSETail128
|
|
JMP sealSSETail192
|
|
|
|
sealSSETail64:
|
|
MOVO ·chacha20Constants<>+0(SB), X1
|
|
MOVO 32(BP), X4
|
|
MOVO 48(BP), X7
|
|
MOVO 128(BP), X10
|
|
PADDL ·sseIncMask<>+0(SB), X10
|
|
MOVO X10, 80(BP)
|
|
|
|
sealSSETail64LoopA:
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(DI), DI
|
|
|
|
sealSSETail64LoopB:
|
|
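	// A ChaCha20 quarter-round: add, xor and rotate left by 16, 12, 8
	// and 7 bits in turn.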
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X13)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X13
|
|
PSLLL $0x0c, X13
|
|
PSRLL $0x14, X4
|
|
PXOR X13, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X13)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X13
|
|
PSLLL $0x07, X13
|
|
PSRLL $0x19, X4
|
|
PXOR X13, X4
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x0c
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X13)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X13
|
|
PSLLL $0x0c, X13
|
|
PSRLL $0x14, X4
|
|
PXOR X13, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X13)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X13
|
|
PSLLL $0x07, X13
|
|
PSRLL $0x19, X4
|
|
PXOR X13, X4
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x04
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(DI), DI
|
|
DECQ CX
|
|
JG sealSSETail64LoopA
|
|
DECQ R9
|
|
JGE sealSSETail64LoopB
|
|
PADDL ·chacha20Constants<>+0(SB), X1
|
|
PADDL 32(BP), X4
|
|
PADDL 48(BP), X7
|
|
PADDL 80(BP), X10
|
|
JMP sealSSE128Seal
|
|
|
|
sealSSETail128:
|
|
MOVO ·chacha20Constants<>+0(SB), X0
|
|
MOVO 32(BP), X3
|
|
MOVO 48(BP), X6
|
|
MOVO 128(BP), X9
|
|
PADDL ·sseIncMask<>+0(SB), X9
|
|
MOVO X9, 80(BP)
|
|
MOVO X0, X1
|
|
MOVO X3, X4
|
|
MOVO X6, X7
|
|
MOVO X9, X10
|
|
PADDL ·sseIncMask<>+0(SB), X10
|
|
MOVO X10, 96(BP)
|
|
|
|
sealSSETail128LoopA:
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(DI), DI
|
|
|
|
sealSSETail128LoopB:
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x0c
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(DI), DI
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x04
|
|
DECQ CX
|
|
JG sealSSETail128LoopA
|
|
DECQ R9
|
|
JGE sealSSETail128LoopB
|
|
PADDL ·chacha20Constants<>+0(SB), X0
|
|
PADDL ·chacha20Constants<>+0(SB), X1
|
|
PADDL 32(BP), X3
|
|
PADDL 32(BP), X4
|
|
PADDL 48(BP), X6
|
|
PADDL 48(BP), X7
|
|
PADDL 80(BP), X9
|
|
PADDL 96(BP), X10
|
|
MOVOU (SI), X12
|
|
MOVOU 16(SI), X13
|
|
MOVOU 32(SI), X14
|
|
MOVOU 48(SI), X15
|
|
PXOR X12, X0
|
|
PXOR X13, X3
|
|
PXOR X14, X6
|
|
PXOR X15, X9
|
|
MOVOU X0, (DI)
|
|
MOVOU X3, 16(DI)
|
|
MOVOU X6, 32(DI)
|
|
MOVOU X9, 48(DI)
|
|
MOVQ $0x00000040, CX
|
|
LEAQ 64(SI), SI
|
|
SUBQ $0x40, BX
|
|
JMP sealSSE128SealHash
|
|
|
|
sealSSETail192:
|
|
MOVO ·chacha20Constants<>+0(SB), X0
|
|
MOVO 32(BP), X3
|
|
MOVO 48(BP), X6
|
|
MOVO 128(BP), X9
|
|
PADDL ·sseIncMask<>+0(SB), X9
|
|
MOVO X9, 80(BP)
|
|
MOVO X0, X1
|
|
MOVO X3, X4
|
|
MOVO X6, X7
|
|
MOVO X9, X10
|
|
PADDL ·sseIncMask<>+0(SB), X10
|
|
MOVO X10, 96(BP)
|
|
MOVO X1, X2
|
|
MOVO X4, X5
|
|
MOVO X7, X8
|
|
MOVO X10, X11
|
|
PADDL ·sseIncMask<>+0(SB), X11
|
|
MOVO X11, 112(BP)
|
|
|
|
sealSSETail192LoopA:
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(DI), DI
|
|
|
|
sealSSETail192LoopB:
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X5
|
|
PXOR X12, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X5
|
|
PXOR X12, X5
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 16(DI), DI
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X5
|
|
PXOR X12, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X5
|
|
PXOR X12, X5
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
DECQ CX
|
|
JG sealSSETail192LoopA
|
|
DECQ R9
|
|
JGE sealSSETail192LoopB
|
|
PADDL ·chacha20Constants<>+0(SB), X0
|
|
PADDL ·chacha20Constants<>+0(SB), X1
|
|
PADDL ·chacha20Constants<>+0(SB), X2
|
|
PADDL 32(BP), X3
|
|
PADDL 32(BP), X4
|
|
PADDL 32(BP), X5
|
|
PADDL 48(BP), X6
|
|
PADDL 48(BP), X7
|
|
PADDL 48(BP), X8
|
|
PADDL 80(BP), X9
|
|
PADDL 96(BP), X10
|
|
PADDL 112(BP), X11
|
|
MOVOU (SI), X12
|
|
MOVOU 16(SI), X13
|
|
MOVOU 32(SI), X14
|
|
MOVOU 48(SI), X15
|
|
PXOR X12, X0
|
|
PXOR X13, X3
|
|
PXOR X14, X6
|
|
PXOR X15, X9
|
|
MOVOU X0, (DI)
|
|
MOVOU X3, 16(DI)
|
|
MOVOU X6, 32(DI)
|
|
MOVOU X9, 48(DI)
|
|
MOVOU 64(SI), X12
|
|
MOVOU 80(SI), X13
|
|
MOVOU 96(SI), X14
|
|
MOVOU 112(SI), X15
|
|
PXOR X12, X1
|
|
PXOR X13, X4
|
|
PXOR X14, X7
|
|
PXOR X15, X10
|
|
MOVOU X1, 64(DI)
|
|
MOVOU X4, 80(DI)
|
|
MOVOU X7, 96(DI)
|
|
MOVOU X10, 112(DI)
|
|
MOVO X2, X1
|
|
MOVO X5, X4
|
|
MOVO X8, X7
|
|
MOVO X11, X10
|
|
MOVQ $0x00000080, CX
|
|
LEAQ 128(SI), SI
|
|
SUBQ $0x80, BX
|
|
JMP sealSSE128SealHash
|
|
|
|
sealSSE128:
|
|
MOVOU ·chacha20Constants<>+0(SB), X0
|
|
MOVOU 16(R8), X3
|
|
MOVOU 32(R8), X6
|
|
MOVOU 48(R8), X9
|
|
MOVO X0, X1
|
|
MOVO X3, X4
|
|
MOVO X6, X7
|
|
MOVO X9, X10
|
|
PADDL ·sseIncMask<>+0(SB), X10
|
|
MOVO X1, X2
|
|
MOVO X4, X5
|
|
MOVO X7, X8
|
|
MOVO X10, X11
|
|
PADDL ·sseIncMask<>+0(SB), X11
|
|
MOVO X3, X13
|
|
MOVO X6, X14
|
|
MOVO X10, X15
|
|
MOVQ $0x0000000a, R9
|
|
|
|
sealSSE128InnerCipherLoop:
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X5
|
|
PXOR X12, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X5
|
|
PXOR X12, X5
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL16(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X3
|
|
PXOR X12, X3
|
|
PADDD X3, X0
|
|
PXOR X0, X9
|
|
ROL8(X9, X12)
|
|
PADDD X9, X6
|
|
PXOR X6, X3
|
|
MOVO X3, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X3
|
|
PXOR X12, X3
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL16(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X4
|
|
PXOR X12, X4
|
|
PADDD X4, X1
|
|
PXOR X1, X10
|
|
ROL8(X10, X12)
|
|
PADDD X10, X7
|
|
PXOR X7, X4
|
|
MOVO X4, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X4
|
|
PXOR X12, X4
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL16(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x0c, X12
|
|
PSRLL $0x14, X5
|
|
PXOR X12, X5
|
|
PADDD X5, X2
|
|
PXOR X2, X11
|
|
ROL8(X11, X12)
|
|
PADDD X11, X8
|
|
PXOR X8, X5
|
|
MOVO X5, X12
|
|
PSLLL $0x07, X12
|
|
PSRLL $0x19, X5
|
|
PXOR X12, X5
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xe4
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xed
|
|
BYTE $0x0c
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xf6
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xff
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc0
|
|
BYTE $0x08
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xc9
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xd2
|
|
BYTE $0x04
|
|
BYTE $0x66
|
|
BYTE $0x45
|
|
BYTE $0x0f
|
|
BYTE $0x3a
|
|
BYTE $0x0f
|
|
BYTE $0xdb
|
|
BYTE $0x04
|
|
DECQ R9
|
|
JNE sealSSE128InnerCipherLoop
|
|
|
|
// X0|X3 hold the Poly1305 32-byte key, the rest of the state can be discarded
|
|
PADDL ·chacha20Constants<>+0(SB), X0
|
|
PADDL ·chacha20Constants<>+0(SB), X1
|
|
PADDL ·chacha20Constants<>+0(SB), X2
|
|
PADDL X13, X3
|
|
PADDL X13, X4
|
|
PADDL X13, X5
|
|
PADDL X14, X7
|
|
PADDL X14, X8
|
|
PADDL X15, X10
|
|
PADDL ·sseIncMask<>+0(SB), X15
|
|
PADDL X15, X11
|
|
PAND ·polyClampMask<>+0(SB), X0
|
|
MOVOU X0, (BP)
|
|
MOVOU X3, 16(BP)
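	// ·polyClampMask<> clears the bits of r that Poly1305 requires to be
	// zero (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff); s is stored as-is.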
|
|
|
|
// Hash
|
|
MOVQ ad_len+80(FP), R9
|
|
CALL polyHashADInternal<>(SB)
|
|
XORQ CX, CX
|
|
|
|
sealSSE128SealHash:
|
|
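	// CX holds the number of bytes encrypted but not yet hashed; fold
	// them into the Poly1305 state in 16-byte blocks.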
CMPQ CX, $0x10
|
|
JB sealSSE128Seal
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
SUBQ $0x10, CX
|
|
ADDQ $0x10, DI
|
|
JMP sealSSE128SealHash
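	// Each 16-byte block above is absorbed into the accumulator as
	// h = (h + block + 2^128) * r mod (2^130 - 5); the SHRQ $0x02 /
	// ANDQ $-4 sequence folds the bits above 2^130 back in, using
	// 2^130 = 5 (mod p).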
|
|
|
|
sealSSE128Seal:
|
|
CMPQ BX, $0x10
|
|
JB sealSSETail
|
|
SUBQ $0x10, BX
|
|
|
|
// Load for encryption
|
|
MOVOU (SI), X12
|
|
PXOR X12, X1
|
|
MOVOU X1, (DI)
|
|
LEAQ 16(SI), SI
|
|
LEAQ 16(DI), DI
|
|
|
|
// Extract for hashing
|
|
MOVQ X1, R13
|
|
PSRLDQ $0x08, X1
|
|
MOVQ X1, R14
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
|
|
// Shift the stream "left"
|
|
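	// Rotate the remaining key-stream registers down so the next block
	// to use is always in X1.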
MOVO X4, X1
|
|
MOVO X7, X4
|
|
MOVO X10, X7
|
|
MOVO X2, X10
|
|
MOVO X5, X2
|
|
MOVO X8, X5
|
|
MOVO X11, X8
|
|
JMP sealSSE128Seal
|
|
|
|
sealSSETail:
|
|
TESTQ BX, BX
|
|
JE sealSSEFinalize
|
|
|
|
// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
|
|
MOVQ BX, R9
|
|
SHLQ $0x04, R9
|
|
LEAQ ·andMask<>+0(SB), R13
|
|
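	// ·andMask<> is a table of 16-byte masks; entry BX-1 keeps exactly
	// the low BX bytes, so only the real ciphertext bytes are hashed.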
MOVQ BX, CX
|
|
LEAQ -1(SI)(BX*1), SI
|
|
XORQ R15, R15
|
|
XORQ R8, R8
|
|
XORQ AX, AX
|
|
|
|
sealSSETailLoadLoop:
|
|
SHLQ $0x08, R15, R8
|
|
SHLQ $0x08, R15
|
|
MOVB (SI), AX
|
|
XORQ AX, R15
|
|
LEAQ -1(SI), SI
|
|
DECQ CX
|
|
JNE sealSSETailLoadLoop
|
|
MOVQ R15, 64(BP)
|
|
MOVQ R8, 72(BP)
|
|
PXOR 64(BP), X1
|
|
MOVOU X1, (DI)
|
|
MOVOU -16(R13)(R9*1), X12
|
|
PAND X12, X1
|
|
MOVQ X1, R13
|
|
PSRLDQ $0x08, X1
|
|
MOVQ X1, R14
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ BX, DI
|
|
|
|
sealSSEFinalize:
|
|
// Hash in the buffer lengths
|
|
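	// Per RFC 8439, the final Poly1305 block is len(AD) || len(ciphertext)
	// as two little-endian uint64s (the ADCQ $0x01 below supplies the
	// usual high padding bit).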
ADDQ ad_len+80(FP), R10
|
|
ADCQ src_len+56(FP), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), AX
|
|
MOVQ AX, R15
|
|
MULQ R10
|
|
MOVQ AX, R13
|
|
MOVQ DX, R14
|
|
MOVQ (BP), AX
|
|
MULQ R11
|
|
IMULQ R12, R15
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), AX
|
|
MOVQ AX, R8
|
|
MULQ R10
|
|
ADDQ AX, R14
|
|
ADCQ $0x00, DX
|
|
MOVQ DX, R10
|
|
MOVQ 8(BP), AX
|
|
MULQ R11
|
|
ADDQ AX, R15
|
|
ADCQ $0x00, DX
|
|
IMULQ R12, R8
|
|
ADDQ R10, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
|
|
// Final reduce: compute h - (2^130 - 5) and keep the original h if the subtraction borrows
|
|
MOVQ R10, R13
|
|
MOVQ R11, R14
|
|
MOVQ R12, R15
|
|
SUBQ $-5, R10
|
|
SBBQ $-1, R11
|
|
SBBQ $0x03, R12
|
|
CMOVQCS R13, R10
|
|
CMOVQCS R14, R11
|
|
CMOVQCS R15, R12
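	// Equivalently, in Go (a minimal sketch; h0..h2 and the math/bits
	// calls are illustrative, not part of this file):
	//
	//	t0, b := bits.Sub64(h0, 0xfffffffffffffffb, 0) // h - (2^130 - 5)
	//	t1, b := bits.Sub64(h1, 0xffffffffffffffff, b)
	//	t2, b := bits.Sub64(h2, 3, b)
	//	if b == 0 { // no borrow, so h >= p
	//		h0, h1, h2 = t0, t1, t2
	//	}
	//
	// The CMOVQCS instructions above make the same selection without a branch.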
|
|
|
|
// Add in the "s" part of the key
|
|
ADDQ 16(BP), R10
|
|
ADCQ 24(BP), R11
|
|
|
|
// Finally store the tag at the end of the message
|
|
MOVQ R10, (DI)
|
|
MOVQ R11, 8(DI)
|
|
RET
|
|
|
|
chacha20Poly1305Seal_AVX2:
|
|
VZEROUPPER
|
|
VMOVDQU ·chacha20Constants<>+0(SB), Y0
|
|
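	// The BYTE sequences below hand-encode
	//	VBROADCASTI128 16(R8), Y14
	//	VBROADCASTI128 32(R8), Y12
	//	VBROADCASTI128 48(R8), Y4
	// loading the key and counter/nonce rows of the ChaCha state.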
BYTE $0xc4
|
|
BYTE $0x42
|
|
BYTE $0x7d
|
|
BYTE $0x5a
|
|
BYTE $0x70
|
|
BYTE $0x10
|
|
BYTE $0xc4
|
|
BYTE $0x42
|
|
BYTE $0x7d
|
|
BYTE $0x5a
|
|
BYTE $0x60
|
|
BYTE $0x20
|
|
BYTE $0xc4
|
|
BYTE $0xc2
|
|
BYTE $0x7d
|
|
BYTE $0x5a
|
|
BYTE $0x60
|
|
BYTE $0x30
|
|
VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
|
|
|
|
// Special optimizations for very short buffers
|
|
CMPQ BX, $0x000000c0
|
|
JBE seal192AVX2
|
|
CMPQ BX, $0x00000140
|
|
JBE seal320AVX2
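	// seal192AVX2 handles plaintexts of up to 192 bytes, seal320AVX2 up
	// to 320 bytes; longer inputs take the general path below.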
|
|
|
|
// For the general path, prepare the Poly1305 key first - as a byproduct we get 64 bytes of cipher stream
|
|
VMOVDQA Y0, Y5
|
|
VMOVDQA Y0, Y6
|
|
VMOVDQA Y0, Y7
|
|
VMOVDQA Y14, Y9
|
|
VMOVDQA Y14, Y10
|
|
VMOVDQA Y14, Y11
|
|
VMOVDQA Y14, 32(BP)
|
|
VMOVDQA Y12, Y13
|
|
VMOVDQA Y12, Y8
|
|
VMOVDQA Y12, Y15
|
|
VMOVDQA Y12, 64(BP)
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
|
|
VMOVDQA Y4, 96(BP)
|
|
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
|
|
VMOVDQA Y1, 128(BP)
|
|
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
|
|
VMOVDQA Y2, 160(BP)
|
|
VMOVDQA Y3, 192(BP)
|
|
MOVQ $0x0000000a, R9
|
|
|
|
sealAVX2IntroLoop:
|
|
VMOVDQA Y15, 224(BP)
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y15
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y15
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y15
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y15
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x0c, Y10, Y15
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x07, Y10, Y15
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VMOVDQA 224(BP), Y15
|
|
VMOVDQA Y13, 224(BP)
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol16<>+0(SB), Y3, Y3
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y15, Y11, Y11
|
|
VPSLLD $0x0c, Y11, Y13
|
|
VPSRLD $0x14, Y11, Y11
|
|
VPXOR Y13, Y11, Y11
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol8<>+0(SB), Y3, Y3
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y15, Y11, Y11
|
|
VPSLLD $0x07, Y11, Y13
|
|
VPSRLD $0x19, Y11, Y11
|
|
VPXOR Y13, Y11, Y11
|
|
VMOVDQA 224(BP), Y13
|
|
VPALIGNR $0x04, Y14, Y14, Y14
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x0c, Y4, Y4, Y4
|
|
VPALIGNR $0x04, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x0c, Y1, Y1, Y1
|
|
VPALIGNR $0x04, Y10, Y10, Y10
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x0c, Y2, Y2, Y2
|
|
VPALIGNR $0x04, Y11, Y11, Y11
|
|
VPALIGNR $0x08, Y15, Y15, Y15
|
|
VPALIGNR $0x0c, Y3, Y3, Y3
|
|
VMOVDQA Y15, 224(BP)
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y15
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y15
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y15
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y15
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x0c, Y10, Y15
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x07, Y10, Y15
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VMOVDQA 224(BP), Y15
|
|
VMOVDQA Y13, 224(BP)
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol16<>+0(SB), Y3, Y3
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y15, Y11, Y11
|
|
VPSLLD $0x0c, Y11, Y13
|
|
VPSRLD $0x14, Y11, Y11
|
|
VPXOR Y13, Y11, Y11
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol8<>+0(SB), Y3, Y3
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y15, Y11, Y11
|
|
VPSLLD $0x07, Y11, Y13
|
|
VPSRLD $0x19, Y11, Y11
|
|
VPXOR Y13, Y11, Y11
|
|
VMOVDQA 224(BP), Y13
|
|
VPALIGNR $0x0c, Y14, Y14, Y14
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x04, Y4, Y4, Y4
|
|
VPALIGNR $0x0c, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x04, Y1, Y1, Y1
|
|
VPALIGNR $0x0c, Y10, Y10, Y10
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x04, Y2, Y2, Y2
|
|
VPALIGNR $0x0c, Y11, Y11, Y11
|
|
VPALIGNR $0x08, Y15, Y15, Y15
|
|
VPALIGNR $0x04, Y3, Y3, Y3
|
|
DECQ R9
|
|
JNE sealAVX2IntroLoop
|
|
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
|
|
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
|
|
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
|
|
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
|
|
VPADDD 32(BP), Y14, Y14
|
|
VPADDD 32(BP), Y9, Y9
|
|
VPADDD 32(BP), Y10, Y10
|
|
VPADDD 32(BP), Y11, Y11
|
|
VPADDD 64(BP), Y12, Y12
|
|
VPADDD 64(BP), Y13, Y13
|
|
VPADDD 64(BP), Y8, Y8
|
|
VPADDD 64(BP), Y15, Y15
|
|
VPADDD 96(BP), Y4, Y4
|
|
VPADDD 128(BP), Y1, Y1
|
|
VPADDD 160(BP), Y2, Y2
|
|
VPADDD 192(BP), Y3, Y3
|
|
VPERM2I128 $0x13, Y12, Y4, Y12
|
|
VPERM2I128 $0x02, Y0, Y14, Y4
|
|
VPERM2I128 $0x13, Y0, Y14, Y0
|
|
|
|
// Clamp and store poly key
|
|
VPAND ·polyClampMask<>+0(SB), Y4, Y4
|
|
VMOVDQA Y4, (BP)
|
|
|
|
// Hash AD
|
|
MOVQ ad_len+80(FP), R9
|
|
CALL polyHashADInternal<>(SB)
|
|
|
|
// We can store at least 320 bytes
|
|
VPXOR (SI), Y0, Y0
|
|
VPXOR 32(SI), Y12, Y12
|
|
VMOVDQU Y0, (DI)
|
|
VMOVDQU Y12, 32(DI)
|
|
VPERM2I128 $0x02, Y5, Y9, Y0
|
|
VPERM2I128 $0x02, Y13, Y1, Y14
|
|
VPERM2I128 $0x13, Y5, Y9, Y12
|
|
VPERM2I128 $0x13, Y13, Y1, Y4
|
|
VPXOR 64(SI), Y0, Y0
|
|
VPXOR 96(SI), Y14, Y14
|
|
VPXOR 128(SI), Y12, Y12
|
|
VPXOR 160(SI), Y4, Y4
|
|
VMOVDQU Y0, 64(DI)
|
|
VMOVDQU Y14, 96(DI)
|
|
VMOVDQU Y12, 128(DI)
|
|
VMOVDQU Y4, 160(DI)
|
|
VPERM2I128 $0x02, Y6, Y10, Y0
|
|
VPERM2I128 $0x02, Y8, Y2, Y14
|
|
VPERM2I128 $0x13, Y6, Y10, Y12
|
|
VPERM2I128 $0x13, Y8, Y2, Y4
|
|
VPXOR 192(SI), Y0, Y0
|
|
VPXOR 224(SI), Y14, Y14
|
|
VPXOR 256(SI), Y12, Y12
|
|
VPXOR 288(SI), Y4, Y4
|
|
VMOVDQU Y0, 192(DI)
|
|
VMOVDQU Y14, 224(DI)
|
|
VMOVDQU Y12, 256(DI)
|
|
VMOVDQU Y4, 288(DI)
|
|
MOVQ $0x00000140, CX
|
|
SUBQ $0x00000140, BX
|
|
LEAQ 320(SI), SI
|
|
VPERM2I128 $0x02, Y7, Y11, Y0
|
|
VPERM2I128 $0x02, Y15, Y3, Y14
|
|
VPERM2I128 $0x13, Y7, Y11, Y12
|
|
VPERM2I128 $0x13, Y15, Y3, Y4
|
|
CMPQ BX, $0x80
|
|
JBE sealAVX2SealHash
|
|
VPXOR (SI), Y0, Y0
|
|
VPXOR 32(SI), Y14, Y14
|
|
VPXOR 64(SI), Y12, Y12
|
|
VPXOR 96(SI), Y4, Y4
|
|
VMOVDQU Y0, 320(DI)
|
|
VMOVDQU Y14, 352(DI)
|
|
VMOVDQU Y12, 384(DI)
|
|
VMOVDQU Y4, 416(DI)
|
|
SUBQ $0x80, BX
|
|
LEAQ 128(SI), SI
|
|
MOVQ $0x00000008, CX
|
|
MOVQ $0x00000002, R9
|
|
CMPQ BX, $0x80
|
|
JBE sealAVX2Tail128
|
|
CMPQ BX, $0x00000100
|
|
JBE sealAVX2Tail256
|
|
CMPQ BX, $0x00000180
|
|
JBE sealAVX2Tail384
|
|
CMPQ BX, $0x00000200
|
|
JBE sealAVX2Tail512
|
|
|
|
// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
|
|
VMOVDQA ·chacha20Constants<>+0(SB), Y0
|
|
VMOVDQA Y0, Y5
|
|
VMOVDQA Y0, Y6
|
|
VMOVDQA Y0, Y7
|
|
VMOVDQA 32(BP), Y14
|
|
VMOVDQA Y14, Y9
|
|
VMOVDQA Y14, Y10
|
|
VMOVDQA Y14, Y11
|
|
VMOVDQA 64(BP), Y12
|
|
VMOVDQA Y12, Y13
|
|
VMOVDQA Y12, Y8
|
|
VMOVDQA Y12, Y15
|
|
VMOVDQA 192(BP), Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
|
|
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
|
|
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
|
|
VMOVDQA Y4, 96(BP)
|
|
VMOVDQA Y1, 128(BP)
|
|
VMOVDQA Y2, 160(BP)
|
|
VMOVDQA Y3, 192(BP)
|
|
VMOVDQA Y15, 224(BP)
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y15
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y15
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y15
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y15
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x0c, Y10, Y15
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x07, Y10, Y15
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VMOVDQA 224(BP), Y15
|
|
VMOVDQA Y13, 224(BP)
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol16<>+0(SB), Y3, Y3
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y15, Y11, Y11
|
|
VPSLLD $0x0c, Y11, Y13
|
|
VPSRLD $0x14, Y11, Y11
|
|
VPXOR Y13, Y11, Y11
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol8<>+0(SB), Y3, Y3
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y15, Y11, Y11
|
|
VPSLLD $0x07, Y11, Y13
|
|
VPSRLD $0x19, Y11, Y11
|
|
VPXOR Y13, Y11, Y11
|
|
VMOVDQA 224(BP), Y13
|
|
VPALIGNR $0x04, Y14, Y14, Y14
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x0c, Y4, Y4, Y4
|
|
VPALIGNR $0x04, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x0c, Y1, Y1, Y1
|
|
VPALIGNR $0x04, Y10, Y10, Y10
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x0c, Y2, Y2, Y2
|
|
VPALIGNR $0x04, Y11, Y11, Y11
|
|
VPALIGNR $0x08, Y15, Y15, Y15
|
|
VPALIGNR $0x0c, Y3, Y3, Y3
|
|
VMOVDQA Y15, 224(BP)
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y15
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y15
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y15
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y15
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x0c, Y10, Y15
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPADDD Y10, Y6, Y6
|
|
VPXOR Y6, Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPADDD Y2, Y8, Y8
|
|
VPXOR Y8, Y10, Y10
|
|
VPSLLD $0x07, Y10, Y15
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VMOVDQA 224(BP), Y15
|
|
VMOVDQA Y13, 224(BP)
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol16<>+0(SB), Y3, Y3
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y15, Y11, Y11
|
|
VPSLLD $0x0c, Y11, Y13
|
|
VPSRLD $0x14, Y11, Y11
|
|
VPXOR Y13, Y11, Y11
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol8<>+0(SB), Y3, Y3
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y15, Y11, Y11
|
|
VPSLLD $0x07, Y11, Y13
|
|
VPSRLD $0x19, Y11, Y11
|
|
VPXOR Y13, Y11, Y11
|
|
VMOVDQA 224(BP), Y13
|
|
VPALIGNR $0x0c, Y14, Y14, Y14
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x04, Y4, Y4, Y4
|
|
VPALIGNR $0x0c, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x04, Y1, Y1, Y1
|
|
VPALIGNR $0x0c, Y10, Y10, Y10
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x04, Y2, Y2, Y2
|
|
VPALIGNR $0x0c, Y11, Y11, Y11
|
|
VPALIGNR $0x08, Y15, Y15, Y15
|
|
VPALIGNR $0x04, Y3, Y3, Y3
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y3, Y3
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x0c, Y14, Y15
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x0c, Y9, Y15
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x0c, Y10, Y15
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x0c, Y11, Y15
|
|
VPSRLD $0x14, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
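	// DI is moved back 16 bytes so that the first hash inside the loop,
	// which reads at 16(DI), picks up the data at the original offset.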
SUBQ $0x10, DI
|
|
MOVQ $0x00000009, CX
|
|
JMP sealAVX2InternalLoopStart
|
|
|
|
sealAVX2MainLoop:
|
|
VMOVDQU ·chacha20Constants<>+0(SB), Y0
|
|
VMOVDQA Y0, Y5
|
|
VMOVDQA Y0, Y6
|
|
VMOVDQA Y0, Y7
|
|
VMOVDQA 32(BP), Y14
|
|
VMOVDQA Y14, Y9
|
|
VMOVDQA Y14, Y10
|
|
VMOVDQA Y14, Y11
|
|
VMOVDQA 64(BP), Y12
|
|
VMOVDQA Y12, Y13
|
|
VMOVDQA Y12, Y8
|
|
VMOVDQA Y12, Y15
|
|
VMOVDQA 192(BP), Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
|
|
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
|
|
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
|
|
VMOVDQA Y4, 96(BP)
|
|
VMOVDQA Y1, 128(BP)
|
|
VMOVDQA Y2, 160(BP)
|
|
VMOVDQA Y3, 192(BP)
|
|
MOVQ $0x0000000a, CX
|
|
|
|
sealAVX2InternalLoop:
|
|
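	// The ChaCha20 rounds are interleaved with the Poly1305 multiply
	// (MULX-based here) so the SIMD and scalar units run in parallel.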
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y3, Y3
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x0c, Y14, Y15
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x0c, Y9, Y15
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x0c, Y10, Y15
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x0c, Y11, Y15
|
|
VPSRLD $0x14, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
|
|
sealAVX2InternalLoopStart:
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y3, Y3
|
|
ADDQ 16(DI), R10
|
|
ADCQ 24(DI), R11
|
|
ADCQ $0x01, R12
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x07, Y14, Y15
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x07, Y9, Y15
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x07, Y10, Y15
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x07, Y11, Y15
|
|
VPSRLD $0x19, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
VPALIGNR $0x04, Y14, Y14, Y14
|
|
VPALIGNR $0x04, Y9, Y9, Y9
|
|
VPALIGNR $0x04, Y10, Y10, Y10
|
|
VPALIGNR $0x04, Y11, Y11, Y11
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x08, Y15, Y15, Y15
|
|
VPALIGNR $0x0c, Y4, Y4, Y4
|
|
VPALIGNR $0x0c, Y1, Y1, Y1
|
|
VPALIGNR $0x0c, Y2, Y2, Y2
|
|
VPALIGNR $0x0c, Y3, Y3, Y3
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol16<>+0(SB), Y3, Y3
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
ADDQ 32(DI), R10
|
|
ADCQ 40(DI), R11
|
|
ADCQ $0x01, R12
|
|
LEAQ 48(DI), DI
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x0c, Y14, Y15
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x0c, Y9, Y15
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x0c, Y10, Y15
|
|
VPSRLD $0x14, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x0c, Y11, Y15
|
|
VPSRLD $0x14, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
VPADDD Y14, Y0, Y0
|
|
VPADDD Y9, Y5, Y5
|
|
VPADDD Y10, Y6, Y6
|
|
VPADDD Y11, Y7, Y7
|
|
VPXOR Y0, Y4, Y4
|
|
VPXOR Y5, Y1, Y1
|
|
VPXOR Y6, Y2, Y2
|
|
VPXOR Y7, Y3, Y3
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y2, Y2
|
|
VPSHUFB ·rol8<>+0(SB), Y3, Y3
|
|
VPADDD Y4, Y12, Y12
|
|
VPADDD Y1, Y13, Y13
|
|
VPADDD Y2, Y8, Y8
|
|
VPADDD Y3, Y15, Y15
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
VPXOR Y12, Y14, Y14
|
|
VPXOR Y13, Y9, Y9
|
|
VPXOR Y8, Y10, Y10
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA Y15, 224(BP)
|
|
VPSLLD $0x07, Y14, Y15
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y15, Y14, Y14
|
|
VPSLLD $0x07, Y9, Y15
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y15, Y9, Y9
|
|
VPSLLD $0x07, Y10, Y15
|
|
VPSRLD $0x19, Y10, Y10
|
|
VPXOR Y15, Y10, Y10
|
|
VPSLLD $0x07, Y11, Y15
|
|
VPSRLD $0x19, Y11, Y11
|
|
VPXOR Y15, Y11, Y11
|
|
VMOVDQA 224(BP), Y15
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
VPALIGNR $0x0c, Y14, Y14, Y14
|
|
VPALIGNR $0x0c, Y9, Y9, Y9
|
|
VPALIGNR $0x0c, Y10, Y10, Y10
|
|
VPALIGNR $0x0c, Y11, Y11, Y11
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x08, Y8, Y8, Y8
|
|
VPALIGNR $0x08, Y15, Y15, Y15
|
|
VPALIGNR $0x04, Y4, Y4, Y4
|
|
VPALIGNR $0x04, Y1, Y1, Y1
|
|
VPALIGNR $0x04, Y2, Y2, Y2
|
|
VPALIGNR $0x04, Y3, Y3, Y3
|
|
DECQ CX
|
|
JNE sealAVX2InternalLoop
|
|
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
|
|
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
|
|
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
|
|
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
|
|
VPADDD 32(BP), Y14, Y14
|
|
VPADDD 32(BP), Y9, Y9
|
|
VPADDD 32(BP), Y10, Y10
|
|
VPADDD 32(BP), Y11, Y11
|
|
VPADDD 64(BP), Y12, Y12
|
|
VPADDD 64(BP), Y13, Y13
|
|
VPADDD 64(BP), Y8, Y8
|
|
VPADDD 64(BP), Y15, Y15
|
|
VPADDD 96(BP), Y4, Y4
|
|
VPADDD 128(BP), Y1, Y1
|
|
VPADDD 160(BP), Y2, Y2
|
|
VPADDD 192(BP), Y3, Y3
|
|
VMOVDQA Y15, 224(BP)
|
|
|
|
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 32(DI), DI
|
|
VPERM2I128 $0x02, Y0, Y14, Y15
|
|
VPERM2I128 $0x13, Y0, Y14, Y14
|
|
VPERM2I128 $0x02, Y12, Y4, Y0
|
|
VPERM2I128 $0x13, Y12, Y4, Y12
|
|
VPXOR (SI), Y15, Y15
|
|
VPXOR 32(SI), Y0, Y0
|
|
VPXOR 64(SI), Y14, Y14
|
|
VPXOR 96(SI), Y12, Y12
|
|
VMOVDQU Y15, (DI)
|
|
VMOVDQU Y0, 32(DI)
|
|
VMOVDQU Y14, 64(DI)
|
|
VMOVDQU Y12, 96(DI)
|
|
VPERM2I128 $0x02, Y5, Y9, Y0
|
|
VPERM2I128 $0x02, Y13, Y1, Y14
|
|
VPERM2I128 $0x13, Y5, Y9, Y12
|
|
VPERM2I128 $0x13, Y13, Y1, Y4
|
|
VPXOR 128(SI), Y0, Y0
|
|
VPXOR 160(SI), Y14, Y14
|
|
VPXOR 192(SI), Y12, Y12
|
|
VPXOR 224(SI), Y4, Y4
|
|
VMOVDQU Y0, 128(DI)
|
|
VMOVDQU Y14, 160(DI)
|
|
VMOVDQU Y12, 192(DI)
|
|
VMOVDQU Y4, 224(DI)
|
|
|
|
// and here
|
|
ADDQ -16(DI), R10
|
|
ADCQ -8(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
VPERM2I128 $0x02, Y6, Y10, Y0
|
|
VPERM2I128 $0x02, Y8, Y2, Y14
|
|
VPERM2I128 $0x13, Y6, Y10, Y12
|
|
VPERM2I128 $0x13, Y8, Y2, Y4
|
|
VPXOR 256(SI), Y0, Y0
|
|
VPXOR 288(SI), Y14, Y14
|
|
VPXOR 320(SI), Y12, Y12
|
|
VPXOR 352(SI), Y4, Y4
|
|
VMOVDQU Y0, 256(DI)
|
|
VMOVDQU Y14, 288(DI)
|
|
VMOVDQU Y12, 320(DI)
|
|
VMOVDQU Y4, 352(DI)
|
|
VPERM2I128 $0x02, Y7, Y11, Y0
|
|
VPERM2I128 $0x02, 224(BP), Y3, Y14
|
|
VPERM2I128 $0x13, Y7, Y11, Y12
|
|
VPERM2I128 $0x13, 224(BP), Y3, Y4
|
|
VPXOR 384(SI), Y0, Y0
|
|
VPXOR 416(SI), Y14, Y14
|
|
VPXOR 448(SI), Y12, Y12
|
|
VPXOR 480(SI), Y4, Y4
|
|
VMOVDQU Y0, 384(DI)
|
|
VMOVDQU Y14, 416(DI)
|
|
VMOVDQU Y12, 448(DI)
|
|
VMOVDQU Y4, 480(DI)
|
|
LEAQ 512(SI), SI
|
|
SUBQ $0x00000200, BX
|
|
CMPQ BX, $0x00000200
|
|
JG sealAVX2MainLoop
|
|
|
|
// Tail can only hash 480 bytes
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ 16(DI), R10
|
|
ADCQ 24(DI), R11
|
|
ADCQ $0x01, R12
|
|
MOVQ (BP), DX
|
|
MOVQ DX, R15
|
|
MULXQ R10, R13, R14
|
|
IMULQ R12, R15
|
|
MULXQ R11, AX, DX
|
|
ADDQ AX, R14
|
|
ADCQ DX, R15
|
|
MOVQ 8(BP), DX
|
|
MULXQ R10, R10, AX
|
|
ADDQ R10, R14
|
|
MULXQ R11, R11, R8
|
|
ADCQ R11, R15
|
|
ADCQ $0x00, R8
|
|
IMULQ R12, DX
|
|
ADDQ AX, R15
|
|
ADCQ DX, R8
|
|
MOVQ R13, R10
|
|
MOVQ R14, R11
|
|
MOVQ R15, R12
|
|
ANDQ $0x03, R12
|
|
MOVQ R15, R13
|
|
ANDQ $-4, R13
|
|
MOVQ R8, R14
|
|
SHRQ $0x02, R8, R15
|
|
SHRQ $0x02, R8
|
|
ADDQ R13, R10
|
|
ADCQ R14, R11
|
|
ADCQ $0x00, R12
|
|
ADDQ R15, R10
|
|
ADCQ R8, R11
|
|
ADCQ $0x00, R12
|
|
LEAQ 32(DI), DI
|
|
MOVQ $0x0000000a, CX
|
|
MOVQ $0x00000000, R9
|
|
CMPQ BX, $0x80
|
|
JBE sealAVX2Tail128
|
|
CMPQ BX, $0x00000100
|
|
JBE sealAVX2Tail256
|
|
CMPQ BX, $0x00000180
|
|
JBE sealAVX2Tail384
|
|
JMP sealAVX2Tail512
|
|
|
|
seal192AVX2:
|
|
VMOVDQA Y0, Y5
|
|
VMOVDQA Y14, Y9
|
|
VMOVDQA Y12, Y13
|
|
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
|
|
VMOVDQA Y0, Y6
|
|
VMOVDQA Y14, Y10
|
|
VMOVDQA Y12, Y8
|
|
VMOVDQA Y4, Y2
|
|
VMOVDQA Y1, Y15
|
|
MOVQ $0x0000000a, R9
|
|
|
|
sealAVX2192InnerCipherLoop:
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPALIGNR $0x04, Y14, Y14, Y14
|
|
VPALIGNR $0x04, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x0c, Y4, Y4, Y4
|
|
VPALIGNR $0x0c, Y1, Y1, Y1
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol16<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x0c, Y14, Y3
|
|
VPSRLD $0x14, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y14, Y0, Y0
|
|
VPXOR Y0, Y4, Y4
|
|
VPSHUFB ·rol8<>+0(SB), Y4, Y4
|
|
VPADDD Y4, Y12, Y12
|
|
VPXOR Y12, Y14, Y14
|
|
VPSLLD $0x07, Y14, Y3
|
|
VPSRLD $0x19, Y14, Y14
|
|
VPXOR Y3, Y14, Y14
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol16<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x0c, Y9, Y3
|
|
VPSRLD $0x14, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPADDD Y9, Y5, Y5
|
|
VPXOR Y5, Y1, Y1
|
|
VPSHUFB ·rol8<>+0(SB), Y1, Y1
|
|
VPADDD Y1, Y13, Y13
|
|
VPXOR Y13, Y9, Y9
|
|
VPSLLD $0x07, Y9, Y3
|
|
VPSRLD $0x19, Y9, Y9
|
|
VPXOR Y3, Y9, Y9
|
|
VPALIGNR $0x0c, Y14, Y14, Y14
|
|
VPALIGNR $0x0c, Y9, Y9, Y9
|
|
VPALIGNR $0x08, Y12, Y12, Y12
|
|
VPALIGNR $0x08, Y13, Y13, Y13
|
|
VPALIGNR $0x04, Y4, Y4, Y4
|
|
VPALIGNR $0x04, Y1, Y1, Y1
|
|
DECQ R9
|
|
JNE sealAVX2192InnerCipherLoop
|
|
VPADDD Y6, Y0, Y0
|
|
VPADDD Y6, Y5, Y5
|
|
VPADDD Y10, Y14, Y14
|
|
VPADDD Y10, Y9, Y9
|
|
VPADDD Y8, Y12, Y12
|
|
VPADDD Y8, Y13, Y13
|
|
VPADDD Y2, Y4, Y4
|
|
VPADDD Y15, Y1, Y1
|
|
VPERM2I128 $0x02, Y0, Y14, Y3
|
|
|
|
// Clamp and store poly key
|
|
VPAND ·polyClampMask<>+0(SB), Y3, Y3
|
|
VMOVDQA Y3, (BP)
|
|
|
|
// Stream for up to 192 bytes
|
|
VPERM2I128 $0x13, Y0, Y14, Y0
|
|
VPERM2I128 $0x13, Y12, Y4, Y14
|
|
VPERM2I128 $0x02, Y5, Y9, Y12
|
|
VPERM2I128 $0x02, Y13, Y1, Y4
|
|
VPERM2I128 $0x13, Y5, Y9, Y5
|
|
VPERM2I128 $0x13, Y13, Y1, Y9
|
|
|
|
sealAVX2ShortSeal:
|
|
// Hash AD
|
|
MOVQ ad_len+80(FP), R9
|
|
CALL polyHashADInternal<>(SB)
|
|
XORQ CX, CX
|
|
|
|
sealAVX2SealHash:
|
|
// CX holds the number of bytes encrypted but not yet hashed
|
|
CMPQ CX, $0x10
|
|
JB sealAVX2ShortSealLoop
|
|
ADDQ (DI), R10
|
|
ADCQ 8(DI), R11
|
|
ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	SUBQ $0x10, CX
	ADDQ $0x10, DI
	JMP sealAVX2SealHash
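
// sealAVX2ShortSealLoop encrypts 32 bytes per pass with the keystream
// staged in Y0, then hashes the fresh ciphertext as two Poly1305 blocks.
// This copy of the hash step uses MULX (BMI2) rather than the MULQ
// sequence used above.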
sealAVX2ShortSealLoop:
	CMPQ BX, $0x20
	JB sealAVX2ShortTail32
	SUBQ $0x20, BX

	// Encrypt 32 bytes: XOR the plaintext at (SI) into the keystream in Y0
	VPXOR (SI), Y0, Y0
	VMOVDQU Y0, (DI)
	LEAQ 32(SI), SI

	// Hash the 32 bytes of ciphertext just written
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	ADDQ 16(DI), R10
	ADCQ 24(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 32(DI), DI

	// Shift stream left
	VMOVDQA Y14, Y0
	VMOVDQA Y12, Y14
	VMOVDQA Y4, Y12
	VMOVDQA Y5, Y4
	VMOVDQA Y9, Y5
	VMOVDQA Y13, Y9
	VMOVDQA Y1, Y13
	VMOVDQA Y6, Y1
	VMOVDQA Y10, Y6
	JMP sealAVX2ShortSealLoop
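
// Fewer than 32 bytes remain. If at least 16, encrypt and hash one
// 16-byte block from the low lane of Y0, then move the high 128-bit lane
// down (VPERM2I128 $0x11) so sealSSETail can finish any final
// sub-16-byte leftovers.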
sealAVX2ShortTail32:
	CMPQ BX, $0x10
	VMOVDQA X0, X1
	JB sealAVX2ShortDone
	SUBQ $0x10, BX

	// Encrypt 16 bytes: XOR the plaintext at (SI) with the low lane of the keystream
	VPXOR (SI), X0, X12
	VMOVDQU X12, (DI)
	LEAQ 16(SI), SI

	// Hash
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI
	VPERM2I128 $0x11, Y0, Y0, Y0
	VMOVDQA X0, X1

sealAVX2ShortDone:
	VZEROUPPER
	JMP sealSSETail
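
// seal320AVX2 mirrors seal192AVX2 with three interleaved states (six
// 64-byte blocks). After the first 32 bytes are clamped into the Poly1305
// key, up to 320 bytes of keystream remain for the short sealing path.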
seal320AVX2:
	VMOVDQA Y0, Y5
	VMOVDQA Y14, Y9
	VMOVDQA Y12, Y13
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y0, Y6
	VMOVDQA Y14, Y10
	VMOVDQA Y12, Y8
	VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
	VMOVDQA Y14, Y7
	VMOVDQA Y12, Y11
	VMOVDQA Y4, Y15
	MOVQ $0x0000000a, R9

sealAVX2320InnerCipherLoop:
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y3
	VPSRLD $0x14, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y3
	VPSRLD $0x19, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x0c, Y10, Y3
	VPSRLD $0x14, Y10, Y10
	VPXOR Y3, Y10, Y10
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x07, Y10, Y3
	VPSRLD $0x19, Y10, Y10
	VPXOR Y3, Y10, Y10
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y3
	VPSRLD $0x14, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y3
	VPSRLD $0x19, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x0c, Y10, Y3
	VPSRLD $0x14, Y10, Y10
	VPXOR Y3, Y10, Y10
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x07, Y10, Y3
	VPSRLD $0x19, Y10, Y10
	VPXOR Y3, Y10, Y10
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	DECQ R9
	JNE sealAVX2320InnerCipherLoop
	VMOVDQA ·chacha20Constants<>+0(SB), Y3
	VPADDD Y3, Y0, Y0
	VPADDD Y3, Y5, Y5
	VPADDD Y3, Y6, Y6
	VPADDD Y7, Y14, Y14
	VPADDD Y7, Y9, Y9
	VPADDD Y7, Y10, Y10
	VPADDD Y11, Y12, Y12
	VPADDD Y11, Y13, Y13
	VPADDD Y11, Y8, Y8
	VMOVDQA ·avx2IncMask<>+0(SB), Y3
	VPADDD Y15, Y4, Y4
	VPADDD Y3, Y15, Y15
	VPADDD Y15, Y1, Y1
	VPADDD Y3, Y15, Y15
	VPADDD Y15, Y2, Y2

	// Clamp and store poly key
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPAND ·polyClampMask<>+0(SB), Y3, Y3
	VMOVDQA Y3, (BP)

	// Stream for up to 320 bytes
	VPERM2I128 $0x13, Y0, Y14, Y0
	VPERM2I128 $0x13, Y12, Y4, Y14
	VPERM2I128 $0x02, Y5, Y9, Y12
	VPERM2I128 $0x02, Y13, Y1, Y4
	VPERM2I128 $0x13, Y5, Y9, Y5
	VPERM2I128 $0x13, Y13, Y1, Y9
	VPERM2I128 $0x02, Y6, Y10, Y13
	VPERM2I128 $0x02, Y8, Y2, Y1
	VPERM2I128 $0x13, Y6, Y10, Y6
	VPERM2I128 $0x13, Y8, Y2, Y10
	JMP sealAVX2ShortSeal
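
// sealAVX2Tail128 seals a final chunk of at most 128 bytes. Each
// LoopA+LoopB pass advances the state by one ChaCha20 double round while
// absorbing pending ciphertext into Poly1305 (16 bytes in LoopA, 32 in
// LoopB); CX counts the passes that still have data to hash and R9 any
// extra rounds needed to finish the 20-round block.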
sealAVX2Tail128:
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA 32(BP), Y14
	VMOVDQA 64(BP), Y12
	VMOVDQA 192(BP), Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
	VMOVDQA Y4, Y1

sealAVX2Tail128LoopA:
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI

sealAVX2Tail128LoopB:
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	ADDQ 16(DI), R10
	ADCQ 24(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 32(DI), DI
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x04, Y4, Y4, Y4
	DECQ CX
	JG sealAVX2Tail128LoopA
	DECQ R9
	JGE sealAVX2Tail128LoopB
	VPADDD ·chacha20Constants<>+0(SB), Y0, Y5
	VPADDD 32(BP), Y14, Y9
	VPADDD 64(BP), Y12, Y13
	VPADDD Y1, Y4, Y1
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	JMP sealAVX2ShortSealLoop
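
// sealAVX2Tail256: the same round/hash interleaving with two states (at
// most 256 bytes). The first 128 bytes are encrypted here; the rest is
// finished through sealAVX2SealHash and the short seal loop.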
sealAVX2Tail256:
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA ·chacha20Constants<>+0(SB), Y5
	VMOVDQA 32(BP), Y14
	VMOVDQA 32(BP), Y9
	VMOVDQA 64(BP), Y12
	VMOVDQA 64(BP), Y13
	VMOVDQA 192(BP), Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y4, Y7
	VMOVDQA Y1, Y11

sealAVX2Tail256LoopA:
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI

sealAVX2Tail256LoopB:
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y3
	VPSRLD $0x14, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y3
	VPSRLD $0x19, Y9, Y9
	VPXOR Y3, Y9, Y9
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y3
	VPSRLD $0x14, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y3
	VPSRLD $0x19, Y9, Y9
	VPXOR Y3, Y9, Y9
	ADDQ 16(DI), R10
	ADCQ 24(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 32(DI), DI
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	DECQ CX
	JG sealAVX2Tail256LoopA
	DECQ R9
	JGE sealAVX2Tail256LoopB
	VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD 32(BP), Y14, Y14
	VPADDD 32(BP), Y9, Y9
	VPADDD 64(BP), Y12, Y12
	VPADDD 64(BP), Y13, Y13
	VPADDD Y7, Y4, Y4
	VPADDD Y11, Y1, Y1
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPERM2I128 $0x02, Y12, Y4, Y7
	VPERM2I128 $0x13, Y0, Y14, Y11
	VPERM2I128 $0x13, Y12, Y4, Y15
	VPXOR (SI), Y3, Y3
	VPXOR 32(SI), Y7, Y7
	VPXOR 64(SI), Y11, Y11
	VPXOR 96(SI), Y15, Y15
	VMOVDQU Y3, (DI)
	VMOVDQU Y7, 32(DI)
	VMOVDQU Y11, 64(DI)
	VMOVDQU Y15, 96(DI)
	MOVQ $0x00000080, CX
	LEAQ 128(SI), SI
	SUBQ $0x80, BX
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	JMP sealAVX2SealHash
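
// sealAVX2Tail384: three-state variant (at most 384 bytes). 256 bytes are
// encrypted here; the third state is left in Y0/Y14/Y12/Y4 for the short
// seal path.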
sealAVX2Tail384:
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA 192(BP), Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
	VMOVDQA Y4, Y7
	VMOVDQA Y1, Y11
	VMOVDQA Y2, Y15

sealAVX2Tail384LoopA:
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI

sealAVX2Tail384LoopB:
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y3
	VPSRLD $0x14, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y3
	VPSRLD $0x19, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x0c, Y10, Y3
	VPSRLD $0x14, Y10, Y10
	VPXOR Y3, Y10, Y10
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x07, Y10, Y3
	VPSRLD $0x19, Y10, Y10
	VPXOR Y3, Y10, Y10
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x0c, Y14, Y3
	VPSRLD $0x14, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y14, Y0, Y0
	VPXOR Y0, Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPADDD Y4, Y12, Y12
	VPXOR Y12, Y14, Y14
	VPSLLD $0x07, Y14, Y3
	VPSRLD $0x19, Y14, Y14
	VPXOR Y3, Y14, Y14
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x0c, Y9, Y3
	VPSRLD $0x14, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y9, Y5, Y5
	VPXOR Y5, Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPADDD Y1, Y13, Y13
	VPXOR Y13, Y9, Y9
	VPSLLD $0x07, Y9, Y3
	VPSRLD $0x19, Y9, Y9
	VPXOR Y3, Y9, Y9
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x0c, Y10, Y3
	VPSRLD $0x14, Y10, Y10
	VPXOR Y3, Y10, Y10
	VPADDD Y10, Y6, Y6
	VPXOR Y6, Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPADDD Y2, Y8, Y8
	VPXOR Y8, Y10, Y10
	VPSLLD $0x07, Y10, Y3
	VPSRLD $0x19, Y10, Y10
	VPXOR Y3, Y10, Y10
	ADDQ 16(DI), R10
	ADCQ 24(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 32(DI), DI
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	DECQ CX
	JG sealAVX2Tail384LoopA
	DECQ R9
	JGE sealAVX2Tail384LoopB
	VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD 32(BP), Y14, Y14
	VPADDD 32(BP), Y9, Y9
	VPADDD 32(BP), Y10, Y10
	VPADDD 64(BP), Y12, Y12
	VPADDD 64(BP), Y13, Y13
	VPADDD 64(BP), Y8, Y8
	VPADDD Y7, Y4, Y4
	VPADDD Y11, Y1, Y1
	VPADDD Y15, Y2, Y2
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPERM2I128 $0x02, Y12, Y4, Y7
	VPERM2I128 $0x13, Y0, Y14, Y11
	VPERM2I128 $0x13, Y12, Y4, Y15
	VPXOR (SI), Y3, Y3
	VPXOR 32(SI), Y7, Y7
	VPXOR 64(SI), Y11, Y11
	VPXOR 96(SI), Y15, Y15
	VMOVDQU Y3, (DI)
	VMOVDQU Y7, 32(DI)
	VMOVDQU Y11, 64(DI)
	VMOVDQU Y15, 96(DI)
	VPERM2I128 $0x02, Y5, Y9, Y3
	VPERM2I128 $0x02, Y13, Y1, Y7
	VPERM2I128 $0x13, Y5, Y9, Y11
	VPERM2I128 $0x13, Y13, Y1, Y15
	VPXOR 128(SI), Y3, Y3
	VPXOR 160(SI), Y7, Y7
	VPXOR 192(SI), Y11, Y11
	VPXOR 224(SI), Y15, Y15
	VMOVDQU Y3, 128(DI)
	VMOVDQU Y7, 160(DI)
	VMOVDQU Y11, 192(DI)
	VMOVDQU Y15, 224(DI)
	MOVQ $0x00000100, CX
	LEAQ 256(SI), SI
	SUBQ $0x00000100, BX
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	JMP sealAVX2SealHash
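
// sealAVX2Tail512: four-state variant (at most 512 bytes). Register
// pressure is highest here, so the four counters are kept at 96..192(BP)
// and Y15 is spilled to 224(BP) whenever a rotation needs a scratch
// register.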
sealAVX2Tail512:
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15
	VMOVDQA 192(BP), Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
	VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
	VMOVDQA Y4, 96(BP)
	VMOVDQA Y1, 128(BP)
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)

sealAVX2Tail512LoopA:
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), AX
	MOVQ AX, R15
	MULQ R10
	MOVQ AX, R13
	MOVQ DX, R14
	MOVQ (BP), AX
	MULQ R11
	IMULQ R12, R15
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), AX
	MOVQ AX, R8
	MULQ R10
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 8(BP), AX
	MULQ R11
	ADDQ AX, R15
	ADCQ $0x00, DX
	IMULQ R12, R8
	ADDQ R10, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 16(DI), DI
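
// Each pass of sealAVX2Tail512LoopB advances all four states by one
// double round and, together with LoopA, hashes 48 bytes of pending
// ciphertext (16 there, 32 here), mirroring the smaller tail loops.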
sealAVX2Tail512LoopB:
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y3, Y3
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x0c, Y14, Y15
	VPSRLD $0x14, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x0c, Y9, Y15
	VPSRLD $0x14, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x0c, Y10, Y15
	VPSRLD $0x14, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x0c, Y11, Y15
	VPSRLD $0x14, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	ADDQ (DI), R10
	ADCQ 8(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y3, Y3
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x07, Y14, Y15
	VPSRLD $0x19, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x07, Y9, Y15
	VPSRLD $0x19, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x07, Y10, Y15
	VPSRLD $0x19, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x07, Y11, Y15
	VPSRLD $0x19, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x04, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPALIGNR $0x0c, Y3, Y3, Y3
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y3, Y3
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	ADDQ 16(DI), R10
	ADCQ 24(DI), R11
	ADCQ $0x01, R12
	MOVQ (BP), DX
	MOVQ DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ AX, R14
	ADCQ DX, R15
	MOVQ 8(BP), DX
	MULXQ R10, R10, AX
	ADDQ R10, R14
	MULXQ R11, R11, R8
	ADCQ R11, R15
	ADCQ $0x00, R8
	IMULQ R12, DX
	ADDQ AX, R15
	ADCQ DX, R8
	MOVQ R13, R10
	MOVQ R14, R11
	MOVQ R15, R12
	ANDQ $0x03, R12
	MOVQ R15, R13
	ANDQ $-4, R13
	MOVQ R8, R14
	SHRQ $0x02, R8, R15
	SHRQ $0x02, R8
	ADDQ R13, R10
	ADCQ R14, R11
	ADCQ $0x00, R12
	ADDQ R15, R10
	ADCQ R8, R11
	ADCQ $0x00, R12
	LEAQ 32(DI), DI
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x0c, Y14, Y15
	VPSRLD $0x14, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x0c, Y9, Y15
	VPSRLD $0x14, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x0c, Y10, Y15
	VPSRLD $0x14, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x0c, Y11, Y15
	VPSRLD $0x14, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	VPADDD Y14, Y0, Y0
	VPADDD Y9, Y5, Y5
	VPADDD Y10, Y6, Y6
	VPADDD Y11, Y7, Y7
	VPXOR Y0, Y4, Y4
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	VPSHUFB ·rol8<>+0(SB), Y4, Y4
	VPSHUFB ·rol8<>+0(SB), Y1, Y1
	VPSHUFB ·rol8<>+0(SB), Y2, Y2
	VPSHUFB ·rol8<>+0(SB), Y3, Y3
	VPADDD Y4, Y12, Y12
	VPADDD Y1, Y13, Y13
	VPADDD Y2, Y8, Y8
	VPADDD Y3, Y15, Y15
	VPXOR Y12, Y14, Y14
	VPXOR Y13, Y9, Y9
	VPXOR Y8, Y10, Y10
	VPXOR Y15, Y11, Y11
	VMOVDQA Y15, 224(BP)
	VPSLLD $0x07, Y14, Y15
	VPSRLD $0x19, Y14, Y14
	VPXOR Y15, Y14, Y14
	VPSLLD $0x07, Y9, Y15
	VPSRLD $0x19, Y9, Y9
	VPXOR Y15, Y9, Y9
	VPSLLD $0x07, Y10, Y15
	VPSRLD $0x19, Y10, Y10
	VPXOR Y15, Y10, Y10
	VPSLLD $0x07, Y11, Y15
	VPSRLD $0x19, Y11, Y11
	VPXOR Y15, Y11, Y11
	VMOVDQA 224(BP), Y15
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x0c, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	VPALIGNR $0x04, Y3, Y3, Y3
	DECQ CX
	JG sealAVX2Tail512LoopA
	DECQ R9
	JGE sealAVX2Tail512LoopB
	VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
	VPADDD 32(BP), Y14, Y14
	VPADDD 32(BP), Y9, Y9
	VPADDD 32(BP), Y10, Y10
	VPADDD 32(BP), Y11, Y11
	VPADDD 64(BP), Y12, Y12
	VPADDD 64(BP), Y13, Y13
	VPADDD 64(BP), Y8, Y8
	VPADDD 64(BP), Y15, Y15
	VPADDD 96(BP), Y4, Y4
	VPADDD 128(BP), Y1, Y1
	VPADDD 160(BP), Y2, Y2
	VPADDD 192(BP), Y3, Y3
	VMOVDQA Y15, 224(BP)
	VPERM2I128 $0x02, Y0, Y14, Y15
	VPXOR (SI), Y15, Y15
	VMOVDQU Y15, (DI)
	VPERM2I128 $0x02, Y12, Y4, Y15
	VPXOR 32(SI), Y15, Y15
	VMOVDQU Y15, 32(DI)
	VPERM2I128 $0x13, Y0, Y14, Y15
	VPXOR 64(SI), Y15, Y15
	VMOVDQU Y15, 64(DI)
	VPERM2I128 $0x13, Y12, Y4, Y15
	VPXOR 96(SI), Y15, Y15
	VMOVDQU Y15, 96(DI)
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	VPXOR 128(SI), Y0, Y0
	VPXOR 160(SI), Y14, Y14
	VPXOR 192(SI), Y12, Y12
	VPXOR 224(SI), Y4, Y4
	VMOVDQU Y0, 128(DI)
	VMOVDQU Y14, 160(DI)
	VMOVDQU Y12, 192(DI)
	VMOVDQU Y4, 224(DI)
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	VPXOR 256(SI), Y0, Y0
	VPXOR 288(SI), Y14, Y14
	VPXOR 320(SI), Y12, Y12
	VPXOR 352(SI), Y4, Y4
	VMOVDQU Y0, 256(DI)
	VMOVDQU Y14, 288(DI)
	VMOVDQU Y12, 320(DI)
	VMOVDQU Y4, 352(DI)
	MOVQ $0x00000180, CX
	LEAQ 384(SI), SI
	SUBQ $0x00000180, BX
	VPERM2I128 $0x02, Y7, Y11, Y0
	VPERM2I128 $0x02, 224(BP), Y3, Y14
	VPERM2I128 $0x13, Y7, Y11, Y12
	VPERM2I128 $0x13, 224(BP), Y3, Y4
	JMP sealAVX2SealHash