crypto/chacha20poly1305/chacha20poly1305_amd64.s

// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
//go:build gc && !purego
#include "textflag.h"
// func polyHashADInternal<>()
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
// Hack: Must declare #define macros inside of a function due to Avo constraints
// ROL rotates the uint32s in register R left by N bits, using temporary T.
#define ROL(N, R, T) \
MOVO R, T; \
PSLLL $(N), T; \
PSRLL $(32-(N)), R; \
PXOR T, R
// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
#ifdef GOAMD64_v2
#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
#else
#define ROL8(R, T) ROL(8, R, T)
#endif
// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
#ifdef GOAMD64_v2
#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
#else
#define ROL16(R, T) ROL(16, R, T)
#endif
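// Each of these expands to a per-lane rotate: every uint32 in R becomes
// (x << n) | (x >> (32 - n)). On GOAMD64_v2 the 8- and 16-bit rotations are
// pure byte permutations, so a single PSHUFB against the rol8/rol16 constants
// replaces the shift/shift/xor sequence.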
XORQ R10, R10
XORQ R11, R11
XORQ R12, R12
CMPQ R9, $0x0d
JNE hashADLoop
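// Fast path for 13-byte additional data (a common AAD size, e.g. TLS). The 13
// bytes are zero-padded to a full 16-byte Poly1305 block: R10 gets bytes 0-7,
// R11 gets bytes 8-12, and R12 = 1 supplies the 2^128 pad of a full block.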
MOVQ (CX), R10
MOVQ 5(CX), R11
SHRQ $0x18, R11
MOVQ $0x00000001, R12
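// Poly1305 multiply-and-reduce, repeated fully unrolled after every absorbed
// block throughout this file. In effect (r is the clamped key at 0(BP), the
// accumulator lives in R10:R11:R12):
//
//	acc = (acc * r) mod (2^130 - 5)
//
// The MULQ/ADCQ ladder forms the wide product in R13:R14:R15:R8, and the part
// at and above bit 130 is folded back in times 5 (4x from adding the high
// limbs with the low two bits of R15 cleared, 1x from adding them shifted
// right by two), since 2^130 = 5 mod p.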
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
RET
hashADLoop:
// Hash in 16 byte chunks
CMPQ R9, $0x10
JB hashADTail
ADDQ (CX), R10
ADCQ 8(CX), R11
ADCQ $0x01, R12
LEAQ 16(CX), CX
SUBQ $0x10, R9
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
JMP hashADLoop
hashADTail:
CMPQ R9, $0x00
JE hashADDone
// Hash last < 16 byte tail
XORQ R13, R13
XORQ R14, R14
XORQ R15, R15
ADDQ R9, CX
hashADTailLoop:
SHLQ $0x08, R13, R14
SHLQ $0x08, R13
MOVB -1(CX), R15
XORQ R15, R13
DECQ CX
DECQ R9
JNE hashADTailLoop
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
hashADDone:
RET
// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
// Requires: AVX, AVX2, BMI2, CMOV, SSE2
TEXT ·chacha20Poly1305Open(SB), $288-97
// For aligned stack access
MOVQ SP, BP
ADDQ $0x20, BP
ANDQ $-32, BP
MOVQ dst_base+0(FP), DI
MOVQ key_base+24(FP), R8
MOVQ src_base+48(FP), SI
MOVQ src_len+56(FP), BX
MOVQ ad_base+72(FP), CX
// Check for AVX2 support
CMPB ·useAVX2+0(SB), $0x01
JE chacha20Poly1305Open_AVX2
// Special optimization for very short buffers
CMPQ BX, $0x80
JBE openSSE128
// For long buffers, prepare the poly key first
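// Per RFC 8439 the one-time Poly1305 key is the first 32 bytes of the first
// ChaCha20 block for this nonce: run the 20 rounds on a single block below and
// keep only rows A and B of the result (C and D are discarded).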
MOVOU ·chacha20Constants<>+0(SB), X0
MOVOU 16(R8), X3
MOVOU 32(R8), X6
MOVOU 48(R8), X9
MOVO X9, X13
// Store state on stack for future use
MOVO X3, 32(BP)
MOVO X6, 48(BP)
MOVO X9, 128(BP)
MOVQ $0x0000000a, R9
openSSEPreparePolyKey:
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
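// The raw BYTE sequences that follow encode PALIGNR instructions emitted as
// literal bytes: they rotate the B, C and D rows by 4, 8 and 12 bytes to move
// the state from column-round to diagonal-round form, and by 12, 8 and 4 bytes
// to move it back.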
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
DECQ R9
JNE openSSEPreparePolyKey
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
PADDL ·chacha20Constants<>+0(SB), X0
PADDL 32(BP), X3
// Clamp and store the key
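// ·polyClampMask is presumably the standard Poly1305 clamp
// 0x0ffffffc0ffffffc0ffffffc0fffffff, clearing the 22 bits of r that RFC 8439
// requires to be zero.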
PAND ·polyClampMask<>+0(SB), X0
MOVO X0, (BP)
MOVO X3, 16(BP)
// Hash AAD
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openSSEMainLoop:
CMPQ BX, $0x00000100
JB openSSEMainLoopDone
// Load state, increment counter blocks
MOVO ·chacha20Constants<>+0(SB), X0
MOVO 32(BP), X3
MOVO 48(BP), X6
MOVO 128(BP), X9
PADDL ·sseIncMask<>+0(SB), X9
MOVO X0, X1
MOVO X3, X4
MOVO X6, X7
MOVO X9, X10
PADDL ·sseIncMask<>+0(SB), X10
MOVO X1, X2
MOVO X4, X5
MOVO X7, X8
MOVO X10, X11
PADDL ·sseIncMask<>+0(SB), X11
MOVO X2, X12
MOVO X5, X13
MOVO X8, X14
MOVO X11, X15
PADDL ·sseIncMask<>+0(SB), X15
// Store counters
MOVO X9, 80(BP)
MOVO X10, 96(BP)
MOVO X11, 112(BP)
MOVO X15, 128(BP)
// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash
// 2 blocks, and for the remaining 4 only 1 block - for a total of 16
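// 16 Poly1305 blocks x 16 bytes = 256 bytes hashed, matching the 256 bytes of
// keystream produced and consumed per pass (see the SUBQ $0x100 below).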
MOVQ $0x00000004, CX
MOVQ SI, R9
openSSEInternalLoop:
MOVO X14, 64(BP)
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x0c, X14
PSRLL $0x14, X3
PXOR X14, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x07, X14
PSRLL $0x19, X3
PXOR X14, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x0c, X14
PSRLL $0x14, X4
PXOR X14, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x07, X14
PSRLL $0x19, X4
PXOR X14, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x0c, X14
PSRLL $0x14, X5
PXOR X14, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x07, X14
PSRLL $0x19, X5
PXOR X14, X5
MOVO 64(BP), X14
MOVO X7, 64(BP)
PADDD X13, X12
PXOR X12, X15
ROL16(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x0c, X7
PSRLL $0x14, X13
PXOR X7, X13
PADDD X13, X12
PXOR X12, X15
ROL8(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x07, X7
PSRLL $0x19, X13
PXOR X7, X13
MOVO 64(BP), X7
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x0c
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
LEAQ 16(R9), R9
MOVO X14, 64(BP)
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x0c, X14
PSRLL $0x14, X3
PXOR X14, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x07, X14
PSRLL $0x19, X3
PXOR X14, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x0c, X14
PSRLL $0x14, X4
PXOR X14, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x07, X14
PSRLL $0x19, X4
PXOR X14, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x0c, X14
PSRLL $0x14, X5
PXOR X14, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x07, X14
PSRLL $0x19, X5
PXOR X14, X5
MOVO 64(BP), X14
MOVO X7, 64(BP)
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
PADDD X13, X12
PXOR X12, X15
ROL16(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x0c, X7
PSRLL $0x14, X13
PXOR X7, X13
PADDD X13, X12
PXOR X12, X15
ROL8(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x07, X7
PSRLL $0x19, X13
PXOR X7, X13
MOVO 64(BP), X7
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x04
DECQ CX
JGE openSSEInternalLoop
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(R9), R9
CMPQ CX, $-6
JG openSSEInternalLoop
// Add in the state
PADDD ·chacha20Constants<>+0(SB), X0
PADDD ·chacha20Constants<>+0(SB), X1
PADDD ·chacha20Constants<>+0(SB), X2
PADDD ·chacha20Constants<>+0(SB), X12
PADDD 32(BP), X3
PADDD 32(BP), X4
PADDD 32(BP), X5
PADDD 32(BP), X13
PADDD 48(BP), X6
PADDD 48(BP), X7
PADDD 48(BP), X8
PADDD 48(BP), X14
PADDD 80(BP), X9
PADDD 96(BP), X10
PADDD 112(BP), X11
PADDD 128(BP), X15
// Load - xor - store
MOVO X15, 64(BP)
MOVOU (SI), X15
PXOR X15, X0
MOVOU X0, (DI)
MOVOU 16(SI), X15
PXOR X15, X3
MOVOU X3, 16(DI)
MOVOU 32(SI), X15
PXOR X15, X6
MOVOU X6, 32(DI)
MOVOU 48(SI), X15
PXOR X15, X9
MOVOU X9, 48(DI)
MOVOU 64(SI), X9
PXOR X9, X1
MOVOU X1, 64(DI)
MOVOU 80(SI), X9
PXOR X9, X4
MOVOU X4, 80(DI)
MOVOU 96(SI), X9
PXOR X9, X7
MOVOU X7, 96(DI)
MOVOU 112(SI), X9
PXOR X9, X10
MOVOU X10, 112(DI)
MOVOU 128(SI), X9
PXOR X9, X2
MOVOU X2, 128(DI)
MOVOU 144(SI), X9
PXOR X9, X5
MOVOU X5, 144(DI)
MOVOU 160(SI), X9
PXOR X9, X8
MOVOU X8, 160(DI)
MOVOU 176(SI), X9
PXOR X9, X11
MOVOU X11, 176(DI)
MOVOU 192(SI), X9
PXOR X9, X12
MOVOU X12, 192(DI)
MOVOU 208(SI), X9
PXOR X9, X13
MOVOU X13, 208(DI)
MOVOU 224(SI), X9
PXOR X9, X14
MOVOU X14, 224(DI)
MOVOU 240(SI), X9
PXOR 64(BP), X9
MOVOU X9, 240(DI)
LEAQ 256(SI), SI
LEAQ 256(DI), DI
SUBQ $0x00000100, BX
JMP openSSEMainLoop
openSSEMainLoopDone:
// Handle the various tail sizes efficiently
TESTQ BX, BX
JE openSSEFinalize
CMPQ BX, $0x40
JBE openSSETail64
CMPQ BX, $0x80
JBE openSSETail128
CMPQ BX, $0xc0
JBE openSSETail192
JMP openSSETail256
openSSEFinalize:
// Hash in the PT, AAD lengths
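// Per RFC 8439 the final Poly1305 block is len(AD) || len(ciphertext) as two
// little-endian uint64s: ad_len lands in the low half of the accumulator,
// src_len in the high half, plus the usual 2^128 block pad.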
ADDQ ad_len+80(FP), R10
ADCQ src_len+56(FP), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Final reduce
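// Reduce the accumulator to its canonical value below p = 2^130 - 5 by
// conditionally subtracting p: the SUB/SBB chain computes acc - p (p's limbs
// are -5, -1 and 3), and the CMOVs keep the original acc when that
// subtraction borrows, i.e. when acc < p already.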
MOVQ R10, R13
MOVQ R11, R14
MOVQ R12, R15
SUBQ $-5, R10
SBBQ $-1, R11
SBBQ $0x03, R12
CMOVQCS R13, R10
CMOVQCS R14, R11
CMOVQCS R15, R12
// Add in the "s" part of the key
ADDQ 16(BP), R10
ADCQ 24(BP), R11
// Finally, constant time compare to the tag at the end of the message
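// Constant-time tag check: XOR the computed tag (R10:R11) against the 16-byte
// tag stored after the ciphertext at (SI), OR the halves together, and use a
// CMOV to turn "difference == 0" into the 0/1 return value, with no
// data-dependent branch.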
XORQ AX, AX
MOVQ $0x00000001, DX
XORQ (SI), R10
XORQ 8(SI), R11
ORQ R11, R10
CMOVQEQ DX, AX
// Return true iff tags are equal
MOVB AX, ret+96(FP)
RET
openSSE128:
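// Fast path for inputs of at most 128 bytes: three ChaCha20 blocks are
// generated in parallel; block 0's A/B rows become the Poly1305 key (its C/D
// rows are discarded) and blocks 1 and 2 provide up to 128 bytes of keystream,
// consumed 16 bytes at a time below.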
MOVOU ·chacha20Constants<>+0(SB), X0
MOVOU 16(R8), X3
MOVOU 32(R8), X6
MOVOU 48(R8), X9
MOVO X0, X1
MOVO X3, X4
MOVO X6, X7
MOVO X9, X10
PADDL ·sseIncMask<>+0(SB), X10
MOVO X1, X2
MOVO X4, X5
MOVO X7, X8
MOVO X10, X11
PADDL ·sseIncMask<>+0(SB), X11
MOVO X3, X13
MOVO X6, X14
MOVO X10, X15
MOVQ $0x0000000a, R9
openSSE128InnerCipherLoop:
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x0c, X12
PSRLL $0x14, X5
PXOR X12, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x07, X12
PSRLL $0x19, X5
PXOR X12, X5
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x0c, X12
PSRLL $0x14, X5
PXOR X12, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x07, X12
PSRLL $0x19, X5
PXOR X12, X5
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
DECQ R9
JNE openSSE128InnerCipherLoop
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
PADDL ·chacha20Constants<>+0(SB), X0
PADDL ·chacha20Constants<>+0(SB), X1
PADDL ·chacha20Constants<>+0(SB), X2
PADDL X13, X3
PADDL X13, X4
PADDL X13, X5
PADDL X14, X7
PADDL X14, X8
PADDL X15, X10
PADDL ·sseIncMask<>+0(SB), X15
PADDL X15, X11
// Clamp and store the key
PAND ·polyClampMask<>+0(SB), X0
MOVOU X0, (BP)
MOVOU X3, 16(BP)
// Hash AAD
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openSSE128Open:
CMPQ BX, $0x10
JB openSSETail16
SUBQ $0x10, BX
// Load for hashing
ADDQ (SI), R10
ADCQ 8(SI), R11
ADCQ $0x01, R12
// Load for decryption
MOVOU (SI), X12
PXOR X12, X1
MOVOU X1, (DI)
LEAQ 16(SI), SI
LEAQ 16(DI), DI
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Shift the stream "left"
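// Rotate the remaining keystream pieces down one register so that X1 always
// holds the next 16 bytes to XOR against the ciphertext.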
MOVO X4, X1
MOVO X7, X4
MOVO X10, X7
MOVO X2, X10
MOVO X5, X2
MOVO X8, X5
MOVO X11, X8
JMP openSSE128Open
openSSETail16:
TESTQ BX, BX
JE openSSEFinalize
// We can safely load the CT from the end, because it is padded with the MAC
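// The 16-byte load below covers the last BX bytes of ciphertext plus part of
// the tag that follows, so it never reads out of bounds; the andMask entry
// then zeroes the tag bytes, keeping only the BX bytes still to be decrypted
// and hashed.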
MOVQ BX, R9
SHLQ $0x04, R9
LEAQ ·andMask<>+0(SB), R13
MOVOU (SI), X12
ADDQ BX, SI
PAND -16(R13)(R9*1), X12
MOVO X12, 64(BP)
MOVQ X12, R13
MOVQ 72(BP), R14
PXOR X1, X12
// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
openSSETail16Store:
MOVQ X12, R8
MOVB R8, (DI)
PSRLDQ $0x01, X12
INCQ DI
DECQ BX
JNE openSSETail16Store
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
JMP openSSEFinalize
openSSETail64:
MOVO ·chacha20Constants<>+0(SB), X0
MOVO 32(BP), X3
MOVO 48(BP), X6
MOVO 128(BP), X9
PADDL ·sseIncMask<>+0(SB), X9
MOVO X9, 80(BP)
XORQ R9, R9
MOVQ BX, CX
CMPQ CX, $0x10
JB openSSETail64LoopB
openSSETail64LoopA:
ADDQ (SI)(R9*1), R10
ADCQ 8(SI)(R9*1), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
SUBQ $0x10, CX
openSSETail64LoopB:
ADDQ $0x10, R9
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
CMPQ CX, $0x10
JAE openSSETail64LoopA
CMPQ R9, $0xa0
JNE openSSETail64LoopB
PADDL ·chacha20Constants<>+0(SB), X0
PADDL 32(BP), X3
PADDL 48(BP), X6
PADDL 80(BP), X9
openSSETail64DecLoop:
CMPQ BX, $0x10
JB openSSETail64DecLoopDone
SUBQ $0x10, BX
MOVOU (SI), X12
PXOR X12, X0
MOVOU X0, (DI)
LEAQ 16(SI), SI
LEAQ 16(DI), DI
MOVO X3, X0
MOVO X6, X3
MOVO X9, X6
JMP openSSETail64DecLoop
openSSETail64DecLoopDone:
MOVO X0, X1
JMP openSSETail16
openSSETail128:
MOVO ·chacha20Constants<>+0(SB), X1
MOVO 32(BP), X4
MOVO 48(BP), X7
MOVO 128(BP), X10
PADDL ·sseIncMask<>+0(SB), X10
MOVO X10, 80(BP)
MOVO X1, X0
MOVO X4, X3
MOVO X7, X6
MOVO X10, X9
PADDL ·sseIncMask<>+0(SB), X9
MOVO X9, 96(BP)
XORQ R9, R9
MOVQ BX, CX
ANDQ $-16, CX
openSSETail128LoopA:
ADDQ (SI)(R9*1), R10
ADCQ 8(SI)(R9*1), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
openSSETail128LoopB:
ADDQ $0x10, R9
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x0c
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x04
CMPQ R9, CX
JB openSSETail128LoopA
CMPQ R9, $0xa0
JNE openSSETail128LoopB
PADDL ·chacha20Constants<>+0(SB), X0
PADDL ·chacha20Constants<>+0(SB), X1
PADDL 32(BP), X3
PADDL 32(BP), X4
PADDL 48(BP), X6
PADDL 48(BP), X7
PADDL 96(BP), X9
PADDL 80(BP), X10
MOVOU (SI), X12
MOVOU 16(SI), X13
MOVOU 32(SI), X14
MOVOU 48(SI), X15
PXOR X12, X1
PXOR X13, X4
PXOR X14, X7
PXOR X15, X10
MOVOU X1, (DI)
MOVOU X4, 16(DI)
MOVOU X7, 32(DI)
MOVOU X10, 48(DI)
SUBQ $0x40, BX
LEAQ 64(SI), SI
LEAQ 64(DI), DI
JMP openSSETail64DecLoop
openSSETail192:
MOVO ·chacha20Constants<>+0(SB), X2
MOVO 32(BP), X5
MOVO 48(BP), X8
MOVO 128(BP), X11
PADDL ·sseIncMask<>+0(SB), X11
MOVO X11, 80(BP)
MOVO X2, X1
MOVO X5, X4
MOVO X8, X7
MOVO X11, X10
PADDL ·sseIncMask<>+0(SB), X10
MOVO X10, 96(BP)
MOVO X1, X0
MOVO X4, X3
MOVO X7, X6
MOVO X10, X9
PADDL ·sseIncMask<>+0(SB), X9
MOVO X9, 112(BP)
MOVQ BX, CX
MOVQ $0x000000a0, R9
CMPQ CX, $0xa0
CMOVQGT R9, CX
ANDQ $-16, CX
XORQ R9, R9
openSSLTail192LoopA:
ADDQ (SI)(R9*1), R10
ADCQ 8(SI)(R9*1), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
openSSLTail192LoopB:
ADDQ $0x10, R9
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x0c, X12
PSRLL $0x14, X5
PXOR X12, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x07, X12
PSRLL $0x19, X5
PXOR X12, X5
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x0c, X12
PSRLL $0x14, X5
PXOR X12, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x07, X12
PSRLL $0x19, X5
PXOR X12, X5
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
CMPQ R9, CX
JB openSSLTail192LoopA
CMPQ R9, $0xa0
JNE openSSLTail192LoopB
CMPQ BX, $0xb0
JB openSSLTail192Store
ADDQ 160(SI), R10
ADCQ 168(SI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
CMPQ BX, $0xc0
JB openSSLTail192Store
ADDQ 176(SI), R10
ADCQ 184(SI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
openSSLTail192Store:
PADDL ·chacha20Constants<>+0(SB), X0
PADDL ·chacha20Constants<>+0(SB), X1
PADDL ·chacha20Constants<>+0(SB), X2
PADDL 32(BP), X3
PADDL 32(BP), X4
PADDL 32(BP), X5
PADDL 48(BP), X6
PADDL 48(BP), X7
PADDL 48(BP), X8
PADDL 112(BP), X9
PADDL 96(BP), X10
PADDL 80(BP), X11
MOVOU (SI), X12
MOVOU 16(SI), X13
MOVOU 32(SI), X14
MOVOU 48(SI), X15
PXOR X12, X2
PXOR X13, X5
PXOR X14, X8
PXOR X15, X11
MOVOU X2, (DI)
MOVOU X5, 16(DI)
MOVOU X8, 32(DI)
MOVOU X11, 48(DI)
MOVOU 64(SI), X12
MOVOU 80(SI), X13
MOVOU 96(SI), X14
MOVOU 112(SI), X15
PXOR X12, X1
PXOR X13, X4
PXOR X14, X7
PXOR X15, X10
MOVOU X1, 64(DI)
MOVOU X4, 80(DI)
MOVOU X7, 96(DI)
MOVOU X10, 112(DI)
SUBQ $0x80, BX
LEAQ 128(SI), SI
LEAQ 128(DI), DI
JMP openSSETail64DecLoop
openSSETail256:
MOVO ·chacha20Constants<>+0(SB), X0
MOVO 32(BP), X3
MOVO 48(BP), X6
MOVO 128(BP), X9
PADDL ·sseIncMask<>+0(SB), X9
MOVO X0, X1
MOVO X3, X4
MOVO X6, X7
MOVO X9, X10
PADDL ·sseIncMask<>+0(SB), X10
MOVO X1, X2
MOVO X4, X5
MOVO X7, X8
MOVO X10, X11
PADDL ·sseIncMask<>+0(SB), X11
MOVO X2, X12
MOVO X5, X13
MOVO X8, X14
MOVO X11, X15
PADDL ·sseIncMask<>+0(SB), X15
// Store counters
MOVO X9, 80(BP)
MOVO X10, 96(BP)
MOVO X11, 112(BP)
MOVO X15, 128(BP)
XORQ R9, R9
openSSETail256Loop:
ADDQ (SI)(R9*1), R10
ADCQ 8(SI)(R9*1), R11
ADCQ $0x01, R12
MOVO X14, 64(BP)
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x0c, X14
PSRLL $0x14, X3
PXOR X14, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x07, X14
PSRLL $0x19, X3
PXOR X14, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x0c, X14
PSRLL $0x14, X4
PXOR X14, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x07, X14
PSRLL $0x19, X4
PXOR X14, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x0c, X14
PSRLL $0x14, X5
PXOR X14, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x07, X14
PSRLL $0x19, X5
PXOR X14, X5
MOVO 64(BP), X14
MOVO X7, 64(BP)
PADDD X13, X12
PXOR X12, X15
ROL16(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x0c, X7
PSRLL $0x14, X13
PXOR X7, X13
PADDD X13, X12
PXOR X12, X15
ROL8(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x07, X7
PSRLL $0x19, X13
PXOR X7, X13
MOVO 64(BP), X7
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x0c
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
MOVO X14, 64(BP)
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x0c, X14
PSRLL $0x14, X3
PXOR X14, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x07, X14
PSRLL $0x19, X3
PXOR X14, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x0c, X14
PSRLL $0x14, X4
PXOR X14, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x07, X14
PSRLL $0x19, X4
PXOR X14, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x0c, X14
PSRLL $0x14, X5
PXOR X14, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x07, X14
PSRLL $0x19, X5
PXOR X14, X5
MOVO 64(BP), X14
MOVO X7, 64(BP)
PADDD X13, X12
PXOR X12, X15
ROL16(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x0c, X7
PSRLL $0x14, X13
PXOR X7, X13
PADDD X13, X12
PXOR X12, X15
ROL8(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x07, X7
PSRLL $0x19, X13
PXOR X7, X13
MOVO 64(BP), X7
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x04
ADDQ $0x10, R9
CMPQ R9, $0xa0
JB openSSETail256Loop
MOVQ BX, CX
ANDQ $-16, CX
openSSETail256HashLoop:
ADDQ (SI)(R9*1), R10
ADCQ 8(SI)(R9*1), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
ADDQ $0x10, R9
CMPQ R9, CX
JB openSSETail256HashLoop
// Add in the state
PADDD ·chacha20Constants<>+0(SB), X0
PADDD ·chacha20Constants<>+0(SB), X1
PADDD ·chacha20Constants<>+0(SB), X2
PADDD ·chacha20Constants<>+0(SB), X12
PADDD 32(BP), X3
PADDD 32(BP), X4
PADDD 32(BP), X5
PADDD 32(BP), X13
PADDD 48(BP), X6
PADDD 48(BP), X7
PADDD 48(BP), X8
PADDD 48(BP), X14
PADDD 80(BP), X9
PADDD 96(BP), X10
PADDD 112(BP), X11
PADDD 128(BP), X15
MOVO X15, 64(BP)
// Load - xor - store
MOVOU (SI), X15
PXOR X15, X0
MOVOU 16(SI), X15
PXOR X15, X3
MOVOU 32(SI), X15
PXOR X15, X6
MOVOU 48(SI), X15
PXOR X15, X9
MOVOU X0, (DI)
MOVOU X3, 16(DI)
MOVOU X6, 32(DI)
MOVOU X9, 48(DI)
MOVOU 64(SI), X0
MOVOU 80(SI), X3
MOVOU 96(SI), X6
MOVOU 112(SI), X9
PXOR X0, X1
PXOR X3, X4
PXOR X6, X7
PXOR X9, X10
MOVOU X1, 64(DI)
MOVOU X4, 80(DI)
MOVOU X7, 96(DI)
MOVOU X10, 112(DI)
MOVOU 128(SI), X0
MOVOU 144(SI), X3
MOVOU 160(SI), X6
MOVOU 176(SI), X9
PXOR X0, X2
PXOR X3, X5
PXOR X6, X8
PXOR X9, X11
MOVOU X2, 128(DI)
MOVOU X5, 144(DI)
MOVOU X8, 160(DI)
MOVOU X11, 176(DI)
LEAQ 192(SI), SI
LEAQ 192(DI), DI
SUBQ $0xc0, BX
MOVO X12, X0
MOVO X13, X3
MOVO X14, X6
MOVO 64(BP), X9
JMP openSSETail64DecLoop
chacha20Poly1305Open_AVX2:
VZEROUPPER
VMOVDQU ·chacha20Constants<>+0(SB), Y0
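// The BYTE sequences below encode VBROADCASTI128 loads of the remaining three
// state rows, roughly:
//
//	VBROADCASTI128 16(R8), Y14
//	VBROADCASTI128 32(R8), Y12
//	VBROADCASTI128 48(R8), Y4
//
// so each YMM register holds one row replicated into both 128-bit lanes (two
// ChaCha20 blocks side by side); avx2InitMask presumably gives the two lanes
// distinct block counters.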
BYTE $0xc4
BYTE $0x42
BYTE $0x7d
BYTE $0x5a
BYTE $0x70
BYTE $0x10
BYTE $0xc4
BYTE $0x42
BYTE $0x7d
BYTE $0x5a
BYTE $0x60
BYTE $0x20
BYTE $0xc4
BYTE $0xc2
BYTE $0x7d
BYTE $0x5a
BYTE $0x60
BYTE $0x30
VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
// Special optimization for very short buffers
CMPQ BX, $0xc0
JBE openAVX2192
CMPQ BX, $0x00000140
JBE openAVX2320
// For the general case, prepare the poly key first; as a byproduct we get the first 64 bytes of cipher stream
VMOVDQA Y14, 32(BP)
VMOVDQA Y12, 64(BP)
VMOVDQA Y4, 192(BP)
MOVQ $0x0000000a, R9
openAVX2PreparePolyKey:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x0c, Y4, Y4, Y4
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x04, Y4, Y4, Y4
DECQ R9
JNE openAVX2PreparePolyKey
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD 32(BP), Y14, Y14
VPADDD 64(BP), Y12, Y12
VPADDD 192(BP), Y4, Y4
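// Each YMM register holds one state row of two blocks (low lane = first block,
// high lane = second). VPERM2I128 $0x02 below gathers the two low lanes and
// $0x13 the two high lanes: block 0's A/B rows become the 32-byte poly key,
// and block 1 supplies the first 64 bytes of keystream.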
VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
VPAND ·polyClampMask<>+0(SB), Y3, Y3
VMOVDQA Y3, (BP)
// Stream for the first 64 bytes
VPERM2I128 $0x13, Y0, Y14, Y0
VPERM2I128 $0x13, Y12, Y4, Y14
// Hash AD + first 64 bytes
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
XORQ CX, CX
openAVX2InitialHash64:
ADDQ (SI)(CX*1), R10
ADCQ 8(SI)(CX*1), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
ADDQ $0x10, CX
CMPQ CX, $0x40
JNE openAVX2InitialHash64
// Decrypt the first 64 bytes
VPXOR (SI), Y0, Y0
VPXOR 32(SI), Y14, Y14
VMOVDQU Y0, (DI)
VMOVDQU Y14, 32(DI)
LEAQ 64(SI), SI
LEAQ 64(DI), DI
SUBQ $0x40, BX
openAVX2MainLoop:
CMPQ BX, $0x00000200
JB openAVX2MainLoopDone
// Load state, increment counter blocks, store the incremented counters
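// Each pass generates 8 ChaCha20 blocks (four register sets of two interleaved
// blocks each) and decrypts and hashes 512 bytes; see the SUBQ $0x200 at the
// bottom of the loop.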
VMOVDQU ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
XORQ CX, CX
openAVX2InternalLoop:
ADDQ (SI)(CX*1), R10
ADCQ 8(SI)(CX*1), R11
ADCQ $0x01, R12
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
ADDQ 16(SI)(CX*1), R10
ADCQ 24(SI)(CX*1), R11
ADCQ $0x01, R12
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x0c, Y3, Y3, Y3
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
ADDQ 32(SI)(CX*1), R10
ADCQ 40(SI)(CX*1), R11
ADCQ $0x01, R12
LEAQ 48(CX), CX
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x04, Y3, Y3, Y3
CMPQ CX, $0x000001e0
JNE openAVX2InternalLoop
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 32(BP), Y11, Y11
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 64(BP), Y15, Y15
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPADDD 192(BP), Y3, Y3
VMOVDQA Y15, 224(BP)
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
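// The inner loop absorbs 3 Poly1305 blocks per ChaCha20 double round: 10
// rounds x 48 bytes = 480 bytes, so two more 16-byte blocks (at offsets 480
// and 496) are hashed here and after the next batch of stores.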
ADDQ 480(SI), R10
ADCQ 488(SI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPERM2I128 $0x02, Y0, Y14, Y15
VPERM2I128 $0x13, Y0, Y14, Y14
VPERM2I128 $0x02, Y12, Y4, Y0
VPERM2I128 $0x13, Y12, Y4, Y12
VPXOR (SI), Y15, Y15
VPXOR 32(SI), Y0, Y0
VPXOR 64(SI), Y14, Y14
VPXOR 96(SI), Y12, Y12
VMOVDQU Y15, (DI)
VMOVDQU Y0, 32(DI)
VMOVDQU Y14, 64(DI)
VMOVDQU Y12, 96(DI)
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR 128(SI), Y0, Y0
VPXOR 160(SI), Y14, Y14
VPXOR 192(SI), Y12, Y12
VPXOR 224(SI), Y4, Y4
VMOVDQU Y0, 128(DI)
VMOVDQU Y14, 160(DI)
VMOVDQU Y12, 192(DI)
VMOVDQU Y4, 224(DI)
// and here
ADDQ 496(SI), R10
ADCQ 504(SI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
VPXOR 256(SI), Y0, Y0
VPXOR 288(SI), Y14, Y14
VPXOR 320(SI), Y12, Y12
VPXOR 352(SI), Y4, Y4
VMOVDQU Y0, 256(DI)
VMOVDQU Y14, 288(DI)
VMOVDQU Y12, 320(DI)
VMOVDQU Y4, 352(DI)
VPERM2I128 $0x02, Y7, Y11, Y0
VPERM2I128 $0x02, 224(BP), Y3, Y14
VPERM2I128 $0x13, Y7, Y11, Y12
VPERM2I128 $0x13, 224(BP), Y3, Y4
VPXOR 384(SI), Y0, Y0
VPXOR 416(SI), Y14, Y14
VPXOR 448(SI), Y12, Y12
VPXOR 480(SI), Y4, Y4
VMOVDQU Y0, 384(DI)
VMOVDQU Y14, 416(DI)
VMOVDQU Y12, 448(DI)
VMOVDQU Y4, 480(DI)
LEAQ 512(SI), SI
LEAQ 512(DI), DI
SUBQ $0x00000200, BX
JMP openAVX2MainLoop
openAVX2MainLoopDone:
// Handle the various tail sizes efficiently
TESTQ BX, BX
JE openSSEFinalize
CMPQ BX, $0x80
JBE openAVX2Tail128
CMPQ BX, $0x00000100
JBE openAVX2Tail256
CMPQ BX, $0x00000180
JBE openAVX2Tail384
JMP openAVX2Tail512
openAVX2192:
VMOVDQA Y0, Y5
VMOVDQA Y14, Y9
VMOVDQA Y12, Y13
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y0, Y6
VMOVDQA Y14, Y10
VMOVDQA Y12, Y8
VMOVDQA Y4, Y2
VMOVDQA Y1, Y15
MOVQ $0x0000000a, R9
openAVX2192InnerCipherLoop:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
DECQ R9
JNE openAVX2192InnerCipherLoop
VPADDD Y6, Y0, Y0
VPADDD Y6, Y5, Y5
VPADDD Y10, Y14, Y14
VPADDD Y10, Y9, Y9
VPADDD Y8, Y12, Y12
VPADDD Y8, Y13, Y13
VPADDD Y2, Y4, Y4
VPADDD Y15, Y1, Y1
VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
VPAND ·polyClampMask<>+0(SB), Y3, Y3
VMOVDQA Y3, (BP)
// Stream for up to 192 bytes
VPERM2I128 $0x13, Y0, Y14, Y0
VPERM2I128 $0x13, Y12, Y4, Y14
VPERM2I128 $0x02, Y5, Y9, Y12
VPERM2I128 $0x02, Y13, Y1, Y4
VPERM2I128 $0x13, Y5, Y9, Y5
VPERM2I128 $0x13, Y13, Y1, Y9
openAVX2ShortOpen:
// Hash AAD
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openAVX2ShortOpenLoop:
CMPQ BX, $0x20
JB openAVX2ShortTail32
SUBQ $0x20, BX
// Load for hashing
ADDQ (SI), R10
ADCQ 8(SI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
ADDQ 16(SI), R10
ADCQ 24(SI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Load for decryption
VPXOR (SI), Y0, Y0
VMOVDQU Y0, (DI)
LEAQ 32(SI), SI
LEAQ 32(DI), DI
// Shift stream left
VMOVDQA Y14, Y0
VMOVDQA Y12, Y14
VMOVDQA Y4, Y12
VMOVDQA Y5, Y4
VMOVDQA Y9, Y5
VMOVDQA Y13, Y9
VMOVDQA Y1, Y13
VMOVDQA Y6, Y1
VMOVDQA Y10, Y6
JMP openAVX2ShortOpenLoop
openAVX2ShortTail32:
CMPQ BX, $0x10
VMOVDQA X0, X1
JB openAVX2ShortDone
SUBQ $0x10, BX
// Load for hashing
ADDQ (SI), R10
ADCQ 8(SI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Load for decryption
VPXOR (SI), X0, X12
VMOVDQU X12, (DI)
LEAQ 16(SI), SI
LEAQ 16(DI), DI
VPERM2I128 $0x11, Y0, Y0, Y0
VMOVDQA X0, X1
openAVX2ShortDone:
VZEROUPPER
JMP openSSETail16
openAVX2320:
VMOVDQA Y0, Y5
VMOVDQA Y14, Y9
VMOVDQA Y12, Y13
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y0, Y6
VMOVDQA Y14, Y10
VMOVDQA Y12, Y8
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VMOVDQA Y14, Y7
VMOVDQA Y12, Y11
VMOVDQA Y4, Y15
MOVQ $0x0000000a, R9
openAVX2320InnerCipherLoop:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
DECQ R9
JNE openAVX2320InnerCipherLoop
VMOVDQA ·chacha20Constants<>+0(SB), Y3
VPADDD Y3, Y0, Y0
VPADDD Y3, Y5, Y5
VPADDD Y3, Y6, Y6
VPADDD Y7, Y14, Y14
VPADDD Y7, Y9, Y9
VPADDD Y7, Y10, Y10
VPADDD Y11, Y12, Y12
VPADDD Y11, Y13, Y13
VPADDD Y11, Y8, Y8
VMOVDQA ·avx2IncMask<>+0(SB), Y3
VPADDD Y15, Y4, Y4
VPADDD Y3, Y15, Y15
VPADDD Y15, Y1, Y1
VPADDD Y3, Y15, Y15
VPADDD Y15, Y2, Y2
// Clamp and store poly key
VPERM2I128 $0x02, Y0, Y14, Y3
VPAND ·polyClampMask<>+0(SB), Y3, Y3
VMOVDQA Y3, (BP)
// Stream for up to 320 bytes
VPERM2I128 $0x13, Y0, Y14, Y0
VPERM2I128 $0x13, Y12, Y4, Y14
VPERM2I128 $0x02, Y5, Y9, Y12
VPERM2I128 $0x02, Y13, Y1, Y4
VPERM2I128 $0x13, Y5, Y9, Y5
VPERM2I128 $0x13, Y13, Y1, Y9
VPERM2I128 $0x02, Y6, Y10, Y13
VPERM2I128 $0x02, Y8, Y2, Y1
VPERM2I128 $0x13, Y6, Y10, Y6
VPERM2I128 $0x13, Y8, Y2, Y10
JMP openAVX2ShortOpen
openAVX2Tail128:
// Need to decrypt up to 128 bytes - prepare two blocks
VMOVDQA ·chacha20Constants<>+0(SB), Y5
VMOVDQA 32(BP), Y9
VMOVDQA 64(BP), Y13
VMOVDQA 192(BP), Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y1
VMOVDQA Y1, Y4
XORQ R9, R9
MOVQ BX, CX
ANDQ $-16, CX
TESTQ CX, CX
JE openAVX2Tail128LoopB
openAVX2Tail128LoopA:
ADDQ (SI)(R9*1), R10
ADCQ 8(SI)(R9*1), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
openAVX2Tail128LoopB:
ADDQ $0x10, R9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y1, Y1, Y1
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y1, Y1, Y1
CMPQ R9, CX
JB openAVX2Tail128LoopA
CMPQ R9, $0xa0
JNE openAVX2Tail128LoopB
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD 32(BP), Y9, Y9
VPADDD 64(BP), Y13, Y13
VPADDD Y4, Y1, Y1
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
openAVX2TailLoop:
CMPQ BX, $0x20
JB openAVX2Tail
SUBQ $0x20, BX
// Load for decryption
VPXOR (SI), Y0, Y0
VMOVDQU Y0, (DI)
LEAQ 32(SI), SI
LEAQ 32(DI), DI
VMOVDQA Y14, Y0
VMOVDQA Y12, Y14
VMOVDQA Y4, Y12
JMP openAVX2TailLoop
openAVX2Tail:
CMPQ BX, $0x10
VMOVDQA X0, X1
JB openAVX2TailDone
SUBQ $0x10, BX
// Load for decryption
VPXOR (SI), X0, X12
VMOVDQU X12, (DI)
LEAQ 16(SI), SI
LEAQ 16(DI), DI
VPERM2I128 $0x11, Y0, Y0, Y0
VMOVDQA X0, X1
openAVX2TailDone:
VZEROUPPER
JMP openSSETail16
openAVX2Tail256:
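	// Need to decrypt up to 256 bytes - prepare four blocks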
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y4, Y7
VMOVDQA Y1, Y11
// Compute the number of iterations that will hash data
MOVQ BX, 224(BP)
MOVQ BX, CX
SUBQ $0x80, CX
SHRQ $0x04, CX
MOVQ $0x0000000a, R9
CMPQ CX, $0x0a
CMOVQGT R9, CX
MOVQ SI, BX
XORQ R9, R9
openAVX2Tail256LoopA:
ADDQ (BX), R10
ADCQ 8(BX), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(BX), BX
openAVX2Tail256LoopB:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
INCQ R9
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
CMPQ R9, CX
JB openAVX2Tail256LoopA
CMPQ R9, $0x0a
JNE openAVX2Tail256LoopB
MOVQ BX, R9
SUBQ SI, BX
MOVQ BX, CX
MOVQ 224(BP), BX
openAVX2Tail256Hash:
ADDQ $0x10, CX
CMPQ CX, BX
JGT openAVX2Tail256HashEnd
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(R9), R9
JMP openAVX2Tail256Hash
openAVX2Tail256HashEnd:
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD Y7, Y4, Y4
VPADDD Y11, Y1, Y1
VPERM2I128 $0x02, Y0, Y14, Y6
VPERM2I128 $0x02, Y12, Y4, Y10
VPERM2I128 $0x13, Y0, Y14, Y8
VPERM2I128 $0x13, Y12, Y4, Y2
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR (SI), Y6, Y6
VPXOR 32(SI), Y10, Y10
VPXOR 64(SI), Y8, Y8
VPXOR 96(SI), Y2, Y2
VMOVDQU Y6, (DI)
VMOVDQU Y10, 32(DI)
VMOVDQU Y8, 64(DI)
VMOVDQU Y2, 96(DI)
LEAQ 128(SI), SI
LEAQ 128(DI), DI
SUBQ $0x80, BX
JMP openAVX2TailLoop
openAVX2Tail384:
// Need to decrypt up to 384 bytes - prepare six blocks
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
// Compute the number of iterations that will hash two blocks of data
MOVQ BX, 224(BP)
MOVQ BX, CX
SUBQ $0x00000100, CX
SHRQ $0x04, CX
ADDQ $0x06, CX
MOVQ $0x0000000a, R9
CMPQ CX, $0x0a
CMOVQGT R9, CX
MOVQ SI, BX
XORQ R9, R9
openAVX2Tail384LoopB:
ADDQ (BX), R10
ADCQ 8(BX), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(BX), BX
openAVX2Tail384LoopA:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
ADDQ (BX), R10
ADCQ 8(BX), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(BX), BX
INCQ R9
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
CMPQ R9, CX
JB openAVX2Tail384LoopB
CMPQ R9, $0x0a
JNE openAVX2Tail384LoopA
MOVQ BX, R9
SUBQ SI, BX
MOVQ BX, CX
MOVQ 224(BP), BX
openAVX2Tail384Hash:
ADDQ $0x10, CX
CMPQ CX, BX
JGT openAVX2Tail384HashEnd
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(R9), R9
JMP openAVX2Tail384Hash
openAVX2Tail384HashEnd:
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPERM2I128 $0x02, Y0, Y14, Y3
VPERM2I128 $0x02, Y12, Y4, Y7
VPERM2I128 $0x13, Y0, Y14, Y11
VPERM2I128 $0x13, Y12, Y4, Y15
VPXOR (SI), Y3, Y3
VPXOR 32(SI), Y7, Y7
VPXOR 64(SI), Y11, Y11
VPXOR 96(SI), Y15, Y15
VMOVDQU Y3, (DI)
VMOVDQU Y7, 32(DI)
VMOVDQU Y11, 64(DI)
VMOVDQU Y15, 96(DI)
VPERM2I128 $0x02, Y5, Y9, Y3
VPERM2I128 $0x02, Y13, Y1, Y7
VPERM2I128 $0x13, Y5, Y9, Y11
VPERM2I128 $0x13, Y13, Y1, Y15
VPXOR 128(SI), Y3, Y3
VPXOR 160(SI), Y7, Y7
VPXOR 192(SI), Y11, Y11
VPXOR 224(SI), Y15, Y15
VMOVDQU Y3, 128(DI)
VMOVDQU Y7, 160(DI)
VMOVDQU Y11, 192(DI)
VMOVDQU Y15, 224(DI)
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
LEAQ 256(SI), SI
LEAQ 256(DI), DI
SUBQ $0x00000100, BX
JMP openAVX2TailLoop
openAVX2Tail512:
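	// Need to decrypt up to 512 bytes - prepare eight blocks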
VMOVDQU ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
XORQ CX, CX
MOVQ SI, R9
openAVX2Tail512LoopB:
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(R9), R9
openAVX2Tail512LoopA:
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x0c, Y3, Y3, Y3
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
ADDQ 16(R9), R10
ADCQ 24(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(R9), R9
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x04, Y3, Y3, Y3
INCQ CX
CMPQ CX, $0x04
JLT openAVX2Tail512LoopB
CMPQ CX, $0x0a
JNE openAVX2Tail512LoopA
MOVQ BX, CX
SUBQ $0x00000180, CX
ANDQ $-16, CX
openAVX2Tail512HashLoop:
TESTQ CX, CX
JE openAVX2Tail512HashEnd
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(R9), R9
SUBQ $0x10, CX
JMP openAVX2Tail512HashLoop
openAVX2Tail512HashEnd:
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 32(BP), Y11, Y11
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 64(BP), Y15, Y15
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPADDD 192(BP), Y3, Y3
VMOVDQA Y15, 224(BP)
VPERM2I128 $0x02, Y0, Y14, Y15
VPERM2I128 $0x13, Y0, Y14, Y14
VPERM2I128 $0x02, Y12, Y4, Y0
VPERM2I128 $0x13, Y12, Y4, Y12
VPXOR (SI), Y15, Y15
VPXOR 32(SI), Y0, Y0
VPXOR 64(SI), Y14, Y14
VPXOR 96(SI), Y12, Y12
VMOVDQU Y15, (DI)
VMOVDQU Y0, 32(DI)
VMOVDQU Y14, 64(DI)
VMOVDQU Y12, 96(DI)
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR 128(SI), Y0, Y0
VPXOR 160(SI), Y14, Y14
VPXOR 192(SI), Y12, Y12
VPXOR 224(SI), Y4, Y4
VMOVDQU Y0, 128(DI)
VMOVDQU Y14, 160(DI)
VMOVDQU Y12, 192(DI)
VMOVDQU Y4, 224(DI)
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
VPXOR 256(SI), Y0, Y0
VPXOR 288(SI), Y14, Y14
VPXOR 320(SI), Y12, Y12
VPXOR 352(SI), Y4, Y4
VMOVDQU Y0, 256(DI)
VMOVDQU Y14, 288(DI)
VMOVDQU Y12, 320(DI)
VMOVDQU Y4, 352(DI)
VPERM2I128 $0x02, Y7, Y11, Y0
VPERM2I128 $0x02, 224(BP), Y3, Y14
VPERM2I128 $0x13, Y7, Y11, Y12
VPERM2I128 $0x13, 224(BP), Y3, Y4
LEAQ 384(SI), SI
LEAQ 384(DI), DI
SUBQ $0x00000180, BX
JMP openAVX2TailLoop
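// Constant and mask tables used by the SSE and AVX2 code paths in this file.
// The ChaCha20 constant words spell "expand 32-byte k" and are duplicated across both 128-bit lanes.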
DATA ·chacha20Constants<>+0(SB)/4, $0x61707865
DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574
DATA ·chacha20Constants<>+16(SB)/4, $0x61707865
DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574
GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32
DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff
DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff
GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32
DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001
DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000
GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16
DATA ·andMask<>+0(SB)/8, $0x00000000000000ff
DATA ·andMask<>+8(SB)/8, $0x0000000000000000
DATA ·andMask<>+16(SB)/8, $0x000000000000ffff
DATA ·andMask<>+24(SB)/8, $0x0000000000000000
DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+40(SB)/8, $0x0000000000000000
DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+56(SB)/8, $0x0000000000000000
DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+72(SB)/8, $0x0000000000000000
DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+88(SB)/8, $0x0000000000000000
DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff
DATA ·andMask<>+104(SB)/8, $0x0000000000000000
DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+120(SB)/8, $0x0000000000000000
DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+136(SB)/8, $0x00000000000000ff
DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+152(SB)/8, $0x000000000000ffff
DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff
GLOBL ·andMask<>(SB), RODATA|NOPTR, $240
DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000
DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001
DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32
DATA ·rol16<>+0(SB)/8, $0x0504070601000302
DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
DATA ·rol16<>+16(SB)/8, $0x0504070601000302
DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
GLOBL ·rol16<>(SB), RODATA|NOPTR, $32
DATA ·rol8<>+0(SB)/8, $0x0605040702010003
DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
DATA ·rol8<>+16(SB)/8, $0x0605040702010003
DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
GLOBL ·rol8<>(SB), RODATA|NOPTR, $32
DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002
DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002
DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32
// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
// Requires: AVX, AVX2, BMI2, CMOV, SSE2
TEXT ·chacha20Poly1305Seal(SB), $288-96
MOVQ SP, BP
ADDQ $0x20, BP
ANDQ $-32, BP
MOVQ dst_base+0(FP), DI
MOVQ key_base+24(FP), R8
MOVQ src_base+48(FP), SI
MOVQ src_len+56(FP), BX
MOVQ ad_base+72(FP), CX
CMPB ·useAVX2+0(SB), $0x01
JE chacha20Poly1305Seal_AVX2
// Special optimization, for very short buffers
CMPQ BX, $0x80
JBE sealSSE128
// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
MOVOU ·chacha20Constants<>+0(SB), X0
MOVOU 16(R8), X3
MOVOU 32(R8), X6
MOVOU 48(R8), X9
// Store state on stack for future use
MOVO X3, 32(BP)
MOVO X6, 48(BP)
// Load state, increment counter blocks
MOVO X0, X1
MOVO X3, X4
MOVO X6, X7
MOVO X9, X10
PADDL ·sseIncMask<>+0(SB), X10
MOVO X1, X2
MOVO X4, X5
MOVO X7, X8
MOVO X10, X11
PADDL ·sseIncMask<>+0(SB), X11
MOVO X2, X12
MOVO X5, X13
MOVO X8, X14
MOVO X11, X15
PADDL ·sseIncMask<>+0(SB), X15
// Store counters
MOVO X9, 80(BP)
MOVO X10, 96(BP)
MOVO X11, 112(BP)
MOVO X15, 128(BP)
MOVQ $0x0000000a, R9
sealSSEIntroLoop:
MOVO X14, 64(BP)
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x0c, X14
PSRLL $0x14, X3
PXOR X14, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x07, X14
PSRLL $0x19, X3
PXOR X14, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x0c, X14
PSRLL $0x14, X4
PXOR X14, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x07, X14
PSRLL $0x19, X4
PXOR X14, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x0c, X14
PSRLL $0x14, X5
PXOR X14, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x07, X14
PSRLL $0x19, X5
PXOR X14, X5
MOVO 64(BP), X14
MOVO X7, 64(BP)
PADDD X13, X12
PXOR X12, X15
ROL16(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x0c, X7
PSRLL $0x14, X13
PXOR X7, X13
PADDD X13, X12
PXOR X12, X15
ROL8(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x07, X7
PSRLL $0x19, X13
PXOR X7, X13
MOVO 64(BP), X7
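	// Raw byte encodings of PALIGNR: $0x04 on X3/X4/X5/X13, $0x08 on X6/X7/X8/X14,
	// $0x0c on X9/X10/X11/X15 - diagonalize the four-block state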
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x0c
MOVO X14, 64(BP)
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x0c, X14
PSRLL $0x14, X3
PXOR X14, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x07, X14
PSRLL $0x19, X3
PXOR X14, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x0c, X14
PSRLL $0x14, X4
PXOR X14, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x07, X14
PSRLL $0x19, X4
PXOR X14, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x0c, X14
PSRLL $0x14, X5
PXOR X14, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x07, X14
PSRLL $0x19, X5
PXOR X14, X5
MOVO 64(BP), X14
MOVO X7, 64(BP)
PADDD X13, X12
PXOR X12, X15
ROL16(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x0c, X7
PSRLL $0x14, X13
PXOR X7, X13
PADDD X13, X12
PXOR X12, X15
ROL8(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x07, X7
PSRLL $0x19, X13
PXOR X7, X13
MOVO 64(BP), X7
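	// Raw byte encodings of PALIGNR: $0x0c on X3/X4/X5/X13, $0x08 on X6/X7/X8/X14,
	// $0x04 on X9/X10/X11/X15 - restore column order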
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x04
DECQ R9
JNE sealSSEIntroLoop
// Add in the state
PADDD ·chacha20Constants<>+0(SB), X0
PADDD ·chacha20Constants<>+0(SB), X1
PADDD ·chacha20Constants<>+0(SB), X2
PADDD ·chacha20Constants<>+0(SB), X12
PADDD 32(BP), X3
PADDD 32(BP), X4
PADDD 32(BP), X5
PADDD 32(BP), X13
PADDD 48(BP), X7
PADDD 48(BP), X8
PADDD 48(BP), X14
PADDD 96(BP), X10
PADDD 112(BP), X11
PADDD 128(BP), X15
// Clamp and store the key
PAND ·polyClampMask<>+0(SB), X0
MOVO X0, (BP)
MOVO X3, 16(BP)
// Hash AAD
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
MOVOU (SI), X0
MOVOU 16(SI), X3
MOVOU 32(SI), X6
MOVOU 48(SI), X9
PXOR X0, X1
PXOR X3, X4
PXOR X6, X7
PXOR X9, X10
MOVOU X1, (DI)
MOVOU X4, 16(DI)
MOVOU X7, 32(DI)
MOVOU X10, 48(DI)
MOVOU 64(SI), X0
MOVOU 80(SI), X3
MOVOU 96(SI), X6
MOVOU 112(SI), X9
PXOR X0, X2
PXOR X3, X5
PXOR X6, X8
PXOR X9, X11
MOVOU X2, 64(DI)
MOVOU X5, 80(DI)
MOVOU X8, 96(DI)
MOVOU X11, 112(DI)
MOVQ $0x00000080, CX
SUBQ $0x80, BX
LEAQ 128(SI), SI
MOVO X12, X1
MOVO X13, X4
MOVO X14, X7
MOVO X15, X10
CMPQ BX, $0x40
JBE sealSSE128SealHash
MOVOU (SI), X0
MOVOU 16(SI), X3
MOVOU 32(SI), X6
MOVOU 48(SI), X9
PXOR X0, X12
PXOR X3, X13
PXOR X6, X14
PXOR X9, X15
MOVOU X12, 128(DI)
MOVOU X13, 144(DI)
MOVOU X14, 160(DI)
MOVOU X15, 176(DI)
ADDQ $0x40, CX
SUBQ $0x40, BX
LEAQ 64(SI), SI
MOVQ $0x00000002, CX
MOVQ $0x00000008, R9
CMPQ BX, $0x40
JBE sealSSETail64
CMPQ BX, $0x80
JBE sealSSETail128
CMPQ BX, $0xc0
JBE sealSSETail192
sealSSEMainLoop:
// Load state, increment counter blocks
MOVO ·chacha20Constants<>+0(SB), X0
MOVO 32(BP), X3
MOVO 48(BP), X6
MOVO 128(BP), X9
PADDL ·sseIncMask<>+0(SB), X9
MOVO X0, X1
MOVO X3, X4
MOVO X6, X7
MOVO X9, X10
PADDL ·sseIncMask<>+0(SB), X10
MOVO X1, X2
MOVO X4, X5
MOVO X7, X8
MOVO X10, X11
PADDL ·sseIncMask<>+0(SB), X11
MOVO X2, X12
MOVO X5, X13
MOVO X8, X14
MOVO X11, X15
PADDL ·sseIncMask<>+0(SB), X15
// Store counters
MOVO X9, 80(BP)
MOVO X10, 96(BP)
MOVO X11, 112(BP)
MOVO X15, 128(BP)
sealSSEInnerLoop:
MOVO X14, 64(BP)
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x0c, X14
PSRLL $0x14, X3
PXOR X14, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x07, X14
PSRLL $0x19, X3
PXOR X14, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x0c, X14
PSRLL $0x14, X4
PXOR X14, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x07, X14
PSRLL $0x19, X4
PXOR X14, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x0c, X14
PSRLL $0x14, X5
PXOR X14, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x07, X14
PSRLL $0x19, X5
PXOR X14, X5
MOVO 64(BP), X14
MOVO X7, 64(BP)
PADDD X13, X12
PXOR X12, X15
ROL16(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x0c, X7
PSRLL $0x14, X13
PXOR X7, X13
PADDD X13, X12
PXOR X12, X15
ROL8(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x07, X7
PSRLL $0x19, X13
PXOR X7, X13
MOVO 64(BP), X7
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
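	// Raw byte encodings of PALIGNR: $0x04 on X3/X4/X5/X13, $0x08 on X6/X7/X8/X14,
	// $0x0c on X9/X10/X11/X15 - diagonalize the four-block state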
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x0c
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
LEAQ 16(DI), DI
MOVO X14, 64(BP)
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x0c, X14
PSRLL $0x14, X3
PXOR X14, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X14)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X14
PSLLL $0x07, X14
PSRLL $0x19, X3
PXOR X14, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x0c, X14
PSRLL $0x14, X4
PXOR X14, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X14)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X14
PSLLL $0x07, X14
PSRLL $0x19, X4
PXOR X14, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x0c, X14
PSRLL $0x14, X5
PXOR X14, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X14)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X14
PSLLL $0x07, X14
PSRLL $0x19, X5
PXOR X14, X5
MOVO 64(BP), X14
MOVO X7, 64(BP)
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
PADDD X13, X12
PXOR X12, X15
ROL16(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x0c, X7
PSRLL $0x14, X13
PXOR X7, X13
PADDD X13, X12
PXOR X12, X15
ROL8(X15, X7)
PADDD X15, X14
PXOR X14, X13
MOVO X13, X7
PSLLL $0x07, X7
PSRLL $0x19, X13
PXOR X7, X13
MOVO 64(BP), X7
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
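	// Raw byte encodings of PALIGNR: $0x0c on X3/X4/X5/X13, $0x08 on X6/X7/X8/X14,
	// $0x04 on X9/X10/X11/X15 - restore column order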
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x04
DECQ R9
JGE sealSSEInnerLoop
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
DECQ CX
JG sealSSEInnerLoop
// Add in the state
PADDD ·chacha20Constants<>+0(SB), X0
PADDD ·chacha20Constants<>+0(SB), X1
PADDD ·chacha20Constants<>+0(SB), X2
PADDD ·chacha20Constants<>+0(SB), X12
PADDD 32(BP), X3
PADDD 32(BP), X4
PADDD 32(BP), X5
PADDD 32(BP), X13
PADDD 48(BP), X6
PADDD 48(BP), X7
PADDD 48(BP), X8
PADDD 48(BP), X14
PADDD 80(BP), X9
PADDD 96(BP), X10
PADDD 112(BP), X11
PADDD 128(BP), X15
MOVO X15, 64(BP)
// Load - xor - store
MOVOU (SI), X15
PXOR X15, X0
MOVOU 16(SI), X15
PXOR X15, X3
MOVOU 32(SI), X15
PXOR X15, X6
MOVOU 48(SI), X15
PXOR X15, X9
MOVOU X0, (DI)
MOVOU X3, 16(DI)
MOVOU X6, 32(DI)
MOVOU X9, 48(DI)
MOVO 64(BP), X15
MOVOU 64(SI), X0
MOVOU 80(SI), X3
MOVOU 96(SI), X6
MOVOU 112(SI), X9
PXOR X0, X1
PXOR X3, X4
PXOR X6, X7
PXOR X9, X10
MOVOU X1, 64(DI)
MOVOU X4, 80(DI)
MOVOU X7, 96(DI)
MOVOU X10, 112(DI)
MOVOU 128(SI), X0
MOVOU 144(SI), X3
MOVOU 160(SI), X6
MOVOU 176(SI), X9
PXOR X0, X2
PXOR X3, X5
PXOR X6, X8
PXOR X9, X11
MOVOU X2, 128(DI)
MOVOU X5, 144(DI)
MOVOU X8, 160(DI)
MOVOU X11, 176(DI)
ADDQ $0xc0, SI
MOVQ $0x000000c0, CX
SUBQ $0xc0, BX
MOVO X12, X1
MOVO X13, X4
MOVO X14, X7
MOVO X15, X10
CMPQ BX, $0x40
JBE sealSSE128SealHash
MOVOU (SI), X0
MOVOU 16(SI), X3
MOVOU 32(SI), X6
MOVOU 48(SI), X9
PXOR X0, X12
PXOR X3, X13
PXOR X6, X14
PXOR X9, X15
MOVOU X12, 192(DI)
MOVOU X13, 208(DI)
MOVOU X14, 224(DI)
MOVOU X15, 240(DI)
LEAQ 64(SI), SI
SUBQ $0x40, BX
MOVQ $0x00000006, CX
MOVQ $0x00000004, R9
CMPQ BX, $0xc0
JG sealSSEMainLoop
MOVQ BX, CX
TESTQ BX, BX
JE sealSSE128SealHash
MOVQ $0x00000006, CX
CMPQ BX, $0x40
JBE sealSSETail64
CMPQ BX, $0x80
JBE sealSSETail128
JMP sealSSETail192
sealSSETail64:
MOVO ·chacha20Constants<>+0(SB), X1
MOVO 32(BP), X4
MOVO 48(BP), X7
MOVO 128(BP), X10
PADDL ·sseIncMask<>+0(SB), X10
MOVO X10, 80(BP)
sealSSETail64LoopA:
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
sealSSETail64LoopB:
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X13)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X13
PSLLL $0x0c, X13
PSRLL $0x14, X4
PXOR X13, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X13)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X13
PSLLL $0x07, X13
PSRLL $0x19, X4
PXOR X13, X4
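	// Raw byte encodings of PALIGNR $0x04, X4, X4; $0x08, X7, X7; $0x0c, X10, X10 - diagonalize the block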
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x0c
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X13)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X13
PSLLL $0x0c, X13
PSRLL $0x14, X4
PXOR X13, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X13)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X13
PSLLL $0x07, X13
PSRLL $0x19, X4
PXOR X13, X4
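	// Raw byte encodings of PALIGNR $0x0c, X4, X4; $0x08, X7, X7; $0x04, X10, X10 - restore column order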
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x04
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
DECQ CX
JG sealSSETail64LoopA
DECQ R9
JGE sealSSETail64LoopB
PADDL ·chacha20Constants<>+0(SB), X1
PADDL 32(BP), X4
PADDL 48(BP), X7
PADDL 80(BP), X10
JMP sealSSE128Seal
sealSSETail128:
MOVO ·chacha20Constants<>+0(SB), X0
MOVO 32(BP), X3
MOVO 48(BP), X6
MOVO 128(BP), X9
PADDL ·sseIncMask<>+0(SB), X9
MOVO X9, 80(BP)
MOVO X0, X1
MOVO X3, X4
MOVO X6, X7
MOVO X9, X10
PADDL ·sseIncMask<>+0(SB), X10
MOVO X10, 96(BP)
sealSSETail128LoopA:
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
sealSSETail128LoopB:
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
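	// Raw byte encodings of PALIGNR: $0x04 on X3/X4, $0x08 on X6/X7, $0x0c on X9/X10 - diagonalize both blocks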
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x0c
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
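	// Raw byte encodings of PALIGNR: $0x0c on X3/X4, $0x08 on X6/X7, $0x04 on X9/X10 - restore column order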
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x04
DECQ CX
JG sealSSETail128LoopA
DECQ R9
JGE sealSSETail128LoopB
PADDL ·chacha20Constants<>+0(SB), X0
PADDL ·chacha20Constants<>+0(SB), X1
PADDL 32(BP), X3
PADDL 32(BP), X4
PADDL 48(BP), X6
PADDL 48(BP), X7
PADDL 80(BP), X9
PADDL 96(BP), X10
MOVOU (SI), X12
MOVOU 16(SI), X13
MOVOU 32(SI), X14
MOVOU 48(SI), X15
PXOR X12, X0
PXOR X13, X3
PXOR X14, X6
PXOR X15, X9
MOVOU X0, (DI)
MOVOU X3, 16(DI)
MOVOU X6, 32(DI)
MOVOU X9, 48(DI)
MOVQ $0x00000040, CX
LEAQ 64(SI), SI
SUBQ $0x40, BX
JMP sealSSE128SealHash
sealSSETail192:
MOVO ·chacha20Constants<>+0(SB), X0
MOVO 32(BP), X3
MOVO 48(BP), X6
MOVO 128(BP), X9
PADDL ·sseIncMask<>+0(SB), X9
MOVO X9, 80(BP)
MOVO X0, X1
MOVO X3, X4
MOVO X6, X7
MOVO X9, X10
PADDL ·sseIncMask<>+0(SB), X10
MOVO X10, 96(BP)
MOVO X1, X2
MOVO X4, X5
MOVO X7, X8
MOVO X10, X11
PADDL ·sseIncMask<>+0(SB), X11
MOVO X11, 112(BP)
sealSSETail192LoopA:
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
sealSSETail192LoopB:
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x0c, X12
PSRLL $0x14, X5
PXOR X12, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x07, X12
PSRLL $0x19, X5
PXOR X12, X5
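	// Raw byte encodings of PALIGNR: $0x04 on X3/X4/X5, $0x08 on X6/X7/X8, $0x0c on X9/X10/X11 - diagonalize the three blocks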
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x0c, X12
PSRLL $0x14, X5
PXOR X12, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x07, X12
PSRLL $0x19, X5
PXOR X12, X5
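	// Raw byte encodings of PALIGNR: $0x0c on X3/X4/X5, $0x08 on X6/X7/X8, $0x04 on X9/X10/X11 - restore column order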
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
DECQ CX
JG sealSSETail192LoopA
DECQ R9
JGE sealSSETail192LoopB
PADDL ·chacha20Constants<>+0(SB), X0
PADDL ·chacha20Constants<>+0(SB), X1
PADDL ·chacha20Constants<>+0(SB), X2
PADDL 32(BP), X3
PADDL 32(BP), X4
PADDL 32(BP), X5
PADDL 48(BP), X6
PADDL 48(BP), X7
PADDL 48(BP), X8
PADDL 80(BP), X9
PADDL 96(BP), X10
PADDL 112(BP), X11
MOVOU (SI), X12
MOVOU 16(SI), X13
MOVOU 32(SI), X14
MOVOU 48(SI), X15
PXOR X12, X0
PXOR X13, X3
PXOR X14, X6
PXOR X15, X9
MOVOU X0, (DI)
MOVOU X3, 16(DI)
MOVOU X6, 32(DI)
MOVOU X9, 48(DI)
MOVOU 64(SI), X12
MOVOU 80(SI), X13
MOVOU 96(SI), X14
MOVOU 112(SI), X15
PXOR X12, X1
PXOR X13, X4
PXOR X14, X7
PXOR X15, X10
MOVOU X1, 64(DI)
MOVOU X4, 80(DI)
MOVOU X7, 96(DI)
MOVOU X10, 112(DI)
MOVO X2, X1
MOVO X5, X4
MOVO X8, X7
MOVO X11, X10
MOVQ $0x00000080, CX
LEAQ 128(SI), SI
SUBQ $0x80, BX
JMP sealSSE128SealHash
sealSSE128:
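	// Special seal path for buffers of at most 128 bytes - three blocks of stream
	// cover the Poly1305 key and the message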
MOVOU ·chacha20Constants<>+0(SB), X0
MOVOU 16(R8), X3
MOVOU 32(R8), X6
MOVOU 48(R8), X9
MOVO X0, X1
MOVO X3, X4
MOVO X6, X7
MOVO X9, X10
PADDL ·sseIncMask<>+0(SB), X10
MOVO X1, X2
MOVO X4, X5
MOVO X7, X8
MOVO X10, X11
PADDL ·sseIncMask<>+0(SB), X11
MOVO X3, X13
MOVO X6, X14
MOVO X10, X15
MOVQ $0x0000000a, R9
sealSSE128InnerCipherLoop:
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x0c, X12
PSRLL $0x14, X5
PXOR X12, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x07, X12
PSRLL $0x19, X5
PXOR X12, X5
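	// Raw byte encodings of PALIGNR: $0x04 on X3/X4/X5, $0x08 on X6/X7/X8, $0x0c on X9/X10/X11 - diagonalize the three blocks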
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x04
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x0c
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
PADDD X3, X0
PXOR X0, X9
ROL16(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x0c, X12
PSRLL $0x14, X3
PXOR X12, X3
PADDD X3, X0
PXOR X0, X9
ROL8(X9, X12)
PADDD X9, X6
PXOR X6, X3
MOVO X3, X12
PSLLL $0x07, X12
PSRLL $0x19, X3
PXOR X12, X3
PADDD X4, X1
PXOR X1, X10
ROL16(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x0c, X12
PSRLL $0x14, X4
PXOR X12, X4
PADDD X4, X1
PXOR X1, X10
ROL8(X10, X12)
PADDD X10, X7
PXOR X7, X4
MOVO X4, X12
PSLLL $0x07, X12
PSRLL $0x19, X4
PXOR X12, X4
PADDD X5, X2
PXOR X2, X11
ROL16(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x0c, X12
PSRLL $0x14, X5
PXOR X12, X5
PADDD X5, X2
PXOR X2, X11
ROL8(X11, X12)
PADDD X11, X8
PXOR X8, X5
MOVO X5, X12
PSLLL $0x07, X12
PSRLL $0x19, X5
PXOR X12, X5
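	// Raw byte encodings of PALIGNR: $0x0c on X3/X4/X5, $0x08 on X6/X7/X8, $0x04 on X9/X10/X11 - restore column order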
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xe4
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xed
BYTE $0x0c
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xf6
BYTE $0x08
BYTE $0x66
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xff
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc0
BYTE $0x08
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xc9
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xd2
BYTE $0x04
BYTE $0x66
BYTE $0x45
BYTE $0x0f
BYTE $0x3a
BYTE $0x0f
BYTE $0xdb
BYTE $0x04
DECQ R9
JNE sealSSE128InnerCipherLoop
	// The first block's A and B rows (X0, X3) hold the 32-byte Poly1305 key; its C and D rows can be discarded
PADDL ·chacha20Constants<>+0(SB), X0
PADDL ·chacha20Constants<>+0(SB), X1
PADDL ·chacha20Constants<>+0(SB), X2
PADDL X13, X3
PADDL X13, X4
PADDL X13, X5
PADDL X14, X7
PADDL X14, X8
PADDL X15, X10
PADDL ·sseIncMask<>+0(SB), X15
PADDL X15, X11
PAND ·polyClampMask<>+0(SB), X0
MOVOU X0, (BP)
MOVOU X3, 16(BP)
	// Hash AAD
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
XORQ CX, CX
sealSSE128SealHash:
CMPQ CX, $0x10
JB sealSSE128Seal
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
SUBQ $0x10, CX
ADDQ $0x10, DI
JMP sealSSE128SealHash
sealSSE128Seal:
CMPQ BX, $0x10
JB sealSSETail
SUBQ $0x10, BX
	// Load for encryption
MOVOU (SI), X12
PXOR X12, X1
MOVOU X1, (DI)
LEAQ 16(SI), SI
LEAQ 16(DI), DI
// Extract for hashing
MOVQ X1, R13
PSRLDQ $0x08, X1
MOVQ X1, R14
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Shift the stream "left"
MOVO X4, X1
MOVO X7, X4
MOVO X10, X7
MOVO X2, X10
MOVO X5, X2
MOVO X8, X5
MOVO X11, X8
JMP sealSSE128Seal
sealSSETail:
TESTQ BX, BX
JE sealSSEFinalize
	// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
MOVQ BX, R9
SHLQ $0x04, R9
LEAQ ·andMask<>+0(SB), R13
MOVQ BX, CX
LEAQ -1(SI)(BX*1), SI
XORQ R15, R15
XORQ R8, R8
XORQ AX, AX
sealSSETailLoadLoop:
SHLQ $0x08, R15, R8
SHLQ $0x08, R15
MOVB (SI), AX
XORQ AX, R15
LEAQ -1(SI), SI
DECQ CX
JNE sealSSETailLoadLoop
MOVQ R15, 64(BP)
MOVQ R8, 72(BP)
PXOR 64(BP), X1
MOVOU X1, (DI)
MOVOU -16(R13)(R9*1), X12
PAND X12, X1
MOVQ X1, R13
PSRLDQ $0x08, X1
MOVQ X1, R14
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
ADDQ BX, DI
sealSSEFinalize:
// Hash in the buffer lengths
ADDQ ad_len+80(FP), R10
ADCQ src_len+56(FP), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Final reduce
MOVQ R10, R13
MOVQ R11, R14
MOVQ R12, R15
SUBQ $-5, R10
SBBQ $-1, R11
SBBQ $0x03, R12
CMOVQCS R13, R10
CMOVQCS R14, R11
CMOVQCS R15, R12
// Add in the "s" part of the key
ADDQ 16(BP), R10
ADCQ 24(BP), R11
// Finally store the tag at the end of the message
MOVQ R10, (DI)
MOVQ R11, 8(DI)
RET
chacha20Poly1305Seal_AVX2:
VZEROUPPER
VMOVDQU ·chacha20Constants<>+0(SB), Y0
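	// Raw VEX encodings of VBROADCASTI128 16(R8), Y14; VBROADCASTI128 32(R8), Y12; VBROADCASTI128 48(R8), Y4
	// - broadcast the key and counter/nonce rows of the state into both 128-bit lanes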
BYTE $0xc4
BYTE $0x42
BYTE $0x7d
BYTE $0x5a
BYTE $0x70
BYTE $0x10
BYTE $0xc4
BYTE $0x42
BYTE $0x7d
BYTE $0x5a
BYTE $0x60
BYTE $0x20
BYTE $0xc4
BYTE $0xc2
BYTE $0x7d
BYTE $0x5a
BYTE $0x60
BYTE $0x30
VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
// Special optimizations for very short buffers
CMPQ BX, $0x000000c0
JBE seal192AVX2
CMPQ BX, $0x00000140
JBE seal320AVX2
// For the general case, prepare the poly key first - as a byproduct we have 64 bytes of cipher stream
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA Y14, 32(BP)
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA Y12, 64(BP)
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y4, 96(BP)
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VMOVDQA Y1, 128(BP)
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
MOVQ $0x0000000a, R9
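// The intro loop runs the ten ChaCha20 double rounds (20 rounds) over
// four interleaved 2-block states with no Poly1305 work, since the
// one-time poly key is only derived from this first batch of keystream.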
sealAVX2IntroLoop:
VMOVDQA Y15, 224(BP)
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VMOVDQA 224(BP), Y15
VMOVDQA Y13, 224(BP)
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x0c, Y11, Y13
VPSRLD $0x14, Y11, Y11
VPXOR Y13, Y11, Y11
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x07, Y11, Y13
VPSRLD $0x19, Y11, Y11
VPXOR Y13, Y11, Y11
VMOVDQA 224(BP), Y13
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y3, Y3, Y3
VMOVDQA Y15, 224(BP)
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VMOVDQA 224(BP), Y15
VMOVDQA Y13, 224(BP)
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x0c, Y11, Y13
VPSRLD $0x14, Y11, Y11
VPXOR Y13, Y11, Y11
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x07, Y11, Y13
VPSRLD $0x19, Y11, Y11
VPXOR Y13, Y11, Y11
VMOVDQA 224(BP), Y13
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y3, Y3, Y3
DECQ R9
JNE sealAVX2IntroLoop
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 32(BP), Y11, Y11
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 64(BP), Y15, Y15
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPADDD 192(BP), Y3, Y3
VPERM2I128 $0x13, Y12, Y4, Y12
VPERM2I128 $0x02, Y0, Y14, Y4
VPERM2I128 $0x13, Y0, Y14, Y0
// Clamp and store poly key
VPAND ·polyClampMask<>+0(SB), Y4, Y4
VMOVDQA Y4, (BP)
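// polyClampMask clears the 22 bits of r that RFC 8439 requires to be
// zero (the top 4 bits of each 32-bit word and the low 2 bits of the
// last three words) and leaves the s half untouched. A minimal Go
// sketch of the same clamp, assuming key is a []byte holding r||s:
//
//	key[3] &= 15
//	key[7] &= 15
//	key[11] &= 15
//	key[15] &= 15
//	key[4] &= 252
//	key[8] &= 252
//	key[12] &= 252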
// Hash AD
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
// Can store at least 320 bytes
VPXOR (SI), Y0, Y0
VPXOR 32(SI), Y12, Y12
VMOVDQU Y0, (DI)
VMOVDQU Y12, 32(DI)
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR 64(SI), Y0, Y0
VPXOR 96(SI), Y14, Y14
VPXOR 128(SI), Y12, Y12
VPXOR 160(SI), Y4, Y4
VMOVDQU Y0, 64(DI)
VMOVDQU Y14, 96(DI)
VMOVDQU Y12, 128(DI)
VMOVDQU Y4, 160(DI)
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
VPXOR 192(SI), Y0, Y0
VPXOR 224(SI), Y14, Y14
VPXOR 256(SI), Y12, Y12
VPXOR 288(SI), Y4, Y4
VMOVDQU Y0, 192(DI)
VMOVDQU Y14, 224(DI)
VMOVDQU Y12, 256(DI)
VMOVDQU Y4, 288(DI)
MOVQ $0x00000140, CX
SUBQ $0x00000140, BX
LEAQ 320(SI), SI
VPERM2I128 $0x02, Y7, Y11, Y0
VPERM2I128 $0x02, Y15, Y3, Y14
VPERM2I128 $0x13, Y7, Y11, Y12
VPERM2I128 $0x13, Y15, Y3, Y4
CMPQ BX, $0x80
JBE sealAVX2SealHash
VPXOR (SI), Y0, Y0
VPXOR 32(SI), Y14, Y14
VPXOR 64(SI), Y12, Y12
VPXOR 96(SI), Y4, Y4
VMOVDQU Y0, 320(DI)
VMOVDQU Y14, 352(DI)
VMOVDQU Y12, 384(DI)
VMOVDQU Y4, 416(DI)
SUBQ $0x80, BX
LEAQ 128(SI), SI
MOVQ $0x00000008, CX
MOVQ $0x00000002, R9
CMPQ BX, $0x80
JBE sealAVX2Tail128
CMPQ BX, $0x00000100
JBE sealAVX2Tail256
CMPQ BX, $0x00000180
JBE sealAVX2Tail384
CMPQ BX, $0x00000200
JBE sealAVX2Tail512
// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
VMOVDQA Y15, 224(BP)
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VMOVDQA 224(BP), Y15
VMOVDQA Y13, 224(BP)
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x0c, Y11, Y13
VPSRLD $0x14, Y11, Y11
VPXOR Y13, Y11, Y11
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x07, Y11, Y13
VPSRLD $0x19, Y11, Y11
VPXOR Y13, Y11, Y11
VMOVDQA 224(BP), Y13
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y3, Y3, Y3
VMOVDQA Y15, 224(BP)
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VMOVDQA 224(BP), Y15
VMOVDQA Y13, 224(BP)
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x0c, Y11, Y13
VPSRLD $0x14, Y11, Y11
VPXOR Y13, Y11, Y11
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x07, Y11, Y13
VPSRLD $0x19, Y11, Y11
VPXOR Y13, Y11, Y11
VMOVDQA 224(BP), Y13
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y3, Y3, Y3
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
SUBQ $0x10, DI
MOVQ $0x00000009, CX
JMP sealAVX2InternalLoopStart
sealAVX2MainLoop:
VMOVDQU ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
MOVQ $0x0000000a, CX
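// Each pass of the loop below performs one ChaCha20 double round on the
// four 2-block states while hashing 48 bytes (three Poly1305 blocks) of
// previously written ciphertext, so the ten passes hash 480 of the 512
// bytes produced per main-loop iteration; the remaining 32 bytes are
// hashed once the keystream has been finalized.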
sealAVX2InternalLoop:
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
sealAVX2InternalLoopStart:
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x0c, Y3, Y3, Y3
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
ADDQ 32(DI), R10
ADCQ 40(DI), R11
ADCQ $0x01, R12
LEAQ 48(DI), DI
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x04, Y3, Y3, Y3
DECQ CX
JNE sealAVX2InternalLoop
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 32(BP), Y11, Y11
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 64(BP), Y15, Y15
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPADDD 192(BP), Y3, Y3
VMOVDQA Y15, 224(BP)
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
VPERM2I128 $0x02, Y0, Y14, Y15
VPERM2I128 $0x13, Y0, Y14, Y14
VPERM2I128 $0x02, Y12, Y4, Y0
VPERM2I128 $0x13, Y12, Y4, Y12
VPXOR (SI), Y15, Y15
VPXOR 32(SI), Y0, Y0
VPXOR 64(SI), Y14, Y14
VPXOR 96(SI), Y12, Y12
VMOVDQU Y15, (DI)
VMOVDQU Y0, 32(DI)
VMOVDQU Y14, 64(DI)
VMOVDQU Y12, 96(DI)
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR 128(SI), Y0, Y0
VPXOR 160(SI), Y14, Y14
VPXOR 192(SI), Y12, Y12
VPXOR 224(SI), Y4, Y4
VMOVDQU Y0, 128(DI)
VMOVDQU Y14, 160(DI)
VMOVDQU Y12, 192(DI)
VMOVDQU Y4, 224(DI)
// ... and hash the other 16 of those bytes here
ADDQ -16(DI), R10
ADCQ -8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
VPXOR 256(SI), Y0, Y0
VPXOR 288(SI), Y14, Y14
VPXOR 320(SI), Y12, Y12
VPXOR 352(SI), Y4, Y4
VMOVDQU Y0, 256(DI)
VMOVDQU Y14, 288(DI)
VMOVDQU Y12, 320(DI)
VMOVDQU Y4, 352(DI)
VPERM2I128 $0x02, Y7, Y11, Y0
VPERM2I128 $0x02, 224(BP), Y3, Y14
VPERM2I128 $0x13, Y7, Y11, Y12
VPERM2I128 $0x13, 224(BP), Y3, Y4
VPXOR 384(SI), Y0, Y0
VPXOR 416(SI), Y14, Y14
VPXOR 448(SI), Y12, Y12
VPXOR 480(SI), Y4, Y4
VMOVDQU Y0, 384(DI)
VMOVDQU Y14, 416(DI)
VMOVDQU Y12, 448(DI)
VMOVDQU Y4, 480(DI)
LEAQ 512(SI), SI
SUBQ $0x00000200, BX
CMPQ BX, $0x00000200
JG sealAVX2MainLoop
// The tail can only hash 480 bytes, so hash the remaining 32 bytes here
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
MOVQ $0x0000000a, CX
MOVQ $0x00000000, R9
CMPQ BX, $0x80
JBE sealAVX2Tail128
CMPQ BX, $0x00000100
JBE sealAVX2Tail256
CMPQ BX, $0x00000180
JBE sealAVX2Tail384
JMP sealAVX2Tail512
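// seal192AVX2 and seal320AVX2 are the short-buffer paths: they run two
// or three 2-block ChaCha20 states, clamp the first 32 bytes of
// keystream into the poly key, and keep up to 192 or 320 bytes of
// stream in registers for sealAVX2ShortSeal.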
seal192AVX2:
VMOVDQA Y0, Y5
VMOVDQA Y14, Y9
VMOVDQA Y12, Y13
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y0, Y6
VMOVDQA Y14, Y10
VMOVDQA Y12, Y8
VMOVDQA Y4, Y2
VMOVDQA Y1, Y15
MOVQ $0x0000000a, R9
sealAVX2192InnerCipherLoop:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
DECQ R9
JNE sealAVX2192InnerCipherLoop
VPADDD Y6, Y0, Y0
VPADDD Y6, Y5, Y5
VPADDD Y10, Y14, Y14
VPADDD Y10, Y9, Y9
VPADDD Y8, Y12, Y12
VPADDD Y8, Y13, Y13
VPADDD Y2, Y4, Y4
VPADDD Y15, Y1, Y1
VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
VPAND ·polyClampMask<>+0(SB), Y3, Y3
VMOVDQA Y3, (BP)
// Stream for up to 192 bytes
VPERM2I128 $0x13, Y0, Y14, Y0
VPERM2I128 $0x13, Y12, Y4, Y14
VPERM2I128 $0x02, Y5, Y9, Y12
VPERM2I128 $0x02, Y13, Y1, Y4
VPERM2I128 $0x13, Y5, Y9, Y5
VPERM2I128 $0x13, Y13, Y1, Y9
sealAVX2ShortSeal:
// Hash AD
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
XORQ CX, CX
sealAVX2SealHash:
// CX (itr1) holds the number of bytes encrypted but not yet hashed
CMPQ CX, $0x10
JB sealAVX2ShortSealLoop
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
SUBQ $0x10, CX
ADDQ $0x10, DI
JMP sealAVX2SealHash
sealAVX2ShortSealLoop:
CMPQ BX, $0x20
JB sealAVX2ShortTail32
SUBQ $0x20, BX
// Load for encryption
VPXOR (SI), Y0, Y0
VMOVDQU Y0, (DI)
LEAQ 32(SI), SI
// Now we can hash the ciphertext just written
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
// Shift the keystream left so the next 32 bytes land in Y0
VMOVDQA Y14, Y0
VMOVDQA Y12, Y14
VMOVDQA Y4, Y12
VMOVDQA Y5, Y4
VMOVDQA Y9, Y5
VMOVDQA Y13, Y9
VMOVDQA Y1, Y13
VMOVDQA Y6, Y1
VMOVDQA Y10, Y6
JMP sealAVX2ShortSealLoop
sealAVX2ShortTail32:
CMPQ BX, $0x10
VMOVDQA X0, X1
JB sealAVX2ShortDone
SUBQ $0x10, BX
// Load for encryption
VPXOR (SI), X0, X12
VMOVDQU X12, (DI)
LEAQ 16(SI), SI
// Hash
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
VPERM2I128 $0x11, Y0, Y0, Y0
VMOVDQA X0, X1
sealAVX2ShortDone:
VZEROUPPER
JMP sealSSETail
seal320AVX2:
VMOVDQA Y0, Y5
VMOVDQA Y14, Y9
VMOVDQA Y12, Y13
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y0, Y6
VMOVDQA Y14, Y10
VMOVDQA Y12, Y8
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VMOVDQA Y14, Y7
VMOVDQA Y12, Y11
VMOVDQA Y4, Y15
MOVQ $0x0000000a, R9
sealAVX2320InnerCipherLoop:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
DECQ R9
JNE sealAVX2320InnerCipherLoop
VMOVDQA ·chacha20Constants<>+0(SB), Y3
VPADDD Y3, Y0, Y0
VPADDD Y3, Y5, Y5
VPADDD Y3, Y6, Y6
VPADDD Y7, Y14, Y14
VPADDD Y7, Y9, Y9
VPADDD Y7, Y10, Y10
VPADDD Y11, Y12, Y12
VPADDD Y11, Y13, Y13
VPADDD Y11, Y8, Y8
VMOVDQA ·avx2IncMask<>+0(SB), Y3
VPADDD Y15, Y4, Y4
VPADDD Y3, Y15, Y15
VPADDD Y15, Y1, Y1
VPADDD Y3, Y15, Y15
VPADDD Y15, Y2, Y2
// Clamp and store poly key
VPERM2I128 $0x02, Y0, Y14, Y3
VPAND ·polyClampMask<>+0(SB), Y3, Y3
VMOVDQA Y3, (BP)
// Stream for up to 320 bytes
VPERM2I128 $0x13, Y0, Y14, Y0
VPERM2I128 $0x13, Y12, Y4, Y14
VPERM2I128 $0x02, Y5, Y9, Y12
VPERM2I128 $0x02, Y13, Y1, Y4
VPERM2I128 $0x13, Y5, Y9, Y5
VPERM2I128 $0x13, Y13, Y1, Y9
VPERM2I128 $0x02, Y6, Y10, Y13
VPERM2I128 $0x02, Y8, Y2, Y1
VPERM2I128 $0x13, Y6, Y10, Y6
VPERM2I128 $0x13, Y8, Y2, Y10
JMP sealAVX2ShortSeal
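// The sealAVX2Tail* paths below run the remaining ChaCha20 double
// rounds while interleaving Poly1305 updates over ciphertext that has
// already been written but not yet hashed; CX and R9 together determine
// how many blocks get hashed along the way.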
sealAVX2Tail128:
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA 32(BP), Y14
VMOVDQA 64(BP), Y12
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VMOVDQA Y4, Y1
sealAVX2Tail128LoopA:
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
sealAVX2Tail128LoopB:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x0c, Y4, Y4, Y4
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x04, Y4, Y4, Y4
DECQ CX
JG sealAVX2Tail128LoopA
DECQ R9
JGE sealAVX2Tail128LoopB
VPADDD ·chacha20Constants<>+0(SB), Y0, Y5
VPADDD 32(BP), Y14, Y9
VPADDD 64(BP), Y12, Y13
VPADDD Y1, Y4, Y1
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
JMP sealAVX2ShortSealLoop
sealAVX2Tail256:
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA ·chacha20Constants<>+0(SB), Y5
VMOVDQA 32(BP), Y14
VMOVDQA 32(BP), Y9
VMOVDQA 64(BP), Y12
VMOVDQA 64(BP), Y13
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y4, Y7
VMOVDQA Y1, Y11
sealAVX2Tail256LoopA:
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
sealAVX2Tail256LoopB:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
DECQ CX
JG sealAVX2Tail256LoopA
DECQ R9
JGE sealAVX2Tail256LoopB
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD Y7, Y4, Y4
VPADDD Y11, Y1, Y1
VPERM2I128 $0x02, Y0, Y14, Y3
VPERM2I128 $0x02, Y12, Y4, Y7
VPERM2I128 $0x13, Y0, Y14, Y11
VPERM2I128 $0x13, Y12, Y4, Y15
VPXOR (SI), Y3, Y3
VPXOR 32(SI), Y7, Y7
VPXOR 64(SI), Y11, Y11
VPXOR 96(SI), Y15, Y15
VMOVDQU Y3, (DI)
VMOVDQU Y7, 32(DI)
VMOVDQU Y11, 64(DI)
VMOVDQU Y15, 96(DI)
MOVQ $0x00000080, CX
LEAQ 128(SI), SI
SUBQ $0x80, BX
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
JMP sealAVX2SealHash
sealAVX2Tail384:
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VMOVDQA Y4, Y7
VMOVDQA Y1, Y11
VMOVDQA Y2, Y15
sealAVX2Tail384LoopA:
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
sealAVX2Tail384LoopB:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
DECQ CX
JG sealAVX2Tail384LoopA
DECQ R9
JGE sealAVX2Tail384LoopB
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD Y7, Y4, Y4
VPADDD Y11, Y1, Y1
VPADDD Y15, Y2, Y2
VPERM2I128 $0x02, Y0, Y14, Y3
VPERM2I128 $0x02, Y12, Y4, Y7
VPERM2I128 $0x13, Y0, Y14, Y11
VPERM2I128 $0x13, Y12, Y4, Y15
VPXOR (SI), Y3, Y3
VPXOR 32(SI), Y7, Y7
VPXOR 64(SI), Y11, Y11
VPXOR 96(SI), Y15, Y15
VMOVDQU Y3, (DI)
VMOVDQU Y7, 32(DI)
VMOVDQU Y11, 64(DI)
VMOVDQU Y15, 96(DI)
VPERM2I128 $0x02, Y5, Y9, Y3
VPERM2I128 $0x02, Y13, Y1, Y7
VPERM2I128 $0x13, Y5, Y9, Y11
VPERM2I128 $0x13, Y13, Y1, Y15
VPXOR 128(SI), Y3, Y3
VPXOR 160(SI), Y7, Y7
VPXOR 192(SI), Y11, Y11
VPXOR 224(SI), Y15, Y15
VMOVDQU Y3, 128(DI)
VMOVDQU Y7, 160(DI)
VMOVDQU Y11, 192(DI)
VMOVDQU Y15, 224(DI)
MOVQ $0x00000100, CX
LEAQ 256(SI), SI
SUBQ $0x00000100, BX
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
JMP sealAVX2SealHash
sealAVX2Tail512:
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
sealAVX2Tail512LoopA:
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
sealAVX2Tail512LoopB:
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x0c, Y3, Y3, Y3
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x04, Y3, Y3, Y3
DECQ CX
JG sealAVX2Tail512LoopA
DECQ R9
JGE sealAVX2Tail512LoopB
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 32(BP), Y11, Y11
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 64(BP), Y15, Y15
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPADDD 192(BP), Y3, Y3
VMOVDQA Y15, 224(BP)
VPERM2I128 $0x02, Y0, Y14, Y15
VPXOR (SI), Y15, Y15
VMOVDQU Y15, (DI)
VPERM2I128 $0x02, Y12, Y4, Y15
VPXOR 32(SI), Y15, Y15
VMOVDQU Y15, 32(DI)
VPERM2I128 $0x13, Y0, Y14, Y15
VPXOR 64(SI), Y15, Y15
VMOVDQU Y15, 64(DI)
VPERM2I128 $0x13, Y12, Y4, Y15
VPXOR 96(SI), Y15, Y15
VMOVDQU Y15, 96(DI)
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR 128(SI), Y0, Y0
VPXOR 160(SI), Y14, Y14
VPXOR 192(SI), Y12, Y12
VPXOR 224(SI), Y4, Y4
VMOVDQU Y0, 128(DI)
VMOVDQU Y14, 160(DI)
VMOVDQU Y12, 192(DI)
VMOVDQU Y4, 224(DI)
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
VPXOR 256(SI), Y0, Y0
VPXOR 288(SI), Y14, Y14
VPXOR 320(SI), Y12, Y12
VPXOR 352(SI), Y4, Y4
VMOVDQU Y0, 256(DI)
VMOVDQU Y14, 288(DI)
VMOVDQU Y12, 320(DI)
VMOVDQU Y4, 352(DI)
MOVQ $0x00000180, CX
LEAQ 384(SI), SI
SUBQ $0x00000180, BX
VPERM2I128 $0x02, Y7, Y11, Y0
VPERM2I128 $0x02, 224(BP), Y3, Y14
VPERM2I128 $0x13, Y7, Y11, Y12
VPERM2I128 $0x13, 224(BP), Y3, Y4
JMP sealAVX2SealHash