x/crypto/poly1305: optimize amd64 assembly performance
Improve performance on amd64 through faster assembly.

name            old time/op    new time/op    delta
64-8            101ns ± 4%     42ns ± 3%      -58.31%  (p=0.002 n=6+6)
1K-8            887ns ± 1%     456ns ± 1%     -48.53%  (p=0.002 n=6+6)
64Unaligned-8   98.1ns ± 1%    41.1ns ± 1%    -58.06%  (p=0.002 n=6+6)
1KUnaligned-8   885ns ± 2%     460ns ± 3%     -48.04%  (p=0.002 n=6+6)

name            old speed      new speed      delta
64-8            635MB/s ± 4%   1525MB/s ± 3%  +140.15%  (p=0.002 n=6+6)
1K-8            1.15GB/s ± 1%  2.24GB/s ± 1%  +94.22%  (p=0.002 n=6+6)
64Unaligned-8   653MB/s ± 1%   1557MB/s ± 1%  +138.58%  (p=0.002 n=6+6)
1KUnaligned-8   1.16GB/s ± 2%  2.23GB/s ± 3%  +92.46%  (p=0.002 n=6+6)

Change-Id: Ia3be8e7ff012f8a9b451d728a646f29f809ba665
Reviewed-on: https://go-review.googlesource.com/29993
Reviewed-by: Adam Langley <agl@golang.org>
This commit is contained in:
Родитель
a20de3fa94
Коммит
568507f56e
|
@ -1,45 +0,0 @@
|
|||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// This code was translated into a form compatible with 6a from the public
|
||||
// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
|
||||
|
||||
// +build amd64,!gccgo,!appengine
|
||||
|
||||
DATA ·SCALE(SB)/8, $0x37F4000000000000
|
||||
GLOBL ·SCALE(SB), 8, $8
|
||||
DATA ·TWO32(SB)/8, $0x41F0000000000000
|
||||
GLOBL ·TWO32(SB), 8, $8
|
||||
DATA ·TWO64(SB)/8, $0x43F0000000000000
|
||||
GLOBL ·TWO64(SB), 8, $8
|
||||
DATA ·TWO96(SB)/8, $0x45F0000000000000
|
||||
GLOBL ·TWO96(SB), 8, $8
|
||||
DATA ·ALPHA32(SB)/8, $0x45E8000000000000
|
||||
GLOBL ·ALPHA32(SB), 8, $8
|
||||
DATA ·ALPHA64(SB)/8, $0x47E8000000000000
|
||||
GLOBL ·ALPHA64(SB), 8, $8
|
||||
DATA ·ALPHA96(SB)/8, $0x49E8000000000000
|
||||
GLOBL ·ALPHA96(SB), 8, $8
|
||||
DATA ·ALPHA130(SB)/8, $0x4C08000000000000
|
||||
GLOBL ·ALPHA130(SB), 8, $8
|
||||
DATA ·DOFFSET0(SB)/8, $0x4330000000000000
|
||||
GLOBL ·DOFFSET0(SB), 8, $8
|
||||
DATA ·DOFFSET1(SB)/8, $0x4530000000000000
|
||||
GLOBL ·DOFFSET1(SB), 8, $8
|
||||
DATA ·DOFFSET2(SB)/8, $0x4730000000000000
|
||||
GLOBL ·DOFFSET2(SB), 8, $8
|
||||
DATA ·DOFFSET3(SB)/8, $0x4930000000000000
|
||||
GLOBL ·DOFFSET3(SB), 8, $8
|
||||
DATA ·DOFFSET3MINUSTWO128(SB)/8, $0x492FFFFE00000000
|
||||
GLOBL ·DOFFSET3MINUSTWO128(SB), 8, $8
|
||||
DATA ·HOFFSET0(SB)/8, $0x43300001FFFFFFFB
|
||||
GLOBL ·HOFFSET0(SB), 8, $8
|
||||
DATA ·HOFFSET1(SB)/8, $0x45300001FFFFFFFE
|
||||
GLOBL ·HOFFSET1(SB), 8, $8
|
||||
DATA ·HOFFSET2(SB)/8, $0x47300001FFFFFFFE
|
||||
GLOBL ·HOFFSET2(SB), 8, $8
|
||||
DATA ·HOFFSET3(SB)/8, $0x49300003FFFFFFFE
|
||||
GLOBL ·HOFFSET3(SB), 8, $8
|
||||
DATA ·ROUNDING(SB)/2, $0x137f
|
||||
GLOBL ·ROUNDING(SB), 8, $2
|
|
@ -2,496 +2,131 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// This code was translated into a form compatible with 6a from the public
|
||||
// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
|
||||
|
||||
// +build amd64,!gccgo,!appengine
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// POLY1305_ADD adds one 16-byte message block into the accumulator.
//   msg        - register pointing at the block; advanced by 16 on exit
//   h0, h1, h2 - accumulator limbs, least significant first
// The two quadwords at 0(msg) and 8(msg) are added into h0 and h1 with
// carry propagation, and h2 receives 1 plus the carry out of h1, so the
// block is treated as having an extra set bit just above its 128 data bits.
// msg is advanced with LEAQ rather than ADDQ, which leaves the flags alone.
#define POLY1305_ADD(msg, h0, h1, h2) \
|
||||
ADDQ 0(msg), h0; \
|
||||
ADCQ 8(msg), h1; \
|
||||
ADCQ $1, h2; \
|
||||
LEAQ 16(msg), msg
|
||||
|
||||
// POLY1305_MUL multiplies the accumulator (h0, h1, h2) by (r0, r1) and
// folds the high part of the product back into the accumulator.
//   h0, h1, h2 - accumulator limbs, least significant first; overwritten
//                with the result
//   r0, r1     - multiplier limbs; only read
//   t0..t3     - scratch registers; overwritten
// AX and DX are also clobbered (implicit MULQ operands), as are the flags.
//
// Step 1 (first two groups): build the product in t0..t3, least significant
// limb first. The h2 terms use IMULQ and keep only the low 64 bits of each
// product.
// NOTE(review): this presumably relies on h2 being only a few bits wide so
// those products cannot overflow - confirm against the callers.
// h0 is reused as a temporary for a carry word once its own products are done.
//
// Step 2 (last group): keep t0, t1 and the low two bits of t2 as the new
// h0, h1, h2. Call the remaining high bits c (t3:t2 shifted right by 2).
// c is added back twice: once as t3:(t2 with its low two bits cleared),
// i.e. 4*c, and once as t3:t2 shifted right by 2, i.e. c. The total added is
// 5*c, which leaves the value unchanged modulo 2^130 - 5. The result is only
// partially reduced; h2 may still hold bits above the low two.
#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
|
||||
MOVQ r0, AX; \
|
||||
MULQ h0; \
|
||||
MOVQ AX, t0; \
|
||||
MOVQ DX, t1; \
|
||||
MOVQ r0, AX; \
|
||||
MULQ h1; \
|
||||
ADDQ AX, t1; \
|
||||
ADCQ $0, DX; \
|
||||
MOVQ r0, t2; \
|
||||
IMULQ h2, t2; \
|
||||
ADDQ DX, t2; \
|
||||
\
|
||||
MOVQ r1, AX; \
|
||||
MULQ h0; \
|
||||
ADDQ AX, t1; \
|
||||
ADCQ $0, DX; \
|
||||
MOVQ DX, h0; \
|
||||
MOVQ r1, t3; \
|
||||
IMULQ h2, t3; \
|
||||
MOVQ r1, AX; \
|
||||
MULQ h1; \
|
||||
ADDQ AX, t2; \
|
||||
ADCQ DX, t3; \
|
||||
ADDQ h0, t2; \
|
||||
ADCQ $0, t3; \
|
||||
\
|
||||
MOVQ t0, h0; \
|
||||
MOVQ t1, h1; \
|
||||
MOVQ t2, h2; \
|
||||
ANDQ $3, h2; \
|
||||
MOVQ t2, t0; \
|
||||
ANDQ $0xFFFFFFFFFFFFFFFC, t0; \
|
||||
ADDQ t0, h0; \
|
||||
ADCQ t3, h1; \
|
||||
ADCQ $0, h2; \
|
||||
SHRQ $2, t3, t2; \
|
||||
SHRQ $2, t3; \
|
||||
ADDQ t2, h0; \
|
||||
ADCQ t3, h1; \
|
||||
ADCQ $0, h2
|
||||
|
||||
// poly1305Mask is a 16-byte read-only constant. poly1305 ANDs it (PAND)
// with the first 16 bytes loaded from the key before storing them to the
// stack and reading them back as r0 and r1. The mask clears the top four
// bits of every 32-bit word and the bottom two bits of the upper three.
DATA poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
|
||||
DATA poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
|
||||
GLOBL poly1305Mask<>(SB), RODATA, $16
|
||||
|
||||
// func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
|
||||
TEXT ·poly1305(SB),0,$224-32
|
||||
MOVQ out+0(FP),DI
|
||||
MOVQ m+8(FP),SI
|
||||
MOVQ mlen+16(FP),DX
|
||||
MOVQ key+24(FP),CX
|
||||
TEXT ·poly1305(SB), $0-32
|
||||
MOVQ out+0(FP), DI
|
||||
MOVQ m+8(FP), SI
|
||||
MOVQ mlen+16(FP), R15
|
||||
MOVQ key+24(FP), AX
|
||||
|
||||
MOVQ SP,R11
|
||||
MOVQ $31,R9
|
||||
NOTQ R9
|
||||
ANDQ R9,SP
|
||||
ADDQ $32,SP
|
||||
MOVQ SP, BP
|
||||
ANDQ $0xFFFFFFFFFFFFFFF0, SP
|
||||
SUBQ $32, SP
|
||||
|
||||
MOVQ R11,32(SP)
|
||||
MOVQ R12,40(SP)
|
||||
MOVQ R13,48(SP)
|
||||
MOVQ R14,56(SP)
|
||||
MOVQ R15,64(SP)
|
||||
MOVQ BX,72(SP)
|
||||
MOVQ BP,80(SP)
|
||||
FLDCW ·ROUNDING(SB)
|
||||
MOVL 0(CX),R8
|
||||
MOVL 4(CX),R9
|
||||
MOVL 8(CX),AX
|
||||
MOVL 12(CX),R10
|
||||
MOVQ DI,88(SP)
|
||||
MOVQ CX,96(SP)
|
||||
MOVL $0X43300000,108(SP)
|
||||
MOVL $0X45300000,116(SP)
|
||||
MOVL $0X47300000,124(SP)
|
||||
MOVL $0X49300000,132(SP)
|
||||
ANDL $0X0FFFFFFF,R8
|
||||
ANDL $0X0FFFFFFC,R9
|
||||
ANDL $0X0FFFFFFC,AX
|
||||
ANDL $0X0FFFFFFC,R10
|
||||
MOVL R8,104(SP)
|
||||
MOVL R9,112(SP)
|
||||
MOVL AX,120(SP)
|
||||
MOVL R10,128(SP)
|
||||
FMOVD 104(SP), F0
|
||||
FSUBD ·DOFFSET0(SB), F0
|
||||
FMOVD 112(SP), F0
|
||||
FSUBD ·DOFFSET1(SB), F0
|
||||
FMOVD 120(SP), F0
|
||||
FSUBD ·DOFFSET2(SB), F0
|
||||
FMOVD 128(SP), F0
|
||||
FSUBD ·DOFFSET3(SB), F0
|
||||
FXCHD F0, F3
|
||||
FMOVDP F0, 136(SP)
|
||||
FXCHD F0, F1
|
||||
FMOVD F0, 144(SP)
|
||||
FMULD ·SCALE(SB), F0
|
||||
FMOVDP F0, 152(SP)
|
||||
FMOVD F0, 160(SP)
|
||||
FMULD ·SCALE(SB), F0
|
||||
FMOVDP F0, 168(SP)
|
||||
FMOVD F0, 176(SP)
|
||||
FMULD ·SCALE(SB), F0
|
||||
FMOVDP F0, 184(SP)
|
||||
FLDZ
|
||||
FLDZ
|
||||
FLDZ
|
||||
FLDZ
|
||||
CMPQ DX,$16
|
||||
JB ADDATMOST15BYTES
|
||||
INITIALATLEAST16BYTES:
|
||||
MOVL 12(SI),DI
|
||||
MOVL 8(SI),CX
|
||||
MOVL 4(SI),R8
|
||||
MOVL 0(SI),R9
|
||||
MOVL DI,128(SP)
|
||||
MOVL CX,120(SP)
|
||||
MOVL R8,112(SP)
|
||||
MOVL R9,104(SP)
|
||||
ADDQ $16,SI
|
||||
SUBQ $16,DX
|
||||
FXCHD F0, F3
|
||||
FADDD 128(SP), F0
|
||||
FSUBD ·DOFFSET3MINUSTWO128(SB), F0
|
||||
FXCHD F0, F1
|
||||
FADDD 112(SP), F0
|
||||
FSUBD ·DOFFSET1(SB), F0
|
||||
FXCHD F0, F2
|
||||
FADDD 120(SP), F0
|
||||
FSUBD ·DOFFSET2(SB), F0
|
||||
FXCHD F0, F3
|
||||
FADDD 104(SP), F0
|
||||
FSUBD ·DOFFSET0(SB), F0
|
||||
CMPQ DX,$16
|
||||
JB MULTIPLYADDATMOST15BYTES
|
||||
MULTIPLYADDATLEAST16BYTES:
|
||||
MOVL 12(SI),DI
|
||||
MOVL 8(SI),CX
|
||||
MOVL 4(SI),R8
|
||||
MOVL 0(SI),R9
|
||||
MOVL DI,128(SP)
|
||||
MOVL CX,120(SP)
|
||||
MOVL R8,112(SP)
|
||||
MOVL R9,104(SP)
|
||||
ADDQ $16,SI
|
||||
SUBQ $16,DX
|
||||
FMOVD ·ALPHA130(SB), F0
|
||||
FADDD F2,F0
|
||||
FSUBD ·ALPHA130(SB), F0
|
||||
FSUBD F0,F2
|
||||
FMULD ·SCALE(SB), F0
|
||||
FMOVD ·ALPHA32(SB), F0
|
||||
FADDD F2,F0
|
||||
FSUBD ·ALPHA32(SB), F0
|
||||
FSUBD F0,F2
|
||||
FXCHD F0, F2
|
||||
FADDDP F0,F1
|
||||
FMOVD ·ALPHA64(SB), F0
|
||||
FADDD F4,F0
|
||||
FSUBD ·ALPHA64(SB), F0
|
||||
FSUBD F0,F4
|
||||
FMOVD ·ALPHA96(SB), F0
|
||||
FADDD F6,F0
|
||||
FSUBD ·ALPHA96(SB), F0
|
||||
FSUBD F0,F6
|
||||
FXCHD F0, F6
|
||||
FADDDP F0,F1
|
||||
FXCHD F0, F3
|
||||
FADDDP F0,F5
|
||||
FXCHD F0, F3
|
||||
FADDDP F0,F1
|
||||
FMOVD 176(SP), F0
|
||||
FMULD F3,F0
|
||||
FMOVD 160(SP), F0
|
||||
FMULD F4,F0
|
||||
FMOVD 144(SP), F0
|
||||
FMULD F5,F0
|
||||
FMOVD 136(SP), F0
|
||||
FMULDP F0,F6
|
||||
FMOVD 160(SP), F0
|
||||
FMULD F4,F0
|
||||
FADDDP F0,F3
|
||||
FMOVD 144(SP), F0
|
||||
FMULD F4,F0
|
||||
FADDDP F0,F2
|
||||
FMOVD 136(SP), F0
|
||||
FMULD F4,F0
|
||||
FADDDP F0,F1
|
||||
FMOVD 184(SP), F0
|
||||
FMULDP F0,F4
|
||||
FXCHD F0, F3
|
||||
FADDDP F0,F5
|
||||
FMOVD 144(SP), F0
|
||||
FMULD F4,F0
|
||||
FADDDP F0,F2
|
||||
FMOVD 136(SP), F0
|
||||
FMULD F4,F0
|
||||
FADDDP F0,F1
|
||||
FMOVD 184(SP), F0
|
||||
FMULD F4,F0
|
||||
FADDDP F0,F3
|
||||
FMOVD 168(SP), F0
|
||||
FMULDP F0,F4
|
||||
FXCHD F0, F3
|
||||
FADDDP F0,F4
|
||||
FMOVD 136(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F1
|
||||
FXCHD F0, F3
|
||||
FMOVD 184(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F3
|
||||
FXCHD F0, F1
|
||||
FMOVD 168(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F1
|
||||
FMOVD 152(SP), F0
|
||||
FMULDP F0,F5
|
||||
FXCHD F0, F4
|
||||
FADDDP F0,F1
|
||||
CMPQ DX,$16
|
||||
FXCHD F0, F2
|
||||
FMOVD 128(SP), F0
|
||||
FSUBD ·DOFFSET3MINUSTWO128(SB), F0
|
||||
FADDDP F0,F1
|
||||
FXCHD F0, F1
|
||||
FMOVD 120(SP), F0
|
||||
FSUBD ·DOFFSET2(SB), F0
|
||||
FADDDP F0,F1
|
||||
FXCHD F0, F3
|
||||
FMOVD 112(SP), F0
|
||||
FSUBD ·DOFFSET1(SB), F0
|
||||
FADDDP F0,F1
|
||||
FXCHD F0, F2
|
||||
FMOVD 104(SP), F0
|
||||
FSUBD ·DOFFSET0(SB), F0
|
||||
FADDDP F0,F1
|
||||
JAE MULTIPLYADDATLEAST16BYTES
|
||||
MULTIPLYADDATMOST15BYTES:
|
||||
FMOVD ·ALPHA130(SB), F0
|
||||
FADDD F2,F0
|
||||
FSUBD ·ALPHA130(SB), F0
|
||||
FSUBD F0,F2
|
||||
FMULD ·SCALE(SB), F0
|
||||
FMOVD ·ALPHA32(SB), F0
|
||||
FADDD F2,F0
|
||||
FSUBD ·ALPHA32(SB), F0
|
||||
FSUBD F0,F2
|
||||
FMOVD ·ALPHA64(SB), F0
|
||||
FADDD F5,F0
|
||||
FSUBD ·ALPHA64(SB), F0
|
||||
FSUBD F0,F5
|
||||
FMOVD ·ALPHA96(SB), F0
|
||||
FADDD F7,F0
|
||||
FSUBD ·ALPHA96(SB), F0
|
||||
FSUBD F0,F7
|
||||
FXCHD F0, F7
|
||||
FADDDP F0,F1
|
||||
FXCHD F0, F5
|
||||
FADDDP F0,F1
|
||||
FXCHD F0, F3
|
||||
FADDDP F0,F5
|
||||
FADDDP F0,F1
|
||||
FMOVD 176(SP), F0
|
||||
FMULD F1,F0
|
||||
FMOVD 160(SP), F0
|
||||
FMULD F2,F0
|
||||
FMOVD 144(SP), F0
|
||||
FMULD F3,F0
|
||||
FMOVD 136(SP), F0
|
||||
FMULDP F0,F4
|
||||
FMOVD 160(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F3
|
||||
FMOVD 144(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F2
|
||||
FMOVD 136(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F1
|
||||
FMOVD 184(SP), F0
|
||||
FMULDP F0,F5
|
||||
FXCHD F0, F4
|
||||
FADDDP F0,F3
|
||||
FMOVD 144(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F2
|
||||
FMOVD 136(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F1
|
||||
FMOVD 184(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F4
|
||||
FMOVD 168(SP), F0
|
||||
FMULDP F0,F5
|
||||
FXCHD F0, F4
|
||||
FADDDP F0,F2
|
||||
FMOVD 136(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F1
|
||||
FMOVD 184(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F4
|
||||
FMOVD 168(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F3
|
||||
FMOVD 152(SP), F0
|
||||
FMULDP F0,F5
|
||||
FXCHD F0, F4
|
||||
FADDDP F0,F1
|
||||
ADDATMOST15BYTES:
|
||||
CMPQ DX,$0
|
||||
JE NOMOREBYTES
|
||||
MOVL $0,0(SP)
|
||||
MOVL $0, 4 (SP)
|
||||
MOVL $0, 8 (SP)
|
||||
MOVL $0, 12 (SP)
|
||||
LEAQ 0(SP),DI
|
||||
MOVQ DX,CX
|
||||
REP; MOVSB
|
||||
MOVB $1,0(DI)
|
||||
MOVL 12 (SP),DI
|
||||
MOVL 8 (SP),SI
|
||||
MOVL 4 (SP),DX
|
||||
MOVL 0(SP),CX
|
||||
MOVL DI,128(SP)
|
||||
MOVL SI,120(SP)
|
||||
MOVL DX,112(SP)
|
||||
MOVL CX,104(SP)
|
||||
FXCHD F0, F3
|
||||
FADDD 128(SP), F0
|
||||
FSUBD ·DOFFSET3(SB), F0
|
||||
FXCHD F0, F2
|
||||
FADDD 120(SP), F0
|
||||
FSUBD ·DOFFSET2(SB), F0
|
||||
FXCHD F0, F1
|
||||
FADDD 112(SP), F0
|
||||
FSUBD ·DOFFSET1(SB), F0
|
||||
FXCHD F0, F3
|
||||
FADDD 104(SP), F0
|
||||
FSUBD ·DOFFSET0(SB), F0
|
||||
FMOVD ·ALPHA130(SB), F0
|
||||
FADDD F3,F0
|
||||
FSUBD ·ALPHA130(SB), F0
|
||||
FSUBD F0,F3
|
||||
FMULD ·SCALE(SB), F0
|
||||
FMOVD ·ALPHA32(SB), F0
|
||||
FADDD F2,F0
|
||||
FSUBD ·ALPHA32(SB), F0
|
||||
FSUBD F0,F2
|
||||
FMOVD ·ALPHA64(SB), F0
|
||||
FADDD F6,F0
|
||||
FSUBD ·ALPHA64(SB), F0
|
||||
FSUBD F0,F6
|
||||
FMOVD ·ALPHA96(SB), F0
|
||||
FADDD F5,F0
|
||||
FSUBD ·ALPHA96(SB), F0
|
||||
FSUBD F0,F5
|
||||
FXCHD F0, F4
|
||||
FADDDP F0,F3
|
||||
FXCHD F0, F6
|
||||
FADDDP F0,F1
|
||||
FXCHD F0, F3
|
||||
FADDDP F0,F5
|
||||
FXCHD F0, F3
|
||||
FADDDP F0,F1
|
||||
FMOVD 176(SP), F0
|
||||
FMULD F3,F0
|
||||
FMOVD 160(SP), F0
|
||||
FMULD F4,F0
|
||||
FMOVD 144(SP), F0
|
||||
FMULD F5,F0
|
||||
FMOVD 136(SP), F0
|
||||
FMULDP F0,F6
|
||||
FMOVD 160(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F3
|
||||
FMOVD 144(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F2
|
||||
FMOVD 136(SP), F0
|
||||
FMULD F5,F0
|
||||
FADDDP F0,F1
|
||||
FMOVD 184(SP), F0
|
||||
FMULDP F0,F5
|
||||
FXCHD F0, F4
|
||||
FADDDP F0,F5
|
||||
FMOVD 144(SP), F0
|
||||
FMULD F6,F0
|
||||
FADDDP F0,F2
|
||||
FMOVD 136(SP), F0
|
||||
FMULD F6,F0
|
||||
FADDDP F0,F1
|
||||
FMOVD 184(SP), F0
|
||||
FMULD F6,F0
|
||||
FADDDP F0,F4
|
||||
FMOVD 168(SP), F0
|
||||
FMULDP F0,F6
|
||||
FXCHD F0, F5
|
||||
FADDDP F0,F4
|
||||
FMOVD 136(SP), F0
|
||||
FMULD F2,F0
|
||||
FADDDP F0,F1
|
||||
FMOVD 184(SP), F0
|
||||
FMULD F2,F0
|
||||
FADDDP F0,F5
|
||||
FMOVD 168(SP), F0
|
||||
FMULD F2,F0
|
||||
FADDDP F0,F3
|
||||
FMOVD 152(SP), F0
|
||||
FMULDP F0,F2
|
||||
FXCHD F0, F1
|
||||
FADDDP F0,F3
|
||||
FXCHD F0, F3
|
||||
FXCHD F0, F2
|
||||
NOMOREBYTES:
|
||||
MOVL $0,R10
|
||||
FMOVD ·ALPHA130(SB), F0
|
||||
FADDD F4,F0
|
||||
FSUBD ·ALPHA130(SB), F0
|
||||
FSUBD F0,F4
|
||||
FMULD ·SCALE(SB), F0
|
||||
FMOVD ·ALPHA32(SB), F0
|
||||
FADDD F2,F0
|
||||
FSUBD ·ALPHA32(SB), F0
|
||||
FSUBD F0,F2
|
||||
FMOVD ·ALPHA64(SB), F0
|
||||
FADDD F4,F0
|
||||
FSUBD ·ALPHA64(SB), F0
|
||||
FSUBD F0,F4
|
||||
FMOVD ·ALPHA96(SB), F0
|
||||
FADDD F6,F0
|
||||
FSUBD ·ALPHA96(SB), F0
|
||||
FXCHD F0, F6
|
||||
FSUBD F6,F0
|
||||
FXCHD F0, F4
|
||||
FADDDP F0,F3
|
||||
FXCHD F0, F4
|
||||
FADDDP F0,F1
|
||||
FXCHD F0, F2
|
||||
FADDDP F0,F3
|
||||
FXCHD F0, F4
|
||||
FADDDP F0,F3
|
||||
FXCHD F0, F3
|
||||
FADDD ·HOFFSET0(SB), F0
|
||||
FXCHD F0, F3
|
||||
FADDD ·HOFFSET1(SB), F0
|
||||
FXCHD F0, F1
|
||||
FADDD ·HOFFSET2(SB), F0
|
||||
FXCHD F0, F2
|
||||
FADDD ·HOFFSET3(SB), F0
|
||||
FXCHD F0, F3
|
||||
FMOVDP F0, 104(SP)
|
||||
FMOVDP F0, 112(SP)
|
||||
FMOVDP F0, 120(SP)
|
||||
FMOVDP F0, 128(SP)
|
||||
MOVL 108(SP),DI
|
||||
ANDL $63,DI
|
||||
MOVL 116(SP),SI
|
||||
ANDL $63,SI
|
||||
MOVL 124(SP),DX
|
||||
ANDL $63,DX
|
||||
MOVL 132(SP),CX
|
||||
ANDL $63,CX
|
||||
MOVL 112(SP),R8
|
||||
ADDL DI,R8
|
||||
MOVQ R8,112(SP)
|
||||
MOVL 120(SP),DI
|
||||
ADCL SI,DI
|
||||
MOVQ DI,120(SP)
|
||||
MOVL 128(SP),DI
|
||||
ADCL DX,DI
|
||||
MOVQ DI,128(SP)
|
||||
MOVL R10,DI
|
||||
ADCL CX,DI
|
||||
MOVQ DI,136(SP)
|
||||
MOVQ $5,DI
|
||||
MOVL 104(SP),SI
|
||||
ADDL SI,DI
|
||||
MOVQ DI,104(SP)
|
||||
MOVL R10,DI
|
||||
MOVQ 112(SP),DX
|
||||
ADCL DX,DI
|
||||
MOVQ DI,112(SP)
|
||||
MOVL R10,DI
|
||||
MOVQ 120(SP),CX
|
||||
ADCL CX,DI
|
||||
MOVQ DI,120(SP)
|
||||
MOVL R10,DI
|
||||
MOVQ 128(SP),R8
|
||||
ADCL R8,DI
|
||||
MOVQ DI,128(SP)
|
||||
MOVQ $0XFFFFFFFC,DI
|
||||
MOVQ 136(SP),R9
|
||||
ADCL R9,DI
|
||||
SARL $16,DI
|
||||
MOVQ DI,R9
|
||||
XORL $0XFFFFFFFF,R9
|
||||
ANDQ DI,SI
|
||||
MOVQ 104(SP),AX
|
||||
ANDQ R9,AX
|
||||
ORQ AX,SI
|
||||
ANDQ DI,DX
|
||||
MOVQ 112(SP),AX
|
||||
ANDQ R9,AX
|
||||
ORQ AX,DX
|
||||
ANDQ DI,CX
|
||||
MOVQ 120(SP),AX
|
||||
ANDQ R9,AX
|
||||
ORQ AX,CX
|
||||
ANDQ DI,R8
|
||||
MOVQ 128(SP),DI
|
||||
ANDQ R9,DI
|
||||
ORQ DI,R8
|
||||
MOVQ 88(SP),DI
|
||||
MOVQ 96(SP),R9
|
||||
ADDL 16(R9),SI
|
||||
ADCL 20(R9),DX
|
||||
ADCL 24(R9),CX
|
||||
ADCL 28(R9),R8
|
||||
MOVL SI,0(DI)
|
||||
MOVL DX,4(DI)
|
||||
MOVL CX,8(DI)
|
||||
MOVL R8,12(DI)
|
||||
MOVQ 32(SP),R11
|
||||
MOVQ 40(SP),R12
|
||||
MOVQ 48(SP),R13
|
||||
MOVQ 56(SP),R14
|
||||
MOVQ 64(SP),R15
|
||||
MOVQ 72(SP),BX
|
||||
MOVQ 80(SP),BP
|
||||
MOVQ R11,SP
|
||||
MOVOU 0(AX), X0
|
||||
MOVOU 16(AX), X1
|
||||
MOVOU poly1305Mask<>(SB), X2
|
||||
PAND X2, X0
|
||||
MOVO X0, 0(SP)
|
||||
MOVO X1, 16(SP)
|
||||
|
||||
XORQ R8, R8 // h0
|
||||
XORQ R9, R9 // h1
|
||||
XORQ R10, R10 // h2
|
||||
MOVQ 0(SP), R11 // r0
|
||||
MOVQ 8(SP), R12 // r1
|
||||
|
||||
CMPQ R15, $16
|
||||
JB bytes_between_0_and_15
|
||||
|
||||
loop:
|
||||
POLY1305_ADD(SI, R8, R9, R10)
|
||||
multiply:
|
||||
POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
|
||||
SUBQ $16, R15
|
||||
CMPQ R15, $16
|
||||
JAE loop
|
||||
|
||||
bytes_between_0_and_15:
|
||||
TESTQ R15, R15
|
||||
JZ done
|
||||
MOVQ $1, BX
|
||||
XORQ CX, CX
|
||||
XORQ R13, R13
|
||||
ADDQ R15, SI
|
||||
|
||||
flush_buffer:
|
||||
SHLQ $8, BX, CX
|
||||
SHLQ $8, BX
|
||||
MOVB -1(SI), R13
|
||||
XORQ R13, BX
|
||||
DECQ SI
|
||||
DECQ R15
|
||||
JNZ flush_buffer
|
||||
|
||||
ADDQ BX, R8
|
||||
ADCQ CX, R9
|
||||
ADCQ $0, R10
|
||||
MOVQ $16, R15
|
||||
JMP multiply
|
||||
|
||||
done:
|
||||
MOVQ R8, AX
|
||||
MOVQ R9, BX
|
||||
SUBQ $0xFFFFFFFFFFFFFFFB, AX
|
||||
SBBQ $0xFFFFFFFFFFFFFFFF, BX
|
||||
CMOVQCS R8, AX
|
||||
CMOVQCS R9, BX
|
||||
ADDQ 16(SP), AX
|
||||
ADCQ 24(SP), BX
|
||||
|
||||
MOVQ BP, SP
|
||||
MOVQ AX, 0(DI)
|
||||
MOVQ BX, 8(DI)
|
||||
RET
|
||||
|
|
Загрузка…
Ссылка в новой задаче