x/crypto/poly1305: optimize amd64 assembly performance

Improve performance on amd64 through faster assembly.

name 		old time/op 	new time/op 	delta
64-8 		101ns ± 4% 	42ns ± 3% 	-58.31% (p=0.002 n=6+6)
1K-8 		887ns ± 1% 	456ns ± 1% 	-48.53% (p=0.002 n=6+6)
64Unaligned-8 	98.1ns ± 1% 	41.1ns ± 1% 	-58.06% (p=0.002 n=6+6)
1KUnaligned-8 	885ns ± 2% 	460ns ± 3% 	-48.04% (p=0.002 n=6+6)

name 		old speed 	new speed 	delta
64-8 		635MB/s ± 4% 	1525MB/s ± 3% 	+140.15% (p=0.002 n=6+6)
1K-8 		1.15GB/s ± 1% 	2.24GB/s ± 1% 	+94.22%  (p=0.002 n=6+6)
64Unaligned-8 	653MB/s ± 1% 	1557MB/s ± 1% 	+138.58% (p=0.002 n=6+6)
1KUnaligned-8  	1.16GB/s ± 2%  	2.23GB/s ± 3%	+92.46%  (p=0.002 n=6+6)

Change-Id: Ia3be8e7ff012f8a9b451d728a646f29f809ba665
Reviewed-on: https://go-review.googlesource.com/29993
Reviewed-by: Adam Langley <agl@golang.org>
This commit is contained in:
Andreas Auernhammer 2016-09-29 10:49:23 +02:00 коммит произвёл Adam Langley
Родитель a20de3fa94
Коммит 568507f56e
2 изменённых файлов: 122 добавлений и 532 удалений

Просмотреть файл

@ -1,45 +0,0 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This code was translated into a form compatible with 6a from the public
// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
// +build amd64,!gccgo,!appengine
DATA ·SCALE(SB)/8, $0x37F4000000000000
GLOBL ·SCALE(SB), 8, $8
DATA ·TWO32(SB)/8, $0x41F0000000000000
GLOBL ·TWO32(SB), 8, $8
DATA ·TWO64(SB)/8, $0x43F0000000000000
GLOBL ·TWO64(SB), 8, $8
DATA ·TWO96(SB)/8, $0x45F0000000000000
GLOBL ·TWO96(SB), 8, $8
DATA ·ALPHA32(SB)/8, $0x45E8000000000000
GLOBL ·ALPHA32(SB), 8, $8
DATA ·ALPHA64(SB)/8, $0x47E8000000000000
GLOBL ·ALPHA64(SB), 8, $8
DATA ·ALPHA96(SB)/8, $0x49E8000000000000
GLOBL ·ALPHA96(SB), 8, $8
DATA ·ALPHA130(SB)/8, $0x4C08000000000000
GLOBL ·ALPHA130(SB), 8, $8
DATA ·DOFFSET0(SB)/8, $0x4330000000000000
GLOBL ·DOFFSET0(SB), 8, $8
DATA ·DOFFSET1(SB)/8, $0x4530000000000000
GLOBL ·DOFFSET1(SB), 8, $8
DATA ·DOFFSET2(SB)/8, $0x4730000000000000
GLOBL ·DOFFSET2(SB), 8, $8
DATA ·DOFFSET3(SB)/8, $0x4930000000000000
GLOBL ·DOFFSET3(SB), 8, $8
DATA ·DOFFSET3MINUSTWO128(SB)/8, $0x492FFFFE00000000
GLOBL ·DOFFSET3MINUSTWO128(SB), 8, $8
DATA ·HOFFSET0(SB)/8, $0x43300001FFFFFFFB
GLOBL ·HOFFSET0(SB), 8, $8
DATA ·HOFFSET1(SB)/8, $0x45300001FFFFFFFE
GLOBL ·HOFFSET1(SB), 8, $8
DATA ·HOFFSET2(SB)/8, $0x47300001FFFFFFFE
GLOBL ·HOFFSET2(SB), 8, $8
DATA ·HOFFSET3(SB)/8, $0x49300003FFFFFFFE
GLOBL ·HOFFSET3(SB), 8, $8
DATA ·ROUNDING(SB)/2, $0x137f
GLOBL ·ROUNDING(SB), 8, $2

Просмотреть файл

@ -2,496 +2,131 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This code was translated into a form compatible with 6a from the public
// domain sources in SUPERCOP: http://bench.cr.yp.to/supercop.html
// +build amd64,!gccgo,!appengine
#include "textflag.h"
#define POLY1305_ADD(msg, h0, h1, h2) \
ADDQ 0(msg), h0; \
ADCQ 8(msg), h1; \
ADCQ $1, h2; \
LEAQ 16(msg), msg
#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
MOVQ r0, AX; \
MULQ h0; \
MOVQ AX, t0; \
MOVQ DX, t1; \
MOVQ r0, AX; \
MULQ h1; \
ADDQ AX, t1; \
ADCQ $0, DX; \
MOVQ r0, t2; \
IMULQ h2, t2; \
ADDQ DX, t2; \
\
MOVQ r1, AX; \
MULQ h0; \
ADDQ AX, t1; \
ADCQ $0, DX; \
MOVQ DX, h0; \
MOVQ r1, t3; \
IMULQ h2, t3; \
MOVQ r1, AX; \
MULQ h1; \
ADDQ AX, t2; \
ADCQ DX, t3; \
ADDQ h0, t2; \
ADCQ $0, t3; \
\
MOVQ t0, h0; \
MOVQ t1, h1; \
MOVQ t2, h2; \
ANDQ $3, h2; \
MOVQ t2, t0; \
ANDQ $0xFFFFFFFFFFFFFFFC, t0; \
ADDQ t0, h0; \
ADCQ t3, h1; \
ADCQ $0, h2; \
SHRQ $2, t3, t2; \
SHRQ $2, t3; \
ADDQ t2, h0; \
ADCQ t3, h1; \
ADCQ $0, h2
DATA poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
DATA poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
GLOBL poly1305Mask<>(SB), RODATA, $16
// func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]key)
TEXT ·poly1305(SB),0,$224-32
MOVQ out+0(FP),DI
MOVQ m+8(FP),SI
MOVQ mlen+16(FP),DX
MOVQ key+24(FP),CX
TEXT ·poly1305(SB), $0-32
MOVQ out+0(FP), DI
MOVQ m+8(FP), SI
MOVQ mlen+16(FP), R15
MOVQ key+24(FP), AX
MOVQ SP,R11
MOVQ $31,R9
NOTQ R9
ANDQ R9,SP
ADDQ $32,SP
MOVQ SP, BP
ANDQ $0xFFFFFFFFFFFFFFF0, SP
SUBQ $32, SP
MOVQ R11,32(SP)
MOVQ R12,40(SP)
MOVQ R13,48(SP)
MOVQ R14,56(SP)
MOVQ R15,64(SP)
MOVQ BX,72(SP)
MOVQ BP,80(SP)
FLDCW ·ROUNDING(SB)
MOVL 0(CX),R8
MOVL 4(CX),R9
MOVL 8(CX),AX
MOVL 12(CX),R10
MOVQ DI,88(SP)
MOVQ CX,96(SP)
MOVL $0X43300000,108(SP)
MOVL $0X45300000,116(SP)
MOVL $0X47300000,124(SP)
MOVL $0X49300000,132(SP)
ANDL $0X0FFFFFFF,R8
ANDL $0X0FFFFFFC,R9
ANDL $0X0FFFFFFC,AX
ANDL $0X0FFFFFFC,R10
MOVL R8,104(SP)
MOVL R9,112(SP)
MOVL AX,120(SP)
MOVL R10,128(SP)
FMOVD 104(SP), F0
FSUBD ·DOFFSET0(SB), F0
FMOVD 112(SP), F0
FSUBD ·DOFFSET1(SB), F0
FMOVD 120(SP), F0
FSUBD ·DOFFSET2(SB), F0
FMOVD 128(SP), F0
FSUBD ·DOFFSET3(SB), F0
FXCHD F0, F3
FMOVDP F0, 136(SP)
FXCHD F0, F1
FMOVD F0, 144(SP)
FMULD ·SCALE(SB), F0
FMOVDP F0, 152(SP)
FMOVD F0, 160(SP)
FMULD ·SCALE(SB), F0
FMOVDP F0, 168(SP)
FMOVD F0, 176(SP)
FMULD ·SCALE(SB), F0
FMOVDP F0, 184(SP)
FLDZ
FLDZ
FLDZ
FLDZ
CMPQ DX,$16
JB ADDATMOST15BYTES
INITIALATLEAST16BYTES:
MOVL 12(SI),DI
MOVL 8(SI),CX
MOVL 4(SI),R8
MOVL 0(SI),R9
MOVL DI,128(SP)
MOVL CX,120(SP)
MOVL R8,112(SP)
MOVL R9,104(SP)
ADDQ $16,SI
SUBQ $16,DX
FXCHD F0, F3
FADDD 128(SP), F0
FSUBD ·DOFFSET3MINUSTWO128(SB), F0
FXCHD F0, F1
FADDD 112(SP), F0
FSUBD ·DOFFSET1(SB), F0
FXCHD F0, F2
FADDD 120(SP), F0
FSUBD ·DOFFSET2(SB), F0
FXCHD F0, F3
FADDD 104(SP), F0
FSUBD ·DOFFSET0(SB), F0
CMPQ DX,$16
JB MULTIPLYADDATMOST15BYTES
MULTIPLYADDATLEAST16BYTES:
MOVL 12(SI),DI
MOVL 8(SI),CX
MOVL 4(SI),R8
MOVL 0(SI),R9
MOVL DI,128(SP)
MOVL CX,120(SP)
MOVL R8,112(SP)
MOVL R9,104(SP)
ADDQ $16,SI
SUBQ $16,DX
FMOVD ·ALPHA130(SB), F0
FADDD F2,F0
FSUBD ·ALPHA130(SB), F0
FSUBD F0,F2
FMULD ·SCALE(SB), F0
FMOVD ·ALPHA32(SB), F0
FADDD F2,F0
FSUBD ·ALPHA32(SB), F0
FSUBD F0,F2
FXCHD F0, F2
FADDDP F0,F1
FMOVD ·ALPHA64(SB), F0
FADDD F4,F0
FSUBD ·ALPHA64(SB), F0
FSUBD F0,F4
FMOVD ·ALPHA96(SB), F0
FADDD F6,F0
FSUBD ·ALPHA96(SB), F0
FSUBD F0,F6
FXCHD F0, F6
FADDDP F0,F1
FXCHD F0, F3
FADDDP F0,F5
FXCHD F0, F3
FADDDP F0,F1
FMOVD 176(SP), F0
FMULD F3,F0
FMOVD 160(SP), F0
FMULD F4,F0
FMOVD 144(SP), F0
FMULD F5,F0
FMOVD 136(SP), F0
FMULDP F0,F6
FMOVD 160(SP), F0
FMULD F4,F0
FADDDP F0,F3
FMOVD 144(SP), F0
FMULD F4,F0
FADDDP F0,F2
FMOVD 136(SP), F0
FMULD F4,F0
FADDDP F0,F1
FMOVD 184(SP), F0
FMULDP F0,F4
FXCHD F0, F3
FADDDP F0,F5
FMOVD 144(SP), F0
FMULD F4,F0
FADDDP F0,F2
FMOVD 136(SP), F0
FMULD F4,F0
FADDDP F0,F1
FMOVD 184(SP), F0
FMULD F4,F0
FADDDP F0,F3
FMOVD 168(SP), F0
FMULDP F0,F4
FXCHD F0, F3
FADDDP F0,F4
FMOVD 136(SP), F0
FMULD F5,F0
FADDDP F0,F1
FXCHD F0, F3
FMOVD 184(SP), F0
FMULD F5,F0
FADDDP F0,F3
FXCHD F0, F1
FMOVD 168(SP), F0
FMULD F5,F0
FADDDP F0,F1
FMOVD 152(SP), F0
FMULDP F0,F5
FXCHD F0, F4
FADDDP F0,F1
CMPQ DX,$16
FXCHD F0, F2
FMOVD 128(SP), F0
FSUBD ·DOFFSET3MINUSTWO128(SB), F0
FADDDP F0,F1
FXCHD F0, F1
FMOVD 120(SP), F0
FSUBD ·DOFFSET2(SB), F0
FADDDP F0,F1
FXCHD F0, F3
FMOVD 112(SP), F0
FSUBD ·DOFFSET1(SB), F0
FADDDP F0,F1
FXCHD F0, F2
FMOVD 104(SP), F0
FSUBD ·DOFFSET0(SB), F0
FADDDP F0,F1
JAE MULTIPLYADDATLEAST16BYTES
MULTIPLYADDATMOST15BYTES:
FMOVD ·ALPHA130(SB), F0
FADDD F2,F0
FSUBD ·ALPHA130(SB), F0
FSUBD F0,F2
FMULD ·SCALE(SB), F0
FMOVD ·ALPHA32(SB), F0
FADDD F2,F0
FSUBD ·ALPHA32(SB), F0
FSUBD F0,F2
FMOVD ·ALPHA64(SB), F0
FADDD F5,F0
FSUBD ·ALPHA64(SB), F0
FSUBD F0,F5
FMOVD ·ALPHA96(SB), F0
FADDD F7,F0
FSUBD ·ALPHA96(SB), F0
FSUBD F0,F7
FXCHD F0, F7
FADDDP F0,F1
FXCHD F0, F5
FADDDP F0,F1
FXCHD F0, F3
FADDDP F0,F5
FADDDP F0,F1
FMOVD 176(SP), F0
FMULD F1,F0
FMOVD 160(SP), F0
FMULD F2,F0
FMOVD 144(SP), F0
FMULD F3,F0
FMOVD 136(SP), F0
FMULDP F0,F4
FMOVD 160(SP), F0
FMULD F5,F0
FADDDP F0,F3
FMOVD 144(SP), F0
FMULD F5,F0
FADDDP F0,F2
FMOVD 136(SP), F0
FMULD F5,F0
FADDDP F0,F1
FMOVD 184(SP), F0
FMULDP F0,F5
FXCHD F0, F4
FADDDP F0,F3
FMOVD 144(SP), F0
FMULD F5,F0
FADDDP F0,F2
FMOVD 136(SP), F0
FMULD F5,F0
FADDDP F0,F1
FMOVD 184(SP), F0
FMULD F5,F0
FADDDP F0,F4
FMOVD 168(SP), F0
FMULDP F0,F5
FXCHD F0, F4
FADDDP F0,F2
FMOVD 136(SP), F0
FMULD F5,F0
FADDDP F0,F1
FMOVD 184(SP), F0
FMULD F5,F0
FADDDP F0,F4
FMOVD 168(SP), F0
FMULD F5,F0
FADDDP F0,F3
FMOVD 152(SP), F0
FMULDP F0,F5
FXCHD F0, F4
FADDDP F0,F1
ADDATMOST15BYTES:
CMPQ DX,$0
JE NOMOREBYTES
MOVL $0,0(SP)
MOVL $0, 4 (SP)
MOVL $0, 8 (SP)
MOVL $0, 12 (SP)
LEAQ 0(SP),DI
MOVQ DX,CX
REP; MOVSB
MOVB $1,0(DI)
MOVL 12 (SP),DI
MOVL 8 (SP),SI
MOVL 4 (SP),DX
MOVL 0(SP),CX
MOVL DI,128(SP)
MOVL SI,120(SP)
MOVL DX,112(SP)
MOVL CX,104(SP)
FXCHD F0, F3
FADDD 128(SP), F0
FSUBD ·DOFFSET3(SB), F0
FXCHD F0, F2
FADDD 120(SP), F0
FSUBD ·DOFFSET2(SB), F0
FXCHD F0, F1
FADDD 112(SP), F0
FSUBD ·DOFFSET1(SB), F0
FXCHD F0, F3
FADDD 104(SP), F0
FSUBD ·DOFFSET0(SB), F0
FMOVD ·ALPHA130(SB), F0
FADDD F3,F0
FSUBD ·ALPHA130(SB), F0
FSUBD F0,F3
FMULD ·SCALE(SB), F0
FMOVD ·ALPHA32(SB), F0
FADDD F2,F0
FSUBD ·ALPHA32(SB), F0
FSUBD F0,F2
FMOVD ·ALPHA64(SB), F0
FADDD F6,F0
FSUBD ·ALPHA64(SB), F0
FSUBD F0,F6
FMOVD ·ALPHA96(SB), F0
FADDD F5,F0
FSUBD ·ALPHA96(SB), F0
FSUBD F0,F5
FXCHD F0, F4
FADDDP F0,F3
FXCHD F0, F6
FADDDP F0,F1
FXCHD F0, F3
FADDDP F0,F5
FXCHD F0, F3
FADDDP F0,F1
FMOVD 176(SP), F0
FMULD F3,F0
FMOVD 160(SP), F0
FMULD F4,F0
FMOVD 144(SP), F0
FMULD F5,F0
FMOVD 136(SP), F0
FMULDP F0,F6
FMOVD 160(SP), F0
FMULD F5,F0
FADDDP F0,F3
FMOVD 144(SP), F0
FMULD F5,F0
FADDDP F0,F2
FMOVD 136(SP), F0
FMULD F5,F0
FADDDP F0,F1
FMOVD 184(SP), F0
FMULDP F0,F5
FXCHD F0, F4
FADDDP F0,F5
FMOVD 144(SP), F0
FMULD F6,F0
FADDDP F0,F2
FMOVD 136(SP), F0
FMULD F6,F0
FADDDP F0,F1
FMOVD 184(SP), F0
FMULD F6,F0
FADDDP F0,F4
FMOVD 168(SP), F0
FMULDP F0,F6
FXCHD F0, F5
FADDDP F0,F4
FMOVD 136(SP), F0
FMULD F2,F0
FADDDP F0,F1
FMOVD 184(SP), F0
FMULD F2,F0
FADDDP F0,F5
FMOVD 168(SP), F0
FMULD F2,F0
FADDDP F0,F3
FMOVD 152(SP), F0
FMULDP F0,F2
FXCHD F0, F1
FADDDP F0,F3
FXCHD F0, F3
FXCHD F0, F2
NOMOREBYTES:
MOVL $0,R10
FMOVD ·ALPHA130(SB), F0
FADDD F4,F0
FSUBD ·ALPHA130(SB), F0
FSUBD F0,F4
FMULD ·SCALE(SB), F0
FMOVD ·ALPHA32(SB), F0
FADDD F2,F0
FSUBD ·ALPHA32(SB), F0
FSUBD F0,F2
FMOVD ·ALPHA64(SB), F0
FADDD F4,F0
FSUBD ·ALPHA64(SB), F0
FSUBD F0,F4
FMOVD ·ALPHA96(SB), F0
FADDD F6,F0
FSUBD ·ALPHA96(SB), F0
FXCHD F0, F6
FSUBD F6,F0
FXCHD F0, F4
FADDDP F0,F3
FXCHD F0, F4
FADDDP F0,F1
FXCHD F0, F2
FADDDP F0,F3
FXCHD F0, F4
FADDDP F0,F3
FXCHD F0, F3
FADDD ·HOFFSET0(SB), F0
FXCHD F0, F3
FADDD ·HOFFSET1(SB), F0
FXCHD F0, F1
FADDD ·HOFFSET2(SB), F0
FXCHD F0, F2
FADDD ·HOFFSET3(SB), F0
FXCHD F0, F3
FMOVDP F0, 104(SP)
FMOVDP F0, 112(SP)
FMOVDP F0, 120(SP)
FMOVDP F0, 128(SP)
MOVL 108(SP),DI
ANDL $63,DI
MOVL 116(SP),SI
ANDL $63,SI
MOVL 124(SP),DX
ANDL $63,DX
MOVL 132(SP),CX
ANDL $63,CX
MOVL 112(SP),R8
ADDL DI,R8
MOVQ R8,112(SP)
MOVL 120(SP),DI
ADCL SI,DI
MOVQ DI,120(SP)
MOVL 128(SP),DI
ADCL DX,DI
MOVQ DI,128(SP)
MOVL R10,DI
ADCL CX,DI
MOVQ DI,136(SP)
MOVQ $5,DI
MOVL 104(SP),SI
ADDL SI,DI
MOVQ DI,104(SP)
MOVL R10,DI
MOVQ 112(SP),DX
ADCL DX,DI
MOVQ DI,112(SP)
MOVL R10,DI
MOVQ 120(SP),CX
ADCL CX,DI
MOVQ DI,120(SP)
MOVL R10,DI
MOVQ 128(SP),R8
ADCL R8,DI
MOVQ DI,128(SP)
MOVQ $0XFFFFFFFC,DI
MOVQ 136(SP),R9
ADCL R9,DI
SARL $16,DI
MOVQ DI,R9
XORL $0XFFFFFFFF,R9
ANDQ DI,SI
MOVQ 104(SP),AX
ANDQ R9,AX
ORQ AX,SI
ANDQ DI,DX
MOVQ 112(SP),AX
ANDQ R9,AX
ORQ AX,DX
ANDQ DI,CX
MOVQ 120(SP),AX
ANDQ R9,AX
ORQ AX,CX
ANDQ DI,R8
MOVQ 128(SP),DI
ANDQ R9,DI
ORQ DI,R8
MOVQ 88(SP),DI
MOVQ 96(SP),R9
ADDL 16(R9),SI
ADCL 20(R9),DX
ADCL 24(R9),CX
ADCL 28(R9),R8
MOVL SI,0(DI)
MOVL DX,4(DI)
MOVL CX,8(DI)
MOVL R8,12(DI)
MOVQ 32(SP),R11
MOVQ 40(SP),R12
MOVQ 48(SP),R13
MOVQ 56(SP),R14
MOVQ 64(SP),R15
MOVQ 72(SP),BX
MOVQ 80(SP),BP
MOVQ R11,SP
MOVOU 0(AX), X0
MOVOU 16(AX), X1
MOVOU poly1305Mask<>(SB), X2
PAND X2, X0
MOVO X0, 0(SP)
MOVO X1, 16(SP)
XORQ R8, R8 // h0
XORQ R9, R9 // h1
XORQ R10, R10 // h2
MOVQ 0(SP), R11 // r0
MOVQ 8(SP), R12 // r1
CMPQ R15, $16
JB bytes_between_0_and_15
loop:
POLY1305_ADD(SI, R8, R9, R10)
multiply:
POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
SUBQ $16, R15
CMPQ R15, $16
JAE loop
bytes_between_0_and_15:
TESTQ R15, R15
JZ done
MOVQ $1, BX
XORQ CX, CX
XORQ R13, R13
ADDQ R15, SI
flush_buffer:
SHLQ $8, BX, CX
SHLQ $8, BX
MOVB -1(SI), R13
XORQ R13, BX
DECQ SI
DECQ R15
JNZ flush_buffer
ADDQ BX, R8
ADCQ CX, R9
ADCQ $0, R10
MOVQ $16, R15
JMP multiply
done:
MOVQ R8, AX
MOVQ R9, BX
SUBQ $0xFFFFFFFFFFFFFFFB, AX
SBBQ $0xFFFFFFFFFFFFFFFF, BX
CMOVQCS R8, AX
CMOVQCS R9, BX
ADDQ 16(SP), AX
ADCQ 24(SP), BX
MOVQ BP, SP
MOVQ AX, 0(DI)
MOVQ BX, 8(DI)
RET