diff --git a/blake2b/_asm/standard/blake2b_amd64_asm.go b/blake2b/_asm/standard/blake2b_amd64_asm.go new file mode 100644 index 00000000..a34db3fc --- /dev/null +++ b/blake2b/_asm/standard/blake2b_amd64_asm.go @@ -0,0 +1,361 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/blake2b" +) + +//go:generate go run . -out ../../blake2b_amd64.s -pkg blake2b + +const ThatPeskyUnicodeDot = "\u00b7" + +var iv0_DATA_ptr, iv1_DATA_ptr, iv2_DATA_ptr, iv3_DATA_ptr, c40_DATA_ptr, c48_DATA_ptr *Mem + +func main() { + Package("golang.org/x/crypto/blake2b") + ConstraintExpr("amd64,gc,!purego") + hashBlocksSSE4() + Generate() +} + +func SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { + MOVO(v4, t1) + MOVO(v5, v4) + MOVO(t1, v5) + MOVO(v6, t1) + PUNPCKLQDQ(v6, t2) + PUNPCKHQDQ(v7, v6) + PUNPCKHQDQ(t2, v6) + PUNPCKLQDQ(v7, t2) + MOVO(t1, v7) + MOVO(v2, t1) + PUNPCKHQDQ(t2, v7) + PUNPCKLQDQ(v3, t2) + PUNPCKHQDQ(t2, v2) + PUNPCKLQDQ(t1, t2) + PUNPCKHQDQ(t2, v3) +} + +func SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) { + MOVO(v4, t1) + MOVO(v5, v4) + MOVO(t1, v5) + MOVO(v2, t1) + PUNPCKLQDQ(v2, t2) + PUNPCKHQDQ(v3, v2) + PUNPCKHQDQ(t2, v2) + PUNPCKLQDQ(v3, t2) + MOVO(t1, v3) + MOVO(v6, t1) + PUNPCKHQDQ(t2, v3) + PUNPCKLQDQ(v7, t2) + PUNPCKHQDQ(t2, v6) + PUNPCKLQDQ(t1, t2) + PUNPCKHQDQ(t2, v7) +} + +func HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7 VecPhysical, m0, m1, m2, m3 Op, t0, c40, c48 VecPhysical) { + PADDQ(m0, v0) + PADDQ(m1, v1) + PADDQ(v2, v0) + PADDQ(v3, v1) + PXOR(v0, v6) + PXOR(v1, v7) + PSHUFD(Imm(0xB1), v6, v6) + PSHUFD(Imm(0xB1), v7, v7) + PADDQ(v6, v4) + PADDQ(v7, v5) + PXOR(v4, v2) + PXOR(v5, v3) + PSHUFB(c40, v2) + PSHUFB(c40, v3) + PADDQ(m2, v0) + PADDQ(m3, v1) + PADDQ(v2, v0) + PADDQ(v3, v1) + PXOR(v0, v6) + PXOR(v1, v7) + PSHUFB(c48, v6) + PSHUFB(c48, v7) + PADDQ(v6, v4) + PADDQ(v7, v5) + PXOR(v4, v2) + PXOR(v5, v3) + MOVOU(v2, t0) + PADDQ(v2, t0) + PSRLQ(Imm(63), v2) + PXOR(t0, v2) + MOVOU(v3, t0) + PADDQ(v3, t0) + PSRLQ(Imm(63), v3) + PXOR(t0, v3) +} + +func LOAD_MSG(m0, m1, m2, m3 VecPhysical, src GPPhysical, i0, i1, i2, i3, i4, i5, i6, i7 int) { + MOVQ(Mem{Base: src}.Offset(i0*8), m0) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i1*8), m0) + MOVQ(Mem{Base: src}.Offset(i2*8), m1) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i3*8), m1) + MOVQ(Mem{Base: src}.Offset(i4*8), m2) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i5*8), m2) + MOVQ(Mem{Base: src}.Offset(i6*8), m3) + PINSRQ(Imm(1), Mem{Base: src}.Offset(i7*8), m3) +} + +func hashBlocksSSE4() { + Implement("hashBlocksSSE4") + Attributes(4) + AllocLocal(288) // frame size = 272 + 16 byte alignment + + Load(Param("h"), RAX) + Load(Param("c"), RBX) + Load(Param("flag"), RCX) + Load(Param("blocks").Base(), RSI) + Load(Param("blocks").Len(), RDI) + + MOVQ(RSP, R10) + ADDQ(Imm(15), R10) + ANDQ(I32(-16), R10) + + iv3 := iv3_DATA() + MOVOU(iv3, X0) + MOVO(X0, Mem{Base: R10}.Offset(0)) + XORQ(RCX, Mem{Base: R10}.Offset(0)) // 0(R10) = ·iv3 ^ (CX || 0) + + c40 := c40_DATA() + c48 := c48_DATA() + MOVOU(c40, X13) + MOVOU(c48, X14) + + MOVOU(Mem{Base: AX}.Offset(0), X12) + MOVOU(Mem{Base: AX}.Offset(16), X15) + + MOVQ(Mem{Base: BX}.Offset(0), R8) + MOVQ(Mem{Base: BX}.Offset(8), R9) + + Label("loop") + ADDQ(Imm(128), R8) + CMPQ(R8, Imm(128)) + JGE(LabelRef("noinc")) + INCQ(R9) + + Label("noinc") + MOVQ(R8, X8) + PINSRQ(Imm(1), R9, X8) + + iv0 := iv0_DATA() + iv1 := iv1_DATA() + iv2 := iv2_DATA() + + MOVO(X12, X0) + MOVO(X15, X1) + MOVOU(Mem{Base: AX}.Offset(32), X2) + MOVOU(Mem{Base: AX}.Offset(48), X3) + MOVOU(iv0, X4) + MOVOU(iv1, X5) + MOVOU(iv2, X6) + + PXOR(X8, X6) + MOVO(Mem{Base: R10}.Offset(0), X7) + + LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7) + MOVO(X8, Mem{Base: R10}.Offset(16)) + MOVO(X9, Mem{Base: R10}.Offset(32)) + MOVO(X10, Mem{Base: R10}.Offset(48)) + MOVO(X11, Mem{Base: R10}.Offset(64)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15) + MOVO(X8, Mem{Base: R10}.Offset(80)) + MOVO(X9, Mem{Base: R10}.Offset(96)) + MOVO(X10, Mem{Base: R10}.Offset(112)) + MOVO(X11, Mem{Base: R10}.Offset(128)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6) + MOVO(X8, Mem{Base: R10}.Offset(144)) + MOVO(X9, Mem{Base: R10}.Offset(160)) + MOVO(X10, Mem{Base: R10}.Offset(176)) + MOVO(X11, Mem{Base: R10}.Offset(192)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3) + MOVO(X8, Mem{Base: R10}.Offset(208)) + MOVO(X9, Mem{Base: R10}.Offset(224)) + MOVO(X10, Mem{Base: R10}.Offset(240)) + MOVO(X11, Mem{Base: R10}.Offset(256)) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(16), Mem{Base: R10}.Offset(32), Mem{Base: R10}.Offset(48), Mem{Base: R10}.Offset(64), X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(80), Mem{Base: R10}.Offset(96), Mem{Base: R10}.Offset(112), Mem{Base: R10}.Offset(128), X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(144), Mem{Base: R10}.Offset(160), Mem{Base: R10}.Offset(176), Mem{Base: R10}.Offset(192), X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, Mem{Base: R10}.Offset(208), Mem{Base: R10}.Offset(224), Mem{Base: R10}.Offset(240), Mem{Base: R10}.Offset(256), X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + MOVOU(Mem{Base: AX}.Offset(32), X10) + MOVOU(Mem{Base: AX}.Offset(48), X11) + PXOR(X0, X12) + PXOR(X1, X15) + PXOR(X2, X10) + PXOR(X3, X11) + PXOR(X4, X12) + PXOR(X5, X15) + PXOR(X6, X10) + PXOR(X7, X11) + MOVOU(X10, Mem{Base: AX}.Offset(32)) + MOVOU(X11, Mem{Base: AX}.Offset(48)) + + LEAQ(Mem{Base: SI}.Offset(128), RSI) + SUBQ(Imm(128), RDI) + JNE(LabelRef("loop")) + + MOVOU(X12, Mem{Base: AX}.Offset(0)) + MOVOU(X15, Mem{Base: AX}.Offset(16)) + + MOVQ(R8, Mem{Base: BX}.Offset(0)) + MOVQ(R9, Mem{Base: BX}.Offset(8)) + + RET() +} + +// #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +func iv0_DATA() Mem { + if iv0_DATA_ptr != nil { + return *iv0_DATA_ptr + } + + iv0 := GLOBL(ThatPeskyUnicodeDot+"iv0", NOPTR|RODATA) + iv0_DATA_ptr = &iv0 + DATA(0x00, U64(0x6a09e667f3bcc908)) + DATA(0x08, U64(0xbb67ae8584caa73b)) + return iv0 +} + +func iv1_DATA() Mem { + if iv1_DATA_ptr != nil { + return *iv1_DATA_ptr + } + + iv1 := GLOBL(ThatPeskyUnicodeDot+"iv1", NOPTR|RODATA) + iv1_DATA_ptr = &iv1 + DATA(0x00, U64(0x3c6ef372fe94f82b)) + DATA(0x08, U64(0xa54ff53a5f1d36f1)) + return iv1 +} + +func iv2_DATA() Mem { + if iv2_DATA_ptr != nil { + return *iv2_DATA_ptr + } + + iv2 := GLOBL(ThatPeskyUnicodeDot+"iv2", NOPTR|RODATA) + iv2_DATA_ptr = &iv2 + DATA(0x00, U64(0x510e527fade682d1)) + DATA(0x08, U64(0x9b05688c2b3e6c1f)) + return iv2 +} + +func iv3_DATA() Mem { + if iv3_DATA_ptr != nil { + return *iv3_DATA_ptr + } + + iv3 := GLOBL(ThatPeskyUnicodeDot+"iv3", NOPTR|RODATA) + iv3_DATA_ptr = &iv3 + DATA(0x00, U64(0x1f83d9abfb41bd6b)) + DATA(0x08, U64(0x5be0cd19137e2179)) + return iv3 +} + +func c40_DATA() Mem { + if c40_DATA_ptr != nil { + return *c40_DATA_ptr + } + + c40 := GLOBL(ThatPeskyUnicodeDot+"c40", NOPTR|RODATA) + c40_DATA_ptr = &c40 + DATA(0x00, U64(0x0201000706050403)) + DATA(0x08, U64(0x0a09080f0e0d0c0b)) + return c40 +} + +func c48_DATA() Mem { + if c48_DATA_ptr != nil { + return *c48_DATA_ptr + } + + c48 := GLOBL(ThatPeskyUnicodeDot+"c48", NOPTR|RODATA) + c48_DATA_ptr = &c48 + DATA(0x00, U64(0x0100070605040302)) + DATA(0x08, U64(0x09080f0e0d0c0b0a)) + return c48 +} diff --git a/blake2b/_asm/standard/go.mod b/blake2b/_asm/standard/go.mod new file mode 100644 index 00000000..8063f1b9 --- /dev/null +++ b/blake2b/_asm/standard/go.mod @@ -0,0 +1,15 @@ +module blake2b/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/blake2b/_asm/standard/go.sum b/blake2b/_asm/standard/go.sum new file mode 100644 index 00000000..62ea9dfb --- /dev/null +++ b/blake2b/_asm/standard/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/blake2b/blake2b_amd64.s b/blake2b/blake2b_amd64.s index adfac00c..9a0ce212 100644 --- a/blake2b/blake2b_amd64.s +++ b/blake2b/blake2b_amd64.s @@ -1,278 +1,1441 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2b_amd64_asm.go -out ../../blake2b_amd64.s -pkg blake2b. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b -DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b -DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 -GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16 - -DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 - -#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v6, t1; \ - PUNPCKLQDQ v6, t2; \ - PUNPCKHQDQ v7, v6; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ v7, t2; \ - MOVO t1, v7; \ - MOVO v2, t1; \ - PUNPCKHQDQ t2, v7; \ - PUNPCKLQDQ v3, t2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v3 - -#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v2, t1; \ - PUNPCKLQDQ v2, t2; \ - PUNPCKHQDQ v3, v2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ v3, t2; \ - MOVO t1, v3; \ - MOVO v6, t1; \ - PUNPCKHQDQ t2, v3; \ - PUNPCKLQDQ v7, t2; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v7 - -#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ - PADDQ m0, v0; \ - PADDQ m1, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFD $0xB1, v6, v6; \ - PSHUFD $0xB1, v7, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - PSHUFB c40, v2; \ - PSHUFB c40, v3; \ - PADDQ m2, v0; \ - PADDQ m3, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFB c48, v6; \ - PSHUFB c48, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - MOVOU v2, t0; \ - PADDQ v2, t0; \ - PSRLQ $63, v2; \ - PXOR t0, v2; \ - MOVOU v3, t0; \ - PADDQ v3, t0; \ - PSRLQ $63, v3; \ - PXOR t0, v3 - -#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \ - MOVQ i0*8(src), m0; \ - PINSRQ $1, i1*8(src), m0; \ - MOVQ i2*8(src), m1; \ - PINSRQ $1, i3*8(src), m1; \ - MOVQ i4*8(src), m2; \ - PINSRQ $1, i5*8(src), m2; \ - MOVQ i6*8(src), m3; \ - PINSRQ $1, i7*8(src), m3 - // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, R10 - ADDQ $15, R10 - ANDQ $~15, R10 - - MOVOU ·iv3<>(SB), X0 - MOVO X0, 0(R10) - XORQ CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0) - - MOVOU ·c40<>(SB), X13 - MOVOU ·c48<>(SB), X14 - - MOVOU 0(AX), X12 +// Requires: SSE2, SSE4.1, SSSE3 +TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, R10 + ADDQ $0x0f, R10 + ANDQ $-16, R10 + MOVOU ·iv3<>+0(SB), X0 + MOVO X0, (R10) + XORQ CX, (R10) + MOVOU ·c40<>+0(SB), X13 + MOVOU ·c48<>+0(SB), X14 + MOVOU (AX), X12 MOVOU 16(AX), X15 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 + MOVQ (BX), R8 + MOVQ 8(BX), R9 loop: - ADDQ $128, R8 - CMPQ R8, $128 + ADDQ $0x80, R8 + CMPQ R8, $0x80 JGE noinc INCQ R9 noinc: - MOVQ R8, X8 - PINSRQ $1, R9, X8 - - MOVO X12, X0 - MOVO X15, X1 - MOVOU 32(AX), X2 - MOVOU 48(AX), X3 - MOVOU ·iv0<>(SB), X4 - MOVOU ·iv1<>(SB), X5 - MOVOU ·iv2<>(SB), X6 - - PXOR X8, X6 - MOVO 0(R10), X7 - - LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7) - MOVO X8, 16(R10) - MOVO X9, 32(R10) - MOVO X10, 48(R10) - MOVO X11, 64(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15) - MOVO X8, 80(R10) - MOVO X9, 96(R10) - MOVO X10, 112(R10) - MOVO X11, 128(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6) - MOVO X8, 144(R10) - MOVO X9, 160(R10) - MOVO X10, 176(R10) - MOVO X11, 192(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3) - MOVO X8, 208(R10) - MOVO X9, 224(R10) - MOVO X10, 240(R10) - MOVO X11, 256(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - MOVOU 32(AX), X10 - MOVOU 48(AX), X11 - PXOR X0, X12 - PXOR X1, X15 - PXOR X2, X10 - PXOR X3, X11 - PXOR X4, X12 - PXOR X5, X15 - PXOR X6, X10 - PXOR X7, X11 - MOVOU X10, 32(AX) - MOVOU X11, 48(AX) - - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop - - MOVOU X12, 0(AX) - MOVOU X15, 16(AX) - - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) - + MOVQ R8, X8 + PINSRQ $0x01, R9, X8 + MOVO X12, X0 + MOVO X15, X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU ·iv0<>+0(SB), X4 + MOVOU ·iv1<>+0(SB), X5 + MOVOU ·iv2<>+0(SB), X6 + PXOR X8, X6 + MOVO (R10), X7 + MOVQ (SI), X8 + PINSRQ $0x01, 16(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 48(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 40(SI), X11 + PINSRQ $0x01, 56(SI), X11 + MOVO X8, 16(R10) + MOVO X9, 32(R10) + MOVO X10, 48(R10) + MOVO X11, 64(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 64(SI), X8 + PINSRQ $0x01, 80(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 112(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 88(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 120(SI), X11 + MOVO X8, 80(R10) + MOVO X9, 96(R10) + MOVO X10, 112(R10) + MOVO X11, 128(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 112(SI), X8 + PINSRQ $0x01, 32(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 80(SI), X10 + PINSRQ $0x01, 64(SI), X10 + MOVQ 120(SI), X11 + PINSRQ $0x01, 48(SI), X11 + MOVO X8, 144(R10) + MOVO X9, 160(R10) + MOVO X10, 176(R10) + MOVO X11, 192(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 8(SI), X8 + PINSRQ $0x01, (SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, 40(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 16(SI), X10 + MOVQ 56(SI), X11 + PINSRQ $0x01, 24(SI), X11 + MOVO X8, 208(R10) + MOVO X9, 224(R10) + MOVO X10, 240(R10) + MOVO X11, 256(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 88(SI), X8 + PINSRQ $0x01, 96(SI), X8 + MOVQ 40(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 64(SI), X10 + PINSRQ $0x01, (SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 80(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 72(SI), X9 + MOVQ 112(SI), X10 + PINSRQ $0x01, 48(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 32(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 56(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 104(SI), X9 + PINSRQ $0x01, 88(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 8(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, 112(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 16(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 48(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ (SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 72(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 16(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 120(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 112(SI), X8 + PINSRQ $0x01, 88(SI), X8 + MOVQ 48(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 96(SI), X10 + MOVQ 64(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 16(SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ (SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ 88(SI), X11 + PINSRQ $0x01, 24(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 32(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 120(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 104(SI), X10 + PINSRQ $0x01, 40(SI), X10 + MOVQ 112(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 96(SI), X8 + PINSRQ $0x01, 8(SI), X8 + MOVQ 112(SI), X9 + PINSRQ $0x01, 32(SI), X9 + MOVQ 40(SI), X10 + PINSRQ $0x01, 120(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ (SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 56(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 88(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 104(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 40(SI), X8 + PINSRQ $0x01, 120(SI), X8 + MOVQ 64(SI), X9 + PINSRQ $0x01, 16(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 48(SI), X8 + PINSRQ $0x01, 112(SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, (SI), X9 + MOVQ 120(SI), X10 + PINSRQ $0x01, 72(SI), X10 + MOVQ 24(SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 96(SI), X8 + PINSRQ $0x01, 104(SI), X8 + MOVQ 8(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 80(SI), X8 + PINSRQ $0x01, 64(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 120(SI), X8 + PINSRQ $0x01, 72(SI), X8 + MOVQ 24(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, (SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 16(R10), X0 + PADDQ 32(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 48(R10), X0 + PADDQ 64(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 80(R10), X0 + PADDQ 96(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 112(R10), X0 + PADDQ 128(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 144(R10), X0 + PADDQ 160(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 176(R10), X0 + PADDQ 192(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 208(R10), X0 + PADDQ 224(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 240(R10), X0 + PADDQ 256(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU 32(AX), X10 + MOVOU 48(AX), X11 + PXOR X0, X12 + PXOR X1, X15 + PXOR X2, X10 + PXOR X3, X11 + PXOR X4, X12 + PXOR X5, X15 + PXOR X6, X10 + PXOR X7, X11 + MOVOU X10, 32(AX) + MOVOU X11, 48(AX) + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + MOVOU X12, (AX) + MOVOU X15, 16(AX) + MOVQ R8, (BX) + MOVQ R9, 8(BX) RET + +DATA ·iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b +DATA ·iv3<>+8(SB)/8, $0x5be0cd19137e2179 +GLOBL ·iv3<>(SB), RODATA|NOPTR, $16 + +DATA ·c40<>+0(SB)/8, $0x0201000706050403 +DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), RODATA|NOPTR, $16 + +DATA ·c48<>+0(SB)/8, $0x0100070605040302 +DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), RODATA|NOPTR, $16 + +DATA ·iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·iv0<>+8(SB)/8, $0xbb67ae8584caa73b +GLOBL ·iv0<>(SB), RODATA|NOPTR, $16 + +DATA ·iv1<>+0(SB)/8, $0x3c6ef372fe94f82b +DATA ·iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·iv1<>(SB), RODATA|NOPTR, $16 + +DATA ·iv2<>+0(SB)/8, $0x510e527fade682d1 +DATA ·iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·iv2<>(SB), RODATA|NOPTR, $16