diff --git a/Makefile b/Makefile index 5d2a54b..833ea1a 100755 --- a/Makefile +++ b/Makefile @@ -77,7 +77,7 @@ else CFLAGS= $(EXTRA_CFLAGS) endif CFLAGS+= $(VALGRIND_CFLAGS) -CFLAGS+= -std=gnu11 -Wall $(ADDITIONAL_SETTINGS) -D $(ARCHITECTURE) -D __NIX__ -D $(USE_OPT_LEVEL) $(MULX) $(ADX) +CFLAGS+= -std=gnu11 -Wall $(ADDITIONAL_SETTINGS) -D $(ARCHITECTURE) -D __NIX__ -D $(USE_OPT_LEVEL) $(MULX) $(ADX) -Wno-missing-braces LDFLAGS=-lm ifeq "$(USE_OPT_LEVEL)" "_GENERIC_" EXTRA_OBJECTS_434=objs434/fp_generic.o @@ -106,7 +106,7 @@ OBJECTS_503_COMP=objs503comp/P503_compressed.o $(EXTRA_OBJECTS_503) objs/random. OBJECTS_610_COMP=objs610comp/P610_compressed.o $(EXTRA_OBJECTS_610) objs/random.o objs/fips202.o OBJECTS_751_COMP=objs751comp/P751_compressed.o $(EXTRA_OBJECTS_751) objs/random.o objs/fips202.o -all: lib434 lib503 lib610 lib751 lib434comp lib503comp lib610comp lib751comp tests KATS +all: lib434 lib503 lib610 lib751 lib434comp lib503comp lib610comp lib751comp tests_p434 tests_p503 tests_p610 tests_p751 objs434/%.o: src/P434/%.c @mkdir -p $(@D) @@ -259,28 +259,6 @@ lib751comp: $(OBJECTS_751_COMP) $(AR) lib751comp/libsidh.a $^ $(RANLIB) lib751comp/libsidh.a -tests: lib434 lib434comp lib503 lib503comp lib610 lib610comp lib751 lib751comp - $(CC) $(CFLAGS) -L./lib434 tests/arith_tests-p434.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p434 $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib503 tests/arith_tests-p503.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p503 $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib610 tests/arith_tests-p610.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p610 $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib751 tests/arith_tests-p751.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p751 $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib434 tests/test_SIDHp434.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh434/test_SIDH $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib503 tests/test_SIDHp503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh503/test_SIDH 
$(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib610 tests/test_SIDHp610.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh610/test_SIDH $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib751 tests/test_SIDHp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh751/test_SIDH $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib434 tests/test_SIKEp434.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike434/test_SIKE $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib503 tests/test_SIKEp503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike503/test_SIKE $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib610 tests/test_SIKEp610.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike610/test_SIKE $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib751 tests/test_SIKEp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751/test_SIKE $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib434comp tests/test_SIDHp434_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh434_compressed/test_SIDH $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib503comp tests/test_SIDHp503_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh503_compressed/test_SIDH $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib610comp tests/test_SIDHp610_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh610_compressed/test_SIDH $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib751comp tests/test_SIDHp751_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh751_compressed/test_SIDH $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib434comp tests/test_SIKEp434_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike434_compressed/test_SIKE $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib503comp tests/test_SIKEp503_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike503_compressed/test_SIKE $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib610comp tests/test_SIKEp610_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike610_compressed/test_SIKE $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib751comp tests/test_SIKEp751_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751_compressed/test_SIKE $(ARM_SETTING) - # AES 
AES_OBJS=objs/aes.o objs/aes_c.o @@ -320,17 +298,40 @@ lib751comp_for_KATs: $(OBJECTS_751_COMP) $(AES_OBJS) $(AR) lib751comp/libsidh_for_testing.a $^ $(RANLIB) lib751comp/libsidh_for_testing.a -KATS: lib434_for_KATs lib503_for_KATs lib610_for_KATs lib751_for_KATs lib434comp_for_KATs lib503comp_for_KATs lib610comp_for_KATs lib751comp_for_KATs +tests_p434: lib434 lib434comp lib434_for_KATs lib434comp_for_KATs + $(CC) $(CFLAGS) -L./lib434 tests/arith_tests-p434.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p434 $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib434 tests/test_SIDHp434.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh434/test_SIDH $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib434 tests/test_SIKEp434.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike434/test_SIKE $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib434comp tests/test_SIDHp434_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh434_compressed/test_SIDH $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib434comp tests/test_SIKEp434_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike434_compressed/test_SIKE $(ARM_SETTING) $(CC) $(CFLAGS) -L./lib434 tests/PQCtestKAT_kem434.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike434/PQCtestKAT_kem $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib503 tests/PQCtestKAT_kem503.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike503/PQCtestKAT_kem $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib610 tests/PQCtestKAT_kem610.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike610/PQCtestKAT_kem $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib751 tests/PQCtestKAT_kem751.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike751/PQCtestKAT_kem $(ARM_SETTING) $(CC) $(CFLAGS) -L./lib434comp tests/PQCtestKAT_kem434_compressed.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike434_compressed/PQCtestKAT_kem $(ARM_SETTING) +tests_p503: lib503 lib503comp lib503_for_KATs lib503comp_for_KATs + $(CC) $(CFLAGS) -L./lib503 tests/arith_tests-p503.c tests/test_extras.c -lsidh $(LDFLAGS) -o 
arith_tests-p503 $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib503 tests/test_SIDHp503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh503/test_SIDH $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib503 tests/test_SIKEp503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike503/test_SIKE $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib503comp tests/test_SIDHp503_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh503_compressed/test_SIDH $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib503comp tests/test_SIKEp503_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike503_compressed/test_SIKE $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib503 tests/PQCtestKAT_kem503.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike503/PQCtestKAT_kem $(ARM_SETTING) $(CC) $(CFLAGS) -L./lib503comp tests/PQCtestKAT_kem503_compressed.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike503_compressed/PQCtestKAT_kem $(ARM_SETTING) +tests_p610: lib610 lib610comp lib610_for_KATs lib610comp_for_KATs + $(CC) $(CFLAGS) -L./lib610 tests/arith_tests-p610.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p610 $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib610 tests/test_SIDHp610.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh610/test_SIDH $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib610 tests/test_SIKEp610.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike610/test_SIKE $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib610comp tests/test_SIDHp610_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh610_compressed/test_SIDH $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib610comp tests/test_SIKEp610_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike610_compressed/test_SIKE $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib610 tests/PQCtestKAT_kem610.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike610/PQCtestKAT_kem $(ARM_SETTING) $(CC) $(CFLAGS) -L./lib610comp tests/PQCtestKAT_kem610_compressed.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike610_compressed/PQCtestKAT_kem $(ARM_SETTING) - $(CC) $(CFLAGS) -L./lib751comp 
tests/PQCtestKAT_kem751_compressed.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike751_compressed/PQCtestKAT_kem $(ARM_SETTING) +tests_p751: lib751 lib751comp lib751_for_KATs lib751comp_for_KATs + $(CC) $(CFLAGS) -L./lib751 tests/arith_tests-p751.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p751 $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib751 tests/test_SIDHp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh751/test_SIDH $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib751 tests/test_SIKEp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751/test_SIKE $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib751comp tests/test_SIDHp751_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh751_compressed/test_SIDH $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib751comp tests/test_SIKEp751_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751_compressed/test_SIKE $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib751 tests/PQCtestKAT_kem751.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike751/PQCtestKAT_kem $(ARM_SETTING) + $(CC) $(CFLAGS) -L./lib751comp tests/PQCtestKAT_kem751_compressed.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike751_compressed/PQCtestKAT_kem $(ARM_SETTING) -check: tests +check: tests_p434 tests_p503 tests_p610 tests_p751 test434: ifeq "$(DO_VALGRIND_CHECK)" "TRUE" diff --git a/README.md b/README.md index be4dfa0..9dc86a8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ -# SIDH v3.4 (C Edition) +# SIDH v3.5 (C Edition) The **SIDH** library is an efficient supersingular isogeny-based cryptography library written in C language. -**Version v3.4** of the library includes the ephemeral Diffie-Hellman key exchange scheme "SIDH" [1,2], and the CCA-secure +**Version v3.5** of the library includes the ephemeral Diffie-Hellman key exchange scheme "SIDH" [1,2], and the CCA-secure key encapsulation mechanism "SIKE" [3]. These schemes are conjectured to be secure against quantum computer attacks. 
Concretely, the SIDH library includes the following KEM schemes: @@ -91,9 +91,13 @@ The library was developed by [Microsoft Research](http://research.microsoft.com/ - Memory optimizations for compressed SIDH and compressed SIKE. +## New in Version 3.5 + +- New implementations of the quadratic extension field arithmetic for x64 processors on Linux [13]. + ## Supported Platforms -**SIDH v3.4** is supported on a wide range of platforms including x64, x86, ARM and s390x processors running Windows, +**SIDH v3.5** is supported on a wide range of platforms including x64, x86, ARM and s390x processors running Windows, Linux or Mac OS X. We have tested the library with Microsoft Visual Studio 2015, GNU GCC v5.4, and clang v3.8. See instructions below to choose an implementation option and compile on one of the supported platforms. @@ -121,7 +125,7 @@ optimizations using MULX/ADX. Other options for x64: ```sh -$ make ARCH=x64 CC=[gcc/clang] OPT_LEVEL=[FAST/GENERIC] USE_MULX=[TRUE/FALSE] USE_ADX=[TRUE/FALSE] +$ make tests_pXXX ARCH=x64 CC=[gcc/clang] OPT_LEVEL=[FAST/GENERIC] USE_MULX=[TRUE/FALSE] USE_ADX=[TRUE/FALSE] ``` When `OPT_LEVEL=FAST` (i.e., assembly use enabled), the user is responsible for setting the flags MULX and ADX @@ -129,17 +133,18 @@ according to the targeted platform (for example, MULX/ADX are not supported on S is supported on Haswell, and both MULX and ADX are supported on Broadwell, Skylake and Kaby Lake architectures). Note that USE_ADX can only be set to `TRUE` if `USE_MULX=TRUE`. The option `USE_MULX=FALSE` with `USE_ADX=FALSE` is only supported on p503 and p751. +The use of `tests_pXXX`, for any value XXX in [434,503,610,751], allows compiling only one parameter set at a time.
Options for x86/ARM/M1/s390x: ```sh -$ make ARCH=[x86/ARM/M1/s390x] CC=[gcc/clang] +$ make tests_pXXX ARCH=[x86/ARM/M1/s390x] CC=[gcc/clang] ``` Options for ARM64 or Apple M1: ```sh -$ make ARCH=[ARM64/M1] CC=[gcc/clang] OPT_LEVEL=[FAST/GENERIC] +$ make tests_pXXX ARCH=[ARM64/M1] CC=[gcc/clang] OPT_LEVEL=[FAST/GENERIC] ``` As in the x64 case, `OPT_LEVEL=FAST` enables the use of assembly optimizations on ARMv8 platforms. @@ -209,7 +214,7 @@ The library includes some third party modules that are licensed differently. In - `tests/PQCtestKAT_kem<#>.c`: copyrighted by Lawrence E. Bassham - `src/sha3/fips202.c`: public domain -## Contributors +## Other contributors - Basil Hess. - Geovandro Pereira. @@ -223,8 +228,8 @@ The extended version is available [`here`](http://eprint.iacr.org/2016/413). [2] David Jao and Luca DeFeo, "Towards quantum-resistant cryptosystems from supersingular elliptic curve isogenies". PQCrypto 2011, LNCS 7071, pp. 19-34, 2011. The extended version is available [`here`](https://eprint.iacr.org/2011/506). -[3] Reza Azarderakhsh, Matthew Campagna, Craig Costello, Luca De Feo, Basil Hess, Amir Jalali, David Jao, Brian Koziel, Brian LaMacchia, Patrick Longa, Michael Naehrig, Joost Renes, Vladimir Soukharev, and David Urbanik, "Supersingular Isogeny Key Encapsulation". Submission to the NIST Post-Quantum Standardization project, 2017. -The round 2 submission package is available [`here`](https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-2/submissions/SIKE-Round2.zip). +[3] Reza Azarderakhsh, Matthew Campagna, Craig Costello, Luca De Feo, Basil Hess, Aaron Hutchinson, Amir Jalali, Koray Karabina, David Jao, Brian Koziel, Brian LaMacchia, Patrick Longa, Michael Naehrig, Geovandro Pereira, Joost Renes, Vladimir Soukharev, and David Urbanik, "Supersingular Isogeny Key Encapsulation (SIKE)", 2017. +The specifications document is available [`here`](https://sike.org). 
[4] Craig Costello, and Huseyin Hisil, "A simple and compact algorithm for SIDH with arbitrary degree isogenies". Advances in Cryptology - ASIACRYPT 2017, LNCS 10625, pp. 303-329, 2017. The preprint version is available [`here`](https://eprint.iacr.org/2017/504). @@ -235,10 +240,10 @@ The preprint version is available [`here`](https://eprint.iacr.org/2017/1015). [6] Gora Adj, Daniel Cervantes-Vázquez, Jesús-Javier Chi-Domínguez, Alfred Menezes and Francisco Rodríguez-Henríquez, "On the cost of computing isogenies between supersingular elliptic curves". SAC 2018, LCNS 11349, pp. 322-343, 2018. The preprint version is available [`here`](https://eprint.iacr.org/2018/313). -[7] Samuel Jaques and John M. Schanck, "Quantum cryptanalysis in the RAM model: Claw-finding attacks on SIKE". Advances in Cryptology - CRYPTO 2019 (to appear), 2019. +[7] Samuel Jaques and John M. Schanck, "Quantum cryptanalysis in the RAM model: Claw-finding attacks on SIKE". Advances in Cryptology - CRYPTO 2019, 2019. The preprint version is available [`here`](https://eprint.iacr.org/2019/103). -[8] Craig Costello, Patrick Longa, Michael Naehrig, Joost Renes and Fernando Virdia, "Improved Classical Cryptanalysis of the Computational Supersingular Isogeny Problem", 2019. +[8] Craig Costello, Patrick Longa, Michael Naehrig, Joost Renes and Fernando Virdia, "Improved classical cryptanalysis of the computational supersingular isogeny problem". PKC 2020, LNCS 12111, pp. 505-534, 2020. The preprint version is available [`here`](https://eprint.iacr.org/2019/298). [9] Craig Costello, David Jao, Patrick Longa, Michael Naehrig, Joost Renes and David Urbanik, "Efficient compression of SIDH public keys". Advances in Cryptology - EUROCRYPT 2017, LNCS 10210, pp. 679-706, 2017. @@ -247,12 +252,14 @@ The preprint version is available [`here`](https://eprint.iacr.org/2016/963). [10] Gustavo H.M. Zanon, Marcos A. Simplicio Jr, Geovandro C.C.F. Pereira, Javad Doliskani and Paulo S.L.M.
Barreto, "Faster key compression for isogeny-based cryptosystems". IEEE Transactions on Computers, Vol. 68(5), 2019. The preprint version is available [`here`](https://eprint.iacr.org/2017/1143). -[11] Michael Naehrig and Joost Renes, "Dual Isogenies and Their Application to Public-key Compression for Isogeny-based Cryptography". Advances in Cryptology - ASIACRYPT 2019, LNCS 11922, pp. 243-272, 2019. +[11] Michael Naehrig and Joost Renes, "Dual isogenies and their application to public-key compression for isogeny-based cryptography". Advances in Cryptology - ASIACRYPT 2019, LNCS 11922, pp. 243-272, 2019. The preprint version is available [`here`](https://eprint.iacr.org/2019/499). -[12] Geovandro C.C.F. Pereira, Javad Doliskani and David Jao, "x-only point addition formula and faster torsion basis generation in compressed SIKE". +[12] Geovandro C.C.F. Pereira, Javad Doliskani and David Jao, "x-only point addition formula and faster torsion basis generation in compressed SIKE". JCEN, Vol. 11, pp. 57-69, 2021. The preprint version is available [`here`](https://eprint.iacr.org/2020/431). +[13] Patrick Longa, "Efficient algorithms for large prime characteristic fields and their application to bilinear pairings and supersingular isogeny-based protocols", 2022. + # Contributing This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 
diff --git a/src/P434/AMD64/fp_x64.c b/src/P434/AMD64/fp_x64.c index 5cb92a7..16852fb 100644 --- a/src/P434/AMD64/fp_x64.c +++ b/src/P434/AMD64/fp_x64.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: modular arithmetic optimized for x64 platforms for P434 *********************************************************************************************/ @@ -17,7 +21,7 @@ extern const uint64_t p434x4[NWORDS_FIELD]; inline void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. -#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610) +#if (OS_TARGET == OS_WIN) unsigned int i, borrow = 0; for (i = 0; i < NWORDS_FIELD; i++) { @@ -39,7 +43,7 @@ inline void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c) inline void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. 
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610) +#if (OS_TARGET == OS_WIN) unsigned int i, borrow = 0; for (i = 0; i < NWORDS_FIELD; i++) { @@ -50,11 +54,6 @@ inline void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c) for (i = 0; i < NWORDS_FIELD; i++) { ADDC(borrow, c[i], ((digit_t*)p434x4)[i], borrow, c[i]); } - -#elif (OS_TARGET == OS_NIX) - - mp_sub434_p4_asm(a, b, c); - #endif } @@ -161,13 +160,42 @@ void fpcorrection434(digit_t* a) } } +#if (OS_TARGET == OS_NIX) + +void fp2mul434_c0_mont(const digit_t* a, const digit_t* b, digit_t* c) +{ + fp2mul434_c0_asm(a, b, c); +} + + +void fp2mul434_c1_mont(const digit_t* a, const digit_t* b, digit_t* c) +{ + fp2mul434_c1_asm(a, b, c); +} + + +void fp2sqr434_c0_mont(const digit_t* a, digit_t* c) +{ + fp2sqr434_c0_asm(a, c); +} + + +void fp2sqr434_c1_mont(const digit_t* a, digit_t* c) +{ + fp2sqr434_c1_asm(a, c); +} + +void fpmul434(const digit_t* a, const digit_t* b, digit_t* c) +{ + fpmul434_asm(a, b, c); +} + +#else void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) { // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords. UNREFERENCED_PARAMETER(nwords); - -#if (OS_TARGET == OS_WIN) digit_t t = 0; uint128_t uv = {0}; unsigned int carry = 0; @@ -330,12 +358,6 @@ void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int n MULADD128(a[6], b[6], uv, carry, uv); c[12] = uv[0]; c[13] = uv[1]; - -#elif (OS_TARGET == OS_NIX) - - mul434_asm(a, b, c); - -#endif } @@ -343,9 +365,7 @@ void rdc_mont(digit_t* ma, digit_t* mc) { // Montgomery reduction exploiting special form of the prime. // mc = ma*R^-1 mod p434x2, where R = 2^448. // If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. - // ma is assumed to be in Montgomery representation. - -#if (OS_TARGET == OS_WIN) + // ma is assumed to be in Montgomery representation. 
unsigned int carry; digit_t t = 0; uint128_t uv = {0}; @@ -478,11 +498,7 @@ void rdc_mont(digit_t* ma, digit_t* mc) MULADD128(mc[6], ((digit_t*)p434p1)[6], uv, carry, uv); t += carry; ADDC(0, uv[0], ma[12], carry, mc[5]); - ADDC(carry, uv[1], ma[13], carry, mc[6]); - -#elif (OS_TARGET == OS_NIX) - - rdc434_asm(ma, mc); + ADDC(carry, uv[1], ma[13], carry, mc[6]); +} -#endif -} \ No newline at end of file +#endif \ No newline at end of file diff --git a/src/P434/AMD64/fp_x64_asm.S b/src/P434/AMD64/fp_x64_asm.S index 5cea37c..86be311 100644 --- a/src/P434/AMD64/fp_x64_asm.S +++ b/src/P434/AMD64/fp_x64_asm.S @@ -1,1020 +1,813 @@ //******************************************************************************************* // SIDH: an efficient supersingular isogeny cryptography library +// Copyright (c) Microsoft Corporation +// +// Website: https://github.com/microsoft/PQCrypto-SIDH +// Released under MIT license // // Abstract: field arithmetic in x64 assembly for P434 on Linux -//******************************************************************************************* - -.intel_syntax noprefix - -// Format function and variable names for Mac OS X -#if defined(__APPLE__) - #define fmt(f) _##f -#else - #define fmt(f) f -#endif - -// Registers that are used for parameter passing: -#define reg_p1 rdi -#define reg_p2 rsi -#define reg_p3 rdx - -// Define addition instructions -#ifdef _MULX_ -#ifdef _ADX_ - -#define ADD1 adox -#define ADC1 adox -#define ADD2 adcx -#define ADC2 adcx - -#else - -#define ADD1 add -#define ADC1 adc -#define ADD2 add -#define ADC2 adc - -#endif -#endif - - -.text -//*********************************************************************** -// Field addition -// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -//*********************************************************************** -.global fmt(fpadd434_asm) -fmt(fpadd434_asm): - push r12 - push r13 - push r14 - push r15 - push rbx - push rbp - - xor rax, rax - mov r8, [reg_p1] - mov r9, 
[reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - add r8, [reg_p2] - adc r9, [reg_p2+8] - adc r10, [reg_p2+16] - adc r11, [reg_p2+24] - adc r12, [reg_p2+32] - adc r13, [reg_p2+40] - adc r14, [reg_p2+48] - - mov rbx, [rip+fmt(p434x2)] - sub r8, rbx - mov rcx, [rip+fmt(p434x2)+8] - sbb r9, rcx - sbb r10, rcx - mov rdi, [rip+fmt(p434x2)+24] - sbb r11, rdi - mov rsi, [rip+fmt(p434x2)+32] - sbb r12, rsi - mov rbp, [rip+fmt(p434x2)+40] - sbb r13, rbp - mov r15, [rip+fmt(p434x2)+48] - sbb r14, r15 - sbb rax, 0 - - and rbx, rax - and rcx, rax - and rdi, rax - and rsi, rax - and rbp, rax - and r15, rax - - add r8, rbx - adc r9, rcx - adc r10, rcx - adc r11, rdi - adc r12, rsi - adc r13, rbp - adc r14, r15 - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - - pop rbp - pop rbx - pop r15 - pop r14 - pop r13 - pop r12 - ret - - -//*********************************************************************** -// Field subtraction -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] -//*********************************************************************** -.global fmt(fpsub434_asm) -fmt(fpsub434_asm): - push r12 - push r13 - push r14 - - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb r14, [reg_p2+48] - sbb rax, 0 - - mov rcx, [rip+fmt(p434x2)] - mov rdi, [rip+fmt(p434x2)+8] - mov rsi, [rip+fmt(p434x2)+24] - and rcx, rax - and rdi, rax - and rsi, rax - add r8, rcx - adc r9, rdi - adc r10, rdi - adc r11, rsi - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - setc cl - - mov r8, [rip+fmt(p434x2)+32] - mov rdi, 
[rip+fmt(p434x2)+40] - mov rsi, [rip+fmt(p434x2)+48] - and r8, rax - and rdi, rax - and rsi, rax - bt rcx, 0 - adc r12, r8 - adc r13, rdi - adc r14, rsi - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - - pop r14 - pop r13 - pop r12 - ret - - -///////////////////////////////////////////////////////////////// MACRO -.macro SUB434_PX P0 - push r12 - push r13 - - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov rcx, [reg_p1+48] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb rcx, [reg_p2+48] - - mov rax, [rip+\P0] - mov rdi, [rip+\P0+8] - mov rsi, [rip+\P0+24] - add r8, rax - mov rax, [rip+\P0+32] - adc r9, rdi - adc r10, rdi - adc r11, rsi - mov rdi, [rip+\P0+40] - mov rsi, [rip+\P0+48] - adc r12, rax - adc r13, rdi - adc rcx, rsi - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], rcx - - pop r13 - pop r12 - .endm - - -//*********************************************************************** -// Multiprecision subtraction with correction with 2*p434 -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p434 -//*********************************************************************** -.global fmt(mp_sub434_p2_asm) -fmt(mp_sub434_p2_asm): - - SUB434_PX fmt(p434x2) - ret - - -//*********************************************************************** -// Multiprecision subtraction with correction with 4*p434 -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p434 -//*********************************************************************** -.global fmt(mp_sub434_p4_asm) -fmt(mp_sub434_p4_asm): - - SUB434_PX fmt(p434x4) - ret - - -#ifdef _MULX_ - -///////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication -// Inputs: memory 
pointers M0 and M1 -// Outputs: memory pointer C and regs T1, T3, rax -// Temps: regs T0:T6 -///////////////////////////////////////////////////////////////// - -#ifdef _ADX_ -.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - xor rax, rax - adox \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adox \T1, \T3 - - mov rdx, 8\M0 - mulx \T3, \T4, \M1 // T3:T4 = A1*B0 - adox \T2, rax - xor rax, rax - mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1 - adox \T4, \T0 - mov 8\C, \T4 // C1_final - adcx \T3, \T6 - mulx \T6, \T0, 16\M1 // T6:T0 = A1*B2 - adox \T3, \T1 - adcx \T5, \T0 - adcx \T6, rax - adox \T5, \T2 - - mov rdx, 16\M0 - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - adox \T6, rax - xor rax, rax - mulx \T4, \T2, 8\M1 // T4:T2 = A2*B1 - adox \T0, \T3 - mov 16\C, \T0 // C2_final - adcx \T1, \T5 - mulx \T0, \T3, 16\M1 // T0:T3 = A2*B2 - adcx \T4, \T6 - adcx \T0, rax - adox \T1, \T2 - adox \T3, \T4 - adox rax, \T0 -.endm - -///////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication -// Inputs: memory pointers M0 and M1 -// Outputs: memory pointer C -// Temps: regs T0:T9 -///////////////////////////////////////////////////////////////// - -.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - xor rax, rax - adox \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adox \T1, \T3 - mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 - adox \T2, \T4 - - mov rdx, 8\M0 - mulx \T5, \T4, \M1 // T5:T4 = A1*B0 - adox \T3, rax - xor rax, rax - mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 - adox \T4, \T0 - mov 8\C, \T4 // C1_final - adcx \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 - adcx \T6, \T8 - adox \T5, \T1 - mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 - adcx \T7, \T9 - adcx \T8, rax - adox \T6, \T2 - - 
mov rdx, 16\M0 - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - adox \T7, \T3 - adox \T8, rax - xor rax, rax - mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 - adox \T0, \T5 - mov 16\C, \T0 // C2_final - adcx \T1, \T3 - mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 - adcx \T2, \T4 - adox \T1, \T6 - mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 - adcx \T3, \T9 - mov rdx, 24\M0 - adcx \T4, rax - - adox \T2, \T7 - adox \T3, \T8 - adox \T4, rax - - mulx \T5, \T0, \M1 // T5:T0 = A3*B0 - xor rax, rax - mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 - adcx \T5, \T7 - adox \T1, \T0 - mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 - adcx \T6, \T8 - adox \T2, \T5 - mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 - adcx \T7, \T9 - adcx \T8, rax - - adox \T3, \T6 - adox \T4, \T7 - adox \T8, rax - mov 24\C, \T1 // C3_final - mov 32\C, \T2 // C4_final - mov 40\C, \T3 // C5_final - mov 48\C, \T4 // C6_final - mov 56\C, \T8 // C7_final -.endm - -#else - -.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - add \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adc \T1, \T3 - - mov rdx, 8\M0 - mulx \T3, \T4, \M1 // T3:T4 = A1*B0 - adc \T2, 0 - mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1 - add \T4, \T0 - mov 8\C, \T4 // C1_final - adc \T3, \T1 - adc \T5, \T2 - mulx \T2, \T1, 16\M1 // T2:T1 = A1*B2 - adc \T2, 0 - - add \T3, \T6 - adc \T5, \T1 - adc \T2, 0 - - mov rdx, 16\M0 - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - add \T0, \T3 - mov 16\C, \T0 // C2_final - mulx \T4, \T6, 8\M1 // T4:T6 = A2*B1 - adc \T1, \T5 - adc \T2, \T4 - mulx rax, \T3, 16\M1 // rax:T3 = A2*B2 - adc rax, 0 - add \T1, \T6 - adc \T3, \T2 - adc rax, 0 -.endm - -.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - add \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adc \T1, \T3 - mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 - adc \T2, 
\T4 - mov rdx, 8\M0 - adc \T3, 0 - - mulx \T5, \T4, \M1 // T5:T4 = A1*B0 - mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 - add \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 - adc \T6, \T8 - mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 - adc \T7, \T9 - adc \T8, 0 - - add \T4, \T0 - mov 8\C, \T4 // C1_final - adc \T5, \T1 - adc \T6, \T2 - adc \T7, \T3 - mov rdx, 16\M0 - adc \T8, 0 - - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 - add \T1, \T3 - mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 - adc \T2, \T4 - mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 - adc \T3, \T9 - mov rdx, 24\M0 - adc \T4, 0 - - add \T0, \T5 - mov 16\C, \T0 // C2_final - adc \T1, \T6 - adc \T2, \T7 - adc \T3, \T8 - adc \T4, 0 - - mulx \T5, \T0, \M1 // T5:T0 = A3*B0 - mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 - add \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 - adc \T6, \T8 - mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 - adc \T7, \T9 - adc \T8, 0 - - add \T1, \T0 - mov 24\C, \T1 // C3_final - adc \T2, \T5 - mov 32\C, \T2 // C4_final - adc \T3, \T6 - mov 40\C, \T3 // C5_final - adc \T4, \T7 - mov 48\C, \T4 // C6_final - adc \T8, 0 - mov 56\C, \T8 // C7_final -.endm -#endif - - -//***************************************************************************** -// 434-bit multiplication using Karatsuba (one level), schoolbook (one level) -//***************************************************************************** -.global fmt(mul434_asm) -fmt(mul434_asm): - push r12 - push r13 - push r14 - push r15 - mov rcx, reg_p3 - - // r8-r11 <- AH + AL, rax <- mask - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - push rbx - push rbp - sub rsp, 96 - add r8, [reg_p1+32] - adc r9, [reg_p1+40] - adc r10, [reg_p1+48] - adc r11, 0 - sbb rax, 0 - mov [rsp], r8 - mov [rsp+8], r9 - mov [rsp+16], r10 - mov [rsp+24], r11 - - // r12-r15 <- BH + BL, rbx <- mask - xor rbx, rbx - mov r12, [reg_p2] - mov r13, [reg_p2+8] - mov r14, [reg_p2+16] - mov r15, [reg_p2+24] - add r12, 
[reg_p2+32] - adc r13, [reg_p2+40] - adc r14, [reg_p2+48] - adc r15, 0 - sbb rbx, 0 - mov [rsp+32], r12 - mov [rsp+40], r13 - mov [rsp+48], r14 - mov [rsp+56], r15 - - // r12-r15 <- masked (BH + BL) - and r12, rax - and r13, rax - and r14, rax - and r15, rax - - // r8-r11 <- masked (AH + AL) - and r8, rbx - and r9, rbx - and r10, rbx - and r11, rbx - - // r8-r11 <- masked (AH + AL) + masked (AH + AL) - add r8, r12 - adc r9, r13 - adc r10, r14 - adc r11, r15 - mov [rsp+64], r8 - mov [rsp+72], r9 - mov [rsp+80], r10 - mov [rsp+88], r11 - - // [rsp] <- (AH+AL) x (BH+BL), low part - MUL256_SCHOOL [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp - - // [rcx] <- AL x BL - MUL256_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp // Result C0-C3 - - // [rcx+64], rbx, rbp, rax <- AH x BH - MUL192_SCHOOL [reg_p1+32], [reg_p2+32], [rcx+64], r8, rbx, r10, rbp, r12, r13, r14 - - // r8-r11 <- (AH+AL) x (BH+BL), final step - mov r8, [rsp+64] - mov r9, [rsp+72] - mov r10, [rsp+80] - mov r11, [rsp+88] - mov rdx, [rsp+32] - add r8, rdx - mov rdx, [rsp+40] - adc r9, rdx - mov rdx, [rsp+48] - adc r10, rdx - mov rdx, [rsp+56] - adc r11, rdx - - // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - mov r12, [rsp] - mov r13, [rsp+8] - mov r14, [rsp+16] - mov r15, [rsp+24] - sub r12, [rcx] - sbb r13, [rcx+8] - sbb r14, [rcx+16] - sbb r15, [rcx+24] - sbb r8, [rcx+32] - sbb r9, [rcx+40] - sbb r10, [rcx+48] - sbb r11, [rcx+56] - - // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH - sub r12, [rcx+64] - sbb r13, [rcx+72] - sbb r14, [rcx+80] - sbb r15, rbx - sbb r8, rbp - sbb r9, rax - sbb r10, 0 - sbb r11, 0 - - add r12, [rcx+32] - mov [rcx+32], r12 // Result C4-C7 - adc r13, [rcx+40] - mov [rcx+40], r13 - adc r14, [rcx+48] - mov [rcx+48], r14 - adc r15, [rcx+56] - mov [rcx+56], r15 - adc r8, [rcx+64] - mov [rcx+64], r8 // Result C8-C15 - adc r9, [rcx+72] - mov [rcx+72], r9 - adc r10, [rcx+80] - mov [rcx+80], r10 - adc r11, rbx - mov [rcx+88], r11 - adc 
rbp, 0 - mov [rcx+96], rbp - adc rax, 0 - mov [rcx+104], rax - - add rsp, 96 - pop rbp - pop rbx - pop r15 - pop r14 - pop r13 - pop r12 - ret - -#else - -# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE" - -#endif - - -#ifdef _MULX_ - -///////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication -// Inputs: reg I0 and memory pointer M1 -// Outputs: regs T0:T4 -// Temps: regs T0:T5 -///////////////////////////////////////////////////////////////// -.macro MUL64x256_SCHOOL I0, M1, T0, T1, T2, T3, T4, T5 - mulx \T2, \T4, 8\M1 - xor rax, rax - mulx \T3, \T5, 16\M1 - ADD1 \T1, \T4 // T1 <- C1_final - ADC1 \T2, \T5 // T2 <- C2_final - mulx \T4, \T5, 24\M1 - ADC1 \T3, \T5 // T3 <- C3_final - ADC1 \T4, rax // T4 <- C4_final -.endm - -///////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication -// Inputs: regs I0 and I1, and memory pointer M1 -// Outputs: regs T0:T5 -// Temps: regs T0:T5 -///////////////////////////////////////////////////////////////// - -#ifdef _ADX_ -.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5 - mulx \T2, \T4, 8\M1 - xor rax, rax - mulx \T3, \T5, 16\M1 - ADD1 \T1, \T4 - ADC1 \T2, \T5 - mulx \T4, \T5, 24\M1 - ADC1 \T3, \T5 - ADC1 \T4, rax - - xor rax, rax - mov rdx, \I1 - mulx \I1, \T5, \M1 - ADD2 \T1, \T5 // T1 <- C1_final - ADC2 \T2, \I1 - mulx \T5, \I1, 8\M1 - ADC2 \T3, \T5 - ADD1 \T2, \I1 - mulx \T5, \I1, 16\M1 - ADC2 \T4, \T5 - ADC1 \T3, \I1 - mulx \T5, \I1, 24\M1 - ADC2 \T5, rax - ADC1 \T4, \I1 - ADC1 \T5, rax -.endm - -#else - -.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5 - mulx \T2, \T4, 8\M1 - mulx \T3, \T5, 16\M1 - add \T1, \T4 - adc \T2, \T5 - mulx \T4, \T5, 24\M1 - adc \T3, \T5 - adc \T4, 0 - - mov rdx, \I1 - mulx \I1, \T5, \M1 - add \T1, \T5 // T1 <- C1_final - adc \T2, \I1 - mulx \T5, \I1, 8\M1 - adc \T3, \T5 - mulx \T5, rax, 16\M1 - adc \T4, \T5 - mulx \T5, rdx, 24\M1 - adc \T5, 0 - add \T2, \I1 - adc 
\T3, rax - adc \T4, rdx - adc \T5, 0 -.endm -#endif - - -//************************************************************************************** -// Montgomery reduction -// Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015 -// Operation: c [reg_p2] = a [reg_p1] -//************************************************************************************** -.global fmt(rdc434_asm) -fmt(rdc434_asm): - push r14 - - // a[0-1] x p434p1_nz --> result: r8:r13 - mov rdx, [reg_p1] - mov r14, [reg_p1+8] - mulx r9, r8, [rip+fmt(p434p1)+24] // result r8 - push r12 - push r13 - push r15 - push rbp - push rbx - MUL128x256_SCHOOL rdx, r14, [rip+fmt(p434p1)+24], r8, r9, r10, r11, r12, r13 - - mov rdx, [reg_p1+16] - mov rcx, [reg_p1+72] - add r8, [reg_p1+24] - adc r9, [reg_p1+32] - adc r10, [reg_p1+40] - adc r11, [reg_p1+48] - adc r12, [reg_p1+56] - adc r13, [reg_p1+64] - adc rcx, 0 - mulx rbp, rbx, [rip+fmt(p434p1)+24] // result rbx - mov [reg_p2], r9 - mov [reg_p2+8], r10 - mov [reg_p2+16], r11 - mov [reg_p2+24], r12 - mov [reg_p2+32], r13 - mov r9, [reg_p1+80] - mov r10, [reg_p1+88] - mov r11, [reg_p1+96] - mov rdi, [reg_p1+104] - adc r9, 0 - adc r10, 0 - adc r11, 0 - adc rdi, 0 - - // a[2-3] x p434p1_nz --> result: rbx, rbp, r12:r15 - MUL128x256_SCHOOL rdx, r8, [rip+fmt(p434p1)+24], rbx, rbp, r12, r13, r14, r15 - - mov rdx, [reg_p2] - add rbx, [reg_p2+8] - adc rbp, [reg_p2+16] - adc r12, [reg_p2+24] - adc r13, [reg_p2+32] - adc r14, rcx - mov rcx, 0 - adc r15, r9 - adc rcx, r10 - mulx r9, r8, [rip+fmt(p434p1)+24] // result r8 - mov [reg_p2], rbp - mov [reg_p2+8], r12 - mov [reg_p2+16], r13 - adc r11, 0 - adc rdi, 0 - - // a[4-5] x p434p1_nz --> result: r8:r13 - MUL128x256_SCHOOL rdx, rbx, [rip+fmt(p434p1)+24], r8, r9, r10, rbp, r12, r13 - - mov rdx, [reg_p2] - add r8, [reg_p2+8] - adc r9, [reg_p2+16] - adc r10, r14 - adc rbp, r15 - adc r12, rcx - adc r13, r11 - adc rdi, 0 - mulx r15, r14, [rip+fmt(p434p1)+24] // result r14 - mov [reg_p2], r8 // 
Final result c0-c1 - mov [reg_p2+8], r9 - - // a[6-7] x p434p1_nz --> result: r14:r15, r8:r9, r11 - MUL64x256_SCHOOL rdx, [rip+fmt(p434p1)+24], r14, r15, r8, r9, r11, rcx - - // Final result c2:c6 - add r14, r10 - adc r15, rbp - pop rbx - pop rbp - adc r8, r12 - adc r9, r13 - adc r11, rdi - mov [reg_p2+16], r14 - mov [reg_p2+24], r15 - pop r15 - pop r13 - mov [reg_p2+32], r8 - mov [reg_p2+40], r9 - mov [reg_p2+48], r11 - - pop r12 - pop r14 - ret - - #else - - # error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE" - - #endif - - -//*********************************************************************** -// 434-bit multiprecision addition -// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -//*********************************************************************** -.global fmt(mp_add434_asm) -fmt(mp_add434_asm): - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - add r8, [reg_p2] - adc r9, [reg_p2+8] - adc r10, [reg_p2+16] - adc r11, [reg_p2+24] - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - - mov r8, [reg_p1+32] - mov r9, [reg_p1+40] - mov r10, [reg_p1+48] - adc r8, [reg_p2+32] - adc r9, [reg_p2+40] - adc r10, [reg_p2+48] - mov [reg_p3+32], r8 - mov [reg_p3+40], r9 - mov [reg_p3+48], r10 - ret - - -//*************************************************************************** -// 2x434-bit multiprecision subtraction/addition -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. 
If c < 0, add p434*2^448 -//*************************************************************************** -.global fmt(mp_subadd434x2_asm) -fmt(mp_subadd434x2_asm): - push r12 - push r13 - push r14 - push r15 - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - - mov r8, [reg_p1+40] - mov r9, [reg_p1+48] - mov r10, [reg_p1+56] - mov r11, [reg_p1+64] - mov r12, [reg_p1+72] - sbb r8, [reg_p2+40] - sbb r9, [reg_p2+48] - sbb r10, [reg_p2+56] - sbb r11, [reg_p2+64] - sbb r12, [reg_p2+72] - mov [reg_p3+40], r8 - mov [reg_p3+48], r9 - mov [reg_p3+56], r10 - - mov r13, [reg_p1+80] - mov r14, [reg_p1+88] - mov r15, [reg_p1+96] - mov rcx, [reg_p1+104] - sbb r13, [reg_p2+80] - sbb r14, [reg_p2+88] - sbb r15, [reg_p2+96] - sbb rcx, [reg_p2+104] - sbb rax, 0 - - // Add p434 anded with the mask in rax - mov r8, [rip+fmt(p434)] - mov r9, [rip+fmt(p434)+24] - mov r10, [rip+fmt(p434)+32] - mov rdi, [rip+fmt(p434)+40] - mov rsi, [rip+fmt(p434)+48] - and r8, rax - and r9, rax - and r10, rax - and rdi, rax - and rsi, rax - mov rax, [reg_p3+56] - add rax, r8 - adc r11, r8 - adc r12, r8 - adc r13, r9 - adc r14, r10 - adc r15, rdi - adc rcx, rsi - - mov [reg_p3+56], rax - mov [reg_p3+64], r11 - mov [reg_p3+72], r12 - mov [reg_p3+80], r13 - mov [reg_p3+88], r14 - mov [reg_p3+96], r15 - mov [reg_p3+104], rcx - pop r15 - pop r14 - pop r13 - pop r12 - ret - - -//*********************************************************************** -// Double 2x434-bit multiprecision subtraction -// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2] -//*********************************************************************** -.global fmt(mp_dblsub434x2_asm) -fmt(mp_dblsub434x2_asm): - push r12 - push r13 - push r14 
- - mov r8, [reg_p3] - mov r9, [reg_p3+8] - mov r10, [reg_p3+16] - mov r11, [reg_p3+24] - mov r12, [reg_p3+32] - mov r13, [reg_p3+40] - mov r14, [reg_p3+48] - sub r8, [reg_p1] - sbb r9, [reg_p1+8] - sbb r10, [reg_p1+16] - sbb r11, [reg_p1+24] - sbb r12, [reg_p1+32] - sbb r13, [reg_p1+40] - sbb r14, [reg_p1+48] - setc al - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb r14, [reg_p2+48] - setc cl - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - - mov r8, [reg_p3+56] - mov r9, [reg_p3+64] - mov r10, [reg_p3+72] - mov r11, [reg_p3+80] - mov r12, [reg_p3+88] - mov r13, [reg_p3+96] - mov r14, [reg_p3+104] - bt rax, 0 - sbb r8, [reg_p1+56] - sbb r9, [reg_p1+64] - sbb r10, [reg_p1+72] - sbb r11, [reg_p1+80] - sbb r12, [reg_p1+88] - sbb r13, [reg_p1+96] - sbb r14, [reg_p1+104] - bt rcx, 0 - sbb r8, [reg_p2+56] - sbb r9, [reg_p2+64] - sbb r10, [reg_p2+72] - sbb r11, [reg_p2+80] - sbb r12, [reg_p2+88] - sbb r13, [reg_p2+96] - sbb r14, [reg_p2+104] - mov [reg_p3+56], r8 - mov [reg_p3+64], r9 - mov [reg_p3+72], r10 - mov [reg_p3+80], r11 - mov [reg_p3+88], r12 - mov [reg_p3+96], r13 - mov [reg_p3+104], r14 - - pop r14 - pop r13 - pop r12 - ret \ No newline at end of file +//******************************************************************************************* + +.intel_syntax noprefix + +// Format function and variable names for Mac OS X +#if defined(__APPLE__) + #define fmt(f) _##f +#else + #define fmt(f) f +#endif + +// Registers that are used for parameter passing: +#define reg_p1 rdi +#define reg_p2 rsi +#define reg_p3 rdx + + +.text +//*********************************************************************** +// 434-bit multiprecision addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global 
fmt(mp_add434_asm) +fmt(mp_add434_asm): + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + + mov r8, [reg_p1+32] + mov r9, [reg_p1+40] + mov r10, [reg_p1+48] + adc r8, [reg_p2+32] + adc r9, [reg_p2+40] + adc r10, [reg_p2+48] + mov [reg_p3+32], r8 + mov [reg_p3+40], r9 + mov [reg_p3+48], r10 + ret + + +//*********************************************************************** +// Field addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global fmt(fpadd434_asm) +fmt(fpadd434_asm): + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + + mov rbx, [rip+fmt(p434x2)] + sub r8, rbx + mov rcx, [rip+fmt(p434x2)+8] + sbb r9, rcx + sbb r10, rcx + mov rdi, [rip+fmt(p434x2)+24] + sbb r11, rdi + mov rsi, [rip+fmt(p434x2)+32] + sbb r12, rsi + mov rbp, [rip+fmt(p434x2)+40] + sbb r13, rbp + mov r15, [rip+fmt(p434x2)+48] + sbb r14, r15 + sbb rax, 0 + + and rbx, rax + and rcx, rax + and rdi, rax + and rsi, rax + and rbp, rax + and r15, rax + + add r8, rbx + adc r9, rcx + adc r10, rcx + adc r11, rdi + adc r12, rsi + adc r13, rbp + adc r14, r15 + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Field subtraction 
+// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] +//*********************************************************************** +.global fmt(fpsub434_asm) +fmt(fpsub434_asm): + push r12 + push r13 + push r14 + + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb rax, 0 + + mov rcx, [rip+fmt(p434x2)] + mov rdi, [rip+fmt(p434x2)+8] + mov rsi, [rip+fmt(p434x2)+24] + and rcx, rax + and rdi, rax + and rsi, rax + add r8, rcx + adc r9, rdi + adc r10, rdi + adc r11, rsi + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + setc cl + + mov r8, [rip+fmt(p434x2)+32] + mov rdi, [rip+fmt(p434x2)+40] + mov rsi, [rip+fmt(p434x2)+48] + and r8, rax + and rdi, rax + and rsi, rax + bt rcx, 0 + adc r12, r8 + adc r13, rdi + adc r14, rsi + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + + pop r14 + pop r13 + pop r12 + ret + + +///////////////////////////////////////////////////////////////// MACRO +.macro SUB434_PX P0 + push r12 + push r13 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov rcx, [reg_p1+48] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb rcx, [reg_p2+48] + + mov rax, [rip+\P0] + mov rdi, [rip+\P0+8] + mov rsi, [rip+\P0+24] + add r8, rax + mov rax, [rip+\P0+32] + adc r9, rdi + adc r10, rdi + adc r11, rsi + mov rdi, [rip+\P0+40] + mov rsi, [rip+\P0+48] + adc r12, rax + adc r13, rdi + adc rcx, rsi + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], rcx + + pop r13 + pop r12 
+.endm + + +//*********************************************************************** +// Multiprecision subtraction with correction with 2*p434 +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p434 +//*********************************************************************** +.global fmt(mp_sub434_p2_asm) +fmt(mp_sub434_p2_asm): + + SUB434_PX fmt(p434x2) + ret + + +#ifdef _MULX_ +#ifdef _ADX_ + +///////////////////////////////////////////////////////////////// MACROS +// z = a x bi + z +// Inputs: base memory pointer M1 (a), +// bi pre-stored in rdx, +// accumulator z in [Z0:Z4] or [Z0:Z7] +// Output: [Z0:Z4] or [Z0:Z7] +// Temps: regs T0:T1 +///////////////////////////////////////////////////////////////// +.macro MULADD64x448 M1, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1, C + mulx \T0, \T1, \M1 // A0*B0 + xor \C, \C + adox \Z0, \T1 + adox \Z1, \T0 + mulx \T0, \T1, 8\M1 // A0*B1 + adcx \Z1, \T1 + adox \Z2, \T0 + mulx \T0, \T1, 16\M1 // A0*B2 + adcx \Z2, \T1 + adox \Z3, \T0 + mulx \T0, \T1, 24\M1 // A0*B3 + adcx \Z3, \T1 + adox \Z4, \T0 + mulx \T0, \T1, 32\M1 // A0*B4 + adcx \Z4, \T1 + adox \Z5, \T0 + mulx \T0, \T1, 40\M1 // A0*B5 + adcx \Z5, \T1 + adox \Z6, \T0 + mulx \T0, \T1, 48\M1 // A0*B6 + adcx \Z6, \T1 + adox \Z7, \T0 + adc \Z7, 0 +.endm + + +.macro MULADD64x256 M1, Z0, Z1, Z2, Z3, Z4, T0, T1 + mulx \T0, \T1, \M1 // A0*B0 + xor rax, rax + adox \Z0, \T1 + adox \Z1, \T0 + mulx \T0, \T1, 8\M1 // A0*B1 + adcx \Z1, \T1 + adox \Z2, \T0 + mulx \T0, \T1, 16\M1 // A0*B2 + adcx \Z2, \T1 + adox \Z3, \T0 + mulx \T0, \T1, 24\M1 // A0*B3 + adcx \Z3, \T1 + adox \Z4, \T0 + adcx \Z4, rax +.endm + + +///////////////////////////////////////////////////////////////// MACRO +// z = a x b + c x d (mod p) +// Inputs: base memory pointers M0 (a,c), M1 (b,d) +// bi pre-stored in rdx, +// accumulator z in [Z0:Z7], pre-stores a0 x b +// Output: [Z0:Z7] +// Temps: regs T0:T1 +///////////////////////////////////////////////////////////////// +.macro FPDBLMUL448x448 M00, M01, M10, M11, 
Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1 + mov rdx, \M11 + MULADD64x448 \M01, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1, rax + // [Z1:Z7] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z0 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1 + + // [Z1:Z7, \Z0] <- z = a01 x a1 + z + mov rdx, 8\M10 + MULADD64x448 \M00, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \Z0 + mov rdx, 8\M11 + MULADD64x448 \M01, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, rax + // [Z2:Z7, Z0] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z1 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1 + + // [Z2:Z7, Z0:Z1] <- z = a02 x a1 + z + mov rdx, 16\M10 + MULADD64x448 \M00, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, \Z1 + mov rdx, 16\M11 + MULADD64x448 \M01, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, rax + // [Z3:Z7, Z0:Z1] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z2 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1 + + // [Z3:Z7, Z0:Z2] <- z = a03 x a1 + z + mov rdx, 24\M10 + MULADD64x448 \M00, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, \Z2 + mov rdx, 24\M11 + MULADD64x448 \M01, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, rax + // [Z4:Z7, Z0:Z2] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z3 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1 + + // [Z4:Z7, Z0:Z3] <- z = a04 x a1 + z + mov rdx, 32\M10 + MULADD64x448 \M00, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \Z3 + mov rdx, 32\M11 + MULADD64x448 \M01, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, rax + // [Z5:Z7, Z0:Z3] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z4 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1 + + // [Z5:Z7, Z0:Z4] <- z = a05 x a1 + z + mov rdx, 40\M10 + MULADD64x448 \M00, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \Z4 + mov rdx, 40\M11 + MULADD64x448 \M01, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, 
rax + // [Z6:Z7, Z0:Z4] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z5 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1 + + // [Z6:Z7, Z0:Z5] <- z = a06 x a1 + z + mov rdx, 48\M10 + MULADD64x448 \M00, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \Z5 + mov rdx, 48\M11 + MULADD64x448 \M01, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, rax + pop \T1 + mov [rcx], \Z7 + // [Z7, Z0:Z5] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z6 // rdx <- z0 + //MULADD64x256 [rip+fmt(p434p1)+24], \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1 + mulx \Z6, \Z7, [rip+p434p1+24] + pop \T0 + adox \Z1, \Z7 + adox \Z2, \Z6 + mov [rcx+8], \Z0 + mulx \Z6, \Z7, [rip+p434p1+32] + mov [rcx+16], \Z1 + adcx \Z2, \Z7 + adox \Z3, \Z6 + mov [rcx+24], \Z2 + mulx \Z2, \Z1, [rip+p434p1+40] + pop \Z7 + adcx \Z3, \Z1 + adox \Z4, \Z2 + mov [rcx+32], \Z3 + mulx \Z2, \Z1, [rip+p434p1+48] + pop \Z6 + adcx \Z4, \Z1 + adox \Z5, \Z2 + adc \Z5, 0 +.endm + + +//*********************************************************************** +// Multiplication in GF(p^2), non-complex part +// Operation: c [reg_p3] = a0 x b0 - a1 x b1 +// Inputs: a = [a1, a0] stored in [reg_p1] +// b = [b1, b0] stored in [reg_p2] +// Output: c stored in [reg_p3] +//*********************************************************************** +.global fmt(fp2mul434_c0_asm) +fmt(fp2mul434_c0_asm): + push r12 + mov rcx, reg_p3 + + // [rcx0:rcx48] <- 8*p - b1 + mov r8, [rip+fmt(p434x8)] + mov r9, [rip+fmt(p434x8)+8] + mov r11, [rip+fmt(p434x8)+24] + mov r12, [rip+fmt(p434x8)+32] + mov rax, [reg_p2+56] + mov rdx, [reg_p2+64] + mov r10, r9 + sub r8, rax + push r13 + sbb r9, rdx + mov rax, [reg_p2+72] + mov rdx, [reg_p2+80] + sbb r10, rax + push r14 + sbb r11, rdx + mov rax, [reg_p2+88] + mov rdx, [reg_p2+96] + mov r13, [rip+fmt(p434x8)+40] + mov r14, [rip+fmt(p434x8)+48] + mov [rcx], r8 + sbb r12, rax + push r15 + sbb r13, rdx + mov rax, [reg_p2+104] + mov [rcx+8], r9 + sbb r14, rax + mov [rcx+16], r10 + + // [r8:r15] <- z = a0 
x b00 - a1 x b10 + mov rdx, [reg_p2] + mulx r9, r8, [reg_p1] + mov [rcx+24], r11 + xor rax, rax + mulx r10, r11, [reg_p1+8] + mov [rcx+32], r12 + adox r9, r11 + mulx r11, r12, [reg_p1+16] + mov [rcx+40], r13 + adox r10, r12 + mulx r12, r13, [reg_p1+24] + mov [rcx+48], r14 + adox r11, r13 + mulx r13, r14, [reg_p1+32] + push rbx + adox r12, r14 + mulx r14, r15, [reg_p1+40] + push rbp + adox r13, r15 + mulx r15, rbx, [reg_p1+48] + adox r14, rbx + adox r15, rax + + FPDBLMUL448x448 [reg_p1], [reg_p1+56], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp + + mov [rcx+40], r12 + mov [rcx+48], r13 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Multiplication in GF(p^2), complex part +// Operation: c [reg_p3] = a0 x b1 + a1 x b0 +// Inputs: a = [a1, a0] stored in [reg_p1] +// b = [b1, b0] stored in [reg_p2] +// Output: c stored in [reg_p3] +//*********************************************************************** +.global fmt(fp2mul434_c1_asm) +fmt(fp2mul434_c1_asm): + mov rcx, reg_p3 + + // [r8:r15] <- z = a0 x b10 + a1 x b00 + mov rdx, [reg_p2] + mulx r9, r8, [reg_p1+56] // a0 x b10 + xor rax, rax + push r12 + mulx r10, r11, [reg_p1+64] + push r13 + adox r9, r11 + mulx r11, r12, [reg_p1+72] + push r14 + adox r10, r12 + mulx r12, r13, [reg_p1+80] + push r15 + adox r11, r13 + mulx r13, r14, [reg_p1+88] + push rbx + adox r12, r14 + mulx r14, r15, [reg_p1+96] + push rbp + adox r13, r15 + mulx r15, rbx, [reg_p1+104] + adox r14, rbx + adox r15, rax + + FPDBLMUL448x448 [reg_p1+56], [reg_p1], [reg_p2], [reg_p2+56], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp + + mov [rcx+40], r12 + mov [rcx+48], r13 + pop r13 + pop r12 + ret + +#else + +# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE USE_ADX=TRUE" + +#endif + +#else + +# error "CONFIGURATION NOT SUPPORTED. 
TRY USE_MULX=TRUE USE_ADX=TRUE" + +#endif + + +///////////////////////////////////////////////////////////////// MACRO +// z = a x b (mod p) +// Inputs: base memory pointers M0 (a), M1 (b) +// bi pre-stored in rdx, +// accumulator z in [Z0:Z7], pre-stores a0 x b +// Output: [Z0:Z7] +// Temps: regs T0:T1 +///////////////////////////////////////////////////////////////// +.macro FPMUL448x448 M0, M1, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1 + // [Z1:Z7] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z0 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1 + + // [Z1:Z7, \Z0] <- z = a01 x a1 + z + mov rdx, 8\M0 + MULADD64x448 \M1, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \Z0 + // [Z2:Z7, Z0] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z1 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1 + + // [Z2:Z7, Z0:Z1] <- z = a02 x a1 + z + mov rdx, 16\M0 + MULADD64x448 \M1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, \Z1 + // [Z3:Z7, Z0:Z1] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z2 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1 + + // [Z3:Z7, Z0:Z2] <- z = a03 x a1 + z + mov rdx, 24\M0 + MULADD64x448 \M1, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, \Z2 + // [Z4:Z7, Z0:Z2] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z3 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1 + + // [Z4:Z7, Z0:Z3] <- z = a04 x a1 + z + mov rdx, 32\M0 + MULADD64x448 \M1, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \Z3 + // [Z5:Z7, Z0:Z3] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z4 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1 + + // [Z5:Z7, Z0:Z4] <- z = a05 x a1 + z + mov rdx, 40\M0 + MULADD64x448 \M1, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \Z4 + // [Z6:Z7, Z0:Z4] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z5 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1 + + // 
[Z6:Z7, Z0:Z5] <- z = a06 x a1 + z + mov rdx, 48\M0 + MULADD64x448 \M1, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \Z5 + // [Z7, Z0:Z5] <- z = (z0 x p434p1 + z)/2^64 + mov rdx, \Z6 // rdx <- z0 + MULADD64x256 [rip+fmt(p434p1)+24], \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1 +.endm + + +//*********************************************************************** +// Squaring in GF(p^2), non-complex part +// Operation: c [reg_p2] = (a0+a1) x (a0-a1) +// Inputs: a = [a1, a0] stored in [reg_p1] +// Output: c stored in [reg_p2] +//*********************************************************************** +.global fmt(fp2sqr434_c0_asm) +fmt(fp2sqr434_c0_asm): + push r12 + + // a0 + a1 + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + add r8, [reg_p1+56] + push r13 + adc r9, [reg_p1+64] + adc r10, [reg_p1+72] + push r14 + adc r11, [reg_p1+80] + adc r12, [reg_p1+88] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + adc r13, [reg_p1+96] + adc r14, [reg_p1+104] + mov [reg_p2], r8 + mov [reg_p2+8], r9 + mov [reg_p2+16], r10 + mov [reg_p2+24], r11 + mov [reg_p2+32], r12 + mov [reg_p2+40], r13 + mov [reg_p2+48], r14 + + // a0 - a1 + 4xp434 + mov rax, [reg_p1] + mov r10, [reg_p1+8] + mov r12, [reg_p1+16] + mov r13, [reg_p1+24] + mov r14, [reg_p1+32] + sub rax, [reg_p1+56] + sbb r10, [reg_p1+64] + sbb r12, [reg_p1+72] + push r15 + sbb r13, [reg_p1+80] + sbb r14, [reg_p1+88] + mov r15, [reg_p1+40] + mov rcx, [reg_p1+48] + sbb r15, [reg_p1+96] + sbb rcx, [reg_p1+104] + add rax, [rip+fmt(p434x4)] + mov rdx, [rip+fmt(p434x4)+8] + adc r10, rdx + adc r12, rdx + adc r13, [rip+fmt(p434x4)+24] + adc r14, [rip+fmt(p434x4)+32] + adc r15, [rip+fmt(p434x4)+40] + adc rcx, [rip+fmt(p434x4)+48] + mov [reg_p2+56], rax + + // [r8:r15] <- z = a00 x a1 + mov rdx, r8 + mulx r9, r8, rax + mov [reg_p2+64], r10 + xor rax, rax + push rbx + mulx r10, r11, r10 + mov [reg_p2+72], r12 + adox r9, r11 + mulx r11, r12, r12 + mov [reg_p2+80], r13 + adox r10, r12 
+ mulx r12, r13, r13 + mov [reg_p2+88], r14 + adox r11, r13 + mulx r13, r14, r14 + mov [reg_p2+96], r15 + adox r12, r14 + mulx r14, r15, r15 + mov [reg_p2+104], rcx + adox r13, r15 + mulx r15, rbx, rcx + adox r14, rbx + adox r15, rax + + FPMUL448x448 [reg_p2], [reg_p2+56], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rcx + + mov [reg_p2], r15 + mov [reg_p2+8], r8 + mov [reg_p2+16], r9 + mov [reg_p2+24], r10 + mov [reg_p2+32], r11 + mov [reg_p2+40], r12 + mov [reg_p2+48], r13 + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Squaring in GF(p^2), complex part +// Operation: c [reg_p2] = 2a0 x a1 +// Inputs: a = [a1, a0] stored in [reg_p1] +// Output: c stored in [reg_p2] +//*********************************************************************** +.global fmt(fp2sqr434_c1_asm) +fmt(fp2sqr434_c1_asm): + push r12 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + add r8, r8 + push r13 + adc r9, r9 + adc r10, r10 + push r14 + adc r11, r11 + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + adc r12, r12 + push r15 + adc r13, r13 + push rbx + adc r14, r14 + sub rsp, 56 + mov [rsp+8], r9 + mov [rsp+16], r10 + + // [r8:r15] <- z = a00 x a1 + mov rdx, r8 + mulx r9, r8, [reg_p1+56] + mov [rsp+24], r11 + xor rax, rax + mulx r10, r11, [reg_p1+64] + mov [rsp+32], r12 + adox r9, r11 + mulx r11, r12, [reg_p1+72] + mov [rsp+40], r13 + adox r10, r12 + mulx r12, r13, [reg_p1+80] + mov [rsp+48], r14 + adox r11, r13 + mulx r13, r14, [reg_p1+88] + adox r12, r14 + mulx r14, r15, [reg_p1+96] + adox r13, r15 + mulx r15, rbx, [reg_p1+104] + adox r14, rbx + adox r15, rax + + FPMUL448x448 [rsp], [reg_p1+56], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rcx + + add rsp, 56 + mov [reg_p2], r15 + mov [reg_p2+8], r8 + mov [reg_p2+16], r9 + mov [reg_p2+24], r10 + mov [reg_p2+32], r11 + mov [reg_p2+40], r12 + mov [reg_p2+48], r13 + pop rbx + pop r15 + pop r14 
+ pop r13 + pop r12 + ret + + +//*********************************************************************** +// Field multiplication in GF(p) +// Operation: c = a x b mod p +// Inputs: a stored in [reg_p1], b stored in [reg_p2] +// Output: c stored in [reg_p3] +//*********************************************************************** +.global fmt(fpmul434_asm) +fmt(fpmul434_asm): + mov rcx, reg_p3 + + // [r8:r15] <- z = a x b0 + mov rdx, [reg_p2] + mulx r9, r8, [reg_p1] + push r12 + xor rax, rax + mulx r10, r11, [reg_p1+8] + push r13 + adox r9, r11 + mulx r11, r12, [reg_p1+16] + push r14 + adox r10, r12 + mulx r12, r13, [reg_p1+24] + push r15 + adox r11, r13 + mulx r13, r14, [reg_p1+32] + push rbx + adox r12, r14 + mulx r14, r15, [reg_p1+40] + push rbp + adox r13, r15 + mulx r15, rbx, [reg_p1+48] + adox r14, rbx + adox r15, rax + + FPMUL448x448 [reg_p2], [reg_p1], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp + + mov [rcx], r15 + mov [rcx+8], r8 + mov [rcx+16], r9 + mov [rcx+24], r10 + mov [rcx+32], r11 + mov [rcx+40], r12 + mov [rcx+48], r13 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret \ No newline at end of file diff --git a/src/P434/ARM64/fp_arm64.c b/src/P434/ARM64/fp_arm64.c index 48cf3de..de65295 100644 --- a/src/P434/ARM64/fp_arm64.c +++ b/src/P434/ARM64/fp_arm64.c @@ -1,10 +1,15 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P434 *********************************************************************************************/ #include "../P434_internal.h" +#include "../../internal.h" // Global constants extern const uint64_t p434[NWORDS_FIELD]; @@ -13,21 +18,21 @@ extern const uint64_t p434x2[NWORDS_FIELD]; extern const uint64_t 
p434x4[NWORDS_FIELD]; -__inline void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c) +inline void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. mp_sub434_p2_asm(a, b, c); } -__inline void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c) +inline void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. mp_sub434_p4_asm(a, b, c); } -__inline void fpadd434(const digit_t* a, const digit_t* b, digit_t* c) +inline void fpadd434(const digit_t* a, const digit_t* b, digit_t* c) { // Modular addition, c = a+b mod p434. // Inputs: a, b in [0, 2*p434-1] // Output: c in [0, 2*p434-1] @@ -36,7 +41,7 @@ __inline void fpadd434(const digit_t* a, const digit_t* b, digit_t* c) } -__inline void fpsub434(const digit_t* a, const digit_t* b, digit_t* c) +inline void fpsub434(const digit_t* a, const digit_t* b, digit_t* c) { // Modular subtraction, c = a-b mod p434. // Inputs: a, b in [0, 2*p434-1] // Output: c in [0, 2*p434-1] @@ -45,7 +50,7 @@ __inline void fpsub434(const digit_t* a, const digit_t* b, digit_t* c) } -__inline void fpneg434(digit_t* a) +inline void fpneg434(digit_t* a) { // Modular negation, a = -a mod p434. 
// Input/output: a in [0, 2*p434-1] unsigned int i, borrow = 0; diff --git a/src/P434/ARM64/fp_arm64_asm.S b/src/P434/ARM64/fp_arm64_asm.S index ad4ddf3..c85480e 100644 --- a/src/P434/ARM64/fp_arm64_asm.S +++ b/src/P434/ARM64/fp_arm64_asm.S @@ -1,5 +1,9 @@ //******************************************************************************************* // SIDH: an efficient supersingular isogeny cryptography library +// Copyright (c) Microsoft Corporation +// +// Website: https://github.com/microsoft/PQCrypto-SIDH +// Released under MIT license // // Abstract: field arithmetic in 64-bit ARMv8 assembly for P434 on Linux //******************************************************************************************* diff --git a/src/P434/P434.c b/src/P434/P434.c index 800364e..b14863f 100644 --- a/src/P434/P434.c +++ b/src/P434/P434.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: supersingular isogeny parameters and generation of functions for P434 *********************************************************************************************/ @@ -27,12 +31,10 @@ const uint64_t p434x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF 0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688 }; const uint64_t p434x4[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xF705D9EB8BFFFFFF, 0xEF1971E0C562BA8F, 0xB3F17F5A07148159, 0x0008D07C9C5DCD11 }; +const uint64_t p434x8[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xEE0BB3D717FFFFFF, + 0xDE32E3C18AC5751F, 0x67E2FEB40E2902B3, 0x0011A0F938BB9A23 }; const uint64_t p434p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFDC1767AE3000000, - 0x7BC65C783158AEA3, 
0x6CFC5FD681C52056, 0x0002341F27177344 }; -const uint64_t p434x16p[2*NWORDS64_FIELD] = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x47D130A3A0000000, - 0x873470F9D4EA2B80, 0x6074052FC75BF530, 0x54497C1B1D119772, 0xC55F373D2CDCA412, - 0x732CA2221C664B96, 0x6445AB96AF6359A5, 0x221708AB42ABE1B4, 0xAE3D3D0063244F01, - 0x18B920F2ECF68816, 0x0000004DB194809D }; + 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344 }; // Order of Alice's subgroup const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000001000000 }; // Order of Bob's subgroup @@ -90,6 +92,7 @@ const unsigned int strat_Bob[MAX_Bob-1] = { #define fpneg fpneg434 #define fpdiv2 fpdiv2_434 #define fpcorrection fpcorrection434 +#define fpmul fpmul434 #define fpmul_mont fpmul434_mont #define fpsqr_mont fpsqr434_mont #define fpinv_mont fpinv434_mont @@ -107,6 +110,10 @@ const unsigned int strat_Bob[MAX_Bob-1] = { #define fp2correction fp2correction434 #define fp2mul_mont fp2mul434_mont #define fp2sqr_mont fp2sqr434_mont +#define fp2mul_c0_mont fp2mul434_c0_mont +#define fp2mul_c1_mont fp2mul434_c1_mont +#define fp2sqr_c0_mont fp2sqr434_c0_mont +#define fp2sqr_c1_mont fp2sqr434_c1_mont #define fp2inv_mont fp2inv434_mont #define fp2inv_mont_bingcd fp2inv434_mont_bingcd #define fpequal_non_constant_time fpequal434_non_constant_time diff --git a/src/P434/P434_api.h b/src/P434/P434_api.h index e274237..679b2c5 100644 --- a/src/P434/P434_api.h +++ b/src/P434/P434_api.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: API header file for P434 *********************************************************************************************/ diff --git a/src/P434/P434_compressed.c 
b/src/P434/P434_compressed.c index 3c84740..7f07d38 100644 --- a/src/P434/P434_compressed.c +++ b/src/P434/P434_compressed.c @@ -1,5 +1,9 @@ /******************************************************************************************** * Supersingular Isogeny Key Encapsulation Library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: supersingular isogeny parameters and generation of functions for P434_compressed *********************************************************************************************/ @@ -28,12 +32,10 @@ const uint64_t p434x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF 0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688 }; const uint64_t p434x4[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xF705D9EB8BFFFFFF, 0xEF1971E0C562BA8F, 0xB3F17F5A07148159, 0x0008D07C9C5DCD11 }; +const uint64_t p434x8[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xEE0BB3D717FFFFFF, + 0xDE32E3C18AC5751F, 0x67E2FEB40E2902B3, 0x0011A0F938BB9A23 }; const uint64_t p434p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFDC1767AE3000000, 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344 }; -const uint64_t p434x16p[2*NWORDS64_FIELD] = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x47D130A3A0000000, - 0x873470F9D4EA2B80, 0x6074052FC75BF530, 0x54497C1B1D119772, 0xC55F373D2CDCA412, - 0x732CA2221C664B96, 0x6445AB96AF6359A5, 0x221708AB42ABE1B4, 0xAE3D3D0063244F01, - 0x18B920F2ECF68816, 0x0000004DB194809D }; // Order of Alice's subgroup const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000001000000 }; // Order of Bob's subgroup @@ -346,6 +348,7 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] = #define fpneg fpneg434 #define fpdiv2 fpdiv2_434 #define fpcorrection fpcorrection434 
+#define fpmul fpmul434 #define fpmul_mont fpmul434_mont #define fpsqr_mont fpsqr434_mont #define fpinv_mont fpinv434_mont @@ -363,6 +366,10 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] = #define fp2correction fp2correction434 #define fp2mul_mont fp2mul434_mont #define fp2sqr_mont fp2sqr434_mont +#define fp2mul_c0_mont fp2mul434_c0_mont +#define fp2mul_c1_mont fp2mul434_c1_mont +#define fp2sqr_c0_mont fp2sqr434_c0_mont +#define fp2sqr_c1_mont fp2sqr434_c1_mont #define fp2inv_mont fp2inv434_mont #define fp2inv_mont_bingcd fp2inv434_mont_bingcd #define fpequal_non_constant_time fpequal434_non_constant_time diff --git a/src/P434/P434_compressed_api.h b/src/P434/P434_compressed_api.h index 06a2d6d..bb1022c 100644 --- a/src/P434/P434_compressed_api.h +++ b/src/P434/P434_compressed_api.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: API header file for P434 using compression *********************************************************************************************/ diff --git a/src/P434/P434_compressed_dlog_tables.c b/src/P434/P434_compressed_dlog_tables.c index 2356ebf..4750acc 100644 --- a/src/P434/P434_compressed_dlog_tables.c +++ b/src/P434/P434_compressed_dlog_tables.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: precomputed tables for Pohlig-Hellman when using compression *********************************************************************************************/ diff --git 
a/src/P434/P434_compressed_pair_tables.c b/src/P434/P434_compressed_pair_tables.c index ee8e334..64b6086 100644 --- a/src/P434/P434_compressed_pair_tables.c +++ b/src/P434/P434_compressed_pair_tables.c @@ -1,5 +1,9 @@ /************************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: precomputed tables for pairing computation on E0: y^2 = x^3 + x when using compression ***************************************************************************************************/ diff --git a/src/P434/P434_internal.h b/src/P434/P434_internal.h index 52c5705..4c95cb2 100644 --- a/src/P434/P434_internal.h +++ b/src/P434/P434_internal.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: internal header file for P434 *********************************************************************************************/ @@ -168,6 +172,8 @@ void rdc434_asm(digit_t* ma, digit_t* mc); // Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^768 void fpmul434_mont(const digit_t* a, const digit_t* b, digit_t* c); +void fpmul434(const digit_t* a, const digit_t* b, digit_t* c); +void fpmul434_asm(const digit_t* a, const digit_t* b, digit_t* c); void mul434_asm(const digit_t* a, const digit_t* b, digit_t* c); // Field squaring using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^768 @@ -207,9 +213,17 @@ void fp2correction434(f2elm_t a); // GF(p434^2) squaring using Montgomery arithmetic, c = a^2 in GF(p434^2) void fp2sqr434_mont(const f2elm_t a, f2elm_t c); +void 
fp2sqr434_c0_mont(const digit_t* a, digit_t* c); +void fp2sqr434_c0_asm(const digit_t* a, digit_t* c); +void fp2sqr434_c1_mont(const digit_t* a, digit_t* c); +void fp2sqr434_c1_asm(const digit_t* a, digit_t* c); // GF(p434^2) multiplication using Montgomery arithmetic, c = a*b in GF(p434^2) void fp2mul434_mont(const f2elm_t a, const f2elm_t b, f2elm_t c); +void fp2mul434_c0_mont(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul434_c0_asm(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul434_c1_mont(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul434_c1_asm(const digit_t* a, const digit_t* b, digit_t* c); // GF(p434^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) void fp2inv434_mont(f2elm_t a); diff --git a/src/P434/generic/fp_generic.c b/src/P434/generic/fp_generic.c index 83856b9..ae8663c 100755 --- a/src/P434/generic/fp_generic.c +++ b/src/P434/generic/fp_generic.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: portable modular arithmetic for P434 *********************************************************************************************/ diff --git a/src/P503/AMD64/fp_x64.c b/src/P503/AMD64/fp_x64.c index ca3c6f2..6553325 100644 --- a/src/P503/AMD64/fp_x64.c +++ b/src/P503/AMD64/fp_x64.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: modular arithmetic optimized for x64 platforms for P503 
*********************************************************************************************/ @@ -17,7 +21,7 @@ extern const uint64_t p503x4[NWORDS_FIELD]; inline void mp_sub503_p2(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. -#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610) +#if (OS_TARGET == OS_WIN) unsigned int i, borrow = 0; for (i = 0; i < NWORDS_FIELD; i++) { @@ -39,7 +43,7 @@ inline void mp_sub503_p2(const digit_t* a, const digit_t* b, digit_t* c) inline void mp_sub503_p4(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. -#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610) +#if (OS_TARGET == OS_WIN) unsigned int i, borrow = 0; for (i = 0; i < NWORDS_FIELD; i++) { @@ -50,11 +54,6 @@ inline void mp_sub503_p4(const digit_t* a, const digit_t* b, digit_t* c) for (i = 0; i < NWORDS_FIELD; i++) { ADDC(borrow, c[i], ((digit_t*)p503x4)[i], borrow, c[i]); } - -#elif (OS_TARGET == OS_NIX) - - mp_sub503_p4_asm(a, b, c); - #endif } @@ -161,13 +160,43 @@ void fpcorrection503(digit_t* a) } } +#if (OS_TARGET == OS_NIX) + +void fp2mul503_c0_mont(const digit_t* a, const digit_t* b, digit_t* c) +{ + fp2mul503_c0_asm(a, b, c); +} + + +void fp2mul503_c1_mont(const digit_t* a, const digit_t* b, digit_t* c) +{ + fp2mul503_c1_asm(a, b, c); +} + + +void fp2sqr503_c0_mont(const digit_t* a, digit_t* c) +{ + fp2sqr503_c0_asm(a, c); +} + + +void fp2sqr503_c1_mont(const digit_t* a, digit_t* c) +{ + fp2sqr503_c1_asm(a, c); +} + + +void fpmul503(const digit_t* a, const digit_t* b, digit_t* c) +{ + fpmul503_asm(a, b, c); +} + +#else void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) { // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = 
nwords. UNREFERENCED_PARAMETER(nwords); - -#if (OS_TARGET == OS_WIN) digit_t t = 0; uint128_t uv = {0}; unsigned int carry = 0; @@ -370,12 +399,6 @@ void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int n MULADD128(a[7], b[7], uv, carry, uv); c[14] = uv[0]; c[15] = uv[1]; - -#elif (OS_TARGET == OS_NIX) - - mul503_asm(a, b, c); - -#endif } @@ -384,8 +407,6 @@ void rdc_mont(digit_t* ma, digit_t* mc) // mc = ma*R^-1 mod p503x2, where R = 2^512. // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1]. // ma is assumed to be in Montgomery representation. - -#if (OS_TARGET == OS_WIN) unsigned int carry; digit_t t = 0; uint128_t uv = {0}; @@ -559,11 +580,7 @@ void rdc_mont(digit_t* ma, digit_t* mc) t += carry; ADDC(0, uv[0], ma[14], carry, mc[6]); ADDC(carry, uv[1], 0, carry, uv[1]); - ADDC(0, uv[1], ma[15], carry, mc[7]); - -#elif (OS_TARGET == OS_NIX) - - rdc503_asm(ma, mc); + ADDC(0, uv[1], ma[15], carry, mc[7]); +} -#endif -} \ No newline at end of file +#endif \ No newline at end of file diff --git a/src/P503/AMD64/fp_x64_asm.S b/src/P503/AMD64/fp_x64_asm.S index 8ebce4f..2843464 100644 --- a/src/P503/AMD64/fp_x64_asm.S +++ b/src/P503/AMD64/fp_x64_asm.S @@ -1,1820 +1,1687 @@ //******************************************************************************************* // SIDH: an efficient supersingular isogeny cryptography library +// Copyright (c) Microsoft Corporation +// +// Website: https://github.com/microsoft/PQCrypto-SIDH +// Released under MIT license // // Abstract: field arithmetic in x64 assembly for P503 on Linux -//******************************************************************************************* - -.intel_syntax noprefix - -// Format function and variable names for Mac OS X -#if defined(__APPLE__) - #define fmt(f) _##f -#else - #define fmt(f) f -#endif - -// Registers that are used for parameter passing: -#define reg_p1 rdi -#define reg_p2 rsi -#define reg_p3 rdx - -// Define addition instructions -#ifdef 
_MULX_ -#ifdef _ADX_ - -#define ADD1 adox -#define ADC1 adox -#define ADD2 adcx -#define ADC2 adcx - -#else - -#define ADD1 add -#define ADC1 adc -#define ADD2 add -#define ADC2 adc - -#endif -#endif - - -.text -//*********************************************************************** -// Field addition -// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -//*********************************************************************** -.global fmt(fpadd503_asm) -fmt(fpadd503_asm): - push r12 - push r13 - push r14 - push r15 - - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - mov r15, [reg_p1+56] - add r8, [reg_p2] - adc r9, [reg_p2+8] - adc r10, [reg_p2+16] - adc r11, [reg_p2+24] - adc r12, [reg_p2+32] - adc r13, [reg_p2+40] - adc r14, [reg_p2+48] - adc r15, [reg_p2+56] - - mov rcx, [rip+fmt(p503x2)] - sub r8, rcx - mov rcx, [rip+fmt(p503x2)+8] - sbb r9, rcx - sbb r10, rcx - mov rcx, [rip+fmt(p503x2)+24] - sbb r11, rcx - mov rcx, [rip+fmt(p503x2)+32] - sbb r12, rcx - mov rcx, [rip+fmt(p503x2)+40] - sbb r13, rcx - mov rcx, [rip+fmt(p503x2)+48] - sbb r14, rcx - mov rcx, [rip+fmt(p503x2)+56] - sbb r15, rcx - sbb rax, 0 - - mov rdi, [rip+fmt(p503x2)] - and rdi, rax - mov rsi, [rip+fmt(p503x2)+8] - and rsi, rax - mov rcx, [rip+fmt(p503x2)+24] - and rcx, rax - - add r8, rdi - adc r9, rsi - adc r10, rsi - adc r11, rcx - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - setc cl - - mov r8, [rip+fmt(p503x2)+32] - and r8, rax - mov r9, [rip+fmt(p503x2)+40] - and r9, rax - mov r10, [rip+fmt(p503x2)+48] - and r10, rax - mov r11, [rip+fmt(p503x2)+56] - and r11, rax - - bt rcx, 0 - adc r12, r8 - adc r13, r9 - adc r14, r10 - adc r15, r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - mov [reg_p3+56], r15 - - pop r15 - pop r14 - pop r13 - pop r12 - ret 
+//******************************************************************************************* + +.intel_syntax noprefix + +// Format function and variable names for Mac OS X +#if defined(__APPLE__) + #define fmt(f) _##f +#else + #define fmt(f) f +#endif + +// Registers that are used for parameter passing: +#define reg_p1 rdi +#define reg_p2 rsi +#define reg_p3 rdx + + +.text +//*********************************************************************** +// 503-bit multiprecision addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global fmt(mp_add503_asm) +fmt(mp_add503_asm): + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + + mov r8, [reg_p1+32] + mov r9, [reg_p1+40] + mov r10, [reg_p1+48] + mov r11, [reg_p1+56] + adc r8, [reg_p2+32] + adc r9, [reg_p2+40] + adc r10, [reg_p2+48] + adc r11, [reg_p2+56] + mov [reg_p3+32], r8 + mov [reg_p3+40], r9 + mov [reg_p3+48], r10 + mov [reg_p3+56], r11 + ret + + +//*********************************************************************** +// Field addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global fmt(fpadd503_asm) +fmt(fpadd503_asm): + push r12 + push r13 + push r14 + push r15 + + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + adc r15, [reg_p2+56] + + mov rcx, [rip+fmt(p503x2)] + sub r8, rcx + mov rcx, [rip+fmt(p503x2)+8] + sbb r9, rcx + sbb r10, rcx + 
mov rcx, [rip+fmt(p503x2)+24] + sbb r11, rcx + mov rcx, [rip+fmt(p503x2)+32] + sbb r12, rcx + mov rcx, [rip+fmt(p503x2)+40] + sbb r13, rcx + mov rcx, [rip+fmt(p503x2)+48] + sbb r14, rcx + mov rcx, [rip+fmt(p503x2)+56] + sbb r15, rcx + sbb rax, 0 + + mov rdi, [rip+fmt(p503x2)] + and rdi, rax + mov rsi, [rip+fmt(p503x2)+8] + and rsi, rax + mov rcx, [rip+fmt(p503x2)+24] + and rcx, rax + + add r8, rdi + adc r9, rsi + adc r10, rsi + adc r11, rcx + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + setc cl + + mov r8, [rip+fmt(p503x2)+32] + and r8, rax + mov r9, [rip+fmt(p503x2)+40] + and r9, rax + mov r10, [rip+fmt(p503x2)+48] + and r10, rax + mov r11, [rip+fmt(p503x2)+56] + and r11, rax + + bt rcx, 0 + adc r12, r8 + adc r13, r9 + adc r14, r10 + adc r15, r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Field subtraction +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] +//*********************************************************************** +.global fmt(fpsub503_asm) +fmt(fpsub503_asm): + push r12 + push r13 + push r14 + push r15 + + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb r15, [reg_p2+56] + sbb rax, 0 + + mov rdi, [rip+fmt(p503x2)] + and rdi, rax + mov rsi, [rip+fmt(p503x2)+8] + and rsi, rax + mov rcx, [rip+fmt(p503x2)+24] + and rcx, rax + + add r8, rdi + adc r9, rsi + adc r10, rsi + adc r11, rcx + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + setc cl + + mov r8, [rip+fmt(p503x2)+32] + and r8, rax + mov r9, 
[rip+fmt(p503x2)+40] + and r9, rax + mov r10, [rip+fmt(p503x2)+48] + and r10, rax + mov r11, [rip+fmt(p503x2)+56] + and r11, rax + + bt rcx, 0 + adc r12, r8 + adc r13, r9 + adc r14, r10 + adc r15, r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +///////////////////////////////////////////////////////////////// MACRO +.macro SUB503_PX P0 + push r12 + push r13 + push r14 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov rcx, [reg_p1+56] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb rcx, [reg_p2+56] + + mov rax, [rip+\P0] + mov rdi, [rip+\P0+8] + mov rsi, [rip+\P0+24] + add r8, rax + mov rax, [rip+\P0+32] + adc r9, rdi + adc r10, rdi + adc r11, rsi + adc r12, rax + mov rdi, [rip+\P0+40] + mov rsi, [rip+\P0+48] + mov rax, [rip+\P0+56] + adc r13, rdi + adc r14, rsi + adc rcx, rax + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], rcx + + pop r14 + pop r13 + pop r12 +.endm + + +//*********************************************************************** +// Multiprecision subtraction with correction with 2*p503 +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p503 +//*********************************************************************** +.global fmt(mp_sub503_p2_asm) +fmt(mp_sub503_p2_asm): + + SUB503_PX fmt(p503x2) + ret + + +#ifdef _MULX_ +#ifdef _ADX_ + +///////////////////////////////////////////////////////////////// MACRO +// z = a x bi + z +// Inputs: base memory pointer M1 (a), +// bi pre-stored in rdx, +// accumulator z in [Z0:Z5] or [Z0:Z8] +// Output: [Z0:Z5] or [Z0:Z8] +// Temps: regs T0:T1 
+///////////////////////////////////////////////////////////////// +.macro MULADD64x512 M1, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z8, T0, T1, C + xor \C, \C + mulx \T0, \T1, \M1 // A0*B0 + adox \Z0, \T1 + adox \Z1, \T0 + mulx \T0, \T1, 8\M1 // A0*B1 + adcx \Z1, \T1 + adox \Z2, \T0 + mulx \T0, \T1, 16\M1 // A0*B2 + adcx \Z2, \T1 + adox \Z3, \T0 + mulx \T0, \T1, 24\M1 // A0*B3 + adcx \Z3, \T1 + adox \Z4, \T0 + mulx \T0, \T1, 32\M1 // A0*B4 + adcx \Z4, \T1 + adox \Z5, \T0 + mulx \T0, \T1, 40\M1 // A0*B5 + adcx \Z5, \T1 + adox \Z6, \T0 + mulx \T0, \T1, 48\M1 // A0*B6 + adcx \Z6, \T1 + adox \Z7, \T0 + mulx \T0, \T1, 56\M1 // A0*B7 + adcx \Z7, \T1 + adox \Z8, \T0 + adc \Z8, 0 +.endm + + +.macro MULADD64x320 M1, Z0, Z1, Z2, Z3, Z4, Z5, T0, T1 + xor \T0, \T0 + mulx \T0, \T1, \M1 // A0*B0 + adox \Z0, \T1 + adox \Z1, \T0 + mulx \T0, \T1, 8\M1 // A0*B1 + adcx \Z1, \T1 + adox \Z2, \T0 + mulx \T0, \T1, 16\M1 // A0*B2 + adcx \Z2, \T1 + adox \Z3, \T0 + mulx \T0, \T1, 24\M1 // A0*B3 + adcx \Z3, \T1 + adox \Z4, \T0 + mulx \T0, \T1, 32\M1 // A0*B4 + adcx \Z4, \T1 + adox \Z5, \T0 + adc \Z5, 0 +.endm + + +///////////////////////////////////////////////////////////////// MACRO +// z = a x b + c x d (mod p) +// Inputs: base memory pointers M0 (a,c), M1 (b,d) +// bi pre-stored in rdx, +// accumulator z in [Z0:Z8], pre-stores a0 x b +// Output: [Z0:Z8] +// Temps: regs T0:T1 +///////////////////////////////////////////////////////////////// +.macro FPDBLMUL512x512 M00, M01, M10, M11, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z8, T0, T1 + mov rdx, \M11 + MULADD64x512 \M01, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \T0, \T1, \T0 + // [Z1:Z8, Z0] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z0 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \T0, \T1 + + // [Z1:Z8, Z0] <- z = a0 x b01 - a1 x b11 + z + mov rdx, 8\M10 + MULADD64x512 \M00, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \T0, \T1, \Z0 + mov rdx, 8\M11 + MULADD64x512 \M01, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, 
\T0, \T1, \T0 + // [Z2:Z8, Z0] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z1 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \T0, \T1 + + // [Z2:Z8, Z0:Z1] <- z = a0 x b02 - a1 x b12 + z + mov rdx, 16\M10 + MULADD64x512 \M00, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \T0, \T1, \Z1 + mov rdx, 16\M11 + MULADD64x512 \M01, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \T0, \T1, \T0 + // [Z3:Z8, Z0:Z1] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z2 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \T0, \T1 + + // [Z3:Z8, Z0:Z2] <- z = a0 x b03 - a1 x b13 + z + mov rdx, 24\M10 + MULADD64x512 \M00, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \T0, \T1, \Z2 + mov rdx, 24\M11 + MULADD64x512 \M01, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \T0, \T1, \T0 + // [Z4:Z8, Z0:Z2] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z3 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \T0, \T1 + + // [Z4:Z8, Z0:Z3] <- z = a0 x b04 - a1 x b14 + z + mov rdx, 32\M10 + MULADD64x512 \M00, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \Z3 + mov rdx, 32\M11 + MULADD64x512 \M01, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \T0 + // [Z5:Z8, Z0:Z3] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z4 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \T0, \T1 + + // [Z5:Z8, Z0:Z4] <- z = a0 x b05 - a1 x b15 + z + mov rdx, 40\M10 + MULADD64x512 \M00, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \Z4 + mov rdx, 40\M11 + MULADD64x512 \M01, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \T0 + // [Z6:Z8, Z0:Z4] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z5 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1 + + // [Z6:Z8, Z0:Z5] <- z = a0 x b06 - a1 x b16 + z + mov rdx, 48\M10 + MULADD64x512 \M00, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \Z5 + mov rdx, 48\M11 + MULADD64x512 \M01, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, 
\Z3, \Z4, \Z5, \T0, \T1, \T0 + // [Z7, Z0:Z5] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z6 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1 + + // [Z7, Z0:Z6] <- z = a0 x b07 - a1 x b17 + z + mov rdx, 56\M10 + MULADD64x512 \M00, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \Z6 + mov rdx, 56\M11 + MULADD64x512 \M01, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \T0 + // [Z8, Z0:Z6] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z7 // rdx <- z0 + mov [rcx], \Z8 + mov [rcx+8], \Z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1 +.endm + + +//*********************************************************************** +// Multiplication in GF(p^2), non-complex part +// Operation: c [reg_p3] = a0 x b0 - a1 x b1 +// Inputs: a = [a1, a0] stored in [reg_p1] +// b = [b1, b0] stored in [reg_p2] +// Output: c stored in [reg_p3] +//*********************************************************************** +.global fmt(fp2mul503_c0_asm) +fmt(fp2mul503_c0_asm): + push r12 + mov rcx, reg_p3 + + // [rcx0:7] <- 8*p - b1 + mov r8, [rip+fmt(p503x8)] + mov r9, [rip+fmt(p503x8)+8] + mov r11, [rip+fmt(p503x8)+24] + mov r12, [rip+fmt(p503x8)+32] + mov rax, [reg_p2+64] + mov rdx, [reg_p2+72] + mov r10, r9 + sub r8, rax + push r13 + sbb r9, rdx + mov rax, [reg_p2+80] + mov rdx, [reg_p2+88] + sbb r10, rax + push r14 + sbb r11, rdx + mov rax, [reg_p2+96] + mov rdx, [reg_p2+104] + mov [rcx], r8 + mov [rcx+8], r9 + mov r13, [rip+fmt(p503x8)+40] + mov r14, [rip+fmt(p503x8)+48] + sbb r12, rax + push r15 + sbb r13, rdx + mov rax, [reg_p2+112] + mov rdx, [reg_p2+120] + mov r15, [rip+fmt(p503x8)+56] + sbb r14, rax + sbb r15, rdx + mov [rcx+16], r10 + + // [r8:r14, rax, r15] <- z = a0 x b00 - a1 x b10 + mov rdx, [reg_p2] + mulx r9, r8, [reg_p1] + mov [rcx+24], r11 + xor rax, rax + mulx r10, r11, [reg_p1+8] + mov [rcx+32], r12 + adcx r9, r11 + mulx r11, r12, [reg_p1+16] + mov [rcx+40], r13 + adcx r10, r12 + mulx r12, r13, 
[reg_p1+24] + mov [rcx+48], r14 + adcx r11, r13 + mulx r13, r14, [reg_p1+32] + mov [rcx+56], r15 + adcx r12, r14 + mulx r14, rax, [reg_p1+40] + push rbx + adcx r13, rax + mulx r15, rax, [reg_p1+48] + push rbp + adcx r14, rax + mulx rax, rbx, [reg_p1+56] + mov rdx, [rcx] + adcx r15, rbx + adc rax, 0 + + FPDBLMUL512x512 [reg_p1], [reg_p1+64], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rax, rbx, rbp + + mov [rcx+16], r9 + mov [rcx+24], r10 + mov [rcx+32], r11 + mov [rcx+40], r12 + mov [rcx+48], r13 + mov [rcx+56], r14 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Multiplication in GF(p^2), complex part +// Operation: c [reg_p3] = a0 x b1 + a1 x b0 +// Inputs: a = [a1, a0] stored in [reg_p1] +// b = [b1, b0] stored in [reg_p2] +// Output: c stored in [reg_p3] +//*********************************************************************** +.global fmt(fp2mul503_c1_asm) +fmt(fp2mul503_c1_asm): + mov rcx, reg_p3 + + // [r8, r9:r15, rax] <- z = a0 x b10 + a1 x b00 + mov rdx, [reg_p2] + mulx r9, r8, [reg_p1+64] // a0 x b10 + xor rax, rax + push r12 + mulx r10, r11, [reg_p1+72] + push r13 + adcx r9, r11 + mulx r11, r12, [reg_p1+80] + push r14 + adcx r10, r12 + mulx r12, r13, [reg_p1+88] + push r15 + adcx r11, r13 + mulx r13, r14, [reg_p1+96] + push rbx + adcx r12, r14 + mulx r14, r15, [reg_p1+104] + push rbp + adcx r13, r15 + mulx r15, rbp, [reg_p1+112] + adcx r14, rbp + mulx rax, rbx, [reg_p1+120] + adcx r15, rbx + adc rax, 0 + + FPDBLMUL512x512 [reg_p1+64], [reg_p1], [reg_p2], [reg_p2+64], r8, r9, r10, r11, r12, r13, r14, r15, rax, rbx, rbp + + mov [rcx+16], r9 + mov [rcx+24], r10 + mov [rcx+32], r11 + mov [rcx+40], r12 + mov [rcx+48], r13 + mov [rcx+56], r14 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +#else + +# error "CONFIGURATION NOT SUPPORTED. 
TRY USE_MULX=TRUE USE_ADX=TRUE" + +#endif + +#else + +//*********************************************************************** +// Integer multiplication +// Based on Karatsuba method +// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] +// NOTE: a=c or b=c are not allowed +//*********************************************************************** +.global fmt(mul503_asm) +fmt(mul503_asm): + push r12 + push r13 + push r14 + mov rcx, reg_p3 + + // rcx[0-3] <- AH+AL + xor rax, rax + mov r8, [reg_p1+32] + mov r9, [reg_p1+40] + mov r10, [reg_p1+48] + mov r11, [reg_p1+56] + add r8, [reg_p1] + adc r9, [reg_p1+8] + adc r10, [reg_p1+16] + adc r11, [reg_p1+24] + push r15 + mov [rcx], r8 + mov [rcx+8], r9 + mov [rcx+16], r10 + mov [rcx+24], r11 + sbb rax, 0 + sub rsp, 80 // Allocating space in stack + + // r12-r15 <- BH+BL + xor rdx, rdx + mov r12, [reg_p2+32] + mov r13, [reg_p2+40] + mov r14, [reg_p2+48] + mov r15, [reg_p2+56] + add r12, [reg_p2] + adc r13, [reg_p2+8] + adc r14, [reg_p2+16] + adc r15, [reg_p2+24] + sbb rdx, 0 + mov [rsp+64], rax + mov [rsp+72], rdx + + // (rsp[0-3],r8,r9,r10,r11) <- (AH+AL)*(BH+BL) + mov rax, [rcx] + mul r12 + mov [rsp], rax // c0 + mov r8, rdx + + xor r9, r9 + mov rax, [rcx] + mul r13 + add r8, rax + adc r9, rdx + + xor r10, r10 + mov rax, [rcx+8] + mul r12 + add r8, rax + mov [rsp+8], r8 // c1 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [rcx] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+16] + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+8] + mul r13 + add r9, rax + mov [rsp+16], r9 // c2 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [rcx] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+24] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+8] + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+16] + mul r13 + add r10, rax + mov [rsp+24], r10 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [rcx+8] + mul r15 + add 
r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+24] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+16] + mul r14 + add r8, rax + mov [rsp+32], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r11, r11 + mov rax, [rcx+16] + mul r15 + add r9, rax + adc r10, rdx + adc r11, 0 + + mov rax, [rcx+24] + mul r14 + add r9, rax // c5 + adc r10, rdx + adc r11, 0 + + mov rax, [rcx+24] + mul r15 + add r10, rax // c6 + adc r11, rdx // c7 + + mov rax, [rsp+64] + and r12, rax + and r13, rax + and r14, rax + and r15, rax + add r12, r8 + adc r13, r9 + adc r14, r10 + adc r15, r11 + + mov rax, [rsp+72] + mov r8, [rcx] + mov r9, [rcx+8] + mov r10, [rcx+16] + mov r11, [rcx+24] + and r8, rax + and r9, rax + and r10, rax + and r11, rax + add r8, r12 + adc r9, r13 + adc r10, r14 + adc r11, r15 + mov [rsp+32], r8 + mov [rsp+40], r9 + mov [rsp+48], r10 + mov [rsp+56], r11 + + // rcx[0-7] <- AL*BL + mov r11, [reg_p1] + mov rax, [reg_p2] + mul r11 + xor r9, r9 + mov [rcx], rax // c0 + mov r8, rdx + + mov r14, [reg_p1+16] + mov rax, [reg_p2+8] + mul r11 + xor r10, r10 + add r8, rax + adc r9, rdx + + mov r12, [reg_p1+8] + mov rax, [reg_p2] + mul r12 + add r8, rax + mov [rcx+8], r8 // c1 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+16] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p2] + mov rax, r14 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+8] + mul r12 + add r9, rax + mov [rcx+16], r9 // c2 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+24] + mul r11 + mov r15, [reg_p1+24] + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, r15 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+16] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+8] + mul r14 + add r10, rax + mov [rcx+24], r10 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+24] + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+8] + mul r15 + add r8, rax + 
adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+16] + mul r14 + add r8, rax + mov [rcx+32], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+24] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+16] + mul r15 + add r9, rax + mov [rcx+40], r9 // c5 + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+24] + mul r15 + add r10, rax + mov [rcx+48], r10 // c6 + adc r8, rdx + mov [rcx+56], r8 // c7 + + // rcx[8-15] <- AH*BH + mov r11, [reg_p1+32] + mov rax, [reg_p2+32] + mul r11 + xor r9, r9 + mov [rcx+64], rax // c0 + mov r8, rdx + + mov r14, [reg_p1+48] + mov rax, [reg_p2+40] + mul r11 + xor r10, r10 + add r8, rax + adc r9, rdx + + mov r12, [reg_p1+40] + mov rax, [reg_p2+32] + mul r12 + add r8, rax + mov [rcx+72], r8 // c1 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+48] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p2+32] + mov rax, r14 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+40] + mul r12 + add r9, rax + mov [rcx+80], r9 // c2 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+56] + mul r11 + mov r15, [reg_p1+56] + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, r15 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+48] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+40] + mul r14 + add r10, rax + mov [rcx+88], r10 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+56] + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+40] + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+48] + mul r14 + add r8, rax + mov [rcx+96], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+56] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+48] + mul r15 + add r9, rax + mov [rcx+104], r9 // c5 + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+56] + mul r15 + add r10, rax + mov [rcx+112], r10 // c6 + adc r8, rdx + mov [rcx+120], r8 
// c7 + + // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL + mov r8, [rsp] + sub r8, [rcx] + mov r9, [rsp+8] + sbb r9, [rcx+8] + mov r10, [rsp+16] + sbb r10, [rcx+16] + mov r11, [rsp+24] + sbb r11, [rcx+24] + mov r12, [rsp+32] + sbb r12, [rcx+32] + mov r13, [rsp+40] + sbb r13, [rcx+40] + mov r14, [rsp+48] + sbb r14, [rcx+48] + mov r15, [rsp+56] + sbb r15, [rcx+56] + + // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH + mov rax, [rcx+64] + sub r8, rax + mov rax, [rcx+72] + sbb r9, rax + mov rax, [rcx+80] + sbb r10, rax + mov rax, [rcx+88] + sbb r11, rax + mov rax, [rcx+96] + sbb r12, rax + mov rdx, [rcx+104] + sbb r13, rdx + mov rdi, [rcx+112] + sbb r14, rdi + mov rsi, [rcx+120] + sbb r15, rsi + + // Final result + add r8, [rcx+32] + mov [rcx+32], r8 + adc r9, [rcx+40] + mov [rcx+40], r9 + adc r10, [rcx+48] + mov [rcx+48], r10 + adc r11, [rcx+56] + mov [rcx+56], r11 + adc r12, [rcx+64] + mov [rcx+64], r12 + adc r13, [rcx+72] + mov [rcx+72], r13 + adc r14, [rcx+80] + mov [rcx+80], r14 + adc r15, [rcx+88] + mov [rcx+88], r15 + adc rax, 0 + mov [rcx+96], rax + adc rdx, 0 + mov [rcx+104], rdx + adc rdi, 0 + mov [rcx+112], rdi + adc rsi, 0 + mov [rcx+120], rsi + + add rsp, 80 // Restoring space in stack + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Montgomery reduction +// Based on comba method +// Operation: c [reg_p2] = a [reg_p1] +// NOTE: a=c is not allowed +//*********************************************************************** +.global fmt(rdc503_asm) +fmt(rdc503_asm): + push r12 + push r13 + push r14 + push r15 + + mov r11, [reg_p1] + mov rax, [rip+fmt(p503p1)+24] + mul r11 + xor r8, r8 + add rax, [reg_p1+24] + mov [reg_p2+24], rax // z3 + adc r8, rdx + + xor r9, r9 + mov rax, [rip+fmt(p503p1)+32] + mul r11 + xor r10, r10 + add r8, rax + adc r9, rdx + + mov r12, [reg_p1+8] + mov rax, [rip+fmt(p503p1)+24] + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+32] + mov 
[reg_p2+32], r8 // z4 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + mov rax, [rip+fmt(p503p1)+40] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rip+fmt(p503p1)+32] + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p1+16] + mov rax, [rip+fmt(p503p1)+24] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+40] + mov [reg_p2+40], r9 // z5 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + mov rax, [rip+fmt(p503p1)+48] + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rip+fmt(p503p1)+40] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rip+fmt(p503p1)+32] + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r14, [reg_p2+24] + mov rax, [rip+fmt(p503p1)+24] + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+48] + mov [reg_p2+48], r10 // z6 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + mov rax, [rip+fmt(p503p1)+56] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rip+fmt(p503p1)+48] + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rip+fmt(p503p1)+40] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rip+fmt(p503p1)+32] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r15, [reg_p2+32] + mov rax, [rip+fmt(p503p1)+24] + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+56] + mov [reg_p2+56], r8 // z7 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + mov rax, [rip+fmt(p503p1)+56] + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rip+fmt(p503p1)+48] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rip+fmt(p503p1)+40] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rip+fmt(p503p1)+32] + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rcx, [reg_p2+40] + mov rax, [rip+fmt(p503p1)+24] + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+64] + mov [reg_p2], r9 // z0 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + mov rax, [rip+fmt(p503p1)+56] + mul r13 + add 
r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rip+fmt(p503p1)+48] + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rip+fmt(p503p1)+40] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rip+fmt(p503p1)+32] + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r13, [reg_p2+48] + mov rax, [rip+fmt(p503p1)+24] + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+72] + mov [reg_p2+8], r10 // z1 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + mov rax, [rip+fmt(p503p1)+56] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rip+fmt(p503p1)+48] + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rip+fmt(p503p1)+40] + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rip+fmt(p503p1)+32] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r14, [reg_p2+56] + mov rax, [rip+fmt(p503p1)+24] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+80] + mov [reg_p2+16], r8 // z2 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + mov rax, [rip+fmt(p503p1)+56] + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rip+fmt(p503p1)+48] + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rip+fmt(p503p1)+40] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rip+fmt(p503p1)+32] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+88] + mov [reg_p2+24], r9 // z3 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + mov rax, [rip+fmt(p503p1)+56] + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rip+fmt(p503p1)+48] + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rip+fmt(p503p1)+40] + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+96] + mov [reg_p2+32], r10 // z4 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + mov rax, [rip+fmt(p503p1)+56] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rip+fmt(p503p1)+48] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+104] // z5 + 
mov [reg_p2+40], r8 // z5 + adc r9, 0 + adc r10, 0 + + mov rax, [rip+fmt(p503p1)+56] + mul r14 + add r9, rax + adc r10, rdx + add r9, [reg_p1+112] // z6 + mov [reg_p2+48], r9 // z6 + adc r10, 0 + add r10, [reg_p1+120] // z7 + mov [reg_p2+56], r10 // z7 + + pop r15 + pop r14 + pop r13 + pop r12 + ret + +#endif + + +///////////////////////////////////////////////////////////////// MACRO +// z = z + a x b +// Inputs: base memory pointers M0 (a), M1 (b) +// bi pre-stored in rdx, +// accumulator z in [Z0:Z8], pre-stores a0 x b +// Output: [Z0:Z7] +// Temps: regs T0:T1 +///////////////////////////////////////////////////////////////// +.macro FPMUL512x512 M0, M1, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z8, T0, T1 + // [Z1:Z7, Z8] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z0 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \T0, \T1 + + // [Z1:Z7, Z8, Z0] <- z = a01 x a1 + z + mov rdx, 8\M0 + MULADD64x512 \M1, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \T0, \T1, \Z0 + // [Z2:Z7, Z8, Z0] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z1 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \T0, \T1 + + // [Z2:Z7, Z8, Z0:Z1] <- z = a02 x a1 + z + mov rdx, 16\M0 + MULADD64x512 \M1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \T0, \T1, \Z1 + // [Z3:Z7, Z8, Z0:Z1] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z2 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \T0, \T1 + + // [Z3:Z7, Z8, Z8, Z0:Z2] <- z = a03 x a1 + z + mov rdx, 24\M0 + MULADD64x512 \M1, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \T0, \T1, \Z2 + // [Z4:Z7, Z8, Z0:Z2] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z3 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \T0, \T1 + + // [Z4:Z7, Z8, Z0:Z3] <- z = a04 x a1 + z + mov rdx, 32\M0 + MULADD64x512 \M1, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \Z3 + // [Z5:Z7, Z8, Z0:Z3] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z4 // rdx <- z0 + MULADD64x320 
[rip+fmt(p503p1)+24], \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \T0, \T1 + + // [Z5:Z7, Z8, Z0:Z4] <- z = a05 x a1 + z + mov rdx, 40\M0 + MULADD64x512 \M1, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \Z4 + // [Z6:Z7, Z8, Z0:Z4] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z5 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1 + + // [Z6:Z7, Z8, Z0:Z5] <- z = a06 x a1 + z + mov rdx, 48\M0 + MULADD64x512 \M1, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \Z5 + // [Z7, Z8, Z0:Z5] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z6 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1 + + // [Z7, Z8, Z0:Z6] <- z = a07 x a1 + z + mov rdx, 56\M0 + MULADD64x512 \M1, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \Z6 + // [Z8, Z0:Z6] <- z = (z0 x p503p1 + z)/2^64 + mov rdx, \Z7 // rdx <- z0 + MULADD64x320 [rip+fmt(p503p1)+24], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1 +.endm //*********************************************************************** -// Field subtraction -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] -//*********************************************************************** -.global fmt(fpsub503_asm) -fmt(fpsub503_asm): - push r12 - push r13 - push r14 - push r15 - - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - mov r15, [reg_p1+56] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb r14, [reg_p2+48] - sbb r15, [reg_p2+56] - sbb rax, 0 - - mov rdi, [rip+fmt(p503x2)] - and rdi, rax - mov rsi, [rip+fmt(p503x2)+8] - and rsi, rax - mov rcx, [rip+fmt(p503x2)+24] - and rcx, rax - - add r8, rdi - adc r9, rsi - adc r10, rsi - adc r11, rcx - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - setc cl - - mov r8, [rip+fmt(p503x2)+32] - and r8, rax - 
mov r9, [rip+fmt(p503x2)+40] - and r9, rax - mov r10, [rip+fmt(p503x2)+48] - and r10, rax - mov r11, [rip+fmt(p503x2)+56] - and r11, rax - - bt rcx, 0 - adc r12, r8 - adc r13, r9 - adc r14, r10 - adc r15, r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - mov [reg_p3+56], r15 - - pop r15 - pop r14 - pop r13 - pop r12 - ret - - -///////////////////////////////////////////////////////////////// MACRO -.macro SUB503_PX P0 - push r12 - push r13 - push r14 - - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - mov rcx, [reg_p1+56] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb r14, [reg_p2+48] - sbb rcx, [reg_p2+56] - - mov rax, [rip+\P0] - mov rdi, [rip+\P0+8] - mov rsi, [rip+\P0+24] - add r8, rax - mov rax, [rip+\P0+32] - adc r9, rdi - adc r10, rdi - adc r11, rsi - adc r12, rax - mov rdi, [rip+\P0+40] - mov rsi, [rip+\P0+48] - mov rax, [rip+\P0+56] - adc r13, rdi - adc r14, rsi - adc rcx, rax - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - mov [reg_p3+56], rcx - - pop r14 - pop r13 - pop r12 - .endm - - -//*********************************************************************** -// Multiprecision subtraction with correction with 2*p503 -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p503 -//*********************************************************************** -.global fmt(mp_sub503_p2_asm) -fmt(mp_sub503_p2_asm): - - SUB503_PX fmt(p503x2) - ret - - -//*********************************************************************** -// Multiprecision subtraction with correction with 4*p503 -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p503 -//*********************************************************************** -.global 
fmt(mp_sub503_p4_asm) -fmt(mp_sub503_p4_asm): - - SUB503_PX fmt(p503x4) - ret - - -#ifdef _MULX_ - -///////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication -// Inputs: memory pointers M0 and M1 -// Outputs: memory pointer C and regs T1, T2, T3, T4, T8 -// Temps: regs T0:T9 -///////////////////////////////////////////////////////////////// - -#ifdef _ADX_ -.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - xor rax, rax - adox \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adox \T1, \T3 - mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 - adox \T2, \T4 - - mov rdx, 8\M0 - mulx \T5, \T4, \M1 // T5:T4 = A1*B0 - adox \T3, rax - xor rax, rax - mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 - adox \T4, \T0 - mov 8\C, \T4 // C1_final - adcx \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 - adcx \T6, \T8 - adox \T5, \T1 - mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 - adcx \T7, \T9 - adcx \T8, rax - adox \T6, \T2 - - mov rdx, 16\M0 - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - adox \T7, \T3 - adox \T8, rax - xor rax, rax - mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 - adox \T0, \T5 - mov 16\C, \T0 // C2_final - adcx \T1, \T3 - mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 - adcx \T2, \T4 - adox \T1, \T6 - mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 - adcx \T3, \T9 - mov rdx, 24\M0 - adcx \T4, rax - - adox \T2, \T7 - adox \T3, \T8 - adox \T4, rax - - mulx \T5, \T0, \M1 // T5:T0 = A3*B0 - xor rax, rax - mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 - adcx \T5, \T7 - adox \T1, \T0 - mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 - adcx \T6, \T8 - adox \T2, \T5 - mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 - adcx \T7, \T9 - adcx \T8, rax - - adox \T3, \T6 - adox \T4, \T7 - adox \T8, rax -.endm - -#else - -.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx 
\T1, \T2, 8\M1 // T1:T2 = A0*B1 - add \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adc \T1, \T3 - mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 - adc \T2, \T4 - mov rdx, 8\M0 - adc \T3, 0 - - mulx \T5, \T4, \M1 // T5:T4 = A1*B0 - mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 - add \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 - adc \T6, \T8 - mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 - adc \T7, \T9 - adc \T8, 0 - - add \T4, \T0 - mov 8\C, \T4 // C1_final - adc \T5, \T1 - adc \T6, \T2 - adc \T7, \T3 - mov rdx, 16\M0 - adc \T8, 0 - - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 - add \T1, \T3 - mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 - adc \T2, \T4 - mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 - adc \T3, \T9 - mov rdx, 24\M0 - adc \T4, 0 - - add \T0, \T5 - mov 16\C, \T0 // C2_final - adc \T1, \T6 - adc \T2, \T7 - adc \T3, \T8 - adc \T4, 0 - - mulx \T5, \T0, \M1 // T5:T0 = A3*B0 - mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 - add \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 - adc \T6, \T8 - mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 - adc \T7, \T9 - adc \T8, 0 - - add \T1, \T0 - adc \T2, \T5 - adc \T3, \T6 - adc \T4, \T7 - adc \T8, 0 -.endm -#endif - - -//***************************************************************************** -// 503-bit multiplication using Karatsuba (one level), schoolbook (one level) -//***************************************************************************** -.global fmt(mul503_asm) -fmt(mul503_asm): - push r12 - push r13 - push r14 - push r15 - mov rcx, reg_p3 - - // r8-r11 <- AH + AL, rax <- mask - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - push rbx - push rbp - sub rsp, 96 - add r8, [reg_p1+32] - adc r9, [reg_p1+40] - adc r10, [reg_p1+48] - adc r11, [reg_p1+56] - sbb rax, 0 - mov [rsp], r8 - mov [rsp+8], r9 - mov [rsp+16], r10 - mov [rsp+24], r11 - - // r12-r15 <- BH + BL, rbx <- mask - xor rbx, rbx - mov r12, [reg_p2] - mov r13, [reg_p2+8] - mov r14, [reg_p2+16] - mov r15, 
[reg_p2+24] - add r12, [reg_p2+32] - adc r13, [reg_p2+40] - adc r14, [reg_p2+48] - adc r15, [reg_p2+56] - sbb rbx, 0 - mov [rsp+32], r12 - mov [rsp+40], r13 - mov [rsp+48], r14 - mov [rsp+56], r15 - - // r12-r15 <- masked (BH + BL) - and r12, rax - and r13, rax - and r14, rax - and r15, rax - - // r8-r11 <- masked (AH + AL) - and r8, rbx - and r9, rbx - and r10, rbx - and r11, rbx - - // r8-r11 <- masked (AH + AL) + masked (AH + AL) - add r8, r12 - adc r9, r13 - adc r10, r14 - adc r11, r15 - mov [rsp+64], r8 - mov [rsp+72], r9 - mov [rsp+80], r10 - mov [rsp+88], r11 - - // [rcx+64], r9-r12, rbx <- (AH+AL) x (BH+BL), low part - MUL256_SCHOOL [rsp], [rsp+32], [rcx+64], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp - mov [rcx+88], r9 - mov [rcx+96], r10 - mov [rcx+104], r11 - mov [rcx+112], r12 - mov [rcx+120], rbx - - // [rcx], r9-r12, rbx <- AL x BL - MUL256_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp // Result C0-C3 - mov [rcx+24], r9 - mov [rcx+32], r10 - mov [rcx+40], r11 - mov [rcx+48], r12 - mov [rcx+56], rbx - - // [rsp], rbx, rbp, r13-r15 <- AH x BH - MUL256_SCHOOL [reg_p1+32], [reg_p2+32], [rsp], r8, rbx, rbp, r13, r14, r9, r10, r11, r15, r12 - - // r8-r11 <- (AH+AL) x (BH+BL), final step - mov r8, [rsp+64] - mov r9, [rsp+72] - mov r10, [rsp+80] - mov r11, [rsp+88] - mov rax, [rcx+96] - add r8, rax - mov rax, [rcx+104] - adc r9, rax - mov rax, [rcx+112] - adc r10, rax - mov rax, [rcx+120] - adc r11, rax - - // r8-r12, rdi, rsi, rdx <- (AH+AL) x (BH+BL) - ALxBL - mov r12, [rcx+64] - mov rdi, [rcx+72] - mov rsi, [rcx+80] - mov rdx, [rcx+88] - sub r12, [rcx] - sbb rdi, [rcx+8] - sbb rsi, [rcx+16] - sbb rdx, [rcx+24] - sbb r8, [rcx+32] - sbb r9, [rcx+40] - sbb r10, [rcx+48] - sbb r11, [rcx+56] - - // r8-r12, rdi, rsi, rdx <- (AH+AL) x (BH+BL) - ALxBL - AHxBH - sub r12, [rsp] - sbb rdi, [rsp+8] - sbb rsi, [rsp+16] - sbb rdx, rbx - sbb r8, rbp - sbb r9, r13 - sbb r10, r14 - sbb r11, r15 - - add r12, [rcx+32] - mov [rcx+32], 
r12 // Result C4-C7 - adc rdi, [rcx+40] - mov [rcx+40], rdi - adc rsi, [rcx+48] - mov [rcx+48], rsi - adc rdx, [rcx+56] - mov [rcx+56], rdx - mov rax, [rsp] - adc r8, rax - mov [rcx+64], r8 // Result C8-C15 - mov rax, [rsp+8] - adc r9, rax - mov [rcx+72], r9 - mov rax, [rsp+16] - adc r10, rax - mov [rcx+80], r10 - adc r11, rbx - mov [rcx+88], r11 - adc rbp, 0 - mov [rcx+96], rbp - adc r13, 0 - mov [rcx+104], r13 - adc r14, 0 - mov [rcx+112], r14 - adc r15, 0 - mov [rcx+120], r15 - - add rsp, 96 - pop rbp - pop rbx - pop r15 - pop r14 - pop r13 - pop r12 +// Squaring in GF(p^2), non-complex part +// Operation: c [reg_p2] = (a0+a1) x (a0-a1) +// Inputs: a = [a1, a0] stored in [reg_p1] +// Output: c stored in [reg_p2] +//*********************************************************************** +.global fmt(fp2sqr503_c0_asm) +fmt(fp2sqr503_c0_asm): + push r12 + push r13 + + // a0 + a1 + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + add r8, [reg_p1+64] + adc r9, [reg_p1+72] + push r14 + adc r10, [reg_p1+80] + adc r11, [reg_p1+88] + push r15 + adc r12, [reg_p1+96] + adc r13, [reg_p1+104] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + adc r14, [reg_p1+112] + adc r15, [reg_p1+120] + mov [reg_p2+8], r9 + mov [reg_p2+16], r10 + mov [reg_p2+24], r11 + mov [reg_p2+32], r12 + mov [reg_p2+40], r13 + mov [reg_p2+48], r14 + mov [reg_p2+56], r15 + + // a0 - a1 + 4xp503 + mov rcx, [reg_p1] + mov r10, [reg_p1+8] + mov r12, [reg_p1+16] + mov r13, [reg_p1+24] + mov r14, [reg_p1+32] + mov r15, [reg_p1+40] + sub rcx, [reg_p1+64] + sbb r10, [reg_p1+72] + push rbx + sbb r12, [reg_p1+80] + sbb r13, [reg_p1+88] + push rbp + sbb r14, [reg_p1+96] + sbb r15, [reg_p1+104] + mov rbx, [reg_p1+48] + mov rbp, [reg_p1+56] + sbb rbx, [reg_p1+112] + sbb rbp, [reg_p1+120] + add rcx, [rip+fmt(p503x4)] + mov rdx, [rip+fmt(p503x4)+8] + adc r10, rdx + adc r12, rdx + adc r13, [rip+fmt(p503x4)+24] + adc r14, 
[rip+fmt(p503x4)+32] + adc r15, [rip+fmt(p503x4)+40] + adc rbx, [rip+fmt(p503x4)+48] + adc rbp, [rip+fmt(p503x4)+56] + mov [reg_p2+64], rcx + mov [reg_p2+72], r10 + + // [r8:r15, rbp] <- z = a00 x a1 + mov rdx, r8 + mulx r9, r8, rcx + xor rax, rax + mov [reg_p2+80], r12 + mulx r10, r11, r10 + mov [reg_p2+88], r13 + adox r9, r11 + mulx r11, r12, r12 + mov [reg_p2+96], r14 + adox r10, r12 + mulx r12, r13, r13 + mov [reg_p2+104], r15 + adox r11, r13 + mulx r13, r14, r14 + mov [reg_p2+112], rbx + adox r12, r14 + mulx r14, r15, r15 + mov [reg_p2+120], rbp + adox r13, r15 + mulx r15, rbp, rbx + adox r14, rbp + mulx rbp, rbx, [reg_p2+120] + adox r15, rbx + adox rbp, rax + + FPMUL512x512 [reg_p2], [reg_p2+64], r8, r9, r10, r11, r12, r13, r14, r15, rbp, rbx, rcx + + mov [reg_p2], rbp + mov [reg_p2+8], r8 + mov [reg_p2+16], r9 + mov [reg_p2+24], r10 + mov [reg_p2+32], r11 + mov [reg_p2+40], r12 + mov [reg_p2+48], r13 + mov [reg_p2+56], r14 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 ret -#else //*********************************************************************** -// Integer multiplication -// Based on Karatsuba method -// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] -// NOTE: a=c or b=c are not allowed -//*********************************************************************** -.global fmt(mul503_asm) -fmt(mul503_asm): - push r12 - push r13 - push r14 - mov rcx, reg_p3 - - // rcx[0-3] <- AH+AL - xor rax, rax - mov r8, [reg_p1+32] - mov r9, [reg_p1+40] - mov r10, [reg_p1+48] - mov r11, [reg_p1+56] - add r8, [reg_p1] - adc r9, [reg_p1+8] - adc r10, [reg_p1+16] - adc r11, [reg_p1+24] - push r15 - mov [rcx], r8 - mov [rcx+8], r9 - mov [rcx+16], r10 - mov [rcx+24], r11 - sbb rax, 0 - sub rsp, 80 // Allocating space in stack - - // r12-r15 <- BH+BL - xor rdx, rdx - mov r12, [reg_p2+32] - mov r13, [reg_p2+40] - mov r14, [reg_p2+48] - mov r15, [reg_p2+56] - add r12, [reg_p2] - adc r13, [reg_p2+8] - adc r14, [reg_p2+16] - adc r15, [reg_p2+24] - sbb rdx, 0 - mov 
[rsp+64], rax - mov [rsp+72], rdx - - // (rsp[0-3],r8,r9,r10,r11) <- (AH+AL)*(BH+BL) - mov rax, [rcx] - mul r12 - mov [rsp], rax // c0 - mov r8, rdx - - xor r9, r9 - mov rax, [rcx] - mul r13 - add r8, rax - adc r9, rdx - - xor r10, r10 - mov rax, [rcx+8] - mul r12 - add r8, rax - mov [rsp+8], r8 // c1 - adc r9, rdx - adc r10, 0 - - xor r8, r8 - mov rax, [rcx] - mul r14 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [rcx+16] - mul r12 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [rcx+8] - mul r13 - add r9, rax - mov [rsp+16], r9 // c2 - adc r10, rdx - adc r8, 0 - - xor r9, r9 - mov rax, [rcx] - mul r15 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [rcx+24] - mul r12 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [rcx+8] - mul r14 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [rcx+16] - mul r13 - add r10, rax - mov [rsp+24], r10 // c3 - adc r8, rdx - adc r9, 0 - - xor r10, r10 - mov rax, [rcx+8] - mul r15 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [rcx+24] - mul r13 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [rcx+16] - mul r14 - add r8, rax - mov [rsp+32], r8 // c4 - adc r9, rdx - adc r10, 0 - - xor r11, r11 - mov rax, [rcx+16] - mul r15 - add r9, rax - adc r10, rdx - adc r11, 0 - - mov rax, [rcx+24] - mul r14 - add r9, rax // c5 - adc r10, rdx - adc r11, 0 - - mov rax, [rcx+24] - mul r15 - add r10, rax // c6 - adc r11, rdx // c7 - - mov rax, [rsp+64] - and r12, rax - and r13, rax - and r14, rax - and r15, rax - add r12, r8 - adc r13, r9 - adc r14, r10 - adc r15, r11 - - mov rax, [rsp+72] - mov r8, [rcx] - mov r9, [rcx+8] - mov r10, [rcx+16] - mov r11, [rcx+24] - and r8, rax - and r9, rax - and r10, rax - and r11, rax - add r8, r12 - adc r9, r13 - adc r10, r14 - adc r11, r15 - mov [rsp+32], r8 - mov [rsp+40], r9 - mov [rsp+48], r10 - mov [rsp+56], r11 - - // rcx[0-7] <- AL*BL - mov r11, [reg_p1] - mov rax, [reg_p2] - mul r11 - xor r9, r9 - mov [rcx], rax // c0 - mov r8, rdx - - mov r14, [reg_p1+16] - mov 
rax, [reg_p2+8] - mul r11 - xor r10, r10 - add r8, rax - adc r9, rdx - - mov r12, [reg_p1+8] - mov rax, [reg_p2] - mul r12 - add r8, rax - mov [rcx+8], r8 // c1 - adc r9, rdx - adc r10, 0 - - xor r8, r8 - mov rax, [reg_p2+16] - mul r11 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov r13, [reg_p2] - mov rax, r14 - mul r13 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [reg_p2+8] - mul r12 - add r9, rax - mov [rcx+16], r9 // c2 - adc r10, rdx - adc r8, 0 - - xor r9, r9 - mov rax, [reg_p2+24] - mul r11 - mov r15, [reg_p1+24] - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, r15 - mul r13 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [reg_p2+16] - mul r12 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [reg_p2+8] - mul r14 - add r10, rax - mov [rcx+24], r10 // c3 - adc r8, rdx - adc r9, 0 - - xor r10, r10 - mov rax, [reg_p2+24] - mul r12 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [reg_p2+8] - mul r15 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [reg_p2+16] - mul r14 - add r8, rax - mov [rcx+32], r8 // c4 - adc r9, rdx - adc r10, 0 - - xor r8, r8 - mov rax, [reg_p2+24] - mul r14 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [reg_p2+16] - mul r15 - add r9, rax - mov [rcx+40], r9 // c5 - adc r10, rdx - adc r8, 0 - - mov rax, [reg_p2+24] - mul r15 - add r10, rax - mov [rcx+48], r10 // c6 - adc r8, rdx - mov [rcx+56], r8 // c7 - - // rcx[8-15] <- AH*BH - mov r11, [reg_p1+32] - mov rax, [reg_p2+32] - mul r11 - xor r9, r9 - mov [rcx+64], rax // c0 - mov r8, rdx - - mov r14, [reg_p1+48] - mov rax, [reg_p2+40] - mul r11 - xor r10, r10 - add r8, rax - adc r9, rdx - - mov r12, [reg_p1+40] - mov rax, [reg_p2+32] - mul r12 - add r8, rax - mov [rcx+72], r8 // c1 - adc r9, rdx - adc r10, 0 - - xor r8, r8 - mov rax, [reg_p2+48] - mul r11 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov r13, [reg_p2+32] - mov rax, r14 - mul r13 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [reg_p2+40] - mul r12 - add r9, rax - mov [rcx+80], 
r9 // c2 - adc r10, rdx - adc r8, 0 - - xor r9, r9 - mov rax, [reg_p2+56] - mul r11 - mov r15, [reg_p1+56] - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, r15 - mul r13 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [reg_p2+48] - mul r12 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [reg_p2+40] - mul r14 - add r10, rax - mov [rcx+88], r10 // c3 - adc r8, rdx - adc r9, 0 - - xor r10, r10 - mov rax, [reg_p2+56] - mul r12 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [reg_p2+40] - mul r15 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [reg_p2+48] - mul r14 - add r8, rax - mov [rcx+96], r8 // c4 - adc r9, rdx - adc r10, 0 - - xor r8, r8 - mov rax, [reg_p2+56] - mul r14 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [reg_p2+48] - mul r15 - add r9, rax - mov [rcx+104], r9 // c5 - adc r10, rdx - adc r8, 0 - - mov rax, [reg_p2+56] - mul r15 - add r10, rax - mov [rcx+112], r10 // c6 - adc r8, rdx - mov [rcx+120], r8 // c7 - - // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL - mov r8, [rsp] - sub r8, [rcx] - mov r9, [rsp+8] - sbb r9, [rcx+8] - mov r10, [rsp+16] - sbb r10, [rcx+16] - mov r11, [rsp+24] - sbb r11, [rcx+24] - mov r12, [rsp+32] - sbb r12, [rcx+32] - mov r13, [rsp+40] - sbb r13, [rcx+40] - mov r14, [rsp+48] - sbb r14, [rcx+48] - mov r15, [rsp+56] - sbb r15, [rcx+56] - - // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH - mov rax, [rcx+64] - sub r8, rax - mov rax, [rcx+72] - sbb r9, rax - mov rax, [rcx+80] - sbb r10, rax - mov rax, [rcx+88] - sbb r11, rax - mov rax, [rcx+96] - sbb r12, rax - mov rdx, [rcx+104] - sbb r13, rdx - mov rdi, [rcx+112] - sbb r14, rdi - mov rsi, [rcx+120] - sbb r15, rsi - - // Final result - add r8, [rcx+32] - mov [rcx+32], r8 - adc r9, [rcx+40] - mov [rcx+40], r9 - adc r10, [rcx+48] - mov [rcx+48], r10 - adc r11, [rcx+56] - mov [rcx+56], r11 - adc r12, [rcx+64] - mov [rcx+64], r12 - adc r13, [rcx+72] - mov [rcx+72], r13 - adc r14, [rcx+80] - mov [rcx+80], r14 - adc r15, [rcx+88] - mov [rcx+88], r15 - adc rax, 
0 - mov [rcx+96], rax - adc rdx, 0 - mov [rcx+104], rdx - adc rdi, 0 - mov [rcx+112], rdi - adc rsi, 0 - mov [rcx+120], rsi - - add rsp, 80 // Restoring space in stack - pop r15 - pop r14 - pop r13 - pop r12 - ret - -#endif - - -#ifdef _MULX_ - -///////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication -// Inputs: regs I0 and I1, and memory pointer M1 -// Outputs: regs T0:T5 -// Temps: regs T0:T5 -///////////////////////////////////////////////////////////////// - -#ifdef _ADX_ -.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5 - mulx \T2, \T4, 8\M1 - xor rax, rax - mulx \T3, \T5, 16\M1 - ADD1 \T1, \T4 - ADC1 \T2, \T5 - mulx \T4, \T5, 24\M1 - ADC1 \T3, \T5 - ADC1 \T4, rax - - xor rax, rax - mov rdx, \I1 - mulx \I1, \T5, \M1 - ADD2 \T1, \T5 // T1 <- C1_final - ADC2 \T2, \I1 - mulx \T5, \I1, 8\M1 - ADC2 \T3, \T5 - ADD1 \T2, \I1 - mulx \T5, \I1, 16\M1 - ADC2 \T4, \T5 - ADC1 \T3, \I1 - mulx \T5, \I1, 24\M1 - ADC2 \T5, rax - ADC1 \T4, \I1 - ADC1 \T5, rax -.endm - -#else - -.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5 - mulx \T2, \T4, 8\M1 - mulx \T3, \T5, 16\M1 - add \T1, \T4 - adc \T2, \T5 - mulx \T4, \T5, 24\M1 - adc \T3, \T5 - adc \T4, 0 - - mov rdx, \I1 - mulx \I1, \T5, \M1 - add \T1, \T5 // T1 <- C1_final - adc \T2, \I1 - mulx \T5, \I1, 8\M1 - adc \T3, \T5 - mulx \T5, rax, 16\M1 - adc \T4, \T5 - mulx \T5, rdx, 24\M1 - adc \T5, 0 - add \T2, \I1 - adc \T3, rax - adc \T4, rdx - adc \T5, 0 -.endm -#endif - - -//************************************************************************************** -// Montgomery reduction -// Based on method described in Faz-Hernandez et al. 
https://eprint.iacr.org/2017/1015 -// Operation: c [reg_p2] = a [reg_p1] -//************************************************************************************** -.global fmt(rdc503_asm) -fmt(rdc503_asm): - - // a[0-1] x 64xp503p1_nz --> result: r8:r13 - mov rdx, [reg_p1] - mov rcx, [reg_p1+8] - mulx r9, r8, [rip+fmt(p503p1x64)] // result r8 - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - MUL128x256_SCHOOL rdx, rcx, [rip+fmt(p503p1x64)], r8, r9, r10, r11, r12, r13 - - xor r15, r15 - shrd r15, r8, 6 - shrd r8, r9, 6 - shrd r9, r10, 6 - shrd r10, r11, 6 - shrd r11, r12, 6 - shrd r12, r13, 6 - shr r13, 6 - mov rdx, [reg_p1+16] - mov r14, [reg_p1+80] - add r15, [reg_p1+24] - adc r8, [reg_p1+32] - adc r9, [reg_p1+40] - adc r10, [reg_p1+48] - adc r11, [reg_p1+56] - adc r12, [reg_p1+64] - adc r13, [reg_p1+72] - mulx rbx, rcx, [rip+fmt(p503p1x64)] // result rcx - adc r14, 0 - mov [reg_p2], r8 - mov [reg_p2+8], r9 - mov [reg_p2+16], r10 - mov [reg_p2+24], r11 - mov [reg_p2+32], r12 - mov [reg_p2+40], r13 - mov [reg_p2+48], r14 - mov r9, [reg_p1+88] - mov r10, [reg_p1+96] - mov r11, [reg_p1+104] - mov r12, [reg_p1+112] - mov rdi, [reg_p1+120] - adc r9, 0 - adc r10, 0 - adc r11, 0 - adc r12, 0 - adc rdi, 0 - - // a[2-3] x 64xp503p1_nz --> result: rcx, rbx, rbp, r14, r8, r13 - MUL128x256_SCHOOL rdx, r15, [rip+fmt(p503p1x64)], rcx, rbx, rbp, r14, r8, r13 - - xor r15, r15 - shrd r15, rcx, 6 - shrd rcx, rbx, 6 - shrd rbx, rbp, 6 - shrd rbp, r14, 6 - shrd r14, r8, 6 - shrd r8, r13, 6 - shr r13, 6 - mov rdx, [reg_p2] - add r15, [reg_p2+8] - adc rcx, [reg_p2+16] - adc rbx, [reg_p2+24] - adc rbp, [reg_p2+32] - adc r14, [reg_p2+40] - adc r8, [reg_p2+48] - mov [reg_p2+16], rcx - mov [reg_p2+24], rbx - mov [reg_p2+32], rbp - mov [reg_p2+40], r14 - mov [reg_p2+48], r8 - mulx rbp, rbx, [rip+fmt(p503p1x64)] // result rbx - adc r9, r13 - adc r10, 0 - adc r11, 0 - adc r12, 0 - adc rdi, 0 - - // a[4-5] x 64xp503p1_nz --> result: rbx, rbp, r14, r8, r13, rcx - 
MUL128x256_SCHOOL rdx, r15, [rip+fmt(p503p1x64)], rbx, rbp, r14, r8, r13, rcx - - xor r15, r15 - shrd r15, rbx, 6 - shrd rbx, rbp, 6 - shrd rbp, r14, 6 - shrd r14, r8, 6 - shrd r8, r13, 6 - shrd r13, rcx, 6 - shr rcx, 6 - mov rdx, [reg_p2+16] - add r15, [reg_p2+24] - adc rbx, [reg_p2+32] - adc rbp, [reg_p2+40] - adc r14, [reg_p2+48] - mov [reg_p2], rbx // Final result c0 - mov [reg_p2+8], rbp - mov [reg_p2+16], r14 - adc r9, r8 - adc r10, r13 - mulx r14, rbp, [rip+fmt(p503p1x64)] // result rbp - adc r11, rcx - adc r12, 0 - adc rdi, 0 - - // a[6-7] x 64xp503p1_nz --> result: rbp, r14, r8, r13, rcx, rbx - MUL128x256_SCHOOL rdx, r15, [rip+fmt(p503p1x64)], rbp, r14, r8, r13, rcx, rbx - - xor r15, r15 - shrd r15, rbp, 6 - shrd rbp, r14, 6 - shrd r14, r8, 6 - shrd r8, r13, 6 - shrd r13, rcx, 6 - shrd rcx, rbx, 6 - shr rbx, 6 - add r15, [reg_p2+8] - adc rbp, [reg_p2+16] - mov [reg_p2+8], r15 // Final result c1-c7 - mov [reg_p2+16], rbp - adc r9, r14 - adc r10, r8 - adc r11, r13 - adc r12, rcx - adc rdi, rbx - mov [reg_p2+24], r9 - mov [reg_p2+32], r10 - mov [reg_p2+40], r11 - mov [reg_p2+48], r12 - mov [reg_p2+56], rdi - - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - ret - - #else - -//*********************************************************************** -// Montgomery reduction -// Based on comba method -// Operation: c [reg_p2] = a [reg_p1] -// NOTE: a=c is not allowed -//*********************************************************************** -.global fmt(rdc503_asm) -fmt(rdc503_asm): - push r12 - push r13 - push r14 - push r15 - - mov r11, [reg_p1] - mov rax, [rip+fmt(p503p1)+24] - mul r11 - xor r8, r8 - add rax, [reg_p1+24] - mov [reg_p2+24], rax // z3 - adc r8, rdx - - xor r9, r9 - mov rax, [rip+fmt(p503p1)+32] - mul r11 - xor r10, r10 - add r8, rax - adc r9, rdx - - mov r12, [reg_p1+8] - mov rax, [rip+fmt(p503p1)+24] - mul r12 - add r8, rax - adc r9, rdx - adc r10, 0 - add r8, [reg_p1+32] - mov [reg_p2+32], r8 // z4 - adc r9, 0 - adc r10, 0 - - xor 
r8, r8 - mov rax, [rip+fmt(p503p1)+40] - mul r11 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [rip+fmt(p503p1)+32] - mul r12 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov r13, [reg_p1+16] - mov rax, [rip+fmt(p503p1)+24] - mul r13 - add r9, rax - adc r10, rdx - adc r8, 0 - add r9, [reg_p1+40] - mov [reg_p2+40], r9 // z5 - adc r10, 0 - adc r8, 0 - - xor r9, r9 - mov rax, [rip+fmt(p503p1)+48] - mul r11 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [rip+fmt(p503p1)+40] - mul r12 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [rip+fmt(p503p1)+32] - mul r13 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov r14, [reg_p2+24] - mov rax, [rip+fmt(p503p1)+24] - mul r14 - add r10, rax - adc r8, rdx - adc r9, 0 - add r10, [reg_p1+48] - mov [reg_p2+48], r10 // z6 - adc r8, 0 - adc r9, 0 - - xor r10, r10 - mov rax, [rip+fmt(p503p1)+56] - mul r11 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [rip+fmt(p503p1)+48] - mul r12 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [rip+fmt(p503p1)+40] - mul r13 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [rip+fmt(p503p1)+32] - mul r14 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov r15, [reg_p2+32] - mov rax, [rip+fmt(p503p1)+24] - mul r15 - add r8, rax - adc r9, rdx - adc r10, 0 - add r8, [reg_p1+56] - mov [reg_p2+56], r8 // z7 - adc r9, 0 - adc r10, 0 - - xor r8, r8 - mov rax, [rip+fmt(p503p1)+56] - mul r12 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [rip+fmt(p503p1)+48] - mul r13 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [rip+fmt(p503p1)+40] - mul r14 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [rip+fmt(p503p1)+32] - mul r15 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rcx, [reg_p2+40] - mov rax, [rip+fmt(p503p1)+24] - mul rcx - add r9, rax - adc r10, rdx - adc r8, 0 - add r9, [reg_p1+64] - mov [reg_p2], r9 // z0 - adc r10, 0 - adc r8, 0 - - xor r9, r9 - mov rax, [rip+fmt(p503p1)+56] - mul r13 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, 
[rip+fmt(p503p1)+48] - mul r14 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [rip+fmt(p503p1)+40] - mul r15 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [rip+fmt(p503p1)+32] - mul rcx - add r10, rax - adc r8, rdx - adc r9, 0 - - mov r13, [reg_p2+48] - mov rax, [rip+fmt(p503p1)+24] - mul r13 - add r10, rax - adc r8, rdx - adc r9, 0 - add r10, [reg_p1+72] - mov [reg_p2+8], r10 // z1 - adc r8, 0 - adc r9, 0 - - xor r10, r10 - mov rax, [rip+fmt(p503p1)+56] - mul r14 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [rip+fmt(p503p1)+48] - mul r15 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [rip+fmt(p503p1)+40] - mul rcx - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [rip+fmt(p503p1)+32] - mul r13 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov r14, [reg_p2+56] - mov rax, [rip+fmt(p503p1)+24] - mul r14 - add r8, rax - adc r9, rdx - adc r10, 0 - add r8, [reg_p1+80] - mov [reg_p2+16], r8 // z2 - adc r9, 0 - adc r10, 0 - - xor r8, r8 - mov rax, [rip+fmt(p503p1)+56] - mul r15 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [rip+fmt(p503p1)+48] - mul rcx - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [rip+fmt(p503p1)+40] - mul r13 - add r9, rax - adc r10, rdx - adc r8, 0 - - mov rax, [rip+fmt(p503p1)+32] - mul r14 - add r9, rax - adc r10, rdx - adc r8, 0 - add r9, [reg_p1+88] - mov [reg_p2+24], r9 // z3 - adc r10, 0 - adc r8, 0 - - xor r9, r9 - mov rax, [rip+fmt(p503p1)+56] - mul rcx - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [rip+fmt(p503p1)+48] - mul r13 - add r10, rax - adc r8, rdx - adc r9, 0 - - mov rax, [rip+fmt(p503p1)+40] - mul r14 - add r10, rax - adc r8, rdx - adc r9, 0 - add r10, [reg_p1+96] - mov [reg_p2+32], r10 // z4 - adc r8, 0 - adc r9, 0 - - xor r10, r10 - mov rax, [rip+fmt(p503p1)+56] - mul r13 - add r8, rax - adc r9, rdx - adc r10, 0 - - mov rax, [rip+fmt(p503p1)+48] - mul r14 - add r8, rax - adc r9, rdx - adc r10, 0 - add r8, [reg_p1+104] // z5 - mov [reg_p2+40], r8 // z5 - adc r9, 0 - adc r10, 
0 - - mov rax, [rip+fmt(p503p1)+56] - mul r14 - add r9, rax - adc r10, rdx - add r9, [reg_p1+112] // z6 - mov [reg_p2+48], r9 // z6 - adc r10, 0 - add r10, [reg_p1+120] // z7 - mov [reg_p2+56], r10 // z7 - - pop r15 - pop r14 - pop r13 - pop r12 - ret - - #endif - - -//*********************************************************************** -// 503-bit multiprecision addition -// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -//*********************************************************************** -.global fmt(mp_add503_asm) -fmt(mp_add503_asm): - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - add r8, [reg_p2] - adc r9, [reg_p2+8] - adc r10, [reg_p2+16] - adc r11, [reg_p2+24] - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - - mov r8, [reg_p1+32] - mov r9, [reg_p1+40] - mov r10, [reg_p1+48] - mov r11, [reg_p1+56] - adc r8, [reg_p2+32] - adc r9, [reg_p2+40] - adc r10, [reg_p2+48] - adc r11, [reg_p2+56] - mov [reg_p3+32], r8 - mov [reg_p3+40], r9 - mov [reg_p3+48], r10 - mov [reg_p3+56], r11 - ret - - -//*********************************************************************** -// 2x503-bit multiprecision subtraction/addition -// Operation: c [x2] = a [x0] - b [x1]. 
If c < 0, add p503*2^512 -//*********************************************************************** -.global fmt(mp_subadd503x2_asm) -fmt(mp_subadd503x2_asm): - push r12 - push r13 - push r14 - push r15 - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - - mov r8, [reg_p1+32] - mov r9, [reg_p1+40] - mov r10, [reg_p1+48] - mov r11, [reg_p1+56] - sbb r8, [reg_p2+32] - sbb r9, [reg_p2+40] - sbb r10, [reg_p2+48] - sbb r11, [reg_p2+56] - mov [reg_p3+32], r8 - mov [reg_p3+40], r9 - mov [reg_p3+48], r10 - mov [reg_p3+56], r11 - - mov r8, [reg_p1+64] - mov r9, [reg_p1+72] - mov r10, [reg_p1+80] - mov r11, [reg_p1+88] - sbb r8, [reg_p2+64] - sbb r9, [reg_p2+72] - sbb r10, [reg_p2+80] - sbb r11, [reg_p2+88] - mov [reg_p3+64], r8 - mov [reg_p3+72], r9 - mov [reg_p3+80], r10 - mov [reg_p3+88], r11 - - mov r12, [reg_p1+96] - mov r13, [reg_p1+104] - mov r14, [reg_p1+112] - mov r15, [reg_p1+120] - sbb r12, [reg_p2+96] - sbb r13, [reg_p2+104] - sbb r14, [reg_p2+112] - sbb r15, [reg_p2+120] - sbb rax, 0 - - // Add p503 anded with the mask in rax - mov r8, [rip+fmt(p503)] - mov r9, [rip+fmt(p503)+24] - mov r10, [rip+fmt(p503)+32] - mov r11, [rip+fmt(p503)+40] - mov rdi, [rip+fmt(p503)+48] - mov rsi, [rip+fmt(p503)+56] - and r8, rax - and r9, rax - and r10, rax - and r11, rax - and rdi, rax - and rsi, rax - mov rax, [reg_p3+64] - add rax, r8 - mov [reg_p3+64], rax - mov rax, [reg_p3+72] - adc rax, r8 - mov [reg_p3+72], rax - mov rax, [reg_p3+80] - adc rax, r8 - mov [reg_p3+80], rax - mov rax, [reg_p3+88] - adc rax, r9 - mov [reg_p3+88], rax - adc r12, r10 - adc r13, r11 - adc r14, rdi - adc r15, rsi - - mov [reg_p3+96], r12 - mov [reg_p3+104], r13 - mov [reg_p3+112], r14 - mov [reg_p3+120], r15 - pop r15 - pop r14 - pop r13 - pop r12 - ret - - 
-//*********************************************************************** -// Double 2x503-bit multiprecision subtraction -// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2] -//*********************************************************************** -.global fmt(mp_dblsub503x2_asm) -fmt(mp_dblsub503x2_asm): - push r12 - push r13 - - mov r8, [reg_p3] - mov r9, [reg_p3+8] - mov r10, [reg_p3+16] - mov r11, [reg_p3+24] - mov r12, [reg_p3+32] - mov r13, [reg_p3+40] - sub r8, [reg_p1] - sbb r9, [reg_p1+8] - sbb r10, [reg_p1+16] - sbb r11, [reg_p1+24] - sbb r12, [reg_p1+32] - sbb r13, [reg_p1+40] - setc al - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - setc cl - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - - mov r8, [reg_p3+48] - mov r9, [reg_p3+56] - mov r10, [reg_p3+64] - mov r11, [reg_p3+72] - mov r12, [reg_p3+80] - mov r13, [reg_p3+88] - bt rax, 0 - sbb r8, [reg_p1+48] - sbb r9, [reg_p1+56] - sbb r10, [reg_p1+64] - sbb r11, [reg_p1+72] - sbb r12, [reg_p1+80] - sbb r13, [reg_p1+88] - setc al - bt rcx, 0 - sbb r8, [reg_p2+48] - sbb r9, [reg_p2+56] - sbb r10, [reg_p2+64] - sbb r11, [reg_p2+72] - sbb r12, [reg_p2+80] - sbb r13, [reg_p2+88] - setc cl - mov [reg_p3+48], r8 - mov [reg_p3+56], r9 - mov [reg_p3+64], r10 - mov [reg_p3+72], r11 - mov [reg_p3+80], r12 - mov [reg_p3+88], r13 - - mov r8, [reg_p3+96] - mov r9, [reg_p3+104] - mov r10, [reg_p3+112] - mov r11, [reg_p3+120] - bt rax, 0 - sbb r8, [reg_p1+96] - sbb r9, [reg_p1+104] - sbb r10, [reg_p1+112] - sbb r11, [reg_p1+120] - bt rcx, 0 - sbb r8, [reg_p2+96] - sbb r9, [reg_p2+104] - sbb r10, [reg_p2+112] - sbb r11, [reg_p2+120] - mov [reg_p3+96], r8 - mov [reg_p3+104], r9 - mov [reg_p3+112], r10 - mov [reg_p3+120], r11 - - pop r13 - pop r12 - ret \ No newline at end of file +// Squaring in GF(p^2), complex part +// Operation: c 
[reg_p2] = 2a0 x a1 +// Inputs: a = [a1, a0] stored in [reg_p1] +// Output: c stored in [reg_p2] +//*********************************************************************** +.global fmt(fp2sqr503_c1_asm) +fmt(fp2sqr503_c1_asm): + push r12 + push r13 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + add r8, r8 + adc r9, r9 + push r14 + adc r10, r10 + adc r11, r11 + push r15 + adc r12, r12 + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + adc r13, r13 + push rbx + adc r14, r14 + push rbp + adc r15, r15 + sub rsp, 64 + mov [rsp+8], r9 + mov [rsp+16], r10 + + // [r8:r15, rbp] <- z = a00 x a1 + mov rdx, r8 + mulx r9, r8, [reg_p1+64] + mov [rsp+24], r11 + xor rax, rax + mulx r10, r11, [reg_p1+72] + mov [rsp+32], r12 + adox r9, r11 + mulx r11, r12, [reg_p1+80] + mov [rsp+40], r13 + adox r10, r12 + mulx r12, r13, [reg_p1+88] + mov [rsp+48], r14 + adox r11, r13 + mulx r13, r14, [reg_p1+96] + mov [rsp+56], r15 + adox r12, r14 + mulx r14, r15, [reg_p1+104] + adox r13, r15 + mulx r15, rbp, [reg_p1+112] + adox r14, rbp + mulx rbp, rbx, [reg_p1+120] + adox r15, rbx + adox rbp, rax + + FPMUL512x512 [rsp], [reg_p1+64], r8, r9, r10, r11, r12, r13, r14, r15, rbp, rbx, rcx + + mov [reg_p2], rbp + mov [reg_p2+8], r8 + mov [reg_p2+16], r9 + mov [reg_p2+24], r10 + mov [reg_p2+32], r11 + mov [reg_p2+40], r12 + mov [reg_p2+48], r13 + mov [reg_p2+56], r14 + add rsp, 64 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Field multiplication in GF(p) +// Operation: c = a x b mod p +// Inputs: a stored in [reg_p1], b stored in [reg_p2] +// Output: c stored in [reg_p3] +//*********************************************************************** +.global fmt(fpmul503_asm) +fmt(fpmul503_asm): + mov rcx, reg_p3 + + // [r8:r15] <- z = a x b0 + mov rdx, [reg_p2] + mulx r9, r8, [reg_p1] + push r12 + xor rax, rax + mulx r10, 
r11, [reg_p1+8] + push r13 + adox r9, r11 + mulx r11, r12, [reg_p1+16] + push r14 + adox r10, r12 + mulx r12, r13, [reg_p1+24] + push r15 + adox r11, r13 + mulx r13, r14, [reg_p1+32] + push rbx + adox r12, r14 + mulx r14, r15, [reg_p1+40] + push rbp + adox r13, r15 + mulx r15, rbx, [reg_p1+48] + adox r14, rbx + mulx rbx, rbp, [reg_p1+56] + adox r15, rbp + adox rax, rbx + + FPMUL512x512 [reg_p2], [reg_p1], r8, r9, r10, r11, r12, r13, r14, r15, rax, rbx, rbp + + mov [rcx], rax + mov [rcx+8], r8 + mov [rcx+16], r9 + mov [rcx+24], r10 + mov [rcx+32], r11 + mov [rcx+40], r12 + mov [rcx+48], r13 + mov [rcx+56], r14 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret \ No newline at end of file diff --git a/src/P503/ARM64/fp_arm64.c b/src/P503/ARM64/fp_arm64.c index df4e8dc..cc36497 100644 --- a/src/P503/ARM64/fp_arm64.c +++ b/src/P503/ARM64/fp_arm64.c @@ -1,10 +1,15 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P503 *********************************************************************************************/ #include "../P503_internal.h" +#include "../../internal.h" // Global constants extern const uint64_t p503[NWORDS_FIELD]; @@ -13,21 +18,21 @@ extern const uint64_t p503x2[NWORDS_FIELD]; extern const uint64_t p503x4[NWORDS_FIELD]; -__inline void mp_sub503_p2(const digit_t* a, const digit_t* b, digit_t* c) +inline void mp_sub503_p2(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. 
mp_sub503_p2_asm(a, b, c); } -__inline void mp_sub503_p4(const digit_t* a, const digit_t* b, digit_t* c) +inline void mp_sub503_p4(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. mp_sub503_p4_asm(a, b, c); } -__inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c) +inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c) { // Modular addition, c = a+b mod p503. // Inputs: a, b in [0, 2*p503-1] // Output: c in [0, 2*p503-1] @@ -36,7 +41,7 @@ __inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c) } -__inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c) +inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c) { // Modular subtraction, c = a-b mod p503. // Inputs: a, b in [0, 2*p503-1] // Output: c in [0, 2*p503-1] @@ -45,7 +50,7 @@ __inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c) } -__inline void fpneg503(digit_t* a) +inline void fpneg503(digit_t* a) { // Modular negation, a = -a mod p503. 
// Input/output: a in [0, 2*p503-1] unsigned int i, borrow = 0; diff --git a/src/P503/ARM64/fp_arm64_asm.S b/src/P503/ARM64/fp_arm64_asm.S index 914d789..220b65a 100644 --- a/src/P503/ARM64/fp_arm64_asm.S +++ b/src/P503/ARM64/fp_arm64_asm.S @@ -1,5 +1,9 @@ //******************************************************************************************* // SIDH: an efficient supersingular isogeny cryptography library +// Copyright (c) Microsoft Corporation +// +// Website: https://github.com/microsoft/PQCrypto-SIDH +// Released under MIT license // // Abstract: field arithmetic in 64-bit ARMv8 assembly for P503 on Linux //******************************************************************************************* diff --git a/src/P503/P503.c b/src/P503/P503.c index caefc1e..9e576c7 100644 --- a/src/P503/P503.c +++ b/src/P503/P503.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: supersingular isogeny parameters and generation of functions for P503 *********************************************************************************************/ @@ -27,13 +31,10 @@ const uint64_t p503x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF 0x2610B7B44423CF41, 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0, 0x0080CDEA83023C3C }; const uint64_t p503x4[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xAFFFFFFFFFFFFFFF, 0x4C216F6888479E82, 0x6E6FDB21EDF9F6BC, 0x81171AF769DE9340, 0x01019BD506047879 }; +const uint64_t p503x8[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x5FFFFFFFFFFFFFFF, + 0x9842DED1108F3D05, 0xDCDFB643DBF3ED78, 0x022E35EED3BD2680, 0x020337AA0C08F0F3 }; const uint64_t p503p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 
0x0000000000000000, 0xAC00000000000000, 0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E }; -const uint64_t p503p1x64[NWORDS64_FIELD/2] = { 0xC216F6888479E82B, 0xE6FDB21EDF9F6BC4, 0x1171AF769DE93406, 0x1019BD5060478798 }; -const uint64_t p503x16p[2*NWORDS64_FIELD] = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, - 0x9EF484BBBDC30BEA, 0x8C8126F090304A1D, 0xF7472844B10B65FC, 0x30F32157CFDC3C33, - 0x1463AB4329A333F7, 0xDFC933977C47D3A4, 0x338A3767F6F2520B, 0x4F8CB7565CCC13FA, - 0xDE43B73AACD2189B, 0xBCF845CAC5405FBD, 0x516D02A09E684B7A, 0x0001033A4091BB86 }; // Order of Alice's subgroup const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0400000000000000 }; // Order of Bob's subgroup @@ -96,6 +97,7 @@ const unsigned int strat_Bob[MAX_Bob-1] = { #define fpneg fpneg503 #define fpdiv2 fpdiv2_503 #define fpcorrection fpcorrection503 +#define fpmul fpmul503 #define fpmul_mont fpmul503_mont #define fpsqr_mont fpsqr503_mont #define fpinv_mont fpinv503_mont @@ -113,6 +115,10 @@ const unsigned int strat_Bob[MAX_Bob-1] = { #define fp2correction fp2correction503 #define fp2mul_mont fp2mul503_mont #define fp2sqr_mont fp2sqr503_mont +#define fp2mul_c0_mont fp2mul503_c0_mont +#define fp2mul_c1_mont fp2mul503_c1_mont +#define fp2sqr_c0_mont fp2sqr503_c0_mont +#define fp2sqr_c1_mont fp2sqr503_c1_mont #define fp2inv_mont fp2inv503_mont #define fp2inv_mont_bingcd fp2inv503_mont_bingcd #define fpequal_non_constant_time fpequal503_non_constant_time diff --git a/src/P503/P503_api.h b/src/P503/P503_api.h index 1c17447..1a209d3 100644 --- a/src/P503/P503_api.h +++ b/src/P503/P503_api.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under 
MIT license * * Abstract: API header file for P503 *********************************************************************************************/ diff --git a/src/P503/P503_compressed.c b/src/P503/P503_compressed.c index d3611e2..a68f98f 100644 --- a/src/P503/P503_compressed.c +++ b/src/P503/P503_compressed.c @@ -1,5 +1,9 @@ /******************************************************************************************** * Supersingular Isogeny Key Encapsulation Library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: supersingular isogeny parameters and generation of functions for P503_compressed *********************************************************************************************/ @@ -28,13 +32,10 @@ const uint64_t p503x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF 0x2610B7B44423CF41, 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0, 0x0080CDEA83023C3C }; const uint64_t p503x4[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xAFFFFFFFFFFFFFFF, 0x4C216F6888479E82, 0x6E6FDB21EDF9F6BC, 0x81171AF769DE9340, 0x01019BD506047879 }; +const uint64_t p503x8[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x5FFFFFFFFFFFFFFF, + 0x9842DED1108F3D05, 0xDCDFB643DBF3ED78, 0x022E35EED3BD2680, 0x020337AA0C08F0F3 }; const uint64_t p503p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xAC00000000000000, 0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E }; -const uint64_t p503p1x64[NWORDS64_FIELD/2] = { 0xC216F6888479E82B, 0xE6FDB21EDF9F6BC4, 0x1171AF769DE93406, 0x1019BD5060478798 }; -const uint64_t p503x16p[2*NWORDS64_FIELD] = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, - 0x9EF484BBBDC30BEA, 0x8C8126F090304A1D, 0xF7472844B10B65FC, 0x30F32157CFDC3C33, - 0x1463AB4329A333F7, 0xDFC933977C47D3A4, 0x338A3767F6F2520B, 0x4F8CB7565CCC13FA, - 
0xDE43B73AACD2189B, 0xBCF845CAC5405FBD, 0x516D02A09E684B7A, 0x0001033A4091BB86 }; // Order of Alice's subgroup const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0400000000000000 }; // Order of Bob's subgroup @@ -353,6 +354,7 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] = #define fpneg fpneg503 #define fpdiv2 fpdiv2_503 #define fpcorrection fpcorrection503 +#define fpmul fpmul503 #define fpmul_mont fpmul503_mont #define fpsqr_mont fpsqr503_mont #define fpinv_mont fpinv503_mont @@ -370,6 +372,10 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] = #define fp2correction fp2correction503 #define fp2mul_mont fp2mul503_mont #define fp2sqr_mont fp2sqr503_mont +#define fp2mul_c0_mont fp2mul503_c0_mont +#define fp2mul_c1_mont fp2mul503_c1_mont +#define fp2sqr_c0_mont fp2sqr503_c0_mont +#define fp2sqr_c1_mont fp2sqr503_c1_mont #define fp2inv_mont fp2inv503_mont #define fp2inv_mont_bingcd fp2inv503_mont_bingcd #define fpequal_non_constant_time fpequal503_non_constant_time diff --git a/src/P503/P503_compressed_api.h b/src/P503/P503_compressed_api.h index 668f0f0..bc02f56 100644 --- a/src/P503/P503_compressed_api.h +++ b/src/P503/P503_compressed_api.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: API header file for P503 using compression *********************************************************************************************/ diff --git a/src/P503/P503_compressed_dlog_tables.c b/src/P503/P503_compressed_dlog_tables.c index 4ebb910..db9724a 100644 --- a/src/P503/P503_compressed_dlog_tables.c +++ b/src/P503/P503_compressed_dlog_tables.c @@ -1,5 +1,9 @@ 
/******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: precomputed tables for Pohlig-Hellman when using compression *********************************************************************************************/ diff --git a/src/P503/P503_compressed_pair_tables.c b/src/P503/P503_compressed_pair_tables.c index f7ee631..ff54f97 100644 --- a/src/P503/P503_compressed_pair_tables.c +++ b/src/P503/P503_compressed_pair_tables.c @@ -1,5 +1,9 @@ /************************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: precomputed tables for pairing computation on E0: y^2 = x^3 + x when using compression ***************************************************************************************************/ diff --git a/src/P503/P503_internal.h b/src/P503/P503_internal.h index 53afc48..120f1e5 100644 --- a/src/P503/P503_internal.h +++ b/src/P503/P503_internal.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: internal header file for P503 *********************************************************************************************/ @@ -168,6 +172,8 @@ void rdc503_asm(digit_t* ma, digit_t* mc); // Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768 void fpmul503_mont(const digit_t* a, const digit_t* b, digit_t* c); 
+void fpmul503(const digit_t* a, const digit_t* b, digit_t* c); +void fpmul503_asm(const digit_t* a, const digit_t* b, digit_t* c); void mul503_asm(const digit_t* a, const digit_t* b, digit_t* c); // Field squaring using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768 @@ -207,9 +213,17 @@ void fp2correction503(f2elm_t a); // GF(p503^2) squaring using Montgomery arithmetic, c = a^2 in GF(p503^2) void fp2sqr503_mont(const f2elm_t a, f2elm_t c); +void fp2sqr503_c0_mont(const digit_t* a, digit_t* c); +void fp2sqr503_c0_asm(const digit_t* a, digit_t* c); +void fp2sqr503_c1_mont(const digit_t* a, digit_t* c); +void fp2sqr503_c1_asm(const digit_t* a, digit_t* c); // GF(p503^2) multiplication using Montgomery arithmetic, c = a*b in GF(p503^2) void fp2mul503_mont(const f2elm_t a, const f2elm_t b, f2elm_t c); +void fp2mul503_c0_mont(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul503_c0_asm(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul503_c1_mont(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul503_c1_asm(const digit_t* a, const digit_t* b, digit_t* c); // GF(p503^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) void fp2inv503_mont(f2elm_t a); diff --git a/src/P503/generic/fp_generic.c b/src/P503/generic/fp_generic.c index 87d8b09..1fbdff3 100755 --- a/src/P503/generic/fp_generic.c +++ b/src/P503/generic/fp_generic.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: portable modular arithmetic for P503 *********************************************************************************************/ diff --git a/src/P610/AMD64/fp_x64.c b/src/P610/AMD64/fp_x64.c index e77022e..db8cb5b 100644 --- a/src/P610/AMD64/fp_x64.c +++ 
b/src/P610/AMD64/fp_x64.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: modular arithmetic optimized for x64 platforms for P610 *********************************************************************************************/ @@ -17,7 +21,7 @@ extern const uint64_t p610x4[NWORDS_FIELD]; inline void mp_sub610_p2(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. -#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610) +#if (OS_TARGET == OS_WIN) unsigned int i, borrow = 0; for (i = 0; i < NWORDS_FIELD; i++) { @@ -39,7 +43,7 @@ inline void mp_sub610_p2(const digit_t* a, const digit_t* b, digit_t* c) inline void mp_sub610_p4(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. 
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610) +#if (OS_TARGET == OS_WIN) unsigned int i, borrow = 0; for (i = 0; i < NWORDS_FIELD; i++) { @@ -50,11 +54,6 @@ inline void mp_sub610_p4(const digit_t* a, const digit_t* b, digit_t* c) for (i = 0; i < NWORDS_FIELD; i++) { ADDC(borrow, c[i], ((digit_t*)p610x4)[i], borrow, c[i]); } - -#elif (OS_TARGET == OS_NIX) - - mp_sub610_p4_asm(a, b, c); - #endif } @@ -161,13 +160,43 @@ void fpcorrection610(digit_t* a) } } +#if (OS_TARGET == OS_NIX) + +void fp2mul610_c0_mont(const digit_t* a, const digit_t* b, digit_t* c) +{ + fp2mul610_c0_asm(a, b, c); +} + + +void fp2mul610_c1_mont(const digit_t* a, const digit_t* b, digit_t* c) +{ + fp2mul610_c1_asm(a, b, c); +} + + +void fp2sqr610_c0_mont(const digit_t* a, digit_t* c) +{ + fp2sqr610_c0_asm(a, c); +} + + +void fp2sqr610_c1_mont(const digit_t* a, digit_t* c) +{ + fp2sqr610_c1_asm(a, c); +} + + +void fpmul610(const digit_t* a, const digit_t* b, digit_t* c) +{ + fpmul610_asm(a, b, c); +} + +#else void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) { // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords. UNREFERENCED_PARAMETER(nwords); - -#if (OS_TARGET == OS_WIN) digit_t t = 0; uint128_t uv = {0}; unsigned int carry = 0; @@ -462,12 +491,6 @@ void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int n MULADD128(a[9], b[9], uv, carry, uv); c[18] = uv[0]; c[19] = uv[1]; - -#elif (OS_TARGET == OS_NIX) - - mul610_asm(a, b, c); - -#endif } @@ -476,8 +499,6 @@ void rdc_mont(digit_t* ma, digit_t* mc) // mc = ma*R^-1 mod p610x2, where R = 2^640. // If ma < 2^640*p610, the output mc is in the range [0, 2*p610-1]. // ma is assumed to be in Montgomery representation. 
- -#if (OS_TARGET == OS_WIN) unsigned int carry; digit_t t = 0; uint128_t uv = {0}; @@ -717,10 +738,6 @@ void rdc_mont(digit_t* ma, digit_t* mc) ADDC(0, uv[0], ma[18], carry, mc[8]); ADDC(carry, uv[1], 0, carry, uv[1]); ADDC(0, uv[1], ma[19], carry, mc[9]); - -#elif (OS_TARGET == OS_NIX) - - rdc610_asm(ma, mc); +} -#endif -} \ No newline at end of file +#endif \ No newline at end of file diff --git a/src/P610/AMD64/fp_x64_asm.S b/src/P610/AMD64/fp_x64_asm.S index 8860cf6..0997164 100644 --- a/src/P610/AMD64/fp_x64_asm.S +++ b/src/P610/AMD64/fp_x64_asm.S @@ -1,1310 +1,1098 @@ //******************************************************************************************* // SIDH: an efficient supersingular isogeny cryptography library +// Copyright (c) Microsoft Corporation // -// Abstract: field arithmetic in x64 assembly for P610 on Linux -//******************************************************************************************* - -.intel_syntax noprefix - -// Format function and variable names for Mac OS X -#if defined(__APPLE__) - #define fmt(f) _##f -#else - #define fmt(f) f -#endif - -// Registers that are used for parameter passing: -#define reg_p1 rdi -#define reg_p2 rsi -#define reg_p3 rdx - - -.text -//*********************************************************************** -// Field addition -// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -//*********************************************************************** -.global fmt(fpadd610_asm) -fmt(fpadd610_asm): - push r12 - push r13 - push r14 - push r15 - - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - mov r15, [reg_p1+56] - mov rcx, [reg_p1+64] - mov rdi, [reg_p1+72] - add r8, [reg_p2] - adc r9, [reg_p2+8] - adc r10, [reg_p2+16] - adc r11, [reg_p2+24] - adc r12, [reg_p2+32] - adc r13, [reg_p2+40] - adc r14, [reg_p2+48] - adc r15, [reg_p2+56] - adc rcx, [reg_p2+64] - adc rdi, [reg_p2+72] - - mov 
rax, [rip+fmt(p610x2)] - sub r8, rax - mov rax, [rip+fmt(p610x2)+8] - sbb r9, rax - sbb r10, rax - sbb r11, rax - mov rax, [rip+fmt(p610x2)+32] - sbb r12, rax - mov rax, [rip+fmt(p610x2)+40] - sbb r13, rax - mov rax, [rip+fmt(p610x2)+48] - sbb r14, rax - mov rax, [rip+fmt(p610x2)+56] - sbb r15, rax - mov rax, [rip+fmt(p610x2)+64] - sbb rcx, rax - mov rax, [rip+fmt(p610x2)+72] - sbb rdi, rax - mov [reg_p3+64], rcx - mov [reg_p3+72], rdi - mov rax, 0 - sbb rax, 0 - - mov rsi, [rip+fmt(p610x2)] - and rsi, rax - mov rdi, [rip+fmt(p610x2)+8] - and rdi, rax - - add r8, rsi - adc r9, rdi - adc r10, rdi - adc r11, rdi - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - setc cl - - mov rdi, [rip+fmt(p610x2)+32] - and rdi, rax - mov rsi, [rip+fmt(p610x2)+40] - and rsi, rax - mov r8, [rip+fmt(p610x2)+48] - and r8, rax - mov r9, [rip+fmt(p610x2)+56] - and r9, rax - mov r10, [rip+fmt(p610x2)+64] - and r10, rax - mov r11, [rip+fmt(p610x2)+72] - and r11, rax - - bt rcx, 0 - adc r12, rdi - adc r13, rsi - adc r14, r8 - adc r15, r9 - mov rsi, [reg_p3+64] - mov rdi, [reg_p3+72] - adc rsi, r10 - adc rdi, r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - mov [reg_p3+56], r15 - mov [reg_p3+64], rsi - mov [reg_p3+72], rdi - - pop r15 - pop r14 - pop r13 - pop r12 - ret +// Website: https://github.com/microsoft/PQCrypto-SIDH +// Released under MIT license +// +// Abstract: field arithmetic in x64 assembly for P610 on Linux +//******************************************************************************************* + +.intel_syntax noprefix + +// Format function and variable names for Mac OS X +#if defined(__APPLE__) + #define fmt(f) _##f +#else + #define fmt(f) f +#endif + +// Registers that are used for parameter passing: +#define reg_p1 rdi +#define reg_p2 rsi +#define reg_p3 rdx + + +.text +//*********************************************************************** +// 610-bit multiprecision addition +// Operation: c [reg_p3] 
= a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global fmt(mp_add610_asm) +fmt(mp_add610_asm): + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov rax, [reg_p1+32] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc rax, [reg_p2+32] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], rax + + mov r8, [reg_p1+40] + mov r9, [reg_p1+48] + mov r10, [reg_p1+56] + mov r11, [reg_p1+64] + mov rax, [reg_p1+72] + adc r8, [reg_p2+40] + adc r9, [reg_p2+48] + adc r10, [reg_p2+56] + adc r11, [reg_p2+64] + adc rax, [reg_p2+72] + mov [reg_p3+40], r8 + mov [reg_p3+48], r9 + mov [reg_p3+56], r10 + mov [reg_p3+64], r11 + mov [reg_p3+72], rax + ret + + +//*********************************************************************** +// Field addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global fmt(fpadd610_asm) +fmt(fpadd610_asm): + push r12 + push r13 + push r14 + push r15 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + mov rcx, [reg_p1+64] + mov rdi, [reg_p1+72] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + adc r15, [reg_p2+56] + adc rcx, [reg_p2+64] + adc rdi, [reg_p2+72] + + mov rax, [rip+fmt(p610x2)] + sub r8, rax + mov rax, [rip+fmt(p610x2)+8] + sbb r9, rax + sbb r10, rax + sbb r11, rax + mov rax, [rip+fmt(p610x2)+32] + sbb r12, rax + mov rax, [rip+fmt(p610x2)+40] + sbb r13, rax + mov rax, [rip+fmt(p610x2)+48] + sbb r14, rax + mov rax, [rip+fmt(p610x2)+56] + sbb r15, rax + mov rax, [rip+fmt(p610x2)+64] + sbb rcx, rax + mov rax, [rip+fmt(p610x2)+72] + sbb rdi, rax + mov 
[reg_p3+64], rcx + mov [reg_p3+72], rdi + mov rax, 0 + sbb rax, 0 + + mov rsi, [rip+fmt(p610x2)] + and rsi, rax + mov rdi, [rip+fmt(p610x2)+8] + and rdi, rax + + add r8, rsi + adc r9, rdi + adc r10, rdi + adc r11, rdi + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + setc cl + + mov rdi, [rip+fmt(p610x2)+32] + and rdi, rax + mov rsi, [rip+fmt(p610x2)+40] + and rsi, rax + mov r8, [rip+fmt(p610x2)+48] + and r8, rax + mov r9, [rip+fmt(p610x2)+56] + and r9, rax + mov r10, [rip+fmt(p610x2)+64] + and r10, rax + mov r11, [rip+fmt(p610x2)+72] + and r11, rax + + bt rcx, 0 + adc r12, rdi + adc r13, rsi + adc r14, r8 + adc r15, r9 + mov rsi, [reg_p3+64] + mov rdi, [reg_p3+72] + adc rsi, r10 + adc rdi, r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + mov [reg_p3+64], rsi + mov [reg_p3+72], rdi + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Field subtraction +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] +//*********************************************************************** +.global fmt(fpsub610_asm) +fmt(fpsub610_asm): + push r12 + push r13 + push r14 + push r15 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + mov rcx, [reg_p1+64] + mov rdi, [reg_p1+72] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb r15, [reg_p2+56] + sbb rcx, [reg_p2+64] + sbb rdi, [reg_p2+72] + mov [reg_p3+64], rcx + mov [reg_p3+72], rdi + mov rax, 0 + sbb rax, 0 + + mov rsi, [rip+fmt(p610x2)] + and rsi, rax + mov rdi, [rip+fmt(p610x2)+8] + and rdi, rax + + add r8, rsi + adc r9, rdi + adc r10, rdi + adc r11, rdi + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov 
[reg_p3+24], r11 + setc cl + + mov rdi, [rip+fmt(p610x2)+32] + and rdi, rax + mov rsi, [rip+fmt(p610x2)+40] + and rsi, rax + mov r8, [rip+fmt(p610x2)+48] + and r8, rax + mov r9, [rip+fmt(p610x2)+56] + and r9, rax + mov r10, [rip+fmt(p610x2)+64] + and r10, rax + mov r11, [rip+fmt(p610x2)+72] + and r11, rax + + bt rcx, 0 + adc r12, rdi + adc r13, rsi + adc r14, r8 + adc r15, r9 + mov rsi, [reg_p3+64] + mov rdi, [reg_p3+72] + adc rsi, r10 + adc rdi, r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + mov [reg_p3+64], rsi + mov [reg_p3+72], rdi + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +///////////////////////////////////////////////////////////////// MACRO +.macro SUB610_PX P0 + push r12 + push r13 + push r14 + push r15 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + mov rax, [reg_p1+64] + mov rcx, [reg_p1+72] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb r15, [reg_p2+56] + sbb rax, [reg_p2+64] + sbb rcx, [reg_p2+72] + + mov rdi, [rip+\P0] + mov rsi, [rip+\P0+8] + add r8, rdi + adc r9, rsi + adc r10, rsi + adc r11, rsi + mov rdi, [rip+\P0+32] + mov rsi, [rip+\P0+40] + adc r12, rdi + adc r13, rsi + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov rdi, [rip+\P0+48] + mov rsi, [rip+\P0+56] + adc r14, rdi + adc r15, rsi + mov rdi, [rip+\P0+64] + mov rsi, [rip+\P0+72] + adc rax, rdi + adc rcx, rsi + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + mov [reg_p3+64], rax + mov [reg_p3+72], rcx + + pop r15 + pop r14 + pop r13 + pop r12 + .endm + + +//*********************************************************************** +// Multiprecision subtraction with correction with 2*p610 +// Operation: c 
[reg_p3] = a [reg_p1] - b [reg_p2] + 2*p610 +//*********************************************************************** +.global fmt(mp_sub610_p2_asm) +fmt(mp_sub610_p2_asm): + + SUB610_PX fmt(p610x2) + ret + + +#ifdef _MULX_ +#ifdef _ADX_ + +///////////////////////////////////////////////////////////////// MACRO +// z = a x bi + z +// Inputs: base memory pointer M1 (a), +// bi pre-stored in rdx, +// accumulator z in [M0:M2, Z3:Z10] +// Output: [M0:M2, Z3:Z10] +// Temps: regs T0:T1 +///////////////////////////////////////////////////////////////// +.macro MULADD64x640 M1, M, Z3, Z4, Z5, Z6, Z7, Z8, Z9, Z10, T0, T1, T2, C + mulx \T0, \T1, \M1 // A0*B0 + xor \C, \C + adox \T1, \M + adox \T0, 8\M + mov \M, \T1 + mulx \T1, \T2, 8\M1 // A0*B1 + adcx \T0, \T2 + adox \T1, 16\M + mov 8\M, \T0 + mulx \T0, \T2, 16\M1 // A0*B2 + adcx \T1, \T2 + adox \Z3, \T0 + mov 16\M, \T1 + mulx \T0, \T1, 24\M1 // A0*B3 + adcx \Z3, \T1 + adox \Z4, \T0 + mulx \T0, \T1, 32\M1 // A0*B4 + adcx \Z4, \T1 + adox \Z5, \T0 + mulx \T0, \T1, 40\M1 // A0*B5 + adcx \Z5, \T1 + adox \Z6, \T0 + mulx \T0, \T1, 48\M1 // A0*B6 + adcx \Z6, \T1 + adox \Z7, \T0 + mulx \T0, \T1, 56\M1 // A0*B7 + adcx \Z7, \T1 + adox \Z8, \T0 + mulx \T0, \T1, 64\M1 // A0*B8 + adcx \Z8, \T1 + adox \Z9, \T0 + mulx \T0, \T1, 72\M1 // A0*B9 + adcx \Z9, \T1 + adox \Z10, \T0 + adc \Z10, 0 +.endm + + +.macro MULADD64x640b M1, M, MM, Z3, Z4, Z5, Z6, Z7, Z8, Z9, Z10, T0, T1, T2, C + mulx \T0, \T1, \M1 // A0*B0 + xor \C, \C + adox \T1, \M + adox \T0, 8\M + mov 24\M, \T1 + mulx \T1, \T2, 8\M1 // A0*B1 + adcx \T0, \T2 + adox \T1, 16\M + mov \MM, \T0 + mulx \T0, \T2, 16\M1 // A0*B2 + adcx \T1, \T2 + adox \Z3, \T0 + mov 8\MM, \T1 + mulx \T0, \T1, 24\M1 // A0*B3 + adcx \Z3, \T1 + adox \Z4, \T0 + mulx \T0, \T1, 32\M1 // A0*B4 + mov 16\MM, \Z3 + adcx \Z4, \T1 + adox \Z5, \T0 + mulx \T0, \T1, 40\M1 // A0*B5 + adcx \Z5, \T1 + adox \Z6, \T0 + mulx \T0, \T1, 48\M1 // A0*B6 + adcx \Z6, \T1 + adox \Z7, \T0 + mulx \T0, \T1, 56\M1 // A0*B7 + adcx \Z7, \T1 
+ adox \Z8, \T0 + mulx \T0, \T1, 64\M1 // A0*B8 + adcx \Z8, \T1 + adox \Z9, \T0 + mulx \T0, \T1, 72\M1 // A0*B9 + adcx \Z9, \T1 + adox \Z10, \T0 + adc \Z10, 0 +.endm + + +.macro MULADD64x384 M1, Z0, Z1, Z2, Z3, Z4, Z5, Z6, T0, T1 + mulx \T0, \T1, \M1 // A0*B0 + xor rax, rax + adox \Z0, \T1 + adox \Z1, \T0 + mulx \T0, \T1, 8\M1 // A0*B1 + adcx \Z1, \T1 + adox \Z2, \T0 + mulx \T0, \T1, 16\M1 // A0*B2 + adcx \Z2, \T1 + adox \Z3, \T0 + mulx \T0, \T1, 24\M1 // A0*B3 + adcx \Z3, \T1 + adox \Z4, \T0 + mulx \T0, \T1, 32\M1 // A0*B4 + adcx \Z4, \T1 + adox \Z5, \T0 + mulx \T0, \T1, 40\M1 // A0*B5 + adcx \Z5, \T1 + adox \Z6, \T0 + adc \Z6, 0 +.endm + + +///////////////////////////////////////////////////////////////// MACRO +// z = a x b + c x d (mod p) +// Inputs: base memory pointers M0 (a,c), M1 (b,d) +// bi pre-stored in rdx, +// accumulator z in [Z0:Z7], pre-stores a0 x b +// Output: [Z0:Z7] +// Temps: MM, regs T0:T2 +///////////////////////////////////////////////////////////////// +.macro FPDBLMUL640x640 M00, M01, M10, M11, MM, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1, T2 + mov rdx, \M11 + MULADD64x640b \M01, \MM, \MM, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1, \T2, \T2 + // [MM0:MM16, Z1:Z7] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1 + + // [MM0:MM16, Z1:Z5, Z0] <- z = a0 x b01 - a1 x b11 + z + mov rdx, 8\M10 + MULADD64x640 \M00, \MM, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \T2, \Z0 + mov rdx, 8\M11 + MULADD64x640b \M01, \MM, \MM, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \T2, \T2 + // [MM0:MM16, Z2:Z7, Z0] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1 + + // [MM0:MM16, Z2:Z7, Z0:Z1] <- z = a0 x b02 + a1 x b12 + z + mov rdx, 16\M10 + MULADD64x640 \M00, \MM, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, \T2, \Z1 + mov rdx, 16\M11 + MULADD64x640b \M01, 
\MM, \MM, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, \T2, \T2 + // [MM0:MM16, Z3:Z7, Z0:Z1] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1 + + // [MM0:MM16, Z3:Z7, Z0:Z2] <- z = a0 x b03 + a1 x b13 + z + mov rdx, 24\M10 + MULADD64x640 \M00, \MM, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, \T2, \Z2 + mov rdx, 24\M11 + MULADD64x640b \M01, \MM, \MM, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, \T2, \T2 + // [MM0:MM16, Z4:Z7, Z0:Z2] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1 + + // [MM0:MM16, Z4:Z7, Z0:Z3] <- z = a0 x b04 + a1 x b14 + z + mov rdx, 32\M10 + MULADD64x640 \M00, \MM, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \T2, \Z3 + mov rdx, 32\M11 + MULADD64x640b \M01, \MM, \MM, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \T2, \T2 + // [MM0:MM16, Z5:Z7, Z0:Z3] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1 + + // [MM0:MM16, Z5:Z7, Z0:Z4] <- z = a0 x b05 + a1 x b15 + z + mov rdx, 40\M10 + MULADD64x640 \M00, \MM, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \T2, \Z4 + mov rdx, 40\M11 + MULADD64x640b \M01, \MM, \MM, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \T2, \T2 + // [MM0:MM16, Z6:Z7, Z0:Z4] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1 + + // [MM0:MM16, Z6:Z7, Z0:Z5] <- z = a0 x b06 + a1 x b16 + z + mov rdx, 48\M10 + MULADD64x640 \M00, \MM, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \T2, \Z5 + mov rdx, 48\M11 + MULADD64x640b \M01, \MM, \MM, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \T2, \T2 + // [MM0:MM16, Z7, Z0:Z5] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z7, \Z0, \Z1, \Z2, 
\Z3, \Z4, \Z5, \T0, \T1 + + // [MM0:MM16, Z7, Z0:Z6] <- z = a0 x b07 + a1 x b17 + z + mov rdx, 56\M10 + MULADD64x640 \M00, \MM, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \T2, \Z6 + mov rdx, 56\M11 + MULADD64x640b \M01, \MM, \MM, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \T2, \T2 + // [MM0:MM16, Z0:Z6] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1 + + // [MM0:MM16, Z0:Z7] <- z = a0 x b08 + a1 x b18 + z + mov rdx, 64\M10 + MULADD64x640 \M00, \MM, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1, \T2, \Z7 + mov rdx, 64\M11 + MULADD64x640b \M01, \MM, \MM, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1, \T2, \T2 + // [MM0:MM16, Z1:Z7] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1 + + // [MM0:MM16, Z1:Z7, Z0] <- z = a0 x b09 + a1 x b19 + z + mov rdx, 72\M10 + MULADD64x640 \M00, \MM, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \T2, \Z0 + mov rdx, 72\M11 + MULADD64x640b \M01, \MM, [rcx], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \T2, \T2 + // [MM0:MM16, Z2:Z7, Z0] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1 +.endm + + +//*********************************************************************** +// Multiplication in GF(p^2), non-complex part +// Operation: c [reg_p3] = a0 x b0 - a1 x b1 +// Inputs: a = [a1, a0] stored in [reg_p1] +// b = [b1, b0] stored in [reg_p2] +// Output: c stored in [reg_p3] +//*********************************************************************** +.global fmt(fp2mul610_c0_asm) +fmt(fp2mul610_c0_asm): + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + sub rsp, 80 + mov rcx, reg_p3 + + // [rsp0:rsp72] <- 8*p - b1 + mov r8, [rip+fmt(p610x8)] + mov r9, [rip+fmt(p610x8)+8] + mov r12, [rip+fmt(p610x8)+32] + mov r13, 
[rip+fmt(p610x8)+40] + mov rax, [reg_p2+80] + mov rdx, [reg_p2+88] + mov r10, r9 + mov r11, r9 + sub r8, rax + sbb r9, rdx + mov rax, [reg_p2+96] + mov rdx, [reg_p2+104] + sbb r10, rax + sbb r11, rdx + mov rax, [reg_p2+112] + mov rdx, [reg_p2+120] + sbb r12, rax + sbb r13, rdx + mov [rsp], r8 + mov [rsp+8], r9 + mov [rsp+16], r10 + mov [rsp+24], r11 + mov [rsp+32], r12 + mov [rsp+40], r13 + mov r8, [rip+fmt(p610x8)+48] + mov r9, [rip+fmt(p610x8)+56] + mov r10, [rip+fmt(p610x8)+64] + mov r11, [rip+fmt(p610x8)+72] + mov rax, [reg_p2+128] + mov rdx, [reg_p2+136] + sbb r8, rax + sbb r9, rdx + mov rax, [reg_p2+144] + mov rdx, [reg_p2+152] + sbb r10, rax + sbb r11, rdx + mov [rsp+48], r8 + mov [rsp+56], r9 + mov [rsp+64], r10 + mov [rsp+72], r11 + + // [rcx0:rcx16, r11:r15, r8:r10] <- z = a0 x b00 - a1 x b10 + mov rdx, [reg_p2] + mulx r9, r8, [reg_p1] + xor rax, rax + mulx r10, r11, [reg_p1+8] + mov [rcx], r8 + adcx r9, r11 + mulx r11, r12, [reg_p1+16] + mov [rcx+8], r9 + adcx r10, r12 + mulx r12, r13, [reg_p1+24] + mov [rcx+16], r10 + adcx r11, r13 + mulx r13, r8, [reg_p1+32] + adcx r12, r8 + mulx r14, r9, [reg_p1+40] + adcx r13, r9 + mulx r15, rax, [reg_p1+48] + adcx r14, rax + mulx r8, r10, [reg_p1+56] + adcx r15, r10 + mulx r9, rax, [reg_p1+64] + adcx r8, rax + mulx r10, rbx, [reg_p1+72] + adcx r9, rbx + adc r10, 0 + + FPDBLMUL640x640 [reg_p1], [reg_p1+80], [reg_p2], [rsp], [rcx], r11, r12, r13, r14, r15, r8, r9, r10, rbx, rbp, rax + + mov [rcx+24], r13 + mov [rcx+32], r14 + mov [rcx+40], r15 + mov [rcx+48], r8 + mov [rcx+56], r9 + mov [rcx+64], r10 + mov [rcx+72], r11 + add rsp, 80 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Multiplication in GF(p^2), complex part +// Operation: c [reg_p3] = a0 x b1 + a1 x b0 +// Inputs: a = [a1, a0] stored in [reg_p1] +// b = [b1, b0] stored in [reg_p2] +// Output: c stored in [reg_p3] 
+//*********************************************************************** +.global fmt(fp2mul610_c1_asm) +fmt(fp2mul610_c1_asm): + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + sub rsp, 32 + mov rcx, reg_p3 + + // [rsp0:rsp16, r11:r15, r8:r10] <- z = a0 x b10 + a1 x b00 + mov rdx, [reg_p2] + mulx r9, r8, [reg_p1+80] + xor rax, rax + mulx r10, r11, [reg_p1+88] + mov [rsp], r8 + adcx r9, r11 + mulx r11, r12, [reg_p1+96] + mov [rsp+8], r9 + adcx r10, r12 + mulx r12, r13, [reg_p1+104] + mov [rsp+16], r10 + adcx r11, r13 + mulx r13, r8, [reg_p1+112] + adcx r12, r8 + mulx r14, r9, [reg_p1+120] + adcx r13, r9 + mulx r15, rax, [reg_p1+128] + adcx r14, rax + mulx r8, r10, [reg_p1+136] + adcx r15, r10 + mulx r9, rax, [reg_p1+144] + adcx r8, rax + mulx r10, rbx, [reg_p1+152] + adcx r9, rbx + adc r10, 0 + + FPDBLMUL640x640 [reg_p1+80], [reg_p1], [reg_p2], [reg_p2+80], [rsp], r11, r12, r13, r14, r15, r8, r9, r10, rbx, rbp, rax + + mov [rcx+24], r13 + mov [rcx+32], r14 + mov [rcx+40], r15 + mov [rcx+48], r8 + mov [rcx+56], r9 + mov [rcx+64], r10 + mov [rcx+72], r11 + add rsp, 32 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +#else + +# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE USE_ADX=TRUE" + +#endif + +#else + +# error "CONFIGURATION NOT SUPPORTED. 
TRY USE_MULX=TRUE USE_ADX=TRUE" + +#endif + + +///////////////////////////////////////////////////////////////// MACRO +// z = z + a x b +// Inputs: base memory pointers M0 (a), M1 (b) +// bi pre-stored in rdx, +// accumulator z in [Z0:Z7], pre-stores a0 x b +// Output: [Z0:Z7] and OUT +// Temps: regs T0:T2 +///////////////////////////////////////////////////////////////// +.macro FPMUL640x640 M0, M1, MM, OUT, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1, T2 + // [Z4:Z7, Z0:Z2] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1 + + // [rsp8:rsp24, \Z4:\Z7, \Z0:\Z3] <- z = a0 x b11 + a1 x b01 + z + mov rdx, 8\M0 + MULADD64x640b \M1, \MM, \MM, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \T2, \Z3 + // [\Z5:\Z7, \Z0:\Z3] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1 + + // [rsp16:rsp32, \Z5:\Z7, \Z0:\Z4] <- z = a0 x b12 + a1 x b02 + z + mov rdx, 16\M0 + MULADD64x640b \M1, \MM, \MM, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \T2, \Z4 + // [rsp24:rsp40, \Z6:\Z7, \Z0:\Z4] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1 + + // [rsp24:rsp40, \Z6:\Z7, \Z0:\Z5] <- z = a0 x b13 + a1 x b03 + z + mov rdx, 24\M0 + MULADD64x640b \M1, \MM, \MM, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \T2, \Z5 + // [rsp32:rsp48, \Z7, \Z0:\Z5] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1 + + // [rsp32:rsp48, \Z7, \Z0:\Z6] <- z = a0 x b14 + a1 x b04 + z + mov rdx, 32\M0 + MULADD64x640b \M1, \MM, \MM, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \T2, \Z6 + // [rsp40:rsp56, \Z0:\Z6] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, 
\T0, \T1 + + // [rsp40:rsp56, \Z0:\Z7] <- z = a0 x b15 + a1 x b05 + z + mov rdx, 40\M0 + MULADD64x640b \M1, \MM, \MM, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1, \T2, \Z7 + // [rsp48:rsp64, \Z1:\Z7] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1 + + // [rsp48:rsp64, \Z1:\Z7, \Z0] <- z = a0 x b16 + a1 x b06 + z + mov rdx, 48\M0 + MULADD64x640b \M1, \MM, \MM, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \T2, \Z0 + // [rsp56:rsp72, \Z2:\Z7, \Z0] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1 + + // [rsp56:rsp72, \Z2:\Z7, \Z0:\Z1] <- z = a0 x b17 + a1 x b07 + z + mov rdx, 56\M0 + MULADD64x640b \M1, \MM, \MM, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, \T2, \Z1 + // [rsp64:rsp80, \Z3:\Z7, \Z0:\Z1] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1 + + // [rsp64:rsp80, \Z3:\Z7, \Z0:\Z2] <- z = a0 x b18 + a1 x b08 + z + mov rdx, 64\M0 + MULADD64x640b \M1, \MM, \MM, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, \T2, \Z2 + // [rsp72:rsp88, \Z4:\Z7, \Z0:\Z2] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1 + + // [rsi0:rsi16, \Z4:\Z7, \Z0:\Z3] <- z = a0 x b19 + a1 x b09 + z + mov rdx, 72\M0 + MULADD64x640b \M1, \MM, \OUT, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \T2, \Z3 + // [rsp80:rsp96, \Z5:\Z7, \Z0:\Z3] <- z = (z0 x p610p1 + z)/2^64 + mov rdx, 24\MM // rdx <- z0 + MULADD64x384 [rip+fmt(p610p1)+32], \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1 +.endm //*********************************************************************** -// Field subtraction -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] -//*********************************************************************** -.global 
fmt(fpsub610_asm) -fmt(fpsub610_asm): - push r12 - push r13 - push r14 - push r15 - - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - mov r15, [reg_p1+56] - mov rcx, [reg_p1+64] - mov rdi, [reg_p1+72] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb r14, [reg_p2+48] - sbb r15, [reg_p2+56] - sbb rcx, [reg_p2+64] - sbb rdi, [reg_p2+72] - mov [reg_p3+64], rcx - mov [reg_p3+72], rdi - mov rax, 0 - sbb rax, 0 - - mov rsi, [rip+fmt(p610x2)] - and rsi, rax - mov rdi, [rip+fmt(p610x2)+8] - and rdi, rax - - add r8, rsi - adc r9, rdi - adc r10, rdi - adc r11, rdi - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - setc cl - - mov rdi, [rip+fmt(p610x2)+32] - and rdi, rax - mov rsi, [rip+fmt(p610x2)+40] - and rsi, rax - mov r8, [rip+fmt(p610x2)+48] - and r8, rax - mov r9, [rip+fmt(p610x2)+56] - and r9, rax - mov r10, [rip+fmt(p610x2)+64] - and r10, rax - mov r11, [rip+fmt(p610x2)+72] - and r11, rax - - bt rcx, 0 - adc r12, rdi - adc r13, rsi - adc r14, r8 - adc r15, r9 - mov rsi, [reg_p3+64] - mov rdi, [reg_p3+72] - adc rsi, r10 - adc rdi, r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - mov [reg_p3+56], r15 - mov [reg_p3+64], rsi - mov [reg_p3+72], rdi - - pop r15 - pop r14 - pop r13 - pop r12 - ret - - -///////////////////////////////////////////////////////////////// MACRO -.macro SUB610_PX P0 - push r12 - push r13 - push r14 - push r15 - - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - mov r15, [reg_p1+56] - mov rax, [reg_p1+64] - mov rcx, [reg_p1+72] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb r14, [reg_p2+48] - sbb r15, 
[reg_p2+56] - sbb rax, [reg_p2+64] - sbb rcx, [reg_p2+72] - - mov rdi, [rip+\P0] - mov rsi, [rip+\P0+8] - add r8, rdi - adc r9, rsi - adc r10, rsi - adc r11, rsi - mov rdi, [rip+\P0+32] - mov rsi, [rip+\P0+40] - adc r12, rdi - adc r13, rsi - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov rdi, [rip+\P0+48] - mov rsi, [rip+\P0+56] - adc r14, rdi - adc r15, rsi - mov rdi, [rip+\P0+64] - mov rsi, [rip+\P0+72] - adc rax, rdi - adc rcx, rsi - mov [reg_p3+48], r14 - mov [reg_p3+56], r15 - mov [reg_p3+64], rax - mov [reg_p3+72], rcx - - pop r15 - pop r14 - pop r13 - pop r12 - .endm - - -//*********************************************************************** -// Multiprecision subtraction with correction with 2*p610 -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p610 -//*********************************************************************** -.global fmt(mp_sub610_p2_asm) -fmt(mp_sub610_p2_asm): - - SUB610_PX fmt(p610x2) - ret - - -//*********************************************************************** -// Multiprecision subtraction with correction with 4*p610 -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p610 -//*********************************************************************** -.global fmt(mp_sub610_p4_asm) -fmt(mp_sub610_p4_asm): - - SUB610_PX fmt(p610x4) - ret - - -#ifdef _MULX_ - -/////////////////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication -// Inputs: memory pointers M0 and M1 -// Outputs: memory pointer C -// Temps: regs T0:T7 -/////////////////////////////////////////////////////////////////////////// -#ifdef _ADX_ - -.macro MUL320_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7 - mov rdx, \M0 - mulx \T0, \T1, \M1 - mulx \T2, \T3, 8\M1 - mov \C, \T1 // C0_final - xor rax, rax - mulx \T4, \T5, 16\M1 - adox \T0, \T3 - adox \T2, \T5 - mulx \T1, \T3, 24\M1 - adox \T4, \T3 - mulx \T5, \T6, 32\M1 - 
adox \T1, \T6 - adox \T5, rax - - mov rdx, 8\M0 - mulx \T6, \T7, \M1 - xor rax, rax - adcx \T0, \T7 - mov 8\C, \T0 // C1_final - adcx \T2, \T6 - mulx \T6, \T7, 8\M1 - adox \T2, \T7 - adcx \T4, \T6 - mulx \T0, \T6, 16\M1 - adox \T4, \T6 - adcx \T0, \T1 - mulx \T1, \T7, 24\M1 - adcx \T1, \T5 - adox \T0, \T7 - mulx \T5, \T6, 32\M1 - adcx \T5, rax - adox \T1, \T6 - adox \T5, rax - - mov rdx, 16\M0 - mulx \T6, \T7, \M1 - xor rax, rax - adcx \T2, \T7 - mov 16\C, \T2 // C2_final - adcx \T4, \T6 - mulx \T6, \T7, 8\M1 - adox \T4, \T7 - adcx \T0, \T6 - mulx \T2, \T6, 16\M1 - adox \T0, \T6 - adcx \T1, \T2 - mulx \T2, \T7, 24\M1 - adcx \T5, \T2 - adox \T1, \T7 - mulx \T2, \T6, 32\M1 - adcx \T2, rax - adox \T5, \T6 - adox \T2, rax - - mov rdx, 24\M0 - mulx \T6, \T7, \M1 - xor rax, rax - adcx \T4, \T7 - mov 24\C, \T4 // C3_final - adcx \T0, \T6 - mulx \T6, \T7, 8\M1 - adox \T0, \T7 - adcx \T1, \T6 - mulx \T4, \T6, 16\M1 - adox \T1, \T6 - adcx \T5, \T4 - mulx \T4, \T7, 24\M1 - adcx \T2, \T4 - adox \T5, \T7 - mulx \T4, \T6, 32\M1 - adcx \T4, rax - adox \T2, \T6 - adox \T4, rax - - mov rdx, 32\M0 - mulx \T6, \T7, \M1 - xor rax, rax - adcx \T0, \T7 - mov 32\C, \T0 // C4_final - adcx \T1, \T6 - mulx \T6, \T7, 8\M1 - adox \T1, \T7 - adcx \T5, \T6 - mulx \T0, \T6, 16\M1 - adox \T5, \T6 - adcx \T2, \T0 - mulx \T0, \T7, 24\M1 - adcx \T4, \T0 - adox \T2, \T7 - mulx \T0, \T6, 32\M1 - adcx \T0, rax - adox \T4, \T6 - adox \T0, rax - - mov 40\C, \T1 - mov 48\C, \T5 - mov 56\C, \T2 - mov 64\C, \T4 - mov 72\C, \T0 -.endm - -#else - -.macro MUL320_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7 - mov rdx, \M0 - mulx \T0, \T1, \M1 - mulx \T2, \T3, 8\M1 - mov \C, \T1 // C0_final - mulx \T4, \T5, 16\M1 - add \T0, \T3 - adc \T2, \T5 - mulx \T1, \T3, 24\M1 - adc \T3, \T4 - mulx \T5, \T6, 32\M1 - adc \T1, \T6 - adc \T5, 0 - - mov rdx, 8\M0 - mulx \T6, \T7, \M1 - add \T0, \T7 - mov 8\C, \T0 // C1_final - adc \T2, \T6 - mulx \T6, \T7, 8\M1 - adc \T3, \T6 - mulx \T0, \T4, 16\M1 - adc \T0, \T1 - mulx 
\T1, \T6, 24\M1 - adc \T5, \T1 - mulx \T1, rax, 32\M1 - adc \T1, 0 - - add \T2, \T7 - adc \T3, \T4 - adc \T0, \T6 - adc \T5, rax - adc \T1, 0 - - mov rdx, 16\M0 - mulx \T4, \T6, \M1 - add \T2, \T6 - mov 16\C, \T2 // C2_final - adc \T3, \T4 - mulx \T6, \T7, 8\M1 - adc \T0, \T6 - mulx \T2, \T4, 16\M1 - adc \T2, \T5 - mulx \T5, \T6, 24\M1 - adc \T1, \T5 - mulx \T5, rax, 32\M1 - adc \T5, 0 - - add \T3, \T7 - adc \T0, \T4 - adc \T2, \T6 - adc \T1, rax - adc \T5, 0 - - mov rdx, 24\M0 - mulx \T4, \T6, \M1 - add \T3, \T6 - mov 24\C, \T3 // C3_final - adc \T0, \T4 - mulx \T6, \T7, 8\M1 - adc \T2, \T6 - mulx \T3, \T4, 16\M1 - adc \T1, \T3 - mulx \T3, \T6, 24\M1 - adc \T3, \T5 - mulx \T5, rax, 32\M1 - adc \T5, 0 - - add \T0, \T7 - adc \T2, \T4 - adc \T1, \T6 - adc \T3, rax - adc \T5, 0 - - mov rdx, 32\M0 - mulx \T4, \T6, \M1 - add \T0, \T6 - mov 32\C, \T0 // C4_final - adc \T2, \T4 - mulx \T6, \T7, 8\M1 - adc \T1, \T6 - mulx \T0, \T4, 16\M1 - adc \T3, \T0 - mulx \T0, \T6, 24\M1 - adc \T0, \T5 - mulx \T5, rax, 32\M1 - adc \T5, 0 - - add \T2, \T7 - adc \T1, \T4 - adc \T3, \T6 - adc \T0, rax - adc \T5, 0 - mov 40\C, \T2 - mov 48\C, \T1 - mov 56\C, \T3 - mov 64\C, \T0 - mov 72\C, \T5 -.endm - -#endif - - -//***************************************************************************** -// 610-bit multiplication using Karatsuba (one level), schoolbook (two levels) -//***************************************************************************** -.global fmt(mul610_asm) -fmt(mul610_asm): - push r12 - push r13 - push r14 - push r15 - mov rcx, reg_p3 - - // [rsp] <- AH + AL, rax <- mask - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - push rbx - sub rsp, 112 - add r8, [reg_p1+40] - adc r9, [reg_p1+48] - adc r10, [reg_p1+56] - adc r11, [reg_p1+64] - adc r12, [reg_p1+72] - sbb rax, 0 - mov [rsp], r8 - mov [rsp+8], r9 - mov [rsp+16], r10 - mov [rsp+24], r11 - mov [rsp+32], r12 - - // [rsp+40] <- BH + BL, rdx <- 
mask - xor rdx, rdx - mov r8, [reg_p2] - mov r9, [reg_p2+8] - mov rbx, [reg_p2+16] - mov r13, [reg_p2+24] - mov r14, [reg_p2+32] - add r8, [reg_p2+40] - adc r9, [reg_p2+48] - adc rbx, [reg_p2+56] - adc r13, [reg_p2+64] - adc r14, [reg_p2+72] - sbb rdx, 0 - mov [rsp+40], r8 - mov [rsp+48], r9 - mov [rsp+56], rbx - mov [rsp+64], r13 - mov [rsp+72], r14 - - // [rcx] <- masked (BH + BL) - and r8, rax - and r9, rax - and rbx, rax - and r13, rax - and r14, rax - mov [rcx], r8 - mov [rcx+8], r9 - - // r8-r12 <- masked (AH + AL) - mov r8, [rsp] - mov r9, [rsp+8] - and r8, rdx - and r9, rdx - and r10, rdx - and r11, rdx - and r12, rdx - - // [rsp+80] <- masked (AH + AL) + masked (BH + BL) - mov rax, [rcx] - mov rdx, [rcx+8] - add r8, rax - adc r9, rdx - adc r10, rbx - adc r11, r13 - adc r12, r14 - mov [rsp+80], r8 - mov [rsp+88], r9 - mov [rsp+96], r10 - mov [rsp+104], r11 - - // [rcx] <- AL x BL - MUL320_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, rbx, r13, r14, r15 // Result C0-C4 - - // [rcx+80] <- (AH+AL) x (BH+BL), low part - MUL320_SCHOOL [rsp], [rsp+40], [rcx+80], r8, r9, r10, r11, rbx, r13, r14, r15 - - // [rsp] <- AH x BH - MUL320_SCHOOL [reg_p1+40], [reg_p2+40], [rsp], r8, r9, r10, r11, rbx, r13, r14, r15 - - // r8-r12 <- (AH+AL) x (BH+BL), final step - mov r8, [rsp+80] - mov r9, [rsp+88] - mov r10, [rsp+96] - mov r11, [rsp+104] - mov rax, [rcx+120] - add r8, rax - mov rax, [rcx+128] - adc r9, rax - mov rax, [rcx+136] - adc r10, rax - mov rax, [rcx+144] - adc r11, rax - mov rax, [rcx+152] - adc r12, rax - - // rdi,rdx,rbx,r13,r14,r8-r12 <- (AH+AL) x (BH+BL) - ALxBL - mov rdi, [rcx+80] - sub rdi, [rcx] - mov rdx, [rcx+88] - sbb rdx, [rcx+8] - mov rbx, [rcx+96] - sbb rbx, [rcx+16] - mov r13, [rcx+104] - sbb r13, [rcx+24] - mov r14, [rcx+112] - sbb r14, [rcx+32] - sbb r8, [rcx+40] - sbb r9, [rcx+48] - sbb r10, [rcx+56] - sbb r11, [rcx+64] - sbb r12, [rcx+72] - - // rdi,rdx,rbx,r13,r14,r8-r12 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH - sub rdi, [rsp] - sbb rdx, 
[rsp+8] - sbb rbx, [rsp+16] - sbb r13, [rsp+24] - sbb r14, [rsp+32] - sbb r8, [rsp+40] - sbb r9, [rsp+48] - sbb r10, [rsp+56] - sbb r11, [rsp+64] - sbb r12, [rsp+72] - - mov rax, [rcx+40] - add rax, rdi - mov [rcx+40], rax // Result C5-C9 - mov rax, [rcx+48] - adc rax, rdx - mov [rcx+48], rax - mov rax, [rcx+56] - adc rax, rbx - mov [rcx+56], rax - mov rax, [rcx+64] - adc rax, r13 - mov [rcx+64], rax - mov rax, [rcx+72] - adc rax, r14 - mov [rcx+72], rax - mov rax, [rsp] - adc r8, rax - mov [rcx+80], r8 // Result C10-C19 - mov rax, [rsp+8] - adc r9, rax - mov [rcx+88], r9 - mov rax, [rsp+16] - adc r10, rax - mov [rcx+96], r10 - mov rax, [rsp+24] - adc r11, rax - mov [rcx+104], r11 - mov rax, [rsp+32] - adc r12, rax - mov [rcx+112], r12 - mov r8, [rsp+40] - mov r9, [rsp+48] - mov r10, [rsp+56] - mov r11, [rsp+64] - mov r12, [rsp+72] - adc r8, 0 - adc r9, 0 - adc r10, 0 - adc r11, 0 - adc r12, 0 - add rsp, 112 - mov [rcx+120], r8 - mov [rcx+128], r9 - mov [rcx+136], r10 - mov [rcx+144], r11 - mov [rcx+152], r12 - - pop rbx - pop r15 - pop r14 - pop r13 - pop r12 +// Squaring in GF(p^2), non-complex part +// Operation: c [reg_p2] = (a0+a1) x (a0-a1) +// Inputs: a = [a1, a0] stored in [reg_p1] +// Output: c stored in [reg_p2] +//*********************************************************************** +.global fmt(fp2sqr610_c0_asm) +fmt(fp2sqr610_c0_asm): + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + sub rsp, 32 + + // a0 + a1 + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + add r8, [reg_p1+80] + adc r9, [reg_p1+88] + mov [reg_p2], r8 + adc r10, [reg_p1+96] + adc r11, [reg_p1+104] + mov [reg_p2+8], r9 + mov [reg_p2+16], r10 + adc r12, [reg_p1+112] + adc r13, [reg_p1+120] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + adc r14, [reg_p1+128] + adc r15, [reg_p1+136] + mov r9, [reg_p1+64] + mov r10, [reg_p1+72] + adc r9, [reg_p1+144] + adc r10, [reg_p1+152] + mov 
[reg_p2+24], r11 + mov [reg_p2+32], r12 + mov [reg_p2+40], r13 + mov [reg_p2+48], r14 + mov [reg_p2+56], r15 + mov [reg_p2+64], r9 + mov [reg_p2+72], r10 + + // a0 - a1 + 4xp610 + mov rcx, [reg_p1] + mov r10, [reg_p1+8] + mov r12, [reg_p1+16] + mov r13, [reg_p1+24] + mov r14, [reg_p1+32] + mov r15, [reg_p1+40] + sub rcx, [reg_p1+80] + sbb r10, [reg_p1+88] + sbb r12, [reg_p1+96] + sbb r13, [reg_p1+104] + sbb r14, [reg_p1+112] + sbb r15, [reg_p1+120] + mov rbx, [reg_p1+48] + mov rbp, [reg_p1+56] + mov r8, [reg_p1+64] + mov rax, [reg_p1+72] + sbb rbx, [reg_p1+128] + sbb rbp, [reg_p1+136] + sbb r8, [reg_p1+144] + sbb rax, [reg_p1+152] + add rcx, [rip+fmt(p610x4)] + mov rdx, [rip+fmt(p610x4)+8] + adc r10, rdx + adc r12, rdx + adc r13, rdx + adc r14, [rip+fmt(p610x4)+32] + adc r15, [rip+fmt(p610x4)+40] + adc rbx, [rip+fmt(p610x4)+48] + adc rbp, [rip+fmt(p610x4)+56] + adc r8, [rip+fmt(p610x4)+64] + adc rax, [rip+fmt(p610x4)+72] + mov [reg_p2+80], rcx + mov [reg_p2+88], r10 + mov [reg_p2+96], r12 + mov [reg_p2+104], r13 + mov [reg_p2+112], r14 + mov [reg_p2+144], r8 + mov [reg_p2+152], rax + + // [rsp0:rsp16, r11:r15, r8:r10] <- z = a00 x a1 + mov rdx, [reg_p2] + mulx r9, r8, rcx + mov [reg_p2+120], r15 + xor rax, rax + mulx r10, r11, r10 + mov [reg_p2+128], rbx + adcx r9, r11 + mulx r11, r12, r12 + mov [reg_p2+136], rbp + adcx r10, r12 + mulx r12, r13, r13 + mov [rsp+24], r8 + adcx r11, r13 + mulx r13, r8, r14 + mov [rsp], r9 + adcx r12, r8 + mulx r14, r9, r15 + mov [rsp+8], r10 + adcx r13, r9 + mulx r15, rax, rbx + mov [rsp+16], r11 + adcx r14, rax + mulx r8, r10, rbp + adcx r15, r10 + mulx r9, rax, [reg_p2+144] + adcx r8, rax + mulx r10, rbx, [reg_p2+152] + adcx r9, rbx + adc r10, 0 + + FPMUL640x640 [reg_p2], [reg_p2+80], [rsp], [reg_p2], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp, rax + + mov [reg_p2+24], r13 + mov [reg_p2+32], r14 + mov [reg_p2+40], r15 + mov [reg_p2+48], r8 + mov [reg_p2+56], r9 + mov [reg_p2+64], r10 + mov [reg_p2+72], r11 + add rsp, 32 + pop 
rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 ret -#else - //*********************************************************************** -// Integer multiplication -// Based on Karatsuba method -// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] -// NOTE: a=c or b=c are not allowed -//*********************************************************************** -.global fmt(mul610_asm) -fmt(mul610_asm): - - ret - -# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE" - -#endif - - -#ifdef _MULX_ - -///////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication -// Inputs: memory pointers M0 and M1 -// Outputs: regs T0:T7 -// Temps: regs T8 -///////////////////////////////////////////////////////////////// - -#ifdef _ADX_ -.macro MUL128x384_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6, T7, T8 - mov rdx, \M0 - mulx \T1, \T0, \M1 // T0 <- C0_final - mulx \T2, \T4, 8\M1 - xor rax, rax - mulx \T3, \T5, 16\M1 - adox \T1, \T4 - adox \T2, \T5 - mulx \T4, \T7, 24\M1 - adox \T3, \T7 - mulx \T5, \T6, 32\M1 - adox \T4, \T6 - mulx \T7, \T8, 40\M1 - adox \T5, \T8 - adox \T7, rax - - mov rdx, 8\M0 - mulx \T8, \T6, \M1 - adcx \T1, \T6 // T1 <- C1_final - adcx \T2, \T8 - mulx \T6, \T8, 8\M1 - adox \T2, \T8 - adcx \T3, \T6 - mulx \T6, \T8, 16\M1 - adox \T3, \T8 - adcx \T4, \T6 - mulx \T6, \T8, 24\M1 - adox \T4, \T8 - adcx \T5, \T6 - mulx \T6, \T8, 32\M1 - adox \T5, \T8 - adcx \T6, \T7 - mulx \T7, \T8, 40\M1 - adcx \T7, rax - adox \T6, \T8 - adox \T7, rax -.endm - -#else - -.macro MUL128x384_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6, T7, T8 - mov rdx, \M0 - mulx \T1, \T0, \M1 // T0 <- C0_final - mulx \T2, \T3, 8\M1 - add \T1, \T3 - adc \T2, 0 - - mov rdx, 8\M0 - xor \T5, \T5 - mulx \T3, \T4, \M1 - add \T1, \T4 - adc \T2, \T3 - adc \T5, 0 - - xor \T6, \T6 - mulx \T3, \T4, 8\M1 - add \T2, \T4 - adc \T3, \T5 - adc \T6, 0 - - mov rdx, \M0 - mulx \T4, \T5, 16\M1 - add \T2, \T5 - adc \T3, \T4 - adc \T6, 0 - - xor \T7, \T7 - mulx \T4, \T5, 24\M1 
- add \T3, \T5 - adc \T4, \T6 - adc \T7, 0 - - mov rdx, 8\M0 - mulx \T5, \T6, 16\M1 - add \T3, \T6 - adc \T4, \T5 - adc \T7, 0 - - xor \T6, \T6 - mulx \T5, \T8, 24\M1 - add \T4, \T8 - adc \T5, \T7 - adc \T6, 0 - - mov rdx, \M0 - mulx \T7, \T8, 32\M1 - add \T4, \T8 - adc \T5, \T7 - adc \T6, 0 - - xor \T7, \T7 - mulx \T8, rax, 40\M1 - add \T5, rax - adc \T6, \T8 - adc \T7, 0 - - mov rdx, 8\M0 - mulx \T8, rax, 32\M1 - add \T5, rax - adc \T6, \T8 - adc \T7, 0 - - mov rdx, 8\M0 - mulx \T8, rax, 40\M1 - add \T6, rax - adc \T7, \T8 -.endm -#endif - - -//************************************************************************************** -// Montgomery reduction -// Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015 -// Operation: c [reg_p2] = a [reg_p1] -// NOTE: a=c is not allowed -//************************************************************************************** -.global fmt(rdc610_asm) -fmt(rdc610_asm): - push r12 - push r13 - push r14 - push r15 - - // a[0-1] x p610p1_nz --> result: r8:r15 - MUL128x384_SCHOOL [reg_p1], [rip+fmt(p610p1)+32], r8, r9, r10, r11, r12, r13, r14, r15, rcx - - xor rcx, rcx - add r8, [reg_p1+32] - adc r9, [reg_p1+40] - adc r10, [reg_p1+48] - adc r11, [reg_p1+56] - adc r12, [reg_p1+64] - adc r13, [reg_p1+72] - adc r14, [reg_p1+80] - adc r15, [reg_p1+88] - adc rcx, [reg_p1+96] - mov [reg_p1+32], r8 - mov [reg_p1+40], r9 - mov [reg_p1+48], r10 - mov [reg_p1+56], r11 - mov [reg_p1+64], r12 - mov [reg_p1+72], r13 - mov [reg_p1+80], r14 - mov [reg_p1+88], r15 - mov [reg_p1+96], rcx - mov r8, [reg_p1+104] - mov r9, [reg_p1+112] - mov r10, [reg_p1+120] - mov r11, [reg_p1+128] - mov r12, [reg_p1+136] - mov r13, [reg_p1+144] - mov r14, [reg_p1+152] - adc r8, 0 - adc r9, 0 - adc r10, 0 - adc r11, 0 - adc r12, 0 - adc r13, 0 - adc r14, 0 - mov [reg_p1+104], r8 - mov [reg_p1+112], r9 - mov [reg_p1+120], r10 - mov [reg_p1+128], r11 - mov [reg_p1+136], r12 - mov [reg_p1+144], r13 - mov [reg_p1+152], r14 - - // a[2-3] 
x p610p1_nz --> result: r8:r15 - MUL128x384_SCHOOL [reg_p1+16], [rip+fmt(p610p1)+32], r8, r9, r10, r11, r12, r13, r14, r15, rcx - - xor rcx, rcx - add r8, [reg_p1+48] - adc r9, [reg_p1+56] - adc r10, [reg_p1+64] - adc r11, [reg_p1+72] - adc r12, [reg_p1+80] - adc r13, [reg_p1+88] - adc r14, [reg_p1+96] - adc r15, [reg_p1+104] - adc rcx, [reg_p1+112] - mov [reg_p1+48], r8 - mov [reg_p1+56], r9 - mov [reg_p1+64], r10 - mov [reg_p1+72], r11 - mov [reg_p1+80], r12 - mov [reg_p1+88], r13 - mov [reg_p1+96], r14 - mov [reg_p1+104], r15 - mov [reg_p1+112], rcx - mov r8, [reg_p1+120] - mov r9, [reg_p1+128] - mov r10, [reg_p1+136] - mov r11, [reg_p1+144] - mov r12, [reg_p1+152] - adc r8, 0 - adc r9, 0 - adc r10, 0 - adc r11, 0 - adc r12, 0 - mov [reg_p1+120], r8 - mov [reg_p1+128], r9 - mov [reg_p1+136], r10 - mov [reg_p1+144], r11 - mov [reg_p1+152], r12 - - // a[4-5] x p610p1_nz --> result: r8:r15 - MUL128x384_SCHOOL [reg_p1+32], [rip+fmt(p610p1)+32], r8, r9, r10, r11, r12, r13, r14, r15, rcx - - xor rcx, rcx - add r8, [reg_p1+64] - adc r9, [reg_p1+72] - adc r10, [reg_p1+80] - adc r11, [reg_p1+88] - adc r12, [reg_p1+96] - adc r13, [reg_p1+104] - adc r14, [reg_p1+112] - adc r15, [reg_p1+120] - adc rcx, [reg_p1+128] - mov [reg_p1+64], r8 - mov [reg_p1+72], r9 - mov [reg_p1+80], r10 - mov [reg_p1+88], r11 - mov [reg_p1+96], r12 - mov [reg_p1+104], r13 - mov [reg_p1+112], r14 - mov [reg_p1+120], r15 - mov [reg_p1+128], rcx - mov r8, [reg_p1+136] - mov r9, [reg_p1+144] - mov r10, [reg_p1+152] - adc r8, 0 - adc r9, 0 - adc r10, 0 - mov [reg_p1+136], r8 - mov [reg_p1+144], r9 - mov [reg_p1+152], r10 - - // a[6-7] x p610p1_nz --> result: r8:r15 - MUL128x384_SCHOOL [reg_p1+48], [rip+fmt(p610p1)+32], r8, r9, r10, r11, r12, r13, r14, r15, rcx - - xor rcx, rcx - add r8, [reg_p1+80] - adc r9, [reg_p1+88] - adc r10, [reg_p1+96] - adc r11, [reg_p1+104] - adc r12, [reg_p1+112] - adc r13, [reg_p1+120] - adc r14, [reg_p1+128] - adc r15, [reg_p1+136] - adc rcx, [reg_p1+144] - mov [reg_p2], 
r8 // C0_final - mov [reg_p2+8], r9 // C1_final - mov [reg_p1+96], r10 - mov [reg_p1+104], r11 - mov [reg_p1+112], r12 - mov [reg_p1+120], r13 - mov [reg_p1+128], r14 - mov [reg_p1+136], r15 - mov [reg_p1+144], rcx - mov r8, [reg_p1+152] - adc r8, 0 - mov [reg_p1+152], r8 - - // a[8-9] x p610p1_nz --> result: r8:r15 - MUL128x384_SCHOOL [reg_p1+64], [rip+fmt(p610p1)+32], r8, r9, r10, r11, r12, r13, r14, r15, rcx - - // Final result C2:C9 - add r8, [reg_p1+96] - adc r9, [reg_p1+104] - adc r10, [reg_p1+112] - adc r11, [reg_p1+120] - adc r12, [reg_p1+128] - adc r13, [reg_p1+136] - adc r14, [reg_p1+144] - adc r15, [reg_p1+152] - mov [reg_p2+16], r8 - mov [reg_p2+24], r9 - mov [reg_p2+32], r10 - mov [reg_p2+40], r11 - mov [reg_p2+48], r12 - mov [reg_p2+56], r13 - mov [reg_p2+64], r14 - mov [reg_p2+72], r15 - - pop r15 - pop r14 - pop r13 - pop r12 - ret - - #else - -//*********************************************************************** -// Montgomery reduction -// Based on comba method -// Operation: c [reg_p2] = a [reg_p1] -// NOTE: a=c is not allowed -//*********************************************************************** -.global fmt(rdc610_asm) -fmt(rdc610_asm): - - ret - -# error "CONFIGURATION NOT SUPPORTED. 
TRY USE_MULX=TRUE" - - #endif - - -//*********************************************************************** -// 610-bit multiprecision addition -// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -//*********************************************************************** -.global fmt(mp_add610_asm) -fmt(mp_add610_asm): - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov rax, [reg_p1+32] - add r8, [reg_p2] - adc r9, [reg_p2+8] - adc r10, [reg_p2+16] - adc r11, [reg_p2+24] - adc rax, [reg_p2+32] - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], rax - - mov r8, [reg_p1+40] - mov r9, [reg_p1+48] - mov r10, [reg_p1+56] - mov r11, [reg_p1+64] - mov rax, [reg_p1+72] - adc r8, [reg_p2+40] - adc r9, [reg_p2+48] - adc r10, [reg_p2+56] - adc r11, [reg_p2+64] - adc rax, [reg_p2+72] - mov [reg_p3+40], r8 - mov [reg_p3+48], r9 - mov [reg_p3+56], r10 - mov [reg_p3+64], r11 - mov [reg_p3+72], rax - ret - - -//*********************************************************************** -// 2x610-bit multiprecision subtraction/addition -// Operation: c [x2] = a [x0] - b [x1]. 
If c < 0, add p610*2^640 -//*********************************************************************** -.global fmt(mp_subadd610x2_asm) -fmt(mp_subadd610x2_asm): - push r12 - push r13 - push r14 - push r15 - push rbx - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov rcx, [reg_p1+32] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb rcx, [reg_p2+32] - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], rcx - - mov r8, [reg_p1+40] - mov r9, [reg_p1+48] - mov r10, [reg_p1+56] - mov r11, [reg_p1+64] - mov rcx, [reg_p1+72] - sbb r8, [reg_p2+40] - sbb r9, [reg_p2+48] - sbb r10, [reg_p2+56] - sbb r11, [reg_p2+64] - sbb rcx, [reg_p2+72] - mov [reg_p3+40], r8 - mov [reg_p3+48], r9 - mov [reg_p3+56], r10 - mov [reg_p3+64], r11 - mov [reg_p3+72], rcx - - mov r8, [reg_p1+80] - mov r9, [reg_p1+88] - mov r10, [reg_p1+96] - mov r11, [reg_p1+104] - mov rcx, [reg_p1+112] - sbb r8, [reg_p2+80] - sbb r9, [reg_p2+88] - sbb r10, [reg_p2+96] - sbb r11, [reg_p2+104] - sbb rcx, [reg_p2+112] - mov [reg_p3+80], r8 - mov [reg_p3+88], r9 - mov [reg_p3+96], r10 - mov [reg_p3+104], r11 - mov [reg_p3+112], rcx - - mov r8, [reg_p1+120] - mov r9, [reg_p1+128] - mov r10, [reg_p1+136] - mov r11, [reg_p1+144] - mov rcx, [reg_p1+152] - sbb r8, [reg_p2+120] - sbb r9, [reg_p2+128] - sbb r10, [reg_p2+136] - sbb r11, [reg_p2+144] - sbb rcx, [reg_p2+152] - sbb rax, 0 - - // Add p610 anded with the mask in rax - mov r12, [rip+fmt(p610)] - mov r13, [rip+fmt(p610)+32] - mov r14, [rip+fmt(p610)+40] - mov r15, [rip+fmt(p610)+48] - mov rdi, [rip+fmt(p610)+56] - mov rsi, [rip+fmt(p610)+64] - mov rbx, [rip+fmt(p610)+72] - and r12, rax - and r13, rax - and r14, rax - and r15, rax - and rdi, rax - and rsi, rax - and rbx, rax - mov rax, [reg_p3+80] - add rax, r12 - mov [reg_p3+80], rax - mov rax, [reg_p3+88] - adc rax, r12 - mov [reg_p3+88], rax - mov rax, [reg_p3+96] - 
adc rax, r12 - mov [reg_p3+96], rax - adc r12, [reg_p3+104] - adc r13, [reg_p3+112] - mov [reg_p3+104], r12 - mov [reg_p3+112], r13 - adc r8, r14 - adc r9, r15 - adc r10, rdi - adc r11, rsi - adc rcx, rbx - - mov [reg_p3+120], r8 - mov [reg_p3+128], r9 - mov [reg_p3+136], r10 - mov [reg_p3+144], r11 - mov [reg_p3+152], rcx - pop rbx - pop r15 - pop r14 - pop r13 - pop r12 - ret - - -//*********************************************************************** -// Double 2x610-bit multiprecision subtraction -// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2] -//*********************************************************************** -.global fmt(mp_dblsub610x2_asm) -fmt(mp_dblsub610x2_asm): - push r12 - push r13 - push r14 - push r15 - - mov r8, [reg_p3] - mov r9, [reg_p3+8] - mov r10, [reg_p3+16] - mov r11, [reg_p3+24] - mov r12, [reg_p3+32] - mov r13, [reg_p3+40] - mov r14, [reg_p3+48] - mov r15, [reg_p3+56] - sub r8, [reg_p1] - sbb r9, [reg_p1+8] - sbb r10, [reg_p1+16] - sbb r11, [reg_p1+24] - sbb r12, [reg_p1+32] - sbb r13, [reg_p1+40] - sbb r14, [reg_p1+48] - sbb r15, [reg_p1+56] - setc al - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb r14, [reg_p2+48] - sbb r15, [reg_p2+56] - setc cl - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - mov [reg_p3+56], r15 - - mov r8, [reg_p3+64] - mov r9, [reg_p3+72] - mov r10, [reg_p3+80] - mov r11, [reg_p3+88] - mov r12, [reg_p3+96] - mov r13, [reg_p3+104] - mov r14, [reg_p3+112] - mov r15, [reg_p3+120] - bt rax, 0 - sbb r8, [reg_p1+64] - sbb r9, [reg_p1+72] - sbb r10, [reg_p1+80] - sbb r11, [reg_p1+88] - sbb r12, [reg_p1+96] - sbb r13, [reg_p1+104] - sbb r14, [reg_p1+112] - sbb r15, [reg_p1+120] - setc al - bt rcx, 0 - sbb r8, [reg_p2+64] - sbb r9, [reg_p2+72] - sbb r10, [reg_p2+80] - sbb r11, [reg_p2+88] - sbb r12, 
[reg_p2+96] - sbb r13, [reg_p2+104] - sbb r14, [reg_p2+112] - sbb r15, [reg_p2+120] - setc cl - mov [reg_p3+64], r8 - mov [reg_p3+72], r9 - mov [reg_p3+80], r10 - mov [reg_p3+88], r11 - mov [reg_p3+96], r12 - mov [reg_p3+104], r13 - mov [reg_p3+112], r14 - mov [reg_p3+120], r15 - - mov r8, [reg_p3+128] - mov r9, [reg_p3+136] - mov r10, [reg_p3+144] - mov r11, [reg_p3+152] - bt rax, 0 - sbb r8, [reg_p1+128] - sbb r9, [reg_p1+136] - sbb r10, [reg_p1+144] - sbb r11, [reg_p1+152] - bt rcx, 0 - sbb r8, [reg_p2+128] - sbb r9, [reg_p2+136] - sbb r10, [reg_p2+144] - sbb r11, [reg_p2+152] - mov [reg_p3+128], r8 - mov [reg_p3+136], r9 - mov [reg_p3+144], r10 - mov [reg_p3+152], r11 - - pop r15 - pop r14 - pop r13 - pop r12 - ret \ No newline at end of file +// Squaring in GF(p^2), complex part +// Operation: c [reg_p2] = 2a0 x a1 +// Inputs: a = [a1, a0] stored in [reg_p1] +// Output: c stored in [reg_p2] +//*********************************************************************** +.global fmt(fp2sqr610_c1_asm) +fmt(fp2sqr610_c1_asm): + push r12 + push r13 + push r14 + push r15 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + add r8, r8 + adc r9, r9 + push rbx + adc r10, r10 + adc r11, r11 + push rbp + adc r12, r12 + adc r13, r13 + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + adc r14, r14 + adc r15, r15 + mov rbx, [reg_p1+64] + mov rbp, [reg_p1+72] + adc rbx, rbx + adc rbp, rbp + sub rsp, 112 + mov [rsp+8], r9 + mov [rsp+16], r10 + mov [rsp+24], r11 + + // [rsp24, rsp0:rsp16, r11:r15, r8:r10] <- z = a00 x a1 + mov rdx, r8 + mulx r9, r8, [reg_p1+80] + mov [rsp+32], r12 + xor rax, rax + mulx r10, r11, [reg_p1+88] + mov [rsp+40], r13 + adcx r9, r11 + mulx r11, r12, [reg_p1+96] + mov [rsp+48], r14 + adcx r10, r12 + mulx r12, r13, [reg_p1+104] + mov [rsp+104], r8 + adcx r11, r13 + mulx r13, r8, [reg_p1+112] + mov [rsp+80], r9 + adcx r12, r8 + mulx r14, r9, [reg_p1+120] + mov [rsp+56], r15 + adcx 
r13, r9 + mulx r15, rax, [reg_p1+128] + mov [rsp+88], r10 + adcx r14, rax + mulx r8, r10, [reg_p1+136] + mov [rsp+96], r11 + adcx r15, r10 + mulx r9, rax, [reg_p1+144] + mov [rsp+64], rbx + adcx r8, rax + mulx r10, rbx, [reg_p1+152] + mov [rsp+72], rbp + adcx r9, rbx + adc r10, 0 + + FPMUL640x640 [rsp], [reg_p1+80], [rsp+80], [reg_p2], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp, rax + + mov [reg_p2+24], r13 + mov [reg_p2+32], r14 + mov [reg_p2+40], r15 + mov [reg_p2+48], r8 + mov [reg_p2+56], r9 + mov [reg_p2+64], r10 + mov [reg_p2+72], r11 + add rsp, 112 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Field multiplication in GF(p) +// Operation: c = a x b mod p +// Inputs: a stored in [reg_p1], b stored in [reg_p2] +// Output: c stored in [reg_p3] +//*********************************************************************** +.global fmt(fpmul610_asm) +fmt(fpmul610_asm): + mov rcx, reg_p3 + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + sub rsp, 32 + + // [r8:r15] <- z = a x b0 + mov rdx, [reg_p2] + mulx r9, r8, [reg_p1] + xor rax, rax + mov [rsp+24], r8 + mulx r10, r11, [reg_p1+8] + adcx r9, r11 + mulx r11, r12, [reg_p1+16] + adcx r10, r12 + mulx r12, r13, [reg_p1+24] + adcx r11, r13 + mulx r13, r8, [reg_p1+32] + adcx r12, r8 + mulx r14, rax, [reg_p1+40] + adcx r13, rax + mulx r15, rax, [reg_p1+48] + mov [rsp], r9 + adcx r14, rax + mulx r8, rbx, [reg_p1+56] + mov [rsp+8], r10 + adcx r15, rbx + mulx r9, rax, [reg_p1+64] + mov [rsp+16], r11 + adcx r8, rax + mulx r10, rbx, [reg_p1+72] + adcx r9, rbx + adc r10, 0 + + FPMUL640x640 [reg_p2], [reg_p1], [rsp], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp, rax + + mov [rcx+24], r13 + mov [rcx+32], r14 + mov [rcx+40], r15 + mov [rcx+48], r8 + mov [rcx+56], r9 + mov [rcx+64], r10 + mov [rcx+72], r11 + add rsp, 32 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret \ No newline at end 
of file diff --git a/src/P610/ARM64/fp_arm64.c b/src/P610/ARM64/fp_arm64.c index bd72d88..ebf051b 100644 --- a/src/P610/ARM64/fp_arm64.c +++ b/src/P610/ARM64/fp_arm64.c @@ -1,10 +1,15 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P610 *********************************************************************************************/ #include "../P610_internal.h" +#include "../../internal.h" // Global constants extern const uint64_t p610[NWORDS_FIELD]; @@ -13,21 +18,21 @@ extern const uint64_t p610x2[NWORDS_FIELD]; extern const uint64_t p610x4[NWORDS_FIELD]; -__inline void mp_sub610_p2(const digit_t* a, const digit_t* b, digit_t* c) +inline void mp_sub610_p2(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. mp_sub610_p2_asm(a, b, c); } -__inline void mp_sub610_p4(const digit_t* a, const digit_t* b, digit_t* c) +inline void mp_sub610_p4(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. mp_sub610_p4_asm(a, b, c); } -__inline void fpadd610(const digit_t* a, const digit_t* b, digit_t* c) +inline void fpadd610(const digit_t* a, const digit_t* b, digit_t* c) { // Modular addition, c = a+b mod p610. // Inputs: a, b in [0, 2*p610-1] // Output: c in [0, 2*p610-1] @@ -36,7 +41,7 @@ __inline void fpadd610(const digit_t* a, const digit_t* b, digit_t* c) } -__inline void fpsub610(const digit_t* a, const digit_t* b, digit_t* c) +inline void fpsub610(const digit_t* a, const digit_t* b, digit_t* c) { // Modular subtraction, c = a-b mod p610. 
// Inputs: a, b in [0, 2*p610-1] // Output: c in [0, 2*p610-1] @@ -45,7 +50,7 @@ __inline void fpsub610(const digit_t* a, const digit_t* b, digit_t* c) } -__inline void fpneg610(digit_t* a) +inline void fpneg610(digit_t* a) { // Modular negation, a = -a mod p610. // Input/output: a in [0, 2*p610-1] unsigned int i, borrow = 0; diff --git a/src/P610/ARM64/fp_arm64_asm.S b/src/P610/ARM64/fp_arm64_asm.S index b1ecf43..06a3190 100644 --- a/src/P610/ARM64/fp_arm64_asm.S +++ b/src/P610/ARM64/fp_arm64_asm.S @@ -1,5 +1,9 @@ //******************************************************************************************* // SIDH: an efficient supersingular isogeny cryptography library +// Copyright (c) Microsoft Corporation +// +// Website: https://github.com/microsoft/PQCrypto-SIDH +// Released under MIT license // // Abstract: field arithmetic in 64-bit ARMv8 assembly for P610 on Linux //******************************************************************************************* diff --git a/src/P610/P610.c b/src/P610/P610.c index a4638de..6ea3aa8 100644 --- a/src/P610/P610.c +++ b/src/P610/P610.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: supersingular isogeny parameters and generation of functions for P610 *********************************************************************************************/ @@ -27,12 +31,10 @@ const uint64_t p610x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF 0x62F09BD154B5605C, 0x35CF7E8A091FF357, 0x64AB65F421884A55, 0x03202184A3CFB119, 0x00000004F7ED4ED1 }; const uint64_t p610x4[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xB807FFFFFFFFFFFF, 0xC5E137A2A96AC0B9, 0x6B9EFD14123FE6AE, 0xC956CBE8431094AA, 0x06404309479F6232, 
0x00000009EFDA9DA2 }; +const uint64_t p610x8[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x700FFFFFFFFFFFFF, + 0x8BC26F4552D58173, 0xD73DFA28247FCD5D, 0x92AD97D086212954, 0x0C8086128F3EC465, 0x00000013DFB53B44 }; const uint64_t p610p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x6E02000000000000, 0xB1784DE8AA5AB02E, 0x9AE7BF45048FF9AB, 0xB255B2FA10C4252A, 0x819010C251E7D88C, 0x000000027BF6A768 }; -const uint64_t p610x16p[2*NWORDS64_FIELD] = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x3FC0000000000000, - 0xD0F642EAB4A9FA32, 0xA308175F6E00CA89, 0xB549A0BDE77B5AAC, 0xCDFDE7B5C304EE69, 0x7FDB7FF0812B12EF, - 0xE09BA529B9FE1167, 0xD249C196DAB8CD7F, 0xD4E22754A3F20928, 0x97825638B19A7CCE, 0x05E04550FC4CCE0D, - 0x8FB5DA1152CDE50C, 0xF9649BA3EA408644, 0x4473C93E6441063D, 0xBE190269D1337B7B, 0x0000000000000062 }; // Order of Alice's subgroup const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0002000000000000 }; // Order of Bob's subgroup @@ -98,6 +100,7 @@ const unsigned int strat_Bob[MAX_Bob-1] = { #define fpneg fpneg610 #define fpdiv2 fpdiv2_610 #define fpcorrection fpcorrection610 +#define fpmul fpmul610 #define fpmul_mont fpmul610_mont #define fpsqr_mont fpsqr610_mont #define fpinv_mont fpinv610_mont @@ -115,6 +118,10 @@ const unsigned int strat_Bob[MAX_Bob-1] = { #define fp2correction fp2correction610 #define fp2mul_mont fp2mul610_mont #define fp2sqr_mont fp2sqr610_mont +#define fp2mul_c0_mont fp2mul610_c0_mont +#define fp2mul_c1_mont fp2mul610_c1_mont +#define fp2sqr_c0_mont fp2sqr610_c0_mont +#define fp2sqr_c1_mont fp2sqr610_c1_mont #define fp2inv_mont fp2inv610_mont #define fp2inv_mont_bingcd fp2inv610_mont_bingcd #define fpequal_non_constant_time fpequal610_non_constant_time diff --git a/src/P610/P610_api.h b/src/P610/P610_api.h index 
41a3ac3..ee71516 100644 --- a/src/P610/P610_api.h +++ b/src/P610/P610_api.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: API header file for P610 *********************************************************************************************/ diff --git a/src/P610/P610_compressed.c b/src/P610/P610_compressed.c index e07b35f..de0481e 100644 --- a/src/P610/P610_compressed.c +++ b/src/P610/P610_compressed.c @@ -1,5 +1,9 @@ /******************************************************************************************** * Supersingular Isogeny Key Encapsulation Library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: supersingular isogeny parameters and generation of functions for P610_compressed *********************************************************************************************/ @@ -28,12 +32,10 @@ const uint64_t p610x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF 0x62F09BD154B5605C, 0x35CF7E8A091FF357, 0x64AB65F421884A55, 0x03202184A3CFB119, 0x00000004F7ED4ED1 }; const uint64_t p610x4[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xB807FFFFFFFFFFFF, 0xC5E137A2A96AC0B9, 0x6B9EFD14123FE6AE, 0xC956CBE8431094AA, 0x06404309479F6232, 0x00000009EFDA9DA2 }; +const uint64_t p610x8[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x700FFFFFFFFFFFFF, + 0x8BC26F4552D58173, 0xD73DFA28247FCD5D, 0x92AD97D086212954, 0x0C8086128F3EC465, 0x00000013DFB53B44 }; const uint64_t p610p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x6E02000000000000, 0xB1784DE8AA5AB02E, 
0x9AE7BF45048FF9AB, 0xB255B2FA10C4252A, 0x819010C251E7D88C, 0x000000027BF6A768 }; -const uint64_t p610x16p[2*NWORDS64_FIELD] = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x3FC0000000000000, - 0xD0F642EAB4A9FA32, 0xA308175F6E00CA89, 0xB549A0BDE77B5AAC, 0xCDFDE7B5C304EE69, 0x7FDB7FF0812B12EF, - 0xE09BA529B9FE1167, 0xD249C196DAB8CD7F, 0xD4E22754A3F20928, 0x97825638B19A7CCE, 0x05E04550FC4CCE0D, - 0x8FB5DA1152CDE50C, 0xF9649BA3EA408644, 0x4473C93E6441063D, 0xBE190269D1337B7B, 0x0000000000000062 }; // Order of Alice's subgroup const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0002000000000000 }; // Order of Bob's subgroup @@ -341,6 +343,7 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] = #define fpneg fpneg610 #define fpdiv2 fpdiv2_610 #define fpcorrection fpcorrection610 +#define fpmul fpmul610 #define fpmul_mont fpmul610_mont #define fpsqr_mont fpsqr610_mont #define fpinv_mont fpinv610_mont @@ -358,6 +361,10 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] = #define fp2correction fp2correction610 #define fp2mul_mont fp2mul610_mont #define fp2sqr_mont fp2sqr610_mont +#define fp2mul_c0_mont fp2mul610_c0_mont +#define fp2mul_c1_mont fp2mul610_c1_mont +#define fp2sqr_c0_mont fp2sqr610_c0_mont +#define fp2sqr_c1_mont fp2sqr610_c1_mont #define fp2inv_mont fp2inv610_mont #define fp2inv_mont_bingcd fp2inv610_mont_bingcd #define fpequal_non_constant_time fpequal610_non_constant_time diff --git a/src/P610/P610_compressed_api.h b/src/P610/P610_compressed_api.h index 8956bef..4f8035b 100644 --- a/src/P610/P610_compressed_api.h +++ b/src/P610/P610_compressed_api.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* 
Released under MIT license * * Abstract: API header file for P610 using compression *********************************************************************************************/ diff --git a/src/P610/P610_compressed_dlog_tables.c b/src/P610/P610_compressed_dlog_tables.c index ed63f3a..bf29c0f 100644 --- a/src/P610/P610_compressed_dlog_tables.c +++ b/src/P610/P610_compressed_dlog_tables.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: precomputed tables for Pohlig-Hellman when using compression *********************************************************************************************/ diff --git a/src/P610/P610_compressed_pair_tables.c b/src/P610/P610_compressed_pair_tables.c index 1a8f560..7b674e1 100644 --- a/src/P610/P610_compressed_pair_tables.c +++ b/src/P610/P610_compressed_pair_tables.c @@ -1,5 +1,9 @@ /************************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: precomputed tables for pairing computation on E0: y^2 = x^3 + x when using compression ***************************************************************************************************/ diff --git a/src/P610/P610_internal.h b/src/P610/P610_internal.h index b933c42..d1e69d6 100644 --- a/src/P610/P610_internal.h +++ b/src/P610/P610_internal.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: 
https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: internal header file for P610 *********************************************************************************************/ @@ -170,6 +174,8 @@ void rdc610_asm(digit_t* ma, digit_t* mc); // Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p610, where R=2^640 void fpmul610_mont(const digit_t* a, const digit_t* b, digit_t* c); +void fpmul610(const digit_t* a, const digit_t* b, digit_t* c); +void fpmul610_asm(const digit_t* a, const digit_t* b, digit_t* c); void mul610_asm(const digit_t* a, const digit_t* b, digit_t* c); // Field squaring using Montgomery arithmetic, c = a*b*R^-1 mod p610, where R=2^640 @@ -209,9 +215,17 @@ void fp2correction610(f2elm_t a); // GF(p610^2) squaring using Montgomery arithmetic, c = a^2 in GF(p610^2) void fp2sqr610_mont(const f2elm_t a, f2elm_t c); +void fp2sqr610_c0_mont(const digit_t* a, digit_t* c); +void fp2sqr610_c0_asm(const digit_t* a, digit_t* c); +void fp2sqr610_c1_mont(const digit_t* a, digit_t* c); +void fp2sqr610_c1_asm(const digit_t* a, digit_t* c); // GF(p610^2) multiplication using Montgomery arithmetic, c = a*b in GF(p610^2) void fp2mul610_mont(const f2elm_t a, const f2elm_t b, f2elm_t c); +void fp2mul610_c0_mont(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul610_c0_asm(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul610_c1_mont(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul610_c1_asm(const digit_t* a, const digit_t* b, digit_t* c); // GF(p610^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) void fp2inv610_mont(f2elm_t a); diff --git a/src/P610/generic/fp_generic.c b/src/P610/generic/fp_generic.c index e56a343..aa7f68a 100755 --- a/src/P610/generic/fp_generic.c +++ b/src/P610/generic/fp_generic.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography 
library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: portable modular arithmetic for P610 *********************************************************************************************/ diff --git a/src/P751/AMD64/fp_x64.c b/src/P751/AMD64/fp_x64.c index d9e47fa..ac50804 100644 --- a/src/P751/AMD64/fp_x64.c +++ b/src/P751/AMD64/fp_x64.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: modular arithmetic optimized for x64 platforms for P751 *********************************************************************************************/ @@ -17,7 +21,7 @@ extern const uint64_t p751x4[NWORDS_FIELD]; inline void mp_sub751_p2(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. -#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 751) +#if (OS_TARGET == OS_WIN) unsigned int i, borrow = 0; for (i = 0; i < NWORDS_FIELD; i++) { @@ -39,7 +43,7 @@ inline void mp_sub751_p2(const digit_t* a, const digit_t* b, digit_t* c) inline void mp_sub751_p4(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. 
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 751) +#if (OS_TARGET == OS_WIN) unsigned int i, borrow = 0; for (i = 0; i < NWORDS_FIELD; i++) { diff --git a/src/P751/AMD64/fp_x64_asm.S b/src/P751/AMD64/fp_x64_asm.S index f3612f4..0452fa5 100644 --- a/src/P751/AMD64/fp_x64_asm.S +++ b/src/P751/AMD64/fp_x64_asm.S @@ -1,5 +1,9 @@ //******************************************************************************************* // SIDH: an efficient supersingular isogeny cryptography library +// Copyright (c) Microsoft Corporation +// +// Website: https://github.com/microsoft/PQCrypto-SIDH +// Released under MIT license // // Abstract: field arithmetic in x64 assembly for P751 on Linux //******************************************************************************************* diff --git a/src/P751/ARM64/fp_arm64.c b/src/P751/ARM64/fp_arm64.c index b3a8365..1a49eb3 100644 --- a/src/P751/ARM64/fp_arm64.c +++ b/src/P751/ARM64/fp_arm64.c @@ -1,10 +1,15 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P751 *********************************************************************************************/ #include "../P751_internal.h" +#include "../../internal.h" // Global constants extern const uint64_t p751[NWORDS_FIELD]; @@ -13,21 +18,21 @@ extern const uint64_t p751x2[NWORDS_FIELD]; extern const uint64_t p751x4[NWORDS_FIELD]; -__inline void mp_sub751_p2(const digit_t* a, const digit_t* b, digit_t* c) +inline void mp_sub751_p2(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. 
mp_sub751_p2_asm(a, b, c); } -__inline void mp_sub751_p4(const digit_t* a, const digit_t* b, digit_t* c) +inline void mp_sub751_p4(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. mp_sub751_p4_asm(a, b, c); } -__inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c) +inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c) { // Modular addition, c = a+b mod p751. // Inputs: a, b in [0, 2*p751-1] // Output: c in [0, 2*p751-1] @@ -36,7 +41,7 @@ __inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c) } -__inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c) +inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c) { // Modular subtraction, c = a-b mod p751. // Inputs: a, b in [0, 2*p751-1] // Output: c in [0, 2*p751-1] @@ -45,7 +50,7 @@ __inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c) } -__inline void fpneg751(digit_t* a) +inline void fpneg751(digit_t* a) { // Modular negation, a = -a mod p751. 
// Input/output: a in [0, 2*p751-1] unsigned int i, borrow = 0; diff --git a/src/P751/ARM64/fp_arm64_asm.S b/src/P751/ARM64/fp_arm64_asm.S index 216467e..c75278c 100644 --- a/src/P751/ARM64/fp_arm64_asm.S +++ b/src/P751/ARM64/fp_arm64_asm.S @@ -1,5 +1,9 @@ //******************************************************************************************* // SIDH: an efficient supersingular isogeny cryptography library +// Copyright (c) Microsoft Corporation +// +// Website: https://github.com/microsoft/PQCrypto-SIDH +// Released under MIT license // // Abstract: field arithmetic in 64-bit ARMv8 assembly for P751 on Linux //******************************************************************************************* diff --git a/src/P751/P751.c b/src/P751/P751.c index b35b3e2..917f98b 100644 --- a/src/P751/P751.c +++ b/src/P751/P751.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: supersingular isogeny parameters and generation of functions for P751 *********************************************************************************************/ @@ -29,10 +33,6 @@ const uint64_t p751x4[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFC, 0xFFFFF 0x8FB25A1527E1E2A3, 0x6A566C684FDF31DB, 0x213A619F5BAFA1DB, 0x158AD41172C95D20, 0x384A427E5EEB719A, 0x0001BF975507DC70 }; const uint64_t p751p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xEEB0000000000000, 0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C }; -const uint64_t p751x16p[2*NWORDS64_FIELD] = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x2A00000000000000, - 
0x826D2F56C0F0EAE2, 0xAD4C9CBD81067123, 0xF62CF3052282F124, 0x53A95F7469B516FE, 0x3DADEC0D08A4732F, 0x58AD934557C11C7E, - 0x7F731B89B2DA43F2, 0x51AE9F5F5F6AFF3B, 0xD74319A6C9BCA375, 0x5BAB790796CF84D4, 0xA421554FE2E49CA8, 0x20AD617C8DF437CF, - 0x3AB06E7A12F5FF7B, 0x70A25E037E40347E, 0x51F1D323FB4C1151, 0xAE0D99AA4835FED9, 0xDF5429960D2536B6, 0x000000030E91D466 }; // Order of Alice's subgroup const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0010000000000000 }; // Order of Bob's subgroup @@ -117,6 +117,10 @@ const unsigned int strat_Bob[MAX_Bob-1] = { #define fp2correction fp2correction751 #define fp2mul_mont fp2mul751_mont #define fp2sqr_mont fp2sqr751_mont +#define fp2mul_c0_mont fp2mul751_c0_mont +#define fp2mul_c1_mont fp2mul751_c1_mont +#define fp2sqr_c0_mont fp2sqr751_c0_mont +#define fp2sqr_c1_mont fp2sqr751_c1_mont #define fp2inv_mont fp2inv751_mont #define fp2inv_mont_bingcd fp2inv751_mont_bingcd #define fpequal_non_constant_time fpequal751_non_constant_time diff --git a/src/P751/P751_api.h b/src/P751/P751_api.h index 8fc09e5..6a50273 100644 --- a/src/P751/P751_api.h +++ b/src/P751/P751_api.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: API header file for P751 *********************************************************************************************/ diff --git a/src/P751/P751_compressed.c b/src/P751/P751_compressed.c index a3ef462..44b9596 100644 --- a/src/P751/P751_compressed.c +++ b/src/P751/P751_compressed.c @@ -1,5 +1,9 @@ /******************************************************************************************** * Supersingular Isogeny Key Encapsulation Library +* Copyright (c) 
Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: supersingular isogeny parameters and generation of functions for P751_compressed *********************************************************************************************/ @@ -30,10 +34,6 @@ const uint64_t p751x4[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFC, 0xFFFFF 0x8FB25A1527E1E2A3, 0x6A566C684FDF31DB, 0x213A619F5BAFA1DB, 0x158AD41172C95D20, 0x384A427E5EEB719A, 0x0001BF975507DC70 }; const uint64_t p751p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xEEB0000000000000, 0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C }; -const uint64_t p751x16p[2*NWORDS64_FIELD] = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x2A00000000000000, - 0x826D2F56C0F0EAE2, 0xAD4C9CBD81067123, 0xF62CF3052282F124, 0x53A95F7469B516FE, 0x3DADEC0D08A4732F, 0x58AD934557C11C7E, - 0x7F731B89B2DA43F2, 0x51AE9F5F5F6AFF3B, 0xD74319A6C9BCA375, 0x5BAB790796CF84D4, 0xA421554FE2E49CA8, 0x20AD617C8DF437CF, - 0x3AB06E7A12F5FF7B, 0x70A25E037E40347E, 0x51F1D323FB4C1151, 0xAE0D99AA4835FED9, 0xDF5429960D2536B6, 0x000000030E91D466 }; // Order of Alice's subgroup const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0010000000000000 }; // Order of Bob's subgroup @@ -376,6 +376,10 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] = #define fp2correction fp2correction751 #define fp2mul_mont fp2mul751_mont #define fp2sqr_mont fp2sqr751_mont +#define fp2mul_c0_mont fp2mul751_c0_mont +#define fp2mul_c1_mont fp2mul751_c1_mont +#define fp2sqr_c0_mont fp2sqr751_c0_mont +#define fp2sqr_c1_mont fp2sqr751_c1_mont #define fp2inv_mont fp2inv751_mont #define fp2inv_mont_bingcd fp2inv751_mont_bingcd 
#define fpequal_non_constant_time fpequal751_non_constant_time diff --git a/src/P751/P751_compressed_api.h b/src/P751/P751_compressed_api.h index 3bc08ed..ea7bc92 100644 --- a/src/P751/P751_compressed_api.h +++ b/src/P751/P751_compressed_api.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: API header file for P751 using compression *********************************************************************************************/ diff --git a/src/P751/P751_compressed_dlog_tables.c b/src/P751/P751_compressed_dlog_tables.c index d660d07..425466d 100644 --- a/src/P751/P751_compressed_dlog_tables.c +++ b/src/P751/P751_compressed_dlog_tables.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: precomputed tables for Pohlig-Hellman when using compression *********************************************************************************************/ diff --git a/src/P751/P751_compressed_pair_tables.c b/src/P751/P751_compressed_pair_tables.c index f8a8704..f40b4f5 100644 --- a/src/P751/P751_compressed_pair_tables.c +++ b/src/P751/P751_compressed_pair_tables.c @@ -1,5 +1,9 @@ /************************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: precomputed tables for pairing computation on E0: y^2 = x^3 + x when using 
compression ***************************************************************************************************/ diff --git a/src/P751/P751_internal.h b/src/P751/P751_internal.h index d636e7c..6cb25e0 100644 --- a/src/P751/P751_internal.h +++ b/src/P751/P751_internal.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: internal header file for P751 *********************************************************************************************/ @@ -208,9 +212,17 @@ void fp2correction751(f2elm_t a); // GF(p751^2) squaring using Montgomery arithmetic, c = a^2 in GF(p751^2) void fp2sqr751_mont(const f2elm_t a, f2elm_t c); +void fp2sqr751_c0_mont(const digit_t* a, digit_t* c); +void fp2sqr751_c0_asm(const digit_t* a, digit_t* c); +void fp2sqr751_c1_mont(const digit_t* a, digit_t* c); +void fp2sqr751_c1_asm(const digit_t* a, digit_t* c); // GF(p751^2) multiplication using Montgomery arithmetic, c = a*b in GF(p751^2) void fp2mul751_mont(const f2elm_t a, const f2elm_t b, f2elm_t c); +void fp2mul751_c0_mont(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul751_c0_asm(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul751_c1_mont(const digit_t* a, const digit_t* b, digit_t* c); +void fp2mul751_c1_asm(const digit_t* a, const digit_t* b, digit_t* c); // GF(p751^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) void fp2inv751_mont(f2elm_t a); diff --git a/src/P751/generic/fp_generic.c b/src/P751/generic/fp_generic.c index d07750e..bd89064 100755 --- a/src/P751/generic/fp_generic.c +++ b/src/P751/generic/fp_generic.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography 
library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: portable modular arithmetic for P751 *********************************************************************************************/ diff --git a/src/compression/dlog.c b/src/compression/dlog.c index a99da68..4cd815c 100644 --- a/src/compression/dlog.c +++ b/src/compression/dlog.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: Pohlig-Hellman with optimal strategy *********************************************************************************************/ diff --git a/src/compression/pairing.c b/src/compression/pairing.c index 23e1374..fc7e05b 100644 --- a/src/compression/pairing.c +++ b/src/compression/pairing.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: pairing computation for compression *********************************************************************************************/ diff --git a/src/compression/sidh_compressed.c b/src/compression/sidh_compressed.c index e3357d6..36fe4f0 100644 --- a/src/compression/sidh_compressed.c +++ b/src/compression/sidh_compressed.c @@ -1,5 +1,9 @@ /************************************************************************************************* * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: 
ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) using compression **************************************************************************************************/ diff --git a/src/compression/sike_compressed.c b/src/compression/sike_compressed.c index 0279f3d..f8ebadf 100644 --- a/src/compression/sike_compressed.c +++ b/src/compression/sike_compressed.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: supersingular isogeny key encapsulation (SIKE) protocol using compression *********************************************************************************************/ diff --git a/src/compression/torsion_basis.c b/src/compression/torsion_basis.c index f1e391d..e5829fb 100644 --- a/src/compression/torsion_basis.c +++ b/src/compression/torsion_basis.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: Torsion basis generation for compression *********************************************************************************************/ diff --git a/src/config.h b/src/config.h index 58a5121..d2fcc77 100644 --- a/src/config.h +++ b/src/config.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: configuration file and platform-dependent macros 
*********************************************************************************************/ diff --git a/src/ec_isogeny.c b/src/ec_isogeny.c index b373fc5..09160d0 100644 --- a/src/ec_isogeny.c +++ b/src/ec_isogeny.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: elliptic curve and isogeny functions *********************************************************************************************/ diff --git a/src/fpx.c b/src/fpx.c index 6eadbd0..c65988a 100644 --- a/src/fpx.c +++ b/src/fpx.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: core functions over GF(p) and GF(p^2) *********************************************************************************************/ @@ -136,19 +140,27 @@ void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords) void fpmul_mont(const digit_t* ma, const digit_t* mb, digit_t* mc) { // Multiprecision multiplication, c = a*b mod p. +#if defined(_MULX_) && defined(_ADX_) && (OS_TARGET == OS_NIX) && (NBITS_FIELD != 751) + fpmul(ma, mb, mc); +#else dfelm_t temp = {0}; mp_mul(ma, mb, temp, NWORDS_FIELD); rdc_mont(temp, mc); +#endif } void fpsqr_mont(const digit_t* ma, digit_t* mc) { // Multiprecision squaring, c = a^2 mod p. 
+#if defined(_MULX_) && defined(_ADX_) && (OS_TARGET == OS_NIX) && (NBITS_FIELD != 751) + fpmul(ma, ma, mc); +#else dfelm_t temp = {0}; mp_mul(ma, ma, temp, NWORDS_FIELD); rdc_mont(temp, mc); +#endif } @@ -215,7 +227,7 @@ void fp2correction(f2elm_t a) inline static void mp_addfast(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision addition, c = a+b. -#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) +#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) mp_add(a, b, c, NWORDS_FIELD); @@ -256,7 +268,14 @@ inline unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const void fp2sqr_mont(const f2elm_t a, f2elm_t c) { // GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). // Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1] - // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] + // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] +#if defined(_MULX_) && defined(_ADX_) && (OS_TARGET == OS_NIX) && (NBITS_FIELD != 751) + dfelm_t tt1; + + fp2sqr_c0_mont(a[0], (digit_t*)tt1); // c0 = (a0+a1)(a0-a1) + fp2sqr_c1_mont(a[0], c[1]); // c1 = 2a0*a1 + fpcopy((digit_t*)tt1, c[0]); +#else felm_t t1, t2, t3; mp_addfast(a[0], a[1], t1); // t1 = a0+a1 @@ -264,6 +283,7 @@ void fp2sqr_mont(const f2elm_t a, f2elm_t c) mp_addfast(a[0], a[0], t3); // t3 = 2a0 fpmul_mont(t1, t2, c[0]); // c0 = (a0+a1)(a0-a1) fpmul_mont(t3, a[1], c[1]); // c1 = 2a0*a1 +#endif } @@ -280,7 +300,7 @@ inline unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const inline static void mp_subaddfast(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction followed by addition with p*2^MAXBITS_FIELD, c = a-b+(p*2^MAXBITS_FIELD) if a-b < 0, otherwise c=a-b. 
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) +#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) felm_t t1; digit_t mask = 0 - (digit_t)mp_sub(a, b, c, 2*NWORDS_FIELD); @@ -288,7 +308,7 @@ inline static void mp_subaddfast(const digit_t* a, const digit_t* b, digit_t* c) t1[i] = ((digit_t*)PRIME)[i] & mask; mp_addfast((digit_t*)&c[NWORDS_FIELD], t1, (digit_t*)&c[NWORDS_FIELD]); -#elif (OS_TARGET == OS_NIX) +#elif (OS_TARGET == OS_NIX) && (TARGET == TARGET_ARM64 || NBITS_FIELD == 751) mp_subaddx2_asm(a, b, c); @@ -298,12 +318,12 @@ inline static void mp_subaddfast(const digit_t* a, const digit_t* b, digit_t* c) inline static void mp_dblsubfast(const digit_t* a, const digit_t* b, digit_t* c) { // Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. -#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) +#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) mp_sub(c, a, c, 2*NWORDS_FIELD); mp_sub(c, b, c, 2*NWORDS_FIELD); -#elif (OS_TARGET == OS_NIX) +#elif (OS_TARGET == OS_NIX) && (TARGET == TARGET_ARM64 || NBITS_FIELD == 751) mp_dblsubx2_asm(a, b, c); @@ -315,6 +335,13 @@ void fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c) { // GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2). 
// Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1] // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] +#if defined(_MULX_) && defined(_ADX_) && (OS_TARGET == OS_NIX) && (NBITS_FIELD != 751) + felm_t t1; + + fp2mul_c0_mont(a[0], b[0], t1); // c0 = a0*b0 - a1*b1 + fp2mul_c1_mont(a[0], b[0], c[1]); // c1 = a0*b1 + a1*b0 + fpcopy(t1, c[0]); +#else felm_t t1, t2; dfelm_t tt1, tt2, tt3; @@ -325,8 +352,9 @@ void fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c) mp_mul(t1, t2, tt3, NWORDS_FIELD); // tt3 = (a0+a1)*(b0+b1) mp_dblsubfast(tt1, tt2, tt3); // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 mp_subaddfast(tt1, tt2, tt1); // tt1 = a0*b0 - a1*b1 + p*2^MAXBITS_FIELD if a0*b0 - a1*b1 < 0, else tt1 = a0*b0 - a1*b1 - rdc_mont(tt3, c[1]); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 - rdc_mont(tt1, c[0]); // c[0] = a0*b0 - a1*b1 + rdc_mont(tt3, c[1]); // c1 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 + rdc_mont(tt1, c[0]); // c0 = a0*b0 - a1*b1 +#endif } diff --git a/src/internal.h b/src/internal.h index 924f246..0e46f3c 100644 --- a/src/internal.h +++ b/src/internal.h @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: internal header file for function definitions *********************************************************************************************/ diff --git a/src/random/random.c b/src/random/random.c index 028acbe..b3b99b5 100644 --- a/src/random/random.c +++ b/src/random/random.c @@ -21,7 +21,7 @@ static inline void delay(unsigned int count) { - while (count--) {} + while (count>0) { count--; } } diff --git a/src/sidh.c b/src/sidh.c index b6661f6..133f9c2 100644 --- a/src/sidh.c +++ b/src/sidh.c @@ -1,5 +1,9 @@ 
/******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) *********************************************************************************************/ diff --git a/src/sike.c b/src/sike.c index 36d7293..3c78806 100644 --- a/src/sike.c +++ b/src/sike.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: supersingular isogeny key encapsulation (SIKE) protocol *********************************************************************************************/ diff --git a/tests/arith_tests-p434.c b/tests/arith_tests-p434.c index c3fd2a3..c812c68 100644 --- a/tests/arith_tests-p434.c +++ b/tests/arith_tests-p434.c @@ -1,5 +1,9 @@ /******************************************************************************************** * SIDH: an efficient supersingular isogeny cryptography library +* Copyright (c) Microsoft Corporation +* +* Website: https://github.com/microsoft/PQCrypto-SIDH +* Released under MIT license * * Abstract: testing code for field arithmetic, elliptic curve and isogeny functions *********************************************************************************************/ @@ -12,12 +16,12 @@ // Benchmark and test parameters -#if defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) +#if defined(GENERIC_IMPLEMENTATION) || (OS_TARGET == OS_WIN) || (TARGET == TARGET_ARM) #define BENCH_LOOPS 100 // Number of iterations per bench #define SMALL_BENCH_LOOPS 100 // Number of iterations per bench 
#define TEST_LOOPS 10 // Number of iterations per test #else - #define BENCH_LOOPS 100000 + #define BENCH_LOOPS 1000000 #define SMALL_BENCH_LOOPS 10000 #define TEST_LOOPS 100 #endif @@ -350,7 +354,6 @@ bool fp_run() int n; unsigned long long cycles, cycles1, cycles2; felm_t a, b, c; - dfelm_t aa; printf("\n--------------------------------------------------------------------------------------------------------\n\n"); printf("Benchmarking field arithmetic over GF(p434): \n\n"); @@ -393,20 +396,6 @@ bool fp_run() printf(" GF(p) multiplication runs in .................................... %7lld ", cycles/BENCH_LOOPS); print_unit; printf("\n"); - // GF(p) reduction using p434 - cycles = 0; - for (n=0; n