diff --git a/CMakeLists.txt b/CMakeLists.txt index 22b78b2..c771917 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,9 @@ if(WIN32) else() if(NOT SYMCRYPT_TARGET_ENV MATCHES "Generic") enable_language(ASM) + # Suppress noisy warnings about compile options which are ignored for ASM + # Less messy than restricting most of the below options to only C/CXX! + add_compile_options($<$<COMPILE_LANGUAGE:ASM>:-Wno-unused-command-line-argument>) endif() # add_compile_options(-Wall) # add_compile_options(-Wno-unknown-pragmas) @@ -76,6 +79,12 @@ else() # Avoids error: cast from pointer to smaller type 'uintptr_t' when including from aarch64-linux-gnu add_compile_options(-fms-extensions) + # GCC and clang unroll more aggressively than they should for best performance + # When we want to unroll loops, we unroll in the source code, so tell the compiler not to unroll + # (clang seems to respect this option globally, but I could only make GCC behave in AES-GCM by + # using GCC-specific pragmas for the loops of interest) + add_compile_options(-fno-unroll-loops) + # In Sanitize version, enable sanitizers if (CMAKE_BUILD_TYPE MATCHES Sanitize) add_compile_options(-fsanitize=address) @@ -120,12 +129,6 @@ else() add_link_options(-fsanitize=vptr) add_link_options(-fno-sanitize-recover=all) endif() - - # GCC and clang unroll more aggressively than they should for best performance - # When we want to unroll loops, we unroll in the source code, so tell the compiler not to unroll - # (clang seems to respect this option globally, but I could only make GCC behave in AES-GCM by - # using GCC-specific pragmas for the loops of interest) - add_compile_options(-fno-unroll-loops) endif() if(CMAKE_BUILD_TYPE MATCHES Release) diff --git a/cmake-toolchain/LinuxUserMode-AMD64.cmake b/cmake-toolchain/LinuxUserMode-AMD64.cmake index b866943..4028aee 100644 --- a/cmake-toolchain/LinuxUserMode-AMD64.cmake +++ b/cmake-toolchain/LinuxUserMode-AMD64.cmake @@ -1,5 +1,5 @@ # This toolchain file configures CMake options 
for Linux User Mode AMD64 compilation with CPU optimizations. -# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/LinuxUserMode-AMD64.cmake +# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/LinuxUserMode-AMD64.cmake" # Set CMake variables that subsequent CMake scripts can check against set(CMAKE_SYSTEM_NAME Linux) diff --git a/cmake-toolchain/LinuxUserMode-ARM64.cmake b/cmake-toolchain/LinuxUserMode-ARM64.cmake index 1ed03b0..a980804 100644 --- a/cmake-toolchain/LinuxUserMode-ARM64.cmake +++ b/cmake-toolchain/LinuxUserMode-ARM64.cmake @@ -1,5 +1,5 @@ # This toolchain file configures CMake options for Linux User Mode ARM64 compilation with CPU optimizations. -# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/LinuxUserMode-ARM64.cmake +# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/LinuxUserMode-ARM64.cmake" # Set CMake variables that subsequent CMake scripts can check against set(CMAKE_SYSTEM_NAME Linux) @@ -8,13 +8,14 @@ set(CMAKE_SYSTEM_PROCESSOR ARM64) set(TARGET_TRIPLE aarch64-linux-gnu) # Currently only use clang as it makes cross-compilation easier +set(CMAKE_ASM_COMPILER_TARGET ${TARGET_TRIPLE}) set(CMAKE_C_COMPILER clang) set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE}) set(CMAKE_CXX_COMPILER clang++) set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE}) # Point clang sysroot to cross compilation toolchain when cross compiling -if(NOT CMAKE_HOST_SYSTEM_PROCESSOR EQUAL CMAKE_SYSTEM_PROCESSOR) +if(NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES ARM64|aarch64) # C/C++ toolchain (installed on Ubuntu using apt-get gcc-aarch64-linux-gnu g++-aarch64-linux-gnu) set(CMAKE_SYSROOT_COMPILE /usr/${TARGET_TRIPLE}) diff --git a/cmake-toolchain/WindowsUserMode-AMD64.cmake b/cmake-toolchain/WindowsUserMode-AMD64.cmake index 9617ca8..ea68e76 100644 --- a/cmake-toolchain/WindowsUserMode-AMD64.cmake +++ b/cmake-toolchain/WindowsUserMode-AMD64.cmake @@ -1,5 +1,5 @@ # 
This toolchain file configures CMake options for Windows User Mode AMD64 compilation with CPU optimizations. -# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/WindowsUserMode-AMD64.cmake +# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/WindowsUserMode-AMD64.cmake" # Set CMake variables that subsequent CMake scripts can check against set(CMAKE_SYSTEM_NAME Windows) diff --git a/cmake-toolchain/WindowsUserMode-X86.cmake b/cmake-toolchain/WindowsUserMode-X86.cmake index fd3c7c7..7da26e9 100644 --- a/cmake-toolchain/WindowsUserMode-X86.cmake +++ b/cmake-toolchain/WindowsUserMode-X86.cmake @@ -1,5 +1,5 @@ # This toolchain file configures CMake options for Windows User Mode x86 compilation with CPU optimizations. -# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/WindowsUserMode-X86.cmake -A Win32 +# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/WindowsUserMode-X86.cmake" -A Win32 # # (The "-A Win32" option seems to be required when compiling on a 64-bit host. 
Ideally this toolchain file # should set all the required options, but I haven't figured out how to force 32-bit compilation from the diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 0062f27..ac286cc 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -105,7 +105,7 @@ function(process_cppasm filepath outformat archdefine) if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm)) message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})") endif() - if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86)) + if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64)) message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})") endif() get_filename_component(rootpath ${filepath} DIRECTORY) @@ -151,14 +151,20 @@ function(process_cppasm filepath outformat archdefine) endif() endfunction() -function(process_symcryptasm filepath outformat archdefine) +function(process_symcryptasm filepath outformat archdefine callingconvention) get_filename_component(fileextension ${filepath} EXT) if(NOT fileextension STREQUAL .symcryptasm) message(FATAL_ERROR "symcryptasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})") endif() - if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm)) + if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm) AND (NOT outformat STREQUAL armasm64)) message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})") endif() + if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64)) + message(FATAL_ERROR "symcryptasm processing invoked with unrecognized archdefine (${archdefine})") + endif() + if((NOT callingconvention STREQUAL msft) AND (NOT callingconvention STREQUAL systemv) AND (NOT callingconvention STREQUAL aapcs64) AND (NOT callingconvention STREQUAL arm64ec)) + 
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized callingconvention (${callingconvention})") + endif() get_filename_component(rootpath ${filepath} DIRECTORY) get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath}) @@ -168,7 +174,7 @@ function(process_symcryptasm filepath outformat archdefine) add_custom_command( OUTPUT ${output_cppasm} COMMAND ${CMAKE_COMMAND} -E make_directory ${output_directory} - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${filepath} ${output_cppasm} + COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${archdefine} ${callingconvention} ${filepath} ${output_cppasm} MAIN_DEPENDENCY ${filepath} DEPENDS ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py COMMENT "Python preprocessing ${filepath} to ${outformat} (${output_cppasm})" @@ -183,19 +189,15 @@ else() if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") list(APPEND SOURCES_COMMON linux/intrinsics.c) endif() - - if(CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64") - list(APPEND SOURCES_COMMON linux/asmstubs.c) - endif() endif() if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic")) if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") - process_symcryptasm(amd64/aesasm.symcryptasm masm amd64) - process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64) - process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64) - process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64) - process_symcryptasm(amd64/wipe.symcryptasm masm amd64) + process_symcryptasm(amd64/aesasm.symcryptasm masm amd64 msft) + process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64 msft) + process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64 msft) + process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64 msft) + process_symcryptasm(amd64/wipe.symcryptasm masm amd64 msft) list(APPEND SOURCES_COMMON amd64/aesasm-masm.asm @@ -229,11 +231,11 @@ if(WIN32 AND 
NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic")) endif() elseif(NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic")) if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") - process_symcryptasm(amd64/aesasm.symcryptasm gas amd64) - process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64) - process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64) - process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64) - process_symcryptasm(amd64/wipe.symcryptasm gas amd64) + process_symcryptasm(amd64/aesasm.symcryptasm gas amd64 systemv) + process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64 systemv) + process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64 systemv) + process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64 systemv) + process_symcryptasm(amd64/wipe.symcryptasm gas amd64 systemv) list(APPEND SOURCES_COMMON amd64/aesasm-gas.asm @@ -248,6 +250,20 @@ elseif(NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic")) amd64/fdef_mulx-gas.asm amd64/wipe-gas.asm PROPERTY LANGUAGE ASM) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64") + process_symcryptasm(arm64/fdef_asm.symcryptasm gas arm64 aapcs64) + process_symcryptasm(arm64/fdef369_asm.symcryptasm gas arm64 aapcs64) + process_symcryptasm(arm64/wipe.symcryptasm gas arm64 aapcs64) + + list(APPEND SOURCES_COMMON + arm64/fdef_asm-gas.asm + arm64/fdef369_asm-gas.asm + arm64/wipe-gas.asm) + set_source_files_properties( + arm64/fdef_asm-gas.asm + arm64/fdef369_asm-gas.asm + arm64/wipe-gas.asm + PROPERTY LANGUAGE ASM) endif() endif() diff --git a/lib/a_dispatch.c b/lib/a_dispatch.c index 55eb8a7..a37c89b 100644 --- a/lib/a_dispatch.c +++ b/lib/a_dispatch.c @@ -31,7 +31,7 @@ const SYMCRYPT_MODULAR_FUNCTIONS g_SymCryptModFns[] = { SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY1024, // Special faster code for 1024-bit Montgomery moduli SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY_MULX1024, // Special faster code for 1024-bit Montgomery moduli, MULX-based code -#elif SYMCRYPT_CPU_ARM64 && SYMCRYPT_MS_VC +#elif SYMCRYPT_CPU_ARM64 
SYMCRYPT_MOD_FUNCTIONS_FDEF369_MONTGOMERY, {NULL,}, @@ -68,7 +68,7 @@ const SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] = {('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 0, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, {('1M' << 16) + SymCryptModFntableMontgomery1024, 0, 1024, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, -#elif SYMCRYPT_CPU_ARM64 && SYMCRYPT_MS_VC +#elif SYMCRYPT_CPU_ARM64 {('mM' << 16) + SymCryptModFntableMontgomery, 0, 256, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, {('9M' << 16) + SymCryptModFntable369Montgomery, 0, 384, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, diff --git a/lib/arm64/fdef369_asm.asm b/lib/arm64/fdef369_asm.asm deleted file mode 100644 index a12a8fe..0000000 --- a/lib/arm64/fdef369_asm.asm +++ /dev/null @@ -1,472 +0,0 @@ -; -; fdef_369asm.asm Assembler code for large integer arithmetic in the default data format -; -; This file contains alternative routines that pretend that each digit is only 3 words. -; This gets used if the number is 1, 2, 3, 5, 6, or 9 digits long. -; The immediate advantage is that it improves EC performance on 192, 384, and 521-bit curves. -; -; Most of this code is a direct copy of the default code. -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; - -#include "ksarm64.h" - -; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants -; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm. 
-#define SYMCRYPT_MASM -#include "C_asm_shared.inc" -#undef SYMCRYPT_MASM - -#include "symcrypt_version.inc" -#include "symcrypt_name_mangling.inc" -#include "symcrypt_magic.inc" - -; A digit consists of 3 words of 64 bits each - -;UINT32 -;SYMCRYPT_CALL -; SymCryptFdef369RawAdd( -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1, -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2, -; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst, -; UINT32 nDigits ); -; -; Initial inputs to registers: -; pSrc1 -> x0 -; pSrc2 -> x1 -; pDst -> x2 -; nDigits -> x3 - - LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm) - - neg x3, x3 ; negate the digit count - ands x4, x4, x4 ; Zero the carry flag - -SymCryptFdef369RawAddAsmLoop - add x3, x3, #1 ; Increment the digit count by one - ; carry is in the carry flag - - ldp x4, x6, [x0], #16 ; Load two words of pSrc1 - ldp x5, x7, [x1], #16 ; Load two words of pSrc2 - adcs x4, x4, x5 - adcs x6, x6, x7 - stp x4, x6, [x2], #16 ; Store the result in the destination - - ldr x4, [x0], #8 - ldr x5, [x1], #8 - adcs x4, x4, x5 - str x4, [x2], #8 - - cbnz x3, SymCryptFdef369RawAddAsmLoop - - csetcs x0 ; Set the return value equal to the carry - - ret - - LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm) - -;UINT32 -;SYMCRYPT_CALL -;SymCryptFdef369RawSub( -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, -; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, -; UINT32 nDigits ) -; -; Initial inputs to registers: -; pSrc1 -> x0 -; pSrc2 -> x1 -; pDst -> x2 -; nDigits -> x3 - - LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm) - - neg x3, x3 ; negate the digit count - subs x4, x4, x4 ; Set the carry flag (i.e. 
no borrow) - -SymCryptFdef369RawSubAsmLoop - add x3, x3, #1 ; Increment the digit count by one - ; borrow is in the carry flag (flipped) - - ldp x4, x6, [x0], #16 ; Load two words of pSrc1 - ldp x5, x7, [x1], #16 ; Load two words of pSrc2 - sbcs x4, x4, x5 - sbcs x6, x6, x7 - stp x4, x6, [x2], #16 ; Store the result in the destination - - ldr x4, [x0], #8 - ldr x5, [x1], #8 - sbcs x4, x4, x5 - str x4, [x2], #8 - - cbnz x3, SymCryptFdef369RawSubAsmLoop - - csetcc x0 ; If the carry is clear (borrow), set the return value to 1 - - ret - - LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm) - -;VOID -;SYMCRYPT_CALL -;SymCryptFdef369MaskedCopy( -; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, -; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, -; UINT32 nDigits, -; UINT32 mask ) - - LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm) - - neg x2, x2 ; negate the digit count - subs x4, XZR, x3 ; If (x3 > 0) clear the carry flag (i.e. borrow) - -SymCryptFdef369MaskedCopyAsmLoop - add x2, x2, #1 ; Increment the digit count by one - - ldp x4, x6, [x0], #16 ; Load two words of the source - ldp x5, x7, [x1] ; Load two words of the destination - cselcc x4, x4, x5 ; If the carry is clear, select the source operands - cselcc x6, x6, x7 - stp x4, x6, [x1], #16 ; Store the two words in the destination - - ldr x4, [x0], #8 - ldr x5, [x1] - cselcc x4, x4, x5 - str x4, [x1], #8 - - cbnz x2, SymCryptFdef369MaskedCopyAsmLoop - - ; Done, no return value - - ret - - LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm) - -;VOID -;SYMCRYPT_CALL -;SymCryptFdef369RawMul( -; _In_reads_(nWords1) PCUINT32 pSrc1, -; UINT32 nDigits1, -; _In_reads_(nWords2) PCUINT32 pSrc2, -; UINT32 nDigits2, -; _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) -; -; Initial inputs to registers: -; pSrc1 -> x0 -; nDigits1 -> x1 -; pSrc2 -> x2 -; nDigits2 -> x3 -; pDst -> x4 -; -; Basic structure: -; for each word in Src1: -; Dst += Src2 * word -; -; 
Register assignments -; x0 = pSrc1 (moving forward one word every outer loop) -; x1 = negated word count of pSrc1 -; x2 = pSrc2 (moving forward one *digit* every inner loop) -; x3 = negated digit count of pSrc2 and pDst -; x4 = pDst (moving forward one *digit* every inner loop) -; x5 = Stored pDst (moving forward one word every outer loop) -; x6 = Current word loaded from pSrc1 -; x8, x9 = Current words loaded in pairs from pSrc2 -; x10, x11 = Current words loaded in pairs from pDst -; x12, x15 = "128-bit" sliding register to hold the result of multiplies -; x16 = Stored pSrc2 -; x17 = Stored negated digit count of pSrc2 -; Note x13, x14 are reserved in ARM64EC and thus are not used - - - LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm) - - add x1, x1, x1, LSL #1 ; Calculate word count (x1 * 3) - - neg x1, x1 ; negate nWords1 - neg x3, x3 ; negate nDigits2 - - mov x5, x4 ; store pDst - mov x16, x2 ; store pSrc2 - mov x17, x3 ; store -nDigits2 for later - - ; - ; First iteration of main loop (no adding of previous values from pDst) - ; - ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0 - ldr x6, [x0] ; load the first word from pSrc1 - -SymCryptFdef369RawMulAsmLoopInner1 - add x3, x3, #1 ; move one digit up - - ldp x8, x9, [x2], #16 ; load 2 words from pSrc2 - - mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j] - str x12, [x4], #8 ; Store to destination - - mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+1] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+1] - str x12, [x4], #8 ; Store to destination - - ldr x8, [x2], #8 - - mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j+2] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition 
it is added) - umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2] - str x12, [x4], #8 ; Store to destination - - cbnz x3, SymCryptFdef369RawMulAsmLoopInner1 - - adc x15, x15, XZR ; Store the next word into the destination (with the carry if any) - str x15, [x4] - - add x1, x1, #1 ; move one word up - add x0, x0, #8 ; move start of pSrc1 one word up - add x5, x5, #8 ; move start of pDst one word up - - ; - ; MAIN LOOP - ; -SymCryptFdef369RawMulAsmLoopOuter - mov x3, x17 ; set -nDigits2 - mov x2, x16 ; set pSrc2 - mov x4, x5 ; set pDst - - ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0 - ldr x6, [x0] ; load the next word from pSrc1 - -SymCryptFdef369RawMulAsmLoopInner - add x3, x3, #1 ; move one digit up - - ldp x8, x9, [x2], #16 ; load 2 words from pSrc2 - ldp x10, x11, [x4] ; load 2 words from pDst - - mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j] - adc x15, x15, XZR ; Add the carry if any and don't update the flags - ; Note: this cannot overflow as the maximum for is (2^64-1)(2^64-1)+(2^64-1)+1 = 2^128 - 2^64 + 1 - adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow) - str x12, [x4], #8 ; Store to destination - - mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+1] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+1] - adc x15, x15, XZR ; Add the carry if any and don't update the flags - adds x12, x12, x11 ; add the word from the destination and update the flags (this can overflow) - str x12, [x4], #8 ; Store to destination - - ldr x8, [x2], #8 - ldr x10, [x4] - - mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j+2] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - 
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j+2] - adc x15, x15, XZR ; Add the carry if any and don't update the flags - adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow) - str x12, [x4], #8 ; Store to destination - - cbnz x3, SymCryptFdef369RawMulAsmLoopInner - - adc x15, x15, XZR ; Store the next word into the destination (with the carry if any) - str x15, [x4] - - adds x1, x1, #1 ; move one word up - add x0, x0, #8 ; move start of pSrc1 one word up - add x5, x5, #8 ; move start of pDst one word up - - bne SymCryptFdef369RawMulAsmLoopOuter - - ; Done, no return value - - ret - - LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm) - -;VOID -;SymCryptFdef369MontgomeryReduceAsm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) -; -; Initial inputs to registers: -; pmMod -> x0 -; pSrc -> x1 -; pDst -> x2 -; -; Register assignments -; x0 = pMod (moving forward one *digit* every inner loop) -; x1 = pSrc (moving forward one *digit* every inner loop) -; x2 = pDst (used only in the end for subtract / result) -; x3 = negated digit count of pSrc and pMod -; x4 = negated word count of pSrc -; x5 = Inv64 of the modulus -; x6 = m = pSrc[i]*Inv64 -; x7 = hc = high carry variable -; x8, x9 = Current words loaded in pairs from pSrc -; x10, x11 = Current words loaded in pairs from pMod -; x12, x15 = c variable = "128-bit" sliding register to hold the result of multiplies -; x16 = Temporary intermediate result -; x17 = Stored negated digit count of pSrc -; x19 = Stored pMod pointer -; x20 = Stored pSrc pointer (moving forward one word every outer loop) -; Note x13, x14 are reserved in ARM64EC and thus are not used - - NESTED_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm) - PROLOG_SAVE_REG_PAIR fp, lr, #-32! 
; allocate 32 bytes of stack; store FP/LR - PROLOG_SAVE_REG_PAIR x19, x20, #16 ; free up x19/x20 - - ldr w3, [x0, #SymCryptModulusNdigitsOffsetArm64] ; # of Digits - ldr x5, [x0, #SymCryptModulusMontgomeryInv64OffsetArm64] ; Inv64 of modulus - add x0, x0, #SymCryptModulusValueOffsetArm64 ; pMod - - add x4, x3, x3, LSL #1 ; Calculate word count (x3 * 3) - - neg x3, x3 ; Negate the digit count - neg x4, x4 ; Negate the word count - - mov x17, x3 ; Store the digit count for later - mov x19, x0 ; Store the pMod pointer - mov x20, x1 ; Store the pSrc pointer - - ands x7, x7, XZR ; Set hc to 0 - - ; - ; Main loop - ; -SymCryptFdef369MontgomeryReduceAsmOuter - ldr x8, [x1] ; Load 1 word from pSrc - mul x6, x8, x5 ; <63:0> bits of pSrc[i]*Inv64 = m - - ands x12, x12, XZR ; Set c to 0 - ands x15, x15, XZR ; Set c to 0 - -SymCryptFdef369MontgomeryReduceAsmInner - ldp x10, x11, [x0], #16 ; pMod[j] - ldp x8, x9, [x1] ; pSrc[j] - - mul x16, x6, x10 ; <63:0> of pMod[j]*m - adds x16, x16, x8 ; Adding pSrc[j] - umulh x15, x6, x10 ; <127:64> of pMod[j]*m - adc x15, x15, XZR ; Add the carry if any (***) - adds x12, x12, x16 ; Add the lower bits of c - adc x15, x15, XZR ; Add the carry if any (***) - ; ***: These cannot produce extra carry as the maximum is - ; (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1 - str x12, [x1], #8 ; pSrc[j] = (UINT64) c - mov x12, x15 ; c >>= 64 - - mul x16, x6, x11 ; <63:0> of pMod[j]*m - adds x16, x16, x9 ; Adding pSrc[j] - umulh x15, x6, x11 ; <127:64> of pMod[j]*m - adc x15, x15, XZR ; Add the carry if any (***) - adds x12, x12, x16 ; Add the lower bits of c - adc x15, x15, XZR ; Add the carry if any (***) - str x12, [x1], #8 ; pSrc[j] = (UINT64) c - mov x12, x15 ; c >>= 64 - - ldr x10, [x0], #8 ; pMod[j] - ldr x8, [x1] ; pSrc[j] - - mul x16, x6, x10 ; <63:0> of pMod[j]*m - adds x16, x16, x8 ; Adding pSrc[j] - umulh x15, x6, x10 ; <127:64> of pMod[j]*m - adc x15, x15, XZR ; Add the carry if any (***) - adds x12, x12, x16 ; Add the lower bits of c 
- adc x15, x15, XZR ; Add the carry if any (***) - str x12, [x1], #8 ; pSrc[j] = (UINT64) c - mov x12, x15 ; c >>= 64 - - adds x3, x3, #1 ; Move one digit up - bne SymCryptFdef369MontgomeryReduceAsmInner - - ldr x8, [x1] ; pSrc[nWords] - adds x12, x12, x8 ; c + pSrc[nWords] - adc x15, XZR, XZR ; Add the carry if any - - adds x12, x12, x7 ; c + pSrc[nWords] + hc - adc x7, x15, XZR ; Add the carry if any and store into hc - - str x12, [x1] ; pSrc[nWords] = c - - adds x4, x4, #1 ; Move one word up - - add x20, x20, #8 ; Move stored pSrc pointer one word up - mov x0, x19 ; Restore pMod pointer - mov x1, x20 ; Restore pSrc pointer - - mov x3, x17 ; Restore the digit counter - - bne SymCryptFdef369MontgomeryReduceAsmOuter - - ; - ; Subtraction - ; - - mov x16, x2 ; Store pDst pointer - - ; Prepare the pointers for subtract - mov x0, x20 ; pSrc - mov x1, x19 ; pMod - - mov x10, x7 ; x10 = hc - mov x3, x17 ; Restore the digit counter - subs x4, x4, x4 ; Set the carry flag (i.e. no borrow) - -SymCryptFdef369MontgomeryReduceRawSubAsmLoop - add x3, x3, #1 ; Increment the digit count by one - ; borrow is in the carry flag (flipped) - - ldp x4, x6, [x0], #16 ; Load two words of pSrc1 - ldp x5, x7, [x1], #16 ; Load two words of pSrc2 - sbcs x4, x4, x5 - sbcs x6, x6, x7 - stp x4, x6, [x2], #16 ; Store the result in the destination - - ldr x4, [x0], #8 - ldr x5, [x1], #8 - sbcs x4, x4, x5 - str x4, [x2], #8 - - cbnz x3, SymCryptFdef369MontgomeryReduceRawSubAsmLoop - - csetcc x0 ; If the carry is clear (borrow), set the return value to 1 - - orr x11, x10, x0 ; x11 = hc|d - - ; Prepare the pointers for masked copy - mov x0, x20 ; pSrc - mov x1, x16 ; pDst - - mov x2, x17 ; Restore the digit counter - subs x4, x10, x11 ; If (x11 > x10) clear the carry flag (i.e. 
borrow) - -SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop - add x2, x2, #1 ; Increment the digit count by one - - ldp x4, x6, [x0], #16 ; Load two words of the source - ldp x5, x7, [x1] ; Load two words of the destination - cselcc x4, x4, x5 ; If the carry is clear, select the source operands - cselcc x6, x6, x7 - stp x4, x6, [x1], #16 ; Store the two words in the destination - - ldr x4, [x0], #8 - ldr x5, [x1] - cselcc x4, x4, x5 - str x4, [x1], #8 - - cbnz x2, SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop - - ; Done, no return value - - EPILOG_RESTORE_REG_PAIR x19, x20, #16 - EPILOG_RESTORE_REG_PAIR fp, lr, #32! - EPILOG_RETURN - - NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm) - - END - diff --git a/lib/arm64/fdef369_asm.symcryptasm b/lib/arm64/fdef369_asm.symcryptasm new file mode 100644 index 0000000..1e1b213 --- /dev/null +++ b/lib/arm64/fdef369_asm.symcryptasm @@ -0,0 +1,465 @@ +// +// fdef369_asm.symcryptasm Assembler code for large integer arithmetic in the default data format +// Expresses asm in a generic enough way to enable generation of MASM and GAS using the +// symcryptasm_processor.py script and C preprocessor +// +// This file contains alternative routines that pretend that each digit is only 3 words. +// This gets used if the number is 1, 2, 3, 5, 6, or 9 digits long. +// The immediate advantage is that it improves EC performance on 192, 384, and 521-bit curves. +// +// Most of this code is a direct copy of the default code. +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. 
+// + +#include "symcryptasm_shared.cppasm" + +// A digit consists of 3 words of 64 bits each + +//UINT32 +//SYMCRYPT_CALL +//SymCryptFdef369RawAddAsm( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, +// UINT32 nDigits ) + +FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm), 4, 8) + + ldp X_4, X_6, [X_0] // Load two words of pSrc1 + ldp X_5, X_7, [X_1] // Load two words of pSrc2 + adds X_4, X_4, X_5 + adcs X_6, X_6, X_7 + stp X_4, X_6, [X_2] // Store the result in the destination + + ldr X_4, [X_0, #16] // Load one word of pSrc1 + sub X_3, X_3, #1 // Decrement the digit count by one + ldr X_5, [X_1, #16] // Load one word of pSrc2 + adcs X_4, X_4, X_5 + str X_4, [X_2, #16] // Store the result in the destination + + cbz X_3, SymCryptFdef369RawAddAsmEnd + +LABEL(SymCryptFdef369RawAddAsmLoop) + // carry is in the carry flag + // only update pointers to srcs and destination once per loop to reduce uops and dependencies + ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1 + ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2 + adcs X_4, X_4, X_5 + adcs X_6, X_6, X_7 + stp X_4, X_6, [X_2, #24]! 
// Store the result in the destination + + ldr X_4, [X_0, #16] // Load one word of pSrc1 + sub X_3, X_3, #1 // Decrement the digit count by one + ldr X_5, [X_1, #16] // Load one word of pSrc2 + adcs X_4, X_4, X_5 + str X_4, [X_2, #16] // Store the result in the destination + + cbnz X_3, SymCryptFdef369RawAddAsmLoop + + ALIGN(4) +LABEL(SymCryptFdef369RawAddAsmEnd) + cset X_0, cs // Set the return value equal to the carry + +FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm)) + +//UINT32 +//SYMCRYPT_CALL +//SymCryptFdef369RawSubAsm( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst, +// UINT32 nDigits ) + +FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm), 4, 8) + + ldp X_4, X_6, [X_0] // Load two words of pSrc1 + ldp X_5, X_7, [X_1] // Load two words of pSrc2 + subs X_4, X_4, X_5 + sbcs X_6, X_6, X_7 + stp X_4, X_6, [X_2] // Store the result in the destination + + ldr X_4, [X_0, #16] // Load one word of pSrc1 + sub X_3, X_3, #1 // Decrement the digit count by one + ldr X_5, [X_1, #16] // Load one word of pSrc2 + sbcs X_4, X_4, X_5 + str X_4, [X_2, #16] // Store the result in the destination + + cbz X_3, SymCryptFdef369RawSubAsmEnd + +LABEL(SymCryptFdef369RawSubAsmLoop) + // borrow is in the carry flag (flipped) + // only update pointers to srcs and destination once per loop to reduce uops and dependencies + ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1 + ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2 + sbcs X_4, X_4, X_5 + sbcs X_6, X_6, X_7 + stp X_4, X_6, [X_2, #24]! 
// Store the result in the destination + + ldr X_4, [X_0, #16] // Load one word of pSrc1 + sub X_3, X_3, #1 // Decrement the digit count by one + ldr X_5, [X_1, #16] // Load one word of pSrc2 + sbcs X_4, X_4, X_5 + str X_4, [X_2, #16] // Store the result in the destination + + cbnz X_3, SymCryptFdef369RawSubAsmLoop + + ALIGN(4) +LABEL(SymCryptFdef369RawSubAsmEnd) + cset X_0, cc // If the carry is clear (borrow), set the return value to 1 + +FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm)) + +//VOID +//SYMCRYPT_CALL +//SymCryptFdef369MaskedCopyAsm( +// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, +// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, +// UINT32 nDigits, +// UINT32 mask ) + +FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm), 4, 7) + + subs xzr, xzr, X_3 // If (X_3 > 0) clear the carry flag (i.e. borrow) + + ldp X_3, X_5, [X_0] // Load two words of the source + ldp X_4, X_6, [X_1] // Load two words of the destination + csel X_3, X_3, X_4, cc // If the carry is clear, select the source operand + csel X_5, X_5, X_6, cc + stp X_3, X_5, [X_1] // Store the two words in the destination + + ldr X_3, [X_0, #16] // Load one word of the source + sub X_2, X_2, #1 // Decrement the digit count by one + ldr X_4, [X_1, #16] // Load one word of the destination + csel X_3, X_3, X_4, cc + str X_3, [X_1, #16] // Store the one word in the destination + + cbz X_2, SymCryptFdef369MaskedCopyAsmEnd + +LABEL(SymCryptFdef369MaskedCopyAsmLoop) + ldp X_3, X_5, [X_0, #24]! // Load two words of the source + ldp X_4, X_6, [X_1, #24]! 
// Load two words of the destination + csel X_3, X_3, X_4, cc // If the carry is clear, select the source operand + csel X_5, X_5, X_6, cc + stp X_3, X_5, [X_1] // Store the two words in the destination + + ldr X_3, [X_0, #16] // Load one word of the source + sub X_2, X_2, #1 // Decrement the digit count by one + ldr X_4, [X_1, #16] // Load one word of the destination + csel X_3, X_3, X_4, cc + str X_3, [X_1, #16] // Store the one word in the destination + + cbnz X_2, SymCryptFdef369MaskedCopyAsmLoop + +LABEL(SymCryptFdef369MaskedCopyAsmEnd) + // Done, no return value + +FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm)) + +//VOID +//SYMCRYPT_CALL +//SymCryptFdef369RawMulAsm( +// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// UINT32 nDigits1, +// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits2, +// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) +// +// Basic structure: +// for each word in Src1: +// Dst += Src2 * word +// +// Register assignments +// X_0 = pSrc1 (moving forward one word every outer loop) +// X_1 = word count of pSrc1 +// X_2 = pSrc2 (moving forward one *digit* every inner loop) +// X_3 = digit count of pSrc2 and pDst +// X_4 = pDst (moving forward one *digit* every inner loop) +// X_5 = Stored pDst (moving forward one word every outer loop) +// X_6 = Current word loaded from pSrc1 +// X_7, X_8 = Current words loaded in pairs from pSrc2 +// X_9, X_10 = Current words loaded in pairs from pDst +// X_11, X_12 = Scratch registers for holding the results of multiplies +// X_13 = Stored pSrc2 +// X_14 = Stored digit count of pSrc2 +// X_15 = Scratch register for holding the results of multiplies + +FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm), 5, 16) + + add X_1, X_1, X_1, LSL #1 // Calculate word count (X_1 * 3) + + sub X_2, X_2, #24 // offset pSrc2 so we can use pre-increment form of loads + sub X_4, X_4, #24 // offset pDst so we can use 
pre-increment form of loads + + mov X_5, X_4 // store pDst + mov X_13, X_2 // store pSrc2 + mov X_14, X_3 // store nDigits2 for later + + // + // First iteration of main loop (no adding of previous values from pDst) + // + ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0 + ldr X_6, [X_0] // load the first word from pSrc1 + +LABEL(SymCryptFdef369RawMulAsmLoopInner1) + sub X_3, X_3, #1 // move one digit up + + ldp X_7, X_8, [X_2, #24]! // load 2 words from pSrc2 + + mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j] + adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j] + + mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+1] + adcs X_15, X_15, X_12 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+1] + + stp X_11, X_15, [X_4, #24]! // Store to destination + ldr X_7, [X_2, #16] // load 1 word from pSrc2 + + mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j+2] + adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j+2] + + str X_11, [X_4, #16] // Store to destination + + cbnz X_3, SymCryptFdef369RawMulAsmLoopInner1 + + adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any) + str X_12, [X_4, #24] + + sub X_1, X_1, #1 // move one word up + add X_0, X_0, #8 // move start of pSrc1 one word up + add X_5, X_5, #8 // move start of pDst one word up + + // + // MAIN LOOP + // +LABEL(SymCryptFdef369RawMulAsmLoopOuter) + mov X_3, X_14 // set nDigits2 + mov X_2, X_13 // set pSrc2 + mov X_4, X_5 // set pDst + + ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0 + ldr X_6, [X_0] // load the next word from pSrc1 + +LABEL(SymCryptFdef369RawMulAsmLoopInner) + sub X_3, X_3, #1 // move 
one digit up + + ldp X_7, X_8, [X_2, #24]! // load 2 words from pSrc2 + ldp X_9, X_10, [X_4, #24]! // load 2 words from pDst + + adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j] + adcs X_10, X_11, X_10 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+1] + adc X_12, X_12, xzr // Add the carry if any and don't update the flags + + mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j] + adds X_9, X_9, X_11 // add the word from the destination and update the flags (this can overflow) + mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+1] + adcs X_10, X_10, X_11 // add the word from the destination and update the flags (this can overflow) + + stp X_9, X_10, [X_4] // Store to destination + + ldr X_7, [X_2, #16] // load 1 word from pSrc2 + ldr X_9, [X_4, #16] // load 1 word from pDst + + adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j+2] + adc X_12, X_12, xzr // Add the carry if any and don't update the flags + + mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j+2] + adds X_9, X_9, X_11 // add the word from the destination and update the flags (this can overflow) + + str X_9, [X_4, #16] // Store to destination + + cbnz X_3, SymCryptFdef369RawMulAsmLoopInner + + adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any) + str X_12, [X_4, #24] + + subs X_1, X_1, #1 // move one word up + add X_0, X_0, #8 // move start of pSrc1 one word up + add X_5, X_5, #8 // move start of pDst one word up + + bne SymCryptFdef369RawMulAsmLoopOuter + + // Done, no return value + +FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm)) + + +//VOID +//SYMCRYPT_CALL +//SymCryptFdef369MontgomeryReduceAsm( +// _In_ 
PCSYMCRYPT_MODULUS pmMod, +// _Inout_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) +// +// Register assignments +// X_0 = pMod (moving forward one *digit* every inner loop) +// X_1 = pSrc (moving forward one *digit* every inner loop) +// X_2 = pDst (used only in the end for subtract / result) +// X_3 = digit count of pSrc and pMod +// X_4 = word count of pSrc +// X_5 = Inv64 of the modulus +// X_6 = m = pSrc[i]*Inv64 +// X_7 = hc = high carry variable +// X_8, X_9 = Current words loaded in pairs from pSrc +// X_10, X_11 = Current words loaded in pairs from pMod +// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies +// It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64 +// X_14 = Temporary intermediate result +// X_15 = Stored digit count of pSrc +// X_16 = Stored pMod pointer +// X_17 = Stored pSrc pointer (moving forward one word every outer loop) + +FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm), 3, 18) + + ldr W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits + ldr X_5, [X_0, #SymCryptModulusMontgomeryInv64OffsetArm64] // Inv64 of modulus + add X_0, X_0, #SymCryptModulusValueOffsetArm64 // pMod + + add X_4, X_3, X_3, LSL #1 // Calculate word count (X_3 * 3) + + sub X_0, X_0, #24 // offset pMod so we can use pre-increment form of loads + sub X_1, X_1, #24 // offset pSrc so we can use pre-increment form of loads + sub X_2, X_2, #24 // offset pDst so we can use pre-increment form of loads + + mov X_15, X_3 // Store the digit count for later + mov X_16, X_0 // Store the pMod pointer + mov X_17, X_1 // Store the pSrc pointer + + and X_7, X_7, xzr // Set hc to 0 + + // + // Main loop + // +LABEL(SymCryptFdef369MontgomeryReduceAsmOuter) + ldr X_8, [X_1, #24] // Load 1 word from pSrc + mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m + + and X_12, X_12, xzr // Set c to 0 + +LABEL(SymCryptFdef369MontgomeryReduceAsmInner) + ldp X_10, X_11, [X_0, #24]! 
// pMod[j] + ldp X_8, X_9, [X_1, #24]! // pSrc[j] + + mul X_14, X_6, X_10 // <63:0> of pMod[j]*m + adds X_14, X_14, X_8 // Adding pSrc[j] + umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m + adc X_13, X_13, xzr // Add the carry if any (***) + adds X_12, X_12, X_14 // Add the lower bits of c + adc X_13, X_13, xzr // Add the carry if any (***) + // ***: These cannot produce extra carry as the maximum is + // (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1 + str X_12, [X_1] // pSrc[j] = (UINT64) c4) c + mov X_12, X_13 // c >>= 64 + + mul X_14, X_6, X_11 // <63:0> of pMod[j]*m + adds X_14, X_14, X_9 // Adding pSrc[j] + umulh X_13, X_6, X_11 // <127:64> of pMod[j]*m + adc X_13, X_13, xzr // Add the carry if any (***) + adds X_12, X_12, X_14 // Add the lower bits of c + adc X_13, X_13, xzr // Add the carry if any (***) + str X_12, [X_1, #8] // pSrc[j] = (UINT64) c + mov X_12, X_13 // c >>= 64 + + ldr X_10, [X_0, #16] // pMod[j] + ldr X_8, [X_1, #16] // pSrc[j] + + mul X_14, X_6, X_10 // <63:0> of pMod[j]*m + adds X_14, X_14, X_8 // Adding pSrc[j] + umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m + adc X_13, X_13, xzr // Add the carry if any (***) + adds X_12, X_12, X_14 // Add the lower bits of c + adc X_13, X_13, xzr // Add the carry if any (***) + str X_12, [X_1, #16] // pSrc[j] = (UINT64) c4) c + mov X_12, X_13 // c >>= 64 + + subs X_3, X_3, #1 // Move one digit up + bne SymCryptFdef369MontgomeryReduceAsmInner + + ldr X_8, [X_1, #24] // pSrc[nWords] + adds X_12, X_12, X_8 // c + pSrc[nWords] + adc X_13, xzr, xzr // Add the carry if any + + adds X_12, X_12, X_7 // c + pSrc[nWords] + hc + adc X_7, X_13, xzr // Add the carry if any and store into hc + + str X_12, [X_1, #24] // pSrc[nWords] = c + + subs X_4, X_4, #1 // Move one word up + + add X_17, X_17, #8 // Move stored pSrc pointer one word up + mov X_0, X_16 // Restore pMod pointer + mov X_1, X_17 // Restore pSrc pointer + + mov X_3, X_15 // Restore the digit counter + + bne SymCryptFdef369MontgomeryReduceAsmOuter + + 
// + // Subtraction + // + + mov X_14, X_2 // Store pDst pointer + + // Prepare the pointers for subtract + mov X_0, X_17 // pSrc + mov X_1, X_16 // pMod + + mov X_10, X_7 // X_10 = hc + mov X_3, X_15 // Restore the digit counter + subs X_4, X_4, X_4 // Set the carry flag (i.e. no borrow) + +LABEL(SymCryptFdef369MontgomeryReduceRawSubAsmLoop) + sub X_3, X_3, #1 // Decrement the digit count by one + // borrow is in the carry flag (flipped) + + ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1 + ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2 + sbcs X_4, X_4, X_5 + sbcs X_6, X_6, X_7 + stp X_4, X_6, [X_2, #24]! // Store the result in the destination + + ldr X_4, [X_0, #16] // Load one word of pSrc1 + ldr X_5, [X_1, #16] // Load one word of pSrc2 + sbcs X_4, X_4, X_5 + str X_4, [X_2, #16] // Store the result in the destination + + cbnz X_3, SymCryptFdef369MontgomeryReduceRawSubAsmLoop + + cset X_0, cc // If the carry is clear (borrow), set the return value to 1 + + orr X_11, X_10, X_0 // X_11 = hc|d + + // Prepare the pointers for masked copy + mov X_0, X_17 // pSrc + mov X_1, X_14 // pDst + + mov X_2, X_15 // Restore the digit counter + subs X_4, X_10, X_11 // If (X_11 > X_10) clear the carry flag (i.e. borrow) + +LABEL(SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop) + sub X_2, X_2, #1 // decrement the digit count by one + + ldp X_4, X_6, [X_0, #24]! // Load two words of the source + ldp X_5, X_7, [X_1, #24]! 
// Load two words of the destination + csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands + csel X_6, X_6, X_7, cc + stp X_4, X_6, [X_1] // Store the two words in the destination + + ldr X_4, [X_0, #16] // Load one word of the source + ldr X_5, [X_1, #16] // Load one word of the destination + csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands + str X_4, [X_1, #16] // Store the one word in the destination + + cbnz X_2, SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop + + // Done, no return value + +FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm)) + + FILE_END() diff --git a/lib/arm64/fdef_asm.asm b/lib/arm64/fdef_asm.asm deleted file mode 100644 index 6309280..0000000 --- a/lib/arm64/fdef_asm.asm +++ /dev/null @@ -1,768 +0,0 @@ -; -; fdef_asm.asm Assembler code for large integer arithmetic in the default data format for the arm64 architecture -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; - -#include "ksarm64.h" - -; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants -; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm. 
-#define SYMCRYPT_MASM -#include "C_asm_shared.inc" -#undef SYMCRYPT_MASM - -#include "symcrypt_version.inc" -#include "symcrypt_name_mangling.inc" -#include "symcrypt_magic.inc" - -; A digit consists of 4 words of 64 bits each - -;UINT32 -;SYMCRYPT_CALL -; SymCryptFdefRawAdd( -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1, -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2, -; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst, -; UINT32 nDigits ); -; -; Initial inputs to registers: -; pSrc1 -> x0 -; pSrc2 -> x1 -; pDst -> x2 -; nDigits -> x3 - - LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm) - - neg x3, x3 ; negate the digit count - ands x4, x4, x4 ; Zero the carry flag - -SymCryptFdefRawAddAsmLoop - add x3, x3, #1 ; Increment the digit count by one - ; carry is in the carry flag - - ldp x4, x6, [x0], #16 ; Load two words of pSrc1 - ldp x5, x7, [x1], #16 ; Load two words of pSrc2 - adcs x4, x4, x5 - adcs x6, x6, x7 - stp x4, x6, [x2], #16 ; Store the result in the destination - - ldp x4, x6, [x0], #16 ; Load two words of pSrc1 - ldp x5, x7, [x1], #16 ; Load two words of pSrc2 - adcs x4, x4, x5 - adcs x6, x6, x7 - stp x4, x6, [x2], #16 ; Store the result in the destination - - cbnz x3, SymCryptFdefRawAddAsmLoop - - csetcs x0 ; Set the return value equal to the carry - - ret - - LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm) - -;UINT32 -;SYMCRYPT_CALL -;SymCryptFdefRawSub( -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, -; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, -; UINT32 nDigits ) -; -; Initial inputs to registers: -; pSrc1 -> x0 -; pSrc2 -> x1 -; pDst -> x2 -; nDigits -> x3 - - LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm) - - neg x3, x3 ; negate the digit count - subs x4, x4, x4 ; Set the carry flag (i.e. 
no borrow) - -SymCryptFdefRawSubAsmLoop - add x3, x3, #1 ; Increment the digit count by one - ; borrow is in the carry flag (flipped) - - ldp x4, x6, [x0], #16 ; Load two words of pSrc1 - ldp x5, x7, [x1], #16 ; Load two words of pSrc2 - sbcs x4, x4, x5 - sbcs x6, x6, x7 - stp x4, x6, [x2], #16 ; Store the result in the destination - - ldp x4, x6, [x0], #16 ; Load two words of pSrc1 - ldp x5, x7, [x1], #16 ; Load two words of pSrc2 - sbcs x4, x4, x5 - sbcs x6, x6, x7 - stp x4, x6, [x2], #16 ; Store the result in the destination - - cbnz x3, SymCryptFdefRawSubAsmLoop - - csetcc x0 ; If the carry is clear (borrow), set the return value to 1 - - ret - - LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm) - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefMaskedCopy( -; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, -; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, -; UINT32 nDigits, -; UINT32 mask ) - - LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm) - - neg x2, x2 ; negate the digit count - subs x4, XZR, x3 ; If (x3 > 0) clear the carry flag (i.e. 
borrow) - -SymCryptFdefMaskedCopyAsmLoop - add x2, x2, #1 ; Increment the digit count by one - - ldp x4, x6, [x0], #16 ; Load two words of the source - ldp x5, x7, [x1] ; Load two words of the destination - cselcc x4, x4, x5 ; If the carry is clear, select the source operands - cselcc x6, x6, x7 - stp x4, x6, [x1], #16 ; Store the two words in the destination - - ldp x4, x6, [x0], #16 - ldp x5, x7, [x1] - cselcc x4, x4, x5 - cselcc x6, x6, x7 - stp x4, x6, [x1], #16 - - cbnz x2, SymCryptFdefMaskedCopyAsmLoop - - ; Done, no return value - - ret - - LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm) - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul( -; _In_reads_(nWords1) PCUINT32 pSrc1, -; UINT32 nDigits1, -; _In_reads_(nWords2) PCUINT32 pSrc2, -; UINT32 nDigits2, -; _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) -; -; Initial inputs to registers: -; pSrc1 -> x0 -; nDigits1 -> x1 -; pSrc2 -> x2 -; nDigits2 -> x3 -; pDst -> x4 -; -; Basic structure: -; for each word in Src1: -; Dst += Src2 * word -; -; Register assignments -; x0 = pSrc1 (moving forward one word every outer loop) -; x1 = negated word count of pSrc1 -; x2 = pSrc2 (moving forward one *digit* every inner loop) -; x3 = negated digit count of pSrc2 and pDst -; x4 = pDst (moving forward one *digit* every inner loop) -; x5 = Stored pDst (moving forward one word every outer loop) -; x6 = Current word loaded from pSrc1 -; x8, x9 = Current words loaded in pairs from pSrc2 -; x10, x11 = Current words loaded in pairs from pDst -; x12, x15 = "128-bit" sliding register to hold the result of multiplies -; x16 = Stored pSrc2 -; x17 = Stored negated digit count of pSrc2 -; Note x13, x14 are reserved in ARM64EC and thus are not used - - - LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm) - - lsl x1, x1, #2 ; Calculate word count - - neg x1, x1 ; negate nWords1 - neg x3, x3 ; negate nDigits2 - - mov x5, x4 ; store pDst - mov x16, x2 ; store pSrc2 - mov x17, x3 ; store -nDigits2 for later - - ; - ; First iteration 
of main loop (no adding of previous values from pDst) - ; - ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0 - ldr x6, [x0] ; load the first word from pSrc1 - -SymCryptFdefRawMulAsmLoopInner1 - add x3, x3, #1 ; move one digit up - - ldp x8, x9, [x2] ; load 2 words from pSrc2 - - mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j] - str x12, [x4] ; Store to destination - - mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+1] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+1] - str x12, [x4, #8] ; Store to destination - - ldp x8, x9, [x2, #16] ; load 2 words from pSrc2 - - mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j+2] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2] - str x12, [x4, #16] ; Store to destination - - mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+3] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+3] - str x12, [x4, #24] ; Store to destination - - add x2, x2, #32 - add x4, x4, #32 - - cbnz x3, SymCryptFdefRawMulAsmLoopInner1 - - adc x15, x15, XZR ; Store the next word into the destination (with the carry if any) - str x15, [x4] - - add x1, x1, #1 ; move one word up - add x0, x0, #8 ; move start of pSrc1 one word up - add x5, x5, #8 ; move start of pDst one word up - - ; - ; MAIN LOOP - ; -SymCryptFdefRawMulAsmLoopOuter - mov x3, x17 ; set -nDigits2 - mov x2, x16 ; set pSrc2 - mov x4, x5 ; set pDst - - ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0 - ldr x6, [x0] ; load the next word from pSrc1 - 
-SymCryptFdefRawMulAsmLoopInner - add x3, x3, #1 ; move one digit up - - ldp x8, x9, [x2] ; load 2 words from pSrc2 - ldp x10, x11, [x4] ; load 2 words from pDst - - mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j] - adc x15, x15, XZR ; Add the carry if any and don't update the flags - ; Note: this cannot overflow as the maximum for is (2^64-1)(2^64-1)+(2^64-1)+1 = 2^128 - 2^64 + 1 - adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow) - str x12, [x4] ; Store to destination - - mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+1] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+1] - adc x15, x15, XZR ; Add the carry if any and don't update the flags - adds x12, x12, x11 ; add the word from the destination and update the flags (this can overflow) - str x12, [x4, #8] ; Store to destination - - ldp x8, x9, [x2, #16] ; load 2 words from pSrc2 - ldp x10, x11, [x4, #16] ; load 2 words from pDst - - mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j+2] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j+2] - adc x15, x15, XZR ; Add the carry if any and don't update the flags - adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow) - str x12, [x4, #16] ; Store to destination - - mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+3] - adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) - umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+3] - adc x15, x15, XZR ; Add the carry if any and don't update the flags - adds x12, x12, x11 ; add the word from the 
destination and update the flags (this can overflow) - str x12, [x4, #24] ; Store to destination - - add x2, x2, #32 - add x4, x4, #32 - - cbnz x3, SymCryptFdefRawMulAsmLoopInner - - adc x15, x15, XZR ; Store the next word into the destination (with the carry if any) - str x15, [x4] - - adds x1, x1, #1 ; move one word up - add x0, x0, #8 ; move start of pSrc1 one word up - add x5, x5, #8 ; move start of pDst one word up - - bne SymCryptFdefRawMulAsmLoopOuter - - ; Done, no return value - - ret - - LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm) - - - - - - - ; Macro for the first loop of the first pass of RawSquareAsm. - ; It takes one word from the source, multiplies it with the mulword, - ; adds the high level word of the previous macro call, and stores it into - ; the destination. - ; - ; No carry flag is propagated from the previous macro call as the maximum is - ; (2^64-1)^2 + 2^64-1 = 2^128 - 2^64 - MACRO - SQR_SINGLEADD_64 $index - - ldr x8, [x2, #8*$index] ; pSrc[i+j] - - mul x12, x6, x8 ; Bits <63:0> of pSrc[i]*pSrc[i+j] - adds x12, x12, x15 ; Adding the previous word - umulh x15, x6, x8 ; Bits <127:64> of pSrc[i]*pSrc[i+j] - adc x15, x15, XZR ; Add the intermediate carry and don't update the flags - - str x12, [x4, #8*$index] ; Store to destination - - MEND - - ; Macro for the remaining loops of the first pass of RawSquareAsm. - ; The only difference to the above is that it also adds the word loaded - ; from the destination buffer. 
- ; - ; No carry flag is propagated from the previous macro call as the maximum is - ; (2^64-1)^2 + 2(2^64-1) = 2^128 - 1 - MACRO - SQR_DOUBLEADD_64 $index - - ldr x8, [x2, #8*$index] ; pSrc[i+j] - ldr x10, [x4, #8*$index] ; pDst[2*(i+j)] - - mul x12, x6, x8 ; Bits <63:0> of pSrc[i]*pSrc[i+j] - adds x12, x12, x15 ; Adding the previous word - umulh x15, x6, x8 ; Bits <127:64> of pSrc[i]*pSrc[i+j] - adc x15, x15, XZR ; Add the intermediate carry and don't update the flags - - adds x12, x12, x10 ; Add the word from the destination - adc x15, x15, XZR ; Add the intermediate carry and don't update the flags - - str x12, [x4, #8*$index] ; Store to destination - - MEND - - ; Macro for the third pass loop of RawSquareAsm. - ; It takes one mulword from the source, squares it, and - ; adds it to the even columns of the destination. The carries are propagated - ; to the odd columns. - ; - ; Here we can have a (1-bit) carry to the next call because the maximum value for - ; a pair of columns is (2^64-1)^2+(2^128-1)+1 = 2^129 - 2^65 + 1 < 2^129 - 1 - MACRO - SQR_DIAGONAL_PROP $index - ldr x6, [x0, #8*$index] ; mulword - mul x12, x6, x6 ; Bits <63:0> of m^2 - umulh x15, x6, x6 ; Bits <127:64> of m^2 - - ldp x8, x9, [x4, #16*$index] ; Load - - ; Adding the square to the even column - adcs x12, x12, x8 ; carry from previous and update the flags - - ; Propagating the sum to the next column - adcs x15, x15, x9 ; This can generate a carry - - stp x12, x15, [x4, #16*$index] ; Store - MEND - -; VOID -; SYMCRYPT_CALL -; SymCryptFdefRawSquareAsm( -; _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ) -; -; Initial inputs to registers: -; pSrc -> x0 -; nDigits -> x1 -; pDst -> x2 -; -; Register assignments -; x0 = pSrc -; x1 = negated word count of pSrc -; x2 = pSrc (moving forward one digit / 4 words every inner loop) -; x3 = negated digit count of pSrc -; x4 = pDst (moving forward one digit every inner loop) -; x5 = 
pDst (moving forward one word every outer loop) -; x6 = Current word loaded from pSrc -; x8, x9 = Current words loaded in pairs from pSrc2 -; x10, x11 = Current words loaded in pairs from pDst -; x12, x15 = "128-bit" sliding register to hold the result of multiplies -; x16 = Stored pSrc -; x17 = Negated digit count of pSrc -; x19 = Stored negated digit count of pSrc -; x20 = Stored pDst -; Note x13, x14 are reserved in ARM64EC and thus are not used - - - NESTED_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm) - PROLOG_SAVE_REG_PAIR fp, lr, #-32! ; allocate 32 bytes of stack; store FP/LR - PROLOG_SAVE_REG_PAIR x19, x20, #16 ; free up x19/x20 - - mov x3, x1 ; digit count into x3 - - lsl x1, x1, #2 ; Calculate word count - - neg x1, x1 ; negate nWords - neg x3, x3 ; negate nDigits - - mov x4, x2 ; pDst - mov x5, x2 ; store pDst - mov x20, x2 ; store pDst - mov x16, x0 ; store pSrc - mov x2, x0 ; inner loop pSrc - mov x17, x3 ; store -nDigits for later - mov x19, x3 ; store -nDigits for later - - ; - ; First iteration of main loop (no adding of previous values from pDst) - ; - ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0 - ldr x6, [x0] ; load the first word from pSrc1 - str x15, [x4] ; store 0 for the first word - - b SymCryptFdefRawSquareAsmInnerLoopInit_Word1 - -SymCryptFdefRawSquareAsmInnerLoopInit_Word0 - SQR_SINGLEADD_64 0 - -SymCryptFdefRawSquareAsmInnerLoopInit_Word1 - SQR_SINGLEADD_64 1 - - SQR_SINGLEADD_64 2 - - SQR_SINGLEADD_64 3 - - add x3, x3, #1 ; move one digit up - add x2, x2, #32 - add x4, x4, #32 - - cbnz x3, SymCryptFdefRawSquareAsmInnerLoopInit_Word0 - - str x15, [x4] ; Store the next word into the destination - - add x1, x1, #1 ; move one word up - - mov x9, #1 ; Cyclic counter - - ; - ; MAIN LOOP - ; -SymCryptFdefRawSquareAsmOuterLoop - - add x5, x5, #8 ; move start of pDst one word up - - mov x3, x17 ; set -nDigits - mov x2, x0 ; set pSrc - mov x4, x5 ; set pDst - - ands x15, x15, XZR ; Clearing the carry flag and setting 
x15 = 0 - ldr x6, [x0, x9, LSL #3] ; load the next word from pSrc - - ; Cyclic counter and jump logic - add x9, x9, #1 - cmp x9, #1 - beq SymCryptFdefRawSquareAsmInnerLoop_Word1 - cmp x9, #2 - beq SymCryptFdefRawSquareAsmInnerLoop_Word2 - cmp x9, #3 - beq SymCryptFdefRawSquareAsmInnerLoop_Word3 - - ; The following instructions are only executed when x9 == 4 - mov x9, XZR ; Set it to 0 - - add x0, x0, #32 ; move start of pSrc 4 words up - add x5, x5, #32 ; move pDst 4 words up - - mov x2, x0 ; set pSrc - mov x4, x5 ; set pDst - - adds x17, x17, #1 ; add 1 digit - mov x3, x17 ; set the new digit counter - -SymCryptFdefRawSquareAsmInnerLoop_Word0 - SQR_DOUBLEADD_64 0 - -SymCryptFdefRawSquareAsmInnerLoop_Word1 - SQR_DOUBLEADD_64 1 - -SymCryptFdefRawSquareAsmInnerLoop_Word2 - SQR_DOUBLEADD_64 2 - -SymCryptFdefRawSquareAsmInnerLoop_Word3 - SQR_DOUBLEADD_64 3 - - add x3, x3, #1 ; move one digit up - add x2, x2, #32 - add x4, x4, #32 - - cbnz x3, SymCryptFdefRawSquareAsmInnerLoop_Word0 - - str x15, [x4] ; Store the next word into the destination - - adds x1, x1, #1 ; move one word up - cmn x1, #1 ; Compare with -1 - bne SymCryptFdefRawSquareAsmOuterLoop - - ands x15, x15, XZR ; Setting x15 = 0 - str x15, [x5, #40] ; Store 0 to destination for the top word - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Second Pass - Shifting all results 1 bit left - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov x3, x19 ; -nDigits - lsl x3, x3, #1 ; Double digits - mov x4, x20 ; pDst pointer - ands x8, x8, XZR ; Clear the flags - -SymCryptFdefRawSquareAsmSecondPass - - add x3, x3, #1 ; move one digit up - - ldp x8, x9, [x4] - adcs x8, x8, x8 ; Shift left and add the carry - adcs x9, x9, x9 - stp x8, x9, [x4], #16 - - ldp x10, x11, [x4] - adcs x10, x10, x10 ; Shift left and add the carry - adcs x11, x11, x11 - stp x10, x11, [x4], #16 - - cbnz x3, SymCryptFdefRawSquareAsmSecondPass - - ; 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Third Pass - Adding the squares on the even columns and propagating the sum - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ands x8, x8, XZR ; Clear the flags - mov x0, x16 ; src pointer - mov x4, x20 ; pDst pointer - mov x3, x19 ; -nDigits - -SymCryptFdefRawSquareAsmThirdPass - SQR_DIAGONAL_PROP 0 - SQR_DIAGONAL_PROP 1 - SQR_DIAGONAL_PROP 2 - SQR_DIAGONAL_PROP 3 - - add x3, x3, #1 ; move one digit up - add x0, x0, #32 ; One digit up (not updated in SQR_DIAGONAL_PROP) - add x4, x4, #64 ; Two digits up (not updated in SQR_DIAGONAL_PROP) - - cbnz x3, SymCryptFdefRawSquareAsmThirdPass - - ; Done, no return value - - EPILOG_RESTORE_REG_PAIR x19, x20, #16 - EPILOG_RESTORE_REG_PAIR fp, lr, #32! - EPILOG_RETURN - - NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm) - -;VOID -;SymCryptFdefMontgomeryReduceAsm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) -; -; Initial inputs to registers: -; pmMod -> x0 -; pSrc -> x1 -; pDst -> x2 -; -; Register assignments -; x0 = pMod (moving forward one *digit* every inner loop) -; x1 = pSrc (moving forward one *digit* every inner loop) -; x2 = pDst (used only in the end for subtract / result) -; x3 = negated digit count of pSrc and pMod -; x4 = negated word count of pSrc -; x5 = Inv64 of the modulus -; x6 = m = pSrc[i]*Inv64 -; x7 = hc = high carry variable -; x8, x9 = Current words loaded in pairs from pSrc -; x10, x11 = Current words loaded in pairs from pMod -; x12, x15 = c variable = "128-bit" register to hold the result of multiplies -; It is flipped between [x12:x15] and [x15:x12] intstead of doing c>>=64 -; x16 = Temporary intermediate result -; x17 = Stored negated digit count of pSrc -; x19 = Stored pMod pointer -; x20 = Stored pSrc pointer (moving forward one word every outer loop) -; Note x13, x14 are reserved in ARM64EC and thus are not used - - NESTED_ENTRY 
ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm) - PROLOG_SAVE_REG_PAIR fp, lr, #-32! - PROLOG_SAVE_REG_PAIR x19, x20, #16 - - ldr w3, [x0, #SymCryptModulusNdigitsOffsetArm64] ; # of Digits - ldr x5, [x0, #SymCryptModulusMontgomeryInv64OffsetArm64] ; Inv64 of modulus - add x0, x0, #SymCryptModulusValueOffsetArm64 ; pMod - - lsl x4, x3, #2 ; Multiply by 4 to get the number of words - - neg x3, x3 ; Negate the digit count - neg x4, x4 ; Negate the word count - - mov x17, x3 ; Store the digit count for later - mov x19, x0 ; Store the pMod pointer - mov x20, x1 ; Store the pSrc pointer - - ands x7, x7, XZR ; Set hc to 0 - - ; - ; Main loop - ; -SymCryptFdefMontgomeryReduceAsmOuter - ldr x8, [x1] ; Load 1 word from pSrc - mul x6, x8, x5 ; <63:0> bits of pSrc[i]*Inv64 = m - - ands x12, x12, XZR ; Set c to 0 - ands x15, x15, XZR ; Set c to 0 - -SymCryptFdefMontgomeryReduceAsmInner - ldp x10, x11, [x0] ; pMod[j] - ldp x8, x9, [x1] ; pSrc[j] - - mul x16, x6, x10 ; <63:0> of pMod[j]*m - adds x16, x16, x8 ; Adding pSrc[j] - umulh x15, x6, x10 ; <127:64> of pMod[j]*m - adc x15, x15, XZR ; Add the carry if any (***) - adds x12, x12, x16 ; Add the lower bits of c - adc x15, x15, XZR ; Add the carry if any (***) - ; ***: These cannot produce extra carry as the maximum is - ; (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1 - str x12, [x1] ; pSrc[j] = (UINT64) c - - mul x16, x6, x11 ; <63:0> of pMod[j]*m - adds x16, x16, x9 ; Adding pSrc[j] - umulh x12, x6, x11 ; <127:64> of pMod[j]*m - adc x12, x12, XZR ; Add the carry if any (***) - adds x15, x15, x16 ; Add the lower bits of c - adc x12, x12, XZR ; Add the carry if any (***) - str x15, [x1, #8] ; pSrc[j] = (UINT64) c - - ldp x10, x11, [x0, #16] ; pMod[j] - ldp x8, x9, [x1, #16] ; pSrc[j] - - mul x16, x6, x10 ; <63:0> of pMod[j]*m - adds x16, x16, x8 ; Adding pSrc[j] - umulh x15, x6, x10 ; <127:64> of pMod[j]*m - adc x15, x15, XZR ; Add the carry if any (***) - adds x12, x12, x16 ; Add the lower bits of c - adc x15, x15, XZR 
; Add the carry if any (***) - str x12, [x1, #16] ; pSrc[j] = (UINT64) c - - mul x16, x6, x11 ; <63:0> of pMod[j]*m - adds x16, x16, x9 ; Adding pSrc[j] - umulh x12, x6, x11 ; <127:64> of pMod[j]*m - adc x12, x12, XZR ; Add the carry if any (***) - adds x15, x15, x16 ; Add the lower bits of c - adc x12, x12, XZR ; Add the carry if any (***) - str x15, [x1, #24] ; pSrc[j] = (UINT64) c - - add x0, x0, #32 - add x1, x1, #32 - adds x3, x3, #1 ; Move one digit up - bne SymCryptFdefMontgomeryReduceAsmInner - - ldr x8, [x1] ; pSrc[nWords] - adds x12, x12, x8 ; c + pSrc[nWords] - adc x15, XZR, XZR ; Add the carry if any - - adds x12, x12, x7 ; c + pSrc[nWords] + hc - adc x7, x15, XZR ; Add the carry if any and store into hc - - str x12, [x1] ; pSrc[nWords] = c - - adds x4, x4, #1 ; Move one word up - - add x20, x20, #8 ; Move stored pSrc pointer one word up - mov x0, x19 ; Restore pMod pointer - mov x1, x20 ; Restore pSrc pointer - - mov x3, x17 ; Restore the digit counter - - bne SymCryptFdefMontgomeryReduceAsmOuter - - ; - ; Subtraction - ; - - mov x16, x2 ; Store pDst pointer - - ; Prepare the pointers for subtract - mov x0, x20 ; pSrc - mov x1, x19 ; pMod - - mov x10, x7 ; x10 = hc - mov x3, x17 ; Restore the digit counter - subs x4, x4, x4 ; Set the carry flag (i.e. 
no borrow) - -SymCryptFdefMontgomeryReduceRawSubAsmLoop - add x3, x3, #1 ; Increment the digit count by one - ; borrow is in the carry flag (flipped) - - ldp x4, x6, [x0], #16 ; Load two words of pSrc1 - ldp x5, x7, [x1], #16 ; Load two words of pSrc2 - sbcs x4, x4, x5 - sbcs x6, x6, x7 - stp x4, x6, [x2], #16 ; Store the result in the destination - - - ldp x4, x6, [x0], #16 ; Load two words of pSrc1 - ldp x5, x7, [x1], #16 ; Load two words of pSrc2 - sbcs x4, x4, x5 - sbcs x6, x6, x7 - stp x4, x6, [x2], #16 ; Store the result in the destination - - cbnz x3, SymCryptFdefMontgomeryReduceRawSubAsmLoop - - csetcc x0 ; If the carry is clear (borrow), set the return value to 1 - - orr x11, x10, x0 ; x11 = hc|d - - ; Prepare the pointers for masked copy - mov x0, x20 ; pSrc - mov x1, x16 ; pDst - - mov x2, x17 ; Restore the digit counter - subs x4, x10, x11 ; If (x11 > x10) clear the carry flag (i.e. borrow) - -SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop - add x2, x2, #1 ; Increment the digit count by one - - ldp x4, x6, [x0], #16 ; Load two words of the source - ldp x5, x7, [x1] ; Load two words of the destination - cselcc x4, x4, x5 ; If the carry is clear, select the source operands - cselcc x6, x6, x7 - stp x4, x6, [x1], #16 ; Store the two words in the destination - - ldp x4, x6, [x0], #16 - ldp x5, x7, [x1] - cselcc x4, x4, x5 - cselcc x6, x6, x7 - stp x4, x6, [x1], #16 - - cbnz x2, SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop - - ; Done, no return value - - EPILOG_RESTORE_REG_PAIR x19, x20, #16 - EPILOG_RESTORE_REG_PAIR fp, lr, #32! 
- EPILOG_RETURN - - NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm) - - END - diff --git a/lib/arm64/fdef_asm.symcryptasm b/lib/arm64/fdef_asm.symcryptasm new file mode 100644 index 0000000..868437e --- /dev/null +++ b/lib/arm64/fdef_asm.symcryptasm @@ -0,0 +1,705 @@ +// +// fdef_asm.symcryptasm Assembler code for large integer arithmetic in the default data format for the arm64 architecture +// Expresses asm in a generic enough way to enable generation of MASM and GAS using the +// symcryptasm_processor.py script and C preprocessor +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. +// + +#include "symcryptasm_shared.cppasm" + +// A digit consists of 4 words of 64 bits each + +//UINT32 +//SYMCRYPT_CALL +//SymCryptFdefRawAddAsm( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, +// UINT32 nDigits ) + +FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm), 4, 8) + + ldp X_4, X_6, [X_0] // Load two words of pSrc1 + ldp X_5, X_7, [X_1] // Load two words of pSrc2 + adds X_4, X_4, X_5 + adcs X_6, X_6, X_7 + stp X_4, X_6, [X_2] // Store the result in the destination + + ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1 + sub X_3, X_3, #1 // Decrement the digit count by one + ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2 + adcs X_4, X_4, X_5 + adcs X_6, X_6, X_7 + stp X_4, X_6, [X_2, #16] // Store the result in the destination + + cbz X_3, SymCryptFdefRawAddAsmEnd + +LABEL(SymCryptFdefRawAddAsmLoop) + // carry is in the carry flag + // only update pointers to srcs and destination once per loop to reduce uops and dependencies + ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1 + ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2 + adcs X_4, X_4, X_5 + adcs X_6, X_6, X_7 + stp X_4, X_6, [X_2, #32]! 
// Store the result in the destination + + ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1 + sub X_3, X_3, #1 // Decrement the digit count by one + ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2 + adcs X_4, X_4, X_5 + adcs X_6, X_6, X_7 + stp X_4, X_6, [X_2, #16] // Store the result in the destination + + cbnz X_3, SymCryptFdefRawAddAsmLoop + + ALIGN(4) +LABEL(SymCryptFdefRawAddAsmEnd) + cset X_0, cs // Set the return value equal to the carry + +FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm)) + +//UINT32 +//SYMCRYPT_CALL +//SymCryptFdefRawSubAsm( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst, +// UINT32 nDigits ) + +FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm), 4, 8) + + ldp X_4, X_6, [X_0] // Load two words of pSrc1 + ldp X_5, X_7, [X_1] // Load two words of pSrc2 + subs X_4, X_4, X_5 + sbcs X_6, X_6, X_7 + stp X_4, X_6, [X_2] // Store the result in the destination + + ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1 + sub X_3, X_3, #1 // Decrement the digit count by one + ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2 + sbcs X_4, X_4, X_5 + sbcs X_6, X_6, X_7 + stp X_4, X_6, [X_2, #16] // Store the result in the destination + + cbz X_3, SymCryptFdefRawSubAsmEnd + +LABEL(SymCryptFdefRawSubAsmLoop) + // borrow is in the carry flag (flipped) + // only update pointers to srcs and destination once per loop to reduce uops and dependencies + ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1 + ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2 + sbcs X_4, X_4, X_5 + sbcs X_6, X_6, X_7 + stp X_4, X_6, [X_2, #32]! 
// Store the result in the destination + + ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1 + sub X_3, X_3, #1 // Decrement the digit count by one + ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2 + sbcs X_4, X_4, X_5 + sbcs X_6, X_6, X_7 + stp X_4, X_6, [X_2, #16] // Store the result in the destination + + cbnz X_3, SymCryptFdefRawSubAsmLoop + + ALIGN(4) +LABEL(SymCryptFdefRawSubAsmEnd) + cset X_0, cc // If the carry is clear (borrow), set the return value to 1 + +FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm)) + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefMaskedCopyAsm( +// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, +// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, +// UINT32 nDigits, +// UINT32 mask ) + +FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm), 4, 4) + + dup v0.4s, W_3 // broadcast the mask to v0 + +LABEL(SymCryptFdefMaskedCopyAsmLoop) + ldp q1, q3, [X_0], #32 // Load 4 words of the source + ldp q2, q4, [X_1] // Load 4 words of the destination + bit v2.16b, v1.16b, v0.16b // if the mask is 1s, overwrite the destination with source + bit v4.16b, v3.16b, v0.16b // if the mask is 1s, overwrite the destination with source + stp q2, q4, [X_1], #32 // Store the two words in the destination + + sub X_2, X_2, #1 // Decrement the digit count by one + + cbnz X_2, SymCryptFdefMaskedCopyAsmLoop + + // Done, no return value + +FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm)) + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefRawMulAsm( +// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// UINT32 nDigits1, +// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits2, +// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) +// +// Basic structure: +// for each word in Src1: +// Dst += Src2 * word +// +// Register assignments +// X_0 = pSrc1 (moving forward one word every outer loop) +// X_1 = word count of pSrc1 +// X_2 = 
pSrc2 (moving forward one *digit* every inner loop) +// X_3 = digit count of pSrc2 and pDst +// X_4 = pDst (moving forward one *digit* every inner loop) +// X_5 = Stored pDst (moving forward one word every outer loop) +// X_6 = Current word loaded from pSrc1 +// X_7, X_8 = Current words loaded in pairs from pSrc2 +// X_9, X_10 = Current words loaded in pairs from pDst +// X_11, X_12 = Scratch registers for holding the results of multiplies +// X_13 = Stored pSrc2 +// X_14 = Stored digit count of pSrc2 +// X_15 = Scratch register for holding the results of multiplies + +FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm), 5, 16) + + lsl X_1, X_1, #2 // Calculate word count + + sub X_2, X_2, #32 // offset pSrc2 so we can use pre-increment form of loads + sub X_4, X_4, #32 // offset pDst so we can use pre-increment form of loads + + mov X_5, X_4 // store pDst + mov X_13, X_2 // store pSrc2 + mov X_14, X_3 // store nDigits2 for later + + // + // First iteration of main loop (no adding of previous values from pDst) + // + ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0 + ldr X_6, [X_0] // load the first word from pSrc1 + +LABEL(SymCryptFdefRawMulAsmLoopInner1) + sub X_3, X_3, #1 // move one digit up + + ldp X_7, X_8, [X_2, #32]! // load 2 words from pSrc2 + + mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j] + adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j] + + mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+1] + adcs X_15, X_15, X_12 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+1] + + stp X_11, X_15, [X_4, #32]! 
// Store to destination + ldp X_7, X_8, [X_2, #16] // load 2 words from pSrc2 + + mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j+2] + adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j+2] + + mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+3] + adcs X_15, X_15, X_12 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+3] + + stp X_11, X_15, [X_4, #16] // Store to destination + + cbnz X_3, SymCryptFdefRawMulAsmLoopInner1 + + adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any) + str X_12, [X_4, #32] + + sub X_1, X_1, #1 // move one word up + add X_0, X_0, #8 // move start of pSrc1 one word up + add X_5, X_5, #8 // move start of pDst one word up + + // + // MAIN LOOP + // +LABEL(SymCryptFdefRawMulAsmLoopOuter) + mov X_3, X_14 // set nDigits2 + mov X_2, X_13 // set pSrc2 + mov X_4, X_5 // set pDst + + ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0 + ldr X_6, [X_0] // load the next word from pSrc1 + +LABEL(SymCryptFdefRawMulAsmLoopInner) + sub X_3, X_3, #1 // move one digit up + + ldp X_7, X_8, [X_2, #32]! // load 2 words from pSrc2 + ldp X_9, X_10, [X_4, #32]! 
// load 2 words from pDst + + adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j] + adcs X_10, X_11, X_10 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+1] + adc X_12, X_12, xzr // Add the carry if any and don't update the flags + + mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j] + adds X_9, X_9, X_11 // add the word from the destination and update the flags (this can overflow) + mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+1] + adcs X_10, X_10, X_11 // add the word from the destination and update the flags (this can overflow) + + stp X_9, X_10, [X_4] // Store to destination + + ldp X_7, X_8, [X_2, #16] // load 2 words from pSrc2 + ldp X_9, X_10, [X_4, #16] // load 2 words from pDst + + adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j+2] + adcs X_10, X_11, X_10 // Adding the previous word (if there was a carry from the last addition it is added) + umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+3] + adc X_12, X_12, xzr // Add the carry if any and don't update the flags + + mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j+2] + adds X_9, X_9, X_11 // add the word from the destination and update the flags (this can overflow) + mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+3] + adcs X_10, X_10, X_11 // add the word from the destination and update the flags (this can overflow) + + stp X_9, X_10, [X_4, #16] // Store to destination + + cbnz X_3, SymCryptFdefRawMulAsmLoopInner + + adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any) + str X_12, [X_4, #32] + + subs X_1, X_1, #1 // move one word up + add X_0, X_0, #8 // move start of pSrc1 one word up + add X_5, X_5, #8 // move 
start of pDst one word up + + bne SymCryptFdefRawMulAsmLoopOuter + + // Done, no return value + +FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm)) + + // Macro for the first loop of the first pass of RawSquareAsm. + // It takes one word from the source, multiplies it with the mulword, + // adds the high level word of the previous macro call, and stores it into + // the destination. + // + // No carry flag is propagated from the previous macro call as the maximum is + // (2^64-1)^2 + 2^64-1 = 2^128 - 2^64 +MACRO_START(SQR_SINGLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_carry, scratch0, scratch1) + + ldr scratch0, [src_reg, #8*index] // pSrc[i+j] + + mul scratch1, mul_word, scratch0 // Bits <63:0> of pSrc[i]*pSrc[i+j] + adds scratch1, scratch1, src_carry // Adding the previous word + umulh dst_carry, mul_word, scratch0 // Bits <127:64> of pSrc[i]*pSrc[i+j] + adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags + + str scratch1, [dst_reg, #8*index] // Store to destination + +MACRO_END() + + // Macro for the remaining loops of the first pass of RawSquareAsm. + // The only difference to the above is that it also adds the word loaded + // from the destination buffer. 
+ // + // No carry flag is propagated from the previous macro call as the maximum is + // (2^64-1)^2 + 2(2^64-1) = 2^128 - 1 +MACRO_START(SQR_DOUBLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_carry, scratch0, scratch1, scratch2) + + ldr scratch0, [src_reg, #8*index] // pSrc[i+j] + ldr scratch2, [dst_reg, #8*index] // pDst[2*(i+j)] + + mul scratch1, mul_word, scratch0 // Bits <63:0> of pSrc[i]*pSrc[i+j] + adds scratch1, scratch1, src_carry // Adding the previous word + umulh dst_carry, mul_word, scratch0 // Bits <127:64> of pSrc[i]*pSrc[i+j] + adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags + + adds scratch1, scratch1, scratch2 // Add the word from the destination + adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags + + str scratch1, [dst_reg, #8*index] // Store to destination + +MACRO_END() + + // Macro for the third pass loop of RawSquareAsm. + // It takes one mulword from the source, squares it, and + // adds it to the even columns of the destination. The carries are propagated + // to the odd columns. 
+ // + // Here we can have a (1-bit) carry to the next call because the maximum value for + // a pair of columns is (2^64-1)^2+(2^128-1)+1 = 2^129 - 2^65 + 1 < 2^129 - 1 +MACRO_START(SQR_DIAGONAL_PROP, index, src_reg, dst_reg, squarelo, squarehi, scratch0, scratch1) + + ldr squarehi, [src_reg, #8*index] // mulword + mul squarelo, squarehi, squarehi // Bits <63:0> of m^2 + umulh squarehi, squarehi, squarehi // Bits <127:64> of m^2 + + ldp scratch0, scratch1, [dst_reg, #16*index] // Load + + // Adding the square to the even column + adcs squarelo, squarelo, scratch0 // carry from previous and update the flags + + // Propagating the sum to the next column + adcs squarehi, squarehi, scratch1 // This can generate a carry + + stp squarelo, squarehi, [dst_reg, #16*index] // Store + +MACRO_END() + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefRawSquareAsm( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, +// UINT32 nDigits, +// _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) +// +// Register assignments +// X_0 = pSrc +// X_1 = word count of pSrc +// X_2 = pSrc (moving forward one digit / 4 words every inner loop) +// X_3 = digit count of pSrc +// X_4 = pDst (moving forward one digit every inner loop) +// X_5 = pDst (moving forward one word every outer loop) +// X_6 = Current word loaded from pSrc +// X_7, X_8 = Current words loaded in pairs from pSrc2 +// X_9, X_10 = Current words loaded in pairs from pDst +// X_11, X_12 = "128-bit" sliding register to hold the result of multiplies +// X_13 = Stored pSrc +// X_14 = Digit count of pSrc +// X_15 = Stored digit count of pSrc +// X_16 = Stored pDst + +FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm), 3, 17) + + mov X_3, X_1 // digit count into X_3 + + lsl X_1, X_1, #2 // Calculate word count + + mov X_4, X_2 // pDst + mov X_5, X_2 // store pDst + mov X_16, X_2 // store pDst + mov X_13, X_0 // store pSrc + mov X_2, X_0 // inner loop pSrc + mov X_14, X_3 // store nDigits for later + 
mov X_15, X_3 // store nDigits for later + + // + // First iteration of main loop (no adding of previous values from pDst) + // + ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0 + ldr X_6, [X_0] // load the first word from pSrc1 + str X_12, [X_4] // store 0 for the first word + + b SymCryptFdefRawSquareAsmInnerLoopInit_Word1 + +LABEL(SymCryptFdefRawSquareAsmInnerLoopInit_Word0) + SQR_SINGLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_8 + +LABEL(SymCryptFdefRawSquareAsmInnerLoopInit_Word1) + SQR_SINGLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_8 + + SQR_SINGLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_8 + + SQR_SINGLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_8 + + sub X_3, X_3, #1 // move one digit up + add X_2, X_2, #32 + add X_4, X_4, #32 + + cbnz X_3, SymCryptFdefRawSquareAsmInnerLoopInit_Word0 + + str X_12, [X_4] // Store the next word into the destination + + sub X_1, X_1, #2 // move two words up (we started at the word 1) + + mov X_8, #1 // Cyclic counter + + // + // MAIN LOOP + // +LABEL(SymCryptFdefRawSquareAsmOuterLoop) + + add X_5, X_5, #8 // move start of pDst one word up + + mov X_3, X_14 // set nDigits + mov X_2, X_0 // set pSrc + mov X_4, X_5 // set pDst + + ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0 + ldr X_6, [X_0, X_8, LSL #3] // load the next word from pSrc + + // Cyclic counter and jump logic + add X_8, X_8, #1 + cmp X_8, #1 + beq SymCryptFdefRawSquareAsmInnerLoop_Word1 + cmp X_8, #2 + beq SymCryptFdefRawSquareAsmInnerLoop_Word2 + cmp X_8, #3 + beq SymCryptFdefRawSquareAsmInnerLoop_Word3 + + // The following instructions are only executed when X_8 == 4 + mov X_8, xzr // Set it to 0 + + add X_0, X_0, #32 // move start of pSrc 4 words up + add X_5, X_5, #32 // move pDst 4 words up + + mov X_2, X_0 // set pSrc + mov X_4, X_5 // set pDst + + sub X_14, X_14, #1 // remove 1 digit + mov X_3, X_14 // set the new digit counter + +LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word0) + SQR_DOUBLEADD_64 0, X_2, X_4, 
X_6, X_12, X_12, X_7, X_9, X_10 + +LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word1) + SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10 + +LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word2) + SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10 + +LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3) + SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10 + + sub X_3, X_3, #1 // move one digit up + add X_2, X_2, #32 + add X_4, X_4, #32 + + cbnz X_3, SymCryptFdefRawSquareAsmInnerLoop_Word0 + + str X_12, [X_4] // Store the next word into the destination + + sub X_1, X_1, #1 // move one word up + cbnz X_1, SymCryptFdefRawSquareAsmOuterLoop + + ands X_12, X_12, xzr // Setting X_12 = 0 + str X_12, [X_5, #40] // Store 0 to destination for the top word + + //////////////////////////////////////////////////////////////// + // Second Pass - Shifting all results 1 bit left + //////////////////////////////////////////////////////////////// + + mov X_3, X_15 // nDigits + lsl X_3, X_3, #1 // Double digits + mov X_4, X_16 // pDst pointer + ands X_7, X_7, xzr // Clear the flags + +LABEL(SymCryptFdefRawSquareAsmSecondPass) + + sub X_3, X_3, #1 // move one digit up + + ldp X_7, X_8, [X_4] + adcs X_7, X_7, X_7 // Shift left and add the carry + adcs X_8, X_8, X_8 + stp X_7, X_8, [X_4], #16 + + ldp X_9, X_10, [X_4] + adcs X_9, X_9, X_9 // Shift left and add the carry + adcs X_10, X_10, X_10 + stp X_9, X_10, [X_4], #16 + + cbnz X_3, SymCryptFdefRawSquareAsmSecondPass + + ////////////////////////////////////////////////////////////////////////////// + // Third Pass - Adding the squares on the even columns and propagating the sum + ////////////////////////////////////////////////////////////////////////////// + + ands X_7, X_7, xzr // Clear the flags + mov X_0, X_13 // src pointer + mov X_4, X_16 // pDst pointer + mov X_3, X_15 // nDigits + +LABEL(SymCryptFdefRawSquareAsmThirdPass) + SQR_DIAGONAL_PROP 0, X_0, X_4, X_6, X_7, X_8, X_9 + SQR_DIAGONAL_PROP 1, X_0, X_4, X_6, 
X_7, X_8, X_9 + SQR_DIAGONAL_PROP 2, X_0, X_4, X_6, X_7, X_8, X_9 + SQR_DIAGONAL_PROP 3, X_0, X_4, X_6, X_7, X_8, X_9 + + sub X_3, X_3, #1 // move one digit up + add X_0, X_0, #32 // One digit up (not updated in SQR_DIAGONAL_PROP) + add X_4, X_4, #64 // Two digits up (not updated in SQR_DIAGONAL_PROP) + + cbnz X_3, SymCryptFdefRawSquareAsmThirdPass + + // Done, no return value + +FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm)) + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefMontgomeryReduceAsm( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _Inout_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) +// +// Register assignments +// X_0 = pMod (moving forward one *digit* every inner loop) +// X_1 = pSrc (moving forward one *digit* every inner loop) +// X_2 = pDst (used only in the end for subtract / result) +// X_3 = digit count of pSrc and pMod +// X_4 = word count of pSrc +// X_5 = Inv64 of the modulus +// X_6 = m = pSrc[i]*Inv64 +// X_7 = hc = high carry variable +// X_8, X_9 = Current words loaded in pairs from pSrc +// X_10, X_11 = Current words loaded in pairs from pMod +// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies +// It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64 +// X_14 = Temporary intermediate result +// X_15 = Stored digit count of pSrc +// X_16 = Stored pMod pointer +// X_17 = Stored pSrc pointer (moving forward one word every outer loop) + +FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 18) + + ldr W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits + ldr X_5, [X_0, #SymCryptModulusMontgomeryInv64OffsetArm64] // Inv64 of modulus + add X_0, X_0, #SymCryptModulusValueOffsetArm64 // pMod + + lsl X_4, X_3, #2 // Multiply by 4 to get the number of words + + sub X_0, X_0, #32 // offset pMod so we can use pre-increment form of loads + sub X_1, X_1, #32 // offset pSrc so we can use pre-increment form of loads + sub X_2, X_2, #32 // offset pDst so we can use pre-increment form of 
loads + + mov X_15, X_3 // Store the digit count for later + mov X_16, X_0 // Store the pMod pointer + mov X_17, X_1 // Store the pSrc pointer + + and X_7, X_7, xzr // Set hc to 0 + + // + // Main loop + // +LABEL(SymCryptFdefMontgomeryReduceAsmOuter) + ldr X_8, [X_1, #32] // Load 1 word from pSrc + mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m + + and X_12, X_12, xzr // Set c to 0 + +LABEL(SymCryptFdefMontgomeryReduceAsmInner) + ldp X_10, X_11, [X_0, #32]! // pMod[j] + ldp X_8, X_9, [X_1, #32]! // pSrc[j] + + mul X_14, X_6, X_10 // <63:0> of pMod[j]*m + adds X_14, X_14, X_8 // Adding pSrc[j] + umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m + adc X_13, X_13, xzr // Add the carry if any (***) + adds X_12, X_12, X_14 // Add the lower bits of c + adc X_13, X_13, xzr // Add the carry if any (***) + // ***: These cannot produce extra carry as the maximum is + // (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1 + str X_12, [X_1] // pSrc[j] = (UINT64) c + + mul X_14, X_6, X_11 // <63:0> of pMod[j]*m + adds X_14, X_14, X_9 // Adding pSrc[j] + umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m + adc X_12, X_12, xzr // Add the carry if any (***) + adds X_13, X_13, X_14 // Add the lower bits of c + adc X_12, X_12, xzr // Add the carry if any (***) + str X_13, [X_1, #8] // pSrc[j] = (UINT64) c + + ldp X_10, X_11, [X_0, #16] // pMod[j] + ldp X_8, X_9, [X_1, #16] // pSrc[j] + + mul X_14, X_6, X_10 // <63:0> of pMod[j]*m + adds X_14, X_14, X_8 // Adding pSrc[j] + umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m + adc X_13, X_13, xzr // Add the carry if any (***) + adds X_12, X_12, X_14 // Add the lower bits of c + adc X_13, X_13, xzr // Add the carry if any (***) + str X_12, [X_1, #16] // pSrc[j] = (UINT64) c + + mul X_14, X_6, X_11 // <63:0> of pMod[j]*m + adds X_14, X_14, X_9 // Adding pSrc[j] + umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m + adc X_12, X_12, xzr // Add the carry if any (***) + adds X_13, X_13, X_14 // Add the lower bits of c + adc X_12, X_12, xzr // Add the 
carry if any (***) + str X_13, [X_1, #24] // pSrc[j] = (UINT64) c + + subs X_3, X_3, #1 // Move one digit up + bne SymCryptFdefMontgomeryReduceAsmInner + + ldr X_8, [X_1, #32] // pSrc[nWords] + adds X_12, X_12, X_8 // c + pSrc[nWords] + adc X_13, xzr, xzr // Add the carry if any + + adds X_12, X_12, X_7 // c + pSrc[nWords] + hc + adc X_7, X_13, xzr // Add the carry if any and store into hc + + str X_12, [X_1, #32] // pSrc[nWords] = c + + subs X_4, X_4, #1 // Move one word up + + add X_17, X_17, #8 // Move stored pSrc pointer one word up + mov X_0, X_16 // Restore pMod pointer + mov X_1, X_17 // Restore pSrc pointer + + mov X_3, X_15 // Restore the digit counter + + bne SymCryptFdefMontgomeryReduceAsmOuter + + // + // Subtraction + // + + mov X_14, X_2 // Store pDst pointer + + // Prepare the pointers for subtract + mov X_0, X_17 // pSrc + mov X_1, X_16 // pMod + + mov X_10, X_7 // X_10 = hc + mov X_3, X_15 // Restore the digit counter + subs X_4, X_4, X_4 // Set the carry flag (i.e. no borrow) + +LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop) + sub X_3, X_3, #1 // Decrement the digit count by one + // borrow is in the carry flag (flipped) + + ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1 + ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2 + sbcs X_4, X_4, X_5 + sbcs X_6, X_6, X_7 + stp X_4, X_6, [X_2, #32]! // Store the result in the destination + + ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1 + ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2 + sbcs X_4, X_4, X_5 + sbcs X_6, X_6, X_7 + stp X_4, X_6, [X_2, #16] // Store the result in the destination + + cbnz X_3, SymCryptFdefMontgomeryReduceRawSubAsmLoop + + cset X_0, cc // If the carry is clear (borrow), set the return value to 1 + + orr X_11, X_10, X_0 // X_11 = hc|d + + // Prepare the pointers for masked copy + mov X_0, X_17 // pSrc + mov X_1, X_14 // pDst + + mov X_2, X_15 // Restore the digit counter + subs X_4, X_10, X_11 // If (X_11 > X_10) clear the carry flag (i.e. 
borrow) + +LABEL(SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop) + sub X_2, X_2, #1 // decrement the digit count by one + + ldp X_4, X_6, [X_0, #32]! // Load two words of the source + ldp X_5, X_7, [X_1, #32]! // Load two words of the destination + csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands + csel X_6, X_6, X_7, cc + stp X_4, X_6, [X_1] // Store the two words in the destination + + ldp X_4, X_6, [X_0, #16] + ldp X_5, X_7, [X_1, #16] + csel X_4, X_4, X_5, cc + csel X_6, X_6, X_7, cc + stp X_4, X_6, [X_1, #16] + + cbnz X_2, SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop + + // Done, no return value + +FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm)) + + FILE_END() diff --git a/lib/arm64/symcrypt_magic.inc b/lib/arm64/symcrypt_magic.inc deleted file mode 100644 index 4a8d79b..0000000 --- a/lib/arm64/symcrypt_magic.inc +++ /dev/null @@ -1,28 +0,0 @@ -; -; SymCrypt_magic.inc -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; -; Include file to define the support macros for the Magic field -; - - IMPORT ARM64EC_NAME_MANGLE(SymCryptFatal) - -#define SYMCRYPT_CODE_VERSION (SYMCRYPT_CODE_VERSION_API * 65536 + SYMCRYPT_CODE_VERSION_MINOR) -#define SYMCRYPT_MAGIC_CONSTANT (0x53316d76 + SYMCRYPT_CODE_VERSION) - - MACRO - SYMCRYPT_CHECK_MAGIC $temp1, $temp2, $ptr, $offset - -#if SYMCRYPT_DEBUG - - ldr $temp1, [$ptr, #$offset] - subs $temp1, $temp1, $ptr - mov32 $temp2, SYMCRYPT_MAGIC_CONSTANT - cmp $temp1, $temp2 - beq %F1 - mov32 r0, 0x6d616763 ; 'magc' - bl ARM64EC_NAME_MANGLE(SymCryptFatal) -1 -#endif - - MEND diff --git a/lib/arm64/wipe.asm b/lib/arm64/wipe.asm deleted file mode 100644 index b0eecb7..0000000 --- a/lib/arm64/wipe.asm +++ /dev/null @@ -1,37 +0,0 @@ - TTL "SymCryptWipe" -;++ -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. 
-; -; Secure wipe -; -;-- - -#include "ksarm64.h" -#include "symcrypt_name_mangling.inc" - - TEXTAREA - - EXTERN ARM64EC_NAME_MANGLE(memset) - - SUBT "SymCryptWipe" -;VOID -;SYMCRYPT_CALL -;SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData, -; SIZE_T cbData ) - - - LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptWipeAsm) - - ; we just jump to memset. - ; this is enough to stop the compiler optimizing the memset away. - - mov x2, x1 - mov x1, #0 - b ARM64EC_NAME_MANGLE(memset) - - LEAF_END ARM64EC_NAME_MANGLE(SymCryptWipeAsm) - - - - END diff --git a/lib/arm64/wipe.symcryptasm b/lib/arm64/wipe.symcryptasm new file mode 100644 index 0000000..08bb820 --- /dev/null +++ b/lib/arm64/wipe.symcryptasm @@ -0,0 +1,31 @@ +// +// wipe.symcryptasm Assembler code for wiping a buffer +// Expresses asm in a generic enough way to enable generation of MASM and GAS using the +// symcryptasm_processor.py script and C preprocessor +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. + + +#include "symcryptasm_shared.cppasm" + + TEXTAREA() + + EXTERN(ARM64EC_NAME_MANGLE(memset)) + +//VOID +//SYMCRYPT_CALL +//SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData, +// SIZE_T cbData ) + + FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptWipeAsm), 2, 3) + +// we just jump to memset. +// this is enough to stop the compiler optimizing the memset away. 
+ + mov X_2, X_1 + mov X_1, #0 + b ARM64EC_NAME_MANGLE(memset) + + FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptWipeAsm)) + + FILE_END() diff --git a/lib/arm64ec/fdef369_asm.asm b/lib/arm64ec/fdef369_asm.asm deleted file mode 100644 index 65d14ec..0000000 --- a/lib/arm64ec/fdef369_asm.asm +++ /dev/null @@ -1 +0,0 @@ -#include "..\arm64\fdef369_asm.asm" diff --git a/lib/arm64ec/fdef_asm.asm b/lib/arm64ec/fdef_asm.asm deleted file mode 100644 index 6e87c02..0000000 --- a/lib/arm64ec/fdef_asm.asm +++ /dev/null @@ -1 +0,0 @@ -#include "..\arm64\fdef_asm.asm" diff --git a/lib/arm64ec/symcrypt_magic.inc b/lib/arm64ec/symcrypt_magic.inc deleted file mode 100644 index 9b54bbe..0000000 --- a/lib/arm64ec/symcrypt_magic.inc +++ /dev/null @@ -1 +0,0 @@ -#include "..\arm64\symcrypt_magic.inc" diff --git a/lib/arm64ec/symcrypt_name_mangling.inc b/lib/arm64ec/symcrypt_name_mangling.inc deleted file mode 100644 index c173cec..0000000 --- a/lib/arm64ec/symcrypt_name_mangling.inc +++ /dev/null @@ -1 +0,0 @@ -#include "..\arm64\symcrypt_name_mangling.inc" diff --git a/lib/arm64ec/wipe.asm b/lib/arm64ec/wipe.asm deleted file mode 100644 index 842f18d..0000000 --- a/lib/arm64ec/wipe.asm +++ /dev/null @@ -1 +0,0 @@ -#include "..\arm64\wipe.asm" diff --git a/lib/linux/asmstubs.c b/lib/linux/asmstubs.c deleted file mode 100644 index f5782ca..0000000 --- a/lib/linux/asmstubs.c +++ /dev/null @@ -1,132 +0,0 @@ -// -// asmstubs.c -// Temporary forwarders for ASM implementations which we don't yet support with GCC/LLVM on Arm64 -// -// Copyright (c) Microsoft Corporation. Licensed under the MIT license. 
-// - -#include "../precomp.h" - -VOID -SYMCRYPT_CALL -SymCryptWipeAsm( _Out_writes_bytes_( cbData ) PVOID pbData, SIZE_T cbData ) -{ - volatile BYTE * p = (volatile BYTE *) pbData; - SIZE_T i; - - for( i=0; i + On arm64 this contiguous region is R0..R Note: arg_count need not correspond to the exact number of argument in the function declaration if the assembly does not use some tail of the arguments 3) The number of registers (reg_count) that the function uses @@ -58,6 +65,7 @@ At the function end an epilogue is generated with restores the non-volatile regi A nested function (a function which does call another function) is specified similarly, only using NESTED_FUNCTION_START and NESTED_FUNCTION_END macros. A nested function currently updates and aligns the stack pointer in the function prologue, and avoids use of the redzone in the SystemV ABI. +Nested functions are not currently supported for Arm64. A macro begins with an invocation of the MACRO_START macro which takes the Macro name, and variable @@ -82,6 +90,15 @@ and QH. As rdx is used to pass arguments, its value is moved to another register prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this case. We currently do not support nested mul functions, as we have none of them. +### arm64 ### +We allow up to 23 registers to be addressed, with the names: +X_0-X_22 (64-bit registers) and W_0-W_22 (32-bit registers) +v0-v7 ASIMD registers may by used directly in assembly too, as in both arm64 calling conventions we +currently support, these registers are volatile so do not need any special handling + +X_0 is always the result register and the first argument passed to the function. 
+X_1-X_7 are the arguments 2-8 passed to the function + """ import re @@ -91,37 +108,71 @@ import logging class Register: """A class to represent registers""" - def __init__(self, name64, name32, name16, name8): + def __init__(self, name64, name32, name16=None, name8=None): self.name64 = name64 self.name32 = name32 self.name16 = name16 self.name8 = name8 # amd64 registers -REG_RAX = Register("rax", "eax", "ax", "al") -REG_RBX = Register("rbx", "ebx", "bx", "bl") -REG_RCX = Register("rcx", "ecx", "cx", "cl") -REG_RDX = Register("rdx", "edx", "dx", "dl") -REG_RSI = Register("rsi", "esi", "si", "sil") -REG_RDI = Register("rdi", "edi", "di", "dil") -REG_RSP = Register("rsp", "esp", "sp", "spl") -REG_RBP = Register("rbp", "ebp", "bp", "bpl") -REG_R8 = Register( "r8", "r8d", "r8w", "r8b") -REG_R9 = Register( "r9", "r9d", "r9w", "r9b") -REG_R10 = Register("r10", "r10d", "r10w", "r10b") -REG_R11 = Register("r11", "r11d", "r11w", "r11b") -REG_R12 = Register("r12", "r12d", "r12w", "r12b") -REG_R13 = Register("r13", "r13d", "r13w", "r13b") -REG_R14 = Register("r14", "r14d", "r14w", "r14b") -REG_R15 = Register("r15", "r15d", "r15w", "r15b") +AMD64_RAX = Register("rax", "eax", "ax", "al") +AMD64_RBX = Register("rbx", "ebx", "bx", "bl") +AMD64_RCX = Register("rcx", "ecx", "cx", "cl") +AMD64_RDX = Register("rdx", "edx", "dx", "dl") +AMD64_RSI = Register("rsi", "esi", "si", "sil") +AMD64_RDI = Register("rdi", "edi", "di", "dil") +AMD64_RSP = Register("rsp", "esp", "sp", "spl") +AMD64_RBP = Register("rbp", "ebp", "bp", "bpl") +AMD64_R8 = Register( "r8", "r8d", "r8w", "r8b") +AMD64_R9 = Register( "r9", "r9d", "r9w", "r9b") +AMD64_R10 = Register("r10", "r10d", "r10w", "r10b") +AMD64_R11 = Register("r11", "r11d", "r11w", "r11b") +AMD64_R12 = Register("r12", "r12d", "r12w", "r12b") +AMD64_R13 = Register("r13", "r13d", "r13w", "r13b") +AMD64_R14 = Register("r14", "r14d", "r14w", "r14b") +AMD64_R15 = Register("r15", "r15d", "r15w", "r15b") + +# arm64 registers +ARM64_R0 = Register( "x0", 
"w0") +ARM64_R1 = Register( "x1", "w1") +ARM64_R2 = Register( "x2", "w2") +ARM64_R3 = Register( "x3", "w3") +ARM64_R4 = Register( "x4", "w4") +ARM64_R5 = Register( "x5", "w5") +ARM64_R6 = Register( "x6", "w6") +ARM64_R7 = Register( "x7", "w7") +ARM64_R8 = Register( "x8", "w8") +ARM64_R9 = Register( "x9", "w9") +ARM64_R10 = Register("x10", "w10") +ARM64_R11 = Register("x11", "w11") +ARM64_R12 = Register("x12", "w12") +ARM64_R13 = Register("x13", "w13") +ARM64_R14 = Register("x14", "w14") +ARM64_R15 = Register("x15", "w15") +ARM64_R16 = Register("x16", "w16") +ARM64_R17 = Register("x17", "w17") +ARM64_R18 = Register("x18", "w18") +ARM64_R19 = Register("x19", "w19") +ARM64_R20 = Register("x20", "w20") +ARM64_R21 = Register("x21", "w21") +ARM64_R22 = Register("x22", "w22") +ARM64_R23 = Register("x23", "w23") +ARM64_R24 = Register("x24", "w24") +ARM64_R25 = Register("x25", "w25") +ARM64_R26 = Register("x26", "w26") +ARM64_R27 = Register("x27", "w27") +ARM64_R28 = Register("x28", "w28") +ARM64_R29 = Register("x29", "w29") # Frame Pointer +ARM64_R30 = Register("x30", "w30") # Link Register class CallingConvention: """A class to represent calling conventions""" - def __init__(self, name, architecture, mapping, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn): + def __init__(self, name, architecture, mapping, max_arguments, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn): self.name = name self.architecture = architecture self.mapping = mapping + self.max_arguments = max_arguments self.argument_registers = argument_registers self.volatile_registers = volatile_registers self.gen_prologue_fn = types.MethodType(gen_prologue_fn, self) @@ -139,9 +190,9 @@ def get_mul_mapping_from_normal_mapping(mapping, argument_registers): we refer to rdx using (Q|D|W|B)H. 
""" rdx_index = None - return_mapping = { 'H': REG_RDX } + return_mapping = { 'H': AMD64_RDX } for (index, register) in mapping.items(): - if register == REG_RDX: + if register == AMD64_RDX: rdx_index = index break for (index, register) in mapping.items(): @@ -156,28 +207,23 @@ def get_mul_mapping_from_normal_mapping(mapping, argument_registers): return_mapping[index-1] = register return return_mapping -# Calling convention constants - -MAX_FUNCTION_ARGUMENT_COUNT = 6 # restrict to 6 arguments for now -MAX_FUNCTION_REGISTER_COUNT = 15 - # Microsoft x64 calling convention MAPPING_AMD64_MSFT = { - 0: REG_RAX, # Result register - 1: REG_RCX, # Argument 1 / volatile - 2: REG_RDX, # Argument 2 / volatile - 3: REG_R8, # Argument 3 / volatile - 4: REG_R9, # Argument 4 / volatile - 5: REG_R10, # volatile - 6: REG_R11, # volatile - 7: REG_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue - 8: REG_RDI, - 9: REG_RBP, - 10:REG_RBX, - 11:REG_R12, - 12:REG_R13, - 13:REG_R14, - 14:REG_R15, + 0: AMD64_RAX, # Result register / volatile + 1: AMD64_RCX, # Argument 1 / volatile + 2: AMD64_RDX, # Argument 2 / volatile + 3: AMD64_R8, # Argument 3 / volatile + 4: AMD64_R9, # Argument 4 / volatile + 5: AMD64_R10, # volatile + 6: AMD64_R11, # volatile + 7: AMD64_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue + 8: AMD64_RDI, + 9: AMD64_RBP, + 10:AMD64_RBX, + 11:AMD64_R12, + 12:AMD64_R13, + 13:AMD64_R14, + 14:AMD64_R15, # currently not mapping rsp } @@ -212,11 +258,11 @@ def gen_prologue_amd64_msft(self, arg_count, reg_count, mul_fixup="", nested=Fal prologue += mul_fixup - # put additional arguments into Q5-Q6 (we do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now) + # put additional arguments into Q5-Q6 (we do not support more than 6 arguments for now) # stack_offset to get the 5th argument is: # 32B of shadow space + 8B for return address + (8*#pushed registers in prologue) + 
shadow_space_allocation_size stack_offset = 32 + 8 + (8*(reg_count-self.volatile_registers)) + shadow_space_allocation_size - for i in range(self.argument_registers+1, min(arg_count+1, MAX_FUNCTION_ARGUMENT_COUNT+1)): + for i in range(self.argument_registers+1, arg_count+1): prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset) stack_offset += 8 return prologue @@ -247,7 +293,7 @@ def gen_epilogue_amd64_msft_nested(self, arg_count, reg_count): def gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=False): # only support 4 memory slots for now (in shadow space) if(slot >= 4): - logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot) + logging.error("symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot) exit(1) # 8B for return address + (8*#pushed registers in prologue) stack_offset = 8 + (8*(reg_count-self.volatile_registers)) @@ -259,32 +305,32 @@ def gen_get_memslot_offset_amd64_msft_nested(self, slot, arg_count, reg_count): return gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=True) CALLING_CONVENTION_AMD64_MSFT = CallingConvention( - "msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7, + "msft_x64", "amd64", MAPPING_AMD64_MSFT, 6, 4, 7, gen_prologue_amd64_msft, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft) CALLING_CONVENTION_AMD64_MSFT_MUL = CallingConvention( - "msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 4, 6, + "msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 6, 4, 6, gen_prologue_amd64_msft_mul, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft) CALLING_CONVENTION_AMD64_MSFT_NESTED = CallingConvention( - "msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7, + "msft_x64", "amd64", MAPPING_AMD64_MSFT, 6, 4, 7, gen_prologue_amd64_msft_nested, gen_epilogue_amd64_msft_nested, gen_get_memslot_offset_amd64_msft_nested) # AMD64 System V calling convention 
MAPPING_AMD64_SYSTEMV = { - 0: REG_RAX, # Result register - 1: REG_RDI, # Argument 1 / volatile - 2: REG_RSI, # Argument 2 / volatile - 3: REG_RDX, # Argument 3 / volatile - 4: REG_RCX, # Argument 4 / volatile - 5: REG_R8, # Argument 5 / volatile - 6: REG_R9, # Argument 6 / volatile - 7: REG_R10, # volatile - 8: REG_R11, # volatile - 9: REG_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue - 10:REG_RBP, - 11:REG_R12, - 12:REG_R13, - 13:REG_R14, - 14:REG_R15 + 0: AMD64_RAX, # Result register / volatile + 1: AMD64_RDI, # Argument 1 / volatile + 2: AMD64_RSI, # Argument 2 / volatile + 3: AMD64_RDX, # Argument 3 / volatile + 4: AMD64_RCX, # Argument 4 / volatile + 5: AMD64_R8, # Argument 5 / volatile + 6: AMD64_R9, # Argument 6 / volatile + 7: AMD64_R10, # volatile + 8: AMD64_R11, # volatile + 9: AMD64_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue + 10:AMD64_RBP, + 11:AMD64_R12, + 12:AMD64_R13, + 13:AMD64_R14, + 14:AMD64_R15 # currently not mapping rsp } @@ -305,7 +351,7 @@ def gen_prologue_amd64_systemv(self, arg_count, reg_count, mul_fixup="", nested= prologue += mul_fixup - # do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now + # do not support more than 6 arguments for now # # put additional arguments into Q7-Qn # # stack_offset to get the 7th argument is: # # 8B for return address @@ -341,7 +387,7 @@ def gen_epilogue_amd64_systemv_nested(self, arg_count, reg_count): def gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=False): # only support 4 memory slots for now if(slot >= 4): - logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot) + logging.error("symcryptasm currently only support 4 memory slots! 
(requested slot%d)" % slot) exit(1) # For leaf functions, use the top of the redzone below the stack pointer offset = -8 * (slot+1) @@ -354,58 +400,230 @@ def gen_get_memslot_offset_amd64_systemv_nested(self, slot, arg_count, reg_count return gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=True) CALLING_CONVENTION_AMD64_SYSTEMV = CallingConvention( - "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9, + "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 6, 9, gen_prologue_amd64_systemv, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv) CALLING_CONVENTION_AMD64_SYSTEMV_MUL = CallingConvention( - "amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 8, + "amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 6, 8, gen_prologue_amd64_systemv_mul, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv) CALLING_CONVENTION_AMD64_SYSTEMV_NESTED = CallingConvention( - "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9, + "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 6, 9, gen_prologue_amd64_systemv_nested, gen_epilogue_amd64_systemv_nested, gen_get_memslot_offset_amd64_systemv_nested) -def gen_function_start_defines(mapping, arg_count, reg_count): +# ARM64 calling conventions +MAPPING_ARM64_AAPCS64 = { + 0: ARM64_R0, # Argument 1 / Result register / volatile + 1: ARM64_R1, # Argument 2 / volatile + 2: ARM64_R2, # Argument 3 / volatile + 3: ARM64_R3, # Argument 4 / volatile + 4: ARM64_R4, # Argument 5 / volatile + 5: ARM64_R5, # Argument 6 / volatile + 6: ARM64_R6, # Argument 7 / volatile + 7: ARM64_R7, # Argument 8 / volatile + 8: ARM64_R8, # Indirect result location / volatile + 9: ARM64_R9, # volatile + 10:ARM64_R10, # volatile + 11:ARM64_R11, # volatile + 12:ARM64_R12, # volatile + 13:ARM64_R13, # volatile + 14:ARM64_R14, # volatile + 15:ARM64_R15, # volatile + # R16 and R17 are intra-procedure-call temporary registers which may 
be used by the linker + # We cannot use these registers for local scratch if we call out to arbitrary procedures, but + # currently we only have leaf functions in Arm64 symcryptasm. + 16:ARM64_R16, # IP0 / volatile + 17:ARM64_R17, # IP1 / volatile + # R18 is a platform register which has a special meaning in kernel mode - we do not use it + 18:ARM64_R19, # non-volatile + 19:ARM64_R20, # non-volatile + 20:ARM64_R21, # non-volatile + 21:ARM64_R22, # non-volatile + 22:ARM64_R23, # non-volatile + # We could map more registers (R24-R28) but we can only support 23 registers for ARM64EC, and we + # don't use this many registers in any symcryptasm yet +} + +MAPPING_ARM64_ARM64ECMSFT = { + 0: ARM64_R0, # Argument 1 / Result register / volatile + 1: ARM64_R1, # Argument 2 / volatile + 2: ARM64_R2, # Argument 3 / volatile + 3: ARM64_R3, # Argument 4 / volatile + 4: ARM64_R4, # Argument 5 / volatile + 5: ARM64_R5, # Argument 6 / volatile + 6: ARM64_R6, # Argument 7 / volatile + 7: ARM64_R7, # Argument 8 / volatile + 8: ARM64_R8, # Indirect result location / volatile + 9: ARM64_R9, # volatile + 10:ARM64_R10, # volatile + 11:ARM64_R11, # volatile + 12:ARM64_R12, # volatile + # R13 and R14 are reserved in ARM64EC + 13:ARM64_R15, # volatile + 14:ARM64_R16, # volatile + 15:ARM64_R17, # volatile + 16:ARM64_R19, # non-volatile + 17:ARM64_R20, # non-volatile + 18:ARM64_R21, # non-volatile + 19:ARM64_R22, # non-volatile + # R23 and R24 are reserved in ARM64EC + 20:ARM64_R25, # non-volatile + 21:ARM64_R26, # non-volatile + 22:ARM64_R27, # non-volatile + # R28 is reserved in ARM64EC +} + +def gen_prologue_aapcs64(self, arg_count, reg_count): + prologue = "" + + if reg_count > self.volatile_registers: + logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64") + exit(1) + + return prologue + +def gen_epilogue_aapcs64(self, arg_count, reg_count): + epilogue = "" + + if reg_count > self.volatile_registers: + logging.error("symcryptasm currently 
does not support spilling registers in leaf functions in aapcs64") + exit(1) + + epilogue += " ret\n" + + return epilogue + +def gen_prologue_arm64ec(self, arg_count, reg_count): + prologue = "" + + if reg_count > self.volatile_registers: + # Calculate required stack space + # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers + registers_to_spill = 2 + reg_count - self.volatile_registers + # Stack pointer remain 16B aligned, so round up to the nearest multiple of 16B + required_stack_space = 16 * ((registers_to_spill + 1) // 2) + prologue += " PROLOG_SAVE_REG_PAIR fp, lr, #-%d! // allocate %d bytes of stack; store FP/LR\n" % (required_stack_space, required_stack_space) + + stack_offset = 16 + for i in range(self.volatile_registers, reg_count-1, 2): + prologue += " PROLOG_SAVE_REG_PAIR X_%d, X_%d, #%d\n" % (i, i+1, stack_offset) + stack_offset += 16 + if registers_to_spill % 2 == 1: + prologue += " PROLOG_SAVE_REG X_%d, #%d\n" % (reg_count-1, stack_offset) + + return prologue + +def gen_epilogue_arm64ec(self, arg_count, reg_count): + epilogue = "" + + if reg_count > self.volatile_registers: + # Calculate required stack space + # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers + registers_to_spill = 2 + reg_count - self.volatile_registers + # Stack pointer remain 16B aligned, so round up to the nearest multiple of 16B + required_stack_space = 16 * ((registers_to_spill + 1) // 2) + + stack_offset = required_stack_space-16 + if registers_to_spill % 2 == 1: + epilogue += " EPILOG_RESTORE_REG X_%d, #%d\n" % (reg_count-1, stack_offset) + stack_offset -= 16 + for i in reversed(range(self.volatile_registers, reg_count-1, 2)): + epilogue += " EPILOG_RESTORE_REG_PAIR X_%d, X_%d, #%d\n" % (i, i+1, stack_offset) + stack_offset -= 16 + epilogue += " EPILOG_RESTORE_REG_PAIR fp, lr, #%d! 
// deallocate %d bytes of stack; restore FP/LR\n" % (required_stack_space, required_stack_space) + epilogue += " EPILOG_RETURN\n" + else: + epilogue += " ret\n" + + return epilogue + +def gen_get_memslot_offset_arm64(self, slot, arg_count, reg_count, nested=False): + logging.error("symcryptasm currently does not support memory slots for arm64!") + exit(1) + +CALLING_CONVENTION_ARM64_AAPCS64 = CallingConvention( + "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18, + gen_prologue_aapcs64, gen_epilogue_aapcs64, gen_get_memslot_offset_arm64) + +CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention( + "arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16, + gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64) + +def gen_function_defines(architecture, mapping, arg_count, reg_count, start=True): defines = "" + if architecture == "amd64": + prefix64 = "Q" + prefix32 = "D" + prefix16 = "W" + prefix8 = "B" + elif architecture == "arm64": + prefix64 = "X_" + prefix32 = "W_" + else: + logging.error("Unhandled architecture (%s) in gen_function_defines" % architecture) + exit(1) + for (index, reg) in mapping.items(): if (index != 'H') and (index >= max(arg_count+1, reg_count)): continue - defines += "#define Q%s %s\n" % (index, reg.name64) - defines += "#define D%s %s\n" % (index, reg.name32) - defines += "#define W%s %s\n" % (index, reg.name16) - defines += "#define B%s %s\n" % (index, reg.name8) + if start: + if (reg.name64 is not None): + defines += "#define %s%s %s\n" % (prefix64, index, reg.name64) + if (reg.name32 is not None): + defines += "#define %s%s %s\n" % (prefix32, index, reg.name32) + if (reg.name16 is not None): + defines += "#define %s%s %s\n" % (prefix16, index, reg.name16) + if (reg.name8 is not None): + defines += "#define %s%s %s\n" % (prefix8, index, reg.name8) + else: + if (reg.name64 is not None): + defines += "#undef %s%s\n" % (prefix64, index) + if (reg.name32 is not None): + defines += "#undef %s%s\n" % (prefix32, 
index) + if (reg.name16 is not None): + defines += "#undef %s%s\n" % (prefix16, index) + if (reg.name8 is not None): + defines += "#undef %s%s\n" % (prefix8, index) return defines -def gen_function_end_defines(mapping, arg_count, reg_count): - undefs = "" - for (index, _) in mapping.items(): - if (index != 'H') and (index >= max(arg_count+1, reg_count)): - continue - undefs += "#undef Q%s\n" % (index) - undefs += "#undef D%s\n" % (index) - undefs += "#undef W%s\n" % (index) - undefs += "#undef B%s\n" % (index) - return undefs +def gen_function_start_defines(architecture, mapping, arg_count, reg_count): + return gen_function_defines(architecture, mapping, arg_count, reg_count, start=True) -MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s, _TEXT\n" -MASM_FRAMELESS_FUNCTION_END = "LEAF_END %s, _TEXT\n" -MASM_FRAME_FUNCTION_ENTRY = "NESTED_ENTRY %s, _TEXT\n" -MASM_FRAME_FUNCTION_END = "NESTED_END %s, _TEXT\n" +def gen_function_end_defines(architecture, mapping, arg_count, reg_count): + return gen_function_defines(architecture, mapping, arg_count, reg_count, start=False) + +MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s" +MASM_FRAMELESS_FUNCTION_END = "LEAF_END %s" +MASM_FRAME_FUNCTION_ENTRY = "NESTED_ENTRY %s" +MASM_FRAME_FUNCTION_END = "NESTED_END %s" + +# MASM function macros takes the text area as an argument +MASM_FUNCTION_TEMPLATE = "%s, _TEXT\n" +# ARMASM64 function macros must be correctly indented +ARMASM64_FUNCTION_TEMPLATE = " %s\n" GAS_FUNCTION_ENTRY = "%s: .global %s\n" GAS_FUNCTION_END = "" def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, nested): function_entry = None - if assembler == "masm": - # need to identify and mark up frame functions in masm + if assembler in ["masm", "armasm64"]: + # need to identify and mark up frame functions in masm and armasm64 if nested or (reg_count > calling_convention.volatile_registers): function_entry = MASM_FRAME_FUNCTION_ENTRY % (function_name) else: function_entry = 
MASM_FRAMELESS_FUNCTION_ENTRY % (function_name) + + if assembler == "masm": + function_entry = MASM_FUNCTION_TEMPLATE % function_entry + elif assembler == "armasm64": + function_entry = ARMASM64_FUNCTION_TEMPLATE % function_entry elif assembler == "gas": function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name) + else: + logging.error("Unhandled assembler (%s) in generate_prologue" % assembler) + exit(1) - prologue = gen_function_start_defines(calling_convention.mapping, arg_count, reg_count) + prologue = gen_function_start_defines(calling_convention.architecture, calling_convention.mapping, arg_count, reg_count) prologue += "%s" % (function_entry) prologue += calling_convention.gen_prologue_fn(arg_count, reg_count) @@ -413,31 +631,41 @@ def generate_prologue(assembler, calling_convention, function_name, arg_count, r def generate_epilogue(assembler, calling_convention, function_name, arg_count, reg_count, nested): function_end = None - if assembler == "masm": + if assembler in ["masm", "armasm64"]: # need to identify and mark up frame functions in masm if nested or (reg_count > calling_convention.volatile_registers): function_end = MASM_FRAME_FUNCTION_END % (function_name) else: function_end = MASM_FRAMELESS_FUNCTION_END % (function_name) + + if assembler == "masm": + function_end = MASM_FUNCTION_TEMPLATE % function_end + elif assembler == "armasm64": + function_end = ARMASM64_FUNCTION_TEMPLATE % function_end elif assembler == "gas": function_end = GAS_FUNCTION_END + else: + logging.error("Unhandled assembler (%s) in generate_epilogue" % assembler) + exit(1) epilogue = calling_convention.gen_epilogue_fn(arg_count, reg_count) epilogue += "%s" % (function_end) - epilogue += gen_function_end_defines(calling_convention.mapping, arg_count, reg_count) + epilogue += gen_function_end_defines(calling_convention.architecture, calling_convention.mapping, arg_count, reg_count) return epilogue MASM_MACRO_START = "%s MACRO %s\n" MASM_MACRO_END = "ENDM\n" 
+ARMASM64_MACRO_START= " MACRO\n %s %s" +ARMASM64_MACRO_END = " MEND\n" GAS_MACRO_START = ".macro %s %s\n" GAS_MACRO_END = ".endm\n" MASM_ALTERNATE_ENTRY= "ALTERNATE_ENTRY %s\n" GAS_ALTERNATE_ENTRY = "%s: .global %s\n" -FUNCTION_START_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)") -FUNCTION_END_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9]+)\s*\)") +FUNCTION_START_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9_\(\)]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)") +FUNCTION_END_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9_\(\)]+)\s*\)") GET_MEMSLOT_PATTERN = re.compile("GET_MEMSLOT_OFFSET\s*\(\s*slot([0-9]+)\s*\)") ALTERNATE_ENTRY_PATTERN = re.compile("\s*ALTERNATE_ENTRY\s*\(\s*([a-zA-Z0-9]+)\s*\)") MACRO_START_PATTERN = re.compile("\s*MACRO_START\s*\(\s*([A-Z_0-9]+)\s*,([^\)]+)\)") @@ -499,29 +727,41 @@ class ProcessingStateMachine: self.arg_count = int(match.groups()[-2]) self.reg_count = int(match.groups()[-1]) + if self.is_nested_function and self.nested_calling_convention is None: + logging.error( + "symcryptasm nested functions are not currently supported with assembler (%s) and architecture (%s)!\n\t" + "%s (line %d)" + % (self.assembler, self.normal_calling_convention.architecture, line, line_num)) + exit(1) + if self.is_mul_function and self.mul_calling_convention is None: + logging.error( + "symcryptasm mul functions are not supported with assembler (%s) and architecture (%s)!\n\t" + "%s (line %d)" + % (self.assembler, self.normal_calling_convention.architecture, line, line_num)) + exit(1) if self.is_nested_function and self.is_mul_function: logging.error( "Too many prefixes for symcryptasm function - currently only 1 of prefix, MUL_ or NESTED_, is supported!\n\t" "%s (line %d)" % (line, line_num)) exit(1) - if self.arg_count > MAX_FUNCTION_ARGUMENT_COUNT: + if self.arg_count > 
self.normal_calling_convention.max_arguments: logging.error( - "Too many (%d) arguments for symcryptasm function - currently only %d arguments are supported!\n\t" + "Too many (%d) arguments for symcryptasm function - only %d arguments are supported by calling convention (%s)\n\t" "%s (line %d)" - % (self.arg_count, MAX_FUNCTION_ARGUMENT_COUNT, match.group(0), line_num)) + % (self.arg_count, self.normal_calling_convention.max_arguments, self.normal_calling_convention.name, match.group(0), line_num)) exit(1) - if self.reg_count > MAX_FUNCTION_REGISTER_COUNT: + if self.reg_count > len(self.normal_calling_convention.mapping): logging.error( - "Too many (%d) registers required for symcryptasm function - only %d registers are supported!\n\t" + "Too many (%d) registers required for symcryptasm function - only %d registers are mapped by calling convention (%s)\n\t" "%s (line %d)" - % (self.reg_count, MAX_FUNCTION_REGISTER_COUNT, match.group(0), line_num)) + % (self.reg_count, len(self.normal_calling_convention.mapping), self.normal_calling_convention.name, match.group(0), line_num)) exit(1) - if self.is_mul_function and self.reg_count > MAX_FUNCTION_REGISTER_COUNT-1: + if self.is_mul_function and self.reg_count > len(self.mul_calling_convention.mapping)-1: logging.error( - "Too many (%d) registers required for symcryptasm mul function - only %d registers are supported!\n\t" + "Too many (%d) registers required for symcryptasm mul function - only %d registers are mapped by calling convention (%s)\n\t" "%s (line %d)" - % (self.reg_count, MAX_FUNCTION_REGISTER_COUNT-1, match.group(0), line_num)) + % (self.reg_count, len(self.mul_calling_convention.mapping)-1, self.mul_calling_convention.name, match.group(0), line_num)) exit(1) logging.info("%d: function start %s, %d, %d" % (line_num, self.function_name, self.arg_count, self.reg_count)) @@ -546,10 +786,18 @@ class ProcessingStateMachine: return MASM_MACRO_START % (self.macro_name, match.group(2)) elif self.assembler == "gas": 
return GAS_MACRO_START % (self.macro_name, match.group(2)) + elif self.assembler == "armasm64": + # In armasm64 we need to escape all macro arguments with $ + prefixed_args = ", $".join(self.macro_args) + if prefixed_args: + prefixed_args = "$" + prefixed_args + return ARMASM64_MACRO_START % (self.macro_name, prefixed_args) + else: + logging.error("Unhandled assembler (%s) in process_start_macro" % assembler) + exit(1) def process_function_line(self, line, line_num): # Currently in a function - match = ALTERNATE_ENTRY_PATTERN.match(line) if (match): if self.assembler == "masm": @@ -562,12 +810,12 @@ class ProcessingStateMachine: # Check the end function has same prefix as previous start function if (self.is_nested_function ^ (match.group(1) == "NESTED_")) or \ (self.is_mul_function ^ (match.group(2) == "MUL_")): - logging.error("Function start and end do not have same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)" + logging.error("Function start and end do not have same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)" \ % (self.function_start_match.group(0), self.function_start_line, match.group(0), line_num)) exit(1) # Check the end function pattern has the same label as the previous start function pattern if self.function_name != match.groups()[-1]: - logging.error("Function start label does not match Function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)" + logging.error("Function start label does not match Function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)" \ % (self.function_name, self.function_start_line, match.groups()[-1], line_num)) exit(1) @@ -613,8 +861,18 @@ class ProcessingStateMachine: return MASM_MACRO_END elif self.assembler == "gas": return GAS_MACRO_END + elif self.assembler == "armasm64": + return ARMASM64_MACRO_END + else: + logging.error("Unhandled assembler (%s) in process_macro_line" % self.assembler) + exit(1) - if self.assembler == "gas": + + if self.assembler == "armasm64": + 
# In armasm64 macros we need to escape all of the macro arguments with a $ in the macro body + for arg in self.macro_args: + line = re.sub(arg, "$%s" % arg, line) + elif self.assembler == "gas": # In GAS macros we need to escape all of the macro arguments with a backslash in the macro body for arg in self.macro_args: line = re.sub(arg, r"\\%s" % arg, line) @@ -622,18 +880,40 @@ class ProcessingStateMachine: # Not modifying the line any further return line -def process_file(target, infilename, outfilename): - assembler = None - if target == "masm": - assembler = "masm" - normal_calling_convention = CALLING_CONVENTION_AMD64_MSFT - mul_calling_convention = CALLING_CONVENTION_AMD64_MSFT_MUL - nested_calling_convention = CALLING_CONVENTION_AMD64_MSFT_NESTED - elif target == "gas": - assembler = "gas" - normal_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV - mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL - nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED +def process_file(assembler, architecture, calling_convention, infilename, outfilename): + normal_calling_convention = None + + if assembler == "masm": + if architecture == "amd64" and calling_convention == "msft": + normal_calling_convention = CALLING_CONVENTION_AMD64_MSFT + mul_calling_convention = CALLING_CONVENTION_AMD64_MSFT_MUL + nested_calling_convention = CALLING_CONVENTION_AMD64_MSFT_NESTED + elif assembler == "gas": + if architecture == "amd64" and calling_convention == "systemv": + normal_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV + mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL + nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED + elif architecture == "arm64" and calling_convention == "aapcs64": + normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64 + mul_calling_convention = None + nested_calling_convention = None + elif assembler == "armasm64": + if architecture == "arm64" and calling_convention == "aapcs64": + 
normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64 + mul_calling_convention = None + nested_calling_convention = None + elif architecture == "arm64" and calling_convention == "arm64ec": + normal_calling_convention = CALLING_CONVENTION_ARM64EC_MSFT + mul_calling_convention = None + nested_calling_convention = None + else: + logging.error("Unhandled assembler (%s) in process_file" % assembler) + exit(1) + + if normal_calling_convention is None: + logging.error("Unhandled combination (%s + %s + %s) in process_file" + % (assembler, architecture, calling_convention)) + exit(1) # iterate through file line by line in one pass file_processing_state = ProcessingStateMachine( @@ -649,9 +929,11 @@ if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with C preprocessor to generate MASM or GAS") - parser.add_argument('target', type=str, help='Target that we want to preprocess for') + parser.add_argument('assembler', type=str, help='Assembler that we want to preprocess for', choices=['masm', 'gas', 'armasm64']) + parser.add_argument('architecture', type=str, help='Architecture that we want to preprocess for', choices=['amd64', 'arm64']) + parser.add_argument('calling_convention', type=str, help='Calling convention that we want to preprocess for', choices=['msft', 'systemv', 'aapcs64', 'arm64ec']) parser.add_argument('inputfile', type=str, help='Path to input file') parser.add_argument('outputfile', type=str, help='Path to output file') args = parser.parse_args() - process_file(args.target, args.inputfile, args.outputfile) + process_file(args.assembler, args.architecture, args.calling_convention, args.inputfile, args.outputfile)